Untitled
Never
import sys
import time
from collections import Counter
from contextlib import ExitStack
from os.path import join

from corpustools import extract_fields, replace_disallowed
from corpustools import verbose_generator, POLISH
from corpustools.language_model import train_lm

sys.setrecursionlimit(5000)


def count_chunks(corpus_file, output_directory, sizes,
                 filename_pattern="{size}grams.freq", min_freq=10,
                 must_contain=None, symbols=POLISH, verbose=True,
                 **kwargs):
    """Creates frequency files for n-grams from a corpus.

    Parameters
    ----------
    corpus_file : str or path
        Tagged corpus file to extract events from
    output_directory : str or path
        n-gram file(s) will be created in this directory
    sizes : int or set of int
        N-gram size(s). If a set of sizes is provided, a frequency file
        will be created for each size.
    filename_pattern : str
        Each frequency file created will follow this pattern, but
        injected with the length of the ngram. The size is supplied both
        positionally and as the keyword ``size``, so ``"{}grams.freq"``
        and ``"{size}grams.freq"`` are both accepted.
    min_freq : int
        Minimum frequency of all words making up n-gram. The same
        threshold is applied to the n-grams written to file.
    must_contain : container
        If provided, only n-grams containing at least one word in
        must_contain are counted
    symbols : str
        Symbols that are allowed. N-grams with other symbols are ignored.
    verbose : bool
        Write progress to stdout

    Raises
    ------
    ValueError
        If ``sizes`` is an empty collection.
    KeyError
        If the corpus contains no ``"</s>"`` token.

    Notes
    -----
    Other keyword arguments are passed on to extract_fields and train_lm.
    """
    start = time.time()

    # Treat a single size like a one-element set so both cases share the
    # same code path below.
    wanted_sizes = {sizes} if isinstance(sizes, int) else set(sizes)
    largest_size = max(wanted_sizes)

    with open(corpus_file) as corpus:
        # First pass: word frequencies, used to build the vocabulary.
        word_counts = Counter(extract_fields(corpus, **kwargs))
        num_sentences = word_counts.pop("</s>")
        vocabulary = {word for word, freq in word_counts.items()
                      if freq >= min_freq}
        # Release the full frequency table before the language model is
        # built; only the filtered vocabulary is needed from here on.
        del word_counts
        vocabulary = set(replace_disallowed(vocabulary, symbols, "REPL"))
        vocabulary.discard("REPL")
        if must_contain:
            vocabulary.update(must_contain)

        # Second pass over the same file: train the language model.
        corpus.seek(0)
        if verbose:
            template = ("\rProcessed {count} out of {total} sentences! "
                        "Memory consumption: {memory}")
            corpus = verbose_generator(corpus, target="</s>\n",
                                       total=num_sentences,
                                       template=template, every_n=50000)
        lm = train_lm(corpus, largest_size, vocabulary=vocabulary,
                      targets=None, must_contain=must_contain, **kwargs)
    sys.stdout.write("lm trained.\n")
    sys.stdout.flush()

    paths = {size: join(output_directory,
                        filename_pattern.format(size, size=size))
             for size in wanted_sizes}
    # ExitStack closes every output file even if writing fails midway.
    with ExitStack() as stack:
        outputs = {size: stack.enter_context(open(path, "wt"))
                   for size, path in paths.items()}
        for n_gram_string, freq in lm.completions():
            if freq < min_freq:
                continue
            # Words of an n-gram are joined with "#" in the completion key.
            output = outputs.get(len(n_gram_string.split("#")))
            if output is not None:
                output.write(f"{n_gram_string}\t{freq}\n")

    took = int(time.time() - start)
    hours, remainder = divmod(took, 3600)
    minutes = remainder // 60
    sys.stdout.write("All written to file(s).\n")
    sys.stdout.write(f"Took {hours} hours and {minutes} minutes!\n")
    sys.stdout.flush()


def main():
    """Count 2-, 3- and 4-grams that contain a frequent genitive form."""
    top = "path"
    out_dir = join(top, "path2")
    araneum = join(top, "path3")
    genitive_freqs = join(top, "path4")
    min_freq = 10

    genitives = set()
    with open(genitive_freqs) as frequencies:
        frequencies.readline()  # ignore header
        for line in frequencies:
            genitive, frequency = line.rstrip("\n").split("\t")
            if int(frequency) >= min_freq:
                genitives.add(genitive)
    sys.stdout.write("Read genitives.\n")
    sys.stdout.flush()

    count_chunks(araneum, out_dir, {2, 3, 4}, min_freq=min_freq,
                 filename_pattern="testbp_{size}grams.freq",
                 must_contain=genitives, symbols=POLISH, verbose=False)
    sys.stdout.write("count_chunks done\n")
    sys.stdout.flush()

    # exit explicitly, so job doesn't keep running
    sys.exit()


if __name__ == "__main__":
    main()
Raw Text
-
https://www.facebook.com/TryLifeBoostCBDGummies/
1 min ago
-
bromentul
4 min ago
-
Abby and Bryana Lesbian Strap-On Fucking
4 min ago
-
haitani
5 min ago
-
💪 My PSYCHOLOGIST is a brunette MILF and she SUCKS my cock until I CUM on her FACE
36 min ago
-
Stacked ebony babe takes Viking dick
1 hour ago
-
Makers CBD Gummies Reviews
1 hour ago
-
Adult Telegram Channels
1 hour ago
-
No Deposit Bonus
1 hour ago
-
Johnny America Standing On Business!
1 hour ago