import sys
import time

from os.path import join
from collections import Counter

from corpustools import extract_fields, replace_disallowed
from corpustools import verbose_generator, POLISH
from corpustools.language_model import train_lm

# Raise the recursion limit; presumably needed somewhere inside corpustools,
# since nothing in this script recurses directly.
sys.setrecursionlimit(5000)


def count_chunks(corpus_file, output_directory, sizes,
                 filename_pattern="{size}grams.freq",
                 min_freq=10,
                 must_contain=None,
                 symbols=POLISH,
                 verbose=True,
                 **kwargs):
    """Creates frequency files for n-grams from a corpus.
    Parameters
    ----------
    corpus_file : str or path
        Tagged corpus file to extract events from
    output_directory : str or path
        n-gram file(s) will be created in this directory
    sizes : int or set of int
        N-gram size(s). If a set of sizes is provided,
        a frequency file will be created for each size.
    filename_pattern : str or path
        Each frequency file created will follow this pattern, but injected
        with the length of the ngram, i.e. filename_pattern.format(size)
    min_freq : int
        Minimum frequency of all words making up n-gram
    must_contain : container
        If provided, only n-grams containing at least one word in
        must_contain are counted
    symbols : str
        Symbols that are allowed. N-grams with other symbols are ignored.
    verbose : bool
        Write progress to stdout
    Notes
    -----
    Other keyword arguments are passed on to extract_fields/extract_units
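
    Examples
    --------
    A minimal usage sketch; the paths are illustrative placeholders:

    >>> count_chunks("tagged_corpus.txt", "ngrams/",  # doctest: +SKIP
    ...              sizes={2, 3}, min_freq=5, verbose=False)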
    """
    start = time.time()

    with open(corpus_file) as corpus:
        # First pass: count token frequencies to establish the vocabulary.
        tokens = extract_fields(corpus, **kwargs)
        frequencies = Counter(tokens)
        # The sentence-boundary marker doubles as a sentence count.
        num_sentences = frequencies.pop("</s>")

        # Keep only words above the frequency threshold, replace words
        # containing disallowed symbols by the placeholder "REPL", and drop
        # that placeholder so it never ends up in an n-gram.
        vocabulary = {word for word, freq in frequencies.items()
                      if freq >= min_freq}
        vocabulary = set(replace_disallowed(vocabulary, symbols, "REPL"))
        vocabulary.discard("REPL")
        if must_contain:
            vocabulary.update(must_contain)

        # Second pass: rewind so the language model reads the corpus from
        # the start.
        corpus.seek(0)

        if verbose:
            template = "\rProcessed {count} out of {total} sentences! Memory consumption: {memory}"
            corpus = verbose_generator(corpus,
                                       target="</s>\n",
                                       total=num_sentences,
                                       template=template,
                                       every_n=50000)

        # Train one model at the largest requested size; the smaller n-gram
        # orders are read back out of the same model below.
        if isinstance(sizes, int):
            size = sizes
        else:
            size = max(sizes)

        lm = train_lm(corpus, size,
                      vocabulary=vocabulary,
                      targets=None,
                      must_contain=must_contain,
                      **kwargs)

        sys.stdout.write("lm trained.\n")
        sys.stdout.flush()

    # A single size was requested: write one frequency file. Note that the
    # default pattern "{size}grams.freq" uses a named field, so format()
    # must be called with size as a keyword argument.
    if isinstance(sizes, int):
        n_gram_file = join(output_directory, filename_pattern.format(size=size))
        with open(n_gram_file, "wt") as n_grams:
            for n_gram_string, freq in lm.completions():
                n_gram = n_gram_string.split("#")
                n_gram_size = len(n_gram)

                if n_gram_size != size or freq < min_freq:
                    continue

                line = f"{n_gram_string}\t{freq}\n"
                n_grams.write(line)

    else:
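        # Multiple sizes were requested: open one output file per size and
        # fill them all in a single pass over the model's completions.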
        files = {size: join(output_directory, filename_pattern.format(size=size))
                 for size in sizes}
        files = {size: open(file, "wt") for size, file in files.items()}

        for n_gram_string, freq in lm.completions():
            n_gram = n_gram_string.split("#")
            n_gram_size = len(n_gram)

            if n_gram_size not in files or freq < min_freq:
                continue

            line = f"{n_gram_string}\t{freq}\n"
            files[n_gram_size].write(line)

        for file in files.values():
            file.close()

    took = time.time() - start
    hours, remainder = divmod(int(took), 3600)
    minutes = remainder // 60
    sys.stdout.write("All written to file(s).\n")
    sys.stdout.write(f"Took {hours} hours and {minutes} minutes!\n")
    sys.stdout.flush()
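    # The breakpoint() calls and explicit del statements below are deliberate
    # debugging probes, left in to narrow down where the process stalls
    # during teardown.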
    breakpoint()

    del lm
    sys.stdout.write("lm deleted\n")
    sys.stdout.flush()
    breakpoint()

    if not isinstance(sizes, int):
        del files  # only bound in the multi-size branch above
    sys.stdout.write("files deleted\n")
    sys.stdout.flush()
    breakpoint()

    sys.stdout.write("{}".format(locals()))



def main():
    top = "path"
    out_dir = join(top, "path2")
    araneum = join(top, "path3")
    genitive_freqs = join(top, "path4")
    min_freq = 10

    # Collect genitive forms above the frequency threshold; these become the
    # must_contain filter for count_chunks below.
    with open(genitive_freqs) as frequencies:
        frequencies.readline()  # ignore header
        genitives = set()
        for line in frequencies:
            genitive, frequency = line.rstrip("\n").split("\t")
            if int(frequency) >= min_freq:
                genitives.add(genitive)
    sys.stdout.write("Read genitives.\n")  # this gets printed fine
    sys.stdout.flush()

    count_chunks(araneum, out_dir, {2, 3, 4},
                 min_freq=min_freq,
                 filename_pattern="testbp_{size}grams.freq",
                 must_contain=genitives, symbols=POLISH,
                 verbose=False)

    sys.stdout.write("count_chunks done\n")  # this never gets printed
    sys.stdout.flush()

    # exit explicitly, so job doesn't keep running
    sys.exit()

    sys.stdout.write("sys.exit() done.\n")
    sys.stdout.flush()


if __name__ == "__main__":
    main()
