# hummingbird / fastNLP

import pickle

import Word2Idx


def get_sets(m, n, path="tuples.pkl"):
    """
    get a train set containing m samples and a test set containing n samples

    The samples are unpickled from ``path``. The first ``m`` of them form the
    train set and the following ``n`` form the test set.

    :param m: number of samples in the train set
    :param n: number of samples in the test set
    :param path: pickle file holding the sample sequence
                 (defaults to "tuples.pkl" in the working directory)
    :return: a ``(train_samples, test_samples)`` tuple, or ``None`` (after
             printing a message) when ``m + n`` exceeds the number of
             samples stored in the file
    """
    # NOTE(security): unpickling can execute arbitrary code; only point this
    # at files produced by a trusted source.
    # The context manager closes the file even if unpickling fails.
    with open(path, "rb") as sample_file:
        samples = pickle.load(sample_file)
    if m + n > len(samples):
        print("asking for too many tuples\n")
        return None
    return samples[:m], samples[m:m + n]

def build_wordidx():
    """
    Build the word index from the training samples and save it.

    Gathers every item of the first field (``sample[0]``) of each training
    sample into one flat list, hands that list to ``Word2Idx.build``, prints
    ``num`` and ``i2w(0)`` of the resulting index, and saves the index to
    "wordidx.pkl".
    """
    # Only the training split feeds the index; the test split is ignored.
    train_samples, _ = get_sets(500000, 2000)
    all_words = [word for sample in train_samples for word in sample[0]]
    indexer = Word2Idx.Word2Idx()
    indexer.build(all_words)
    # Quick sanity output; `num` is presumably the vocabulary size - confirm
    # against the Word2Idx implementation.
    print(indexer.num)
    print(indexer.i2w(0))
    indexer.save("wordidx.pkl")
def _index_samples(samples, wordidx):
    """Turn each sample into {"sent": [w2i of each item in sample[0]], "class": sample[1]}."""
    return [
        {"sent": [wordidx.w2i(w) for w in sample[0]], "class": sample[1]}
        for sample in samples
    ]


def build_sets():
    """
    build train set and test set, transform word to index

    Loads the word index from "wordidx.pkl", converts every sample of the
    train and test splits into a dict with keys "sent" (list of the
    ``w2i`` results for the items of ``sample[0]``) and "class"
    (``sample[1]``), and pickles the two lists to "train_set.pkl" and
    "test_set.pkl".
    """
    train, test = get_sets(500000, 2000)
    wordidx = Word2Idx.Word2Idx()
    wordidx.load("wordidx.pkl")
    train_set = _index_samples(train, wordidx)
    test_set = _index_samples(test, wordidx)
    # Context managers guarantee the output files are flushed and closed
    # instead of relying on garbage collection of the file objects.
    with open("train_set.pkl", "wb") as train_file:
        pickle.dump(train_set, train_file)
    with open("test_set.pkl", "wb") as test_file:
        pickle.dump(test_set, test_file)

if __name__ == "__main__":
    # Script entry point. The order matters: build_wordidx() saves
    # "wordidx.pkl", which build_sets() then loads to encode the samples.
    build_wordidx()
    build_sets()