import os

import numpy as np
import torch
from fastNLP import Vocabulary
from fastNLP.core import Const as C
from fastNLP.io.data_loader import ConllLoader, SNLILoader, SSTLoader

MAX_LEN = 128


def update_v(vocab, data, field):
    """Add every token of ``field`` in ``data`` to ``vocab``."""
    data.apply(lambda x: vocab.add_word_lst(x[field]), new_field_name=None)


def to_index(vocab, data, field, name):
    """Map the tokens of ``field`` to indices, storing the result in ``name``."""
    def func(x):
        try:
            return [vocab.to_index(w) for w in x[field]]
        except ValueError:
            return [vocab.padding_idx for _ in x[field]]
    data.apply(func, new_field_name=name)


def load_seqtag(path, files, indexs):
    """Load CoNLL-style sequence-tagging data.

    ``files`` is expected as [train, dev, test]; both vocabularies are built
    from the first (training) split only.
    """
    word_h, tag_h = 'words', 'tags'
    loader = ConllLoader(headers=[word_h, tag_h], indexes=indexs)
    ds_list = []
    for fn in files:
        ds_list.append(loader.load(os.path.join(path, fn)))
    word_v = Vocabulary(min_freq=2)
    tag_v = Vocabulary(unknown=None)
    update_v(word_v, ds_list[0], word_h)
    update_v(tag_v, ds_list[0], tag_h)

    def process_data(ds):
        to_index(word_v, ds, word_h, C.INPUT)
        to_index(tag_v, ds, tag_h, C.TARGET)
        ds.apply(lambda x: x[C.INPUT][:MAX_LEN], new_field_name=C.INPUT)
        ds.apply(lambda x: x[C.TARGET][:MAX_LEN], new_field_name=C.TARGET)
        # Compute the length from the truncated input so it never exceeds
        # MAX_LEN (the original used the raw, untruncated word list here).
        ds.apply(lambda x: len(x[C.INPUT]), new_field_name=C.INPUT_LEN)
        ds.set_input(C.INPUT, C.INPUT_LEN)
        # seq_len is also a target so span-based metrics can mask padding.
        ds.set_target(C.TARGET, C.INPUT_LEN)

    for ds in ds_list:
        process_data(ds)
    return ds_list, word_v, tag_v


def load_sst(path, files):
    """Load SST; subtrees are used for the training split only."""
    loaders = [SSTLoader(subtree=sub, fine_grained=True)
               for sub in [True, False, False]]
    ds_list = [loader.load(os.path.join(path, fn))
               for fn, loader in zip(files, loaders)]
    word_v = Vocabulary(min_freq=0)
    tag_v = Vocabulary(unknown=None, padding=None)
    for ds in ds_list:
        ds.apply(lambda x: [w.lower() for w in x['words']],
                 new_field_name='words')
    # ds_list[0].drop(lambda x: len(x['words']) < 3)
    update_v(word_v, ds_list[0], 'words')
    update_v(word_v, ds_list[1], 'words')
    update_v(word_v, ds_list[2], 'words')
    ds_list[0].apply(lambda x: tag_v.add_word(x['target']),
                     new_field_name=None)

    def process_data(ds):
        to_index(word_v, ds, 'words', C.INPUT)
        ds.apply(lambda x: tag_v.to_index(x['target']),
                 new_field_name=C.TARGET)
        ds.apply(lambda x: x[C.INPUT][:MAX_LEN], new_field_name=C.INPUT)
        # Compute the length from the truncated input so it never exceeds
        # MAX_LEN.
        ds.apply(lambda x: len(x[C.INPUT]), new_field_name=C.INPUT_LEN)
        ds.set_input(C.INPUT, C.INPUT_LEN)
        ds.set_target(C.TARGET)

    for ds in ds_list:
        process_data(ds)
    return ds_list, word_v, tag_v


def load_snli(path, files):
    """Load SNLI; each instance has a premise (words1) and a hypothesis (words2)."""
    loader = SNLILoader()
    ds_list = [loader.load(os.path.join(path, f)) for f in files]
    word_v = Vocabulary(min_freq=2)
    tag_v = Vocabulary(unknown=None, padding=None)
    for ds in ds_list:
        ds.apply(lambda x: [w.lower() for w in x['words1']],
                 new_field_name='words1')
        ds.apply(lambda x: [w.lower() for w in x['words2']],
                 new_field_name='words2')
    update_v(word_v, ds_list[0], 'words1')
    update_v(word_v, ds_list[0], 'words2')
    ds_list[0].apply(lambda x: tag_v.add_word(x['target']),
                     new_field_name=None)

    def process_data(ds):
        to_index(word_v, ds, 'words1', C.INPUTS(0))
        to_index(word_v, ds, 'words2', C.INPUTS(1))
        ds.apply(lambda x: tag_v.to_index(x['target']),
                 new_field_name=C.TARGET)
        ds.apply(lambda x: x[C.INPUTS(0)][:MAX_LEN],
                 new_field_name=C.INPUTS(0))
        ds.apply(lambda x: x[C.INPUTS(1)][:MAX_LEN],
                 new_field_name=C.INPUTS(1))
        ds.apply(lambda x: len(x[C.INPUTS(0)]),
                 new_field_name=C.INPUT_LENS(0))
        ds.apply(lambda x: len(x[C.INPUTS(1)]),
                 new_field_name=C.INPUT_LENS(1))
        ds.set_input(C.INPUTS(0), C.INPUTS(1),
                     C.INPUT_LENS(0), C.INPUT_LENS(1))
        ds.set_target(C.TARGET)

    for ds in ds_list:
        process_data(ds)
    return ds_list, word_v, tag_v
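# A minimal usage sketch for the loaders above. The directory layout and
# file names are assumptions, not part of this module (adjust them to your
# local copies of the corpora); the ConllLoader column indexes depend on the
# specific CoNLL format you have:
#
#   ds_list, word_v, tag_v = load_seqtag(
#       'data/conll2003', ['train.txt', 'dev.txt', 'test.txt'], indexs=[0, 3])
#   train_ds, dev_ds, test_ds = ds_list
#
#   snli_list, snli_word_v, snli_tag_v = load_snli(
#       'data/snli_1.0',
#       ['snli_1.0_train.jsonl', 'snli_1.0_dev.jsonl', 'snli_1.0_test.jsonl'])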
class EmbedLoader:
    @staticmethod
    def parse_glove_line(line):
        line = line.split()
        if len(line) <= 2:
            raise RuntimeError(
                "Something went wrong when parsing the GloVe embedding: "
                "expected a word followed by its vector.")
        return line[0], line[1:]

    @staticmethod
    def str_list_2_vec(line):
        return torch.Tensor(list(map(float, line)))

    @staticmethod
    def fast_load_embedding(emb_dim, emb_file, vocab):
        """Fast-load a pre-trained embedding and combine it with the given vocabulary.

        The file is read line by line, so the full embedding never has to fit
        in memory at once.

        :param int emb_dim: the dimension of the embedding. Should be the same
            as the pre-trained embedding.
        :param str emb_file: the pre-trained embedding file path.
        :param Vocabulary vocab: a mapping from word to index, which can be
            provided by the user or built from the pre-trained embedding.
        :return numpy.ndarray embedding_matrix: of shape (len(vocab), emb_dim)
        """
        if vocab is None:
            raise RuntimeError("You must provide a vocabulary.")
        embedding_matrix = np.zeros(
            shape=(len(vocab), emb_dim), dtype=np.float32)
        hit_flags = np.zeros(shape=(len(vocab),), dtype=int)
        with open(emb_file, "r", encoding="utf-8") as f:
            # Some formats (e.g. word2vec text) start with a "<count> <dim>"
            # header line; skip it. Otherwise rewind and treat the first line
            # as a regular vector line.
            startline = f.readline()
            if len(startline.split()) > 2:
                f.seek(0)
            for line in f:
                word, vector = EmbedLoader.parse_glove_line(line)
                try:
                    if word in vocab:
                        vector = EmbedLoader.str_list_2_vec(vector)
                        if emb_dim != vector.size(0):
                            continue
                        embedding_matrix[vocab[word]] = vector
                        hit_flags[vocab[word]] = 1
                except Exception:
                    continue

        if np.sum(hit_flags) < len(vocab):
            # Some words in the vocabulary are missing from the pre-trained
            # embedding; initialize them with small uniform noise. An
            # alternative is to sample each dimension from a normal
            # distribution fitted to the hit vectors:
            # vocab_embed = embedding_matrix[np.where(hit_flags)]
            # sampled_vectors = np.random.normal(
            #     vocab_embed.mean(axis=0), vocab_embed.std(axis=0),
            #     size=(len(vocab) - np.sum(hit_flags), emb_dim))
            sampled_vectors = np.random.uniform(
                -0.01, 0.01, size=(len(vocab) - np.sum(hit_flags), emb_dim))
            embedding_matrix[np.where(1 - hit_flags)] = sampled_vectors
        return embedding_matrix
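
# A minimal end-to-end sketch tying the pieces together. The data path,
# split file names, and GloVe file below are assumptions (adjust them to
# your setup); emb_dim must match the dimension of the embedding file.
if __name__ == '__main__':
    ds_list, word_v, tag_v = load_sst(
        'data/sst', ['train.txt', 'dev.txt', 'test.txt'])
    emb = EmbedLoader.fast_load_embedding(
        emb_dim=300, emb_file='glove.840B.300d.txt', vocab=word_v)
    # Wrap the matrix in a trainable nn.Embedding layer for a model.
    embed = torch.nn.Embedding.from_pretrained(
        torch.from_numpy(emb), freeze=False)
    print(embed.weight.shape)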