| @@ -3,7 +3,7 @@ import torch | |||
| from fastNLP.core.action import Batchifier, SequentialSampler | |||
| from fastNLP.core.action import convert_to_torch_tensor | |||
| from fastNLP.loader.preprocess import load_pickle, DEFAULT_UNKNOWN_LABEL | |||
| from fastNLP.core.preprocess import load_pickle, DEFAULT_UNKNOWN_LABEL | |||
| from fastNLP.modules import utils | |||
| @@ -0,0 +1,310 @@ | |||
| import _pickle | |||
| import os | |||
| import numpy as np | |||
| DEFAULT_PADDING_LABEL = '<pad>' # dict index = 0 | |||
| DEFAULT_UNKNOWN_LABEL = '<unk>' # dict index = 1 | |||
| DEFAULT_RESERVED_LABEL = ['<reserved-2>', | |||
| '<reserved-3>', | |||
| '<reserved-4>'] # dict index = 2~4 | |||
| DEFAULT_WORD_TO_INDEX = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1, | |||
| DEFAULT_RESERVED_LABEL[0]: 2, DEFAULT_RESERVED_LABEL[1]: 3, | |||
| DEFAULT_RESERVED_LABEL[2]: 4} | |||
| # the first vocab in dict with the index = 5 | |||
| def save_pickle(obj, pickle_path, file_name): | |||
| with open(os.path.join(pickle_path, file_name), "wb") as f: | |||
| _pickle.dump(obj, f) | |||
| print("{} saved. ".format(file_name)) | |||
| def load_pickle(pickle_path, file_name): | |||
| with open(os.path.join(pickle_path, file_name), "rb") as f: | |||
| obj = _pickle.load(f) | |||
| print("{} loaded. ".format(file_name)) | |||
| return obj | |||
| def pickle_exist(pickle_path, pickle_name): | |||
| """ | |||
| :param pickle_path: the directory of target pickle file | |||
| :param pickle_name: the filename of target pickle file | |||
| :return: True if file exists else False | |||
| """ | |||
| if not os.path.exists(pickle_path): | |||
| os.makedirs(pickle_path) | |||
| file_name = os.path.join(pickle_path, pickle_name) | |||
| if os.path.exists(file_name): | |||
| return True | |||
| else: | |||
| return False | |||
| class BasePreprocess(object): | |||
| def __init__(self): | |||
| self.word2index = None | |||
| self.label2index = None | |||
| @property | |||
| def vocab_size(self): | |||
| return len(self.word2index) | |||
| @property | |||
| def num_classes(self): | |||
| return len(self.label2index) | |||
| def run(self, train_dev_data, test_data=None, pickle_path="./", train_dev_split=0, cross_val=False, n_fold=10): | |||
| """Main preprocessing pipeline. | |||
| :param train_dev_data: three-level list, with either single label or multiple labels in a sample. | |||
| :param test_data: three-level list, with either single label or multiple labels in a sample. (optional) | |||
| :param pickle_path: str, the path to save the pickle files. | |||
| :param train_dev_split: float, between [0, 1]. The ratio of training data used as validation set. | |||
| :param cross_val: bool, whether to do cross validation. | |||
| :param n_fold: int, the number of folds of cross validation. Only useful when cross_val is True. | |||
| :return results: a tuple of datasets after preprocessing. | |||
| """ | |||
| if pickle_exist(pickle_path, "word2id.pkl") and pickle_exist(pickle_path, "class2id.pkl"): | |||
| self.word2index = load_pickle(pickle_path, "word2id.pkl") | |||
| self.label2index = load_pickle(pickle_path, "class2id.pkl") | |||
| else: | |||
| self.word2index, self.label2index = self.build_dict(train_dev_data) | |||
| save_pickle(self.word2index, pickle_path, "word2id.pkl") | |||
| save_pickle(self.label2index, pickle_path, "class2id.pkl") | |||
| if not pickle_exist(pickle_path, "id2word.pkl"): | |||
| index2word = self.build_reverse_dict(self.word2index) | |||
| save_pickle(index2word, pickle_path, "id2word.pkl") | |||
| if not pickle_exist(pickle_path, "id2class.pkl"): | |||
| index2label = self.build_reverse_dict(self.label2index) | |||
| save_pickle(index2label, pickle_path, "id2class.pkl") | |||
| data_train = [] | |||
| data_dev = [] | |||
| if not cross_val: | |||
| if not pickle_exist(pickle_path, "data_train.pkl"): | |||
| data_train.extend(self.to_index(train_dev_data)) | |||
| if train_dev_split > 0 and not pickle_exist(pickle_path, "data_dev.pkl"): | |||
| split = int(len(data_train) * train_dev_split) | |||
| data_dev = data_train[: split] | |||
| data_train = data_train[split:] | |||
| save_pickle(data_dev, pickle_path, "data_dev.pkl") | |||
| print("{} of the training data is split for validation. ".format(train_dev_split)) | |||
| save_pickle(data_train, pickle_path, "data_train.pkl") | |||
| else: | |||
| data_train = load_pickle(pickle_path, "data_train.pkl") | |||
| else: | |||
| # cross_val is True | |||
| if not pickle_exist(pickle_path, "data_train_0.pkl"): | |||
| # cross validation | |||
| data_idx = self.to_index(train_dev_data) | |||
| data_cv = self.cv_split(data_idx, n_fold) | |||
| for i, (data_train_cv, data_dev_cv) in enumerate(data_cv): | |||
| save_pickle( | |||
| data_train_cv, pickle_path, | |||
| "data_train_{}.pkl".format(i)) | |||
| save_pickle( | |||
| data_dev_cv, pickle_path, | |||
| "data_dev_{}.pkl".format(i)) | |||
| data_train.append(data_train_cv) | |||
| data_dev.append(data_dev_cv) | |||
| print("{}-fold cross validation.".format(n_fold)) | |||
| else: | |||
| for i in range(n_fold): | |||
| data_train_cv = load_pickle(pickle_path, "data_train_{}.pkl".format(i)) | |||
| data_dev_cv = load_pickle(pickle_path, "data_dev_{}.pkl".format(i)) | |||
| data_train.append(data_train_cv) | |||
| data_dev.append(data_dev_cv) | |||
| # prepare test data if provided | |||
| data_test = [] | |||
| if test_data is not None: | |||
| if not pickle_exist(pickle_path, "data_test.pkl"): | |||
| data_test = self.to_index(test_data) | |||
| save_pickle(data_test, pickle_path, "data_test.pkl") | |||
| # return preprocessed results | |||
| results = [data_train] | |||
| if cross_val or train_dev_split > 0: | |||
| results.append(data_dev) | |||
| if test_data: | |||
| results.append(data_test) | |||
| if len(results) == 1: | |||
| return results[0] | |||
| else: | |||
| return tuple(results) | |||
| def build_dict(self, data): | |||
| raise NotImplementedError | |||
| def to_index(self, data): | |||
| raise NotImplementedError | |||
| def build_reverse_dict(self, word_dict): | |||
| id2word = {word_dict[w]: w for w in word_dict} | |||
| return id2word | |||
| def data_split(self, data, train_dev_split): | |||
| """Split data into train and dev set.""" | |||
| split = int(len(data) * train_dev_split) | |||
| data_dev = data[: split] | |||
| data_train = data[split:] | |||
| return data_train, data_dev | |||
| def cv_split(self, data, n_fold): | |||
| """Split data for cross validation.""" | |||
| data_copy = data.copy() | |||
| np.random.shuffle(data_copy) | |||
| fold_size = round(len(data_copy) / n_fold) | |||
| data_cv = [] | |||
| for i in range(n_fold - 1): | |||
| start = i * fold_size | |||
| end = (i + 1) * fold_size | |||
| data_dev = data_copy[start:end] | |||
| data_train = data_copy[:start] + data_copy[end:] | |||
| data_cv.append((data_train, data_dev)) | |||
| start = (n_fold - 1) * fold_size | |||
| data_dev = data_copy[start:] | |||
| data_train = data_copy[:start] | |||
| data_cv.append((data_train, data_dev)) | |||
| return data_cv | |||
| class SeqLabelPreprocess(BasePreprocess): | |||
| """Preprocess pipeline, including building mapping from words to index, from index to words, | |||
| from labels/classes to index, from index to labels/classes. | |||
| data of three-level list which have multiple labels in each sample. | |||
| [ | |||
| [ [word_11, word_12, ...], [label_1, label_1, ...] ], | |||
| [ [word_21, word_22, ...], [label_2, label_1, ...] ], | |||
| ... | |||
| ] | |||
| """ | |||
| def __init__(self): | |||
| super(SeqLabelPreprocess, self).__init__() | |||
| def build_dict(self, data): | |||
| """ | |||
| Add new words with indices into self.word_dict, new labels with indices into self.label_dict. | |||
| :param data: three-level list | |||
| [ | |||
| [ [word_11, word_12, ...], [label_1, label_1, ...] ], | |||
| [ [word_21, word_22, ...], [label_2, label_1, ...] ], | |||
| ... | |||
| ] | |||
| :return word2index: dict of {str, int} | |||
| label2index: dict of {str, int} | |||
| """ | |||
| # In seq labeling, both word seq and label seq need to be padded to the same length in a mini-batch. | |||
| label2index = DEFAULT_WORD_TO_INDEX.copy() | |||
| word2index = DEFAULT_WORD_TO_INDEX.copy() | |||
| for example in data: | |||
| for word, label in zip(example[0], example[1]): | |||
| if word not in word2index: | |||
| word2index[word] = len(word2index) | |||
| if label not in label2index: | |||
| label2index[label] = len(label2index) | |||
| return word2index, label2index | |||
| def to_index(self, data): | |||
| """ | |||
| Convert word strings and label strings into indices. | |||
| :param data: three-level list | |||
| [ | |||
| [ [word_11, word_12, ...], [label_1, label_1, ...] ], | |||
| [ [word_21, word_22, ...], [label_2, label_1, ...] ], | |||
| ... | |||
| ] | |||
| :return data_index: the same shape as data, but each string is replaced by its corresponding index | |||
| """ | |||
| data_index = [] | |||
| for example in data: | |||
| word_list = [] | |||
| label_list = [] | |||
| for word, label in zip(example[0], example[1]): | |||
| word_list.append(self.word2index.get(word, DEFAULT_WORD_TO_INDEX[DEFAULT_UNKNOWN_LABEL])) | |||
| label_list.append(self.label2index.get(label, DEFAULT_WORD_TO_INDEX[DEFAULT_UNKNOWN_LABEL])) | |||
| data_index.append([word_list, label_list]) | |||
| return data_index | |||
| class ClassPreprocess(BasePreprocess): | |||
| """ Preprocess pipeline for classification datasets. | |||
| Preprocess pipeline, including building mapping from words to index, from index to words, | |||
| from labels/classes to index, from index to labels/classes. | |||
| design for data of three-level list which has a single label in each sample. | |||
| [ | |||
| [ [word_11, word_12, ...], label_1 ], | |||
| [ [word_21, word_22, ...], label_2 ], | |||
| ... | |||
| ] | |||
| """ | |||
| def __init__(self): | |||
| super(ClassPreprocess, self).__init__() | |||
| def build_dict(self, data): | |||
| """Build vocabulary.""" | |||
| # build vocabulary from scratch if nothing exists | |||
| word2index = DEFAULT_WORD_TO_INDEX.copy() | |||
| label2index = DEFAULT_WORD_TO_INDEX.copy() | |||
| # collect every word and label | |||
| for sent, label in data: | |||
| if len(sent) <= 1: | |||
| continue | |||
| if label not in label2index: | |||
| label2index[label] = len(label2index) | |||
| for word in sent: | |||
| if word not in word2index: | |||
| word2index[word[0]] = len(word2index) | |||
| return word2index, label2index | |||
| def to_index(self, data): | |||
| """ | |||
| Convert word strings and label strings into indices. | |||
| :param data: three-level list | |||
| [ | |||
| [ [word_11, word_12, ...], label_1 ], | |||
| [ [word_21, word_22, ...], label_2 ], | |||
| ... | |||
| ] | |||
| :return data_index: the same shape as data, but each string is replaced by its corresponding index | |||
| """ | |||
| data_index = [] | |||
| for example in data: | |||
| word_list = [] | |||
| # example[0] is the word list, example[1] is the single label | |||
| for word in example[0]: | |||
| word_list.append(self.word2index.get(word, DEFAULT_WORD_TO_INDEX[DEFAULT_UNKNOWN_LABEL])) | |||
| label_index = self.label2index.get(example[1], DEFAULT_WORD_TO_INDEX[DEFAULT_UNKNOWN_LABEL]) | |||
| data_index.append([word_list, label_index]) | |||
| return data_index | |||
| def infer_preprocess(pickle_path, data): | |||
| """ | |||
| Preprocess over inference data. | |||
| Transform three-level list of strings into that of index. | |||
| [ | |||
| [word_11, word_12, ...], | |||
| [word_21, word_22, ...], | |||
| ... | |||
| ] | |||
| """ | |||
| word2index = load_pickle(pickle_path, "word2id.pkl") | |||
| data_index = [] | |||
| for example in data: | |||
| data_index.append([word2index.get(w, DEFAULT_UNKNOWN_LABEL) for w in example]) | |||
| return data_index | |||
| @@ -34,7 +34,7 @@ class BaseTester(object): | |||
| self.eval_history = [] | |||
| self.batch_output = [] | |||
| def test(self, network): | |||
| def test(self, network, dev_data): | |||
| if torch.cuda.is_available() and self.use_cuda: | |||
| self.model = network.cuda() | |||
| else: | |||
| @@ -45,8 +45,8 @@ class BaseTester(object): | |||
| self.eval_history.clear() | |||
| self.batch_output.clear() | |||
| dev_data = self.prepare_input(self.pickle_path) | |||
| logger.info("validation data loaded") | |||
| # dev_data = self.prepare_input(self.pickle_path) | |||
| # logger.info("validation data loaded") | |||
| iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=True)) | |||
| n_batches = len(dev_data) // self.batch_size | |||
| @@ -1,4 +1,5 @@ | |||
| import _pickle | |||
| import copy | |||
| import os | |||
| import time | |||
| from datetime import timedelta | |||
| @@ -52,9 +53,11 @@ class BaseTrainer(object): | |||
| self.loss_func = None | |||
| self.optimizer = None | |||
| def train(self, network): | |||
| def train(self, network, train_data, dev_data=None): | |||
| """General Training Steps | |||
| :param network: a model | |||
| :param train_data: three-level list, the training set. | |||
| :param dev_data: three-level list, the validation data (optional) | |||
| The method is framework independent. | |||
| Work by calling the following methods: | |||
| @@ -73,8 +76,8 @@ class BaseTrainer(object): | |||
| else: | |||
| self.model = network | |||
| data_train = self.load_train_data(self.pickle_path) | |||
| logger.info("training data loaded") | |||
| # train_data = self.load_train_data(self.pickle_path) | |||
| # logger.info("training data loaded") | |||
| # define tester over dev data | |||
| if self.validate: | |||
| @@ -88,8 +91,7 @@ class BaseTrainer(object): | |||
| logger.info("optimizer defined as {}".format(str(self.optimizer))) | |||
| # main training epochs | |||
| n_samples = len(data_train) | |||
| n_samples = len(train_data) | |||
| n_batches = n_samples // self.batch_size | |||
| n_print = 1 | |||
| start = time.time() | |||
| @@ -101,14 +103,14 @@ class BaseTrainer(object): | |||
| # turn on network training mode | |||
| self.mode(network, test=False) | |||
| # prepare mini-batch iterator | |||
| data_iterator = iter(Batchifier(RandomSampler(data_train), self.batch_size, drop_last=False)) | |||
| data_iterator = iter(Batchifier(RandomSampler(train_data), self.batch_size, drop_last=False)) | |||
| logger.info("prepared data iterator") | |||
| self._train_step(data_iterator, network, start=start, n_print=n_print, epoch=epoch) | |||
| if self.validate: | |||
| logger.info("validation started") | |||
| validator.test(network) | |||
| validator.test(network, dev_data) | |||
| if self.save_best_dev and self.best_eval_result(validator): | |||
| self.save_model(network) | |||
| @@ -139,6 +141,26 @@ class BaseTrainer(object): | |||
| logger.info(print_output) | |||
| step += 1 | |||
| def cross_validate(self, network, train_data_cv, dev_data_cv): | |||
| """Training with cross validation. | |||
| :param network: the model | |||
| :param train_data_cv: four-level list, of shape [num_folds, num_examples, 2, ?] | |||
| :param dev_data_cv: four-level list, of shape [num_folds, num_examples, 2, ?] | |||
| """ | |||
| if len(train_data_cv) != len(dev_data_cv): | |||
| logger.error("the number of folds in train and dev data unequals {}!={}".format(len(train_data_cv), | |||
| len(dev_data_cv))) | |||
| raise RuntimeError("the number of folds in train and dev data unequals") | |||
| n_fold = len(train_data_cv) | |||
| logger.info("perform {} folds cross validation.".format(n_fold)) | |||
| for i in range(n_fold): | |||
| print("CV:", i) | |||
| logger.info("running the {} of {} folds cross validation".format(i + 1, n_fold)) | |||
| network_copy = copy.deepcopy(network) | |||
| self.train(network_copy, train_data_cv[i], dev_data_cv[i]) | |||
| def load_train_data(self, pickle_path): | |||
| """ | |||
| For task-specific processing. | |||
| @@ -1,366 +0,0 @@ | |||
| import _pickle | |||
| import os | |||
| DEFAULT_PADDING_LABEL = '<pad>' # dict index = 0 | |||
| DEFAULT_UNKNOWN_LABEL = '<unk>' # dict index = 1 | |||
| DEFAULT_RESERVED_LABEL = ['<reserved-2>', | |||
| '<reserved-3>', | |||
| '<reserved-4>'] # dict index = 2~4 | |||
| DEFAULT_WORD_TO_INDEX = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1, | |||
| DEFAULT_RESERVED_LABEL[0]: 2, DEFAULT_RESERVED_LABEL[1]: 3, | |||
| DEFAULT_RESERVED_LABEL[2]: 4} | |||
| # the first vocab in dict with the index = 5 | |||
| def save_pickle(obj, pickle_path, file_name): | |||
| with open(os.path.join(pickle_path, file_name), "wb") as f: | |||
| _pickle.dump(obj, f) | |||
| print("{} saved. ".format(file_name)) | |||
| def load_pickle(pickle_path, file_name): | |||
| with open(os.path.join(pickle_path, file_name), "rb") as f: | |||
| obj = _pickle.load(f) | |||
| print("{} loaded. ".format(file_name)) | |||
| return obj | |||
| def pickle_exist(pickle_path, pickle_name): | |||
| """ | |||
| :param pickle_path: the directory of target pickle file | |||
| :param pickle_name: the filename of target pickle file | |||
| :return: True if file exists else False | |||
| """ | |||
| if not os.path.exists(pickle_path): | |||
| os.makedirs(pickle_path) | |||
| file_name = os.path.join(pickle_path, pickle_name) | |||
| if os.path.exists(file_name): | |||
| return True | |||
| else: | |||
| return False | |||
| class BasePreprocess(object): | |||
| def __init__(self, data, pickle_path): | |||
| super(BasePreprocess, self).__init__() | |||
| # self.data = data | |||
| self.pickle_path = pickle_path | |||
| if not self.pickle_path.endswith('/'): | |||
| self.pickle_path = self.pickle_path + '/' | |||
| class POSPreprocess(BasePreprocess): | |||
| """ | |||
| This class are used to preprocess the POS Tag datasets. | |||
| """ | |||
| def __init__(self, data, pickle_path="./", train_dev_split=0): | |||
| """ | |||
| Preprocess pipeline, including building mapping from words to index, from index to words, | |||
| from labels/classes to index, from index to labels/classes. | |||
| :param data: three-level list | |||
| [ | |||
| [ [word_11, word_12, ...], [label_1, label_1, ...] ], | |||
| [ [word_21, word_22, ...], [label_2, label_1, ...] ], | |||
| ... | |||
| ] | |||
| :param pickle_path: str, the directory to the pickle files. Default: "./" | |||
| :param train_dev_split: float in [0, 1]. The ratio of dev data split from training data. Default: 0. | |||
| """ | |||
| super(POSPreprocess, self).__init__(data, pickle_path) | |||
| self.pickle_path = pickle_path | |||
| if pickle_exist(pickle_path, "word2id.pkl") and pickle_exist(pickle_path, "class2id.pkl"): | |||
| self.word2index = load_pickle(self.pickle_path, "word2id.pkl") | |||
| self.label2index = load_pickle(self.pickle_path, "class2id.pkl") | |||
| else: | |||
| self.word2index, self.label2index = self.build_dict(data) | |||
| save_pickle(self.word2index, self.pickle_path, "word2id.pkl") | |||
| save_pickle(self.label2index, self.pickle_path, "class2id.pkl") | |||
| if not pickle_exist(pickle_path, "id2word.pkl"): | |||
| index2word = self.build_reverse_dict(self.word2index) | |||
| save_pickle(index2word, self.pickle_path, "id2word.pkl") | |||
| if not pickle_exist(pickle_path, "id2class.pkl"): | |||
| index2label = self.build_reverse_dict(self.label2index) | |||
| save_pickle(index2label, self.pickle_path, "id2class.pkl") | |||
| if not pickle_exist(pickle_path, "data_train.pkl"): | |||
| data_train = self.to_index(data) | |||
| if train_dev_split > 0 and not pickle_exist(pickle_path, "data_dev.pkl"): | |||
| split = int(len(data_train) * train_dev_split) | |||
| data_dev = data_train[: split] | |||
| data_train = data_train[split:] | |||
| save_pickle(data_dev, self.pickle_path, "data_dev.pkl") | |||
| print("{} of the training data is split for validation. ".format(train_dev_split)) | |||
| save_pickle(data_train, self.pickle_path, "data_train.pkl") | |||
| def build_dict(self, data): | |||
| """ | |||
| Add new words with indices into self.word_dict, new labels with indices into self.label_dict. | |||
| :param data: three-level list | |||
| [ | |||
| [ [word_11, word_12, ...], [label_1, label_1, ...] ], | |||
| [ [word_21, word_22, ...], [label_2, label_1, ...] ], | |||
| ... | |||
| ] | |||
| :return word2index: dict of {str, int} | |||
| label2index: dict of {str, int} | |||
| """ | |||
| # In seq labeling, both word seq and label seq need to be padded to the same length in a mini-batch. | |||
| label2index = DEFAULT_WORD_TO_INDEX.copy() | |||
| word2index = DEFAULT_WORD_TO_INDEX.copy() | |||
| for example in data: | |||
| for word, label in zip(example[0], example[1]): | |||
| if word not in word2index: | |||
| word2index[word] = len(word2index) | |||
| if label not in label2index: | |||
| label2index[label] = len(label2index) | |||
| return word2index, label2index | |||
| def build_reverse_dict(self, word_dict): | |||
| id2word = {word_dict[w]: w for w in word_dict} | |||
| return id2word | |||
| def to_index(self, data): | |||
| """ | |||
| Convert word strings and label strings into indices. | |||
| :param data: three-level list | |||
| [ | |||
| [ [word_11, word_12, ...], [label_1, label_1, ...] ], | |||
| [ [word_21, word_22, ...], [label_2, label_1, ...] ], | |||
| ... | |||
| ] | |||
| :return data_index: the shape of data, but each string is replaced by its corresponding index | |||
| """ | |||
| data_index = [] | |||
| for example in data: | |||
| word_list = [] | |||
| label_list = [] | |||
| for word, label in zip(example[0], example[1]): | |||
| word_list.append(self.word2index[word]) | |||
| label_list.append(self.label2index[label]) | |||
| data_index.append([word_list, label_list]) | |||
| return data_index | |||
| @property | |||
| def vocab_size(self): | |||
| return len(self.word2index) | |||
| @property | |||
| def num_classes(self): | |||
| return len(self.label2index) | |||
| class ClassPreprocess(BasePreprocess): | |||
| """ | |||
| Pre-process the classification datasets. | |||
| Params: | |||
| pickle_path - directory to save result of pre-processing | |||
| Saves: | |||
| word2id.pkl | |||
| id2word.pkl | |||
| class2id.pkl | |||
| id2class.pkl | |||
| embedding.pkl | |||
| data_train.pkl | |||
| data_dev.pkl | |||
| data_test.pkl | |||
| """ | |||
| def __init__(self, pickle_path): | |||
| # super(ClassPreprocess, self).__init__(data, pickle_path) | |||
| self.word_dict = None | |||
| self.label_dict = None | |||
| self.pickle_path = pickle_path # save directory | |||
| def process(self, data, save_name): | |||
| """ | |||
| Process data. | |||
| Params: | |||
| data - nested list, data = [sample1, sample2, ...], | |||
| sample = [sentence, label], sentence = [word1, word2, ...] | |||
| save_name - name of processed data, such as data_train.pkl | |||
| Returns: | |||
| vocab_size - vocabulary size | |||
| n_classes - number of classes | |||
| """ | |||
| self.build_dict(data) | |||
| self.word2id() | |||
| vocab_size = self.id2word() | |||
| self.class2id() | |||
| num_classes = self.id2class() | |||
| self.embedding() | |||
| self.data_generate(data, save_name) | |||
| return vocab_size, num_classes | |||
| def build_dict(self, data): | |||
| """Build vocabulary.""" | |||
| # just read if word2id.pkl and class2id.pkl exists | |||
| if self.pickle_exist("word2id.pkl") and \ | |||
| self.pickle_exist("class2id.pkl"): | |||
| file_name = os.path.join(self.pickle_path, "word2id.pkl") | |||
| with open(file_name, 'rb') as f: | |||
| self.word_dict = _pickle.load(f) | |||
| file_name = os.path.join(self.pickle_path, "class2id.pkl") | |||
| with open(file_name, 'rb') as f: | |||
| self.label_dict = _pickle.load(f) | |||
| return | |||
| # build vocabulary from scratch if nothing exists | |||
| self.word_dict = { | |||
| DEFAULT_PADDING_LABEL: 0, | |||
| DEFAULT_UNKNOWN_LABEL: 1, | |||
| DEFAULT_RESERVED_LABEL[0]: 2, | |||
| DEFAULT_RESERVED_LABEL[1]: 3, | |||
| DEFAULT_RESERVED_LABEL[2]: 4} | |||
| self.label_dict = {} | |||
| # collect every word and label | |||
| for sent, label in data: | |||
| if len(sent) <= 1: | |||
| continue | |||
| if label not in self.label_dict: | |||
| index = len(self.label_dict) | |||
| self.label_dict[label] = index | |||
| for word in sent: | |||
| if word not in self.word_dict: | |||
| index = len(self.word_dict) | |||
| self.word_dict[word[0]] = index | |||
| def pickle_exist(self, pickle_name): | |||
| """ | |||
| Check whether a pickle file exists. | |||
| Params | |||
| pickle_name: the filename of target pickle file | |||
| Return | |||
| True if file exists else False | |||
| """ | |||
| if not os.path.exists(self.pickle_path): | |||
| os.makedirs(self.pickle_path) | |||
| file_name = os.path.join(self.pickle_path, pickle_name) | |||
| if os.path.exists(file_name): | |||
| return True | |||
| else: | |||
| return False | |||
| def word2id(self): | |||
| """Save vocabulary of {word:id} mapping format.""" | |||
| # nothing will be done if word2id.pkl exists | |||
| if self.pickle_exist("word2id.pkl"): | |||
| return | |||
| file_name = os.path.join(self.pickle_path, "word2id.pkl") | |||
| with open(file_name, "wb") as f: | |||
| _pickle.dump(self.word_dict, f) | |||
| def id2word(self): | |||
| """Save vocabulary of {id:word} mapping format.""" | |||
| # nothing will be done if id2word.pkl exists | |||
| if self.pickle_exist("id2word.pkl"): | |||
| file_name = os.path.join(self.pickle_path, "id2word.pkl") | |||
| with open(file_name, 'rb') as f: | |||
| id2word_dict = _pickle.load(f) | |||
| return len(id2word_dict) | |||
| id2word_dict = {self.word_dict[w]: w for w in self.word_dict} | |||
| file_name = os.path.join(self.pickle_path, "id2word.pkl") | |||
| with open(file_name, "wb") as f: | |||
| _pickle.dump(id2word_dict, f) | |||
| return len(id2word_dict) | |||
| def class2id(self): | |||
| """Save mapping of {class:id}.""" | |||
| # nothing will be done if class2id.pkl exists | |||
| if self.pickle_exist("class2id.pkl"): | |||
| return | |||
| file_name = os.path.join(self.pickle_path, "class2id.pkl") | |||
| with open(file_name, "wb") as f: | |||
| _pickle.dump(self.label_dict, f) | |||
| def id2class(self): | |||
| """Save mapping of {id:class}.""" | |||
| # nothing will be done if id2class.pkl exists | |||
| if self.pickle_exist("id2class.pkl"): | |||
| file_name = os.path.join(self.pickle_path, "id2class.pkl") | |||
| with open(file_name, "rb") as f: | |||
| id2class_dict = _pickle.load(f) | |||
| return len(id2class_dict) | |||
| id2class_dict = {self.label_dict[c]: c for c in self.label_dict} | |||
| file_name = os.path.join(self.pickle_path, "id2class.pkl") | |||
| with open(file_name, "wb") as f: | |||
| _pickle.dump(id2class_dict, f) | |||
| return len(id2class_dict) | |||
| def embedding(self): | |||
| """Save embedding lookup table corresponding to vocabulary.""" | |||
| # nothing will be done if embedding.pkl exists | |||
| if self.pickle_exist("embedding.pkl"): | |||
| return | |||
| # retrieve vocabulary from pre-trained embedding (not implemented) | |||
| def data_generate(self, data_src, save_name): | |||
| """Convert dataset from text to digit.""" | |||
| # nothing will be done if file exists | |||
| save_path = os.path.join(self.pickle_path, save_name) | |||
| if os.path.exists(save_path): | |||
| return | |||
| data = [] | |||
| # for every sample | |||
| for sent, label in data_src: | |||
| if len(sent) <= 1: | |||
| continue | |||
| label_id = self.label_dict[label] # label id | |||
| sent_id = [] # sentence ids | |||
| for word in sent: | |||
| if word in self.word_dict: | |||
| sent_id.append(self.word_dict[word]) | |||
| else: | |||
| sent_id.append(self.word_dict[DEFAULT_UNKNOWN_LABEL]) | |||
| data.append([sent_id, label_id]) | |||
| # save data | |||
| with open(save_path, "wb") as f: | |||
| _pickle.dump(data, f) | |||
| class LMPreprocess(BasePreprocess): | |||
| def __init__(self, data, pickle_path): | |||
| super(LMPreprocess, self).__init__(data, pickle_path) | |||
| def infer_preprocess(pickle_path, data): | |||
| """ | |||
| Preprocess over inference data. | |||
| Transform three-level list of strings into that of index. | |||
| [ | |||
| [word_11, word_12, ...], | |||
| [word_21, word_22, ...], | |||
| ... | |||
| ] | |||
| """ | |||
| word2index = load_pickle(pickle_path, "word2id.pkl") | |||
| data_index = [] | |||
| for example in data: | |||
| data_index.append([word2index.get(w, DEFAULT_UNKNOWN_LABEL) for w in example]) | |||
| return data_index | |||
| @@ -1,13 +1,12 @@ | |||
| import os | |||
| import | |||
| import | |||
| import torch | |||
| import torch.nn as nn | |||
| .dataset as dst | |||
| from .model import CNN_text | |||
| from torch.autograd import Variable | |||
| from . import dataset as dst | |||
| from .model import CNN_text | |||
| # Hyper Parameters | |||
| batch_size = 50 | |||
| learning_rate = 0.0001 | |||
| @@ -5,7 +5,7 @@ sys.path.append("..") | |||
| from fastNLP.loader.config_loader import ConfigLoader, ConfigSection | |||
| from fastNLP.core.trainer import SeqLabelTrainer | |||
| from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader | |||
| from fastNLP.loader.preprocess import POSPreprocess, load_pickle | |||
| from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle | |||
| from fastNLP.saver.model_saver import ModelSaver | |||
| from fastNLP.loader.model_loader import ModelLoader | |||
| from fastNLP.core.tester import SeqLabelTester | |||
| @@ -48,7 +48,7 @@ def infer(): | |||
| print("Inference finished!") | |||
| def train(): | |||
| def train_test(): | |||
| # Config Loader | |||
| train_args = ConfigSection() | |||
| test_args = ConfigSection() | |||
| @@ -59,9 +59,10 @@ def train(): | |||
| train_data = loader.load_pku() | |||
| # Preprocessor | |||
| p = POSPreprocess(train_data, pickle_path, train_dev_split=0.3) | |||
| train_args["vocab_size"] = p.vocab_size | |||
| train_args["num_classes"] = p.num_classes | |||
| preprocess = SeqLabelPreprocess() | |||
| data_train, data_dev = preprocess.run(train_data, pickle_path=pickle_path, train_dev_split=0.3) | |||
| train_args["vocab_size"] = preprocess.vocab_size | |||
| train_args["num_classes"] = preprocess.num_classes | |||
| # Trainer | |||
| trainer = SeqLabelTrainer(train_args) | |||
| @@ -70,7 +71,7 @@ def train(): | |||
| model = SeqLabeling(train_args) | |||
| # Start training | |||
| trainer.train(model) | |||
| trainer.train(model, data_train, data_dev) | |||
| print("Training finished!") | |||
| # Saver | |||
| @@ -78,8 +79,11 @@ def train(): | |||
| saver.save_pytorch(model) | |||
| print("Model saved!") | |||
| # testing with validation set | |||
| test(data_dev) | |||
| def test(): | |||
| def test(test_data): | |||
| # Config Loader | |||
| train_args = ConfigSection() | |||
| ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args}) | |||
| @@ -99,7 +103,7 @@ def test(): | |||
| tester = SeqLabelTester(test_args) | |||
| # Start testing | |||
| tester.test(model) | |||
| tester.test(model, test_data) | |||
| # print test results | |||
| print(tester.show_matrices()) | |||
| @@ -107,4 +111,4 @@ def test(): | |||
| if __name__ == "__main__": | |||
| train() | |||
| train_test() | |||
| @@ -95,10 +95,10 @@ num_classes = 27 | |||
| [text_class] | |||
| epochs = 1 | |||
| batch_size = 10 | |||
| pickle_path = "./data_for_tests/" | |||
| pickle_path = "./save_path/" | |||
| validate = false | |||
| save_best_dev = false | |||
| model_saved_path = "./data_for_tests/" | |||
| model_saved_path = "./save_path/" | |||
| use_cuda = true | |||
| learn_rate = 1e-3 | |||
| momentum = 0.9 | |||
| @@ -4,9 +4,9 @@ import os | |||
| import numpy as np | |||
| import torch | |||
| from fastNLP.core.preprocess import SeqLabelPreprocess | |||
| from fastNLP.core.tester import SeqLabelTester | |||
| from fastNLP.core.trainer import SeqLabelTrainer | |||
| from fastNLP.loader.preprocess import POSPreprocess | |||
| from fastNLP.models.sequence_modeling import AdvSeqLabel | |||
| @@ -114,7 +114,8 @@ emb_path = "data_for_tests/emb50.txt" | |||
| save_path = "data_for_tests/" | |||
| if __name__ == "__main__": | |||
| data = data_load(data_path) | |||
| p = POSPreprocess(data, pickle_path=pick_path, train_dev_split=0.3) | |||
| preprocess = SeqLabelPreprocess() | |||
| data_train, data_dev = preprocess.run(data, pickle_path=pick_path, train_dev_split=0.3) | |||
| # emb = embedding_process(emb_path, p.word2index, 50, os.path.join(pick_path, "embedding.pkl")) | |||
| emb = None | |||
| args = {"epochs": 20, | |||
| @@ -125,13 +126,13 @@ if __name__ == "__main__": | |||
| "model_saved_path": save_path, | |||
| "use_cuda": True, | |||
| "vocab_size": p.vocab_size, | |||
| "num_classes": p.num_classes, | |||
| "vocab_size": preprocess.vocab_size, | |||
| "num_classes": preprocess.num_classes, | |||
| "word_emb_dim": 50, | |||
| "rnn_hidden_units": 100 | |||
| } | |||
| # emb = torch.Tensor(emb).float().cuda() | |||
| networks = AdvSeqLabel(args, emb) | |||
| trainer = MyNERTrainer(args) | |||
| trainer.train(network=networks) | |||
| trainer.train(networks, data_train, data_dev) | |||
| print("Training finished!") | |||
| @@ -0,0 +1,78 @@ | |||
| # python: 3.5 | |||
| # pytorch: 0.4 | |||
| ################ | |||
| # Test cross validation. | |||
| ################ | |||
| from fastNLP.loader.preprocess import ClassPreprocess | |||
| from fastNLP.core.predictor import ClassificationInfer | |||
| from fastNLP.core.trainer import ClassificationTrainer | |||
| from fastNLP.loader.dataset_loader import ClassDatasetLoader | |||
| from fastNLP.models.base_model import BaseModel | |||
| from fastNLP.modules import aggregation | |||
| from fastNLP.modules import encoder | |||
| class ClassificationModel(BaseModel): | |||
| """ | |||
| Simple text classification model based on CNN. | |||
| """ | |||
| def __init__(self, class_num, vocab_size): | |||
| super(ClassificationModel, self).__init__() | |||
| self.embed = encoder.Embedding(nums=vocab_size, dims=300) | |||
| self.conv = encoder.Conv( | |||
| in_channels=300, out_channels=100, kernel_size=3) | |||
| self.pool = aggregation.MaxPool() | |||
| self.output = encoder.Linear(input_size=100, output_size=class_num) | |||
| def forward(self, x): | |||
| x = self.embed(x) # [N,L] -> [N,L,C] | |||
| x = self.conv(x) # [N,L,C_in] -> [N,L,C_out] | |||
| x = self.pool(x) # [N,L,C] -> [N,C] | |||
| x = self.output(x) # [N,C] -> [N, N_class] | |||
| return x | |||
| data_dir = 'data' # directory to save data and model | |||
| train_path = 'test/data_for_tests/text_classify.txt' # training set file | |||
| # load dataset | |||
| ds_loader = ClassDatasetLoader("train", train_path) | |||
| data = ds_loader.load() | |||
| # pre-process dataset | |||
| pre = ClassPreprocess(data, data_dir, cross_val=True, n_fold=5) | |||
| # pre = ClassPreprocess(data, data_dir) | |||
| n_classes = pre.num_classes | |||
| vocab_size = pre.vocab_size | |||
| # construct model | |||
| model_args = { | |||
| 'num_classes': n_classes, | |||
| 'vocab_size': vocab_size | |||
| } | |||
| model = ClassificationModel(class_num=n_classes, vocab_size=vocab_size) | |||
| # train model | |||
| train_args = { | |||
| "epochs": 10, | |||
| "batch_size": 50, | |||
| "pickle_path": data_dir, | |||
| "validate": False, | |||
| "save_best_dev": False, | |||
| "model_saved_path": None, | |||
| "use_cuda": True, | |||
| "learn_rate": 1e-3, | |||
| "momentum": 0.9} | |||
| trainer = ClassificationTrainer(train_args) | |||
| # trainer.train(model, ['data_train.pkl', 'data_dev.pkl']) | |||
| trainer.cross_validate(model) | |||
| # predict using model | |||
| data_infer = [x[0] for x in data] | |||
| infer = ClassificationInfer(data_dir) | |||
| labels_pred = infer.predict(model, data_infer) | |||
| @@ -5,7 +5,7 @@ sys.path.append("..") | |||
| from fastNLP.loader.config_loader import ConfigLoader, ConfigSection | |||
| from fastNLP.core.trainer import SeqLabelTrainer | |||
| from fastNLP.loader.dataset_loader import POSDatasetLoader, BaseLoader | |||
| from fastNLP.loader.preprocess import POSPreprocess, load_pickle | |||
| from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle | |||
| from fastNLP.saver.model_saver import ModelSaver | |||
| from fastNLP.loader.model_loader import ModelLoader | |||
| from fastNLP.core.tester import SeqLabelTester | |||
| @@ -14,7 +14,7 @@ from fastNLP.core.predictor import SeqLabelInfer | |||
| data_name = "people.txt" | |||
| data_path = "data_for_tests/people.txt" | |||
| pickle_path = "data_for_tests" | |||
| pickle_path = "seq_label/" | |||
| data_infer_path = "data_for_tests/people_infer.txt" | |||
| @@ -33,21 +33,12 @@ def infer(): | |||
| model = SeqLabeling(test_args) | |||
| # Dump trained parameters into the model | |||
| ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl") | |||
| ModelLoader.load_pytorch(model, pickle_path + "saved_model.pkl") | |||
| print("model loaded!") | |||
| # Data Loader | |||
| raw_data_loader = BaseLoader(data_name, data_infer_path) | |||
| infer_data = raw_data_loader.load_lines() | |||
| """ | |||
| Transform strings into list of list of strings. | |||
| [ | |||
| [word_11, word_12, ...], | |||
| [word_21, word_22, ...], | |||
| ... | |||
| ] | |||
| In this case, each line in "people_infer.txt" is already a sentence. So load_lines() just splits them. | |||
| """ | |||
| # Inference interface | |||
| infer = SeqLabelInfer(pickle_path) | |||
| @@ -68,7 +59,8 @@ def train_and_test(): | |||
| train_data = pos_loader.load_lines() | |||
| # Preprocessor | |||
| p = POSPreprocess(train_data, pickle_path, train_dev_split=0.5) | |||
| p = SeqLabelPreprocess() | |||
| data_train, data_dev = p.run(train_data, pickle_path=pickle_path, train_dev_split=0.5) | |||
| train_args["vocab_size"] = p.vocab_size | |||
| train_args["num_classes"] = p.num_classes | |||
| @@ -79,11 +71,11 @@ def train_and_test(): | |||
| model = SeqLabeling(train_args) | |||
| # Start training | |||
| trainer.train(model) | |||
| trainer.train(model, data_train, data_dev) | |||
| print("Training finished!") | |||
| # Saver | |||
| saver = ModelSaver("./data_for_tests/saved_model.pkl") | |||
| saver = ModelSaver(pickle_path + "saved_model.pkl") | |||
| saver.save_pytorch(model) | |||
| print("Model saved!") | |||
| @@ -93,7 +85,7 @@ def train_and_test(): | |||
| model = SeqLabeling(train_args) | |||
| # Dump trained parameters into the model | |||
| ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl") | |||
| ModelLoader.load_pytorch(model, pickle_path + "saved_model.pkl") | |||
| print("model loaded!") | |||
| # Load test configuration | |||
| @@ -103,8 +95,8 @@ def train_and_test(): | |||
| # Tester | |||
| tester = SeqLabelTester(test_args) | |||
| # Start testing | |||
| tester.test(model) | |||
| # Start testing with validation data | |||
| tester.test(model, data_dev) | |||
| # print test results | |||
| print(tester.show_matrices()) | |||
| @@ -5,7 +5,7 @@ sys.path.append("..") | |||
| from fastNLP.loader.config_loader import ConfigLoader, ConfigSection | |||
| from fastNLP.core.trainer import SeqLabelTrainer | |||
| from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader | |||
| from fastNLP.loader.preprocess import POSPreprocess, load_pickle | |||
| from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle | |||
| from fastNLP.saver.model_saver import ModelSaver | |||
| from fastNLP.loader.model_loader import ModelLoader | |||
| from fastNLP.core.tester import SeqLabelTester | |||
| @@ -68,7 +68,8 @@ def train_test(): | |||
| train_data = loader.load_pku() | |||
| # Preprocessor | |||
| p = POSPreprocess(train_data, pickle_path) | |||
| p = SeqLabelPreprocess() | |||
| data_train = p.run(train_data, pickle_path=pickle_path) | |||
| train_args["vocab_size"] = p.vocab_size | |||
| train_args["num_classes"] = p.num_classes | |||
| @@ -79,7 +80,7 @@ def train_test(): | |||
| model = SeqLabeling(train_args) | |||
| # Start training | |||
| trainer.train(model) | |||
| trainer.train(model, data_train) | |||
| print("Training finished!") | |||
| # Saver | |||
| @@ -104,7 +105,7 @@ def train_test(): | |||
| tester = SeqLabelTester(test_args) | |||
| # Start testing | |||
| tester.test(model) | |||
| tester.test(model, data_train) | |||
| # print test results | |||
| print(tester.show_matrices()) | |||
| @@ -1,7 +1,7 @@ | |||
| from fastNLP.core.preprocess import SeqLabelPreprocess | |||
| from fastNLP.core.tester import SeqLabelTester | |||
| from fastNLP.loader.config_loader import ConfigSection, ConfigLoader | |||
| from fastNLP.loader.dataset_loader import TokenizeDatasetLoader | |||
| from fastNLP.loader.preprocess import POSPreprocess | |||
| from fastNLP.models.sequence_modeling import SeqLabeling | |||
| data_name = "pku_training.utf8" | |||
| @@ -17,7 +17,7 @@ def foo(): | |||
| ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args}) | |||
| # Preprocessor | |||
| p = POSPreprocess(train_data, pickle_path) | |||
| p = SeqLabelPreprocess(train_data, pickle_path) | |||
| train_args["vocab_size"] = p.vocab_size | |||
| train_args["num_classes"] = p.num_classes | |||
| @@ -10,10 +10,11 @@ from fastNLP.core.trainer import ClassificationTrainer | |||
| from fastNLP.loader.config_loader import ConfigLoader, ConfigSection | |||
| from fastNLP.loader.dataset_loader import ClassDatasetLoader | |||
| from fastNLP.loader.model_loader import ModelLoader | |||
| from fastNLP.loader.preprocess import ClassPreprocess | |||
| from fastNLP.core.preprocess import ClassPreprocess | |||
| from fastNLP.models.cnn_text_classification import CNNText | |||
| from fastNLP.saver.model_saver import ModelSaver | |||
| save_path = "./test_classification/" | |||
| data_dir = "./data_for_tests/" | |||
| train_file = 'text_classify.txt' | |||
| model_name = "model_class.pkl" | |||
| @@ -27,8 +28,8 @@ def infer(): | |||
| unlabeled_data = [x[0] for x in data] | |||
| # pre-process data | |||
| pre = ClassPreprocess(data_dir) | |||
| vocab_size, n_classes = pre.process(data, "data_train.pkl") | |||
| pre = ClassPreprocess() | |||
| vocab_size, n_classes = pre.run(data, pickle_path=save_path) | |||
| print("vocabulary size:", vocab_size) | |||
| print("number of classes:", n_classes) | |||
| @@ -59,28 +60,28 @@ def train(): | |||
| print(data[0]) | |||
| # pre-process data | |||
| pre = ClassPreprocess(data_dir) | |||
| vocab_size, n_classes = pre.process(data, "data_train.pkl") | |||
| print("vocabulary size:", vocab_size) | |||
| print("number of classes:", n_classes) | |||
| pre = ClassPreprocess() | |||
| data_train = pre.run(data, pickle_path=save_path) | |||
| print("vocabulary size:", pre.vocab_size) | |||
| print("number of classes:", pre.num_classes) | |||
| # construct model | |||
| print("Building model...") | |||
| cnn = CNNText(model_args) | |||
| model = CNNText(model_args) | |||
| # train | |||
| print("Training...") | |||
| trainer = ClassificationTrainer(train_args) | |||
| trainer.train(cnn) | |||
| trainer.train(model, data_train) | |||
| print("Training finished!") | |||
| saver = ModelSaver("./data_for_tests/saved_model.pkl") | |||
| saver.save_pytorch(cnn) | |||
| saver.save_pytorch(model) | |||
| print("Model saved!") | |||
| if __name__ == "__main__": | |||
| # train() | |||
| infer() | |||
| train() | |||
| # infer() | |||