| @@ -6,7 +6,7 @@ | |||
|  | |||
| [](http://fastnlp.readthedocs.io/?badge=latest) | |||
| FastNLP is a modular Natural Language Processing system based on PyTorch, built for fast development of NLP models. | |||
| A deep learning NLP model is the composition of three types of modules: | |||
| <table> | |||
| @@ -58,6 +58,13 @@ Run the following commands to install fastNLP package. | |||
| pip install fastNLP | |||
| ``` | |||
| ## Models | |||
| fastNLP implements a variety of models for different NLP tasks. | |||
| Each model has been trained and tested carefully. | |||
| Check out the models' performance, usage and source code here: | |||
| - [Documentation](reproduction/) | |||
| - [Source Code](fastNLP/models/) | |||
| ## Project Structure | |||
| @@ -10,4 +10,4 @@ from .tester import Tester | |||
| from .trainer import Trainer | |||
| from .vocabulary import Vocabulary | |||
| from ..io.dataset_loader import DataSet | |||
| from .callback import Callback | |||
| @@ -1,9 +1,16 @@ | |||
| import numpy as np | |||
| import torch | |||
| import atexit | |||
| from fastNLP.core.sampler import RandomSampler | |||
| import torch.multiprocessing as mp | |||
| _python_is_exit = False | |||
| def _set_python_is_exit(): | |||
| global _python_is_exit | |||
| _python_is_exit = True | |||
| atexit.register(_set_python_is_exit) | |||
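| # The atexit hook above lets the prefetch worker in run_fetch() detect interpreter shutdown, | |||
| # so it stops retrying q.put() instead of blocking forever. | |||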
| class Batch(object): | |||
| """Batch is an iterable object which iterates over mini-batches. | |||
| @@ -97,12 +104,19 @@ def to_tensor(batch, dtype): | |||
| def run_fetch(batch, q): | |||
| global _python_is_exit | |||
| batch.init_iter() | |||
| # print('start fetch') | |||
| while 1: | |||
| res = batch.fetch_one() | |||
| # print('fetch one') | |||
| q.put(res) | |||
| while 1: | |||
| try: | |||
| q.put(res, timeout=3) | |||
| break | |||
| except Exception as e: | |||
| if _python_is_exit: | |||
| return | |||
| if res is None: | |||
| # print('fetch done, waiting processing') | |||
| q.join() | |||
| @@ -15,45 +15,57 @@ class Callback(object): | |||
| def __init__(self): | |||
| super(Callback, self).__init__() | |||
| self.trainer = None # reassigned inside Trainer | |||
| self._trainer = None # reassigned inside Trainer | |||
| @property | |||
| def n_epochs(self): | |||
| return self.trainer.n_epochs | |||
| def trainer(self): | |||
| return self._trainer | |||
| @property | |||
| def epoch(self): | |||
| return self.trainer.epoch | |||
| def step(self): | |||
| """current step number, in range(1, self.n_steps+1)""" | |||
| return self._trainer.step | |||
| @property | |||
| def n_steps(self): | |||
| return self.trainer.n_steps | |||
| @property | |||
| def step(self): | |||
| return self.trainer.step | |||
| """total number of steps for training""" | |||
| return self._trainer.n_steps | |||
| @property | |||
| def batch_size(self): | |||
| return self.trainer.batch_size | |||
| """batch size for training""" | |||
| return self._trainer.batch_size | |||
| @property | |||
| def model(self): | |||
| return self.trainer.model | |||
| def epoch(self): | |||
| """current epoch number, in range(1, self.n_epochs+1)""" | |||
| return self._trainer.epoch | |||
| @property | |||
| def pbar(self): | |||
| return self.trainer.pbar | |||
| def n_epochs(self): | |||
| """total number of epochs""" | |||
| return self._trainer.n_epochs | |||
| @property | |||
| def optimizer(self): | |||
| return self.trainer.optimizer | |||
| """torch.optim.Optimizer for current model""" | |||
| return self._trainer.optimizer | |||
| @property | |||
| def model(self): | |||
| """training model""" | |||
| return self._trainer.model | |||
| @property | |||
| def pbar(self): | |||
| """If use_tqdm, return trainer's tqdm print bar, else return None.""" | |||
| return self._trainer.pbar | |||
| def on_train_begin(self): | |||
| # before the main training loop | |||
| pass | |||
| def on_epoch_begin(self, cur_epoch, total_epoch): | |||
| def on_epoch_begin(self): | |||
| # at the beginning of each epoch | |||
| pass | |||
| @@ -65,14 +77,14 @@ class Callback(object): | |||
| # after data_forward, and before loss computation | |||
| pass | |||
| def on_backward_begin(self, loss, model): | |||
| def on_backward_begin(self, loss): | |||
| # after loss computation, and before gradient backward | |||
| pass | |||
| def on_backward_end(self, model): | |||
| def on_backward_end(self): | |||
| pass | |||
| def on_step_end(self, optimizer): | |||
| def on_step_end(self): | |||
| pass | |||
| def on_batch_end(self, *args): | |||
| @@ -94,39 +106,28 @@ class Callback(object): | |||
| """ | |||
| pass | |||
| def on_epoch_end(self, cur_epoch, n_epoch, optimizer): | |||
| def on_epoch_end(self): | |||
| """ | |||
| Called at the end of each epoch. | |||
| :param cur_epoch: int, the current epoch, starting from 1. | |||
| :param n_epoch: int, the total number of epochs. | |||
| :param optimizer: the optimizer passed to the Trainer. | |||
| :return: | |||
| """ | |||
| pass | |||
| def on_train_end(self, model): | |||
| def on_train_end(self): | |||
| """ | |||
| Called when training finishes. | |||
| :param model: nn.Module, the model passed to the Trainer | |||
| :return: | |||
| """ | |||
| pass | |||
| def on_exception(self, exception, model): | |||
| def on_exception(self, exception): | |||
| """ | |||
| Triggered when an exception occurs during training. | |||
| :param exception: an Exception of some type, e.g. KeyboardInterrupt | |||
| :param model: the model passed to the Trainer | |||
| :return: | |||
| """ | |||
| pass | |||
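For orientation, here is a minimal sketch (an editor's addition, not part of the original diff) of a user-defined callback written against the new argument-free hook signatures above; epoch and loss bookkeeping comes from the trainer-backed properties rather than from hook arguments.

```python
from fastNLP.core.callback import Callback

class LossLoggerCallback(Callback):
    """Accumulate the training loss of each epoch and print it when the epoch ends."""
    def __init__(self):
        super(LossLoggerCallback, self).__init__()
        self.epoch_loss = 0.0

    def on_epoch_begin(self):
        # reset the running loss at the start of every epoch
        self.epoch_loss = 0.0

    def on_backward_begin(self, loss):
        # `loss` is the only remaining hook argument; everything else is a property
        self.epoch_loss += loss.item()

    def on_epoch_end(self):
        print("Epoch {}/{} finished, accumulated loss {:.4f}".format(
            self.epoch, self.n_epochs, self.epoch_loss))
```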
| def transfer(func): | |||
| """装饰器,将对CallbackManager的调用转发到各个Callback子类. | |||
| :param func: | |||
| :return: | |||
| """ | |||
| @@ -150,7 +151,7 @@ class CallbackManager(Callback): | |||
| """ | |||
| :param dict env: The key is the name of the Trainer attribute(str). The value is the attribute itself. | |||
| :param Callback callbacks: | |||
| :param List[Callback] callbacks: | |||
| """ | |||
| super(CallbackManager, self).__init__() | |||
| # set attribute of trainer environment | |||
| @@ -168,14 +169,14 @@ class CallbackManager(Callback): | |||
| for env_name, env_val in env.items(): | |||
| for callback in self.callbacks: | |||
| setattr(callback, env_name, env_val) # Callback.trainer | |||
| setattr(callback, '_'+env_name, env_val) # e.g. Callback._trainer | |||
| @transfer | |||
| def on_train_begin(self): | |||
| pass | |||
| @transfer | |||
| def on_epoch_begin(self, cur_epoch, total_epoch): | |||
| def on_epoch_begin(self): | |||
| pass | |||
| @transfer | |||
| @@ -187,15 +188,15 @@ class CallbackManager(Callback): | |||
| pass | |||
| @transfer | |||
| def on_backward_begin(self, loss, model): | |||
| def on_backward_begin(self, loss): | |||
| pass | |||
| @transfer | |||
| def on_backward_end(self, model): | |||
| def on_backward_end(self): | |||
| pass | |||
| @transfer | |||
| def on_step_end(self, optimizer): | |||
| def on_step_end(self): | |||
| pass | |||
| @transfer | |||
| @@ -211,15 +212,15 @@ class CallbackManager(Callback): | |||
| pass | |||
| @transfer | |||
| def on_epoch_end(self, cur_epoch, n_epoch, optimizer): | |||
| def on_epoch_end(self): | |||
| pass | |||
| @transfer | |||
| def on_train_end(self, model): | |||
| def on_train_end(self): | |||
| pass | |||
| @transfer | |||
| def on_exception(self, exception, model): | |||
| def on_exception(self, exception): | |||
| pass | |||
| @@ -227,15 +228,15 @@ class DummyCallback(Callback): | |||
| def on_train_begin(self, *arg): | |||
| print(arg) | |||
| def on_epoch_end(self, cur_epoch, n_epoch, optimizer): | |||
| print(cur_epoch, n_epoch, optimizer) | |||
| def on_epoch_end(self): | |||
| print(self.epoch, self.n_epochs) | |||
| class EchoCallback(Callback): | |||
| def on_train_begin(self): | |||
| print("before_train") | |||
| def on_epoch_begin(self, cur_epoch, total_epoch): | |||
| def on_epoch_begin(self): | |||
| print("before_epoch") | |||
| def on_batch_begin(self, batch_x, batch_y, indices): | |||
| @@ -244,16 +245,16 @@ class EchoCallback(Callback): | |||
| def on_loss_begin(self, batch_y, predict_y): | |||
| print("before_loss") | |||
| def on_backward_begin(self, loss, model): | |||
| def on_backward_begin(self, loss): | |||
| print("before_backward") | |||
| def on_batch_end(self): | |||
| print("after_batch") | |||
| def on_epoch_end(self, cur_epoch, n_epoch, optimizer): | |||
| def on_epoch_end(self): | |||
| print("after_epoch") | |||
| def on_train_end(self, model): | |||
| def on_train_end(self): | |||
| print("after_train") | |||
| @@ -281,9 +282,9 @@ class GradientClipCallback(Callback): | |||
| self.parameters = parameters | |||
| self.clip_value = clip_value | |||
| def on_backward_end(self, model): | |||
| def on_backward_end(self): | |||
| if self.parameters is None: | |||
| self.clip_fun(model.parameters(), self.clip_value) | |||
| self.clip_fun(self.model.parameters(), self.clip_value) | |||
| else: | |||
| self.clip_fun(self.parameters, self.clip_value) | |||
| @@ -305,14 +306,11 @@ class EarlyStopCallback(Callback): | |||
| :param int patience: number of epochs to wait before stopping | |||
| """ | |||
| super(EarlyStopCallback, self).__init__() | |||
| self.trainer = None # override by CallbackManager | |||
| self.patience = patience | |||
| self.wait = 0 | |||
| self.epoch = 0 | |||
| def on_valid_end(self, eval_result, metric_key, optimizer): | |||
| self.epoch += 1 | |||
| if not self.trainer._better_eval_result(eval_result): | |||
| def on_valid_end(self, eval_result, metric_key, optimizer, is_better_eval): | |||
| if not is_better_eval: | |||
| # current result is getting worse | |||
| if self.wait == self.patience: | |||
| raise EarlyStopError("Early stopping raised.") | |||
| @@ -321,7 +319,7 @@ class EarlyStopCallback(Callback): | |||
| else: | |||
| self.wait = 0 | |||
| def on_exception(self, exception, model): | |||
| def on_exception(self, exception): | |||
| if isinstance(exception, EarlyStopError): | |||
| print("Early Stopping triggered in epoch {}!".format(self.epoch)) | |||
| else: | |||
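As a hedged usage sketch (editor's addition): callbacks such as the ones above are constructed by the user and handed to the Trainer via its `callbacks` argument, which the Trainer wraps in a CallbackManager (see the Trainer hunk further down). Only `patience`, `parameters` and `clip_value` are visible constructor arguments in these hunks; anything else is an assumption.

```python
from fastNLP.core.callback import EarlyStopCallback, GradientClipCallback

callbacks = [
    EarlyStopCallback(patience=3),         # stop after 3 epochs without dev improvement
    GradientClipCallback(clip_value=5.0),  # clips self.model.parameters() after each backward pass
]
# trainer = Trainer(train_data=..., model=..., callbacks=callbacks)  # other Trainer arguments omitted
```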
| @@ -341,7 +339,7 @@ class LRScheduler(Callback): | |||
| else: | |||
| raise ValueError(f"Expect torch.optim.lr_scheduler for LRScheduler. Got {type(lr_scheduler)}.") | |||
| def on_epoch_begin(self, cur_epoch, total_epoch): | |||
| def on_epoch_begin(self): | |||
| self.scheduler.step() | |||
| @@ -356,7 +354,7 @@ class ControlC(Callback): | |||
| raise ValueError("In KeyBoardInterrupt, quit_all arguemnt must be a bool.") | |||
| self.quit_all = quit_all | |||
| def on_exception(self, exception, model): | |||
| def on_exception(self, exception): | |||
| if isinstance(exception, KeyboardInterrupt): | |||
| if self.quit_all is True: | |||
| import sys | |||
| @@ -402,15 +400,15 @@ class LRFinder(Callback): | |||
| self.find = None | |||
| self.loader = ModelLoader() | |||
| def on_epoch_begin(self, cur_epoch, total_epoch): | |||
| if cur_epoch == 1: | |||
| def on_epoch_begin(self): | |||
| if self.epoch == 1: # first epoch | |||
| self.opt = self.trainer.optimizer # pytorch optimizer | |||
| self.opt.param_groups[0]["lr"] = self.start_lr | |||
| # save model | |||
| ModelSaver("tmp").save_pytorch(self.trainer.model, param_only=True) | |||
| self.find = True | |||
| def on_backward_begin(self, loss, model): | |||
| def on_backward_begin(self, loss): | |||
| if self.find: | |||
| if torch.isnan(loss) or self.stop is True: | |||
| self.stop = True | |||
| @@ -431,8 +429,8 @@ class LRFinder(Callback): | |||
| self.opt.param_groups[0]["lr"] = lr | |||
| # self.loader.load_pytorch(self.trainer.model, "tmp") | |||
| def on_epoch_end(self, cur_epoch, n_epoch, optimizer): | |||
| if cur_epoch == 1: | |||
| def on_epoch_end(self): | |||
| if self.epoch == 1: # first epoch | |||
| self.opt.param_groups[0]["lr"] = self.best_lr | |||
| self.find = False | |||
| # reset model | |||
| @@ -476,7 +474,7 @@ class TensorboardCallback(Callback): | |||
| # self._summary_writer.add_graph(self.trainer.model, torch.zeros(32, 2)) | |||
| self.graph_added = True | |||
| def on_backward_begin(self, loss, model): | |||
| def on_backward_begin(self, loss): | |||
| if "loss" in self.options: | |||
| self._summary_writer.add_scalar("loss", loss.item(), global_step=self.trainer.step) | |||
| @@ -488,18 +486,18 @@ class TensorboardCallback(Callback): | |||
| self._summary_writer.add_scalar(name + "_grad_mean", param.grad.mean(), | |||
| global_step=self.trainer.step) | |||
| def on_valid_end(self, eval_result, metric_key, optimizer): | |||
| def on_valid_end(self, eval_result, metric_key, optimizer, is_better_eval): | |||
| if "metric" in self.options: | |||
| for name, metric in eval_result.items(): | |||
| for metric_key, metric_val in metric.items(): | |||
| self._summary_writer.add_scalar("valid_{}_{}".format(name, metric_key), metric_val, | |||
| global_step=self.trainer.step) | |||
| def on_train_end(self, model): | |||
| def on_train_end(self): | |||
| self._summary_writer.close() | |||
| del self._summary_writer | |||
| def on_exception(self, exception, model): | |||
| def on_exception(self, exception): | |||
| if hasattr(self, "_summary_writer"): | |||
| self._summary_writer.close() | |||
| del self._summary_writer | |||
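A hedged sketch of using the callback above (editor's addition): only the "loss" and "metric" options appear in these hunks, and the variadic constructor shown below is an assumption.

```python
from fastNLP.core.callback import TensorboardCallback

tb_cb = TensorboardCallback("loss", "metric")  # log training loss and validation metrics to TensorBoard
# trainer = Trainer(..., callbacks=[tb_cb])    # other Trainer arguments omitted
```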
| @@ -507,5 +505,5 @@ class TensorboardCallback(Callback): | |||
| if __name__ == "__main__": | |||
| manager = CallbackManager(env={"n_epoch": 3}, callbacks=[DummyCallback(), DummyCallback()]) | |||
| manager.on_train_begin(10, 11, 12) | |||
| manager.on_train_begin() | |||
| # print(manager.after_epoch()) | |||
| @@ -127,6 +127,9 @@ class Trainer(object): | |||
| self.best_dev_perf = None | |||
| self.sampler = sampler if sampler is not None else RandomSampler() | |||
| self.prefetch = prefetch | |||
| self.callback_manager = CallbackManager(env={"trainer": self}, callbacks=callbacks) | |||
| self.n_steps = (len(self.train_data) // self.batch_size + int( | |||
| len(self.train_data) % self.batch_size != 0)) * self.n_epochs | |||
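| # i.e. n_steps = ceil(len(train_data) / batch_size) * n_epochs; e.g. 1000 samples, batch_size 32, 10 epochs -> 32 * 10 = 320 updates | |||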
| if isinstance(optimizer, torch.optim.Optimizer): | |||
| self.optimizer = optimizer | |||
| @@ -136,6 +139,7 @@ class Trainer(object): | |||
| self.optimizer = optimizer.construct_from_pytorch(self.model.parameters()) | |||
| self.use_tqdm = use_tqdm | |||
| self.pbar = None | |||
| self.print_every = abs(self.print_every) | |||
| if self.dev_data is not None: | |||
| @@ -209,9 +213,9 @@ class Trainer(object): | |||
| try: | |||
| self.callback_manager.on_train_begin() | |||
| self._train() | |||
| self.callback_manager.on_train_end(self.model) | |||
| self.callback_manager.on_train_end() | |||
| except (CallbackException, KeyboardInterrupt) as e: | |||
| self.callback_manager.on_exception(e, self.model) | |||
| self.callback_manager.on_exception(e) | |||
| if self.dev_data is not None and hasattr(self, 'best_dev_perf'): | |||
| print("\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step) + | |||
| @@ -238,18 +242,21 @@ class Trainer(object): | |||
| else: | |||
| inner_tqdm = tqdm | |||
| self.step = 0 | |||
| self.epoch = 0 | |||
| start = time.time() | |||
| total_steps = (len(self.train_data) // self.batch_size + int( | |||
| len(self.train_data) % self.batch_size != 0)) * self.n_epochs | |||
| with inner_tqdm(total=total_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar: | |||
| with inner_tqdm(total=self.n_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar: | |||
| self.pbar = pbar if isinstance(pbar, tqdm) else None | |||
| avg_loss = 0 | |||
| data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, | |||
| prefetch=self.prefetch) | |||
| for epoch in range(1, self.n_epochs+1): | |||
| self.epoch = epoch | |||
| pbar.set_description_str(desc="Epoch {}/{}".format(epoch, self.n_epochs)) | |||
| # early stopping | |||
| self.callback_manager.on_epoch_begin(epoch, self.n_epochs) | |||
| self.callback_manager.on_epoch_begin() | |||
| for batch_x, batch_y in data_iterator: | |||
| self.step += 1 | |||
| _move_dict_value_to_device(batch_x, batch_y, device=self._model_device) | |||
| indices = data_iterator.get_batch_indices() | |||
| # negative sampling; replace unknown; re-weight batch_y | |||
| @@ -263,12 +270,12 @@ class Trainer(object): | |||
| loss = loss/self.update_every | |||
| # Is loss NaN or inf? requires_grad = False | |||
| self.callback_manager.on_backward_begin(loss, self.model) | |||
| self.callback_manager.on_backward_begin(loss) | |||
| self._grad_backward(loss) | |||
| self.callback_manager.on_backward_end(self.model) | |||
| self.callback_manager.on_backward_end() | |||
| self._update() | |||
| self.callback_manager.on_step_end(self.optimizer) | |||
| self.callback_manager.on_step_end() | |||
| if (self.step+1) % self.print_every == 0: | |||
| avg_loss = avg_loss / self.print_every | |||
| @@ -282,7 +289,6 @@ class Trainer(object): | |||
| epoch, self.step, avg_loss, diff) | |||
| pbar.set_postfix_str(print_output) | |||
| avg_loss = 0 | |||
| self.step += 1 | |||
| self.callback_manager.on_batch_end() | |||
| if ((self.validate_every > 0 and self.step % self.validate_every == 0) or | |||
| @@ -290,16 +296,17 @@ class Trainer(object): | |||
| and self.dev_data is not None: | |||
| eval_res = self._do_validation(epoch=epoch, step=self.step) | |||
| eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, | |||
| total_steps) + \ | |||
| self.n_steps) + \ | |||
| self.tester._format_eval_results(eval_res) | |||
| pbar.write(eval_str + '\n') | |||
| # ================= mini-batch end ==================== # | |||
| # lr decay; early stopping | |||
| self.callback_manager.on_epoch_end(epoch, self.n_epochs, self.optimizer) | |||
| self.callback_manager.on_epoch_end() | |||
| # =============== epochs end =================== # | |||
| pbar.close() | |||
| self.pbar = None | |||
| # ============ tqdm end ============== # | |||
| def _do_validation(self, epoch, step): | |||
| @@ -1,4 +1,5 @@ | |||
| import os | |||
| import json | |||
| from fastNLP.core.dataset import DataSet | |||
| from fastNLP.core.instance import Instance | |||
| @@ -64,6 +65,53 @@ def convert_seq2seq_dataset(data): | |||
| return dataset | |||
| def download_from_url(url, path): | |||
| """Download a file from `url` and save it to `path`.""" | |||
| from tqdm import tqdm | |||
| import requests | |||
| r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, stream=True) | |||
| chunk_size = 16 * 1024 | |||
| total_size = int(r.headers.get('Content-length', 0)) | |||
| with open(path, "wb") as file ,\ | |||
| tqdm(total=total_size, unit='B', unit_scale=1, desc=path.split('/')[-1]) as t: | |||
| for chunk in r.iter_content(chunk_size): | |||
| if chunk: | |||
| file.write(chunk) | |||
| t.update(len(chunk)) | |||
| return | |||
| def uncompress(src, dst): | |||
| import zipfile, gzip, tarfile, os | |||
| def unzip(src, dst): | |||
| with zipfile.ZipFile(src, 'r') as f: | |||
| f.extractall(dst) | |||
| def ungz(src, dst): | |||
| with gzip.open(src, 'rb') as f, open(dst, 'wb') as uf: | |||
| length = 16 * 1024 # 16KB | |||
| buf = f.read(length) | |||
| while buf: | |||
| uf.write(buf) | |||
| buf = f.read(length) | |||
| def untar(src, dst): | |||
| with tarfile.open(src, 'r:gz') as f: | |||
| f.extractall(dst) | |||
| fn, ext = os.path.splitext(src) | |||
| _, ext_2 = os.path.splitext(fn) | |||
| if ext == '.zip': | |||
| unzip(src, dst) | |||
| elif ext == '.gz' and ext_2 != '.tar': | |||
| ungz(src, dst) | |||
| elif (ext == '.gz' and ext_2 == '.tar') or ext == '.tgz': | |||
| untar(src, dst) | |||
| else: | |||
| raise ValueError('unsupported file {}'.format(src)) | |||
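For illustration (editor's addition; the URL and file names below are placeholders, not from the source), the two helpers above are typically chained:

```python
# Download a corpus archive and unpack it; uncompress() dispatches on the extension
# (.zip / .gz / .tar.gz / .tgz).
data_url = "https://example.com/corpus.tar.gz"   # placeholder URL
download_from_url(data_url, "corpus.tar.gz")
uncompress("corpus.tar.gz", "corpus/")
```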
| class DataSetLoader: | |||
| """Interface for all DataSetLoaders. | |||
| @@ -290,41 +338,6 @@ class DummyClassificationReader(DataSetLoader): | |||
| return convert_seq2tag_dataset(data) | |||
| class ConllLoader(DataSetLoader): | |||
| """loader for conll format files""" | |||
| def __init__(self): | |||
| super(ConllLoader, self).__init__() | |||
| def load(self, data_path): | |||
| with open(data_path, "r", encoding="utf-8") as f: | |||
| lines = f.readlines() | |||
| data = self.parse(lines) | |||
| return self.convert(data) | |||
| @staticmethod | |||
| def parse(lines): | |||
| """ | |||
| :param list lines: a list containing all lines in a conll file. | |||
| :return: a 3D list | |||
| """ | |||
| sentences = list() | |||
| tokens = list() | |||
| for line in lines: | |||
| if line[0] == "#": | |||
| # skip the comments | |||
| continue | |||
| if line == "\n": | |||
| sentences.append(tokens) | |||
| tokens = [] | |||
| continue | |||
| tokens.append(line.split()) | |||
| return sentences | |||
| def convert(self, data): | |||
| pass | |||
| class DummyLMReader(DataSetLoader): | |||
| """A Dummy Language Model Dataset Reader | |||
| """ | |||
| @@ -434,51 +447,67 @@ class PeopleDailyCorpusLoader(DataSetLoader): | |||
| return data_set | |||
| class Conll2003Loader(DataSetLoader): | |||
| class ConllLoader: | |||
| def __init__(self, headers, indexs=None): | |||
| self.headers = headers | |||
| if indexs is None: | |||
| self.indexs = list(range(len(self.headers))) | |||
| else: | |||
| if len(indexs) != len(headers): | |||
| raise ValueError('the length of indexs should equal the length of headers') | |||
| self.indexs = indexs | |||
| def load(self, path): | |||
| datalist = [] | |||
| with open(path, 'r', encoding='utf-8') as f: | |||
| sample = [] | |||
| start = next(f) | |||
| if '-DOCSTART-' not in start: | |||
| sample.append(start.split()) | |||
| for line in f: | |||
| if line.startswith('\n'): | |||
| if len(sample): | |||
| datalist.append(sample) | |||
| sample = [] | |||
| elif line.startswith('#'): | |||
| continue | |||
| else: | |||
| sample.append(line.split()) | |||
| if len(sample) > 0: | |||
| datalist.append(sample) | |||
| data = [self.get_one(sample) for sample in datalist] | |||
| data = filter(lambda x: x is not None, data) | |||
| ds = DataSet() | |||
| for sample in data: | |||
| ins = Instance() | |||
| for name, idx in zip(self.headers, self.indexs): | |||
| ins.add_field(field_name=name, field=sample[idx]) | |||
| ds.append(ins) | |||
| return ds | |||
| def get_one(self, sample): | |||
| sample = list(map(list, zip(*sample))) | |||
| for field in sample: | |||
| if len(field) <= 0: | |||
| return None | |||
| return sample | |||
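A usage sketch (editor's addition; the file path is a placeholder). The headers mirror the ones that Conll2003Loader passes below:

```python
loader = ConllLoader(headers=['tokens', 'pos', 'chunks', 'ner'])
ds = loader.load('conll2003/train.txt')  # a DataSet with one field per header, one Instance per sentence
```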
| class Conll2003Loader(ConllLoader): | |||
| """Loader for conll2003 dataset | |||
| More information about the given dataset could be found at | |||
| https://sites.google.com/site/ermasoftware/getting-started/ne-tagging-conll2003-data | |||
| Deprecated. Use ConllLoader for all types of conll-format files. | |||
| """ | |||
| def __init__(self): | |||
| super(Conll2003Loader, self).__init__() | |||
| def load(self, dataset_path): | |||
| with open(dataset_path, "r", encoding="utf-8") as f: | |||
| lines = f.readlines() | |||
| parsed_data = [] | |||
| sentence = [] | |||
| tokens = [] | |||
| for line in lines: | |||
| if '-DOCSTART- -X- -X- O' in line or line == '\n': | |||
| if sentence != []: | |||
| parsed_data.append((sentence, tokens)) | |||
| sentence = [] | |||
| tokens = [] | |||
| continue | |||
| temp = line.strip().split(" ") | |||
| sentence.append(temp[0]) | |||
| tokens.append(temp[1:4]) | |||
| return self.convert(parsed_data) | |||
| def convert(self, parsed_data): | |||
| dataset = DataSet() | |||
| for sample in parsed_data: | |||
| label0_list = list(map( | |||
| lambda labels: labels[0], sample[1])) | |||
| label1_list = list(map( | |||
| lambda labels: labels[1], sample[1])) | |||
| label2_list = list(map( | |||
| lambda labels: labels[2], sample[1])) | |||
| dataset.append(Instance(tokens=sample[0], | |||
| pos=label0_list, | |||
| chucks=label1_list, | |||
| ner=label2_list)) | |||
| return dataset | |||
| headers = [ | |||
| 'tokens', 'pos', 'chunks', 'ner', | |||
| ] | |||
| super(Conll2003Loader, self).__init__(headers=headers) | |||
| class SNLIDataSetReader(DataSetLoader): | |||
| @@ -548,6 +577,7 @@ class SNLIDataSetReader(DataSetLoader): | |||
| class ConllCWSReader(object): | |||
| """Deprecated. Use ConllLoader for all types of conll-format files.""" | |||
| def __init__(self): | |||
| pass | |||
| @@ -700,6 +730,7 @@ def cut_long_sentence(sent, max_sample_length=200): | |||
| class ZhConllPOSReader(object): | |||
| """读取中文Conll格式。返回“字级别”的标签,使用BMES记号扩展原来的词级别标签。 | |||
| Deprecated. Use ConllLoader for all types of conll-format files. | |||
| """ | |||
| def __init__(self): | |||
| pass | |||
| @@ -778,10 +809,35 @@ class ZhConllPOSReader(object): | |||
| return text, pos_tags | |||
| class ConllxDataLoader(object): | |||
| class ConllxDataLoader(ConllLoader): | |||
| """返回“词级别”的标签信息,包括词、词性、(句法)头依赖、(句法)边标签。跟``ZhConllPOSReader``完全不同。 | |||
| Deprecated. Use ConllLoader for all types of conll-format files. | |||
| """ | |||
| def __init__(self): | |||
| headers = [ | |||
| 'words', 'pos_tags', 'heads', 'labels', | |||
| ] | |||
| indexs = [ | |||
| 1, 3, 6, 7, | |||
| ] | |||
| super(ConllxDataLoader, self).__init__(headers=headers, indexs=indexs) | |||
| class SSTLoader(DataSetLoader): | |||
| """load SST data in PTB tree format | |||
| data source: https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip | |||
| """ | |||
| def __init__(self, subtree=False, fine_grained=False): | |||
| self.subtree = subtree | |||
| tag_v = {'0':'very negative', '1':'negative', '2':'neutral', | |||
| '3':'positive', '4':'very positive'} | |||
| if not fine_grained: | |||
| tag_v['0'] = tag_v['1'] | |||
| tag_v['4'] = tag_v['3'] | |||
| self.tag_v = tag_v | |||
| def load(self, path): | |||
| """ | |||
| @@ -793,40 +849,47 @@ class ConllxDataLoader(object): | |||
| """ | |||
| datalist = [] | |||
| with open(path, 'r', encoding='utf-8') as f: | |||
| sample = [] | |||
| for line in f: | |||
| if line.startswith('\n'): | |||
| datalist.append(sample) | |||
| sample = [] | |||
| elif line.startswith('#'): | |||
| continue | |||
| else: | |||
| sample.append(line.split('\t')) | |||
| if len(sample) > 0: | |||
| datalist.append(sample) | |||
| datas = [] | |||
| for l in f: | |||
| datas.extend([(s, self.tag_v[t]) | |||
| for s, t in self.get_one(l, self.subtree)]) | |||
| ds = DataSet() | |||
| for words, tag in datas: | |||
| ds.append(Instance(words=words, raw_tag=tag)) | |||
| return ds | |||
| data = [self.get_one(sample) for sample in datalist] | |||
| data_list = list(filter(lambda x: x is not None, data)) | |||
| @staticmethod | |||
| def get_one(data, subtree): | |||
| from nltk.tree import Tree | |||
| tree = Tree.fromstring(data) | |||
| if subtree: | |||
| return [(t.leaves(), t.label()) for t in tree.subtrees()] | |||
| return [(tree.leaves(), tree.label())] | |||
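A usage sketch (editor's addition; the path is a placeholder, and nltk must be installed because `get_one` parses each line with `nltk.tree.Tree`):

```python
loader = SSTLoader(subtree=False, fine_grained=False)
ds = loader.load('trees/train.txt')  # each Instance carries `words` (token list) and `raw_tag`
```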
| class JsonLoader(DataSetLoader): | |||
| """Load json-format data, | |||
| every line contains a json obj, like a dict | |||
| fields is the dict key that need to be load | |||
| """ | |||
| def __init__(self, **fields): | |||
| super(JsonLoader, self).__init__() | |||
| self.fields = {} | |||
| for k, v in fields.items(): | |||
| self.fields[k] = k if v is None else v | |||
| def load(self, path): | |||
| with open(path, 'r', encoding='utf-8') as f: | |||
| datas = [json.loads(l) for l in f] | |||
| ds = DataSet() | |||
| for example in data_list: | |||
| ds.append(Instance(words=example[0], | |||
| pos_tags=example[1], | |||
| heads=example[2], | |||
| labels=example[3])) | |||
| for d in datas: | |||
| ins = Instance() | |||
| for k, v in d.items(): | |||
| if k in self.fields: | |||
| ins.add_field(self.fields[k], v) | |||
| ds.append(ins) | |||
| return ds | |||
| def get_one(self, sample): | |||
| sample = list(map(list, zip(*sample))) | |||
| if len(sample) == 0: | |||
| return None | |||
| for w in sample[7]: | |||
| if w == '_': | |||
| print('Error Sample {}'.format(sample)) | |||
| return None | |||
| # return word_seq, pos_seq, head_seq, head_tag_seq | |||
| return sample[1], sample[3], list(map(int, sample[6])), sample[7] | |||
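A usage sketch of JsonLoader (editor's addition; the field names and path are hypothetical SNLI-style examples): each keyword argument names a json key to load, and a non-None value renames the resulting field.

```python
loader = JsonLoader(sentence1=None, sentence2=None, gold_label='label')
ds = loader.load('snli_train.jsonl')  # fields: sentence1, sentence2, label
```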
| def add_seg_tag(data): | |||
| """ | |||
| @@ -848,3 +911,4 @@ def add_seg_tag(data): | |||
| new_sample.append((word[-1], 'E-' + pos)) | |||
| _processed.append(list(map(list, zip(*new_sample)))) | |||
| return _processed | |||
| @@ -0,0 +1,223 @@ | |||
| # Code Modified from https://github.com/carpedm20/ENAS-pytorch | |||
| """A module with NAS controller-related code.""" | |||
| import collections | |||
| import os | |||
| import torch | |||
| import torch.nn.functional as F | |||
| import fastNLP | |||
| import fastNLP.models.enas_utils as utils | |||
| from fastNLP.models.enas_utils import Node | |||
| def _construct_dags(prev_nodes, activations, func_names, num_blocks): | |||
| """Constructs a set of DAGs based on the actions, i.e., previous nodes and | |||
| activation functions, sampled from the controller/policy pi. | |||
| Args: | |||
| prev_nodes: Previous node actions from the policy. | |||
| activations: Activations sampled from the policy. | |||
| func_names: Mapping from activation function names to functions. | |||
| num_blocks: Number of blocks in the target RNN cell. | |||
| Returns: | |||
| A list of DAGs defined by the inputs. | |||
| RNN cell DAGs are represented in the following way: | |||
| 1. Each element (node) in a DAG is a list of `Node`s. | |||
| 2. The `Node`s in the list dag[i] correspond to the subsequent nodes | |||
| that take the output from node i as their own input. | |||
| 3. dag[-1] is the node that takes input from x^{(t)} and h^{(t - 1)}. | |||
| dag[-1] always feeds dag[0]. | |||
| dag[-1] acts as if `w_xc`, `w_hc`, `w_xh` and `w_hh` are its | |||
| weights. | |||
| 4. dag[N - 1] is the node that produces the hidden state passed to | |||
| the next timestep. dag[N - 1] is also always a leaf node, and therefore | |||
| is always averaged with the other leaf nodes and fed to the output | |||
| decoder. | |||
| """ | |||
| dags = [] | |||
| for nodes, func_ids in zip(prev_nodes, activations): | |||
| dag = collections.defaultdict(list) | |||
| # add first node | |||
| dag[-1] = [Node(0, func_names[func_ids[0]])] | |||
| dag[-2] = [Node(0, func_names[func_ids[0]])] | |||
| # add following nodes | |||
| for jdx, (idx, func_id) in enumerate(zip(nodes, func_ids[1:])): | |||
| dag[utils.to_item(idx)].append(Node(jdx + 1, func_names[func_id])) | |||
| leaf_nodes = set(range(num_blocks)) - dag.keys() | |||
| # merge with avg | |||
| for idx in leaf_nodes: | |||
| dag[idx] = [Node(num_blocks, 'avg')] | |||
| # This is actually y^{(t)}. h^{(t)} is node N - 1 in | |||
| # the graph, where N Is the number of nodes. I.e., h^{(t)} takes | |||
| # only one other node as its input. | |||
| # last h[t] node | |||
| last_node = Node(num_blocks + 1, 'h[t]') | |||
| dag[num_blocks] = [last_node] | |||
| dags.append(dag) | |||
| return dags | |||
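A concrete illustration of the DAG encoding described in the docstring above (editor's addition; the module path `fastNLP.models.enas_controller` and the assumption that `enas_utils.to_item` accepts plain Python ints are not stated in the source):

```python
from fastNLP.models.enas_controller import _construct_dags

func_names = ['tanh', 'ReLU', 'identity', 'sigmoid']
dags = _construct_dags(prev_nodes=[[0, 0, 1]],   # one sampled architecture
                       activations=[[0, 1, 2, 3]],
                       func_names=func_names,
                       num_blocks=4)
# dags[0][-1] == dags[0][-2] == [Node(0, 'tanh')]        # fed by x^{(t)} and h^{(t-1)}
# dags[0][0]  == [Node(1, 'ReLU'), Node(2, 'identity')]  # node 0 feeds nodes 1 and 2
# dags[0][1]  == [Node(3, 'sigmoid')]
# dags[0][2]  == [Node(4, 'avg')]                        # leaf nodes 2 and 3 are averaged
# dags[0][3]  == [Node(4, 'avg')]
# dags[0][4]  == [Node(5, 'h[t]')]                       # produces the next hidden state
```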
| class Controller(torch.nn.Module): | |||
| """Based on | |||
| https://github.com/pytorch/examples/blob/master/word_language_model/model.py | |||
| RL controllers do not necessarily have much to do with | |||
| language models. | |||
| Base the controller RNN on the GRU from: | |||
| https://github.com/ikostrikov/pytorch-a2c-ppo-acktr/blob/master/model.py | |||
| """ | |||
| def __init__(self, num_blocks=4, controller_hid=100, cuda=False): | |||
| torch.nn.Module.__init__(self) | |||
| # `num_tokens` here is just the activation function | |||
| # for every even step, | |||
| self.shared_rnn_activations = ['tanh', 'ReLU', 'identity', 'sigmoid'] | |||
| self.num_tokens = [len(self.shared_rnn_activations)] | |||
| self.controller_hid = controller_hid | |||
| self.use_cuda = cuda | |||
| self.num_blocks = num_blocks | |||
| for idx in range(num_blocks): | |||
| self.num_tokens += [idx + 1, len(self.shared_rnn_activations)] | |||
| self.func_names = self.shared_rnn_activations | |||
| num_total_tokens = sum(self.num_tokens) | |||
| self.encoder = torch.nn.Embedding(num_total_tokens, | |||
| controller_hid) | |||
| self.lstm = torch.nn.LSTMCell(controller_hid, controller_hid) | |||
| # Perhaps these weights in the decoder should be | |||
| # shared? At least for the activation functions, which all have the | |||
| # same size. | |||
| self.decoders = [] | |||
| for idx, size in enumerate(self.num_tokens): | |||
| decoder = torch.nn.Linear(controller_hid, size) | |||
| self.decoders.append(decoder) | |||
| self._decoders = torch.nn.ModuleList(self.decoders) | |||
| self.reset_parameters() | |||
| self.static_init_hidden = utils.keydefaultdict(self.init_hidden) | |||
| def _get_default_hidden(key): | |||
| return utils.get_variable( | |||
| torch.zeros(key, self.controller_hid), | |||
| self.use_cuda, | |||
| requires_grad=False) | |||
| self.static_inputs = utils.keydefaultdict(_get_default_hidden) | |||
| def reset_parameters(self): | |||
| init_range = 0.1 | |||
| for param in self.parameters(): | |||
| param.data.uniform_(-init_range, init_range) | |||
| for decoder in self.decoders: | |||
| decoder.bias.data.fill_(0) | |||
| def forward(self, # pylint:disable=arguments-differ | |||
| inputs, | |||
| hidden, | |||
| block_idx, | |||
| is_embed): | |||
| if not is_embed: | |||
| embed = self.encoder(inputs) | |||
| else: | |||
| embed = inputs | |||
| hx, cx = self.lstm(embed, hidden) | |||
| logits = self.decoders[block_idx](hx) | |||
| logits /= 5.0 | |||
| # # exploration | |||
| # if self.args.mode == 'train': | |||
| # logits = (2.5 * F.tanh(logits)) | |||
| return logits, (hx, cx) | |||
| def sample(self, batch_size=1, with_details=False, save_dir=None): | |||
| """Samples a set of `args.num_blocks` many computational nodes from the | |||
| controller, where each node is made up of an activation function, and | |||
| each node except the last also includes a previous node. | |||
| """ | |||
| if batch_size < 1: | |||
| raise Exception(f'Wrong batch_size: {batch_size} < 1') | |||
| # [B, L, H] | |||
| inputs = self.static_inputs[batch_size] | |||
| hidden = self.static_init_hidden[batch_size] | |||
| activations = [] | |||
| entropies = [] | |||
| log_probs = [] | |||
| prev_nodes = [] | |||
| # The RNN controller alternately outputs an activation, | |||
| # followed by a previous node, for each block except the last one, | |||
| # which only gets an activation function. The last node is the output | |||
| # node, and its previous node is the average of all leaf nodes. | |||
| for block_idx in range(2*(self.num_blocks - 1) + 1): | |||
| logits, hidden = self.forward(inputs, | |||
| hidden, | |||
| block_idx, | |||
| is_embed=(block_idx == 0)) | |||
| probs = F.softmax(logits, dim=-1) | |||
| log_prob = F.log_softmax(logits, dim=-1) | |||
| # .mean() for entropy? | |||
| entropy = -(log_prob * probs).sum(1, keepdim=False) | |||
| action = probs.multinomial(num_samples=1).data | |||
| selected_log_prob = log_prob.gather( | |||
| 1, utils.get_variable(action, requires_grad=False)) | |||
| # why the [:, 0] here? Should it be .squeeze(), or | |||
| # .view()? Same below with `action`. | |||
| entropies.append(entropy) | |||
| log_probs.append(selected_log_prob[:, 0]) | |||
| # 0: function, 1: previous node | |||
| mode = block_idx % 2 | |||
| inputs = utils.get_variable( | |||
| action[:, 0] + sum(self.num_tokens[:mode]), | |||
| requires_grad=False) | |||
| if mode == 0: | |||
| activations.append(action[:, 0]) | |||
| elif mode == 1: | |||
| prev_nodes.append(action[:, 0]) | |||
| prev_nodes = torch.stack(prev_nodes).transpose(0, 1) | |||
| activations = torch.stack(activations).transpose(0, 1) | |||
| dags = _construct_dags(prev_nodes, | |||
| activations, | |||
| self.func_names, | |||
| self.num_blocks) | |||
| if save_dir is not None: | |||
| for idx, dag in enumerate(dags): | |||
| utils.draw_network(dag, | |||
| os.path.join(save_dir, f'graph{idx}.png')) | |||
| if with_details: | |||
| return dags, torch.cat(log_probs), torch.cat(entropies) | |||
| return dags | |||
| def init_hidden(self, batch_size): | |||
| zeros = torch.zeros(batch_size, self.controller_hid) | |||
| return (utils.get_variable(zeros, self.use_cuda, requires_grad=False), | |||
| utils.get_variable(zeros.clone(), self.use_cuda, requires_grad=False)) | |||
| @@ -0,0 +1,388 @@ | |||
| # Code Modified from https://github.com/carpedm20/ENAS-pytorch | |||
| """Module containing the shared RNN model.""" | |||
| import numpy as np | |||
| import collections | |||
| import torch | |||
| from torch import nn | |||
| import torch.nn.functional as F | |||
| from torch.autograd import Variable | |||
| import fastNLP.models.enas_utils as utils | |||
| from fastNLP.models.base_model import BaseModel | |||
| import fastNLP.modules.encoder as encoder | |||
| def _get_dropped_weights(w_raw, dropout_p, is_training): | |||
| """Drops out weights to implement DropConnect. | |||
| Args: | |||
| w_raw: Full, pre-dropout, weights to be dropped out. | |||
| dropout_p: Proportion of weights to drop out. | |||
| is_training: True iff _shared_ model is training. | |||
| Returns: | |||
| The dropped weights. | |||
| Why does torch.nn.functional.dropout() return: | |||
| 1. `torch.autograd.Variable()` on the training loop | |||
| 2. `torch.nn.Parameter()` on the controller or eval loop, when | |||
| training = False... | |||
| Even though the call to `_setweights` in the Smerity repo's | |||
| `weight_drop.py` does not have this behaviour, and `F.dropout` always | |||
| returns `torch.autograd.Variable` there, even when `training=False`? | |||
| The above TODO is the reason for the hacky check for `torch.nn.Parameter`. | |||
| """ | |||
| dropped_w = F.dropout(w_raw, p=dropout_p, training=is_training) | |||
| if isinstance(dropped_w, torch.nn.Parameter): | |||
| dropped_w = dropped_w.clone() | |||
| return dropped_w | |||
| class EmbeddingDropout(torch.nn.Embedding): | |||
| """Class for dropping out embeddings by zero'ing out parameters in the | |||
| embedding matrix. | |||
| This is equivalent to dropping out particular words, e.g., in the sentence | |||
| 'the quick brown fox jumps over the lazy dog', dropping out 'the' would | |||
| lead to the sentence '### quick brown fox jumps over ### lazy dog' (in the | |||
| embedding vector space). | |||
| See 'A Theoretically Grounded Application of Dropout in Recurrent Neural | |||
| Networks', (Gal and Ghahramani, 2016). | |||
| """ | |||
| def __init__(self, | |||
| num_embeddings, | |||
| embedding_dim, | |||
| max_norm=None, | |||
| norm_type=2, | |||
| scale_grad_by_freq=False, | |||
| sparse=False, | |||
| dropout=0.1, | |||
| scale=None): | |||
| """Embedding constructor. | |||
| Args: | |||
| dropout: Dropout probability. | |||
| scale: Used to scale parameters of embedding weight matrix that are | |||
| not dropped out. Note that this is _in addition_ to the | |||
| `1/(1 - dropout)` scaling. | |||
| See `torch.nn.Embedding` for remaining arguments. | |||
| """ | |||
| torch.nn.Embedding.__init__(self, | |||
| num_embeddings=num_embeddings, | |||
| embedding_dim=embedding_dim, | |||
| max_norm=max_norm, | |||
| norm_type=norm_type, | |||
| scale_grad_by_freq=scale_grad_by_freq, | |||
| sparse=sparse) | |||
| self.dropout = dropout | |||
| assert (dropout >= 0.0) and (dropout < 1.0), ('Dropout must be >= 0.0 ' | |||
| 'and < 1.0') | |||
| self.scale = scale | |||
| def forward(self, inputs): # pylint:disable=arguments-differ | |||
| """Embeds `inputs` with the dropped out embedding weight matrix.""" | |||
| if self.training: | |||
| dropout = self.dropout | |||
| else: | |||
| dropout = 0 | |||
| if dropout: | |||
| mask = self.weight.data.new(self.weight.size(0), 1) | |||
| mask.bernoulli_(1 - dropout) | |||
| mask = mask.expand_as(self.weight) | |||
| mask = mask / (1 - dropout) | |||
| masked_weight = self.weight * Variable(mask) | |||
| else: | |||
| masked_weight = self.weight | |||
| if self.scale and self.scale != 1: | |||
| masked_weight = masked_weight * self.scale | |||
| return F.embedding(inputs, | |||
| masked_weight, | |||
| max_norm=self.max_norm, | |||
| norm_type=self.norm_type, | |||
| scale_grad_by_freq=self.scale_grad_by_freq, | |||
| sparse=self.sparse) | |||
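A small sketch of the word-level dropout this class implements (editor's addition): in training mode, whole embedding rows, i.e. whole word types, are zeroed and the remaining rows are rescaled by 1/(1 - dropout).

```python
import torch

emb = EmbeddingDropout(num_embeddings=100, embedding_dim=8, dropout=0.5)
emb.train()
out = emb(torch.tensor([[1, 2, 3, 1]]))       # a dropped id yields an all-zero row; both 1s share the same fate
emb.eval()
out_eval = emb(torch.tensor([[1, 2, 3, 1]]))  # no dropout at evaluation time
```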
| class LockedDropout(nn.Module): | |||
| # code from https://github.com/salesforce/awd-lstm-lm/blob/master/locked_dropout.py | |||
| def __init__(self): | |||
| super().__init__() | |||
| def forward(self, x, dropout=0.5): | |||
| if not self.training or not dropout: | |||
| return x | |||
| m = x.data.new(1, x.size(1), x.size(2)).bernoulli_(1 - dropout) | |||
| mask = Variable(m, requires_grad=False) / (1 - dropout) | |||
| mask = mask.expand_as(x) | |||
| return mask * x | |||
| class ENASModel(BaseModel): | |||
| """Shared RNN model.""" | |||
| def __init__(self, embed_num, num_classes, num_blocks=4, cuda=False, shared_hid=1000, shared_embed=1000): | |||
| super(ENASModel, self).__init__() | |||
| self.use_cuda = cuda | |||
| self.shared_hid = shared_hid | |||
| self.num_blocks = num_blocks | |||
| self.decoder = nn.Linear(self.shared_hid, num_classes) | |||
| self.encoder = EmbeddingDropout(embed_num, | |||
| shared_embed, | |||
| dropout=0.1) | |||
| self.lockdrop = LockedDropout() | |||
| self.dag = None | |||
| # Tie weights | |||
| # self.decoder.weight = self.encoder.weight | |||
| # Since W^{x, c} and W^{h, c} are always summed, there | |||
| # is no point duplicating their bias offset parameter. Likewise for | |||
| # W^{x, h} and W^{h, h}. | |||
| self.w_xc = nn.Linear(shared_embed, self.shared_hid) | |||
| self.w_xh = nn.Linear(shared_embed, self.shared_hid) | |||
| # The raw weights are stored here because the hidden-to-hidden weights | |||
| # are weight dropped on the forward pass. | |||
| self.w_hc_raw = torch.nn.Parameter( | |||
| torch.Tensor(self.shared_hid, self.shared_hid)) | |||
| self.w_hh_raw = torch.nn.Parameter( | |||
| torch.Tensor(self.shared_hid, self.shared_hid)) | |||
| self.w_hc = None | |||
| self.w_hh = None | |||
| self.w_h = collections.defaultdict(dict) | |||
| self.w_c = collections.defaultdict(dict) | |||
| for idx in range(self.num_blocks): | |||
| for jdx in range(idx + 1, self.num_blocks): | |||
| self.w_h[idx][jdx] = nn.Linear(self.shared_hid, | |||
| self.shared_hid, | |||
| bias=False) | |||
| self.w_c[idx][jdx] = nn.Linear(self.shared_hid, | |||
| self.shared_hid, | |||
| bias=False) | |||
| self._w_h = nn.ModuleList([self.w_h[idx][jdx] | |||
| for idx in self.w_h | |||
| for jdx in self.w_h[idx]]) | |||
| self._w_c = nn.ModuleList([self.w_c[idx][jdx] | |||
| for idx in self.w_c | |||
| for jdx in self.w_c[idx]]) | |||
| self.batch_norm = None | |||
| # if args.mode == 'train': | |||
| # self.batch_norm = nn.BatchNorm1d(self.shared_hid) | |||
| # else: | |||
| # self.batch_norm = None | |||
| self.reset_parameters() | |||
| self.static_init_hidden = utils.keydefaultdict(self.init_hidden) | |||
| def setDAG(self, dag): | |||
| if self.dag is None: | |||
| self.dag = dag | |||
| def forward(self, word_seq, hidden=None): | |||
| inputs = torch.transpose(word_seq, 0, 1) | |||
| time_steps = inputs.size(0) | |||
| batch_size = inputs.size(1) | |||
| self.w_hh = _get_dropped_weights(self.w_hh_raw, | |||
| 0.5, | |||
| self.training) | |||
| self.w_hc = _get_dropped_weights(self.w_hc_raw, | |||
| 0.5, | |||
| self.training) | |||
| # hidden = self.static_init_hidden[batch_size] if hidden is None else hidden | |||
| hidden = self.static_init_hidden[batch_size] | |||
| embed = self.encoder(inputs) | |||
| embed = self.lockdrop(embed, 0.65 if self.training else 0) | |||
| # The norm of hidden states are clipped here because | |||
| # otherwise ENAS is especially prone to exploding activations on the | |||
| # forward pass. This could probably be fixed in a more elegant way, but | |||
| # it might be exposing a weakness in the ENAS algorithm as currently | |||
| # proposed. | |||
| # | |||
| # For more details, see | |||
| # https://github.com/carpedm20/ENAS-pytorch/issues/6 | |||
| clipped_num = 0 | |||
| max_clipped_norm = 0 | |||
| h1tohT = [] | |||
| logits = [] | |||
| for step in range(time_steps): | |||
| x_t = embed[step] | |||
| logit, hidden = self.cell(x_t, hidden, self.dag) | |||
| hidden_norms = hidden.norm(dim=-1) | |||
| max_norm = 25.0 | |||
| if hidden_norms.data.max() > max_norm: | |||
| # Just directly use the torch slice operations | |||
| # in PyTorch v0.4. | |||
| # | |||
| # This workaround for PyTorch v0.3.1 does everything in numpy, | |||
| # because the PyTorch slicing and slice assignment is too | |||
| # flaky. | |||
| hidden_norms = hidden_norms.data.cpu().numpy() | |||
| clipped_num += 1 | |||
| if hidden_norms.max() > max_clipped_norm: | |||
| max_clipped_norm = hidden_norms.max() | |||
| clip_select = hidden_norms > max_norm | |||
| clip_norms = hidden_norms[clip_select] | |||
| mask = np.ones(hidden.size()) | |||
| normalizer = max_norm/clip_norms | |||
| normalizer = normalizer[:, np.newaxis] | |||
| mask[clip_select] = normalizer | |||
| if self.use_cuda: | |||
| hidden *= torch.autograd.Variable( | |||
| torch.FloatTensor(mask).cuda(), requires_grad=False) | |||
| else: | |||
| hidden *= torch.autograd.Variable( | |||
| torch.FloatTensor(mask), requires_grad=False) | |||
| logits.append(logit) | |||
| h1tohT.append(hidden) | |||
| h1tohT = torch.stack(h1tohT) | |||
| output = torch.stack(logits) | |||
| raw_output = output | |||
| output = self.lockdrop(output, 0.4 if self.training else 0) | |||
| #Pooling | |||
| output = torch.mean(output, 0) | |||
| decoded = self.decoder(output) | |||
| extra_out = {'dropped': decoded, | |||
| 'hiddens': h1tohT, | |||
| 'raw': raw_output} | |||
| return {'pred': decoded, 'hidden': hidden, 'extra_out': extra_out} | |||
| def cell(self, x, h_prev, dag): | |||
| """Computes a single pass through the discovered RNN cell.""" | |||
| c = {} | |||
| h = {} | |||
| f = {} | |||
| f[0] = self.get_f(dag[-1][0].name) | |||
| c[0] = torch.sigmoid(self.w_xc(x) + F.linear(h_prev, self.w_hc, None)) | |||
| h[0] = (c[0]*f[0](self.w_xh(x) + F.linear(h_prev, self.w_hh, None)) + | |||
| (1 - c[0])*h_prev) | |||
| leaf_node_ids = [] | |||
| q = collections.deque() | |||
| q.append(0) | |||
| # Computes connections from the parent nodes `node_id` | |||
| # to their child nodes `next_id` recursively, skipping leaf nodes. A | |||
| # leaf node is a node whose id == `self.num_blocks`. | |||
| # | |||
| # Connections between parent i and child j should be computed as | |||
| # h_j = c_j*f_{ij}{(W^h_{ij}*h_i)} + (1 - c_j)*h_i, | |||
| # where c_j = \sigmoid{(W^c_{ij}*h_i)} | |||
| # | |||
| # See Training details from Section 3.1 of the paper. | |||
| # | |||
| # The following algorithm does a breadth-first (since `q.popleft()` is | |||
| # used) search over the nodes and computes all the hidden states. | |||
| while True: | |||
| if len(q) == 0: | |||
| break | |||
| node_id = q.popleft() | |||
| nodes = dag[node_id] | |||
| for next_node in nodes: | |||
| next_id = next_node.id | |||
| if next_id == self.num_blocks: | |||
| leaf_node_ids.append(node_id) | |||
| assert len(nodes) == 1, ('parent of leaf node should have ' | |||
| 'only one child') | |||
| continue | |||
| w_h = self.w_h[node_id][next_id] | |||
| w_c = self.w_c[node_id][next_id] | |||
| f[next_id] = self.get_f(next_node.name) | |||
| c[next_id] = torch.sigmoid(w_c(h[node_id])) | |||
| h[next_id] = (c[next_id]*f[next_id](w_h(h[node_id])) + | |||
| (1 - c[next_id])*h[node_id]) | |||
| q.append(next_id) | |||
| # Instead of averaging loose ends, perhaps there should | |||
| # be a set of separate unshared weights for each "loose" connection | |||
| # between each node in a cell and the output. | |||
| # | |||
| # As it stands, all weights W^h_{ij} are doing double duty by | |||
| # connecting both from i to j, as well as from i to the output. | |||
| # average all the loose ends | |||
| leaf_nodes = [h[node_id] for node_id in leaf_node_ids] | |||
| output = torch.mean(torch.stack(leaf_nodes, 2), -1) | |||
| # stabilizing the Updates of omega | |||
| if self.batch_norm is not None: | |||
| output = self.batch_norm(output) | |||
| return output, h[self.num_blocks - 1] | |||
| def init_hidden(self, batch_size): | |||
| zeros = torch.zeros(batch_size, self.shared_hid) | |||
| return utils.get_variable(zeros, self.use_cuda, requires_grad=False) | |||
| def get_f(self, name): | |||
| name = name.lower() | |||
| if name == 'relu': | |||
| f = torch.relu | |||
| elif name == 'tanh': | |||
| f = torch.tanh | |||
| elif name == 'identity': | |||
| f = lambda x: x | |||
| elif name == 'sigmoid': | |||
| f = torch.sigmoid | |||
| return f | |||
| @property | |||
| def num_parameters(self): | |||
| def size(p): | |||
| return np.prod(p.size()) | |||
| return sum([size(param) for param in self.parameters()]) | |||
| def reset_parameters(self): | |||
| init_range = 0.025 | |||
| # init_range = 0.025 if self.args.mode == 'train' else 0.04 | |||
| for param in self.parameters(): | |||
| param.data.uniform_(-init_range, init_range) | |||
| self.decoder.bias.data.fill_(0) | |||
| def predict(self, word_seq): | |||
| """ | |||
| :param word_seq: torch.LongTensor, [batch_size, seq_len] | |||
| :return predict: dict of torch.LongTensor, [batch_size, seq_len] | |||
| """ | |||
| output = self(word_seq) | |||
| _, predict = output['pred'].max(dim=1) | |||
| return {'pred': predict} | |||
| @@ -0,0 +1,385 @@ | |||
| # Code Modified from https://github.com/carpedm20/ENAS-pytorch | |||
| import os | |||
| import time | |||
| from datetime import datetime | |||
| from datetime import timedelta | |||
| import numpy as np | |||
| import torch | |||
| import math | |||
| from torch import nn | |||
| try: | |||
| from tqdm.autonotebook import tqdm | |||
| except: | |||
| from fastNLP.core.utils import pseudo_tqdm as tqdm | |||
| from fastNLP.core.batch import Batch | |||
| from fastNLP.core.callback import CallbackManager, CallbackException | |||
| from fastNLP.core.dataset import DataSet | |||
| from fastNLP.core.utils import CheckError | |||
| from fastNLP.core.utils import _move_dict_value_to_device | |||
| import fastNLP | |||
| import fastNLP.models.enas_utils as utils | |||
| from fastNLP.core.utils import _build_args | |||
| from torch.optim import Adam | |||
| def _get_no_grad_ctx_mgr(): | |||
| """Returns a the `torch.no_grad` context manager for PyTorch version >= | |||
| 0.4, or a no-op context manager otherwise. | |||
| """ | |||
| return torch.no_grad() | |||
| class ENASTrainer(fastNLP.Trainer): | |||
| """A class to wrap training code.""" | |||
| def __init__(self, train_data, model, controller, **kwargs): | |||
| """Constructor for training algorithm. | |||
| :param DataSet train_data: the training data | |||
| :param torch.nn.modules.module model: a PyTorch model | |||
| :param torch.nn.modules.module controller: a PyTorch model | |||
| """ | |||
| self.final_epochs = kwargs['final_epochs'] | |||
| kwargs.pop('final_epochs') | |||
| super(ENASTrainer, self).__init__(train_data, model, **kwargs) | |||
| self.controller_step = 0 | |||
| self.shared_step = 0 | |||
| self.max_length = 35 | |||
| self.shared = model | |||
| self.controller = controller | |||
| self.shared_optim = Adam( | |||
| self.shared.parameters(), | |||
| lr=20.0, | |||
| weight_decay=1e-7) | |||
| self.controller_optim = Adam( | |||
| self.controller.parameters(), | |||
| lr=3.5e-4) | |||
| def train(self, load_best_model=True): | |||
| """ | |||
| :param bool load_best_model: only effective when dev_data was provided at initialization; if True, the trainer reloads | |||
| the parameters of the model that performed best on dev before returning. | |||
| :return results: a dict containing the following entries:: | |||
| seconds: float, the total training time | |||
| The following three entries are present only when dev_data is provided. | |||
| best_eval: Dict of Dict, the evaluation results | |||
| best_epoch: int, the epoch in which the best result was obtained | |||
| best_step: int, the step (batch update) at which the best result was obtained | |||
| """ | |||
| results = {} | |||
| if self.n_epochs <= 0: | |||
| print(f"training epoch is {self.n_epochs}, nothing was done.") | |||
| results['seconds'] = 0. | |||
| return results | |||
| try: | |||
| if torch.cuda.is_available() and self.use_cuda: | |||
| self.model = self.model.cuda() | |||
| self._model_device = self.model.parameters().__next__().device | |||
| self._mode(self.model, is_test=False) | |||
| self.start_time = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S')) | |||
| start_time = time.time() | |||
| print("training epochs started " + self.start_time, flush=True) | |||
| try: | |||
| self.callback_manager.on_train_begin() | |||
| self._train() | |||
| self.callback_manager.on_train_end() | |||
| except (CallbackException, KeyboardInterrupt) as e: | |||
| self.callback_manager.on_exception(e) | |||
| if self.dev_data is not None: | |||
| print("\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step) + | |||
| self.tester._format_eval_results(self.best_dev_perf),) | |||
| results['best_eval'] = self.best_dev_perf | |||
| results['best_epoch'] = self.best_dev_epoch | |||
| results['best_step'] = self.best_dev_step | |||
| if load_best_model: | |||
| model_name = "best_" + "_".join([self.model.__class__.__name__, self.metric_key, self.start_time]) | |||
| load_succeed = self._load_model(self.model, model_name) | |||
| if load_succeed: | |||
| print("Reloaded the best model.") | |||
| else: | |||
| print("Fail to reload best model.") | |||
| finally: | |||
| pass | |||
| results['seconds'] = round(time.time() - start_time, 2) | |||
| return results | |||
| def _train(self): | |||
| if not self.use_tqdm: | |||
| from fastNLP.core.utils import pseudo_tqdm as inner_tqdm | |||
| else: | |||
| inner_tqdm = tqdm | |||
| self.step = 0 | |||
| start = time.time() | |||
| total_steps = (len(self.train_data) // self.batch_size + int( | |||
| len(self.train_data) % self.batch_size != 0)) * self.n_epochs | |||
| with inner_tqdm(total=total_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar: | |||
| avg_loss = 0 | |||
| data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, | |||
| prefetch=self.prefetch) | |||
| for epoch in range(1, self.n_epochs+1): | |||
| pbar.set_description_str(desc="Epoch {}/{}".format(epoch, self.n_epochs)) | |||
| last_stage = (epoch > self.n_epochs + 1 - self.final_epochs) | |||
| if epoch == self.n_epochs + 1 - self.final_epochs: | |||
| print('Entering the final stage. (Only train the selected structure)') | |||
| # early stopping | |||
| self.callback_manager.on_epoch_begin() | |||
| # 1. Training the shared parameters omega of the child models | |||
| self.train_shared(pbar) | |||
| # 2. Training the controller parameters theta | |||
| if not last_stage: | |||
| self.train_controller() | |||
| if ((self.validate_every > 0 and self.step % self.validate_every == 0) or | |||
| (self.validate_every < 0 and self.step % len(data_iterator) == 0)) \ | |||
| and self.dev_data is not None: | |||
| if not last_stage: | |||
| self.derive() | |||
| eval_res = self._do_validation(epoch=epoch, step=self.step) | |||
| eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, | |||
| total_steps) + \ | |||
| self.tester._format_eval_results(eval_res) | |||
| pbar.write(eval_str) | |||
| # lr decay; early stopping | |||
| self.callback_manager.on_epoch_end() | |||
| # =============== epochs end =================== # | |||
| pbar.close() | |||
| # ============ tqdm end ============== # | |||
| def get_loss(self, inputs, targets, hidden, dags): | |||
| """Computes the loss for the same batch for M models. | |||
| This amounts to an estimate of the loss, which is turned into an | |||
| estimate for the gradients of the shared model. | |||
| """ | |||
| if not isinstance(dags, list): | |||
| dags = [dags] | |||
| loss = 0 | |||
| for dag in dags: | |||
| self.shared.setDAG(dag) | |||
| inputs = _build_args(self.shared.forward, **inputs) | |||
| inputs['hidden'] = hidden | |||
| result = self.shared(**inputs) | |||
| output, hidden, extra_out = result['pred'], result['hidden'], result['extra_out'] | |||
| self.callback_manager.on_loss_begin(targets, result) | |||
| sample_loss = self._compute_loss(result, targets) | |||
| loss += sample_loss | |||
| assert len(dags) == 1, 'there are multiple `hidden` for multiple `dags`' | |||
| return loss, hidden, extra_out | |||
| def train_shared(self, pbar=None, max_step=None, dag=None): | |||
| """Train the language model for 400 steps of minibatches of 64 | |||
| examples. | |||
| Args: | |||
| max_step: Used to run extra training steps as a warm-up. | |||
| dag: If not None, is used instead of calling sample(). | |||
| BPTT is truncated at 35 timesteps. | |||
| For each weight update, gradients are estimated by sampling M models | |||
| from the fixed controller policy, and averaging their gradients | |||
| computed on a batch of training data. | |||
| """ | |||
| model = self.shared | |||
| model.train() | |||
| self.controller.eval() | |||
| hidden = self.shared.init_hidden(self.batch_size) | |||
| abs_max_grad = 0 | |||
| abs_max_hidden_norm = 0 | |||
| step = 0 | |||
| raw_total_loss = 0 | |||
| total_loss = 0 | |||
| train_idx = 0 | |||
| avg_loss = 0 | |||
| data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, | |||
| prefetch=self.prefetch) | |||
| for batch_x, batch_y in data_iterator: | |||
| _move_dict_value_to_device(batch_x, batch_y, device=self._model_device) | |||
| indices = data_iterator.get_batch_indices() | |||
| # negative sampling; replace unknown; re-weight batch_y | |||
| self.callback_manager.on_batch_begin(batch_x, batch_y, indices) | |||
| # prediction = self._data_forward(self.model, batch_x) | |||
| dags = self.controller.sample(1) | |||
| inputs, targets = batch_x, batch_y | |||
| # self.callback_manager.on_loss_begin(batch_y, prediction) | |||
| loss, hidden, extra_out = self.get_loss(inputs, | |||
| targets, | |||
| hidden, | |||
| dags) | |||
| hidden.detach_() | |||
| avg_loss += loss.item() | |||
| # Is loss NaN or inf? requires_grad = False | |||
| self.callback_manager.on_backward_begin(loss) | |||
| self._grad_backward(loss) | |||
| self.callback_manager.on_backward_end() | |||
| self._update() | |||
| self.callback_manager.on_step_end() | |||
| if (self.step+1) % self.print_every == 0: | |||
| if self.use_tqdm: | |||
| print_output = "loss:{0:<6.5f}".format(avg_loss / self.print_every) | |||
| pbar.update(self.print_every) | |||
| else: | |||
| end = time.time() | |||
| diff = timedelta(seconds=round(end - start)) | |||
| print_output = "[epoch: {:>3} step: {:>4}] train loss: {:>4.6} time: {}".format( | |||
| epoch, self.step, avg_loss, diff) | |||
| pbar.set_postfix_str(print_output) | |||
| avg_loss = 0 | |||
| self.step += 1 | |||
| step += 1 | |||
| self.shared_step += 1 | |||
| self.callback_manager.on_batch_end() | |||
| # ================= mini-batch end ==================== # | |||
| def get_reward(self, dag, entropies, hidden, valid_idx=0): | |||
| """Computes the perplexity of a single sampled model on a minibatch of | |||
| validation data. | |||
| """ | |||
| if not isinstance(entropies, np.ndarray): | |||
| entropies = entropies.data.cpu().numpy() | |||
| data_iterator = Batch(self.dev_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, | |||
| prefetch=self.prefetch) | |||
| for inputs, targets in data_iterator: | |||
| valid_loss, hidden, _ = self.get_loss(inputs, targets, hidden, dag) | |||
| valid_loss = utils.to_item(valid_loss.data) | |||
| valid_ppl = math.exp(valid_loss) | |||
| R = 80 / valid_ppl | |||
| rewards = R + 1e-4 * entropies | |||
| return rewards, hidden | |||
| def train_controller(self): | |||
| """Fixes the shared parameters and updates the controller parameters. | |||
| The controller is updated with a score function gradient estimator | |||
| (i.e., REINFORCE), with the reward being c/valid_ppl, where valid_ppl | |||
| is computed on a minibatch of validation data. | |||
| A moving average baseline is used. | |||
| The controller is trained for 20 steps per epoch (each epoch runs the | |||
| Train Shared phase first, then the Train Controller phase). | |||
| """ | |||
| model = self.controller | |||
| model.train() | |||
| # Why can't we call shared.eval() here? Leads to loss | |||
| # being uniformly zero for the controller. | |||
| # self.shared.eval() | |||
| avg_reward_base = None | |||
| baseline = None | |||
| adv_history = [] | |||
| entropy_history = [] | |||
| reward_history = [] | |||
| hidden = self.shared.init_hidden(self.batch_size) | |||
| total_loss = 0 | |||
| valid_idx = 0 | |||
| for step in range(20): | |||
| # sample models | |||
| dags, log_probs, entropies = self.controller.sample( | |||
| with_details=True) | |||
| # calculate reward | |||
| np_entropies = entropies.data.cpu().numpy() | |||
| # No gradients should be backpropagated to the | |||
| # shared model during controller training, obviously. | |||
| with _get_no_grad_ctx_mgr(): | |||
| rewards, hidden = self.get_reward(dags, | |||
| np_entropies, | |||
| hidden, | |||
| valid_idx) | |||
| reward_history.extend(rewards) | |||
| entropy_history.extend(np_entropies) | |||
| # moving average baseline | |||
| if baseline is None: | |||
| baseline = rewards | |||
| else: | |||
| decay = 0.95 | |||
| baseline = decay * baseline + (1 - decay) * rewards | |||
| adv = rewards - baseline | |||
| adv_history.extend(adv) | |||
| # policy loss | |||
| loss = -log_probs*utils.get_variable(adv, | |||
| self.use_cuda, | |||
| requires_grad=False) | |||
| loss = loss.sum() # or loss.mean() | |||
| # update | |||
| self.controller_optim.zero_grad() | |||
| loss.backward() | |||
| self.controller_optim.step() | |||
| total_loss += utils.to_item(loss.data) | |||
| if ((step % 50) == 0) and (step > 0): | |||
| reward_history, adv_history, entropy_history = [], [], [] | |||
| total_loss = 0 | |||
| self.controller_step += 1 | |||
| # prev_valid_idx = valid_idx | |||
| # valid_idx = ((valid_idx + self.max_length) % | |||
| # (self.valid_data.size(0) - 1)) | |||
| # # Whenever we wrap around to the beginning of the | |||
| # # validation data, we reset the hidden states. | |||
| # if prev_valid_idx > valid_idx: | |||
| # hidden = self.shared.init_hidden(self.batch_size) | |||
| def derive(self, sample_num=10, valid_idx=0): | |||
| """We are always deriving based on the very first batch | |||
| of validation data? This seems wrong... | |||
| """ | |||
| hidden = self.shared.init_hidden(self.batch_size) | |||
| dags, _, entropies = self.controller.sample(sample_num, | |||
| with_details=True) | |||
| max_R = 0 | |||
| best_dag = None | |||
| for dag in dags: | |||
| R, _ = self.get_reward(dag, entropies, hidden, valid_idx) | |||
| if R.max() > max_R: | |||
| max_R = R.max() | |||
| best_dag = dag | |||
| self.model.setDAG(best_dag) | |||
| @@ -0,0 +1,56 @@ | |||
| # Code Modified from https://github.com/carpedm20/ENAS-pytorch | |||
| from __future__ import print_function | |||
| from collections import defaultdict | |||
| import collections | |||
| from datetime import datetime | |||
| import os | |||
| import json | |||
| import numpy as np | |||
| import torch | |||
| from torch.autograd import Variable | |||
| def detach(h): | |||
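| # recursively detach hidden states from the autograd graph (handles a Variable or a nested tuple of Variables) | |||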
| if type(h) == Variable: | |||
| return Variable(h.data) | |||
| else: | |||
| return tuple(detach(v) for v in h) | |||
| def get_variable(inputs, cuda=False, **kwargs): | |||
| if type(inputs) in [list, np.ndarray]: | |||
| inputs = torch.Tensor(inputs) | |||
| if cuda: | |||
| out = Variable(inputs.cuda(), **kwargs) | |||
| else: | |||
| out = Variable(inputs, **kwargs) | |||
| return out | |||
| def update_lr(optimizer, lr): | |||
| for param_group in optimizer.param_groups: | |||
| param_group['lr'] = lr | |||
| Node = collections.namedtuple('Node', ['id', 'name']) | |||
| class keydefaultdict(defaultdict): | |||
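| # defaultdict variant whose factory receives the missing key: d[key] = default_factory(key) | |||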
| def __missing__(self, key): | |||
| if self.default_factory is None: | |||
| raise KeyError(key) | |||
| else: | |||
| ret = self[key] = self.default_factory(key) | |||
| return ret | |||
| def to_item(x): | |||
| """Converts x, possibly scalar and possibly tensor, to a Python scalar.""" | |||
| if isinstance(x, (float, int)): | |||
| return x | |||
| if float(torch.__version__[0:3]) < 0.4: | |||
| assert (x.dim() == 1) and (len(x) == 1) | |||
| return x[0] | |||
| return x.item() | |||
| @@ -0,0 +1,181 @@ | |||
| from fastNLP.modules.encoder.star_transformer import StarTransformer | |||
| from fastNLP.core.utils import seq_lens_to_masks | |||
| import torch | |||
| from torch import nn | |||
| import torch.nn.functional as F | |||
| class StarTransEnc(nn.Module): | |||
| def __init__(self, vocab_size, emb_dim, | |||
| hidden_size, | |||
| num_layers, | |||
| num_head, | |||
| head_dim, | |||
| max_len, | |||
| emb_dropout, | |||
| dropout): | |||
| super(StarTransEnc, self).__init__() | |||
| self.emb_fc = nn.Linear(emb_dim, hidden_size) | |||
| self.emb_drop = nn.Dropout(emb_dropout) | |||
| self.embedding = nn.Embedding(vocab_size, emb_dim) | |||
| self.encoder = StarTransformer(hidden_size=hidden_size, | |||
| num_layers=num_layers, | |||
| num_head=num_head, | |||
| head_dim=head_dim, | |||
| dropout=dropout, | |||
| max_len=max_len) | |||
| def forward(self, x, mask): | |||
| x = self.embedding(x) | |||
| x = self.emb_fc(self.emb_drop(x)) | |||
| nodes, relay = self.encoder(x, mask) | |||
| return nodes, relay | |||
| class Cls(nn.Module): | |||
| def __init__(self, in_dim, num_cls, hid_dim, dropout=0.1): | |||
| super(Cls, self).__init__() | |||
| self.fc = nn.Sequential( | |||
| nn.Linear(in_dim, hid_dim), | |||
| nn.LeakyReLU(), | |||
| nn.Dropout(dropout), | |||
| nn.Linear(hid_dim, num_cls), | |||
| ) | |||
| def forward(self, x): | |||
| h = self.fc(x) | |||
| return h | |||
| class NLICls(nn.Module): | |||
| def __init__(self, in_dim, num_cls, hid_dim, dropout=0.1): | |||
| super(NLICls, self).__init__() | |||
| self.fc = nn.Sequential( | |||
| nn.Dropout(dropout), | |||
| nn.Linear(in_dim*4, hid_dim),  # 4 features: x1, x2, abs(x1-x2), x1*x2 | |||
| nn.LeakyReLU(), | |||
| nn.Dropout(dropout), | |||
| nn.Linear(hid_dim, num_cls), | |||
| ) | |||
| def forward(self, x1, x2): | |||
| x = torch.cat([x1, x2, torch.abs(x1-x2), x1*x2], 1) | |||
| h = self.fc(x) | |||
| return h | |||
| class STSeqLabel(nn.Module): | |||
| """star-transformer model for sequence labeling | |||
| """ | |||
| def __init__(self, vocab_size, emb_dim, num_cls, | |||
| hidden_size=300, | |||
| num_layers=4, | |||
| num_head=8, | |||
| head_dim=32, | |||
| max_len=512, | |||
| cls_hidden_size=600, | |||
| emb_dropout=0.1, | |||
| dropout=0.1,): | |||
| super(STSeqLabel, self).__init__() | |||
| self.enc = StarTransEnc(vocab_size=vocab_size, | |||
| emb_dim=emb_dim, | |||
| hidden_size=hidden_size, | |||
| num_layers=num_layers, | |||
| num_head=num_head, | |||
| head_dim=head_dim, | |||
| max_len=max_len, | |||
| emb_dropout=emb_dropout, | |||
| dropout=dropout) | |||
| self.cls = Cls(hidden_size, num_cls, cls_hidden_size) | |||
| def forward(self, word_seq, seq_lens): | |||
| mask = seq_lens_to_masks(seq_lens) | |||
| nodes, _ = self.enc(word_seq, mask) | |||
| output = self.cls(nodes) | |||
| output = output.transpose(1, 2)  # move the class dimension to dim 1 | |||
| return {'output': output} # [bsz, n_cls, seq_len] | |||
| def predict(self, word_seq, seq_lens): | |||
| y = self.forward(word_seq, seq_lens) | |||
| _, pred = y['output'].max(1) | |||
| return {'output': pred, 'seq_lens': seq_lens} | |||
| class STSeqCls(nn.Module): | |||
| """star-transformer model for sequence classification | |||
| """ | |||
| def __init__(self, vocab_size, emb_dim, num_cls, | |||
| hidden_size=300, | |||
| num_layers=4, | |||
| num_head=8, | |||
| head_dim=32, | |||
| max_len=512, | |||
| cls_hidden_size=600, | |||
| emb_dropout=0.1, | |||
| dropout=0.1,): | |||
| super(STSeqCls, self).__init__() | |||
| self.enc = StarTransEnc(vocab_size=vocab_size, | |||
| emb_dim=emb_dim, | |||
| hidden_size=hidden_size, | |||
| num_layers=num_layers, | |||
| num_head=num_head, | |||
| head_dim=head_dim, | |||
| max_len=max_len, | |||
| emb_dropout=emb_dropout, | |||
| dropout=dropout) | |||
| self.cls = Cls(hidden_size, num_cls, cls_hidden_size) | |||
| def forward(self, word_seq, seq_lens): | |||
| mask = seq_lens_to_masks(seq_lens) | |||
| nodes, relay = self.enc(word_seq, mask) | |||
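| # pool the sequence: average the relay node with the max over token states | |||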
| y = 0.5 * (relay + nodes.max(1)[0]) | |||
| output = self.cls(y) # [bsz, n_cls] | |||
| return {'output': output} | |||
| def predict(self, word_seq, seq_lens): | |||
| y = self.forward(word_seq, seq_lens) | |||
| _, pred = y['output'].max(1) | |||
| return {'output': pred} | |||
| class STNLICls(nn.Module): | |||
| """star-transformer model for NLI | |||
| """ | |||
| def __init__(self, vocab_size, emb_dim, num_cls, | |||
| hidden_size=300, | |||
| num_layers=4, | |||
| num_head=8, | |||
| head_dim=32, | |||
| max_len=512, | |||
| cls_hidden_size=600, | |||
| emb_dropout=0.1, | |||
| dropout=0.1,): | |||
| super(STNLICls, self).__init__() | |||
| self.enc = StarTransEnc(vocab_size=vocab_size, | |||
| emb_dim=emb_dim, | |||
| hidden_size=hidden_size, | |||
| num_layers=num_layers, | |||
| num_head=num_head, | |||
| head_dim=head_dim, | |||
| max_len=max_len, | |||
| emb_dropout=emb_dropout, | |||
| dropout=dropout) | |||
| self.cls = NLICls(hidden_size, num_cls, cls_hidden_size) | |||
| def forward(self, word_seq1, word_seq2, seq_lens1, seq_lens2): | |||
| mask1 = seq_lens_to_masks(seq_lens1) | |||
| mask2 = seq_lens_to_masks(seq_lens2) | |||
| def enc(seq, mask): | |||
| nodes, relay = self.enc(seq, mask) | |||
| return 0.5 * (relay + nodes.max(1)[0]) | |||
| y1 = enc(word_seq1, mask1) | |||
| y2 = enc(word_seq2, mask2) | |||
| output = self.cls(y1, y2) # [bsz, n_cls] | |||
| return {'output': output} | |||
| def predict(self, word_seq1, word_seq2, seq_lens1, seq_lens2): | |||
| y = self.forward(word_seq1, word_seq2, seq_lens1, seq_lens2) | |||
| _, pred = y['output'].max(1) | |||
| return {'output': pred} | |||
| @@ -0,0 +1,145 @@ | |||
| import torch | |||
| from torch import nn | |||
| from torch.nn import functional as F | |||
| import numpy as NP | |||
| class StarTransformer(nn.Module): | |||
| """Star-Transformer Encoder part。 | |||
| paper: https://arxiv.org/abs/1902.09113 | |||
| :param hidden_size: int, 输入维度的大小。同时也是输出维度的大小。 | |||
| :param num_layers: int, star-transformer的层数 | |||
| :param num_head: int,head的数量。 | |||
| :param head_dim: int, 每个head的维度大小。 | |||
| :param dropout: float dropout 概率 | |||
| :param max_len: int or None, 如果为int,输入序列的最大长度, | |||
| 模型会为属于序列加上position embedding。 | |||
| 若为None,忽略加上position embedding的步骤 | |||
| """ | |||
| def __init__(self, hidden_size, num_layers, num_head, head_dim, dropout=0.1, max_len=None): | |||
| super(StarTransformer, self).__init__() | |||
| self.iters = num_layers | |||
| self.norm = nn.ModuleList([nn.LayerNorm(hidden_size) for _ in range(self.iters)]) | |||
| self.ring_att = nn.ModuleList( | |||
| [MSA1(hidden_size, nhead=num_head, head_dim=head_dim, dropout=dropout) | |||
| for _ in range(self.iters)]) | |||
| self.star_att = nn.ModuleList( | |||
| [MSA2(hidden_size, nhead=num_head, head_dim=head_dim, dropout=dropout) | |||
| for _ in range(self.iters)]) | |||
| if max_len is not None: | |||
| self.pos_emb = nn.Embedding(max_len, hidden_size) | |||
| else: | |||
| self.pos_emb = None | |||
| def forward(self, data, mask): | |||
| """ | |||
| :param FloatTensor data: [batch, length, hidden] the input sequence | |||
| :param ByteTensor mask: [batch, length] the padding mask for input, in which padding pos is 0 | |||
| :return: [batch, length, hidden] the output sequence | |||
| [batch, hidden] the global relay node | |||
| """ | |||
| def norm_func(f, x): | |||
| # B, H, L, 1 | |||
| return f(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) | |||
| B, L, H = data.size() | |||
| mask = (mask == 0) # flip the mask for masked_fill_ | |||
| smask = torch.cat([torch.zeros(B, 1, ).byte().to(mask), mask], 1) | |||
| embs = data.permute(0, 2, 1)[:,:,:,None] # B H L 1 | |||
| if self.pos_emb: | |||
| P = self.pos_emb(torch.arange(L, dtype=torch.long, device=embs.device)\ | |||
| .view(1, L)).permute(0, 2, 1).contiguous()[:, :, :, None] # 1 H L 1 | |||
| embs = embs + P | |||
| nodes = embs | |||
| relay = embs.mean(2, keepdim=True) | |||
| ex_mask = mask[:, None, :, None].expand(B, H, L, 1) | |||
| r_embs = embs.view(B, H, 1, L) | |||
| for i in range(self.iters): | |||
| ax = torch.cat([r_embs, relay.expand(B, H, 1, L)], 2) | |||
| nodes = nodes + F.leaky_relu(self.ring_att[i](norm_func(self.norm[i], nodes), ax=ax)) | |||
| relay = F.leaky_relu(self.star_att[i](relay, torch.cat([relay, nodes], 2), smask)) | |||
| nodes = nodes.masked_fill_(ex_mask, 0) | |||
| nodes = nodes.view(B, H, L).permute(0, 2, 1) | |||
| return nodes, relay.view(B, H) | |||
| class MSA1(nn.Module): | |||
| def __init__(self, nhid, nhead=10, head_dim=10, dropout=0.1): | |||
| super(MSA1, self).__init__() | |||
| # Multi-head Self Attention Case 1, doing self-attention for small regions | |||
| # Due to GPU architecture, a Hadamard product followed by a sum is faster than a dot product when unfold_size is very small | |||
| self.WQ = nn.Conv2d(nhid, nhead * head_dim, 1) | |||
| self.WK = nn.Conv2d(nhid, nhead * head_dim, 1) | |||
| self.WV = nn.Conv2d(nhid, nhead * head_dim, 1) | |||
| self.WO = nn.Conv2d(nhead * head_dim, nhid, 1) | |||
| self.drop = nn.Dropout(dropout) | |||
| # print('NUM_HEAD', nhead, 'DIM_HEAD', head_dim) | |||
| self.nhid, self.nhead, self.head_dim, self.unfold_size = nhid, nhead, head_dim, 3 | |||
| def forward(self, x, ax=None): | |||
| # x: [B, H, L, 1]; ax: [B, H, X, L] appended features | |||
| nhid, nhead, head_dim, unfold_size = self.nhid, self.nhead, self.head_dim, self.unfold_size | |||
| B, H, L, _ = x.shape | |||
| q, k, v = self.WQ(x), self.WK(x), self.WV(x) # x: (B,H,L,1) | |||
| if ax is not None: | |||
| aL = ax.shape[2] | |||
| ak = self.WK(ax).view(B, nhead, head_dim, aL, L) | |||
| av = self.WV(ax).view(B, nhead, head_dim, aL, L) | |||
| q = q.view(B, nhead, head_dim, 1, L) | |||
| k = F.unfold(k.view(B, nhead * head_dim, L, 1), (unfold_size, 1), padding=(unfold_size // 2, 0))\ | |||
| .view(B, nhead, head_dim, unfold_size, L) | |||
| v = F.unfold(v.view(B, nhead * head_dim, L, 1), (unfold_size, 1), padding=(unfold_size // 2, 0))\ | |||
| .view(B, nhead, head_dim, unfold_size, L) | |||
| if ax is not None: | |||
| k = torch.cat([k, ak], 3) | |||
| v = torch.cat([v, av], 3) | |||
| alphas = self.drop(F.softmax((q * k).sum(2, keepdim=True) / NP.sqrt(head_dim), 3)) # B N L 1 U | |||
| att = (alphas * v).sum(3).view(B, nhead * head_dim, L, 1) | |||
| ret = self.WO(att) | |||
| return ret | |||
| class MSA2(nn.Module): | |||
| def __init__(self, nhid, nhead=10, head_dim=10, dropout=0.1): | |||
| # Multi-head Self Attention Case 2, a broadcastable query for a sequence key and value | |||
| super(MSA2, self).__init__() | |||
| self.WQ = nn.Conv2d(nhid, nhead * head_dim, 1) | |||
| self.WK = nn.Conv2d(nhid, nhead * head_dim, 1) | |||
| self.WV = nn.Conv2d(nhid, nhead * head_dim, 1) | |||
| self.WO = nn.Conv2d(nhead * head_dim, nhid, 1) | |||
| self.drop = nn.Dropout(dropout) | |||
| # print('NUM_HEAD', nhead, 'DIM_HEAD', head_dim) | |||
| self.nhid, self.nhead, self.head_dim, self.unfold_size = nhid, nhead, head_dim, 3 | |||
| def forward(self, x, y, mask=None): | |||
| # x: [B, H, 1, 1]; y: [B, H, L, 1] | |||
| nhid, nhead, head_dim, unfold_size = self.nhid, self.nhead, self.head_dim, self.unfold_size | |||
| B, H, L, _ = y.shape | |||
| q, k, v = self.WQ(x), self.WK(y), self.WV(y) | |||
| q = q.view(B, nhead, 1, head_dim) # B, H, 1, 1 -> B, N, 1, h | |||
| k = k.view(B, nhead, head_dim, L) # B, H, L, 1 -> B, N, h, L | |||
| v = v.view(B, nhead, head_dim, L).permute(0, 1, 3, 2) # B, H, L, 1 -> B, N, L, h | |||
| pre_a = torch.matmul(q, k) / NP.sqrt(head_dim) | |||
| if mask is not None: | |||
| pre_a = pre_a.masked_fill(mask[:, None, None, :], -float('inf')) | |||
| alphas = self.drop(F.softmax(pre_a, 3)) # B, N, 1, L | |||
| att = torch.matmul(alphas, v).view(B, -1, 1, 1) # B, N, 1, h -> B, N*h, 1, 1 | |||
| return self.WO(att) | |||
| @@ -5,17 +5,18 @@ from ..dropout import TimestepDropout | |||
| class TransformerEncoder(nn.Module): | |||
| """transformer的encoder模块,不包含embedding层 | |||
| :param num_layers: int, transformer的层数 | |||
| :param model_size: int, 输入维度的大小。同时也是输出维度的大小。 | |||
| :param inner_size: int, FFN层的hidden大小 | |||
| :param key_size: int, 每个head的维度大小。 | |||
| :param value_size: int,每个head中value的维度。 | |||
| :param num_head: int,head的数量。 | |||
| :param dropout: float。 | |||
| """ | |||
| class SubLayer(nn.Module): | |||
| def __init__(self, model_size, inner_size, key_size, value_size, num_head, dropout=0.1): | |||
| """ | |||
| :param model_size: int, dimension of the input (and also of the output). | |||
| :param inner_size: int, hidden size of the FFN layer. | |||
| :param key_size: int, dimension of each head's key. | |||
| :param value_size: int, dimension of each head's value. | |||
| :param num_head: int, number of heads. | |||
| :param dropout: float, dropout probability. | |||
| """ | |||
| super(TransformerEncoder.SubLayer, self).__init__() | |||
| self.atte = MultiHeadAtte(model_size, key_size, value_size, num_head, dropout) | |||
| self.norm1 = nn.LayerNorm(model_size) | |||
| @@ -45,6 +46,11 @@ class TransformerEncoder(nn.Module): | |||
| self.layers = nn.ModuleList([self.SubLayer(**kargs) for _ in range(num_layers)]) | |||
| def forward(self, x, seq_mask=None): | |||
| """ | |||
| :param x: [batch, seq_len, model_size] input sequence | |||
| :param seq_mask: [batch, seq_len] padding mask of the input sequence | |||
| :return: [batch, seq_len, model_size] output sequence | |||
| """ | |||
| output = x | |||
| if seq_mask is None: | |||
| atte_mask_out = None | |||
| @@ -0,0 +1,44 @@ | |||
| # Model Reproduction | |||
| This directory reproduces models implemented in fastNLP, aiming to match the performance reported in their papers. | |||
| Reproduced models: | |||
| - Star-Transformer | |||
| - ... | |||
| ## Star-Transformer | |||
| [reference](https://arxiv.org/abs/1902.09113) | |||
| ### Performance | |||
| |Task| Dataset | SOTA | Our Result | | |||
| |------|------| ------| ------| | |||
| |POS Tagging|CTB 9.0|-|ACC 92.31| | |||
| |POS Tagging|CONLL 2012|-|ACC 96.51| | |||
| |Named Entity Recognition|CONLL 2012|-|F1 85.66| | |||
| |Text Classification|SST|-|49.18| | |||
| |Natural Language Inference|SNLI|-|83.76| | |||
| ### Usage | |||
| ``` python | |||
| # for sequence labeling(ner, pos tagging, etc) | |||
| from fastNLP.models.star_transformer import STSeqLabel | |||
| model = STSeqLabel( | |||
| vocab_size=10000, num_cls=50, | |||
| emb_dim=300) | |||
| # for sequence classification | |||
| from fastNLP.models.star_transformer import STSeqCls | |||
| model = STSeqCls( | |||
| vocab_size=10000, num_cls=50, | |||
| emb_dim=300) | |||
| # for natural language inference | |||
| from fastNLP.models.star_transformer import STNLICls | |||
| model = STNLICls( | |||
| vocab_size=10000, num_cls=50, | |||
| emb_dim=300) | |||
| ``` | |||
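| The snippets above only construct the models. A minimal forward pass might look like the sketch below; the toy batch (`word_seq`, `seq_lens`) and its shapes are assumptions for illustration, not fastNLP fixtures: | |||
| ``` python | |||
| import torch | |||
| from fastNLP.models.star_transformer import STSeqCls | |||
| model = STSeqCls(vocab_size=10000, num_cls=50, emb_dim=300) | |||
| # a toy batch: 4 sequences of token ids, padded to length 20 | |||
| word_seq = torch.randint(0, 10000, (4, 20)).long() | |||
| seq_lens = torch.tensor([20, 18, 15, 9]).long()  # true lengths, used to build the padding mask | |||
| out = model(word_seq, seq_lens)            # {'output': FloatTensor [4, 50]} class scores | |||
| pred = model.predict(word_seq, seq_lens)   # {'output': LongTensor [4]} predicted class ids | |||
| ``` | |||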
| ## ... | |||
| @@ -139,11 +139,14 @@ class TestCallback(unittest.TestCase): | |||
| def test_readonly_property(self): | |||
| from fastNLP.core.callback import Callback | |||
| passed_epochs = [] | |||
| total_epochs = 5 | |||
| class MyCallback(Callback): | |||
| def __init__(self): | |||
| super(MyCallback, self).__init__() | |||
| def on_epoch_begin(self, cur_epoch, total_epoch): | |||
| def on_epoch_begin(self): | |||
| passed_epochs.append(self.epoch) | |||
| print(self.n_epochs, self.n_steps, self.batch_size) | |||
| print(self.model) | |||
| print(self.optimizer) | |||
| @@ -151,7 +154,7 @@ class TestCallback(unittest.TestCase): | |||
| data_set, model = prepare_env() | |||
| trainer = Trainer(data_set, model, | |||
| loss=BCELoss(pred="predict", target="y"), | |||
| n_epochs=5, | |||
| n_epochs=total_epochs, | |||
| batch_size=32, | |||
| print_every=50, | |||
| optimizer=SGD(lr=0.1), | |||
| @@ -161,3 +164,4 @@ class TestCallback(unittest.TestCase): | |||
| metrics=AccuracyMetric(pred="predict", target="y"), | |||
| callbacks=[MyCallback()]) | |||
| trainer.train() | |||
| assert passed_epochs == list(range(1, total_epochs+1)) | |||
| @@ -217,6 +217,7 @@ class TestDataSetMethods(unittest.TestCase): | |||
| self.assertTrue(len(ds) > 0) | |||
| def test_add_null(self): | |||
| # TODO test failed because 'fastNLP\core\fieldarray.py:143: RuntimeError' | |||
| ds = DataSet() | |||
| ds.add_field('test', []) | |||
| ds.set_target('test') | |||
| @@ -141,11 +141,11 @@ class SpanF1PreRecMetric(unittest.TestCase): | |||
| bmes_lst = ['M-8', 'S-2', 'S-0', 'B-9', 'B-6', 'E-5', 'B-7', 'S-2', 'E-7', 'S-8'] | |||
| bio_lst = ['O-8', 'O-2', 'B-0', 'O-9', 'I-6', 'I-5', 'I-7', 'I-2', 'I-7', 'O-8'] | |||
| expect_bmes_res = set() | |||
| expect_bmes_res.update([('8', (0, 0)), ('2', (1, 1)), ('0', (2, 2)), ('9', (3, 3)), ('6', (4, 4)), | |||
| ('5', (5, 5)), ('7', (6, 6)), ('2', (7, 7)), ('7', (8, 8)), ('8', (9, 9))]) | |||
| expect_bmes_res.update([('8', (0, 1)), ('2', (1, 2)), ('0', (2, 3)), ('9', (3, 4)), ('6', (4, 5)), | |||
| ('5', (5, 6)), ('7', (6, 7)), ('2', (7, 8)), ('7', (8, 9)), ('8', (9, 10))]) | |||
| expect_bio_res = set() | |||
| expect_bio_res.update([('7', (8, 8)), ('0', (2, 2)), ('2', (7, 7)), ('5', (5, 5)), | |||
| ('6', (4, 4)), ('7', (6, 6))]) | |||
| expect_bio_res.update([('7', (8, 9)), ('0', (2, 3)), ('2', (7, 8)), ('5', (5, 6)), | |||
| ('6', (4, 5)), ('7', (6, 7))]) | |||
| self.assertSetEqual(expect_bmes_res,set(bmes_tag_to_spans(bmes_lst))) | |||
| self.assertSetEqual(expect_bio_res, set(bio_tag_to_spans(bio_lst))) | |||
| # Verified against the corresponding AllenNLP functions; since the tests must not depend on allennlp, these fixed cases are taken from the example above. | |||
| @@ -168,9 +168,9 @@ class SpanF1PreRecMetric(unittest.TestCase): | |||
| bmes_lst = ['B', 'E', 'B', 'S', 'B', 'M', 'E', 'M', 'B', 'E'] | |||
| bio_lst = ['I', 'B', 'O', 'O', 'I', 'O', 'I', 'B', 'O', 'O'] | |||
| expect_bmes_res = set() | |||
| expect_bmes_res.update([('', (0, 1)), ('', (2, 2)), ('', (3, 3)), ('', (4, 6)), ('', (7, 7)), ('', (8, 9))]) | |||
| expect_bmes_res.update([('', (0, 2)), ('', (2, 3)), ('', (3, 4)), ('', (4, 7)), ('', (7, 8)), ('', (8, 10))]) | |||
| expect_bio_res = set() | |||
| expect_bio_res.update([('', (7, 7)), ('', (6, 6)), ('', (4, 4)), ('', (0, 0)), ('', (1, 1))]) | |||
| expect_bio_res.update([('', (7, 8)), ('', (6, 7)), ('', (4, 5)), ('', (0, 1)), ('', (1, 2))]) | |||
| self.assertSetEqual(expect_bmes_res,set(bmes_tag_to_spans(bmes_lst))) | |||
| self.assertSetEqual(expect_bio_res, set(bio_tag_to_spans(bio_lst))) | |||
| # Verified against the corresponding AllenNLP functions; since the tests must not depend on allennlp, these fixed cases are taken from the example above. | |||
| @@ -3,6 +3,7 @@ import unittest | |||
| import torch | |||
| from fastNLP.modules.other_modules import GroupNorm, LayerNormalization, BiLinear, BiAffine | |||
| from fastNLP.modules.encoder.star_transformer import StarTransformer | |||
| class TestGroupNorm(unittest.TestCase): | |||
| @@ -49,3 +50,12 @@ class TestBiAffine(unittest.TestCase): | |||
| encoder_input = torch.randn((batch_size, decoder_length, 10)) | |||
| y = layer(decoder_input, encoder_input) | |||
| self.assertEqual(tuple(y.shape), (batch_size, 25, encoder_length, 1)) | |||
| class TestStarTransformer(unittest.TestCase): | |||
| def test_1(self): | |||
| model = StarTransformer(num_layers=6, hidden_size=100, num_head=8, head_dim=20, max_len=100) | |||
| x = torch.rand(16, 45, 100) | |||
| mask = torch.ones(16, 45).byte() | |||
| y, yn = model(x, mask) | |||
| self.assertEqual(tuple(y.size()), (16, 45, 100)) | |||
| self.assertEqual(tuple(yn.size()), (16, 100)) | |||
| @@ -353,7 +353,7 @@ class TestTutorial(unittest.TestCase): | |||
| train_data[-1], dev_data[-1], test_data[-1] | |||
| # read the vocab file | |||
| with open('vocab.txt') as f: | |||
| with open('vocab.txt', encoding='utf-8') as f: | |||
| lines = f.readlines() | |||
| vocabs = [] | |||
| for line in lines: | |||