| @@ -1,7 +0,0 @@ | |||
| fastNLP.models.base\_model | |||
| ========================== | |||
| .. automodule:: fastNLP.models.base_model | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| @@ -1,7 +0,0 @@ | |||
| fastNLP.models.bert | |||
| =================== | |||
| .. automodule:: fastNLP.models.bert | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| @@ -1,7 +0,0 @@ | |||
| fastNLP.models.enas\_controller | |||
| =============================== | |||
| .. automodule:: fastNLP.models.enas_controller | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| @@ -1,7 +0,0 @@ | |||
| fastNLP.models.enas\_model | |||
| ========================== | |||
| .. automodule:: fastNLP.models.enas_model | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| @@ -1,7 +0,0 @@ | |||
| fastNLP.models.enas\_trainer | |||
| ============================ | |||
| .. automodule:: fastNLP.models.enas_trainer | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| @@ -1,7 +0,0 @@ | |||
| fastNLP.models.enas\_utils | |||
| ========================== | |||
| .. automodule:: fastNLP.models.enas_utils | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| @@ -12,14 +12,8 @@ fastNLP.models | |||
| .. toctree:: | |||
| :titlesonly: | |||
| fastNLP.models.base_model | |||
| fastNLP.models.bert | |||
| fastNLP.models.biaffine_parser | |||
| fastNLP.models.cnn_text_classification | |||
| fastNLP.models.enas_controller | |||
| fastNLP.models.enas_model | |||
| fastNLP.models.enas_trainer | |||
| fastNLP.models.enas_utils | |||
| fastNLP.models.sequence_labeling | |||
| fastNLP.models.snli | |||
| fastNLP.models.star_transformer | |||
| @@ -3,12 +3,12 @@ The batch module implements the Batch class required by fastNLP. | |||
| """ | |||
| import atexit | |||
| from queue import Empty, Full | |||
| import numpy as np | |||
| import torch | |||
| import torch.multiprocessing as mp | |||
| from queue import Empty, Full | |||
| from .sampler import RandomSampler | |||
| __all__ = [ | |||
| @@ -50,6 +50,7 @@ The callback module implements many callback classes in fastNLP, used to enhance :class: | |||
| """ | |||
| import os | |||
| import torch | |||
| try: | |||
| @@ -273,9 +273,10 @@ | |||
| """ | |||
| import _pickle as pickle | |||
| import numpy as np | |||
| import warnings | |||
| import numpy as np | |||
| from .field import AutoPadder | |||
| from .field import FieldArray | |||
| from .instance import Instance | |||
| @@ -3,10 +3,10 @@ The field module implements FieldArray and several Padders. FieldArray is the :class:`~fas | |||
| For the underlying principles, please refer to :doc:`fastNLP.core.dataset` | |||
| """ | |||
| import numpy as np | |||
| from copy import deepcopy | |||
| import numpy as np | |||
| __all__ = [ | |||
| "FieldArray", | |||
| "Padder", | |||
| @@ -3,11 +3,11 @@ The losses module defines the various loss functions required by fastNLP, generally used as :cl | |||
| """ | |||
| import inspect | |||
| from collections import defaultdict | |||
| import torch | |||
| import torch.nn.functional as F | |||
| from collections import defaultdict | |||
| from .utils import _CheckError | |||
| from .utils import _CheckRes | |||
| from .utils import _build_args | |||
| @@ -3,11 +3,11 @@ The metrics module implements the common evaluation metrics required by fastNLP, generally used as | |||
| """ | |||
| import inspect | |||
| from collections import defaultdict | |||
| import numpy as np | |||
| import torch | |||
| from collections import defaultdict | |||
| from .utils import _CheckError | |||
| from .utils import _CheckRes | |||
| from .utils import _build_args | |||
| @@ -2,10 +2,10 @@ | |||
| .. todo:: | |||
| Check whether this class is still needed | |||
| """ | |||
| import torch | |||
| from collections import defaultdict | |||
| import torch | |||
| from . import Batch | |||
| from . import DataSet | |||
| from . import SequentialSampler | |||
| @@ -1,10 +1,10 @@ | |||
| """ | |||
| The sampler subclasses implement the various samplers required by fastNLP. | |||
| """ | |||
| import numpy as np | |||
| from itertools import chain | |||
| import numpy as np | |||
| __all__ = [ | |||
| "Sampler", | |||
| "BucketSampler", | |||
| @@ -33,6 +33,7 @@ Before validation runs, Tester calls model.eval() to signal that it has entered evaluation | |||
| """ | |||
| import warnings | |||
| import torch | |||
| import torch.nn as nn | |||
| @@ -297,13 +297,13 @@ Example2.3 | |||
| """ | |||
| import os | |||
| import numpy as np | |||
| import time | |||
| from datetime import datetime, timedelta | |||
| import numpy as np | |||
| import torch | |||
| import torch.nn as nn | |||
| from datetime import datetime, timedelta | |||
| try: | |||
| from tqdm.auto import tqdm | |||
| except: | |||
| @@ -3,14 +3,13 @@ The utils module implements many utilities needed inside and outside fastNLP. Among them, users | |||
| """ | |||
| import _pickle | |||
| import inspect | |||
| import numpy as np | |||
| import os | |||
| import torch | |||
| import torch.nn as nn | |||
| import warnings | |||
| from collections import Counter, namedtuple | |||
| from collections import Counter | |||
| from collections import namedtuple | |||
| import numpy as np | |||
| import torch | |||
| import torch.nn as nn | |||
| __all__ = [ | |||
| "cache_results", | |||
| @@ -9,6 +9,11 @@ | |||
| These classes are used as follows: | |||
| """ | |||
| from .embed_loader import EmbedLoader | |||
| from .dataset_loader import DataSetLoader, CSVLoader, JsonLoader, ConllLoader, SNLILoader, SSTLoader, \ | |||
| PeopleDailyCorpusLoader, Conll2003Loader | |||
| from .model_io import ModelLoader, ModelSaver | |||
| __all__ = [ | |||
| 'EmbedLoader', | |||
| @@ -24,7 +29,3 @@ __all__ = [ | |||
| 'ModelLoader', | |||
| 'ModelSaver', | |||
| ] | |||
| from .embed_loader import EmbedLoader | |||
| from .dataset_loader import DataSetLoader, CSVLoader, JsonLoader, ConllLoader, SNLILoader, SSTLoader, \ | |||
| PeopleDailyCorpusLoader, Conll2003Loader | |||
| from .model_io import ModelLoader as ModelLoader, ModelSaver as ModelSaver | |||
| @@ -1,15 +1,20 @@ | |||
| import _pickle as pickle | |||
| import os | |||
| __all__ = [ | |||
| "BaseLoader" | |||
| ] | |||
| class BaseLoader(object): | |||
| """ | |||
| Base class for the various Loaders; provides a reference for the API. | |||
| """ | |||
| def __init__(self): | |||
| super(BaseLoader, self).__init__() | |||
| @staticmethod | |||
| def load_lines(data_path): | |||
| """ | |||
| @@ -20,7 +25,7 @@ class BaseLoader(object): | |||
| with open(data_path, "r", encoding="utf=8") as f: | |||
| text = f.readlines() | |||
| return [line.strip() for line in text] | |||
| @classmethod | |||
| def load(cls, data_path): | |||
| """ | |||
| @@ -31,7 +36,7 @@ class BaseLoader(object): | |||
| with open(data_path, "r", encoding="utf-8") as f: | |||
| text = f.readlines() | |||
| return [[word for word in sent.strip()] for sent in text] | |||
| @classmethod | |||
| def load_with_cache(cls, data_path, cache_path): | |||
| """缓存版的load | |||
| @@ -48,16 +53,18 @@ class BaseLoader(object): | |||
| class DataLoaderRegister: | |||
| _readers = {} | |||
| @classmethod | |||
| def set_reader(cls, reader_cls, read_fn_name): | |||
| # def wrapper(reader_cls): | |||
| if read_fn_name in cls._readers: | |||
| raise KeyError('duplicate reader: {} and {} for read_func: {}'.format(cls._readers[read_fn_name], reader_cls, read_fn_name)) | |||
| raise KeyError( | |||
| 'duplicate reader: {} and {} for read_func: {}'.format(cls._readers[read_fn_name], reader_cls, | |||
| read_fn_name)) | |||
| if hasattr(reader_cls, 'load'): | |||
| cls._readers[read_fn_name] = reader_cls().load | |||
| return reader_cls | |||
| @classmethod | |||
| def get_reader(cls, read_fn_name): | |||
| if read_fn_name in cls._readers: | |||
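For reference, the registry above is meant to be used roughly as follows; a minimal sketch, assuming `get_reader` returns the callable stored by `set_reader` (the loader class, key, and file name here are hypothetical):

```python
# Hypothetical loader registered under the key "my_reader".
class MyLoader(BaseLoader):
    def load(self, data_path):
        with open(data_path, "r", encoding="utf-8") as f:
            return [line.strip() for line in f]

DataLoaderRegister.set_reader(MyLoader, "my_reader")  # register once
read_fn = DataLoaderRegister.get_reader("my_reader")  # -> MyLoader().load
lines = read_fn("data.txt")
```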
| @@ -1,14 +1,20 @@ | |||
| """ | |||
| For reading in, processing, and saving config files | |||
| .. todo:: | |||
| The classes in this module may be deprecated? | |||
| """ | |||
| __all__ = ["ConfigLoader","ConfigSection","ConfigSaver"] | |||
| import configparser | |||
| import json | |||
| import os | |||
| from .base_loader import BaseLoader | |||
| __all__ = [ | |||
| "ConfigLoader", | |||
| "ConfigSection", | |||
| "ConfigSaver" | |||
| ] | |||
| class ConfigLoader(BaseLoader): | |||
| """ | |||
| @@ -19,15 +25,16 @@ class ConfigLoader(BaseLoader): | |||
| :param str data_path: path to the config file | |||
| """ | |||
| def __init__(self, data_path=None): | |||
| super(ConfigLoader, self).__init__() | |||
| if data_path is not None: | |||
| self.config = self.parse(super(ConfigLoader, self).load(data_path)) | |||
| @staticmethod | |||
| def parse(string): | |||
| raise NotImplementedError | |||
| @staticmethod | |||
| def load_config(file_path, sections): | |||
| """ | |||
| @@ -81,10 +88,10 @@ class ConfigSection(object): | |||
| ConfigSection is a data structure that stores all the key-value pairs of one section; instances of this class are recommended for use together with :meth:`ConfigLoader.load_config` | |||
| """ | |||
| def __init__(self): | |||
| super(ConfigSection, self).__init__() | |||
| def __getitem__(self, key): | |||
| """ | |||
| :param key: str, the name of the attribute | |||
| @@ -97,7 +104,7 @@ class ConfigSection(object): | |||
| if key in self.__dict__.keys(): | |||
| return getattr(self, key) | |||
| raise AttributeError("do NOT have attribute %s" % key) | |||
| def __setitem__(self, key, value): | |||
| """ | |||
| :param key: str, the name of the attribute | |||
| @@ -112,14 +119,14 @@ class ConfigSection(object): | |||
| raise AttributeError("attr %s except %s but got %s" % | |||
| (key, str(type(getattr(self, key))), str(type(value)))) | |||
| setattr(self, key, value) | |||
| def __contains__(self, item): | |||
| """ | |||
| :param item: The key of item. | |||
| :return: True if the key in self.__dict__.keys() else False. | |||
| """ | |||
| return item in self.__dict__.keys() | |||
| def __eq__(self, other): | |||
| """Overwrite the == operator | |||
| @@ -131,15 +138,15 @@ class ConfigSection(object): | |||
| return False | |||
| if getattr(self, k) != getattr(other, k): | |||
| return False | |||
| for k in other.__dict__.keys(): | |||
| if k not in self.__dict__.keys(): | |||
| return False | |||
| if getattr(self, k) != getattr(other, k): | |||
| return False | |||
| return True | |||
| def __ne__(self, other): | |||
| """Overwrite the != operator | |||
| @@ -147,7 +154,7 @@ class ConfigSection(object): | |||
| :return: | |||
| """ | |||
| return not self.__eq__(other) | |||
| @property | |||
| def data(self): | |||
| return self.__dict__ | |||
| @@ -162,11 +169,12 @@ class ConfigSaver(object): | |||
| :param str file_path: path to the config file | |||
| """ | |||
| def __init__(self, file_path): | |||
| self.file_path = file_path | |||
| if not os.path.exists(self.file_path): | |||
| raise FileNotFoundError("file {} NOT found!".__format__(self.file_path)) | |||
| def _get_section(self, sect_name): | |||
| """ | |||
| This is the function to get the section with the section name. | |||
| @@ -177,7 +185,7 @@ class ConfigSaver(object): | |||
| sect = ConfigSection() | |||
| ConfigLoader().load_config(self.file_path, {sect_name: sect}) | |||
| return sect | |||
| def _read_section(self): | |||
| """ | |||
| This is the function to read sections from the config file. | |||
| @@ -187,16 +195,16 @@ class ConfigSaver(object): | |||
| sect_key_list: A list of names in sect_list. | |||
| """ | |||
| sect_name = None | |||
| sect_list = {} | |||
| sect_key_list = [] | |||
| single_section = {} | |||
| single_section_key = [] | |||
| with open(self.file_path, 'r') as f: | |||
| lines = f.readlines() | |||
| for line in lines: | |||
| if line.startswith('[') and line.endswith(']\n'): | |||
| if sect_name is None: | |||
| @@ -208,29 +216,29 @@ class ConfigSaver(object): | |||
| sect_key_list.append(sect_name) | |||
| sect_name = line[1: -2] | |||
| continue | |||
| if line.startswith('#'): | |||
| single_section[line] = '#' | |||
| single_section_key.append(line) | |||
| continue | |||
| if line.startswith('\n'): | |||
| single_section_key.append('\n') | |||
| continue | |||
| if '=' not in line: | |||
| raise RuntimeError("can NOT load config file {}".__format__(self.file_path)) | |||
| key = line.split('=', maxsplit=1)[0].strip() | |||
| value = line.split('=', maxsplit=1)[1].strip() + '\n' | |||
| single_section[key] = value | |||
| single_section_key.append(key) | |||
| if sect_name is not None: | |||
| sect_list[sect_name] = single_section, single_section_key | |||
| sect_key_list.append(sect_name) | |||
| return sect_list, sect_key_list | |||
| def _write_section(self, sect_list, sect_key_list): | |||
| """ | |||
| This is the function to write config file with section list and name list. | |||
| @@ -252,7 +260,7 @@ class ConfigSaver(object): | |||
| continue | |||
| f.write(key + ' = ' + single_section[key]) | |||
| f.write('\n') | |||
| def save_config_file(self, section_name, section): | |||
| """ | |||
| This method can be used to modify and save a single section of the config file | |||
| @@ -284,11 +292,11 @@ class ConfigSaver(object): | |||
| break | |||
| if not change_file: | |||
| return | |||
| sect_list, sect_key_list = self._read_section() | |||
| if section_name not in sect_key_list: | |||
| raise AttributeError() | |||
| sect, sect_key = sect_list[section_name] | |||
| for k in section.__dict__.keys(): | |||
| if k not in sect_key: | |||
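Taken together, the three classes support a read-modify-write round trip on a config file; a minimal sketch, assuming a file `demo.cfg` with a `[model]` section containing an `epochs` key (file, section, and key are all hypothetical):

```python
section = ConfigSection()
ConfigLoader().load_config("demo.cfg", {"model": section})

print(section["epochs"])    # read a value via __getitem__
section["epochs"] = 20      # __setitem__ type-checks existing keys

ConfigSaver("demo.cfg").save_config_file("model", section)
```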
| @@ -10,6 +10,12 @@ The dataset_loader module implements many DataSetLoaders, used to read different formats of | |||
| # ... do stuff | |||
| """ | |||
| from nltk.tree import Tree | |||
| from ..core.dataset import DataSet | |||
| from ..core.instance import Instance | |||
| from .file_reader import _read_csv, _read_json, _read_conll | |||
| __all__ = [ | |||
| 'DataSetLoader', | |||
| 'CSVLoader', | |||
| @@ -20,11 +26,6 @@ __all__ = [ | |||
| 'PeopleDailyCorpusLoader', | |||
| 'Conll2003Loader', | |||
| ] | |||
| from nltk.tree import Tree | |||
| from ..core.dataset import DataSet | |||
| from ..core.instance import Instance | |||
| from .file_reader import _read_csv, _read_json, _read_conll | |||
| def _download_from_url(url, path): | |||
| @@ -1,11 +1,15 @@ | |||
| import os | |||
| import warnings | |||
| import numpy as np | |||
| from ..core.vocabulary import Vocabulary | |||
| from .base_loader import BaseLoader | |||
| import warnings | |||
| __all__ = [ | |||
| "EmbedLoader" | |||
| ] | |||
| class EmbedLoader(BaseLoader): | |||
| """ | |||
| @@ -13,10 +17,10 @@ class EmbedLoader(BaseLoader): | |||
| Used to read pre-trained embeddings; the result can be loaded directly as model parameters. | |||
| """ | |||
| def __init__(self): | |||
| super(EmbedLoader, self).__init__() | |||
| @staticmethod | |||
| def load_with_vocab(embed_filepath, vocab, dtype=np.float32, normalize=True, error='ignore'): | |||
| """ | |||
| @@ -40,11 +44,11 @@ class EmbedLoader(BaseLoader): | |||
| line = f.readline().strip() | |||
| parts = line.split() | |||
| start_idx = 0 | |||
| if len(parts)==2: | |||
| if len(parts) == 2: | |||
| dim = int(parts[1]) | |||
| start_idx += 1 | |||
| else: | |||
| dim = len(parts)-1 | |||
| dim = len(parts) - 1 | |||
| f.seek(0) | |||
| matrix = np.random.randn(len(vocab), dim).astype(dtype) | |||
| for idx, line in enumerate(f, start_idx): | |||
| @@ -63,21 +67,21 @@ class EmbedLoader(BaseLoader): | |||
| total_hits = sum(hit_flags) | |||
| print("Found {} out of {} words in the pre-training embedding.".format(total_hits, len(vocab))) | |||
| found_vectors = matrix[hit_flags] | |||
| if len(found_vectors)!=0: | |||
| if len(found_vectors) != 0: | |||
| mean = np.mean(found_vectors, axis=0, keepdims=True) | |||
| std = np.std(found_vectors, axis=0, keepdims=True) | |||
| unfound_vec_num = len(vocab) - total_hits | |||
| r_vecs = np.random.randn(unfound_vec_num, dim).astype(dtype)*std + mean | |||
| matrix[hit_flags==False] = r_vecs | |||
| r_vecs = np.random.randn(unfound_vec_num, dim).astype(dtype) * std + mean | |||
| matrix[hit_flags == False] = r_vecs | |||
| if normalize: | |||
| matrix /= np.linalg.norm(matrix, axis=1, keepdims=True) | |||
| return matrix | |||
| @staticmethod | |||
| def load_without_vocab(embed_filepath, dtype=np.float32, padding='<pad>', unknown='<unk>', normalize=True, | |||
| error='ignore'): | |||
| error='ignore'): | |||
| """ | |||
| Reads pre-trained word vectors from embed_filepath, builds the embedding according to the pre-trained vocabulary, and generates a corresponding Vocabulary. | |||
| @@ -96,35 +100,35 @@ class EmbedLoader(BaseLoader): | |||
| vec_dict = {} | |||
| found_unknown = False | |||
| found_pad = False | |||
| with open(embed_filepath, 'r', encoding='utf-8') as f: | |||
| line = f.readline() | |||
| start = 1 | |||
| dim = -1 | |||
| if len(line.strip().split())!=2: | |||
| if len(line.strip().split()) != 2: | |||
| f.seek(0) | |||
| start = 0 | |||
| for idx, line in enumerate(f, start=start): | |||
| try: | |||
| parts = line.strip().split() | |||
| word = parts[0] | |||
| if dim==-1: | |||
| dim = len(parts)-1 | |||
| if dim == -1: | |||
| dim = len(parts) - 1 | |||
| vec = np.fromstring(' '.join(parts[1:]), sep=' ', dtype=dtype, count=dim) | |||
| vec_dict[word] = vec | |||
| vocab.add_word(word) | |||
| if unknown is not None and unknown==word: | |||
| if unknown is not None and unknown == word: | |||
| found_unknown = True | |||
| if found_pad is not None and padding==word: | |||
| if padding is not None and padding == word: | |||
| found_pad = True | |||
| except Exception as e: | |||
| if error=='ignore': | |||
| if error == 'ignore': | |||
| warnings.warn("Error occurred at the {} line.".format(idx)) | |||
| pass | |||
| else: | |||
| print("Error occurred at the {} line.".format(idx)) | |||
| raise e | |||
| if dim==-1: | |||
| if dim == -1: | |||
| raise RuntimeError("{} is an empty file.".format(embed_filepath)) | |||
| matrix = np.random.randn(len(vocab), dim).astype(dtype) | |||
| if (unknown is not None and not found_unknown) or (padding is not None and not found_pad): | |||
| @@ -133,19 +137,19 @@ class EmbedLoader(BaseLoader): | |||
| start_idx += 1 | |||
| if unknown is not None: | |||
| start_idx += 1 | |||
| mean = np.mean(matrix[start_idx:], axis=0, keepdims=True) | |||
| std = np.std(matrix[start_idx:], axis=0, keepdims=True) | |||
| if (unknown is not None and not found_unknown): | |||
| matrix[start_idx-1] = np.random.randn(1, dim).astype(dtype)*std + mean | |||
| matrix[start_idx - 1] = np.random.randn(1, dim).astype(dtype) * std + mean | |||
| if (padding is not None and not found_pad): | |||
| matrix[0] = np.random.randn(1, dim).astype(dtype)*std + mean | |||
| matrix[0] = np.random.randn(1, dim).astype(dtype) * std + mean | |||
| for key, vec in vec_dict.items(): | |||
| index = vocab.to_index(key) | |||
| matrix[index] = vec | |||
| if normalize: | |||
| matrix /= np.linalg.norm(matrix, axis=1, keepdims=True) | |||
| return matrix, vocab | |||
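A short usage sketch for both entry points of EmbedLoader; the file path is hypothetical, and the file is assumed to be in word2vec text format (one `word v1 v2 ...` line per word, optionally preceded by a count/dim header line):

```python
from fastNLP import Vocabulary

embed_path = "glove.6B.100d.txt"  # hypothetical embedding file

# 1) Align vectors with an existing vocabulary; words missing from the
#    file are sampled around the mean/std of the vectors that were found.
vocab = Vocabulary()
vocab.add_word_lst(["the", "cat", "sat"])
matrix = EmbedLoader.load_with_vocab(embed_path, vocab)  # [len(vocab), dim]

# 2) Build the Vocabulary from the embedding file itself.
matrix, vocab = EmbedLoader.load_without_vocab(
    embed_path, padding='<pad>', unknown='<unk>')
```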
| @@ -5,6 +5,11 @@ import torch | |||
| from .base_loader import BaseLoader | |||
| __all__ = [ | |||
| "ModelLoader", | |||
| "ModelSaver" | |||
| ] | |||
| class ModelLoader(BaseLoader): | |||
| """ | |||
| @@ -12,10 +17,10 @@ class ModelLoader(BaseLoader): | |||
| Used to load models | |||
| """ | |||
| def __init__(self): | |||
| super(ModelLoader, self).__init__() | |||
| @staticmethod | |||
| def load_pytorch(empty_model, model_path): | |||
| """ | |||
| @@ -25,7 +30,7 @@ class ModelLoader(BaseLoader): | |||
| :param str model_path: path where the model is saved | |||
| """ | |||
| empty_model.load_state_dict(torch.load(model_path)) | |||
| @staticmethod | |||
| def load_pytorch_model(model_path): | |||
| """ | |||
| @@ -48,14 +53,14 @@ class ModelSaver(object): | |||
| saver.save_pytorch(model) | |||
| """ | |||
| def __init__(self, save_path): | |||
| """ | |||
| :param save_path: path where the model will be saved | |||
| """ | |||
| self.save_path = save_path | |||
| def save_pytorch(self, model, param_only=True): | |||
| """ | |||
| Save a PyTorch model to a ".pkl" file | |||
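A round-trip sketch combining ModelSaver and ModelLoader; the path and model are placeholders (the `./save` directory is assumed to exist), and `param_only=True` is assumed to persist the state_dict, which is what `load_pytorch` expects:

```python
import torch.nn as nn

model = nn.Linear(10, 2)  # stands in for any trained model

saver = ModelSaver("./save/model.pkl")
saver.save_pytorch(model, param_only=True)  # save the state_dict

empty_model = nn.Linear(10, 2)  # same architecture, fresh weights
ModelLoader.load_pytorch(empty_model, "./save/model.pkl")
```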
| @@ -7,7 +7,6 @@ fastNLP provides, in the :mod:`~fastNLP.models` module, built-in models such as :class:`~fastNLP.models | |||
| """ | |||
| __all__ = ["CNNText", "SeqLabeling", "ESIM", "STSeqLabel", "AdvSeqLabel", "STNLICls", "STSeqCls"] | |||
| from .base_model import BaseModel | |||
| from .bert import BertForMultipleChoice, BertForQuestionAnswering, BertForSequenceClassification, \ | |||
| BertForTokenClassification | |||
| @@ -15,4 +14,21 @@ from .biaffine_parser import BiaffineParser, GraphParser | |||
| from .cnn_text_classification import CNNText | |||
| from .sequence_labeling import SeqLabeling, AdvSeqLabel | |||
| from .snli import ESIM | |||
| from .star_transformer import STSeqCls, STNLICls, STSeqLabel | |||
| from .star_transformer import StarTransEnc, STSeqCls, STNLICls, STSeqLabel | |||
| __all__ = [ | |||
| "CNNText", | |||
| "SeqLabeling", | |||
| "AdvSeqLabel", | |||
| "ESIM", | |||
| "StarTransEnc", | |||
| "STSeqLabel", | |||
| "STNLICls", | |||
| "STSeqCls", | |||
| "BiaffineParser", | |||
| "GraphParser" | |||
| ] | |||
| @@ -6,13 +6,13 @@ from ..modules.decoder.MLP import MLP | |||
| class BaseModel(torch.nn.Module): | |||
| """Base PyTorch model for all models. | |||
| """ | |||
| def __init__(self): | |||
| super(BaseModel, self).__init__() | |||
| def fit(self, train_data, dev_data=None, **train_args): | |||
| pass | |||
| def predict(self, *args, **kwargs): | |||
| raise NotImplementedError | |||
| @@ -21,9 +21,9 @@ class NaiveClassifier(BaseModel): | |||
| def __init__(self, in_feature_dim, out_feature_dim): | |||
| super(NaiveClassifier, self).__init__() | |||
| self.mlp = MLP([in_feature_dim, in_feature_dim, out_feature_dim]) | |||
| def forward(self, x): | |||
| return {"predict": torch.sigmoid(self.mlp(x))} | |||
| def predict(self, x): | |||
| return {"predict": torch.sigmoid(self.mlp(x)) > 0.5} | |||
| @@ -1,11 +1,12 @@ | |||
| """Biaffine Dependency Parser 的 Pytorch 实现. | |||
| """ | |||
| from collections import defaultdict | |||
| Biaffine Dependency Parser 的 Pytorch 实现. | |||
| """ | |||
| import numpy as np | |||
| import torch | |||
| from torch import nn | |||
| from torch.nn import functional as F | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from collections import defaultdict | |||
| from ..core.const import Const as C | |||
| from ..core.losses import LossFunc | |||
| @@ -18,6 +19,12 @@ from ..modules.utils import get_embeddings | |||
| from .base_model import BaseModel | |||
| from ..core.utils import seq_len_to_mask | |||
| __all__ = [ | |||
| "BiaffineParser", | |||
| "GraphParser" | |||
| ] | |||
| def _mst(scores): | |||
| """ | |||
| with some modification to support parser output for MST decoding | |||
| @@ -44,7 +51,7 @@ def _mst(scores): | |||
| scores[roots, new_heads] / root_scores)] | |||
| heads[roots] = new_heads | |||
| heads[new_root] = 0 | |||
| edges = defaultdict(set) | |||
| vertices = set((0,)) | |||
| for dep, head in enumerate(heads[tokens]): | |||
| @@ -73,7 +80,7 @@ def _mst(scores): | |||
| heads[changed_cycle] = new_head | |||
| edges[new_head].add(changed_cycle) | |||
| edges[old_head].remove(changed_cycle) | |||
| return heads | |||
| @@ -88,7 +95,7 @@ def _find_cycle(vertices, edges): | |||
| _lowlinks = {} | |||
| _onstack = defaultdict(lambda: False) | |||
| _SCCs = [] | |||
| def _strongconnect(v): | |||
| nonlocal _index | |||
| _indices[v] = _index | |||
| @@ -96,28 +103,28 @@ def _find_cycle(vertices, edges): | |||
| _index += 1 | |||
| _stack.append(v) | |||
| _onstack[v] = True | |||
| for w in edges[v]: | |||
| if w not in _indices: | |||
| _strongconnect(w) | |||
| _lowlinks[v] = min(_lowlinks[v], _lowlinks[w]) | |||
| elif _onstack[w]: | |||
| _lowlinks[v] = min(_lowlinks[v], _indices[w]) | |||
| if _lowlinks[v] == _indices[v]: | |||
| SCC = set() | |||
| while True: | |||
| w = _stack.pop() | |||
| _onstack[w] = False | |||
| SCC.add(w) | |||
| if not(w != v): | |||
| if not (w != v): | |||
| break | |||
| _SCCs.append(SCC) | |||
| for v in vertices: | |||
| if v not in _indices: | |||
| _strongconnect(v) | |||
| return [SCC for SCC in _SCCs if len(SCC) > 1] | |||
| @@ -125,9 +132,10 @@ class GraphParser(BaseModel): | |||
| """ | |||
| Graph-based parser base class; supports greedy decoding and maximum spanning tree (MST) decoding | |||
| """ | |||
| def __init__(self): | |||
| super(GraphParser, self).__init__() | |||
| @staticmethod | |||
| def greedy_decoder(arc_matrix, mask=None): | |||
| """ | |||
| @@ -146,7 +154,7 @@ class GraphParser(BaseModel): | |||
| if mask is not None: | |||
| heads *= mask.long() | |||
| return heads | |||
| @staticmethod | |||
| def mst_decoder(arc_matrix, mask=None): | |||
| """ | |||
| @@ -176,6 +184,7 @@ class ArcBiaffine(nn.Module): | |||
| :param hidden_size: dimension of the input features | |||
| :param bias: whether to use a bias term. Default: ``True`` | |||
| """ | |||
| def __init__(self, hidden_size, bias=True): | |||
| super(ArcBiaffine, self).__init__() | |||
| self.U = nn.Parameter(torch.Tensor(hidden_size, hidden_size), requires_grad=True) | |||
| @@ -185,7 +194,7 @@ class ArcBiaffine(nn.Module): | |||
| else: | |||
| self.register_parameter("bias", None) | |||
| initial_parameter(self) | |||
| def forward(self, head, dep): | |||
| """ | |||
| @@ -209,11 +218,12 @@ class LabelBilinear(nn.Module): | |||
| :param num_label: number of edge label classes | |||
| :param bias: whether to use a bias term. Default: ``True`` | |||
| """ | |||
| def __init__(self, in1_features, in2_features, num_label, bias=True): | |||
| super(LabelBilinear, self).__init__() | |||
| self.bilinear = nn.Bilinear(in1_features, in2_features, num_label, bias=bias) | |||
| self.lin = nn.Linear(in1_features + in2_features, num_label, bias=False) | |||
| def forward(self, x1, x2): | |||
| """ | |||
| @@ -225,13 +235,13 @@ class LabelBilinear(nn.Module): | |||
| output += self.lin(torch.cat([x1, x2], dim=2)) | |||
| return output | |||
| class BiaffineParser(GraphParser): | |||
| """ | |||
| Alias: :class:`fastNLP.models.BiaffineParser` :class:`fastNLP.models.biaffine_parser.BiaffineParser` | |||
| Implementation of the Biaffine Dependency Parser. | |||
| Reference: ` Deep Biaffine Attention for Neural Dependency Parsing (Dozat and Manning, 2016) | |||
| <https://arxiv.org/abs/1611.01734>`_ . | |||
| Reference: `Deep Biaffine Attention for Neural Dependency Parsing (Dozat and Manning, 2016) <https://arxiv.org/abs/1611.01734>`_ . | |||
| :param init_embed: word embeddings; can be a tuple (num_embeddings, embedding_dim), i.e. | |||
| the size of the embedding and the dimension of each word. An nn.Embedding object may also be passed in, | |||
| @@ -248,18 +258,19 @@ class BiaffineParser(GraphParser): | |||
| :param use_greedy_infer: whether to use the greedy algorithm at inference time. | |||
| If ``False``, the more accurate but slower MST algorithm is used. Default: ``False`` | |||
| """ | |||
| def __init__(self, | |||
| init_embed, | |||
| pos_vocab_size, | |||
| pos_emb_dim, | |||
| num_label, | |||
| rnn_layers=1, | |||
| rnn_hidden_size=200, | |||
| arc_mlp_size=100, | |||
| label_mlp_size=100, | |||
| dropout=0.3, | |||
| encoder='lstm', | |||
| use_greedy_infer=False): | |||
| init_embed, | |||
| pos_vocab_size, | |||
| pos_emb_dim, | |||
| num_label, | |||
| rnn_layers=1, | |||
| rnn_hidden_size=200, | |||
| arc_mlp_size=100, | |||
| label_mlp_size=100, | |||
| dropout=0.3, | |||
| encoder='lstm', | |||
| use_greedy_infer=False): | |||
| super(BiaffineParser, self).__init__() | |||
| rnn_out_size = 2 * rnn_hidden_size | |||
| word_hid_dim = pos_hid_dim = rnn_hidden_size | |||
| @@ -295,20 +306,20 @@ class BiaffineParser(GraphParser): | |||
| if (d_k * n_head) != rnn_out_size: | |||
| raise ValueError('unsupported rnn_out_size: {} for transformer'.format(rnn_out_size)) | |||
| self.position_emb = nn.Embedding(num_embeddings=self.max_len, | |||
| embedding_dim=rnn_out_size,) | |||
| embedding_dim=rnn_out_size, ) | |||
| self.encoder = TransformerEncoder(num_layers=rnn_layers, | |||
| model_size=rnn_out_size, | |||
| inner_size=1024, | |||
| key_size=d_k, | |||
| value_size=d_v, | |||
| num_head=n_head, | |||
| dropout=dropout,) | |||
| dropout=dropout, ) | |||
| else: | |||
| raise ValueError('unsupported encoder type: {}'.format(encoder)) | |||
| self.mlp = nn.Sequential(nn.Linear(rnn_out_size, arc_mlp_size * 2 + label_mlp_size * 2), | |||
| nn.ELU(), | |||
| TimestepDropout(p=dropout),) | |||
| nn.ELU(), | |||
| TimestepDropout(p=dropout), ) | |||
| self.arc_mlp_size = arc_mlp_size | |||
| self.label_mlp_size = label_mlp_size | |||
| self.arc_predictor = ArcBiaffine(arc_mlp_size, bias=True) | |||
| @@ -316,7 +327,7 @@ class BiaffineParser(GraphParser): | |||
| self.use_greedy_infer = use_greedy_infer | |||
| self.reset_parameters() | |||
| self.dropout = dropout | |||
| def reset_parameters(self): | |||
| for m in self.modules(): | |||
| if isinstance(m, nn.Embedding): | |||
| @@ -327,7 +338,7 @@ class BiaffineParser(GraphParser): | |||
| else: | |||
| for p in m.parameters(): | |||
| nn.init.normal_(p, 0, 0.1) | |||
| def forward(self, words1, words2, seq_len, target1=None): | |||
| """模型forward阶段 | |||
| @@ -337,50 +348,52 @@ class BiaffineParser(GraphParser): | |||
| :param target1: [batch_size, seq_len] gold heads given as input; only effective during training, | |||
| used to train the label classifier. If ``None``, the predicted heads are fed to the label classifier | |||
| Default: ``None`` | |||
| :return dict: parsing results:: | |||
| pred1: [batch_size, seq_len, seq_len] edge prediction logits | |||
| pred2: [batch_size, seq_len, num_label] label prediction logits | |||
| pred3: [batch_size, seq_len] predicted heads, output only when ``target1=None`` | |||
| """ | |||
| # prepare embeddings | |||
| batch_size, length = words1.shape | |||
| # print('forward {} {}'.format(batch_size, seq_len)) | |||
| # get sequence mask | |||
| mask = seq_len_to_mask(seq_len).long() | |||
| word = self.word_embedding(words1) # [N,L] -> [N,L,C_0] | |||
| pos = self.pos_embedding(words2) # [N,L] -> [N,L,C_1] | |||
| word = self.word_embedding(words1) # [N,L] -> [N,L,C_0] | |||
| pos = self.pos_embedding(words2) # [N,L] -> [N,L,C_1] | |||
| word, pos = self.word_fc(word), self.pos_fc(pos) | |||
| word, pos = self.word_norm(word), self.pos_norm(pos) | |||
| x = torch.cat([word, pos], dim=2) # -> [N,L,C] | |||
| x = torch.cat([word, pos], dim=2) # -> [N,L,C] | |||
| # encoder, extract features | |||
| if self.encoder_name.endswith('lstm'): | |||
| sort_lens, sort_idx = torch.sort(seq_len, dim=0, descending=True) | |||
| x = x[sort_idx] | |||
| x = nn.utils.rnn.pack_padded_sequence(x, sort_lens, batch_first=True) | |||
| feat, _ = self.encoder(x) # -> [N,L,C] | |||
| feat, _ = self.encoder(x) # -> [N,L,C] | |||
| feat, _ = nn.utils.rnn.pad_packed_sequence(feat, batch_first=True) | |||
| _, unsort_idx = torch.sort(sort_idx, dim=0, descending=False) | |||
| feat = feat[unsort_idx] | |||
| else: | |||
| seq_range = torch.arange(length, dtype=torch.long, device=x.device)[None,:] | |||
| seq_range = torch.arange(length, dtype=torch.long, device=x.device)[None, :] | |||
| x = x + self.position_emb(seq_range) | |||
| feat = self.encoder(x, mask.float()) | |||
| # for arc biaffine | |||
| # mlp, reduce dim | |||
| feat = self.mlp(feat) | |||
| arc_sz, label_sz = self.arc_mlp_size, self.label_mlp_size | |||
| arc_dep, arc_head = feat[:,:,:arc_sz], feat[:,:,arc_sz:2*arc_sz] | |||
| label_dep, label_head = feat[:,:,2*arc_sz:2*arc_sz+label_sz], feat[:,:,2*arc_sz+label_sz:] | |||
| arc_dep, arc_head = feat[:, :, :arc_sz], feat[:, :, arc_sz:2 * arc_sz] | |||
| label_dep, label_head = feat[:, :, 2 * arc_sz:2 * arc_sz + label_sz], feat[:, :, 2 * arc_sz + label_sz:] | |||
| # biaffine arc classifier | |||
| arc_pred = self.arc_predictor(arc_head, arc_dep) # [N, L, L] | |||
| arc_pred = self.arc_predictor(arc_head, arc_dep) # [N, L, L] | |||
| # use gold or predicted arc to predict label | |||
| if target1 is None or not self.training: | |||
| # use greedy decoding in training | |||
| @@ -390,22 +403,22 @@ class BiaffineParser(GraphParser): | |||
| heads = self.mst_decoder(arc_pred, mask) | |||
| head_pred = heads | |||
| else: | |||
| assert self.training # must be training mode | |||
| assert self.training # must be training mode | |||
| if target1 is None: | |||
| heads = self.greedy_decoder(arc_pred, mask) | |||
| head_pred = heads | |||
| else: | |||
| head_pred = None | |||
| heads = target1 | |||
| batch_range = torch.arange(start=0, end=batch_size, dtype=torch.long, device=words1.device).unsqueeze(1) | |||
| label_head = label_head[batch_range, heads].contiguous() | |||
| label_pred = self.label_predictor(label_head, label_dep) # [N, L, num_label] | |||
| label_pred = self.label_predictor(label_head, label_dep) # [N, L, num_label] | |||
| res_dict = {C.OUTPUTS(0): arc_pred, C.OUTPUTS(1): label_pred} | |||
| if head_pred is not None: | |||
| res_dict[C.OUTPUTS(2)] = head_pred | |||
| return res_dict | |||
| @staticmethod | |||
| def loss(pred1, pred2, target1, target2, seq_len): | |||
| """ | |||
| @@ -418,7 +431,7 @@ class BiaffineParser(GraphParser): | |||
| :param seq_len: [batch_size, seq_len] lengths of the gold targets | |||
| :return loss: scalar | |||
| """ | |||
| batch_size, length, _ = pred1.shape | |||
| mask = seq_len_to_mask(seq_len) | |||
| flip_mask = (mask == 0) | |||
| @@ -430,24 +443,26 @@ class BiaffineParser(GraphParser): | |||
| child_index = torch.arange(length, device=arc_logits.device, dtype=torch.long).unsqueeze(0) | |||
| arc_loss = arc_logits[batch_index, child_index, target1] | |||
| label_loss = label_logits[batch_index, child_index, target2] | |||
| byte_mask = flip_mask.byte() | |||
| arc_loss.masked_fill_(byte_mask, 0) | |||
| label_loss.masked_fill_(byte_mask, 0) | |||
| arc_nll = -arc_loss.mean() | |||
| label_nll = -label_loss.mean() | |||
| return arc_nll + label_nll | |||
| def predict(self, words1, words2, seq_len): | |||
| """模型预测API | |||
| :param words1: [batch_size, seq_len] 输入word序列 | |||
| :param words2: [batch_size, seq_len] 输入pos序列 | |||
| :param seq_len: [batch_size, seq_len] 输入序列长度 | |||
| :return dict: parsing结果:: | |||
| :return dict: parsing | |||
| 结果:: | |||
| pred1: [batch_size, seq_len] heads的预测结果 | |||
| pred2: [batch_size, seq_len, num_label] label预测logits | |||
| pred1: [batch_size, seq_len] heads的预测结果 | |||
| pred2: [batch_size, seq_len, num_label] label预测logits | |||
| """ | |||
| res = self(words1, words2, seq_len) | |||
| output = {} | |||
| @@ -470,6 +485,7 @@ class ParserLoss(LossFunc): | |||
| :param seq_len: [batch_size, seq_len] lengths of the gold targets | |||
| :return loss: scalar | |||
| """ | |||
| def __init__(self, pred1=None, pred2=None, | |||
| target1=None, target2=None, | |||
| seq_len=None): | |||
| @@ -497,9 +513,10 @@ class ParserMetric(MetricBase): | |||
| UAS: accuracy of head prediction, ignoring labels | |||
| LAS: accuracy of predicting both head and label | |||
| """ | |||
| def __init__(self, pred1=None, pred2=None, | |||
| target1=None, target2=None, seq_len=None): | |||
| super().__init__() | |||
| self._init_param_map(pred1=pred1, pred2=pred2, | |||
| target1=target1, target2=target2, | |||
| @@ -507,13 +524,13 @@ class ParserMetric(MetricBase): | |||
| self.num_arc = 0 | |||
| self.num_label = 0 | |||
| self.num_sample = 0 | |||
| def get_metric(self, reset=True): | |||
| res = {'UAS': self.num_arc*1.0 / self.num_sample, 'LAS': self.num_label*1.0 / self.num_sample} | |||
| res = {'UAS': self.num_arc * 1.0 / self.num_sample, 'LAS': self.num_label * 1.0 / self.num_sample} | |||
| if reset: | |||
| self.num_sample = self.num_label = self.num_arc = 0 | |||
| return res | |||
| def evaluate(self, pred1, pred2, target1, target2, seq_len=None): | |||
| """Evaluate the performance of prediction. | |||
| """ | |||
| @@ -522,7 +539,7 @@ class ParserMetric(MetricBase): | |||
| else: | |||
| seq_mask = seq_len_to_mask(seq_len.long()).long() | |||
| # mask out <root> tag | |||
| seq_mask[:,0] = 0 | |||
| seq_mask[:, 0] = 0 | |||
| head_pred_correct = (pred1 == target1).long() * seq_mask | |||
| label_pred_correct = (pred2 == target2).long() * head_pred_correct | |||
| self.num_arc += head_pred_correct.sum().item() | |||
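For concreteness, a worked sketch of how the masked counts in `evaluate` turn into UAS/LAS (the tensors are made up; position 0 is the masked `<root>` placeholder, and `num_sample` counts the unmasked tokens):

```python
import torch

pred1 = torch.tensor([[0, 2, 0, 2]])     # predicted heads
target1 = torch.tensor([[0, 2, 0, 1]])   # gold heads
pred2 = torch.tensor([[0, 5, 3, 7]])     # predicted labels
target2 = torch.tensor([[0, 5, 3, 7]])   # gold labels
seq_mask = torch.tensor([[0, 1, 1, 1]])  # <root> masked out

head_ok = (pred1 == target1).long() * seq_mask  # [[0, 1, 1, 0]]
label_ok = (pred2 == target2).long() * head_ok  # [[0, 1, 1, 0]]
# num_arc += 2, num_label += 2, num_sample += 3  ->  UAS = LAS = 2/3
```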
| @@ -1,12 +1,13 @@ | |||
| # python: 3.6 | |||
| # encoding: utf-8 | |||
| import torch | |||
| import torch.nn as nn | |||
| from ..core.const import Const as C | |||
| from ..core.const import Const as C | |||
| from ..modules import encoder | |||
| __all__ = [ | |||
| "CNNText" | |||
| ] | |||
| class CNNText(torch.nn.Module): | |||
| """ | |||
| @@ -23,7 +24,7 @@ class CNNText(torch.nn.Module): | |||
| :param int padding: amount of zero-padding added before and after each sentence. | |||
| :param float dropout: dropout probability | |||
| """ | |||
| def __init__(self, init_embed, | |||
| num_classes, | |||
| kernel_nums=(3, 4, 5), | |||
| @@ -31,7 +32,7 @@ class CNNText(torch.nn.Module): | |||
| padding=0, | |||
| dropout=0.5): | |||
| super(CNNText, self).__init__() | |||
| # no support for pre-trained embedding currently | |||
| self.embed = encoder.Embedding(init_embed) | |||
| self.conv_pool = encoder.ConvMaxpool( | |||
| @@ -41,7 +42,7 @@ class CNNText(torch.nn.Module): | |||
| padding=padding) | |||
| self.dropout = nn.Dropout(dropout) | |||
| self.fc = nn.Linear(sum(kernel_nums), num_classes) | |||
| def forward(self, words, seq_len=None): | |||
| """ | |||
| @@ -54,7 +55,7 @@ class CNNText(torch.nn.Module): | |||
| x = self.dropout(x) | |||
| x = self.fc(x) # [N,C] -> [N, N_class] | |||
| return {C.OUTPUT: x} | |||
| def predict(self, words, seq_len=None): | |||
| """ | |||
| :param torch.LongTensor words: [batch_size, seq_len], indices of the words in the sentence | |||
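A usage sketch for CNNText, assuming `init_embed` accepts a `(vocab_size, embed_dim)` tuple as it does for the other models in this package:

```python
import torch

model = CNNText((3000, 100), num_classes=2)  # (vocab_size, embed_dim)
words = torch.randint(0, 3000, (8, 20))      # [batch_size, seq_len]
logits = model(words)[C.OUTPUT]              # [8, 2] class logits
preds = model.predict(words)                 # dict with predicted classes
```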
| @@ -5,6 +5,7 @@ import os | |||
| import torch | |||
| import torch.nn.functional as F | |||
| from . import enas_utils as utils | |||
| from .enas_utils import Node | |||
| @@ -1,17 +1,19 @@ | |||
| # Code Modified from https://github.com/carpedm20/ENAS-pytorch | |||
| """Module containing the shared RNN model.""" | |||
| import numpy as np | |||
| """ | |||
| Module containing the shared RNN model. | |||
| Code Modified from https://github.com/carpedm20/ENAS-pytorch | |||
| """ | |||
| import collections | |||
| import numpy as np | |||
| import torch | |||
| from torch import nn | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from torch.autograd import Variable | |||
| from . import enas_utils as utils | |||
| from .base_model import BaseModel | |||
| def _get_dropped_weights(w_raw, dropout_p, is_training): | |||
| """Drops out weights to implement DropConnect. | |||
| @@ -35,12 +37,13 @@ def _get_dropped_weights(w_raw, dropout_p, is_training): | |||
| The above TODO is the reason for the hacky check for `torch.nn.Parameter`. | |||
| """ | |||
| dropped_w = F.dropout(w_raw, p=dropout_p, training=is_training) | |||
| if isinstance(dropped_w, torch.nn.Parameter): | |||
| dropped_w = dropped_w.clone() | |||
| return dropped_w | |||
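In other words, the helper drops the weights themselves (DropConnect) rather than the activations; a small sketch of a single call:

```python
import torch

w_raw = torch.nn.Parameter(torch.randn(4, 4))
w = _get_dropped_weights(w_raw, dropout_p=0.5, is_training=True)
# Roughly half the entries of `w` are zeroed and the survivors scaled
# by 1/(1-p); `w` is a plain tensor, so gradients still reach `w_raw`.
```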
| class EmbeddingDropout(torch.nn.Embedding): | |||
| """Class for dropping out embeddings by zero'ing out parameters in the | |||
| embedding matrix. | |||
| @@ -53,6 +56,7 @@ class EmbeddingDropout(torch.nn.Embedding): | |||
| See 'A Theoretically Grounded Application of Dropout in Recurrent Neural | |||
| Networks', (Gal and Ghahramani, 2016). | |||
| """ | |||
| def __init__(self, | |||
| num_embeddings, | |||
| embedding_dim, | |||
| @@ -83,14 +87,14 @@ class EmbeddingDropout(torch.nn.Embedding): | |||
| assert (dropout >= 0.0) and (dropout < 1.0), ('Dropout must be >= 0.0 ' | |||
| 'and < 1.0') | |||
| self.scale = scale | |||
| def forward(self, inputs): # pylint:disable=arguments-differ | |||
| """Embeds `inputs` with the dropped out embedding weight matrix.""" | |||
| if self.training: | |||
| dropout = self.dropout | |||
| else: | |||
| dropout = 0 | |||
| if dropout: | |||
| mask = self.weight.data.new(self.weight.size(0), 1) | |||
| mask.bernoulli_(1 - dropout) | |||
| @@ -101,7 +105,7 @@ class EmbeddingDropout(torch.nn.Embedding): | |||
| masked_weight = self.weight | |||
| if self.scale and self.scale != 1: | |||
| masked_weight = masked_weight * self.scale | |||
| return F.embedding(inputs, | |||
| masked_weight, | |||
| max_norm=self.max_norm, | |||
| @@ -114,7 +118,7 @@ class LockedDropout(nn.Module): | |||
| # code from https://github.com/salesforce/awd-lstm-lm/blob/master/locked_dropout.py | |||
| def __init__(self): | |||
| super().__init__() | |||
| def forward(self, x, dropout=0.5): | |||
| if not self.training or not dropout: | |||
| return x | |||
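LockedDropout ("variational" dropout) samples one mask per sequence and reuses it at every timestep instead of resampling per step; the truncated body above is equivalent in spirit to this sketch:

```python
import torch

def locked_dropout(x, p=0.5):
    # x: [seq_len, batch_size, hidden]; one mask shared across seq_len
    mask = x.new_empty(1, x.size(1), x.size(2)).bernoulli_(1 - p) / (1 - p)
    return x * mask
```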
| @@ -126,11 +130,12 @@ class LockedDropout(nn.Module): | |||
| class ENASModel(BaseModel): | |||
| """Shared RNN model.""" | |||
| def __init__(self, embed_num, num_classes, num_blocks=4, cuda=False, shared_hid=1000, shared_embed=1000): | |||
| super(ENASModel, self).__init__() | |||
| self.use_cuda = cuda | |||
| self.shared_hid = shared_hid | |||
| self.num_blocks = num_blocks | |||
| self.decoder = nn.Linear(self.shared_hid, num_classes) | |||
| @@ -139,16 +144,16 @@ class ENASModel(BaseModel): | |||
| dropout=0.1) | |||
| self.lockdrop = LockedDropout() | |||
| self.dag = None | |||
| # Tie weights | |||
| # self.decoder.weight = self.encoder.weight | |||
| # Since W^{x, c} and W^{h, c} are always summed, there | |||
| # is no point duplicating their bias offset parameter. Likewise for | |||
| # W^{x, h} and W^{h, h}. | |||
| self.w_xc = nn.Linear(shared_embed, self.shared_hid) | |||
| self.w_xh = nn.Linear(shared_embed, self.shared_hid) | |||
| # The raw weights are stored here because the hidden-to-hidden weights | |||
| # are weight dropped on the forward pass. | |||
| self.w_hc_raw = torch.nn.Parameter( | |||
| @@ -157,10 +162,10 @@ class ENASModel(BaseModel): | |||
| torch.Tensor(self.shared_hid, self.shared_hid)) | |||
| self.w_hc = None | |||
| self.w_hh = None | |||
| self.w_h = collections.defaultdict(dict) | |||
| self.w_c = collections.defaultdict(dict) | |||
| for idx in range(self.num_blocks): | |||
| for jdx in range(idx + 1, self.num_blocks): | |||
| self.w_h[idx][jdx] = nn.Linear(self.shared_hid, | |||
| @@ -169,48 +174,47 @@ class ENASModel(BaseModel): | |||
| self.w_c[idx][jdx] = nn.Linear(self.shared_hid, | |||
| self.shared_hid, | |||
| bias=False) | |||
| self._w_h = nn.ModuleList([self.w_h[idx][jdx] | |||
| for idx in self.w_h | |||
| for jdx in self.w_h[idx]]) | |||
| self._w_c = nn.ModuleList([self.w_c[idx][jdx] | |||
| for idx in self.w_c | |||
| for jdx in self.w_c[idx]]) | |||
| self.batch_norm = None | |||
| # if args.mode == 'train': | |||
| # self.batch_norm = nn.BatchNorm1d(self.shared_hid) | |||
| # else: | |||
| # self.batch_norm = None | |||
| self.reset_parameters() | |||
| self.static_init_hidden = utils.keydefaultdict(self.init_hidden) | |||
| def setDAG(self, dag): | |||
| if self.dag is None: | |||
| self.dag = dag | |||
| def forward(self, word_seq, hidden=None): | |||
| inputs = torch.transpose(word_seq, 0, 1) | |||
| time_steps = inputs.size(0) | |||
| batch_size = inputs.size(1) | |||
| self.w_hh = _get_dropped_weights(self.w_hh_raw, | |||
| 0.5, | |||
| self.training) | |||
| self.w_hc = _get_dropped_weights(self.w_hc_raw, | |||
| 0.5, | |||
| self.training) | |||
| # hidden = self.static_init_hidden[batch_size] if hidden is None else hidden | |||
| hidden = self.static_init_hidden[batch_size] | |||
| embed = self.encoder(inputs) | |||
| embed = self.lockdrop(embed, 0.65 if self.training else 0) | |||
| # The norm of hidden states are clipped here because | |||
| # otherwise ENAS is especially prone to exploding activations on the | |||
| # forward pass. This could probably be fixed in a more elegant way, but | |||
| @@ -226,7 +230,7 @@ class ENASModel(BaseModel): | |||
| for step in range(time_steps): | |||
| x_t = embed[step] | |||
| logit, hidden = self.cell(x_t, hidden, self.dag) | |||
| hidden_norms = hidden.norm(dim=-1) | |||
| max_norm = 25.0 | |||
| if hidden_norms.data.max() > max_norm: | |||
| @@ -237,60 +241,60 @@ class ENASModel(BaseModel): | |||
| # because the PyTorch slicing and slice assignment is too | |||
| # flaky. | |||
| hidden_norms = hidden_norms.data.cpu().numpy() | |||
| clipped_num += 1 | |||
| if hidden_norms.max() > max_clipped_norm: | |||
| max_clipped_norm = hidden_norms.max() | |||
| clip_select = hidden_norms > max_norm | |||
| clip_norms = hidden_norms[clip_select] | |||
| mask = np.ones(hidden.size()) | |||
| normalizer = max_norm/clip_norms | |||
| normalizer = max_norm / clip_norms | |||
| normalizer = normalizer[:, np.newaxis] | |||
| mask[clip_select] = normalizer | |||
| if self.use_cuda: | |||
| hidden *= torch.autograd.Variable( | |||
| torch.FloatTensor(mask).cuda(), requires_grad=False) | |||
| else: | |||
| hidden *= torch.autograd.Variable( | |||
| torch.FloatTensor(mask), requires_grad=False) | |||
| torch.FloatTensor(mask), requires_grad=False) | |||
| logits.append(logit) | |||
| h1tohT.append(hidden) | |||
| h1tohT = torch.stack(h1tohT) | |||
| output = torch.stack(logits) | |||
| raw_output = output | |||
| output = self.lockdrop(output, 0.4 if self.training else 0) | |||
| #Pooling | |||
| # Pooling | |||
| output = torch.mean(output, 0) | |||
| decoded = self.decoder(output) | |||
| extra_out = {'dropped': decoded, | |||
| 'hiddens': h1tohT, | |||
| 'raw': raw_output} | |||
| return {'pred': decoded, 'hidden': hidden, 'extra_out': extra_out} | |||
| def cell(self, x, h_prev, dag): | |||
| """Computes a single pass through the discovered RNN cell.""" | |||
| c = {} | |||
| h = {} | |||
| f = {} | |||
| f[0] = self.get_f(dag[-1][0].name) | |||
| c[0] = torch.sigmoid(self.w_xc(x) + F.linear(h_prev, self.w_hc, None)) | |||
| h[0] = (c[0]*f[0](self.w_xh(x) + F.linear(h_prev, self.w_hh, None)) + | |||
| (1 - c[0])*h_prev) | |||
| h[0] = (c[0] * f[0](self.w_xh(x) + F.linear(h_prev, self.w_hh, None)) + | |||
| (1 - c[0]) * h_prev) | |||
| leaf_node_ids = [] | |||
| q = collections.deque() | |||
| q.append(0) | |||
| # Computes connections from the parent nodes `node_id` | |||
| # to their child nodes `next_id` recursively, skipping leaf nodes. A | |||
| # leaf node is a node whose id == `self.num_blocks`. | |||
| @@ -306,10 +310,10 @@ class ENASModel(BaseModel): | |||
| while True: | |||
| if len(q) == 0: | |||
| break | |||
| node_id = q.popleft() | |||
| nodes = dag[node_id] | |||
| for next_node in nodes: | |||
| next_id = next_node.id | |||
| if next_id == self.num_blocks: | |||
| @@ -317,38 +321,38 @@ class ENASModel(BaseModel): | |||
| assert len(nodes) == 1, ('parent of leaf node should have ' | |||
| 'only one child') | |||
| continue | |||
| w_h = self.w_h[node_id][next_id] | |||
| w_c = self.w_c[node_id][next_id] | |||
| f[next_id] = self.get_f(next_node.name) | |||
| c[next_id] = torch.sigmoid(w_c(h[node_id])) | |||
| h[next_id] = (c[next_id]*f[next_id](w_h(h[node_id])) + | |||
| (1 - c[next_id])*h[node_id]) | |||
| h[next_id] = (c[next_id] * f[next_id](w_h(h[node_id])) + | |||
| (1 - c[next_id]) * h[node_id]) | |||
| q.append(next_id) | |||
| # Instead of averaging loose ends, perhaps there should | |||
| # be a set of separate unshared weights for each "loose" connection | |||
| # between each node in a cell and the output. | |||
| # | |||
| # As it stands, all weights W^h_{ij} are doing double duty by | |||
| # connecting both from i to j, as well as from i to the output. | |||
| # average all the loose ends | |||
| leaf_nodes = [h[node_id] for node_id in leaf_node_ids] | |||
| output = torch.mean(torch.stack(leaf_nodes, 2), -1) | |||
| # stabilizing the updates of omega | |||
| if self.batch_norm is not None: | |||
| output = self.batch_norm(output) | |||
| return output, h[self.num_blocks - 1] | |||
| def init_hidden(self, batch_size): | |||
| zeros = torch.zeros(batch_size, self.shared_hid) | |||
| return utils.get_variable(zeros, self.use_cuda, requires_grad=False) | |||
| def get_f(self, name): | |||
| name = name.lower() | |||
| if name == 'relu': | |||
| @@ -360,22 +364,21 @@ class ENASModel(BaseModel): | |||
| elif name == 'sigmoid': | |||
| f = torch.sigmoid | |||
| return f | |||
| @property | |||
| def num_parameters(self): | |||
| def size(p): | |||
| return np.prod(p.size()) | |||
| return sum([size(param) for param in self.parameters()]) | |||
| def reset_parameters(self): | |||
| init_range = 0.025 | |||
| # init_range = 0.025 if self.args.mode == 'train' else 0.04 | |||
| for param in self.parameters(): | |||
| param.data.uniform_(-init_range, init_range) | |||
| self.decoder.bias.data.fill_(0) | |||
| def predict(self, word_seq): | |||
| """ | |||
| @@ -1,12 +1,12 @@ | |||
| # Code Modified from https://github.com/carpedm20/ENAS-pytorch | |||
| import time | |||
| from datetime import datetime | |||
| from datetime import timedelta | |||
| import math | |||
| import numpy as np | |||
| import time | |||
| import torch | |||
| import math | |||
| from datetime import datetime, timedelta | |||
| from torch.optim import Adam | |||
| try: | |||
| from tqdm.auto import tqdm | |||
| @@ -21,8 +21,6 @@ from ..core.utils import _move_dict_value_to_device | |||
| from . import enas_utils as utils | |||
| from ..core.utils import _build_args | |||
| from torch.optim import Adam | |||
| def _get_no_grad_ctx_mgr(): | |||
| """Returns a the `torch.no_grad` context manager for PyTorch version >= | |||
| @@ -33,6 +31,7 @@ def _get_no_grad_ctx_mgr(): | |||
| class ENASTrainer(Trainer): | |||
| """A class to wrap training code.""" | |||
| def __init__(self, train_data, model, controller, **kwargs): | |||
| """Constructor for training algorithm. | |||
| :param DataSet train_data: the training data | |||
| @@ -45,19 +44,19 @@ class ENASTrainer(Trainer): | |||
| self.controller_step = 0 | |||
| self.shared_step = 0 | |||
| self.max_length = 35 | |||
| self.shared = model | |||
| self.controller = controller | |||
| self.shared_optim = Adam( | |||
| self.shared.parameters(), | |||
| lr=20.0, | |||
| weight_decay=1e-7) | |||
| self.controller_optim = Adam( | |||
| self.controller.parameters(), | |||
| lr=3.5e-4) | |||
| def train(self, load_best_model=True): | |||
| """ | |||
| :param bool load_best_model: this parameter only takes effect if dev_data was provided at initialization; if True, the trainer reloads, before returning, the model with the best dev performance | |||
| @@ -82,21 +81,22 @@ class ENASTrainer(Trainer): | |||
| self.model = self.model.cuda() | |||
| self._model_device = self.model.parameters().__next__().device | |||
| self._mode(self.model, is_test=False) | |||
| self.start_time = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S')) | |||
| start_time = time.time() | |||
| print("training epochs started " + self.start_time, flush=True) | |||
| try: | |||
| self.callback_manager.on_train_begin() | |||
| self._train() | |||
| self.callback_manager.on_train_end() | |||
| except (CallbackException, KeyboardInterrupt) as e: | |||
| self.callback_manager.on_exception(e) | |||
| if self.dev_data is not None: | |||
| print("\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step) + | |||
| self.tester._format_eval_results(self.best_dev_perf),) | |||
| print( | |||
| "\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step) + | |||
| self.tester._format_eval_results(self.best_dev_perf), ) | |||
| results['best_eval'] = self.best_dev_perf | |||
| results['best_epoch'] = self.best_dev_epoch | |||
| results['best_step'] = self.best_dev_step | |||
| @@ -110,9 +110,9 @@ class ENASTrainer(Trainer): | |||
| finally: | |||
| pass | |||
| results['seconds'] = round(time.time() - start_time, 2) | |||
| return results | |||
| def _train(self): | |||
| if not self.use_tqdm: | |||
| from fastNLP.core.utils import _pseudo_tqdm as inner_tqdm | |||
| @@ -126,21 +126,21 @@ class ENASTrainer(Trainer): | |||
| avg_loss = 0 | |||
| data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, | |||
| prefetch=self.prefetch) | |||
| for epoch in range(1, self.n_epochs+1): | |||
| for epoch in range(1, self.n_epochs + 1): | |||
| pbar.set_description_str(desc="Epoch {}/{}".format(epoch, self.n_epochs)) | |||
| last_stage = (epoch > self.n_epochs + 1 - self.final_epochs) | |||
| if epoch == self.n_epochs + 1 - self.final_epochs: | |||
| print('Entering the final stage. (Only train the selected structure)') | |||
| # early stopping | |||
| self.callback_manager.on_epoch_begin() | |||
| # 1. Training the shared parameters omega of the child models | |||
| self.train_shared(pbar) | |||
| # 2. Training the controller parameters theta | |||
| if not last_stage: | |||
| self.train_controller() | |||
| if ((self.validate_every > 0 and self.step % self.validate_every == 0) or | |||
| (self.validate_every < 0 and self.step % len(data_iterator) == 0)) \ | |||
| and self.dev_data is not None: | |||
| @@ -149,16 +149,15 @@ class ENASTrainer(Trainer): | |||
| eval_res = self._do_validation(epoch=epoch, step=self.step) | |||
| eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, | |||
| total_steps) + \ | |||
| self.tester._format_eval_results(eval_res) | |||
| self.tester._format_eval_results(eval_res) | |||
| pbar.write(eval_str) | |||
| # lr decay; early stopping | |||
| self.callback_manager.on_epoch_end() | |||
| # =============== epochs end =================== # | |||
| pbar.close() | |||
| # ============ tqdm end ============== # | |||
| def get_loss(self, inputs, targets, hidden, dags): | |||
| """Computes the loss for the same batch for M models. | |||
| @@ -167,7 +166,7 @@ class ENASTrainer(Trainer): | |||
| """ | |||
| if not isinstance(dags, list): | |||
| dags = [dags] | |||
| loss = 0 | |||
| for dag in dags: | |||
| self.shared.setDAG(dag) | |||
| @@ -175,14 +174,14 @@ class ENASTrainer(Trainer): | |||
| inputs['hidden'] = hidden | |||
| result = self.shared(**inputs) | |||
| output, hidden, extra_out = result['pred'], result['hidden'], result['extra_out'] | |||
| self.callback_manager.on_loss_begin(targets, result) | |||
| sample_loss = self._compute_loss(result, targets) | |||
| loss += sample_loss | |||
| assert len(dags) == 1, 'there are multiple `hidden` for multiple `dags`' | |||
| return loss, hidden, extra_out | |||
| def train_shared(self, pbar=None, max_step=None, dag=None): | |||
| """Train the language model for 400 steps of minibatches of 64 | |||
| examples. | |||
| @@ -200,9 +199,9 @@ class ENASTrainer(Trainer): | |||
| model = self.shared | |||
| model.train() | |||
| self.controller.eval() | |||
| hidden = self.shared.init_hidden(self.batch_size) | |||
| abs_max_grad = 0 | |||
| abs_max_hidden_norm = 0 | |||
| step = 0 | |||
| @@ -211,15 +210,15 @@ class ENASTrainer(Trainer): | |||
| train_idx = 0 | |||
| avg_loss = 0 | |||
| data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, | |||
| prefetch=self.prefetch) | |||
| prefetch=self.prefetch) | |||
| for batch_x, batch_y in data_iterator: | |||
| _move_dict_value_to_device(batch_x, batch_y, device=self._model_device) | |||
| indices = data_iterator.get_batch_indices() | |||
| # negative sampling; replace unknown; re-weight batch_y | |||
| self.callback_manager.on_batch_begin(batch_x, batch_y, indices) | |||
| # prediction = self._data_forward(self.model, batch_x) | |||
| dags = self.controller.sample(1) | |||
| inputs, targets = batch_x, batch_y | |||
| # self.callback_manager.on_loss_begin(batch_y, prediction) | |||
| @@ -228,18 +227,18 @@ class ENASTrainer(Trainer): | |||
| hidden, | |||
| dags) | |||
| hidden.detach_() | |||
| avg_loss += loss.item() | |||
| # Is loss NaN or inf? requires_grad = False | |||
| self.callback_manager.on_backward_begin(loss) | |||
| self._grad_backward(loss) | |||
| self.callback_manager.on_backward_end() | |||
| self._update() | |||
| self.callback_manager.on_step_end() | |||
| if (self.step+1) % self.print_every == 0: | |||
| if (self.step + 1) % self.print_every == 0: | |||
| if self.use_tqdm: | |||
| print_output = "loss:{0:<6.5f}".format(avg_loss / self.print_every) | |||
| pbar.update(self.print_every) | |||
| @@ -255,30 +254,29 @@ class ENASTrainer(Trainer): | |||
| self.shared_step += 1 | |||
| self.callback_manager.on_batch_end() | |||
| # ================= mini-batch end ==================== # | |||
| def get_reward(self, dag, entropies, hidden, valid_idx=0): | |||
| """Computes the perplexity of a single sampled model on a minibatch of | |||
| validation data. | |||
| """ | |||
| if not isinstance(entropies, np.ndarray): | |||
| entropies = entropies.data.cpu().numpy() | |||
| data_iterator = Batch(self.dev_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, | |||
| prefetch=self.prefetch) | |||
| prefetch=self.prefetch) | |||
| for inputs, targets in data_iterator: | |||
| valid_loss, hidden, _ = self.get_loss(inputs, targets, hidden, dag) | |||
| valid_loss = utils.to_item(valid_loss.data) | |||
| valid_ppl = math.exp(valid_loss) | |||
| R = 80 / valid_ppl | |||
| rewards = R + 1e-4 * entropies | |||
| return rewards, hidden | |||
| def train_controller(self): | |||
| """Fixes the shared parameters and updates the controller parameters. | |||
| @@ -296,13 +294,13 @@ class ENASTrainer(Trainer): | |||
| # Why can't we call shared.eval() here? Leads to loss | |||
| # being uniformly zero for the controller. | |||
| # self.shared.eval() | |||
| avg_reward_base = None | |||
| baseline = None | |||
| adv_history = [] | |||
| entropy_history = [] | |||
| reward_history = [] | |||
| hidden = self.shared.init_hidden(self.batch_size) | |||
| total_loss = 0 | |||
| valid_idx = 0 | |||
| @@ -310,7 +308,7 @@ class ENASTrainer(Trainer): | |||
| # sample models | |||
| dags, log_probs, entropies = self.controller.sample( | |||
| with_details=True) | |||
| # calculate reward | |||
| np_entropies = entropies.data.cpu().numpy() | |||
| # No gradients should be backpropagated to the | |||
| @@ -320,40 +318,39 @@ class ENASTrainer(Trainer): | |||
| np_entropies, | |||
| hidden, | |||
| valid_idx) | |||
| reward_history.extend(rewards) | |||
| entropy_history.extend(np_entropies) | |||
| # moving average baseline | |||
| if baseline is None: | |||
| baseline = rewards | |||
| else: | |||
| decay = 0.95 | |||
| baseline = decay * baseline + (1 - decay) * rewards | |||
| adv = rewards - baseline | |||
| adv_history.extend(adv) | |||
| # policy loss | |||
| loss = -log_probs*utils.get_variable(adv, | |||
| 'cuda' in self.device, | |||
| requires_grad=False) | |||
| loss = -log_probs * utils.get_variable(adv, | |||
| 'cuda' in self.device, | |||
| requires_grad=False) | |||
| loss = loss.sum() # or loss.mean() | |||
| # update | |||
| self.controller_optim.zero_grad() | |||
| loss.backward() | |||
| self.controller_optim.step() | |||
| total_loss += utils.to_item(loss.data) | |||
| if ((step % 50) == 0) and (step > 0): | |||
| reward_history, adv_history, entropy_history = [], [], [] | |||
| total_loss = 0 | |||
| self.controller_step += 1 | |||
| # prev_valid_idx = valid_idx | |||
| # valid_idx = ((valid_idx + self.max_length) % | |||
| @@ -362,16 +359,16 @@ class ENASTrainer(Trainer): | |||
| # # validation data, we reset the hidden states. | |||
| # if prev_valid_idx > valid_idx: | |||
| # hidden = self.shared.init_hidden(self.batch_size) | |||
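The block above is REINFORCE with an exponential moving-average baseline: the baseline tracks the running reward, and only the advantage (reward minus baseline) scales the log-probabilities. A self-contained sketch of that update on made-up tensors:

```python
import torch

log_probs = torch.randn(5, requires_grad=True)  # stand-in controller log-probs
baseline, decay = None, 0.95

for _ in range(3):
    rewards = torch.rand(5)                     # stand-in rewards, new each round
    if baseline is None:
        baseline = rewards
    else:
        baseline = decay * baseline + (1 - decay) * rewards
    adv = rewards - baseline        # no gradient flows through the advantage
    loss = (-log_probs * adv).sum()
    loss.backward()                 # accumulates the policy gradient
```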
| def derive(self, sample_num=10, valid_idx=0): | |||
| """We are always deriving based on the very first batch | |||
| of validation data? This seems wrong... | |||
| """ | |||
| hidden = self.shared.init_hidden(self.batch_size) | |||
| dags, _, entropies = self.controller.sample(sample_num, | |||
| with_details=True) | |||
| max_R = 0 | |||
| best_dag = None | |||
| for dag in dags: | |||
| @@ -379,5 +376,5 @@ class ENASTrainer(Trainer): | |||
| if R.max() > max_R: | |||
| max_R = R.max() | |||
| best_dag = dag | |||
| self.model.setDAG(best_dag) | |||
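`derive` keeps the single sampled architecture with the highest validation reward. A toy version of the selection logic (DAG names and reward values are invented; `rewards[dag]` stands in for `self.get_reward(dag, ...)`):

```python
import numpy as np

candidates = ["dag_a", "dag_b", "dag_c"]
rewards = {"dag_a": np.array([1.2]), "dag_b": np.array([1.9]), "dag_c": np.array([0.7])}

max_R, best_dag = 0, None
for dag in candidates:
    R = rewards[dag]               # stands in for self.get_reward(dag, ...)
    if R.max() > max_R:
        max_R, best_dag = R.max(), dag
# best_dag == "dag_b"
```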
| @@ -1,12 +1,10 @@ | |||
| # Code Modified from https://github.com/carpedm20/ENAS-pytorch | |||
| from __future__ import print_function | |||
| from collections import defaultdict | |||
| import collections | |||
| import numpy as np | |||
| import torch | |||
| from torch.autograd import Variable | |||
| @@ -1,11 +1,19 @@ | |||
| """ | |||
| This module implements two sequence labeling models. | |||
| """ | |||
| import torch | |||
| import torch.nn as nn | |||
| from .base_model import BaseModel | |||
| from ..modules import decoder, encoder | |||
| from ..modules.decoder.CRF import allowed_transitions | |||
| from ..core.utils import seq_len_to_mask | |||
| from ..core.const import Const as C | |||
| from torch import nn | |||
| __all__ = [ | |||
| "SeqLabeling", | |||
| "AdvSeqLabel" | |||
| ] | |||
| class SeqLabeling(BaseModel): | |||
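The sequence_labeling hunk above only shows the module header. For orientation, a hypothetical usage sketch, assuming the `SeqLabeling` constructor takes `(init_embed, hidden_size, num_classes)` and accepts a `(vocab_size, embed_dim)` tuple for `init_embed`, as elsewhere in this version of the library:

```python
from fastNLP.models.sequence_labeling import SeqLabeling

# All sizes below are invented for illustration.
model = SeqLabeling(init_embed=(1000, 100), hidden_size=200, num_classes=10)
```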
| @@ -8,6 +8,9 @@ from ..modules import encoder as Encoder | |||
| from ..modules import aggregator as Aggregator | |||
| from ..core.utils import seq_len_to_mask | |||
| __all__ = [ | |||
| "ESIM" | |||
| ] | |||
| my_inf = 10e12 | |||
| @@ -26,7 +29,7 @@ class ESIM(BaseModel): | |||
| :param int num_classes: number of labels; default 3 | |||
| :param numpy.array init_embedding: initial word embedding matrix of shape (vocab_size, embed_dim); default None, i.e. the embedding matrix is randomly initialized | |||
| """ | |||
| def __init__(self, vocab_size, embed_dim, hidden_size, dropout=0.0, num_classes=3, init_embedding=None): | |||
| super(ESIM, self).__init__() | |||
| @@ -35,35 +38,36 @@ class ESIM(BaseModel): | |||
| self.hidden_size = hidden_size | |||
| self.dropout = dropout | |||
| self.n_labels = num_classes | |||
| self.drop = nn.Dropout(self.dropout) | |||
| self.embedding = Encoder.Embedding( | |||
| (self.vocab_size, self.embed_dim), dropout=self.dropout, | |||
| ) | |||
| self.embedding_layer = nn.Linear(self.embed_dim, self.hidden_size) | |||
| self.encoder = Encoder.LSTM( | |||
| input_size=self.embed_dim, hidden_size=self.hidden_size, num_layers=1, bias=True, | |||
| batch_first=True, bidirectional=True | |||
| ) | |||
| self.bi_attention = Aggregator.BiAttention() | |||
| self.mean_pooling = Aggregator.AvgPoolWithMask() | |||
| self.max_pooling = Aggregator.MaxPoolWithMask() | |||
| self.inference_layer = nn.Linear(self.hidden_size * 4, self.hidden_size) | |||
| self.decoder = Encoder.LSTM( | |||
| input_size=self.hidden_size, hidden_size=self.hidden_size, num_layers=1, bias=True, | |||
| batch_first=True, bidirectional=True | |||
| ) | |||
| self.output = Decoder.MLP([4 * self.hidden_size, self.hidden_size, self.n_labels], 'tanh', dropout=self.dropout) | |||
| def forward(self, words1, words2, seq_len1=None, seq_len2=None, target=None): | |||
| """ Forward function | |||
| :param torch.Tensor words1: [batch size(B), premise seq len(PL)] token representation of the premise | |||
| :param torch.Tensor words2: [B, hypothesis seq len(HL)] token representation of the hypothesis | |||
| :param torch.LongTensor seq_len1: [B] lengths of the premises | |||
| @@ -71,10 +75,10 @@ class ESIM(BaseModel): | |||
| :param torch.LongTensor target: [B] ground-truth labels | |||
| :return: dict prediction: [B, n_labels(N)] predicted results | |||
| """ | |||
| premise0 = self.embedding_layer(self.embedding(words1)) | |||
| hypothesis0 = self.embedding_layer(self.embedding(words2)) | |||
| if seq_len1 is not None: | |||
| seq_len1 = seq_len_to_mask(seq_len1) | |||
| else: | |||
| @@ -85,55 +89,55 @@ class ESIM(BaseModel): | |||
| else: | |||
| seq_len2 = torch.ones(hypothesis0.size(0), hypothesis0.size(1)) | |||
| seq_len2 = (seq_len2.long()).to(device=hypothesis0.device) | |||
| _BP, _PSL, _HP = premise0.size() | |||
| _BH, _HSL, _HH = hypothesis0.size() | |||
| _BPL, _PLL = seq_len1.size() | |||
| _HPL, _HLL = seq_len2.size() | |||
| assert _BP == _BH and _BPL == _HPL and _BP == _BPL | |||
| assert _HP == _HH | |||
| assert _PSL == _PLL and _HSL == _HLL | |||
| B, PL, H = premise0.size() | |||
| B, HL, H = hypothesis0.size() | |||
| a0 = self.encoder(self.drop(premise0)) # a0: [B, PL, H * 2] | |||
| b0 = self.encoder(self.drop(hypothesis0)) # b0: [B, HL, H * 2] | |||
| a = torch.mean(a0.view(B, PL, -1, H), dim=2) # a: [B, PL, H] | |||
| b = torch.mean(b0.view(B, HL, -1, H), dim=2) # b: [B, HL, H] | |||
| ai, bi = self.bi_attention(a, b, seq_len1, seq_len2) | |||
| ma = torch.cat((a, ai, a - ai, a * ai), dim=2) # ma: [B, PL, 4 * H] | |||
| mb = torch.cat((b, bi, b - bi, b * bi), dim=2) # mb: [B, HL, 4 * H] | |||
| f_ma = self.inference_layer(ma) | |||
| f_mb = self.inference_layer(mb) | |||
| vat = self.decoder(self.drop(f_ma)) | |||
| vbt = self.decoder(self.drop(f_mb)) | |||
| va = torch.mean(vat.view(B, PL, -1, H), dim=2) # va: [B, PL, H] | |||
| vb = torch.mean(vbt.view(B, HL, -1, H), dim=2) # vb: [B, HL, H] | |||
| va_ave = self.mean_pooling(va, seq_len1, dim=1) # va_ave: [B, H] | |||
| va_max, va_arg_max = self.max_pooling(va, seq_len1, dim=1) # va_max: [B, H] | |||
| vb_ave = self.mean_pooling(vb, seq_len2, dim=1) # vb_ave: [B, H] | |||
| vb_max, vb_arg_max = self.max_pooling(vb, seq_len2, dim=1) # vb_max: [B, H] | |||
| v = torch.cat((va_ave, va_max, vb_ave, vb_max), dim=1) # v: [B, 4 * H] | |||
| prediction = torch.tanh(self.output(v)) # prediction: [B, N] | |||
| if target is not None: | |||
| func = nn.CrossEntropyLoss() | |||
| loss = func(prediction, target) | |||
| return {Const.OUTPUT: prediction, Const.LOSS: loss} | |||
| return {Const.OUTPUT: prediction} | |||
| def predict(self, words1, words2, seq_len1=None, seq_len2=None, target=None): | |||
| """ Predict function | |||
| @@ -146,4 +150,3 @@ class ESIM(BaseModel): | |||
| """ | |||
| prediction = self.forward(words1, words2, seq_len1, seq_len2)[Const.OUTPUT] | |||
| return {Const.OUTPUT: torch.argmax(prediction, dim=-1)} | |||
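The core of `forward` above is ESIM's local-inference enhancement: each encoded token is concatenated with its attention-aligned counterpart plus their difference and element-wise product. A stand-alone shape check on random tensors (all sizes invented):

```python
import torch

B, PL, H = 2, 7, 16
a = torch.randn(B, PL, H)    # encoded premise
ai = torch.randn(B, PL, H)   # premise aligned to the hypothesis via bi-attention
ma = torch.cat((a, ai, a - ai, a * ai), dim=2)
print(ma.shape)              # torch.Size([2, 7, 64]) == [B, PL, 4 * H]
```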
| @@ -1,17 +1,25 @@ | |||
| """Star-Transformer 的 一个 Pytorch 实现. | |||
| """ | |||
| A PyTorch implementation of the Star-Transformer. | |||
| """ | |||
| import torch | |||
| from torch import nn | |||
| from ..modules.encoder.star_transformer import StarTransformer | |||
| from ..core.utils import seq_len_to_mask | |||
| from ..modules.utils import get_embeddings | |||
| from ..core.const import Const | |||
| import torch | |||
| from torch import nn | |||
| __all__ = [ | |||
| "StarTransEnc", | |||
| "STNLICls", | |||
| "STSeqCls", | |||
| "STSeqLabel", | |||
| ] | |||
| class StarTransEnc(nn.Module): | |||
| """ | |||
| Alias: :class:`fastNLP.models.StarTransEnc` :class:`fastNLP.models.start_transformer.StarTransEnc` | |||
| Alias: :class:`fastNLP.models.StarTransEnc` :class:`fastNLP.models.star_transformer.StarTransEnc` | |||
| Star-Transformer encoder with word embedding | |||
| @@ -28,6 +36,7 @@ class StarTransEnc(nn.Module): | |||
| :param emb_dropout: dropout probability of the word embedding. | |||
| :param dropout: dropout probability for the model, excluding the word embedding. | |||
| """ | |||
| def __init__(self, init_embed, | |||
| hidden_size, | |||
| num_layers, | |||
| @@ -47,7 +56,7 @@ class StarTransEnc(nn.Module): | |||
| head_dim=head_dim, | |||
| dropout=dropout, | |||
| max_len=max_len) | |||
| def forward(self, x, mask): | |||
| """ | |||
| :param FloatTensor x: [batch, length, hidden] the input sequence | |||
| @@ -72,7 +81,7 @@ class _Cls(nn.Module): | |||
| nn.Dropout(dropout), | |||
| nn.Linear(hid_dim, num_cls), | |||
| ) | |||
| def forward(self, x): | |||
| h = self.fc(x) | |||
| return h | |||
| @@ -83,20 +92,21 @@ class _NLICls(nn.Module): | |||
| super(_NLICls, self).__init__() | |||
| self.fc = nn.Sequential( | |||
| nn.Dropout(dropout), | |||
| nn.Linear(in_dim*4, hid_dim), #4 | |||
| nn.Linear(in_dim * 4, hid_dim), # 4 | |||
| nn.LeakyReLU(), | |||
| nn.Dropout(dropout), | |||
| nn.Linear(hid_dim, num_cls), | |||
| ) | |||
| def forward(self, x1, x2): | |||
| x = torch.cat([x1, x2, torch.abs(x1-x2), x1*x2], 1) | |||
| x = torch.cat([x1, x2, torch.abs(x1 - x2), x1 * x2], 1) | |||
| h = self.fc(x) | |||
| return h | |||
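`_NLICls` compares the two sentence vectors with the usual matching features: concatenation, absolute difference, and element-wise product. On made-up inputs:

```python
import torch

x1, x2 = torch.randn(2, 300), torch.randn(2, 300)
x = torch.cat([x1, x2, torch.abs(x1 - x2), x1 * x2], 1)
print(x.shape)               # torch.Size([2, 1200]) == [batch, in_dim * 4]
```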
| class STSeqLabel(nn.Module): | |||
| """ | |||
| Alias: :class:`fastNLP.models.STSeqLabel` :class:`fastNLP.models.start_transformer.STSeqLabel` | |||
| Alias: :class:`fastNLP.models.STSeqLabel` :class:`fastNLP.models.star_transformer.STSeqLabel` | |||
| Star-Transformer model for sequence labeling | |||
| @@ -112,6 +122,7 @@ class STSeqLabel(nn.Module): | |||
| :param emb_dropout: dropout probability of the word embedding. Default: 0.1 | |||
| :param dropout: dropout probability for the model, excluding the word embedding. Default: 0.1 | |||
| """ | |||
| def __init__(self, init_embed, num_cls, | |||
| hidden_size=300, | |||
| num_layers=4, | |||
| @@ -120,7 +131,7 @@ class STSeqLabel(nn.Module): | |||
| max_len=512, | |||
| cls_hidden_size=600, | |||
| emb_dropout=0.1, | |||
| dropout=0.1,): | |||
| dropout=0.1, ): | |||
| super(STSeqLabel, self).__init__() | |||
| self.enc = StarTransEnc(init_embed=init_embed, | |||
| hidden_size=hidden_size, | |||
| @@ -131,7 +142,7 @@ class STSeqLabel(nn.Module): | |||
| emb_dropout=emb_dropout, | |||
| dropout=dropout) | |||
| self.cls = _Cls(hidden_size, num_cls, cls_hidden_size) | |||
| def forward(self, words, seq_len): | |||
| """ | |||
| @@ -142,9 +153,9 @@ class STSeqLabel(nn.Module): | |||
| mask = seq_len_to_mask(seq_len) | |||
| nodes, _ = self.enc(words, mask) | |||
| output = self.cls(nodes) | |||
| output = output.transpose(1,2) # make hidden to be dim 1 | |||
| return {Const.OUTPUT: output} # [bsz, n_cls, seq_len] | |||
| output = output.transpose(1, 2) # make hidden to be dim 1 | |||
| return {Const.OUTPUT: output} # [bsz, n_cls, seq_len] | |||
| def predict(self, words, seq_len): | |||
| """ | |||
| @@ -159,7 +170,7 @@ class STSeqLabel(nn.Module): | |||
| class STSeqCls(nn.Module): | |||
| """ | |||
| Alias: :class:`fastNLP.models.STSeqCls` :class:`fastNLP.models.start_transformer.STSeqCls` | |||
| Alias: :class:`fastNLP.models.STSeqCls` :class:`fastNLP.models.star_transformer.STSeqCls` | |||
| Star-Transformer for classification tasks | |||
| @@ -175,7 +186,7 @@ class STSeqCls(nn.Module): | |||
| :param emb_dropout: dropout probability of the word embedding. Default: 0.1 | |||
| :param dropout: dropout probability for the model, excluding the word embedding. Default: 0.1 | |||
| """ | |||
| def __init__(self, init_embed, num_cls, | |||
| hidden_size=300, | |||
| num_layers=4, | |||
| @@ -184,7 +195,7 @@ class STSeqCls(nn.Module): | |||
| max_len=512, | |||
| cls_hidden_size=600, | |||
| emb_dropout=0.1, | |||
| dropout=0.1,): | |||
| dropout=0.1, ): | |||
| super(STSeqCls, self).__init__() | |||
| self.enc = StarTransEnc(init_embed=init_embed, | |||
| hidden_size=hidden_size, | |||
| @@ -195,7 +206,7 @@ class STSeqCls(nn.Module): | |||
| emb_dropout=emb_dropout, | |||
| dropout=dropout) | |||
| self.cls = _Cls(hidden_size, num_cls, cls_hidden_size) | |||
| def forward(self, words, seq_len): | |||
| """ | |||
| @@ -206,9 +217,9 @@ class STSeqCls(nn.Module): | |||
| mask = seq_len_to_mask(seq_len) | |||
| nodes, relay = self.enc(words, mask) | |||
| y = 0.5 * (relay + nodes.max(1)[0]) | |||
| output = self.cls(y) # [bsz, n_cls] | |||
| output = self.cls(y) # [bsz, n_cls] | |||
| return {Const.OUTPUT: output} | |||
| def predict(self, words, seq_len): | |||
| """ | |||
| @@ -223,7 +234,7 @@ class STSeqCls(nn.Module): | |||
| class STNLICls(nn.Module): | |||
| """ | |||
| Alias: :class:`fastNLP.models.STNLICls` :class:`fastNLP.models.start_transformer.STNLICls` | |||
| Alias: :class:`fastNLP.models.STNLICls` :class:`fastNLP.models.star_transformer.STNLICls` | |||
| Star-Transformer for natural language inference (NLI) | |||
| @@ -239,7 +250,7 @@ class STNLICls(nn.Module): | |||
| :param emb_dropout: dropout probability of the word embedding. Default: 0.1 | |||
| :param dropout: dropout probability for the model, excluding the word embedding. Default: 0.1 | |||
| """ | |||
| def __init__(self, init_embed, num_cls, | |||
| hidden_size=300, | |||
| num_layers=4, | |||
| @@ -248,7 +259,7 @@ class STNLICls(nn.Module): | |||
| max_len=512, | |||
| cls_hidden_size=600, | |||
| emb_dropout=0.1, | |||
| dropout=0.1,): | |||
| dropout=0.1, ): | |||
| super(STNLICls, self).__init__() | |||
| self.enc = StarTransEnc(init_embed=init_embed, | |||
| hidden_size=hidden_size, | |||
| @@ -259,7 +270,7 @@ class STNLICls(nn.Module): | |||
| emb_dropout=emb_dropout, | |||
| dropout=dropout) | |||
| self.cls = _NLICls(hidden_size, num_cls, cls_hidden_size) | |||
| def forward(self, words1, words2, seq_len1, seq_len2): | |||
| """ | |||
| @@ -271,14 +282,16 @@ class STNLICls(nn.Module): | |||
| """ | |||
| mask1 = seq_len_to_mask(seq_len1) | |||
| mask2 = seq_len_to_mask(seq_len2) | |||
| def enc(seq, mask): | |||
| nodes, relay = self.enc(seq, mask) | |||
| return 0.5 * (relay + nodes.max(1)[0]) | |||
| y1 = enc(words1, mask1) | |||
| y2 = enc(words2, mask2) | |||
| output = self.cls(y1, y2) # [bsz, n_cls] | |||
| output = self.cls(y1, y2) # [bsz, n_cls] | |||
| return {Const.OUTPUT: output} | |||
| def predict(self, words1, words2, seq_len1, seq_len2): | |||
| """ | |||