| @@ -613,6 +613,7 @@ class DataSet(object): | |||
| raise e | |||
| else: | |||
| raise KeyError("{} is not a valid field name.".format(name)) | |||
| return self | |||
| def set_input(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True): | |||
| """ | |||
| @@ -636,6 +637,7 @@ class DataSet(object): | |||
| raise e | |||
| else: | |||
| raise KeyError("{} is not a valid field name.".format(name)) | |||
| return self | |||
| def set_ignore_type(self, *field_names, flag=True): | |||
| """ | |||
| @@ -652,6 +654,7 @@ class DataSet(object): | |||
| self.field_arrays[name].ignore_type = flag | |||
| else: | |||
| raise KeyError("{} is not a valid field name.".format(name)) | |||
| return self | |||
| def set_padder(self, field_name, padder): | |||
| """ | |||
| @@ -667,6 +670,7 @@ class DataSet(object): | |||
| if field_name not in self.field_arrays: | |||
| raise KeyError("There is no field named {}.".format(field_name)) | |||
| self.field_arrays[field_name].set_padder(padder) | |||
| return self | |||
| def set_pad_val(self, field_name, pad_val): | |||
| """ | |||
| @@ -678,6 +682,7 @@ class DataSet(object): | |||
| if field_name not in self.field_arrays: | |||
| raise KeyError("There is no field named {}.".format(field_name)) | |||
| self.field_arrays[field_name].set_pad_val(pad_val) | |||
| return self | |||
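A small chaining sketch enabled by these setters now returning self (the DataSet instance and the field names 'words', 'seq_len' and 'target' are hypothetical)::

    dataset.set_input('words', 'seq_len') \
           .set_target('target') \
           .set_pad_val('target', -100)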
| def get_input_name(self): | |||
| """ | |||
| @@ -868,48 +873,6 @@ class DataSet(object): | |||
| return train_set, dev_set | |||
| @classmethod | |||
| def read_csv(cls, csv_path, headers=None, sep=",", dropna=True): | |||
| r""" | |||
| .. warning:: | |||
| 此方法会在下个版本移除,请使用 :class:`fastNLP.io.CSVLoader` | |||
| 从csv_path路径下以csv的格式读取数据。 | |||
| :param str csv_path: 从哪里读取csv文件 | |||
| :param list[str] headers: 如果为None,则使用csv文件的第一行作为header; 如果传入list(str), 则元素的个数必须 | |||
| 与csv文件中每行的元素个数相同。 | |||
| :param str sep: 分割符 | |||
| :param bool dropna: 是否忽略与header数量不一致行。 | |||
| :return: 读取后的 :class:`~fastNLP.DataSet`。 | |||
| """ | |||
| warnings.warn('DataSet.read_csv is deprecated, use CSVLoader instead', | |||
| category=DeprecationWarning) | |||
| with open(csv_path, "r", encoding='utf-8') as f: | |||
| start_idx = 0 | |||
| if headers is None: | |||
| headers = f.readline().rstrip('\r\n') | |||
| headers = headers.split(sep) | |||
| start_idx += 1 | |||
| else: | |||
| assert isinstance(headers, (list, tuple)), "headers should be list or tuple, not {}.".format( | |||
| type(headers)) | |||
| _dict = {} | |||
| for col in headers: | |||
| _dict[col] = [] | |||
| for line_idx, line in enumerate(f, start_idx): | |||
| contents = line.rstrip('\r\n').split(sep) | |||
| if len(contents) != len(headers): | |||
| if dropna: | |||
| continue | |||
| else: | |||
| # TODO change error type | |||
| raise ValueError("Line {} has {} parts, while header has {} parts." \ | |||
| .format(line_idx, len(contents), len(headers))) | |||
| for header, content in zip(headers, contents): | |||
| _dict[header].append(content) | |||
| return cls(_dict) | |||
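The removed read_csv is superseded by fastNLP.io.CSVLoader; a rough, hedged equivalent of the old call (assuming CSVLoader keeps the same headers/sep/dropna arguments, and noting that its load() returns the loaded data rather than filling the DataSet in place)::

    from fastNLP.io import CSVLoader

    loader = CSVLoader(headers=['raw_words', 'target'], sep=',', dropna=True)
    data = loader.load('/path/to/data.csv')   # replaces DataSet.read_csv('/path/to/data.csv', ...)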
| def save(self, path): | |||
| """ | |||
| 保存DataSet. | |||
| @@ -61,6 +61,9 @@ class BertEmbedding(ContextualEmbedding): | |||
| # 根据model_dir_or_name检查是否存在并下载 | |||
| if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR: | |||
| if 'cn' in model_dir_or_name.lower() and pool_method not in ('first', 'last'): | |||
| warnings.warn("For Chinese bert, pool_method should be chosen from 'first', 'last' in order to achieve" | |||
| " faster speed.") | |||
| model_url = _get_embedding_url('bert', model_dir_or_name.lower()) | |||
| model_dir = cached_path(model_url, name='embedding') | |||
| # 检查是否存在 | |||
| @@ -91,19 +94,33 @@ class BertEmbedding(ContextualEmbedding): | |||
| :param torch.LongTensor words: [batch_size, max_len] | |||
| :return: torch.FloatTensor. batch_size x max_len x (768*len(self.layers)) | |||
| """ | |||
| if self._word_sep_index: # 不能drop sep | |||
| sep_mask = words.eq(self._word_sep_index) | |||
| words = self.drop_word(words) | |||
| if self._word_sep_index: | |||
| words.masked_fill_(sep_mask, self._word_sep_index) | |||
| outputs = self._get_sent_reprs(words) | |||
| if outputs is not None: | |||
| return self.dropout(words) | |||
| return self.dropout(outputs) | |||
| outputs = self.model(words) | |||
| outputs = torch.cat([*outputs], dim=-1) | |||
| return self.dropout(outputs) | |||
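A hedged construction sketch for this embedding (the model name and layer choice are illustrative; word_dropout feeds the drop_word logic below)::

    from fastNLP import Vocabulary
    from fastNLP.embeddings import BertEmbedding

    vocab = Vocabulary()
    vocab.add_word_lst("this is a demo sentence".split())
    embed = BertEmbedding(vocab, model_dir_or_name='en-base-uncased', layers='-1,-2',
                          pool_method='first', word_dropout=0.01, dropout=0.1)
    # input words: [batch_size, max_len] -> output: [batch_size, max_len, 768 * 2]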
| def drop_word(self, words): | |||
| """ | |||
| 按照设定随机将words设置为unknown_index。 | |||
| :param torch.LongTensor words: batch_size x max_len | |||
| :return: | |||
| """ | |||
| if self.word_dropout > 0 and self.training: | |||
| with torch.no_grad(): | |||
| if self._word_sep_index: # 不能drop sep | |||
| sep_mask = words.eq(self._word_sep_index) | |||
| mask = torch.ones_like(words).float() * self.word_dropout | |||
| mask = torch.bernoulli(mask).byte() # dropout_word越大,越多位置为1 | |||
| words = words.masked_fill(mask, self._word_unk_index) | |||
| if self._word_sep_index: | |||
| words.masked_fill_(sep_mask, self._word_sep_index) | |||
| return words | |||
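A standalone toy trace of the masking trick used above, in plain PyTorch (the indices are made up; 102 stands in for [SEP])::

    import torch

    words = torch.tensor([[5, 8, 102, 9], [7, 102, 3, 0]])
    sep_index, unk_index, p = 102, 1, 0.3
    sep_mask = words.eq(sep_index)                              # remember [SEP] positions
    drop_mask = torch.bernoulli(torch.full(words.shape, p)).bool()
    dropped = words.masked_fill(drop_mask, unk_index)           # random positions become unk
    dropped = dropped.masked_fill(sep_mask, sep_index)          # [SEP] is restored afterwards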
| @property | |||
| def requires_grad(self): | |||
| """ | |||
| @@ -134,10 +151,12 @@ class BertWordPieceEncoder(nn.Module): | |||
| :param str layers: 最终结果中的表示。以','隔开层数,可以以负数去索引倒数几层 | |||
| :param bool pooled_cls: 返回的句子开头的[CLS]是否使用预训练中的BertPool映射一下,仅在include_cls_sep时有效。如果下游任务只取 | |||
| [CLS]做预测,一般该值为True。 | |||
| :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk,也起到一定的regularization作用。 | |||
| :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 | |||
| :param bool requires_grad: 是否需要gradient。 | |||
| """ | |||
| def __init__(self, model_dir_or_name: str='en-base-uncased', layers: str='-1', | |||
| pooled_cls: bool = False, requires_grad: bool=False): | |||
| def __init__(self, model_dir_or_name: str='en-base-uncased', layers: str='-1', pooled_cls: bool = False, | |||
| word_dropout=0, dropout=0, requires_grad: bool=False): | |||
| super().__init__() | |||
| if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR: | |||
| @@ -150,8 +169,12 @@ class BertWordPieceEncoder(nn.Module): | |||
| raise ValueError(f"Cannot recognize {model_dir_or_name}.") | |||
| self.model = _WordPieceBertModel(model_dir=model_dir, layers=layers, pooled_cls=pooled_cls) | |||
| self._sep_index = self.model._sep_index | |||
| self._wordpiece_unk_index = self.model._wordpiece_unknown_index | |||
| self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size | |||
| self.requires_grad = requires_grad | |||
| self.word_dropout = word_dropout | |||
| self.dropout_layer = nn.Dropout(dropout) | |||
| @property | |||
| def requires_grad(self): | |||
| @@ -199,13 +222,41 @@ class BertWordPieceEncoder(nn.Module): | |||
| 计算words的bert embedding表示。传入的words中应该自行包含[CLS]与[SEP]的tag。 | |||
| :param words: batch_size x max_len | |||
| :param token_type_ids: batch_size x max_len, 用于区分前一句和后一句话 | |||
| :param token_type_ids: batch_size x max_len, 用于区分前一句和后一句话. 如果不传入,则自动生成(大部分情况,都不需要输入), | |||
| 第一个[SEP]及之前为0, 第二个[SEP]及到第一个[SEP]之间为1; 第三个[SEP]及到第二个[SEP]之间为0,依次往后推。 | |||
| :return: torch.FloatTensor. batch_size x max_len x (768*len(self.layers)) | |||
| """ | |||
| with torch.no_grad(): | |||
| sep_mask = word_pieces.eq(self._sep_index) # batch_size x max_len | |||
| if token_type_ids is None: | |||
| sep_mask_cumsum = sep_mask.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1]) | |||
| token_type_ids = sep_mask_cumsum.fmod(2) | |||
| if token_type_ids[0, 0].item(): # 如果开头是奇数,则需要flip一下结果,因为需要保证开头为0 | |||
| token_type_ids = token_type_ids.eq(0).long() | |||
| word_pieces = self.drop_word(word_pieces) | |||
| outputs = self.model(word_pieces, token_type_ids) | |||
| outputs = torch.cat([*outputs], dim=-1) | |||
| return outputs | |||
| return self.dropout_layer(outputs) | |||
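A toy trace of the automatic token_type_ids construction above (101/102 stand in for [CLS]/[SEP]): the reversed cumulative sum over the [SEP] mask, taken modulo 2, assigns 0 to everything up to the first [SEP] and 1 to the second segment::

    import torch

    word_pieces = torch.tensor([[101, 7, 8, 102, 9, 10, 102]])
    sep_mask = word_pieces.eq(102)
    sep_mask_cumsum = sep_mask.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1])
    token_type_ids = sep_mask_cumsum.fmod(2)        # tensor([[0, 0, 0, 0, 1, 1, 1]])
    if token_type_ids[0, 0].item():                 # segment of the first token must be 0
        token_type_ids = token_type_ids.eq(0).long()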
| def drop_word(self, words): | |||
| """ | |||
| 按照设定随机将words设置为unknown_index。 | |||
| :param torch.LongTensor words: batch_size x max_len | |||
| :return: | |||
| """ | |||
| if self.word_dropout > 0 and self.training: | |||
| with torch.no_grad(): | |||
| if self._sep_index: # 不能drop sep | |||
| sep_mask = words.eq(self._sep_index) | |||
| mask = torch.ones_like(words).float() * self.word_dropout | |||
| mask = torch.bernoulli(mask).byte() # dropout_word越大,越多位置为1 | |||
| words = words.masked_fill(mask, self._wordpiece_unk_index) | |||
| if self._sep_index: | |||
| words.masked_fill_(sep_mask, self._sep_index) | |||
| return words | |||
| class _WordBertModel(nn.Module): | |||
| @@ -288,11 +339,11 @@ class _WordBertModel(nn.Module): | |||
| word_pieces = self.tokenzier.convert_tokens_to_ids(word_pieces) | |||
| word_to_wordpieces.append(word_pieces) | |||
| word_pieces_lengths.append(len(word_pieces)) | |||
| print("Found(Or seg into word pieces) {} words out of {}.".format(found_count, len(vocab))) | |||
| self._cls_index = self.tokenzier.vocab['[CLS]'] | |||
| self._sep_index = self.tokenzier.vocab['[SEP]'] | |||
| self._word_pad_index = vocab.padding_idx | |||
| self._wordpiece_pad_index = self.tokenzier.vocab['[PAD]'] # 需要用于生成word_piece | |||
| print("Found(Or segment into word pieces) {} words out of {}.".format(found_count, len(vocab))) | |||
| self.word_to_wordpieces = np.array(word_to_wordpieces) | |||
| self.word_pieces_lengths = nn.Parameter(torch.LongTensor(word_pieces_lengths), requires_grad=False) | |||
| print("Successfully generate word pieces.") | |||
| @@ -339,7 +390,7 @@ class _WordBertModel(nn.Module): | |||
| sep_mask_cumsum = sep_mask.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1]) | |||
| token_type_ids = sep_mask_cumsum.fmod(2) | |||
| if token_type_ids[0, 0].item(): # 如果开头是奇数,则需要flip一下结果,因为需要保证开头为0 | |||
| token_type_ids = token_type_ids.eq(0).float() | |||
| token_type_ids = token_type_ids.eq(0).long() | |||
| else: | |||
| token_type_ids = torch.zeros_like(word_pieces) | |||
| # 2. 获取hidden的结果,根据word_pieces进行对应的pool计算 | |||
| @@ -45,7 +45,7 @@ class StaticEmbedding(TokenEmbedding): | |||
| :param model_dir_or_name: 可以有两种方式调用预训练好的static embedding:第一种是传入embedding文件夹(文件夹下应该只有一个 | |||
| 以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型,没有的话将自动下载。 | |||
| 如果输入为None则使用embedding_dim的维度随机初始化一个embedding。 | |||
| :param int embedding_dim: 随机初始化的embedding的维度,仅在model_dir_or_name为None时有效。 | |||
| :param int embedding_dim: 随机初始化的embedding的维度,当该值为大于0的值时,将忽略model_dir_or_name。 | |||
| :param bool requires_grad: 是否需要gradient. 默认为True | |||
| :param callable init_method: 如何初始化没有找到的值。可以使用torch.nn.init.*中各种方法。调用该方法时传入一个tensor对 | |||
| :param bool lower: 是否将vocab中的词语小写后再和预训练的词表进行匹配。如果你的词表中包含大写的词语,或者就是需要单独 | |||
| @@ -55,9 +55,11 @@ class StaticEmbedding(TokenEmbedding): | |||
| :param bool normalize: 是否对vector进行normalize,使得每个vector的norm为1。 | |||
| :param int min_freq: Vocabulary词频数小于这个数量的word将被指向unk。 | |||
| """ | |||
| def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', embedding_dim=100, requires_grad: bool=True, | |||
| def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', embedding_dim=-1, requires_grad: bool=True, | |||
| init_method=None, lower=False, dropout=0, word_dropout=0, normalize=False, min_freq=1, **kwargs): | |||
| super(StaticEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) | |||
| if embedding_dim>0: | |||
| model_dir_or_name = None | |||
| # 得到cache_path | |||
| if model_dir_or_name is None: | |||
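A hedged usage sketch of the new embedding_dim behaviour (the pretrained model name is illustrative): passing a positive embedding_dim now ignores model_dir_or_name and random-initializes the table::

    from fastNLP import Vocabulary
    from fastNLP.embeddings import StaticEmbedding

    vocab = Vocabulary()
    vocab.add_word_lst("this is a demo sentence".split())
    pretrained = StaticEmbedding(vocab, model_dir_or_name='en')                       # load pretrained vectors
    random_init = StaticEmbedding(vocab, model_dir_or_name='en', embedding_dim=100)   # 100-dim random init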
| @@ -30,6 +30,9 @@ __all__ = [ | |||
| 'Conll2003NERLoader', | |||
| 'OntoNotesNERLoader', | |||
| 'CTBLoader', | |||
| "MsraNERLoader", | |||
| "WeiboNERLoader", | |||
| "PeopleDailyNERLoader", | |||
| 'CSVLoader', | |||
| 'JsonLoader', | |||
| @@ -50,6 +53,9 @@ __all__ = [ | |||
| "Conll2003NERPipe", | |||
| "OntoNotesNERPipe", | |||
| "MsraNERPipe", | |||
| "PeopleDailyPipe", | |||
| "WeiboNERPipe", | |||
| "MatchingBertPipe", | |||
| "RTEBertPipe", | |||
| @@ -133,19 +133,21 @@ class DataBundle: | |||
| :param ~fastNLP.Vocabulary vocab: 词表 | |||
| :param str field_name: 这个vocab对应的field名称 | |||
| :return: | |||
| :return: self | |||
| """ | |||
| assert isinstance(vocab, Vocabulary), "Only fastNLP.Vocabulary supports." | |||
| self.vocabs[field_name] = vocab | |||
| return self | |||
| def set_dataset(self, dataset, name): | |||
| """ | |||
| :param ~fastNLP.DataSet dataset: 传递给DataBundle的DataSet | |||
| :param str name: dataset的名称 | |||
| :return: | |||
| :return: self | |||
| """ | |||
| self.datasets[name] = dataset | |||
| return self | |||
| def get_dataset(self, name:str)->DataSet: | |||
| """ | |||
| @@ -165,7 +167,7 @@ class DataBundle: | |||
| """ | |||
| return self.vocabs[field_name] | |||
| def set_input(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True, ignore_miss_field=True): | |||
| def set_input(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True, ignore_miss_dataset=True): | |||
| """ | |||
| 将field_names中的field设置为input, 对data_bundle中所有的dataset执行该操作:: | |||
| @@ -176,18 +178,21 @@ class DataBundle: | |||
| :param bool flag: 将field_name的input状态设置为flag | |||
| :param bool use_1st_ins_infer_dim_type: 如果为True,将不会check该列是否所有数据都是同样的维度,同样的类型。将直接使用第一 | |||
| 行的数据进行类型和维度推断本列的数据的类型和维度。 | |||
| :param bool ignore_miss_field: 当某个field名称在某个dataset不存在时,如果为True,则直接忽略; 如果为False,则报错 | |||
| :param bool ignore_miss_dataset: 当某个field名称在某个dataset不存在时,如果为True,则直接忽略该DataSet; | |||
| 如果为False,则报错 | |||
| :return: self | |||
| """ | |||
| for field_name in field_names: | |||
| for name, dataset in self.datasets.items(): | |||
| if not ignore_miss_field and not dataset.has_field(field_name): | |||
| if not ignore_miss_dataset and not dataset.has_field(field_name): | |||
| raise KeyError(f"Field:{field_name} was not found in DataSet:{name}") | |||
| if not dataset.has_field(field_name): | |||
| continue | |||
| else: | |||
| dataset.set_input(field_name, flag=flag, use_1st_ins_infer_dim_type=use_1st_ins_infer_dim_type) | |||
| return self | |||
| def set_target(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True, ignore_miss_field=True): | |||
| def set_target(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True, ignore_miss_dataset=True): | |||
| """ | |||
| 将field_names中的field设置为target, 对data_bundle中所有的dataset执行该操作:: | |||
| @@ -198,16 +203,34 @@ class DataBundle: | |||
| :param bool flag: 将field_name的target状态设置为flag | |||
| :param bool use_1st_ins_infer_dim_type: 如果为True,将不会check该列是否所有数据都是同样的维度,同样的类型。将直接使用第一 | |||
| 行的数据进行类型和维度推断本列的数据的类型和维度。 | |||
| :param bool ignore_miss_field: 当某个field名称在某个dataset不存在时,如果为True,则直接忽略; 如果为False,则报错 | |||
| :param bool ignore_miss_dataset: 当某个field名称在某个dataset不存在时,如果为True,则直接忽略; 如果为False,则报错 | |||
| :return: self | |||
| """ | |||
| for field_name in field_names: | |||
| for name, dataset in self.datasets.items(): | |||
| if not ignore_miss_field and not dataset.has_field(field_name): | |||
| if not ignore_miss_dataset and not dataset.has_field(field_name): | |||
| raise KeyError(f"Field:{field_name} was not found in DataSet:{name}") | |||
| if not dataset.has_field(field_name): | |||
| continue | |||
| else: | |||
| dataset.set_target(field_name, flag=flag, use_1st_ins_infer_dim_type=use_1st_ins_infer_dim_type) | |||
| return self | |||
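Because set_input and set_target now return self and skip DataSets that miss a field when ignore_miss_dataset=True, a bundle can be configured in one chained call (the field names here are hypothetical)::

    data_bundle.set_input('chars', 'seq_len', ignore_miss_dataset=True) \
               .set_target('target', 'seq_len')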
| def copy_field(self, field_name, new_field_name, ignore_miss_dataset=True): | |||
| """ | |||
| 将DataBundle中所有的field_name复制一份叫new_field_name. | |||
| :param str field_name: | |||
| :param str new_field_name: | |||
| :param bool ignore_miss_dataset: 若DataBundle中的某个DataSet没有名为field_name的field,如果为True,则直接忽略该DataSet;如果为False,则报错。 | |||
| :return: self | |||
| """ | |||
| for name, dataset in self.datasets.items(): | |||
| if dataset.has_field(field_name=field_name): | |||
| dataset.copy_field(field_name=field_name, new_field_name=new_field_name) | |||
| elif not ignore_miss_dataset: | |||
| raise KeyError(f"{field_name} not found in DataSet:{name}.") | |||
| return self | |||
| def __repr__(self): | |||
| _str = 'In total {} datasets:\n'.format(len(self.datasets)) | |||
| @@ -27,6 +27,7 @@ PRETRAINED_BERT_MODEL_DIR = { | |||
| 'cn': 'bert-chinese-wwm.zip', | |||
| 'cn-base': 'bert-base-chinese.zip', | |||
| 'cn-wwm': 'bert-chinese-wwm.zip', | |||
| 'cn-wwm-ext': "bert-chinese-wwm-ext.zip" | |||
| } | |||
| PRETRAINED_ELMO_MODEL_DIR = { | |||
| @@ -56,7 +57,7 @@ PRETRAIN_STATIC_FILES = { | |||
| 'en-fasttext-wiki': "wiki-news-300d-1M.vec.zip", | |||
| 'en-fasttext-crawl': "crawl-300d-2M.vec.zip", | |||
| 'cn': "tencent_cn.txt.zip", | |||
| 'cn': "tencent_cn.zip", | |||
| 'cn-tencent': "tencent_cn.txt.zip", | |||
| 'cn-fasttext': "cc.zh.300.vec.gz", | |||
| 'cn-sgns-literature-word': 'sgns.literature.word.txt.zip', | |||
| @@ -71,7 +72,10 @@ DATASET_DIR = { | |||
| "qnli": "QNLI.zip", | |||
| "sst-2": "SST-2.zip", | |||
| "sst": "SST.zip", | |||
| "rte": "RTE.zip" | |||
| "rte": "RTE.zip", | |||
| "msra-ner": "MSRA_NER.zip", | |||
| "peopledaily": "peopledaily.zip", | |||
| "weibo-ner": "weibo_NER.zip" | |||
| } | |||
| PRETRAIN_MAP = {'elmo': PRETRAINED_ELMO_MODEL_DIR, | |||
| @@ -320,42 +324,44 @@ def get_from_cache(url: str, cache_dir: Path = None) -> Path: | |||
| # GET file object | |||
| req = requests.get(url, stream=True, headers={"User-Agent": "fastNLP"}) | |||
| if req.status_code == 200: | |||
| content_length = req.headers.get("Content-Length") | |||
| total = int(content_length) if content_length is not None else None | |||
| progress = tqdm(unit="B", total=total, unit_scale=1) | |||
| fd, temp_filename = tempfile.mkstemp() | |||
| print("%s not found in cache, downloading to %s" % (url, temp_filename)) | |||
| with open(temp_filename, "wb") as temp_file: | |||
| for chunk in req.iter_content(chunk_size=1024 * 16): | |||
| if chunk: # filter out keep-alive new chunks | |||
| progress.update(len(chunk)) | |||
| temp_file.write(chunk) | |||
| progress.close() | |||
| print(f"Finish download from {url}.") | |||
| # 开始解压 | |||
| delete_temp_dir = None | |||
| if suffix in ('.zip', '.tar.gz'): | |||
| uncompress_temp_dir = tempfile.mkdtemp() | |||
| delete_temp_dir = uncompress_temp_dir | |||
| print(f"Start to uncompress file to {uncompress_temp_dir}") | |||
| if suffix == '.zip': | |||
| unzip_file(Path(temp_filename), Path(uncompress_temp_dir)) | |||
| else: | |||
| untar_gz_file(Path(temp_filename), Path(uncompress_temp_dir)) | |||
| filenames = os.listdir(uncompress_temp_dir) | |||
| if len(filenames) == 1: | |||
| if os.path.isdir(os.path.join(uncompress_temp_dir, filenames[0])): | |||
| uncompress_temp_dir = os.path.join(uncompress_temp_dir, filenames[0]) | |||
| cache_path.mkdir(parents=True, exist_ok=True) | |||
| print("Finish un-compressing file.") | |||
| else: | |||
| uncompress_temp_dir = temp_filename | |||
| cache_path = str(cache_path) + suffix | |||
| success = False | |||
| fd, temp_filename = tempfile.mkstemp() | |||
| uncompress_temp_dir = None | |||
| try: | |||
| content_length = req.headers.get("Content-Length") | |||
| total = int(content_length) if content_length is not None else None | |||
| progress = tqdm(unit="B", total=total, unit_scale=1) | |||
| print("%s not found in cache, downloading to %s" % (url, temp_filename)) | |||
| with open(temp_filename, "wb") as temp_file: | |||
| for chunk in req.iter_content(chunk_size=1024 * 16): | |||
| if chunk: # filter out keep-alive new chunks | |||
| progress.update(len(chunk)) | |||
| temp_file.write(chunk) | |||
| progress.close() | |||
| print(f"Finish download from {url}") | |||
| # 开始解压 | |||
| if suffix in ('.zip', '.tar.gz', '.gz'): | |||
| uncompress_temp_dir = tempfile.mkdtemp() | |||
| print(f"Start to uncompress file to {uncompress_temp_dir}") | |||
| if suffix == '.zip': | |||
| unzip_file(Path(temp_filename), Path(uncompress_temp_dir)) | |||
| elif suffix == '.gz': | |||
| ungzip_file(temp_filename, uncompress_temp_dir, dir_name) | |||
| else: | |||
| untar_gz_file(Path(temp_filename), Path(uncompress_temp_dir)) | |||
| filenames = os.listdir(uncompress_temp_dir) | |||
| if len(filenames) == 1: | |||
| if os.path.isdir(os.path.join(uncompress_temp_dir, filenames[0])): | |||
| uncompress_temp_dir = os.path.join(uncompress_temp_dir, filenames[0]) | |||
| cache_path.mkdir(parents=True, exist_ok=True) | |||
| print("Finish un-compressing file.") | |||
| else: | |||
| uncompress_temp_dir = temp_filename | |||
| cache_path = str(cache_path) + suffix | |||
| # 复制到指定的位置 | |||
| print(f"Copy file to {cache_path}") | |||
| if os.path.isdir(uncompress_temp_dir): | |||
| @@ -377,10 +383,12 @@ def get_from_cache(url: str, cache_dir: Path = None) -> Path: | |||
| os.remove(cache_path) | |||
| else: | |||
| shutil.rmtree(cache_path) | |||
| if delete_temp_dir: | |||
| shutil.rmtree(delete_temp_dir) | |||
| os.close(fd) | |||
| os.remove(temp_filename) | |||
| if os.path.isdir(uncompress_temp_dir): | |||
| shutil.rmtree(uncompress_temp_dir) | |||
| elif os.path.isfile(uncompress_temp_dir): | |||
| os.remove(uncompress_temp_dir) | |||
| return get_filepath(cache_path) | |||
| else: | |||
| raise HTTPError(f"Status code:{req.status_code}. Fail to download from {url}.") | |||
| @@ -402,6 +410,15 @@ def untar_gz_file(file: Path, to: Path): | |||
| tar.extractall(to) | |||
| def ungzip_file(file: str, to: str, filename:str): | |||
| import gzip | |||
| g_file = gzip.GzipFile(file) | |||
| with open(os.path.join(to, filename), 'wb+') as f: | |||
| f.write(g_file.read()) | |||
| g_file.close() | |||
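A brief usage sketch of the new helper (paths are illustrative); the third argument names the decompressed file inside the target directory, matching how get_from_cache unpacks .gz downloads::

    ungzip_file('/tmp/cc.zh.300.vec.gz', '/tmp/embeddings', 'cc.zh.300.vec')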
| def match_file(dir_name: str, cache_dir: Path) -> str: | |||
| """ | |||
| 匹配的原则是: 在cache_dir下的文件与dir_name完全一致, 或除了后缀以外和dir_name完全一致。 | |||
| @@ -58,6 +58,9 @@ __all__ = [ | |||
| 'Conll2003NERLoader', | |||
| 'OntoNotesNERLoader', | |||
| 'CTBLoader', | |||
| "MsraNERLoader", | |||
| "PeopleDailyNERLoader", | |||
| "WeiboNERLoader", | |||
| # 'CSVLoader', | |||
| # 'JsonLoader', | |||
| @@ -77,3 +80,4 @@ from .cws import CWSLoader | |||
| from .json import JsonLoader | |||
| from .loader import Loader | |||
| from .matching import MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader | |||
| from .conll import MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader | |||
| @@ -6,6 +6,8 @@ import os | |||
| import random | |||
| import shutil | |||
| import numpy as np | |||
| import glob | |||
| import time | |||
| class YelpLoader(Loader): | |||
| @@ -57,7 +59,7 @@ class YelpLoader(Loader): | |||
| class YelpFullLoader(YelpLoader): | |||
| def download(self, dev_ratio: float = 0.1, seed: int = 0): | |||
| def download(self, dev_ratio: float = 0.1, re_download:bool=False): | |||
| """ | |||
| 自动下载数据集,如果你使用了这个数据集,请引用以下的文章 | |||
| @@ -68,35 +70,23 @@ class YelpFullLoader(YelpLoader): | |||
| dev.csv三个文件。 | |||
| :param float dev_ratio: 如果路径中没有dev集,从train划分多少作为dev的数据. 如果为0,则不划分dev。 | |||
| :param int seed: 划分dev时的随机数种子 | |||
| :param bool re_download: 是否重新下载数据,以重新切分数据。 | |||
| :return: str, 数据集的目录地址 | |||
| """ | |||
| dataset_name = 'yelp-review-full' | |||
| data_dir = self._get_dataset_path(dataset_name=dataset_name) | |||
| if os.path.exists(os.path.join(data_dir, 'dev.csv')): # 存在dev的话,check是否需要重新下载 | |||
| re_download = True | |||
| if dev_ratio > 0: | |||
| dev_line_count = 0 | |||
| tr_line_count = 0 | |||
| with open(os.path.join(data_dir, 'train.csv'), 'r', encoding='utf-8') as f1, \ | |||
| open(os.path.join(data_dir, 'dev.csv'), 'r', encoding='utf-8') as f2: | |||
| for line in f1: | |||
| tr_line_count += 1 | |||
| for line in f2: | |||
| dev_line_count += 1 | |||
| if not np.isclose(dev_line_count, dev_ratio * (tr_line_count + dev_line_count), rtol=0.005): | |||
| re_download = True | |||
| else: | |||
| re_download = False | |||
| if re_download: | |||
| shutil.rmtree(data_dir) | |||
| data_dir = self._get_dataset_path(dataset_name=dataset_name) | |||
| modify_time = 0 | |||
| for filepath in glob.glob(os.path.join(data_dir, '*')): | |||
| modify_time = os.stat(filepath).st_mtime | |||
| break | |||
| if time.time() - modify_time > 1 and re_download: # 通过这种比较丑陋的方式判断一下文件是否是才下载的 | |||
| shutil.rmtree(data_dir) | |||
| data_dir = self._get_dataset_path(dataset_name=dataset_name) | |||
| if not os.path.exists(os.path.join(data_dir, 'dev.csv')): | |||
| if dev_ratio > 0: | |||
| assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)." | |||
| random.seed(int(seed)) | |||
| try: | |||
| with open(os.path.join(data_dir, 'train.csv'), 'r', encoding='utf-8') as f, \ | |||
| open(os.path.join(data_dir, 'middle_file.csv'), 'w', encoding='utf-8') as f1, \ | |||
| @@ -116,44 +106,32 @@ class YelpFullLoader(YelpLoader): | |||
| class YelpPolarityLoader(YelpLoader): | |||
| def download(self, dev_ratio: float = 0.1, seed: int = 0): | |||
| def download(self, dev_ratio: float = 0.1, re_download=False): | |||
| """ | |||
| 自动下载数据集,如果你使用了这个数据集,请引用以下的文章 | |||
| Xiang Zhang, Junbo Zhao, Yann LeCun. Character-level Convolutional Networks for Text Classification. Advances | |||
| in Neural Information Processing Systems 28 (NIPS 2015) | |||
| 根据dev_ratio的值随机将train中的数据取出一部分作为dev数据。下载完成后从train中切分0.1作为dev | |||
| 根据dev_ratio的值随机将train中的数据取出一部分作为dev数据。下载完成后从train中切分dev_ratio这么多作为dev | |||
| :param float dev_ratio: 如果路径中不存在dev.csv, 从train划分多少作为dev的数据. 如果为0,则不划分dev | |||
| :param int seed: 划分dev时的随机数种子 | |||
| :param float dev_ratio: 如果路径中不存在dev.csv, 从train划分多少作为dev的数据。 如果为0,则不划分dev。 | |||
| :param bool re_download: 是否重新下载数据,以重新切分数据。 | |||
| :return: str, 数据集的目录地址 | |||
| """ | |||
| dataset_name = 'yelp-review-polarity' | |||
| data_dir = self._get_dataset_path(dataset_name=dataset_name) | |||
| if os.path.exists(os.path.join(data_dir, 'dev.csv')): # 存在dev的话,check是否符合比例要求 | |||
| re_download = True | |||
| if dev_ratio > 0: | |||
| dev_line_count = 0 | |||
| tr_line_count = 0 | |||
| with open(os.path.join(data_dir, 'train.csv'), 'r', encoding='utf-8') as f1, \ | |||
| open(os.path.join(data_dir, 'dev.csv'), 'r', encoding='utf-8') as f2: | |||
| for line in f1: | |||
| tr_line_count += 1 | |||
| for line in f2: | |||
| dev_line_count += 1 | |||
| if not np.isclose(dev_line_count, dev_ratio * (tr_line_count + dev_line_count), rtol=0.005): | |||
| re_download = True | |||
| else: | |||
| re_download = False | |||
| if re_download: | |||
| shutil.rmtree(data_dir) | |||
| data_dir = self._get_dataset_path(dataset_name=dataset_name) | |||
| modify_time = 0 | |||
| for filepath in glob.glob(os.path.join(data_dir, '*')): | |||
| modify_time = os.stat(filepath).st_mtime | |||
| break | |||
| if time.time() - modify_time > 1 and re_download: # 通过这种比较丑陋的方式判断一下文件是否是才下载的 | |||
| shutil.rmtree(data_dir) | |||
| data_dir = self._get_dataset_path(dataset_name=dataset_name) | |||
| if not os.path.exists(os.path.join(data_dir, 'dev.csv')): | |||
| if dev_ratio > 0: | |||
| assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)." | |||
| random.seed(int(seed)) | |||
| try: | |||
| with open(os.path.join(data_dir, 'train.csv'), 'r', encoding='utf-8') as f, \ | |||
| open(os.path.join(data_dir, 'middle_file.csv'), 'w', encoding='utf-8') as f1, \ | |||
| @@ -209,7 +187,7 @@ class IMDBLoader(Loader): | |||
| return dataset | |||
| def download(self, dev_ratio: float = 0.1, seed: int = 0): | |||
| def download(self, dev_ratio: float = 0.1, re_download=False): | |||
| """ | |||
| 自动下载数据集,如果你使用了这个数据集,请引用以下的文章 | |||
| @@ -218,34 +196,22 @@ class IMDBLoader(Loader): | |||
| 根据dev_ratio的值随机将train中的数据取出一部分作为dev数据。下载完成后从train中切分0.1作为dev | |||
| :param float dev_ratio: 如果路径中没有dev.txt。从train划分多少作为dev的数据. 如果为0,则不划分dev | |||
| :param int seed: 划分dev时的随机数种子 | |||
| :param bool re_download: 是否重新下载数据,以重新切分数据。 | |||
| :return: str, 数据集的目录地址 | |||
| """ | |||
| dataset_name = 'aclImdb' | |||
| data_dir = self._get_dataset_path(dataset_name=dataset_name) | |||
| if os.path.exists(os.path.join(data_dir, 'dev.txt')): # 存在dev的话,check是否符合比例要求 | |||
| re_download = True | |||
| if dev_ratio > 0: | |||
| dev_line_count = 0 | |||
| tr_line_count = 0 | |||
| with open(os.path.join(data_dir, 'train.txt'), 'r', encoding='utf-8') as f1, \ | |||
| open(os.path.join(data_dir, 'dev.txt'), 'r', encoding='utf-8') as f2: | |||
| for line in f1: | |||
| tr_line_count += 1 | |||
| for line in f2: | |||
| dev_line_count += 1 | |||
| if not np.isclose(dev_line_count, dev_ratio * (tr_line_count + dev_line_count), rtol=0.005): | |||
| re_download = True | |||
| else: | |||
| re_download = False | |||
| if re_download: | |||
| shutil.rmtree(data_dir) | |||
| data_dir = self._get_dataset_path(dataset_name=dataset_name) | |||
| modify_time = 0 | |||
| for filepath in glob.glob(os.path.join(data_dir, '*')): | |||
| modify_time = os.stat(filepath).st_mtime | |||
| break | |||
| if time.time() - modify_time > 1 and re_download: # 通过这种比较丑陋的方式判断一下文件是否是才下载的 | |||
| shutil.rmtree(data_dir) | |||
| data_dir = self._get_dataset_path(dataset_name=dataset_name) | |||
| if not os.path.exists(os.path.join(data_dir, 'dev.txt')): | |||
| if dev_ratio > 0: | |||
| assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)." | |||
| random.seed(int(seed)) | |||
| try: | |||
| with open(os.path.join(data_dir, 'train.txt'), 'r', encoding='utf-8') as f, \ | |||
| open(os.path.join(data_dir, 'middle_file.txt'), 'w', encoding='utf-8') as f1, \ | |||
| @@ -4,10 +4,12 @@ from .loader import Loader | |||
| from ...core.dataset import DataSet | |||
| from ..file_reader import _read_conll | |||
| from ...core.instance import Instance | |||
| from .. import DataBundle | |||
| from ..utils import check_loader_paths | |||
| from ...core.const import Const | |||
| import glob | |||
| import os | |||
| import shutil | |||
| import time | |||
| import random | |||
| class ConllLoader(Loader): | |||
| """ | |||
| @@ -262,3 +264,173 @@ class CTBLoader(Loader): | |||
| def _load(self, path:str): | |||
| pass | |||
| class CNNERLoader(Loader): | |||
| def _load(self, path:str): | |||
| """ | |||
| 支持加载形如以下格式的内容:一行两列,两列之间以空格隔开,不同的sample之间以空行隔开 | |||
| Example:: | |||
| 我 O | |||
| 们 O | |||
| 变 O | |||
| 而 O | |||
| 以 O | |||
| 书 O | |||
| 会 O | |||
| ... | |||
| :param str path: 文件路径 | |||
| :return: DataSet,包含raw_words列和target列 | |||
| """ | |||
| ds = DataSet() | |||
| with open(path, 'r', encoding='utf-8') as f: | |||
| raw_chars = [] | |||
| target = [] | |||
| for line in f: | |||
| line = line.strip() | |||
| if line: | |||
| parts = line.split() | |||
| if len(parts) == 1: # 网上下载的数据有一些行缺少tag,默认补充O | |||
| parts.append('O') | |||
| raw_chars.append(parts[0]) | |||
| target.append(parts[1]) | |||
| else: | |||
| if raw_chars: | |||
| ds.append(Instance(raw_chars=raw_chars, target=target)) | |||
| raw_chars = [] | |||
| target = [] | |||
| return ds | |||
| class MsraNERLoader(CNNERLoader): | |||
| """ | |||
| 读取MSRA-NER数据,数据中的格式应该类似于下列的内容 | |||
| Example:: | |||
| 我 O | |||
| 们 O | |||
| 变 O | |||
| 而 O | |||
| 以 O | |||
| 书 O | |||
| 会 O | |||
| ... | |||
| 读取后的DataSet包含以下的field | |||
| .. csv-table:: target列是基于BIO的编码方式 | |||
| :header: "raw_chars", "target" | |||
| "[我, 们, 变...]", "[O, O, ...]" | |||
| "[中, 共, 中, ...]", "[B-ORG, I-ORG, I-ORG, ...]" | |||
| "[...]", "[...]" | |||
| """ | |||
| def __init__(self): | |||
| super().__init__() | |||
| def download(self, dev_ratio:float=0.1, re_download:bool=False)->str: | |||
| """ | |||
| 自动下载MSRA-NER的数据,如果你使用该数据,请引用 Gina-Anne Levow, 2006, The Third International Chinese Language | |||
| Processing Bakeoff: Word Segmentation and Named Entity Recognition. | |||
| 根据dev_ratio的值随机将train中的数据取出一部分作为dev数据。下载完成后在output_dir中有train.conll, test.conll, | |||
| dev.conll三个文件。 | |||
| :param float dev_ratio: 如果路径中没有dev集,从train划分多少作为dev的数据. 如果为0,则不划分dev。 | |||
| :param bool re_download: 是否重新下载数据,以重新切分数据。 | |||
| :return: str, 数据集的目录地址 | |||
| """ | |||
| dataset_name = 'msra-ner' | |||
| data_dir = self._get_dataset_path(dataset_name=dataset_name) | |||
| modify_time = 0 | |||
| for filepath in glob.glob(os.path.join(data_dir, '*')): | |||
| modify_time = os.stat(filepath).st_mtime | |||
| break | |||
| if time.time() - modify_time > 1 and re_download: # 通过这种比较丑陋的方式判断一下文件是否是才下载的 | |||
| shutil.rmtree(data_dir) | |||
| data_dir = self._get_dataset_path(dataset_name=dataset_name) | |||
| if not os.path.exists(os.path.join(data_dir, 'dev.conll')): | |||
| if dev_ratio > 0: | |||
| assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)." | |||
| try: | |||
| with open(os.path.join(data_dir, 'train.conll'), 'r', encoding='utf-8') as f, \ | |||
| open(os.path.join(data_dir, 'middle_file.conll'), 'w', encoding='utf-8') as f1, \ | |||
| open(os.path.join(data_dir, 'dev.conll'), 'w', encoding='utf-8') as f2: | |||
| lines = [] # 一个sample包含很多行 | |||
| for line in f: | |||
| line = line.strip() | |||
| if line: | |||
| lines.append(line) | |||
| else: | |||
| if random.random() < dev_ratio: | |||
| f2.write('\n'.join(lines) + '\n\n') | |||
| else: | |||
| f1.write('\n'.join(lines) + '\n\n') | |||
| lines.clear() | |||
| os.remove(os.path.join(data_dir, 'train.conll')) | |||
| os.renames(os.path.join(data_dir, 'middle_file.conll'), os.path.join(data_dir, 'train.conll')) | |||
| finally: | |||
| if os.path.exists(os.path.join(data_dir, 'middle_file.conll')): | |||
| os.remove(os.path.join(data_dir, 'middle_file.conll')) | |||
| return data_dir | |||
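A hedged end-to-end sketch: download MSRA-NER with a dev split and feed the resulting directory to the matching pipe (the ratio and encoding are illustrative)::

    from fastNLP.io import MsraNERLoader, MsraNERPipe

    data_dir = MsraNERLoader().download(dev_ratio=0.1)                  # train/dev/test .conll files
    data_bundle = MsraNERPipe(encoding_type='bio').process_from_file(data_dir)
    print(data_bundle.get_dataset('train'))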
| class WeiboNERLoader(CNNERLoader): | |||
| def __init__(self): | |||
| super().__init__() | |||
| def download(self)->str: | |||
| """ | |||
| 自动下载Weibo-NER的数据,如果你使用了该数据,请引用 Nanyun Peng and Mark Dredze, 2015, Named Entity Recognition for | |||
| Chinese Social Media with Jointly Trained Embeddings. | |||
| :return: str | |||
| """ | |||
| dataset_name = 'weibo-ner' | |||
| data_dir = self._get_dataset_path(dataset_name=dataset_name) | |||
| return data_dir | |||
| class PeopleDailyNERLoader(CNNERLoader): | |||
| """ | |||
| 支持加载的数据格式如下 | |||
| Example:: | |||
| 当 O | |||
| 希 O | |||
| 望 O | |||
| 工 O | |||
| 程 O | |||
| 救 O | |||
| 助 O | |||
| 的 O | |||
| 百 O | |||
| 读取后的DataSet包含以下的field | |||
| .. csv-table:: target列是基于BIO的编码方式 | |||
| :header: "raw_chars", "target" | |||
| "[我, 们, 变...]", "[O, O, ...]" | |||
| "[中, 共, 中, ...]", "[B-ORG, I-ORG, I-ORG, ...]" | |||
| "[...]", "[...]" | |||
| """ | |||
| def __init__(self): | |||
| super().__init__() | |||
| def download(self) -> str: | |||
| dataset_name = 'peopledaily' | |||
| data_dir = self._get_dataset_path(dataset_name=dataset_name) | |||
| return data_dir | |||
| @@ -8,6 +8,8 @@ Pipe用于处理通过 Loader 读取的数据,所有的 Pipe 都包含 ``proce | |||
| """ | |||
| __all__ = [ | |||
| "Pipe", | |||
| "YelpFullPipe", | |||
| "YelpPolarityPipe", | |||
| "SSTPipe", | |||
| @@ -16,6 +18,9 @@ __all__ = [ | |||
| "Conll2003NERPipe", | |||
| "OntoNotesNERPipe", | |||
| "MsraNERPipe", | |||
| "WeiboNERPipe", | |||
| "PeopleDailyPipe", | |||
| "MatchingBertPipe", | |||
| "RTEBertPipe", | |||
| @@ -32,6 +37,7 @@ __all__ = [ | |||
| ] | |||
| from .classification import YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe | |||
| from .conll import Conll2003NERPipe, OntoNotesNERPipe | |||
| from .conll import Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe | |||
| from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, \ | |||
| MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe | |||
| from .pipe import Pipe | |||
| @@ -4,6 +4,8 @@ from .utils import iob2, iob2bioes | |||
| from ...core.const import Const | |||
| from ..loader.conll import Conll2003NERLoader, OntoNotesNERLoader | |||
| from .utils import _indexize, _add_words_field | |||
| from .utils import _add_chars_field | |||
| from ..loader.conll import PeopleDailyNERLoader, WeiboNERLoader, MsraNERLoader | |||
| class _NERPipe(Pipe): | |||
| @@ -17,7 +19,7 @@ class _NERPipe(Pipe): | |||
| :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 | |||
| :param bool lower: 是否将words小写化后再建立词表,绝大多数情况都不需要设置为True。 | |||
| :param int target_pad_val: target的padding值,target这一列pad的位置值为target_pad_val。默认为-100。 | |||
| :param int target_pad_val: target的padding值,target这一列pad的位置值为target_pad_val。默认为0。 | |||
| """ | |||
| def __init__(self, encoding_type: str = 'bio', lower: bool = False, target_pad_val=0): | |||
| @@ -32,31 +34,16 @@ class _NERPipe(Pipe): | |||
| """ | |||
| 支持的DataSet的field为 | |||
| .. csv-table:: Following is a demo layout of DataSet returned by Conll2003Loader | |||
| .. csv-table:: | |||
| :header: "raw_words", "target" | |||
| "[Nadim, Ladki]", "[B-PER, I-PER]" | |||
| "[AL-AIN, United, Arab, ...]", "[B-LOC, B-LOC, I-LOC, ...]" | |||
| "[...]", "[...]" | |||
| :param DataBundle data_bundle: 传入的DataBundle中的DataSet必须包含raw_words和ner两个field,且两个field的内容均为List[str]。 | |||
| 在传入DataBundle基础上原位修改。 | |||
| :return: DataBundle | |||
| Example:: | |||
| data_bundle = Conll2003Loader().load('/path/to/conll2003/') | |||
| data_bundle = Conll2003NERPipe().process(data_bundle) | |||
| # 获取train | |||
| tr_data = data_bundle.get_dataset('train') | |||
| # 获取target这个field的词表 | |||
| target_vocab = data_bundle.get_vocab('target') | |||
| # 获取words这个field的词表 | |||
| word_vocab = data_bundle.get_vocab('words') | |||
| """ | |||
| # 转换tag | |||
| for name, dataset in data_bundle.datasets.items(): | |||
| @@ -79,18 +66,6 @@ class _NERPipe(Pipe): | |||
| return data_bundle | |||
| def process_from_file(self, paths) -> DataBundle: | |||
| """ | |||
| :param paths: 支持路径类型参见 :class:`fastNLP.io.loader.ConllLoader` 的load函数。 | |||
| :return: DataBundle | |||
| """ | |||
| # 读取数据 | |||
| data_bundle = Conll2003NERLoader().load(paths) | |||
| data_bundle = self.process(data_bundle) | |||
| return data_bundle | |||
| class Conll2003NERPipe(_NERPipe): | |||
| """ | |||
| @@ -102,8 +77,8 @@ class Conll2003NERPipe(_NERPipe): | |||
| .. csv-table:: Following is a demo layout of DataSet returned by Conll2003Loader | |||
| :header: "raw_words", "words", "target", "seq_len" | |||
| "[Nadim, Ladki]", "[1, 2]", "[1, 2]", 2 | |||
| "[AL-AIN, United, Arab, ...]", "[3, 4, 5,...]", "[3, 4]", 10 | |||
| "[Nadim, Ladki]", "[2, 3]", "[1, 2]", 2 | |||
| "[AL-AIN, United, Arab, ...]", "[4, 5, 6,...]", "[3, 4]", 6 | |||
| "[...]", "[...]", "[...]", . | |||
| raw_words列为List[str], 是未转换的原始数据; words列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 | |||
| @@ -134,10 +109,13 @@ class OntoNotesNERPipe(_NERPipe): | |||
| .. csv-table:: Following is a demo layout of DataSet returned by Conll2003Loader | |||
| :header: "raw_words", "words", "target", "seq_len" | |||
| "[Nadim, Ladki]", "[1, 2]", "[1, 2]", 2 | |||
| "[AL-AIN, United, Arab, ...]", "[3, 4, 5,...]", "[3, 4]", 6 | |||
| "[Nadim, Ladki]", "[2, 3]", "[1, 2]", 2 | |||
| "[AL-AIN, United, Arab, ...]", "[4, 5, 6,...]", "[3, 4]", 6 | |||
| "[...]", "[...]", "[...]", . | |||
| raw_words列为List[str], 是未转换的原始数据; words列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 | |||
| target。返回的DataSet中被设置为input有words, target, seq_len; 设置为target有target。 | |||
| :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 | |||
| :param bool lower: 是否将words小写化后再建立词表,绝大多数情况都不需要设置为True。 | |||
| :param int target_pad_val: target的padding值,target这一列pad的位置值为target_pad_val。默认为0。 | |||
| @@ -146,3 +124,124 @@ class OntoNotesNERPipe(_NERPipe): | |||
| def process_from_file(self, paths): | |||
| data_bundle = OntoNotesNERLoader().load(paths) | |||
| return self.process(data_bundle) | |||
| class _CNNERPipe(Pipe): | |||
| """ | |||
| 中文NER任务的处理Pipe, 该Pipe会(1)复制raw_chars列,并命名为chars; (2)在chars, target列建立词表 | |||
| (创建 :class:`fastNLP.Vocabulary` 对象,所以在返回的DataBundle中将有两个Vocabulary); (3)将chars,target列根据相应的 | |||
| Vocabulary转换为index。 | |||
| raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 | |||
| target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target, seq_len。 | |||
| :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 | |||
| :param int target_pad_val: target的padding值,target这一列pad的位置值为target_pad_val。默认为0。 | |||
| """ | |||
| def __init__(self, encoding_type: str = 'bio', target_pad_val=0): | |||
| if encoding_type == 'bio': | |||
| self.convert_tag = iob2 | |||
| else: | |||
| self.convert_tag = lambda words: iob2bioes(iob2(words)) | |||
| self.target_pad_val = int(target_pad_val) | |||
| def process(self, data_bundle: DataBundle) -> DataBundle: | |||
| """ | |||
| 支持的DataSet的field为 | |||
| .. csv-table:: | |||
| :header: "raw_chars", "target" | |||
| "[相, 比, 之, 下,...]", "[O, O, O, O, ...]" | |||
| "[青, 岛, 海, 牛, 队, 和, ...]", "[B-ORG, I-ORG, I-ORG, ...]" | |||
| "[...]", "[...]" | |||
| raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 | |||
| target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 | |||
| :param DataBundle data_bundle: 传入的DataBundle中的DataSet必须包含raw_chars和target两个field,且两个field的内容均为List[str]。 | |||
| 在传入DataBundle基础上原位修改。 | |||
| :return: DataBundle | |||
| """ | |||
| # 转换tag | |||
| for name, dataset in data_bundle.datasets.items(): | |||
| dataset.apply_field(self.convert_tag, field_name=Const.TARGET, new_field_name=Const.TARGET) | |||
| _add_chars_field(data_bundle, lower=False) | |||
| # index | |||
| _indexize(data_bundle, input_field_name=Const.CHAR_INPUT, target_field_name=Const.TARGET) | |||
| input_fields = [Const.TARGET, Const.CHAR_INPUT, Const.INPUT_LEN] | |||
| target_fields = [Const.TARGET, Const.INPUT_LEN] | |||
| for name, dataset in data_bundle.datasets.items(): | |||
| dataset.set_pad_val(Const.TARGET, self.target_pad_val) | |||
| dataset.add_seq_len(Const.CHAR_INPUT) | |||
| data_bundle.set_input(*input_fields) | |||
| data_bundle.set_target(*target_fields) | |||
| return data_bundle | |||
| class MsraNERPipe(_CNNERPipe): | |||
| """ | |||
| 处理MSRA-NER的数据,处理之后的DataSet的field情况为 | |||
| .. csv-table:: | |||
| :header: "raw_chars", "chars", "target", "seq_len" | |||
| "[相, 比, 之, 下,...]", "[2, 3, 4, 5, ...]", "[0, 0, 0, 0, ...]", 11 | |||
| "[青, 岛, 海, 牛, 队, 和, ...]", "[10, 21, ....]", "[1, 2, 3, ...]", 21 | |||
| "[...]", "[...]", "[...]", . | |||
| raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 | |||
| target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 | |||
| """ | |||
| def process_from_file(self, paths=None) -> DataBundle: | |||
| data_bundle = MsraNERLoader().load(paths) | |||
| return self.process(data_bundle) | |||
| class PeopleDailyPipe(_CNNERPipe): | |||
| """ | |||
| 处理people daily的ner的数据,处理之后的DataSet的field情况为 | |||
| .. csv-table:: | |||
| :header: "raw_chars", "chars", "target", "seq_len" | |||
| "[相, 比, 之, 下,...]", "[2, 3, 4, 5, ...]", "[0, 0, 0, 0, ...]", 11 | |||
| "[青, 岛, 海, 牛, 队, 和, ...]", "[10, 21, ....]", "[1, 2, 3, ...]", 21 | |||
| "[...]", "[...]", "[...]", . | |||
| raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 | |||
| target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 | |||
| """ | |||
| def process_from_file(self, paths=None) -> DataBundle: | |||
| data_bundle = PeopleDailyNERLoader().load(paths) | |||
| return self.process(data_bundle) | |||
| class WeiboNERPipe(_CNNERPipe): | |||
| """ | |||
| 处理weibo的ner的数据,处理之后的DataSet的field情况为 | |||
| .. csv-table:: | |||
| :header: "raw_chars", "chars", "target", "seq_len" | |||
| "[相, 比, 之, 下,...]", "[2, 3, 4, 5, ...]", "[0, 0, 0, 0, ...]", 11 | |||
| "[青, 岛, 海, 牛, 队, 和, ...]", "[10, 21, ....]", "[1, 2, 3, ...]", 21 | |||
| "[...]", "[...]", "[...]", . | |||
| raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 | |||
| target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 | |||
| :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 | |||
| :param int target_pad_val: target的padding值,target这一列pad的位置值为target_pad_val。默认为0。 | |||
| """ | |||
| def process_from_file(self, paths=None) -> DataBundle: | |||
| data_bundle = WeiboNERLoader().load(paths) | |||
| return self.process(data_bundle) | |||
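After processing, the chars and target columns are indexed and seq_len has been added, so the bundle can be consumed directly; a short sketch (the data is downloaded automatically when no path is given)::

    from fastNLP.io import WeiboNERPipe

    data_bundle = WeiboNERPipe(encoding_type='bioes').process_from_file()
    char_vocab = data_bundle.get_vocab('chars')     # vocabulary over the chars column
    target_vocab = data_bundle.get_vocab('target')  # tag vocabulary (built without unk/pad)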
| @@ -50,8 +50,8 @@ class MatchingBertPipe(Pipe): | |||
| dataset.drop(lambda x: x[Const.TARGET] == '-') | |||
| for name, dataset in data_bundle.datasets.items(): | |||
| dataset.copy_field(Const.RAW_WORDS(0), Const.INPUTS(0)) | |||
| dataset.copy_field(Const.RAW_WORDS(1), Const.INPUTS(1)) | |||
| dataset.copy_field(Const.RAW_WORDS(0), Const.INPUTS(0), ) | |||
| dataset.copy_field(Const.RAW_WORDS(1), Const.INPUTS(1), ) | |||
| if self.lower: | |||
| for name, dataset in data_bundle.datasets.items(): | |||
| @@ -76,25 +76,27 @@ def _raw_split(sent): | |||
| return sent.split() | |||
| def _indexize(data_bundle): | |||
| def _indexize(data_bundle, input_field_name=Const.INPUT, target_field_name=Const.TARGET): | |||
| """ | |||
| 在dataset中的"words"列建立词表,"target"列建立词表,并把词表加入到data_bundle中。 | |||
| 在dataset中的field_name列建立词表,Const.TARGET列建立词表,并把词表加入到data_bundle中。 | |||
| :param data_bundle: | |||
| :param: str input_field_name: | |||
| :param: str target_field_name: 这一列的vocabulary没有unknown和padding | |||
| :return: | |||
| """ | |||
| src_vocab = Vocabulary() | |||
| src_vocab.from_dataset(data_bundle.datasets['train'], field_name=Const.INPUT, | |||
| src_vocab.from_dataset(data_bundle.datasets['train'], field_name=input_field_name, | |||
| no_create_entry_dataset=[dataset for name, dataset in data_bundle.datasets.items() if | |||
| name != 'train']) | |||
| src_vocab.index_dataset(*data_bundle.datasets.values(), field_name=Const.INPUT) | |||
| src_vocab.index_dataset(*data_bundle.datasets.values(), field_name=input_field_name) | |||
| tgt_vocab = Vocabulary(unknown=None, padding=None) | |||
| tgt_vocab.from_dataset(data_bundle.datasets['train'], field_name=Const.TARGET) | |||
| tgt_vocab.index_dataset(*data_bundle.datasets.values(), field_name=Const.TARGET) | |||
| tgt_vocab.from_dataset(data_bundle.datasets['train'], field_name=target_field_name) | |||
| tgt_vocab.index_dataset(*data_bundle.datasets.values(), field_name=target_field_name) | |||
| data_bundle.set_vocab(src_vocab, Const.INPUT) | |||
| data_bundle.set_vocab(tgt_vocab, Const.TARGET) | |||
| data_bundle.set_vocab(src_vocab, input_field_name) | |||
| data_bundle.set_vocab(tgt_vocab, target_field_name) | |||
| return data_bundle | |||
| @@ -107,14 +109,30 @@ def _add_words_field(data_bundle, lower=False): | |||
| :param bool lower:是否要小写化 | |||
| :return: 传入的DataBundle | |||
| """ | |||
| for name, dataset in data_bundle.datasets.items(): | |||
| dataset.copy_field(field_name=Const.RAW_WORD, new_field_name=Const.INPUT) | |||
| data_bundle.copy_field(field_name=Const.RAW_WORD, new_field_name=Const.INPUT, ignore_miss_dataset=True) | |||
| if lower: | |||
| for name, dataset in data_bundle.datasets.items(): | |||
| dataset[Const.INPUT].lower() | |||
| return data_bundle | |||
| def _add_chars_field(data_bundle, lower=False): | |||
| """ | |||
| 给data_bundle中的dataset中复制一列chars. 并根据lower参数判断是否需要小写化 | |||
| :param data_bundle: | |||
| :param bool lower:是否要小写化 | |||
| :return: 传入的DataBundle | |||
| """ | |||
| data_bundle.copy_field(field_name=Const.RAW_CHAR, new_field_name=Const.CHAR_INPUT, ignore_miss_dataset=True) | |||
| if lower: | |||
| for name, dataset in data_bundle.datasets.items(): | |||
| dataset[Const.CHAR_INPUT].lower() | |||
| return data_bundle | |||
| def _drop_empty_instance(data_bundle, field_name): | |||
| """ | |||
| 删除data_bundle的DataSet中存在的某个field为空的情况 | |||
| @@ -868,6 +868,7 @@ class _WordPieceBertModel(nn.Module): | |||
| self._cls_index = self.tokenzier.vocab['[CLS]'] | |||
| self._sep_index = self.tokenzier.vocab['[SEP]'] | |||
| self._wordpiece_unknown_index = self.tokenzier.vocab['[UNK]'] | |||
| self._wordpiece_pad_index = self.tokenzier.vocab['[PAD]'] # 需要用于生成word_piece | |||
| self.pooled_cls = pooled_cls | |||
| @@ -919,7 +920,7 @@ class _WordPieceBertModel(nn.Module): | |||
| outputs = bert_outputs[0].new_zeros((len(self.layers), batch_size, max_len, bert_outputs[0].size(-1))) | |||
| for l_index, l in enumerate(self.layers): | |||
| bert_output = bert_outputs[l] | |||
| if l==len(bert_outputs) and self.pooled_cls: | |||
| if l in (len(bert_outputs)-1, -1) and self.pooled_cls: | |||
| bert_output[:, 0] = pooled_cls | |||
| outputs[l_index] = bert_output | |||
| return outputs | |||
| @@ -1,115 +0,0 @@ | |||
| from fastNLP.io.data_bundle import DataSetLoader, DataBundle | |||
| from fastNLP.io import ConllLoader | |||
| from reproduction.seqence_labelling.ner.data.utils import iob2bioes, iob2 | |||
| from fastNLP import Const | |||
| from reproduction.utils import check_dataloader_paths | |||
| from fastNLP import Vocabulary | |||
| class ChineseNERLoader(DataSetLoader): | |||
| """ | |||
| 读取中文命名实体数据集,包括PeopleDaily, MSRA-NER, Weibo。数据在这里可以找到https://github.com/OYE93/Chinese-NLP-Corpus/tree/master/NER | |||
| 请确保输入数据的格式如下, 共两列,第一列为字,第二列为标签,不同句子以空行隔开 | |||
| 我 O | |||
| 们 O | |||
| 变 O | |||
| 而 O | |||
| 以 O | |||
| 书 O | |||
| 会 O | |||
| ... | |||
| """ | |||
| def __init__(self, encoding_type:str='bioes'): | |||
| """ | |||
| :param str encoding_type: 支持bio和bioes格式 | |||
| """ | |||
| super().__init__() | |||
| self._loader = ConllLoader(headers=['raw_chars', 'target'], indexes=[0, 1]) | |||
| assert encoding_type in ('bio', 'bioes') | |||
| self._tag_converters = [iob2] | |||
| if encoding_type == 'bioes': | |||
| self._tag_converters.append(iob2bioes) | |||
| def load(self, path:str): | |||
| dataset = self._loader.load(path) | |||
| def convert_tag_schema(tags): | |||
| for converter in self._tag_converters: | |||
| tags = converter(tags) | |||
| return tags | |||
| if self._tag_converters: | |||
| dataset.apply_field(convert_tag_schema, field_name=Const.TARGET, new_field_name=Const.TARGET) | |||
| return dataset | |||
| def process(self, paths, bigrams=False, trigrams=False): | |||
| """ | |||
| :param paths: | |||
| :param bool, bigrams: 是否包含生成bigram feature, [a, b, c, d] -> [ab, bc, cd, d<eos>] | |||
| :param bool, trigrams: 是否包含trigram feature,[a, b, c, d] -> [abc, bcd, cd<eos>, d<eos><eos>] | |||
| :return: ~fastNLP.io.DataBundle | |||
| 包含以下的fields | |||
| raw_chars: List[str] | |||
| chars: List[int] | |||
| seq_len: int, 字的长度 | |||
| bigrams: List[int], optional | |||
| trigrams: List[int], optional | |||
| target: List[int] | |||
| """ | |||
| paths = check_dataloader_paths(paths) | |||
| data = DataBundle() | |||
| input_fields = [Const.CHAR_INPUT, Const.INPUT_LEN, Const.TARGET] | |||
| target_fields = [Const.TARGET, Const.INPUT_LEN] | |||
| for name, path in paths.items(): | |||
| dataset = self.load(path) | |||
| if bigrams: | |||
| dataset.apply_field(lambda raw_chars: [c1+c2 for c1, c2 in zip(raw_chars, raw_chars[1:]+['<eos>'])], | |||
| field_name='raw_chars', new_field_name='bigrams') | |||
| if trigrams: | |||
| dataset.apply_field(lambda raw_chars: [c1+c2+c3 for c1, c2, c3 in zip(raw_chars, | |||
| raw_chars[1:]+['<eos>'], | |||
| raw_chars[2:]+['<eos>']*2)], | |||
| field_name='raw_chars', new_field_name='trigrams') | |||
| data.datasets[name] = dataset | |||
| char_vocab = Vocabulary().from_dataset(data.datasets['train'], field_name='raw_chars', | |||
| no_create_entry_dataset=[dataset for name, dataset in data.datasets.items() if name!='train']) | |||
| char_vocab.index_dataset(*data.datasets.values(), field_name='raw_chars', new_field_name=Const.CHAR_INPUT) | |||
| data.vocabs[Const.CHAR_INPUT] = char_vocab | |||
| target_vocab = Vocabulary(unknown=None, padding=None).from_dataset(data.datasets['train'], field_name=Const.TARGET) | |||
| target_vocab.index_dataset(*data.datasets.values(), field_name=Const.TARGET) | |||
| data.vocabs[Const.TARGET] = target_vocab | |||
| if bigrams: | |||
| bigram_vocab = Vocabulary().from_dataset(data.datasets['train'], field_name='bigrams', | |||
| no_create_entry_dataset=[dataset for name, dataset in | |||
| data.datasets.items() if name != 'train']) | |||
| bigram_vocab.index_dataset(*data.datasets.values(), field_name='bigrams', new_field_name='bigrams') | |||
| data.vocabs['bigrams'] = bigram_vocab | |||
| input_fields.append('bigrams') | |||
| if trigrams: | |||
| trigram_vocab = Vocabulary().from_dataset(data.datasets['train'], field_name='trigrams', | |||
| no_create_entry_dataset=[dataset for name, dataset in | |||
| data.datasets.items() if name != 'train']) | |||
| trigram_vocab.index_dataset(*data.datasets.values(), field_name='trigrams', new_field_name='trigrams') | |||
| data.vocabs['trigrams'] = trigram_vocab | |||
| input_fields.append('trigrams') | |||
| for name, dataset in data.datasets.items(): | |||
| dataset.add_seq_len(Const.CHAR_INPUT) | |||
| dataset.set_input(*input_fields) | |||
| dataset.set_target(*target_fields) | |||
| return data | |||
| @@ -12,22 +12,23 @@ sys.path.append('../../../') | |||
| from torch import nn | |||
| from fastNLP.embeddings import BertEmbedding, Embedding | |||
| from reproduction.seqence_labelling.chinese_ner.data.ChineseNER import ChineseNERLoader | |||
| from fastNLP import Trainer, Const | |||
| from fastNLP import BucketSampler, SpanFPreRecMetric, GradientClipCallback | |||
| from fastNLP.modules import MLP | |||
| from fastNLP.core.callback import WarmupCallback | |||
| from fastNLP import CrossEntropyLoss | |||
| from fastNLP.core.optimizer import AdamW | |||
| import os | |||
| from fastNLP.io import MsraNERPipe, MsraNERLoader, WeiboNERPipe | |||
| from fastNLP import cache_results | |||
| encoding_type = 'bio' | |||
| @cache_results('caches/msra.pkl') | |||
| @cache_results('caches/weibo.pkl', _refresh=False) | |||
| def get_data(): | |||
| data = ChineseNERLoader(encoding_type=encoding_type).process("MSRA/") | |||
| # data_dir = MsraNERLoader().download(dev_ratio=0) | |||
| # data = MsraNERPipe(encoding_type=encoding_type, target_pad_val=-100).process_from_file(data_dir) | |||
| data = WeiboNERPipe(encoding_type=encoding_type).process_from_file() | |||
| return data | |||
| data = get_data() | |||
| print(data) | |||
| @@ -35,10 +36,10 @@ print(data) | |||
| class BertCNNER(nn.Module): | |||
| def __init__(self, embed, tag_size): | |||
| super().__init__() | |||
| self.embedding = Embedding(embed, dropout=0.1) | |||
| self.embedding = embed | |||
| self.tag_size = tag_size | |||
| self.mlp = MLP(size_layer=[self.embedding.embedding_dim, tag_size]) | |||
| def forward(self, chars): | |||
| # batch_size, max_len = words.size() | |||
| chars = self.embedding(chars) | |||
| @@ -46,11 +47,15 @@ class BertCNNER(nn.Module): | |||
| return {Const.OUTPUT: outputs} | |||
| embed = BertEmbedding(data.vocabs[Const.CHAR_INPUT], model_dir_or_name='en-base', | |||
| pool_method='max', requires_grad=True, layers='11') | |||
| def predict(self, chars): | |||
| # batch_size, max_len = words.size() | |||
| chars = self.embedding(chars) | |||
| outputs = self.mlp(chars) | |||
| for name, dataset in data.datasets.items(): | |||
| dataset.set_pad_val(Const.TARGET, -100) | |||
| return {Const.OUTPUT: outputs} | |||
| embed = BertEmbedding(data.get_vocab(Const.CHAR_INPUT), model_dir_or_name='cn-wwm-ext', | |||
| pool_method='first', requires_grad=True, layers='11', include_cls_sep=False, dropout=0.5) | |||
| callbacks = [ | |||
| GradientClipCallback(clip_type='norm', clip_value=1), | |||
| @@ -58,7 +63,7 @@ callbacks = [ | |||
| ] | |||
| model = BertCNNER(embed, len(data.vocabs[Const.TARGET])) | |||
| optimizer = AdamW(model.parameters(), lr=1e-4) | |||
| optimizer = AdamW(model.parameters(), lr=3e-5) | |||
| for name, dataset in data.datasets.items(): | |||
| original_len = len(dataset) | |||
| @@ -66,13 +71,11 @@ for name, dataset in data.datasets.items(): | |||
| clipped_len = len(dataset) | |||
| print("Delete {} instances in {}.".format(original_len-clipped_len, name)) | |||
| os.environ['CUDA_VISIBLE_DEVICES'] = '0,1' | |||
| trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(), | |||
| device=[0, 1], dev_data=data.datasets['test'], batch_size=20, | |||
| device=0, dev_data=data.datasets['test'], batch_size=6, | |||
| metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type), | |||
| loss=CrossEntropyLoss(reduction='sum'), | |||
| callbacks=callbacks, num_workers=2, n_epochs=5, | |||
| check_code_level=-1, update_every=3) | |||
| check_code_level=0, update_every=3) | |||
| trainer.train() | |||
| @@ -1,7 +1,6 @@ | |||
| import sys | |||
| sys.path.append('../../..') | |||
| from reproduction.seqence_labelling.chinese_ner.data.ChineseNER import ChineseNERLoader | |||
| from fastNLP.embeddings import StaticEmbedding | |||
| from torch import nn | |||
| @@ -14,7 +13,51 @@ import torch.nn.functional as F | |||
| from fastNLP import seq_len_to_mask | |||
| from fastNLP.core.const import Const as C | |||
| from fastNLP import SpanFPreRecMetric, Trainer | |||
| from fastNLP import cache_results | |||
| from fastNLP import cache_results, Vocabulary | |||
| from fastNLP.io.pipe.utils import _add_chars_field, _indexize | |||
| from fastNLP.io.pipe import Pipe | |||
| from fastNLP.core.utils import iob2bioes, iob2 | |||
| from fastNLP.io import MsraNERLoader, WeiboNERLoader | |||
| class ChineseNERPipe(Pipe): | |||
| def __init__(self, encoding_type: str = 'bio', target_pad_val=0, bigram=False): | |||
| if encoding_type == 'bio': | |||
| self.convert_tag = iob2 | |||
| else: | |||
| self.convert_tag = lambda words: iob2bioes(iob2(words)) | |||
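| # e.g. (assuming the usual BIO -> BIOES behaviour of these helpers): | |||
| # iob2bioes(iob2(['B-PER', 'I-PER', 'O', 'B-LOC'])) -> ['B-PER', 'E-PER', 'O', 'S-LOC'] | |||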
| self.target_pad_val = int(target_pad_val) | |||
| self.bigram = bigram | |||
| def process(self, data_bundle): | |||
| data_bundle.copy_field(C.RAW_CHAR, C.CHAR_INPUT) | |||
| input_fields = [C.TARGET, C.CHAR_INPUT, C.INPUT_LEN] | |||
| target_fields = [C.TARGET, C.INPUT_LEN] | |||
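| # optionally add character bigrams, build their vocabulary from the training split only, and index them | |||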
| if self.bigram: | |||
| for dataset in data_bundle.datasets.values(): | |||
| dataset.apply_field(lambda chars: [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<eos>'])], | |||
| field_name=C.CHAR_INPUT, new_field_name='bigrams') | |||
| bigram_vocab = Vocabulary() | |||
| bigram_vocab.from_dataset(data_bundle.get_dataset('train'), field_name='bigrams', | |||
| no_create_entry_dataset=[ds for name, ds in data_bundle.datasets.items() if name!='train']) | |||
| bigram_vocab.index_dataset(*data_bundle.datasets.values(), field_name='bigrams') | |||
| data_bundle.set_vocab(bigram_vocab, field_name='bigrams') | |||
| input_fields.append('bigrams') | |||
| _add_chars_field(data_bundle, lower=False) | |||
| # index | |||
| _indexize(data_bundle, input_field_name=C.CHAR_INPUT, target_field_name=C.TARGET) | |||
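| # pad targets with target_pad_val and record each instance's character sequence length | |||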
| for name, dataset in data_bundle.datasets.items(): | |||
| dataset.set_pad_val(C.TARGET, self.target_pad_val) | |||
| dataset.add_seq_len(C.CHAR_INPUT) | |||
| data_bundle.set_input(*input_fields) | |||
| data_bundle.set_target(*target_fields) | |||
| return data_bundle | |||
| class CNBiLSTMCRFNER(nn.Module): | |||
| def __init__(self, char_embed, num_classes, bigram_embed=None, trigram_embed=None, num_layers=1, hidden_size=100, | |||
| @@ -73,22 +116,21 @@ class CNBiLSTMCRFNER(nn.Module): | |||
| return self._forward(chars, bigrams, trigrams, seq_len) | |||
| # data_bundle = pickle.load(open('caches/msra.pkl', 'rb')) | |||
| @cache_results('caches/msra.pkl', _refresh=True) | |||
| @cache_results('caches/weibo-lstm.pkl', _refresh=False) | |||
| def get_data(): | |||
| data_bundle = ChineseNERLoader().process('MSRA-NER/', bigrams=True) | |||
| char_embed = StaticEmbedding(data_bundle.vocabs['chars'], | |||
| model_dir_or_name='cn-char') | |||
| bigram_embed = StaticEmbedding(data_bundle.vocabs['bigrams'], | |||
| model_dir_or_name='cn-bigram') | |||
| data_bundle = WeiboNERLoader().load() | |||
| data_bundle = ChineseNERPipe(encoding_type='bioes', bigram=True).process(data_bundle) | |||
| char_embed = StaticEmbedding(data_bundle.get_vocab(C.CHAR_INPUT), model_dir_or_name='cn-fasttext') | |||
| bigram_embed = StaticEmbedding(data_bundle.get_vocab('bigrams'), embedding_dim=100, min_freq=3) | |||
| return data_bundle, char_embed, bigram_embed | |||
| data_bundle, char_embed, bigram_embed = get_data() | |||
| # data_bundle = get_data() | |||
| print(data_bundle) | |||
| # exit(0) | |||
| data_bundle.datasets['train'].set_input('target') | |||
| data_bundle.datasets['dev'].set_input('target') | |||
| model = CNBiLSTMCRFNER(char_embed, num_classes=len(data_bundle.vocabs['target']), bigram_embed=bigram_embed) | |||
| Trainer(data_bundle.datasets['train'], model, batch_size=640, | |||
| Trainer(data_bundle.datasets['train'], model, batch_size=20, | |||
| metrics=SpanFPreRecMetric(data_bundle.vocabs['target'], encoding_type='bioes'), | |||
| num_workers=2, dev_data=data_bundle.datasets['dev'], device=3).train() | |||
| num_workers=2, dev_data=data_bundle.datasets['dev'], device=0).train() | |||
| @@ -2,7 +2,6 @@ | |||
| import torch | |||
| from torch import nn | |||
| from fastNLP import seq_len_to_mask | |||
| from fastNLP.modules import Embedding | |||
| from fastNLP.modules import LSTM | |||
| from fastNLP.modules import ConditionalRandomField, allowed_transitions | |||
| import torch.nn.functional as F | |||
| @@ -1,8 +1,7 @@ | |||
| import sys | |||
| sys.path.append('../../..') | |||
| from fastNLP.embeddings.embedding import CNNCharEmbedding, StaticEmbedding | |||
| from fastNLP.core.vocabulary import VocabularyOption | |||
| from fastNLP.embeddings import CNNCharEmbedding, StaticEmbedding | |||
| from reproduction.seqence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF | |||
| from fastNLP import Trainer | |||
| @@ -11,68 +10,44 @@ from fastNLP import BucketSampler | |||
| from fastNLP import Const | |||
| from torch.optim import SGD | |||
| from fastNLP import GradientClipCallback | |||
| from fastNLP.core.callback import FitlogCallback, LRScheduler | |||
| from fastNLP.core.callback import EvaluateCallback, LRScheduler | |||
| from torch.optim.lr_scheduler import LambdaLR | |||
| # from reproduction.seqence_labelling.ner.model.swats import SWATS | |||
| from fastNLP import cache_results | |||
| import fitlog | |||
| fitlog.debug() | |||
| from reproduction.seqence_labelling.ner.data.Conll2003Loader import Conll2003DataLoader | |||
| from fastNLP.io.pipe.conll import Conll2003NERPipe | |||
| encoding_type = 'bioes' | |||
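| # cache_results pickles the function's return value to the given path and reuses it on later runs; _refresh=True forces recomputation | |||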
| @cache_results('caches/upper_conll2003.pkl') | |||
| @cache_results('caches/conll2003_new.pkl', _refresh=True) | |||
| def load_data(): | |||
| data = Conll2003DataLoader(encoding_type=encoding_type).process('../../../../others/data/conll2003', | |||
| word_vocab_opt=VocabularyOption(min_freq=1), | |||
| lower=False) | |||
| # replace the paths with your own | |||
| paths = {'test':"NER/corpus/CoNLL-2003/eng.testb", | |||
| 'train':"NER/corpus/CoNLL-2003/eng.train", | |||
| 'dev':"NER/corpus/CoNLL-2003/eng.testa"} | |||
| data = Conll2003NERPipe(encoding_type=encoding_type, target_pad_val=0).process_from_file(paths) | |||
| return data | |||
| data = load_data() | |||
| print(data) | |||
| char_embed = CNNCharEmbedding(vocab=data.vocabs['words'], embed_size=30, char_emb_size=30, filter_nums=[30], | |||
| kernel_sizes=[3], word_dropout=0.01, dropout=0.5) | |||
| # char_embed = LSTMCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30 ,char_emb_size=30) | |||
| word_embed = StaticEmbedding(vocab=data.vocabs['words'], | |||
| model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/glove.6B.100d.txt', | |||
| char_embed = CNNCharEmbedding(vocab=data.get_vocab('words'), embed_size=30, char_emb_size=30, filter_nums=[30], | |||
| kernel_sizes=[3], word_dropout=0, dropout=0.5) | |||
| word_embed = StaticEmbedding(vocab=data.get_vocab('words'), | |||
| model_dir_or_name='en-glove-6b-100d', | |||
| requires_grad=True, lower=True, word_dropout=0.01, dropout=0.5) | |||
| word_embed.embedding.weight.data = word_embed.embedding.weight.data/word_embed.embedding.weight.data.std() | |||
| # import joblib | |||
| # raw_data = joblib.load('/hdd/fudanNLP/fastNLP/others/NER-with-LS/data/conll_with_data.joblib') | |||
| # def convert_to_ids(raw_words): | |||
| # ids = [] | |||
| # for word in raw_words: | |||
| # id = raw_data['word_to_id'][word] | |||
| # id = raw_data['id_to_emb_map'][id] | |||
| # ids.append(id) | |||
| # return ids | |||
| # word_embed = raw_data['emb_matrix'] | |||
| # for name, dataset in data.datasets.items(): | |||
| # dataset.apply_field(convert_to_ids, field_name='raw_words', new_field_name=Const.INPUT) | |||
| # elmo_embed = ElmoEmbedding(vocab=data.vocabs['cap_words'], | |||
| # model_dir_or_name='.', | |||
| # requires_grad=True, layers='mix') | |||
| # char_embed = StackEmbedding([elmo_embed, char_embed]) | |||
| model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET], | |||
| encoding_type=encoding_type) | |||
| callbacks = [ | |||
| GradientClipCallback(clip_type='value', clip_value=5), | |||
| FitlogCallback({'test':data.datasets['test']}, verbose=1), | |||
| # SaveModelCallback('save_models/', top=3, only_param=False, save_on_exception=True) | |||
| EvaluateCallback(data=data.get_dataset('test')) # additionally evaluate performance on the test set | |||
| ] | |||
| # optimizer = Adam(model.parameters(), lr=0.001) | |||
| # optimizer = SWATS(model.parameters(), verbose=True) | |||
| optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9) | |||
| optimizer = SGD(model.parameters(), lr=0.008, momentum=0.9) | |||
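| # scale the initial learning rate by 1 / (1 + 0.05 * epoch) | |||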
| scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch))) | |||
| callbacks.append(scheduler) | |||
| trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(batch_size=20), | |||
| device=1, dev_data=data.datasets['dev'], batch_size=20, | |||
| trainer = Trainer(train_data=data.get_dataset('train'), model=model, optimizer=optimizer, sampler=BucketSampler(), | |||
| device=0, dev_data=data.get_dataset('dev'), batch_size=20, | |||
| metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type), | |||
| callbacks=callbacks, num_workers=2, n_epochs=100) | |||
| callbacks=callbacks, num_workers=2, n_epochs=100, dev_batch_size=512) | |||
| trainer.train() | |||
| @@ -11,52 +11,37 @@ from fastNLP import Const | |||
| from torch.optim import SGD | |||
| from torch.optim.lr_scheduler import LambdaLR | |||
| from fastNLP import GradientClipCallback | |||
| from fastNLP.core.vocabulary import VocabularyOption | |||
| from fastNLP.core.callback import FitlogCallback, LRScheduler | |||
| from functools import partial | |||
| from torch import nn | |||
| from fastNLP import BucketSampler | |||
| from fastNLP.core.callback import EvaluateCallback, LRScheduler | |||
| from fastNLP import cache_results | |||
| from fastNLP.io.pipe.conll import OntoNotesNERPipe | |||
| import fitlog | |||
| fitlog.debug() | |||
| fitlog.set_log_dir('logs/') | |||
| fitlog.add_hyper_in_file(__file__) | |||
| #######hyper | |||
| normalize = False | |||
| divide_std = True | |||
| lower = False | |||
| lr = 0.015 | |||
| lr = 0.01 | |||
| dropout = 0.5 | |||
| batch_size = 20 | |||
| init_method = 'default' | |||
| batch_size = 32 | |||
| job_embed = False | |||
| data_name = 'ontonote' | |||
| #######hyper | |||
| init_method = {'default': None, | |||
| 'xavier': partial(nn.init.xavier_normal_, gain=0.02), | |||
| 'normal': partial(nn.init.normal_, std=0.02) | |||
| }[init_method] | |||
| from reproduction.seqence_labelling.ner.data.OntoNoteLoader import OntoNoteNERDataLoader | |||
| encoding_type = 'bioes' | |||
| @cache_results('caches/ontonotes.pkl') | |||
| @cache_results('caches/ontonotes.pkl', _refresh=True) | |||
| def cache(): | |||
| data = OntoNoteNERDataLoader(encoding_type=encoding_type).process('../../../../others/data/v4/english', | |||
| lower=lower, | |||
| word_vocab_opt=VocabularyOption(min_freq=1)) | |||
| char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30], | |||
| kernel_sizes=[3]) | |||
| data = OntoNotesNERPipe(encoding_type=encoding_type).process_from_file('../../../../others/data/v4/english') | |||
| char_embed = CNNCharEmbedding(vocab=data.vocabs['words'], embed_size=30, char_emb_size=30, filter_nums=[30], | |||
| kernel_sizes=[3], dropout=dropout) | |||
| word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT], | |||
| model_dir_or_name='/remote-home/hyan01/fastnlp_caches/glove.6B.100d/glove.6B.100d.txt', | |||
| model_dir_or_name='en-glove-100d', | |||
| requires_grad=True, | |||
| normalize=normalize, | |||
| init_method=init_method) | |||
| word_dropout=0.01, | |||
| dropout=dropout, | |||
| lower=True, | |||
| min_freq=2) | |||
| return data, char_embed, word_embed | |||
| data, char_embed, word_embed = cache() | |||
| @@ -67,7 +52,7 @@ model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=1200, num_layers=1, tag | |||
| callbacks = [ | |||
| GradientClipCallback(clip_value=5, clip_type='value'), | |||
| FitlogCallback(data.datasets['test'], verbose=1) | |||
| EvaluateCallback(data.datasets['test']) | |||
| ] | |||
| optimizer = SGD(model.parameters(), lr=lr, momentum=0.9) | |||
| @@ -75,8 +60,8 @@ scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.0 | |||
| callbacks.append(scheduler) | |||
| trainer = Trainer(train_data=data.datasets['dev'][:100], model=model, optimizer=optimizer, sampler=None, | |||
| device=0, dev_data=data.datasets['dev'][:100], batch_size=batch_size, | |||
| trainer = Trainer(train_data=data.get_dataset('train'), model=model, optimizer=optimizer, sampler=BucketSampler(num_buckets=100), | |||
| device=0, dev_data=data.get_dataset('dev'), batch_size=batch_size, | |||
| metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type), | |||
| callbacks=callbacks, num_workers=1, n_epochs=100) | |||
| callbacks=callbacks, num_workers=1, n_epochs=100, dev_batch_size=256) | |||
| trainer.train() | |||
| @@ -0,0 +1,14 @@ | |||
| import unittest | |||
| from fastNLP import Vocabulary | |||
| from fastNLP.embeddings import BertEmbedding | |||
| import torch | |||
| import os | |||
| @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") | |||
| class TestDownload(unittest.TestCase): | |||
| def test_download(self): | |||
| # import os | |||
| vocab = Vocabulary().add_word_lst("This is a test .".split()) | |||
| embed = BertEmbedding(vocab, model_dir_or_name='/remote-home/source/fastnlp_caches/embedding/bert-base-cased') | |||
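| # embed a dummy batch of three token indices and print the output size | |||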
| words = torch.LongTensor([[0, 1, 2]]) | |||
| print(embed(words).size()) | |||
| @@ -0,0 +1,21 @@ | |||
| import unittest | |||
| import os | |||
| from fastNLP.io.loader.conll import MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader | |||
| class MSRANERTest(unittest.TestCase): | |||
| @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") | |||
| def test_download(self): | |||
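| # skip re-downloading if the corpus is already cached, then load and inspect it | |||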
| MsraNERLoader().download(re_download=False) | |||
| data_bundle = MsraNERLoader().load() | |||
| print(data_bundle) | |||
| class PeopleDailyTest(unittest.TestCase): | |||
| @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") | |||
| def test_download(self): | |||
| PeopleDailyNERLoader().download() | |||
| class WeiboNERTest(unittest.TestCase): | |||
| @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") | |||
| def test_download(self): | |||
| WeiboNERLoader().download() | |||
| @@ -0,0 +1,12 @@ | |||
| import unittest | |||
| import os | |||
| from fastNLP.io import MsraNERPipe, PeopleDailyPipe, WeiboNERPipe | |||
| @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") | |||
| class TestPipe(unittest.TestCase): | |||
| def test_process_from_file(self): | |||
| for pipe in [MsraNERPipe, PeopleDailyPipe, WeiboNERPipe]: | |||
| with self.subTest(pipe=pipe): | |||
| print(pipe) | |||
| data_bundle = pipe().process_from_file() | |||
| print(data_bundle) | |||