from fastNLP.core.vocabulary import VocabularyOption from fastNLP.io.base_loader import DataSetLoader, DataBundle from typing import Union, Dict from fastNLP import DataSet from fastNLP import Vocabulary from fastNLP import Const from reproduction.utils import check_dataloader_paths from fastNLP.io import ConllLoader from reproduction.seqence_labelling.ner.data.utils import iob2bioes, iob2 class OntoNoteNERDataLoader(DataSetLoader): """ 用于读取处理为Conll格式后的OntoNote数据。将OntoNote数据处理为conll格式的过程可以参考https://github.com/yhcc/OntoNotes-5.0-NER。 """ def __init__(self, encoding_type:str='bioes'): assert encoding_type in ('bioes', 'bio') self.encoding_type = encoding_type if encoding_type=='bioes': self.encoding_method = iob2bioes else: self.encoding_method = iob2 def load(self, path:str)->DataSet: """ 给定一个文件路径,读取数据。返回的DataSet包含以下的field raw_words: List[str] target: List[str] :param path: :return: """ dataset = ConllLoader(headers=['raw_words', 'target'], indexes=[3, 10]).load(path) def convert_to_bio(tags): bio_tags = [] flag = None for tag in tags: label = tag.strip("()*") if '(' in tag: bio_label = 'B-' + label flag = label elif flag: bio_label = 'I-' + flag else: bio_label = 'O' if ')' in tag: flag = None bio_tags.append(bio_label) return self.encoding_method(bio_tags) def convert_word(words): converted_words = [] for word in words: word = word.replace('/.', '.') # 有些结尾的.是/.形式的 if not word.startswith('-'): converted_words.append(word) continue # 以下是由于这些符号被转义了,再转回来 tfrs = {'-LRB-':'(', '-RRB-': ')', '-LSB-': '[', '-RSB-': ']', '-LCB-': '{', '-RCB-': '}' } if word in tfrs: converted_words.append(tfrs[word]) else: converted_words.append(word) return converted_words dataset.apply_field(convert_word, field_name='raw_words', new_field_name='raw_words') dataset.apply_field(convert_to_bio, field_name='target', new_field_name='target') return dataset def process(self, paths: Union[str, Dict[str, str]], word_vocab_opt:VocabularyOption=None, lower:bool=True)->DataBundle: """ 读取并处理数据。返回的DataInfo包含以下的内容 vocabs: word: Vocabulary target: Vocabulary datasets: train: DataSet words: List[int], 被设置为input target: int. label,被同时设置为input和target seq_len: int. 句子的长度,被同时设置为input和target raw_words: List[str] xxx(根据传入的paths可能有所变化) :param paths: :param word_vocab_opt: vocabulary的初始化值 :param lower: 是否使用小写 :return: """ paths = check_dataloader_paths(paths) data = DataBundle() input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN] target_fields = [Const.TARGET, Const.INPUT_LEN] for name, path in paths.items(): dataset = self.load(path) dataset.apply_field(lambda words: words, field_name='raw_words', new_field_name=Const.INPUT) if lower: dataset.words.lower() data.datasets[name] = dataset # 对construct vocab word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt) word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT, no_create_entry_dataset=[dataset for name, dataset in data.datasets.items() if name!='train']) word_vocab.index_dataset(*data.datasets.values(), field_name=Const.INPUT, new_field_name=Const.INPUT) data.vocabs[Const.INPUT] = word_vocab # cap words cap_word_vocab = Vocabulary() cap_word_vocab.from_dataset(*data.datasets.values(), field_name='raw_words') cap_word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name='cap_words') input_fields.append('cap_words') data.vocabs['cap_words'] = cap_word_vocab # 对target建vocab target_vocab = Vocabulary(unknown=None, padding=None) target_vocab.from_dataset(*data.datasets.values(), field_name=Const.TARGET) target_vocab.index_dataset(*data.datasets.values(), field_name=Const.TARGET) data.vocabs[Const.TARGET] = target_vocab for name, dataset in data.datasets.items(): dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN) dataset.set_input(*input_fields) dataset.set_target(*target_fields) return data if __name__ == '__main__': loader = OntoNoteNERDataLoader() dataset = loader.load('/hdd/fudanNLP/fastNLP/others/data/v4/english/test.txt') print(dataset.target.value_count()) print(dataset[:4]) """ train 115812 2200752 development 15680 304684 test 12217 230111 train 92403 1901772 valid 13606 279180 test 10258 204135 """