import numpy as np
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.io.base_loader import DataBundle
from fastNLP.io.dataset_loader import JsonLoader
from fastNLP.core.const import Const
from tools.logger import *

WORD_PAD = "[PAD]"
WORD_UNK = "[UNK]"
DOMAIN_UNK = "X"
TAG_UNK = "X"


class SummarizationLoader(JsonLoader):
    """
    Load a summarization dataset. The loaded DataSet contains the fields::

        text: list(str), document sentences
        summary: list(str), summary sentences
        text_wd: list(list(str)), tokenized document
        summary_wd: list(list(str)), tokenized summary
        label: list(int), indices of the sentences selected as the extractive summary
        flatten_label: list(int), 0 or 1 per document sentence, the flattened labels
        domain: str, optional
        tag: list(str), optional

    Data sources: CNN_DailyMail, Newsroom, DUC
    """

    def __init__(self):
        super(SummarizationLoader, self).__init__()

    def _load(self, path):
        ds = super(SummarizationLoader, self)._load(path)

        def _lower_text(text_list):
            return [text.lower() for text in text_list]

        def _split_list(text_list):
            return [text.split() for text in text_list]

        def _convert_label(label, sent_len):
            # Turn the list of selected-sentence indices into a 0/1 vector
            # over all sentences of the document.
            np_label = np.zeros(sent_len, dtype=int)
            if label:
                np_label[np.array(label)] = 1
            return np_label.tolist()

        ds.apply(lambda x: _lower_text(x['text']), new_field_name='text')
        ds.apply(lambda x: _lower_text(x['summary']), new_field_name='summary')
        ds.apply(lambda x: _split_list(x['text']), new_field_name='text_wd')
        ds.apply(lambda x: _split_list(x['summary']), new_field_name='summary_wd')
        ds.apply(lambda x: _convert_label(x["label"], len(x["text"])), new_field_name="flatten_label")

        return ds

    def process(self, paths, vocab_size, vocab_path, sent_max_len, doc_max_timesteps, domain=False, tag=False, load_vocab=True):
        """
        :param paths: dict, a data file path for each dataset split
        :param vocab_size: int, maximum size of the vocabulary
        :param vocab_path: str, path of the vocabulary file
        :param sent_max_len: int, maximum number of tokens per sentence
        :param doc_max_timesteps: int, maximum number of sentences per document
        :param domain: bool, build a vocab for the publication, with 'X' for unknown
        :param tag: bool, build a vocab for the tags, with 'X' for unknown
        :param load_vocab: bool, load an existing vocab (True) or build a new one from the training set (False)
        :return: DataBundle
            datasets: dict, keys correspond to the keys of the paths dict
            vocabs: dict, keys: vocab (if "train" in paths), domain (if domain=True), tag (if tag=True)
            embeddings: optional
        """
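        # The layout of ``paths`` below is an assumption inferred from the
        # docstring and the loop over ``paths.items()``: it maps a split name
        # to a JSON data file, e.g.
        #     paths = {"train": "train.jsonl", "val": "val.jsonl", "test": "test.jsonl"}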
        def _pad_sent(text_wd):
            # Pad or truncate every sentence to exactly sent_max_len tokens.
            pad_text_wd = []
            for sent_wd in text_wd:
                if len(sent_wd) < sent_max_len:
                    pad_num = sent_max_len - len(sent_wd)
                    sent_wd.extend([WORD_PAD] * pad_num)
                else:
                    sent_wd = sent_wd[:sent_max_len]
                pad_text_wd.append(sent_wd)
            return pad_text_wd

        def _token_mask(text_wd):
            # Per sentence: 1 for a real token, 0 for padding.
            token_mask_list = []
            for sent_wd in text_wd:
                token_num = len(sent_wd)
                if token_num < sent_max_len:
                    mask = [1] * token_num + [0] * (sent_max_len - token_num)
                else:
                    mask = [1] * sent_max_len
                token_mask_list.append(mask)
            return token_mask_list

        def _pad_label(label):
            # Pad or truncate the 0/1 sentence labels to doc_max_timesteps.
            text_len = len(label)
            if text_len < doc_max_timesteps:
                pad_label = label + [0] * (doc_max_timesteps - text_len)
            else:
                pad_label = label[:doc_max_timesteps]
            return pad_label

        def _pad_doc(text_wd):
            # Pad or truncate the document to doc_max_timesteps sentences.
            text_len = len(text_wd)
            if text_len < doc_max_timesteps:
                padding = [WORD_PAD] * sent_max_len
                pad_text = text_wd + [padding] * (doc_max_timesteps - text_len)
            else:
                pad_text = text_wd[:doc_max_timesteps]
            return pad_text

        def _sent_mask(text_wd):
            # 1 for a real sentence, 0 for a padded one.
            text_len = len(text_wd)
            if text_len < doc_max_timesteps:
                sent_mask = [1] * text_len + [0] * (doc_max_timesteps - text_len)
            else:
                sent_mask = [1] * doc_max_timesteps
            return sent_mask

        datasets = {}
        train_ds = None
        for key, value in paths.items():
            ds = self.load(value)
            # pad each sentence
            ds.apply(lambda x: _pad_sent(x["text_wd"]), new_field_name="pad_text_wd")
            ds.apply(lambda x: _token_mask(x["text_wd"]), new_field_name="pad_token_mask")
            # pad the document
            ds.apply(lambda x: _pad_doc(x["pad_text_wd"]), new_field_name="pad_text")
            ds.apply(lambda x: _sent_mask(x["pad_text_wd"]), new_field_name="seq_len")
            ds.apply(lambda x: _pad_label(x["flatten_label"]), new_field_name="pad_label")
            # rename fields
            ds.rename_field("pad_text", Const.INPUT)
            ds.rename_field("seq_len", Const.INPUT_LEN)
            ds.rename_field("pad_label", Const.TARGET)
            # set input and target; seq_len is also exposed on the target side
            ds.set_input(Const.INPUT, Const.INPUT_LEN)
            ds.set_target(Const.TARGET, Const.INPUT_LEN)
            datasets[key] = ds
            if "train" in key:
                train_ds = datasets[key]

        vocab_dict = {}
        if not load_vocab:
            logger.info("[INFO] Building a new vocab from the training dataset!")
            if train_ds is None:
                raise ValueError("A training set is required to build the vocabulary!")
            vocabs = Vocabulary(max_size=vocab_size, padding=WORD_PAD, unknown=WORD_UNK)
            vocabs.from_dataset(train_ds, field_name=["text_wd", "summary_wd"])
            vocab_dict["vocab"] = vocabs
        else:
            logger.info("[INFO] Loading an existing vocab from %s!" % vocab_path)
            word_list = []
            with open(vocab_path, 'r', encoding='utf8') as vocab_f:
                cnt = 2  # reserved for pad and unk
                for line in vocab_f:
                    pieces = line.split("\t")
                    word_list.append(pieces[0])
                    cnt += 1
                    if cnt > vocab_size:
                        break
            vocabs = Vocabulary(max_size=vocab_size, padding=WORD_PAD, unknown=WORD_UNK)
            vocabs.add_word_lst(word_list)
            vocabs.build_vocab()
            vocab_dict["vocab"] = vocabs

        if domain:
            domaindict = Vocabulary(padding=None, unknown=DOMAIN_UNK)
            domaindict.from_dataset(train_ds, field_name="publication")
            vocab_dict["domain"] = domaindict
        if tag:
            tagdict = Vocabulary(padding=None, unknown=TAG_UNK)
            tagdict.from_dataset(train_ds, field_name="tag")
            vocab_dict["tag"] = tagdict

        # Map the (padded) input tokens to vocabulary indices in place.
        for ds in datasets.values():
            vocab_dict["vocab"].index_dataset(ds, field_name=Const.INPUT, new_field_name=Const.INPUT)

        return DataBundle(vocabs=vocab_dict, datasets=datasets)
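
# A minimal usage sketch, not part of the original module: the file paths and
# hyper-parameter values below are illustrative assumptions, meant only to
# show the calling convention of SummarizationLoader.process.
if __name__ == "__main__":
    loader = SummarizationLoader()
    paths = {"train": "data/train.jsonl", "test": "data/test.jsonl"}  # hypothetical files
    data_bundle = loader.process(paths,
                                 vocab_size=50000,
                                 vocab_path="data/vocab",  # only read when load_vocab=True
                                 sent_max_len=100,
                                 doc_max_timesteps=50,
                                 load_vocab=False)  # build the vocab from the train split
    logger.info("vocab size: %d" % len(data_bundle.vocabs["vocab"]))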