| @@ -93,9 +93,13 @@ class DataSetGetter: | |||
| class SamplerAdapter(torch.utils.data.Sampler): | |||
| def __init__(self, sampler, dataset): | |||
| super().__init__(dataset) | |||
| self.sampler = sampler | |||
| self.dataset = dataset | |||
| def __len__(self): | |||
| return len(self.dataset) | |||
| def __iter__(self): | |||
| return iter(self.sampler(self.dataset)) | |||
| @@ -165,15 +169,19 @@ class DataSetIter(BatchIter): | |||
| timeout=0, worker_init_fn=None): | |||
| super().__init__() | |||
| assert isinstance(dataset, DataSet) | |||
| sampler = SamplerAdapter(sampler=sampler or SequentialSampler(), dataset=dataset) | |||
| if not isinstance(sampler, torch.utils.data.Sampler): | |||
| self.sampler = SamplerAdapter(sampler=sampler or SequentialSampler(), dataset=dataset) | |||
| else: | |||
| self.sampler = sampler | |||
| dataset = DataSetGetter(dataset, as_numpy) | |||
| collate_fn = dataset.collate_fn if hasattr(dataset, 'collate_fn') else None | |||
| self.dataiter = torch.utils.data.DataLoader( | |||
| dataset=dataset, batch_size=batch_size, sampler=sampler, | |||
| dataset=dataset, batch_size=batch_size, sampler=self.sampler, | |||
| collate_fn=collate_fn, num_workers=num_workers, | |||
| pin_memory=pin_memory, drop_last=drop_last, | |||
| timeout=timeout, worker_init_fn=worker_init_fn) | |||
| self.num_batches = self.get_num_batches(len(dataset), batch_size, drop_last) | |||
| # 以sampler的数量为准,因为DistributedSampler的时候每个进程上并不是所有的数据都用上了 | |||
| self.num_batches = self.get_num_batches(len(self.dataiter.sampler), batch_size, drop_last) | |||
| self.batch_size = batch_size | |||
| @@ -182,7 +190,7 @@ class TorchLoaderIter(BatchIter): | |||
| super().__init__() | |||
| assert isinstance(dataset, torch.utils.data.DataLoader) | |||
| self.dataiter = dataset | |||
| self.num_batches = self.get_num_batches(len(dataset), dataset.batch_size, dataset.drop_last) | |||
| self.num_batches = self.get_num_batches(len(dataset.sampler), dataset.batch_size, dataset.drop_last) | |||
| self.batch_size = dataset.batch_size | |||
| @@ -479,7 +479,7 @@ class FitlogCallback(Callback): | |||
| self.datasets[key] = value | |||
| elif isinstance(data, DataSet): | |||
| self.datasets['test'] = data | |||
| else: | |||
| elif data is not None: | |||
| raise TypeError("data receives dict[DataSet] or DataSet object.") | |||
| self.verbose = verbose | |||
| @@ -487,7 +487,7 @@ class DataSet(object): | |||
| """ | |||
| 删除第index个instance | |||
| :param int index: 需要删除的instance的index,从0开始 | |||
| :param int index: 需要删除的instance的index,序号从0开始。 | |||
| """ | |||
| assert isinstance(index, int), "Only integer supported." | |||
| if len(self) <= index: | |||
| @@ -566,7 +566,7 @@ class DataSet(object): | |||
| raise KeyError("DataSet has no field named {}.".format(old_name)) | |||
| return self | |||
| def set_target(self, *field_names, flag=True): | |||
| def set_target(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True): | |||
| """ | |||
| 将field_names的field设置为target | |||
| @@ -577,11 +577,14 @@ class DataSet(object): | |||
| :param str field_names: field的名称 | |||
| :param bool flag: 将field_name的target状态设置为flag | |||
| :param bool use_1st_ins_infer_dim_type: 如果为True,将不会check该列是否所有数据都是同样的维度,同样的类型。将直接使用第一 | |||
| 行的数据进行类型和维度推断本列的数据的类型和维度。 | |||
| """ | |||
| assert isinstance(flag, bool), "Only bool type supported." | |||
| for name in field_names: | |||
| if name in self.field_arrays: | |||
| try: | |||
| self.field_arrays[name]._use_1st_ins_infer_dim_type = bool(use_1st_ins_infer_dim_type) | |||
| self.field_arrays[name].is_target = flag | |||
| except SetInputOrTargetException as e: | |||
| print(f"Cannot set field:{name} as target.") | |||
| @@ -589,7 +592,7 @@ class DataSet(object): | |||
| else: | |||
| raise KeyError("{} is not a valid field name.".format(name)) | |||
| def set_input(self, *field_names, flag=True): | |||
| def set_input(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True): | |||
| """ | |||
| 将field_names的field设置为input:: | |||
| @@ -598,10 +601,13 @@ class DataSet(object): | |||
| :param str field_names: field的名称 | |||
| :param bool flag: 将field_name的input状态设置为flag | |||
| :param bool use_1st_ins_infer_dim_type: 如果为True,将不会check该列是否所有数据都是同样的维度,同样的类型。将直接使用第一 | |||
| 行的数据进行类型和维度推断本列的数据的类型和维度。 | |||
| """ | |||
| for name in field_names: | |||
| if name in self.field_arrays: | |||
| try: | |||
| self.field_arrays[name]._use_1st_ins_infer_dim_type = bool(use_1st_ins_infer_dim_type) | |||
| self.field_arrays[name].is_input = flag | |||
| except SetInputOrTargetException as e: | |||
| print(f"Cannot set field:{name} as input, exception happens at the {e.index} value.") | |||
| @@ -225,7 +225,7 @@ class CrossEntropyLoss(LossBase): | |||
| def get_loss(self, pred, target, seq_len=None): | |||
| if pred.dim() > 2: | |||
| if pred.size(1) != target.size(1): | |||
| if pred.size(1) != target.size(1): # 有可能顺序替换了 | |||
| pred = pred.transpose(1, 2) | |||
| pred = pred.reshape(-1, pred.size(-1)) | |||
| target = target.reshape(-1) | |||
| @@ -49,7 +49,7 @@ class NullOptimizer(Optimizer): | |||
| super().__init__(None) | |||
| def construct_from_pytorch(self, model_params): | |||
| pass | |||
| return self | |||
| def __getattr__(self, item): | |||
| def pass_func(*args, **kwargs): | |||
| @@ -25,9 +25,9 @@ class Sampler(object): | |||
| def __call__(self, data_set): | |||
| """ | |||
| :param DataSet data_set: `DataSet` 对象, 需要Sample的数据 | |||
| :return result: list(int) 其中元素的下标序列, ``data_set`` 中元素会按 ``result`` 中顺序取出 | |||
| """ | |||
| :param DataSet data_set: `DataSet` 对象, 需要Sample的数据 | |||
| :return result: list(int) 其中元素的下标序列, ``data_set`` 中元素会按 ``result`` 中顺序取出 | |||
| """ | |||
| raise NotImplementedError | |||
| @@ -47,6 +47,7 @@ from .utils import _get_func_signature | |||
| from .utils import _get_model_device | |||
| from .utils import _move_model_to_device | |||
| from ._parallel_utils import _data_parallel_wrapper | |||
| from .utils import _model_contains_inner_module | |||
| from functools import partial | |||
| __all__ = [ | |||
| @@ -83,9 +84,7 @@ class Tester(object): | |||
| def __init__(self, data, model, metrics, batch_size=16, num_workers=0, device=None, verbose=1): | |||
| super(Tester, self).__init__() | |||
| if not isinstance(data, DataSet): | |||
| raise TypeError(f"The type of data must be `fastNLP.DataSet`, got `{type(data)}`.") | |||
| if not isinstance(model, nn.Module): | |||
| raise TypeError(f"The type of model must be `torch.nn.Module`, got `{type(model)}`.") | |||
| @@ -106,19 +105,22 @@ class Tester(object): | |||
| # check predict | |||
| if (hasattr(self._model, 'predict') and callable(self._model.predict)) or \ | |||
| (isinstance(self._model, nn.DataParallel) and hasattr(self._model.module, 'predict') and | |||
| callable(self._model.module.predict)): | |||
| (_model_contains_inner_module(self._model) and hasattr(self._model.module, 'predict') and | |||
| callable(self._model.module.predict)): | |||
| if isinstance(self._model, nn.DataParallel): | |||
| self._predict_func_wrapper = partial(_data_parallel_wrapper('predict', | |||
| self._model.device_ids, | |||
| self._model.output_device), | |||
| network=self._model.module) | |||
| self._predict_func = self._model.module.predict # 用于匹配参数 | |||
| elif isinstance(self._model, nn.parallel.DistributedDataParallel): | |||
| self._predict_func = self._model.module.predict | |||
| self._predict_func_wrapper = self._model.module.predict # 用于调用 | |||
| else: | |||
| self._predict_func = self._model.predict | |||
| self._predict_func_wrapper = self._model.predict | |||
| else: | |||
| if isinstance(self._model, nn.DataParallel): | |||
| if _model_contains_inner_module(model): | |||
| self._predict_func_wrapper = self._model.forward | |||
| self._predict_func = self._model.module.forward | |||
| else: | |||
| @@ -352,7 +352,7 @@ from .utils import _move_dict_value_to_device | |||
| from .utils import _get_func_signature | |||
| from .utils import _get_model_device | |||
| from .utils import _move_model_to_device | |||
| from .utils import _model_contains_inner_module | |||
| class Trainer(object): | |||
| """ | |||
| @@ -389,8 +389,8 @@ class Trainer(object): | |||
| 要指定以哪个指标为准。另外有些指标是越小效果越好,比如语言模型的困惑度,这种情况下,在key前面增加一个'-'来表 | |||
| 明验证时,值越小越好(比如: "-ppl")。仅在传入dev_data时有效。 | |||
| :param int validate_every: 多少个step在验证集上验证一次; 如果为-1,则每个epoch结束验证一次。仅在传入dev_data时有效。 | |||
| :param str,None save_path: 将模型保存路径。如果为None,则不保存模型。如果dev_data为None,则保存最后一次迭代的模型。 | |||
| 保存的时候不仅保存了参数,还保存了模型结构。即便使用DataParallel,这里也只保存模型。 | |||
| :param str,None save_path: 将模型保存路径,如果路径不存在,将自动创建文件夹。如果为None,则不保存模型。如果dev_data为None,则保存 | |||
| 最后一次迭代的模型。保存的时候不仅保存了参数,还保存了模型结构。即便使用DataParallel,这里也只保存模型。 | |||
| :param bool use_tqdm: 是否使用tqdm来显示训练进度; 如果为False,则将loss打印在终端中。 | |||
| :param str,int,torch.device,list(int) device: 将模型load到哪个设备。默认为None,即Trainer不对模型 | |||
| 的计算位置进行管理。支持以下的输入: | |||
| @@ -440,7 +440,7 @@ class Trainer(object): | |||
| # check update every | |||
| assert update_every >= 1, "update_every must be no less than 1." | |||
| self.update_every = int(update_every) | |||
| # check save_path | |||
| if not (save_path is None or isinstance(save_path, str)): | |||
| raise ValueError("save_path can only be None or `str`.") | |||
| @@ -458,30 +458,69 @@ class Trainer(object): | |||
| self.metric_key = None | |||
| # prepare loss | |||
| losser = _prepare_losser(loss) | |||
| # sampler check | |||
| if sampler is not None and not isinstance(sampler, Sampler): | |||
| raise ValueError("The type of sampler should be fastNLP.BaseSampler, got {}.".format(type(sampler))) | |||
| if sampler is None: | |||
| sampler = RandomSampler() | |||
| elif hasattr(sampler, 'set_batch_size'): | |||
| sampler.set_batch_size(batch_size) | |||
| if isinstance(train_data, BatchIter): | |||
| if sampler is not None: | |||
| warnings.warn("sampler is ignored when train_data is a BatchIter.") | |||
| if num_workers>0: | |||
| warnings.warn("num_workers is ignored when train_data is BatchIter.") | |||
| if drop_last: | |||
| warnings.warn("drop_last is ignored when train_data is BatchIter.") | |||
| if isinstance(model, nn.parallel.DistributedDataParallel): # 如果是分布式的 | |||
| # device为None | |||
| if device is not None: | |||
| warnings.warn("device is ignored when model is nn.parallel.DistributedDataParallel.") | |||
| device = None | |||
| # Sampler要是分布式的 | |||
| if sampler is None: | |||
| sampler = torch.utils.data.DistributedSampler(train_data) | |||
| elif not isinstance(sampler, torch.utils.data.DistributedSampler): | |||
| raise TypeError("When using nn.parallel.DistributedDataParallel, " | |||
| "sampler must be None or torch.utils.data.DistributedSampler.") | |||
| # 不能保存模型 | |||
| if save_path: | |||
| raise RuntimeError("Saving model in Distributed situation is not allowed right now.") | |||
| else: | |||
| # sampler check | |||
| if sampler is not None and not isinstance(sampler, (Sampler, torch.utils.data.Sampler)): | |||
| raise ValueError(f"The type of sampler should be fastNLP.BaseSampler or pytorch's Sampler, got {type(sampler)}") | |||
| if sampler is None: | |||
| sampler = RandomSampler() | |||
| elif hasattr(sampler, 'set_batch_size'): | |||
| sampler.set_batch_size(batch_size) | |||
| if isinstance(train_data, DataSet): | |||
| self.data_iterator = DataSetIter( | |||
| dataset=train_data, batch_size=batch_size, num_workers=num_workers, sampler=sampler, drop_last=drop_last) | |||
| elif isinstance(train_data, BatchIter): | |||
| self.data_iterator = train_data | |||
| train_data = train_data.dataset | |||
| else: | |||
| raise TypeError("train_data type {} not support".format(type(train_data))) | |||
| if check_code_level > -1 and isinstance(self.data_iterator, DataSetIter): | |||
| _check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data, | |||
| metric_key=self.metric_key, check_level=check_code_level, | |||
| batch_size=min(batch_size, DEFAULT_CHECK_BATCH_SIZE)) | |||
| # _check_code 是 fastNLP 帮助你检查代码是否正确的方法 。如果你在错误栈中看到这行注释,请认真检查你的代码 | |||
| self.model = _move_model_to_device(model, device=device) | |||
| if _model_contains_inner_module(self.model): | |||
| self._forward_func = self.model.module.forward | |||
| else: | |||
| self._forward_func = self.model.forward | |||
| if check_code_level > -1: | |||
| # _check_code 是 fastNLP 帮助你检查代码是否正确的方法 。如果你在错误栈中看到这行注释,请认真检查你的field名与模型的输入 | |||
| # 名是否匹配 | |||
| dev_dataset = dev_data | |||
| if isinstance(dev_data, BatchIter): | |||
| dev_dataset = None | |||
| warnings.warn("dev_data is of BatchIter type, ignore validation checking.") | |||
| check_batch_size = min(batch_size, DEFAULT_CHECK_BATCH_SIZE) | |||
| if isinstance(self.model, nn.DataParallel): | |||
| _num_devices = len(self.model.device_ids) | |||
| if batch_size//_num_devices>1: # 如果多卡是每个卡可以分多个数据的,则用每个卡给两个sample | |||
| check_batch_size = max(len(self.model.device_ids)*2, check_batch_size) | |||
| else: | |||
| check_batch_size = max(len(self.model.device_ids), check_batch_size) | |||
| _check_code(dataset=train_data, model=self.model, losser=losser, forward_func=self._forward_func, metrics=metrics, | |||
| dev_data=dev_dataset, metric_key=self.metric_key, check_level=check_code_level, | |||
| batch_size=check_batch_size) | |||
| self.train_data = train_data | |||
| self.dev_data = dev_data # If None, No validation. | |||
| @@ -496,8 +535,7 @@ class Trainer(object): | |||
| self.best_dev_epoch = None | |||
| self.best_dev_step = None | |||
| self.best_dev_perf = None | |||
| self.n_steps = (len(self.train_data) // self.batch_size + int( | |||
| len(self.train_data) % self.batch_size != 0)) * int(drop_last==0) * self.n_epochs | |||
| self.n_steps = len(self.data_iterator) * self.n_epochs | |||
| if isinstance(optimizer, torch.optim.Optimizer): | |||
| self.optimizer = optimizer | |||
| @@ -600,10 +638,6 @@ class Trainer(object): | |||
| self.step = 0 | |||
| self.epoch = 0 | |||
| start = time.time() | |||
| if isinstance(self.model, nn.DataParallel): | |||
| self._forward_func = self.model.module.forward | |||
| else: | |||
| self._forward_func = self.model.forward | |||
| with inner_tqdm(total=self.n_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar: | |||
| self.pbar = pbar | |||
| avg_loss = 0 | |||
| @@ -745,7 +779,7 @@ class Trainer(object): | |||
| model_path = os.path.join(self.save_path, model_name) | |||
| if not os.path.exists(self.save_path): | |||
| os.makedirs(self.save_path, exist_ok=True) | |||
| if isinstance(model, nn.DataParallel): | |||
| if _model_contains_inner_module(model): | |||
| model = model.module | |||
| if only_param: | |||
| state_dict = model.state_dict() | |||
| @@ -765,7 +799,7 @@ class Trainer(object): | |||
| states = torch.load(model_path) | |||
| else: | |||
| states = torch.load(model_path).state_dict() | |||
| if isinstance(model, nn.DataParallel): | |||
| if _model_contains_inner_module(model): | |||
| model.module.load_state_dict(states) | |||
| else: | |||
| model.load_state_dict(states) | |||
| @@ -823,12 +857,10 @@ def _get_value_info(_dict): | |||
| from numbers import Number | |||
| from .batch import _to_tensor | |||
| def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_SIZE, | |||
| dev_data=None, metric_key=None, | |||
| check_level=0): | |||
| def _check_code(dataset, model, losser, metrics, forward_func, batch_size=DEFAULT_CHECK_BATCH_SIZE, | |||
| dev_data=None, metric_key=None, check_level=0): | |||
| # check get_loss 方法 | |||
| model_devcie = _get_model_device(model=model) | |||
| model_device = _get_model_device(model=model) | |||
| def _iter(): | |||
| start_idx = 0 | |||
| while start_idx<len(dataset): | |||
| @@ -849,7 +881,7 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ | |||
| start_idx += batch_size | |||
| for batch_count, (batch_x, batch_y) in enumerate(_iter()): | |||
| _move_dict_value_to_device(batch_x, batch_y, device=model_devcie) | |||
| _move_dict_value_to_device(batch_x, batch_y, device=model_device) | |||
| # forward check | |||
| if batch_count == 0: | |||
| info_str = "" | |||
| @@ -868,15 +900,11 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ | |||
| else: | |||
| info_str += 'There is no target field.' | |||
| print(info_str) | |||
| _check_forward_error(forward_func=model.forward, dataset=dataset, | |||
| _check_forward_error(forward_func=forward_func, dataset=dataset, | |||
| batch_x=batch_x, check_level=check_level) | |||
| if isinstance(model, nn.DataParallel): | |||
| forward_func = model.module.forward | |||
| else: | |||
| forward_func = model.forward | |||
| refined_batch_x = _build_args(forward_func, **batch_x) | |||
| pred_dict = model(**refined_batch_x) | |||
| func_signature = _get_func_signature(model.forward) | |||
| func_signature = _get_func_signature(forward_func) | |||
| if not isinstance(pred_dict, dict): | |||
| raise TypeError(f"The return value of {func_signature} should be `dict`, not `{type(pred_dict)}`.") | |||
| @@ -896,7 +924,7 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ | |||
| loss.backward() | |||
| except _CheckError as e: | |||
| # TODO: another error raised if _CheckError caught | |||
| pre_func_signature = _get_func_signature(model.forward) | |||
| pre_func_signature = _get_func_signature(forward_func) | |||
| _check_loss_evaluate(prev_func_signature=pre_func_signature, func_signature=e.func_signature, | |||
| check_res=e.check_res, pred_dict=pred_dict, target_dict=batch_y, | |||
| dataset=dataset, check_level=check_level) | |||
| @@ -62,7 +62,6 @@ def _prepare_cache_filepath(filepath): | |||
| os.makedirs(cache_dir) | |||
| # TODO 可以保存下缓存时的参数,如果load的时候发现参数不一致,发出警告。 | |||
| def cache_results(_cache_fp, _refresh=False, _verbose=1): | |||
| """ | |||
| 别名::class:`fastNLP.cache_results` :class:`fastNLP.core.utils.cache_results` | |||
| @@ -188,49 +187,17 @@ def _save_model(model, model_name, save_dir, only_param=False): | |||
| torch.save(model, model_path) | |||
| model.to(_model_device) | |||
| def _model_contains_inner_module(model): | |||
| """ | |||
| # def save_pickle(obj, pickle_path, file_name): | |||
| # """Save an object into a pickle file. | |||
| # | |||
| # :param obj: an object | |||
| # :param pickle_path: str, the directory where the pickle file is to be saved | |||
| # :param file_name: str, the name of the pickle file. In general, it should be ended by "pkl". | |||
| # """ | |||
| # if not os.path.exists(pickle_path): | |||
| # os.mkdir(pickle_path) | |||
| # print("make dir {} before saving pickle file".format(pickle_path)) | |||
| # with open(os.path.join(pickle_path, file_name), "wb") as f: | |||
| # _pickle.dump(obj, f) | |||
| # print("{} saved in {}".format(file_name, pickle_path)) | |||
| # | |||
| # | |||
| # def load_pickle(pickle_path, file_name): | |||
| # """Load an object from a given pickle file. | |||
| # | |||
| # :param pickle_path: str, the directory where the pickle file is. | |||
| # :param file_name: str, the name of the pickle file. | |||
| # :return obj: an object stored in the pickle | |||
| # """ | |||
| # with open(os.path.join(pickle_path, file_name), "rb") as f: | |||
| # obj = _pickle.load(f) | |||
| # print("{} loaded from {}".format(file_name, pickle_path)) | |||
| # return obj | |||
| # | |||
| # | |||
| # def pickle_exist(pickle_path, pickle_name): | |||
| # """Check if a given pickle file exists in the directory. | |||
| # | |||
| # :param pickle_path: the directory of target pickle file | |||
| # :param pickle_name: the filename of target pickle file | |||
| # :return: True if file exists else False | |||
| # """ | |||
| # if not os.path.exists(pickle_path): | |||
| # os.makedirs(pickle_path) | |||
| # file_name = os.path.join(pickle_path, pickle_name) | |||
| # if os.path.exists(file_name): | |||
| # return True | |||
| # else: | |||
| # return False | |||
| :param nn.Module model: 模型文件,判断是否内部包含model.module, 多用于check模型是否是nn.DataParallel, | |||
| nn.parallel.DistributedDataParallel。主要是在做形参匹配的时候需要使用最内部的model的function。 | |||
| :return: bool | |||
| """ | |||
| if isinstance(model, nn.Module): | |||
| if isinstance(model, (nn.DataParallel, nn.parallel.DistributedDataParallel)): | |||
| return True | |||
| return False | |||
| def _move_model_to_device(model, device): | |||
| """ | |||
| @@ -254,8 +221,8 @@ def _move_model_to_device(model, device): | |||
| :return: torch.nn.DataParallel or torch.nn.Module | |||
| """ | |||
| if isinstance(model, torch.nn.parallel.DistributedDataParallel): | |||
| raise RuntimeError("model of `torch.nn.parallel.DistributedDataParallel` is not supported right now.") | |||
| # if isinstance(model, torch.nn.parallel.DistributedDataParallel): | |||
| # raise RuntimeError("model of `torch.nn.parallel.DistributedDataParallel` is not supported right now.") | |||
| if device is None: | |||
| if isinstance(model, torch.nn.DataParallel): | |||
| @@ -352,7 +319,6 @@ def _map_args(maps: dict, **kwargs): | |||
| output.update({name: val}) | |||
| for keys in maps.keys(): | |||
| if keys not in output.keys(): | |||
| # TODO: add UNUSED warning. | |||
| pass | |||
| return output | |||
| @@ -570,18 +536,6 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re | |||
| else: | |||
| _tmp = f'Provide `{_miss}` in DataSet or output of {prev_func_signature}.' | |||
| suggestions.append(_tmp) | |||
| # for _miss in unmapped_missing: | |||
| # if _miss in dataset: | |||
| # suggestions.append(f"Set `{_miss}` as target.") | |||
| # else: | |||
| # _tmp = '' | |||
| # if check_res.unused: | |||
| # _tmp = f"Specify your assignment for `{input_func_map.get(_miss, _miss)}` when initialize {module_name}." | |||
| # if _tmp: | |||
| # _tmp += f' Or provide `{_miss}` in DataSet or output of {prev_func_signature}.' | |||
| # else: | |||
| # _tmp = f'Provide `{_miss}` in output of {prev_func_signature} or DataSet.' | |||
| # suggestions.append(_tmp) | |||
| if check_res.duplicated: | |||
| errs.append(f"\tduplicated param: {check_res.duplicated}.") | |||
| @@ -37,8 +37,8 @@ class BertEmbedding(ContextualEmbedding): | |||
| :param ~fastNLP.Vocabulary vocab: 词表 | |||
| :param str model_dir_or_name: 模型所在目录或者模型的名称。当传入模型所在目录时,目录中应该包含一个词表文件(以.txt作为后缀名), | |||
| 权重文件(以.bin作为文件后缀名), 配置文件(以.json作为后缀名)。 | |||
| :param str layers: 输出embedding表示来自于哪些层,不同层的结果按照layers中的顺序在最后一维concat起来。以','隔开层数,可以以负数 | |||
| 去索引倒数几层。 | |||
| :param str layers: 输出embedding表示来自于哪些层,不同层的结果按照layers中的顺序在最后一维concat起来。以','隔开层数,层的序号是 | |||
| 从0开始,可以以负数去索引倒数几层。 | |||
| :param str pool_method: 因为在bert中,每个word会被表示为多个word pieces, 当获取一个word的表示的时候,怎样从它的word pieces | |||
| 中计算得到它对应的表示。支持 ``last`` , ``first`` , ``avg`` , ``max``。 | |||
| :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 | |||
| @@ -334,7 +334,7 @@ class _WordBertModel(nn.Module): | |||
| start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1] | |||
| outputs[l_index, i, j+s_shift] = torch.mean(truncate_output_layer[i, start:end], dim=-2) | |||
| if self.include_cls_sep: | |||
| if l==len(bert_outputs) and self.pooled_cls: | |||
| if l in (len(bert_outputs)-1, -1) and self.pooled_cls: | |||
| outputs[l_index, :, 0] = pooled_cls | |||
| else: | |||
| outputs[l_index, :, 0] = output_layer[:, 0] | |||
| @@ -37,7 +37,7 @@ class ElmoEmbedding(ContextualEmbedding): | |||
| :param model_dir_or_name: 可以有两种方式调用预训练好的ELMo embedding:第一种是传入ELMo所在文件夹,该文件夹下面应该有两个文件, | |||
| 其中一个是以json为后缀的配置文件,另一个是以pkl为后缀的权重文件;第二种是传入ELMo版本的名称,将自动查看缓存中是否存在该模型, | |||
| 没有的话将自动下载并缓存。 | |||
| :param layers: str, 指定返回的层数, 以,隔开不同的层。如果要返回第二层的结果'2', 返回后两层的结果'1,2'。不同的层的结果 | |||
| :param layers: str, 指定返回的层数(从0开始), 以,隔开不同的层。如果要返回第二层的结果'2', 返回后两层的结果'1,2'。不同的层的结果 | |||
| 按照这个顺序concat起来,默认为'2'。'mix'会使用可学习的权重结合不同层的表示(权重是否可训练与requires_grad保持一致, | |||
| 初始化权重对三层结果进行mean-pooling, 可以通过ElmoEmbedding.set_mix_weights_requires_grad()方法只将mix weights设置为可学习。) | |||
| :param requires_grad: bool, 该层是否需要gradient, 默认为False. | |||
| @@ -43,7 +43,7 @@ class StaticEmbedding(TokenEmbedding): | |||
| 如果输入为None则使用embedding_dim的维度随机初始化一个embedding。 | |||
| :param int embedding_dim: 随机初始化的embedding的维度,仅在model_dir_or_name为None时有效。 | |||
| :param bool requires_grad: 是否需要gradient. 默认为True | |||
| :param callable init_method: 如何初始化没有找到的值。可以使用torch.nn.init.*中各种方法。调用该方法时传入一个tensor对象。 | |||
| :param callable init_method: 如何初始化没有找到的值。可以使用torch.nn.init.*中各种方法。调用该方法时传入一个tensor对象。 | |||
| :param bool lower: 是否将vocab中的词语小写后再和预训练的词表进行匹配。如果你的词表中包含大写的词语,或者就是需要单独 | |||
| 为大写的词语开辟一个vector表示,则将lower设置为False。 | |||
| :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 | |||
| @@ -84,7 +84,7 @@ class StaticEmbedding(TokenEmbedding): | |||
| if lowered_word not in lowered_vocab.word_count: | |||
| lowered_vocab.add_word(lowered_word) | |||
| lowered_vocab._no_create_word[lowered_word] += 1 | |||
| print(f"All word in vocab have been lowered. There are {len(vocab)} words, {len(lowered_vocab)} unique lowered " | |||
| print(f"All word in the vocab have been lowered. There are {len(vocab)} words, {len(lowered_vocab)} unique lowered " | |||
| f"words.") | |||
| if model_path: | |||
| embedding = self._load_with_vocab(model_path, vocab=lowered_vocab, init_method=init_method) | |||
| @@ -563,6 +563,8 @@ class WordpieceTokenizer(object): | |||
| output_tokens.append(self.unk_token) | |||
| else: | |||
| output_tokens.extend(sub_tokens) | |||
| if len(output_tokens)==0: | |||
| return [self.unk_token] | |||
| return output_tokens | |||
| @@ -3,7 +3,8 @@ from functools import reduce | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.init as init | |||
| import glob | |||
| import os | |||
| def initial_parameter(net, initial_method=None): | |||
| """A method used to initialize the weights of PyTorch models. | |||
| @@ -119,7 +120,6 @@ def get_dropout_mask(drop_p: float, tensor: torch.Tensor): | |||
| training=False, inplace=True) | |||
| return mask_x | |||
| import glob | |||
| def _get_file_name_base_on_postfix(dir_path, postfix): | |||
| """ | |||
| @@ -1,4 +1,5 @@ | |||
| import os | |||
| import sys | |||
| import unittest | |||
| from fastNLP import DataSet | |||
| @@ -79,6 +80,16 @@ class TestDataSetMethods(unittest.TestCase): | |||
| self.assertFalse("x" in dd.field_arrays) | |||
| self.assertTrue("y" in dd.field_arrays) | |||
| def test_delete_instance(self): | |||
| dd = DataSet() | |||
| old_length = 2 | |||
| dd.add_field("x", [[1, 2, 3]] * old_length) | |||
| dd.add_field("y", [[1, 2, 3, 4]] * old_length) | |||
| dd.delete_instance(0) | |||
| self.assertEqual(len(dd), old_length-1) | |||
| dd.delete_instance(0) | |||
| self.assertEqual(len(dd), old_length-2) | |||
| def test_getitem(self): | |||
| ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40}) | |||
| ins_1, ins_0 = ds[0], ds[1] | |||
| @@ -170,22 +170,22 @@ class TestFieldArray(unittest.TestCase): | |||
| def test_append(self): | |||
| with self.assertRaises(Exception): | |||
| fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True) | |||
| fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True, use_1st_ins_infer_dim_type=False) | |||
| fa.append(0) | |||
| with self.assertRaises(Exception): | |||
| fa = FieldArray("y", [1.1, 2.2, 3.3, 4.4, 5.5], is_input=True) | |||
| fa = FieldArray("y", [1.1, 2.2, 3.3, 4.4, 5.5], is_input=True, use_1st_ins_infer_dim_type=False) | |||
| fa.append([1, 2, 3, 4, 5]) | |||
| with self.assertRaises(Exception): | |||
| fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True) | |||
| fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True, use_1st_ins_infer_dim_type=False) | |||
| fa.append([]) | |||
| with self.assertRaises(Exception): | |||
| fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True) | |||
| fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True, use_1st_ins_infer_dim_type=False) | |||
| fa.append(["str", 0, 0, 0, 1.89]) | |||
| fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1.0, 2.0, 3.0, 4.0, 5.0]], is_input=True) | |||
| fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1.0, 2.0, 3.0, 4.0, 5.0]], is_input=True, use_1st_ins_infer_dim_type=False) | |||
| fa.append([1.2, 2.3, 3.4, 4.5, 5.6]) | |||
| self.assertEqual(len(fa), 3) | |||
| self.assertEqual(fa[2], [1.2, 2.3, 3.4, 4.5, 5.6]) | |||