From 0d3b7b0df210418326295c4cbe1c07152e540af0 Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Mon, 31 Oct 2022 20:52:27 +0800 Subject: [PATCH 01/18] [to #42322933]fix bugs relate to token cls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1.修复token classification preprocessor finetune结果错误问题 2.修复word segmentation output 无用属性 3. 修复nlp preprocessor传use_fast错误 4. 修复torch model exporter bug 5. 修复文档撰写过程中发现trainer相关bug Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10573269 --- modelscope/exporters/torch_model_exporter.py | 5 +- modelscope/outputs/outputs.py | 11 +- .../nlp/token_classification_pipeline.py | 4 +- .../nlp/word_segmentation_pipeline.py | 6 +- modelscope/preprocessors/nlp/nlp_base.py | 17 +- .../nlp/token_classification_preprocessor.py | 148 ++++++++++-------- .../trainers/nlp/text_generation_trainer.py | 2 +- modelscope/trainers/nlp_trainer.py | 6 +- modelscope/trainers/trainer.py | 2 +- tests/outputs/test_model_outputs.py | 3 +- .../test_finetune_token_classificatin.py | 2 +- 11 files changed, 110 insertions(+), 96 deletions(-) diff --git a/modelscope/exporters/torch_model_exporter.py b/modelscope/exporters/torch_model_exporter.py index 7bf6c0c0..1d332591 100644 --- a/modelscope/exporters/torch_model_exporter.py +++ b/modelscope/exporters/torch_model_exporter.py @@ -128,7 +128,7 @@ class TorchModelExporter(Exporter): args_list = list(args) else: args_list = [args] - if isinstance(args_list[-1], dict): + if isinstance(args_list[-1], Mapping): args_dict = args_list[-1] args_list = args_list[:-1] n_nonkeyword = len(args_list) @@ -284,9 +284,8 @@ class TorchModelExporter(Exporter): 'Model property dummy_inputs must be set.') dummy_inputs = collate_fn(dummy_inputs, device) if isinstance(dummy_inputs, Mapping): - dummy_inputs = self._decide_input_format(model, dummy_inputs) dummy_inputs_filter = [] - for _input in dummy_inputs: + for _input in self._decide_input_format(model, dummy_inputs): if _input is not None: dummy_inputs_filter.append(_input) else: diff --git a/modelscope/outputs/outputs.py b/modelscope/outputs/outputs.py index b7003809..2c6dd85a 100644 --- a/modelscope/outputs/outputs.py +++ b/modelscope/outputs/outputs.py @@ -491,17 +491,8 @@ TASK_OUTPUTS = { # word segmentation result for single sample # { # "output": "今天 天气 不错 , 适合 出去 游玩" - # "labels": [ - # {'word': '今天', 'label': 'PROPN'}, - # {'word': '天气', 'label': 'PROPN'}, - # {'word': '不错', 'label': 'VERB'}, - # {'word': ',', 'label': 'NUM'}, - # {'word': '适合', 'label': 'NOUN'}, - # {'word': '出去', 'label': 'PART'}, - # {'word': '游玩', 'label': 'ADV'}, - # ] # } - Tasks.word_segmentation: [OutputKeys.OUTPUT, OutputKeys.LABELS], + Tasks.word_segmentation: [OutputKeys.OUTPUT], # TODO @wenmeng.zwm support list of result check # named entity recognition result for single sample diff --git a/modelscope/pipelines/nlp/token_classification_pipeline.py b/modelscope/pipelines/nlp/token_classification_pipeline.py index 75bc538d..4af187ee 100644 --- a/modelscope/pipelines/nlp/token_classification_pipeline.py +++ b/modelscope/pipelines/nlp/token_classification_pipeline.py @@ -109,13 +109,13 @@ class TokenClassificationPipeline(Pipeline): chunk['span'] = text[chunk['start']:chunk['end']] chunks.append(chunk) - # for cws output + # for cws outputs if len(chunks) > 0 and chunks[0]['type'] == 'cws': spans = [ chunk['span'] for chunk in chunks if chunk['span'].strip() ] seg_result = ' '.join(spans) - outputs = {OutputKeys.OUTPUT: seg_result, OutputKeys.LABELS: []} + outputs = {OutputKeys.OUTPUT: seg_result} # for ner outputs else: diff --git a/modelscope/pipelines/nlp/word_segmentation_pipeline.py b/modelscope/pipelines/nlp/word_segmentation_pipeline.py index 0df8f1ad..c57f6b93 100644 --- a/modelscope/pipelines/nlp/word_segmentation_pipeline.py +++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py @@ -115,15 +115,15 @@ class WordSegmentationPipeline(Pipeline): chunk['span'] = text[chunk['start']:chunk['end']] chunks.append(chunk) - # for cws output + # for cws outputs if len(chunks) > 0 and chunks[0]['type'] == 'cws': spans = [ chunk['span'] for chunk in chunks if chunk['span'].strip() ] seg_result = ' '.join(spans) - outputs = {OutputKeys.OUTPUT: seg_result, OutputKeys.LABELS: []} + outputs = {OutputKeys.OUTPUT: seg_result} - # for ner outpus + # for ner output else: outputs = {OutputKeys.OUTPUT: chunks} return outputs diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index 48a04d7a..45efc6e7 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -34,6 +34,7 @@ class NLPBasePreprocessor(Preprocessor, ABC): label=None, label2id=None, mode=ModeKeys.INFERENCE, + use_fast=None, **kwargs): """The NLP preprocessor base class. @@ -45,14 +46,18 @@ class NLPBasePreprocessor(Preprocessor, ABC): label2id: An optional label2id mapping, the class will try to call utils.parse_label_mapping if this mapping is not supplied. mode: Run this preprocessor in either 'train'/'eval'/'inference' mode + use_fast: use the fast version of tokenizer + """ self.model_dir = model_dir self.first_sequence = first_sequence self.second_sequence = second_sequence self.label = label - self.use_fast = kwargs.pop('use_fast', None) - if self.use_fast is None and os.path.isfile( + self.use_fast = use_fast + if self.use_fast is None and model_dir is None: + self.use_fast = False + elif self.use_fast is None and os.path.isfile( os.path.join(model_dir, 'tokenizer_config.json')): with open(os.path.join(model_dir, 'tokenizer_config.json'), 'r') as f: @@ -61,8 +66,8 @@ class NLPBasePreprocessor(Preprocessor, ABC): self.use_fast = False if self.use_fast is None else self.use_fast self.label2id = label2id - if self.label2id is None: - self.label2id = parse_label_mapping(self.model_dir) + if self.label2id is None and model_dir is not None: + self.label2id = parse_label_mapping(model_dir) super().__init__(mode, **kwargs) @property @@ -106,6 +111,7 @@ class NLPTokenizerPreprocessorBase(NLPBasePreprocessor): label: str = 'label', label2id: dict = None, mode: str = ModeKeys.INFERENCE, + use_fast: bool = None, **kwargs): """The NLP tokenizer preprocessor base class. @@ -122,11 +128,12 @@ class NLPTokenizerPreprocessorBase(NLPBasePreprocessor): - config.json label2id/id2label - label_mapping.json mode: Run this preprocessor in either 'train'/'eval'/'inference' mode, the behavior may be different. + use_fast: use the fast version of tokenizer kwargs: These kwargs will be directly fed into the tokenizer. """ super().__init__(model_dir, first_sequence, second_sequence, label, - label2id, mode) + label2id, mode, use_fast, **kwargs) self.model_dir = model_dir self.tokenize_kwargs = kwargs self.tokenizer = self.build_tokenizer(model_dir) diff --git a/modelscope/preprocessors/nlp/token_classification_preprocessor.py b/modelscope/preprocessors/nlp/token_classification_preprocessor.py index 2de0c806..5069048b 100644 --- a/modelscope/preprocessors/nlp/token_classification_preprocessor.py +++ b/modelscope/preprocessors/nlp/token_classification_preprocessor.py @@ -2,6 +2,7 @@ from typing import Any, Dict, Tuple, Union +import numpy as np import torch from modelscope.metainfo import Preprocessors @@ -20,9 +21,7 @@ class WordSegmentationBlankSetToLabelPreprocessor(NLPBasePreprocessor): """ def __init__(self, **kwargs): - super().__init__(**kwargs) - self.first_sequence: str = kwargs.pop('first_sequence', - 'first_sequence') + self.first_sequence: str = kwargs.pop('first_sequence', 'tokens') self.label = kwargs.pop('label', OutputKeys.LABELS) def __call__(self, data: str) -> Union[Dict[str, Any], Tuple]: @@ -80,10 +79,9 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): 'is_split_into_words', False) if 'label2id' in kwargs: kwargs.pop('label2id') - self.tokenize_kwargs = kwargs - @type_assert(object, str) - def __call__(self, data: str) -> Dict[str, Any]: + @type_assert(object, (str, dict)) + def __call__(self, data: Union[dict, str]) -> Dict[str, Any]: """process the raw input data Args: @@ -99,18 +97,24 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): text = None labels_list = None if isinstance(data, str): + # for inference inputs without label text = data + self.tokenize_kwargs['add_special_tokens'] = False elif isinstance(data, dict): + # for finetune inputs with label text = data.get(self.first_sequence) labels_list = data.get(self.label) + if isinstance(text, list): + self.tokenize_kwargs['is_split_into_words'] = True input_ids = [] label_mask = [] offset_mapping = [] - if self.is_split_into_words: - for offset, token in enumerate(list(data)): - subtoken_ids = self.tokenizer.encode( - token, add_special_tokens=False) + token_type_ids = [] + if self.is_split_into_words and self._mode == ModeKeys.INFERENCE: + for offset, token in enumerate(list(text)): + subtoken_ids = self.tokenizer.encode(token, + **self.tokenize_kwargs) if len(subtoken_ids) == 0: subtoken_ids = [self.tokenizer.unk_token_id] input_ids.extend(subtoken_ids) @@ -119,10 +123,9 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): else: if self.tokenizer.is_fast: encodings = self.tokenizer( - text, - add_special_tokens=False, - return_offsets_mapping=True, - **self.tokenize_kwargs) + text, return_offsets_mapping=True, **self.tokenize_kwargs) + attention_mask = encodings['attention_mask'] + token_type_ids = encodings['token_type_ids'] input_ids = encodings['input_ids'] word_ids = encodings.word_ids() for i in range(len(word_ids)): @@ -143,69 +146,80 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): label_mask, offset_mapping = self.get_label_mask_and_offset_mapping( text) - if len(input_ids) >= self.sequence_length - 2: - input_ids = input_ids[:self.sequence_length - 2] - label_mask = label_mask[:self.sequence_length - 2] - input_ids = [self.tokenizer.cls_token_id - ] + input_ids + [self.tokenizer.sep_token_id] - label_mask = [0] + label_mask + [0] - attention_mask = [1] * len(input_ids) - offset_mapping = offset_mapping[:sum(label_mask)] + if self._mode == ModeKeys.INFERENCE: + if len(input_ids) >= self.sequence_length - 2: + input_ids = input_ids[:self.sequence_length - 2] + label_mask = label_mask[:self.sequence_length - 2] + input_ids = [self.tokenizer.cls_token_id + ] + input_ids + [self.tokenizer.sep_token_id] + label_mask = [0] + label_mask + [0] + attention_mask = [1] * len(input_ids) + offset_mapping = offset_mapping[:sum(label_mask)] - if not self.is_transformer_based_model: - input_ids = input_ids[1:-1] - attention_mask = attention_mask[1:-1] - label_mask = label_mask[1:-1] + if not self.is_transformer_based_model: + input_ids = input_ids[1:-1] + attention_mask = attention_mask[1:-1] + label_mask = label_mask[1:-1] - if self._mode == ModeKeys.INFERENCE: input_ids = torch.tensor(input_ids).unsqueeze(0) attention_mask = torch.tensor(attention_mask).unsqueeze(0) label_mask = torch.tensor( label_mask, dtype=torch.bool).unsqueeze(0) - # the token classification - output = { - 'text': text, - 'input_ids': input_ids, - 'attention_mask': attention_mask, - 'label_mask': label_mask, - 'offset_mapping': offset_mapping - } - - # align the labels with tokenized text - if labels_list is not None: - assert self.label2id is not None - # Map that sends B-Xxx label to its I-Xxx counterpart - b_to_i_label = [] - label_enumerate_values = [ - k for k, v in sorted( - self.label2id.items(), key=lambda item: item[1]) - ] - for idx, label in enumerate(label_enumerate_values): - if label.startswith('B-') and label.replace( - 'B-', 'I-') in label_enumerate_values: - b_to_i_label.append( - label_enumerate_values.index( - label.replace('B-', 'I-'))) - else: - b_to_i_label.append(idx) + # the token classification + output = { + 'text': text, + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'label_mask': label_mask, + 'offset_mapping': offset_mapping + } + else: + output = { + 'input_ids': input_ids, + 'token_type_ids': token_type_ids, + 'attention_mask': attention_mask, + 'label_mask': label_mask, + } - label_row = [self.label2id[lb] for lb in labels_list] - previous_word_idx = None - label_ids = [] - for word_idx in word_ids: - if word_idx is None: - label_ids.append(-100) - elif word_idx != previous_word_idx: - label_ids.append(label_row[word_idx]) - else: - if self.label_all_tokens: - label_ids.append(b_to_i_label[label_row[word_idx]]) + # align the labels with tokenized text + if labels_list is not None: + assert self.label2id is not None + # Map that sends B-Xxx label to its I-Xxx counterpart + b_to_i_label = [] + label_enumerate_values = [ + k for k, v in sorted( + self.label2id.items(), key=lambda item: item[1]) + ] + for idx, label in enumerate(label_enumerate_values): + if label.startswith('B-') and label.replace( + 'B-', 'I-') in label_enumerate_values: + b_to_i_label.append( + label_enumerate_values.index( + label.replace('B-', 'I-'))) else: + b_to_i_label.append(idx) + + label_row = [self.label2id[lb] for lb in labels_list] + previous_word_idx = None + label_ids = [] + for word_idx in word_ids: + if word_idx is None: label_ids.append(-100) - previous_word_idx = word_idx - labels = label_ids - output['labels'] = labels + elif word_idx != previous_word_idx: + label_ids.append(label_row[word_idx]) + else: + if self.label_all_tokens: + label_ids.append(b_to_i_label[label_row[word_idx]]) + else: + label_ids.append(-100) + previous_word_idx = word_idx + labels = label_ids + output['labels'] = labels + output = { + k: np.array(v) if isinstance(v, list) else v + for k, v in output.items() + } return output def get_tokenizer_class(self): diff --git a/modelscope/trainers/nlp/text_generation_trainer.py b/modelscope/trainers/nlp/text_generation_trainer.py index 0e26f153..f02faf71 100644 --- a/modelscope/trainers/nlp/text_generation_trainer.py +++ b/modelscope/trainers/nlp/text_generation_trainer.py @@ -18,7 +18,7 @@ class TextGenerationTrainer(NlpEpochBasedTrainer): return tokenizer.decode(tokens.tolist(), skip_special_tokens=True) def evaluation_step(self, data): - model = self.model + model = self.model.module if self._dist else self.model model.eval() with torch.no_grad(): diff --git a/modelscope/trainers/nlp_trainer.py b/modelscope/trainers/nlp_trainer.py index a92a3706..5ff6f62f 100644 --- a/modelscope/trainers/nlp_trainer.py +++ b/modelscope/trainers/nlp_trainer.py @@ -586,14 +586,16 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): preprocessor_mode=ModeKeys.TRAIN, **model_args, **self.train_keys, - mode=ModeKeys.TRAIN) + mode=ModeKeys.TRAIN, + use_fast=True) eval_preprocessor = Preprocessor.from_pretrained( self.model_dir, cfg_dict=self.cfg, preprocessor_mode=ModeKeys.EVAL, **model_args, **self.eval_keys, - mode=ModeKeys.EVAL) + mode=ModeKeys.EVAL, + use_fast=True) return train_preprocessor, eval_preprocessor diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 7478d8e4..3556badf 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -876,7 +876,7 @@ class EpochBasedTrainer(BaseTrainer): Subclass and override to inject custom behavior. """ - model = self.model + model = self.model.module if self._dist else self.model model.eval() if is_parallel(model): diff --git a/tests/outputs/test_model_outputs.py b/tests/outputs/test_model_outputs.py index 31271869..311ce201 100644 --- a/tests/outputs/test_model_outputs.py +++ b/tests/outputs/test_model_outputs.py @@ -21,9 +21,10 @@ class TestModelOutput(unittest.TestCase): self.assertEqual(outputs['logits'], torch.Tensor([1])) self.assertEqual(outputs[0], torch.Tensor([1])) self.assertEqual(outputs.logits, torch.Tensor([1])) + outputs.loss = torch.Tensor([2]) logits, loss = outputs self.assertEqual(logits, torch.Tensor([1])) - self.assertTrue(loss is None) + self.assertTrue(loss is not None) if __name__ == '__main__': diff --git a/tests/trainers/test_finetune_token_classificatin.py b/tests/trainers/test_finetune_token_classificatin.py index 9bdab9b7..a92cee7b 100644 --- a/tests/trainers/test_finetune_token_classificatin.py +++ b/tests/trainers/test_finetune_token_classificatin.py @@ -87,7 +87,7 @@ class TestFinetuneTokenClassification(unittest.TestCase): cfg['dataset'] = { 'train': { 'labels': label_enumerate_values, - 'first_sequence': 'first_sequence', + 'first_sequence': 'tokens', 'label': 'labels', } } From 3464324f6b5d9d0ef975cd0b0e76870e95b5fa22 Mon Sep 17 00:00:00 2001 From: Yingda Chen Date: Mon, 31 Oct 2022 22:15:25 +0800 Subject: [PATCH 02/18] [to #42322933] limit datasets version for now --- requirements/framework.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements/framework.txt b/requirements/framework.txt index 2408cda6..17fbd8a3 100644 --- a/requirements/framework.txt +++ b/requirements/framework.txt @@ -1,6 +1,7 @@ addict attrs -datasets +# version beyond 2.6.0 introduces compatbility issue and is being resolved +datasets<=2.6.0 easydict einops filelock>=3.3.0 From 5302259a0a3fb7cafdce473aa78990e7dc84e676 Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Mon, 31 Oct 2022 22:46:17 +0800 Subject: [PATCH 03/18] [to #45854437]fix: add user name to user-agent Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10584797 --- modelscope/hub/api.py | 9 +++++++-- modelscope/hub/constants.py | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index dca6d099..7468e5e3 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -23,7 +23,8 @@ from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA, API_RESPONSE_FIELD_MESSAGE, API_RESPONSE_FIELD_USERNAME, DEFAULT_CREDENTIALS_PATH, - MODELSCOPE_ENVIRONMENT, ONE_YEAR_SECONDS, + MODELSCOPE_ENVIRONMENT, + MODELSCOPE_USERNAME, ONE_YEAR_SECONDS, Licenses, ModelVisibility) from modelscope.hub.errors import (InvalidParameter, NotExistError, NotLoginException, NoValidRevisionError, @@ -760,14 +761,18 @@ class ModelScopeConfig: env = 'custom' if MODELSCOPE_ENVIRONMENT in os.environ: env = os.environ[MODELSCOPE_ENVIRONMENT] + user_name = 'unknown' + if MODELSCOPE_USERNAME in os.environ: + user_name = os.environ[MODELSCOPE_USERNAME] - ua = 'modelscope/%s; python/%s; session_id/%s; platform/%s; processor/%s; env/%s' % ( + ua = 'modelscope/%s; python/%s; session_id/%s; platform/%s; processor/%s; env/%s; user/%s' % ( __version__, platform.python_version(), ModelScopeConfig.get_user_session_id(), platform.platform(), platform.processor(), env, + user_name, ) if isinstance(user_agent, dict): ua = '; '.join(f'{k}/{v}' for k, v in user_agent.items()) diff --git a/modelscope/hub/constants.py b/modelscope/hub/constants.py index 730702c1..373a0cf4 100644 --- a/modelscope/hub/constants.py +++ b/modelscope/hub/constants.py @@ -18,6 +18,7 @@ API_RESPONSE_FIELD_EMAIL = 'Email' API_RESPONSE_FIELD_MESSAGE = 'Message' MODELSCOPE_ENVIRONMENT = 'MODELSCOPE_ENVIRONMENT' MODELSCOPE_SDK_DEBUG = 'MODELSCOPE_SDK_DEBUG' +MODELSCOPE_USERNAME = 'MODELSCOPE_USERNAME' ONE_YEAR_SECONDS = 24 * 365 * 60 * 60 From 06abae4dc6d68e99cba56608c857de5cdabd16b0 Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Tue, 1 Nov 2022 09:56:15 +0800 Subject: [PATCH 04/18] [to #42322933]add token-cls test cases and bug fix Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10585502 --- .../nlp/token_classification_preprocessor.py | 3 +-- tests/pipelines/test_named_entity_recognition.py | 8 ++++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/modelscope/preprocessors/nlp/token_classification_preprocessor.py b/modelscope/preprocessors/nlp/token_classification_preprocessor.py index 5069048b..92b7c46b 100644 --- a/modelscope/preprocessors/nlp/token_classification_preprocessor.py +++ b/modelscope/preprocessors/nlp/token_classification_preprocessor.py @@ -140,8 +140,7 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): label_mask.append(1) offset_mapping.append(encodings['offset_mapping'][i]) else: - encodings = self.tokenizer( - text, add_special_tokens=False, **self.tokenize_kwargs) + encodings = self.tokenizer(text, **self.tokenize_kwargs) input_ids = encodings['input_ids'] label_mask, offset_mapping = self.get_label_mask_and_offset_mapping( text) diff --git a/tests/pipelines/test_named_entity_recognition.py b/tests/pipelines/test_named_entity_recognition.py index 3658cf3f..aef4aaed 100644 --- a/tests/pipelines/test_named_entity_recognition.py +++ b/tests/pipelines/test_named_entity_recognition.py @@ -19,9 +19,11 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): self.task = Tasks.named_entity_recognition self.model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news' + english_model_id = 'damo/nlp_raner_named-entity-recognition_english-large-ecom' tcrf_model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news' lcrf_model_id = 'damo/nlp_lstm_named-entity-recognition_chinese-news' sentence = '这与温岭市新河镇的一个神秘的传说有关。' + sentence_en = 'pizza shovel' @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_tcrf_by_direct_model_download(self): @@ -89,6 +91,12 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): task=Tasks.named_entity_recognition, model=self.lcrf_model_id) print(pipeline_ins(input=self.sentence)) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_english_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, model=self.english_model_id) + print(pipeline_ins(input='pizza shovel')) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): pipeline_ins = pipeline(task=Tasks.named_entity_recognition) From 9187103e3a32d4048e79e57d23fa596b2d1bffd5 Mon Sep 17 00:00:00 2001 From: "yichang.zyc" Date: Tue, 1 Nov 2022 09:57:31 +0800 Subject: [PATCH 05/18] =?UTF-8?q?[to=20#42322933]=E5=85=BC=E5=AE=B9?= =?UTF-8?q?=E6=96=B0=E5=A2=9Eclip=20huge=E6=A8=A1=E5=9E=8B=20=20=20=20=20?= =?UTF-8?q?=20=20=20=20Link:=20https://code.alibaba-inc.com/Ali-MaaS/MaaS-?= =?UTF-8?q?lib/codereview/10585552?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * compatiable with vit huge, and set clip base default mm-ebed pipeline --- modelscope/models/multi_modal/clip/model.py | 6 ++++-- modelscope/pipelines/builder.py | 5 ++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/modelscope/models/multi_modal/clip/model.py b/modelscope/models/multi_modal/clip/model.py index b1c84292..9b82e4a1 100644 --- a/modelscope/models/multi_modal/clip/model.py +++ b/modelscope/models/multi_modal/clip/model.py @@ -349,11 +349,13 @@ class CLIP(nn.Module): text_num_hidden_layers: int, text_type_vocab_size: int, tokenizer: FullTokenizer, + # vision_head_width, added this param for ViT-H + vision_head_width: int = 64, ): super().__init__() if isinstance(vision_layers, (tuple, list)): - vision_heads = vision_width * 32 // 64 + vision_heads = vision_width * 32 // vision_head_width self.visual = ModifiedResNet( layers=vision_layers, output_dim=embed_dim, @@ -361,7 +363,7 @@ class CLIP(nn.Module): input_resolution=image_resolution, width=vision_width) else: - vision_heads = vision_width // 64 + vision_heads = vision_width // vision_head_width self.visual = VisualTransformer( input_resolution=image_resolution, patch_size=vision_patch_size, diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 498c9ed8..70f8f11c 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -93,9 +93,8 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_resnet50_live-category'), Tasks.video_category: (Pipelines.video_category, 'damo/cv_resnet50_video-category'), - Tasks.multi_modal_embedding: - (Pipelines.multi_modal_embedding, - 'damo/multi-modal_clip-vit-large-patch14_zh'), + Tasks.multi_modal_embedding: (Pipelines.multi_modal_embedding, + 'damo/multi-modal_clip-vit-base-patch16_zh'), Tasks.generative_multi_modal_embedding: (Pipelines.generative_multi_modal_embedding, 'damo/multi-modal_gemm-vit-large-patch14_generative-multi-modal-embedding' From 40b677095605594d426b9c731687fb834d04b4fc Mon Sep 17 00:00:00 2001 From: "liugao.lg" Date: Tue, 1 Nov 2022 10:22:11 +0800 Subject: [PATCH 06/18] [to #42322933]fix ocr prepreocess & conflict MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 修复ocr预处理逻辑不一致问题 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10581697 --- modelscope/preprocessors/multi_modal.py | 1 - modelscope/preprocessors/ofa/ocr_recognition.py | 11 ++++++----- requirements/multi-modal.txt | 2 ++ tests/trainers/test_ofa_trainer.py | 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py index 17dffb48..13876058 100644 --- a/modelscope/preprocessors/multi_modal.py +++ b/modelscope/preprocessors/multi_modal.py @@ -96,7 +96,6 @@ class OfaPreprocessor(Preprocessor): data = input else: data = self._build_dict(input) - data = self._ofa_input_compatibility_conversion(data) sample = self.preprocess(data) str_data = dict() for k, v in data.items(): diff --git a/modelscope/preprocessors/ofa/ocr_recognition.py b/modelscope/preprocessors/ofa/ocr_recognition.py index 26fff9d2..a0342c14 100644 --- a/modelscope/preprocessors/ofa/ocr_recognition.py +++ b/modelscope/preprocessors/ofa/ocr_recognition.py @@ -2,12 +2,12 @@ from typing import Any, Dict import torch -from PIL import Image +import unicodedata2 from torchvision import transforms from torchvision.transforms import InterpolationMode from torchvision.transforms import functional as F +from zhconv import convert -from modelscope.preprocessors.image import load_image from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor @@ -98,8 +98,7 @@ class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor): def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: sample = self._build_infer_sample(data) - target = data[self.column_map['text']] - target = target.translate(self.transtab).strip() + target = sample['label'] target_token_list = target.strip().split() target = ' '.join(target_token_list[:self.max_tgt_length]) sample['target'] = self.tokenize_text(target, add_bos=False) @@ -119,5 +118,7 @@ class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor): 'patch_mask': torch.tensor([True]) } if 'text' in self.column_map and self.column_map['text'] in data: - sample['label'] = data[self.column_map['text']] + target = data[self.column_map['text']] + target = unicodedata2.normalize('NFKC', convert(target, 'zh-hans')) + sample['label'] = target return sample diff --git a/requirements/multi-modal.txt b/requirements/multi-modal.txt index 255f6155..578f0b54 100644 --- a/requirements/multi-modal.txt +++ b/requirements/multi-modal.txt @@ -11,3 +11,5 @@ timm tokenizers torchvision transformers>=4.12.0 +unicodedata2 +zhconv diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py index 3f68a9fb..85c21881 100644 --- a/tests/trainers/test_ofa_trainer.py +++ b/tests/trainers/test_ofa_trainer.py @@ -85,7 +85,7 @@ class TestOfaTrainer(unittest.TestCase): 'ocr_fudanvi_zh', subset_name='scene', namespace='modelscope', - split='train[:200]', + split='train[800:900]', download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS), eval_dataset=MsDataset.load( 'ocr_fudanvi_zh', From 4080f8071e96d4dbcc5ae8af10b051e14fea30ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8F=AD=E6=89=AC?= Date: Tue, 1 Nov 2022 12:57:04 +0800 Subject: [PATCH 07/18] temp --- modelscope/hub/api.py | 11 +++++++++++ modelscope/msdatasets/ms_dataset.py | 14 ++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index 7468e5e3..0262fc1d 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -646,6 +646,17 @@ class HubApi: def check_local_cookies(self, use_cookies) -> CookieJar: return self._check_cookie(use_cookies=use_cookies) + def count_uv_by_channel(self, dataset_name: str, namespace: str, channel: str): + # todo: 1. check args 2. + + url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/uv/{channel}' + cookies = ModelScopeConfig.get_cookies() + r = requests.post(url, cookies=cookies, headers=self.headers) + resp = r.json() + raise_on_error(resp) + print(resp) + return resp['Message'] + class ModelScopeConfig: path_credential = expanduser(DEFAULT_CREDENTIALS_PATH) diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index 0c537df7..a7d29990 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -727,3 +727,17 @@ class MsDataset: resp_msg = _delete_manager.delete(object_name=object_name) logger.info(f'Object {object_name} successfully removed!') return resp_msg + + +if __name__ == '__main__': + from modelscope.hub.api import HubApi + api = HubApi() + # api.login('c252d64a-ce7b-4c0c-b583-7bedf628c7da') # online + # api.login('aa14716f-e2de-4f26-bf49-254d81eb8ac6') # test + + channel = 'local' # dsw + dataset_name = 'small_coco_for_test' + namespace = 'wangxingjun778test' + resp = api.count_uv_by_channel( + dataset_name=dataset_name, namespace=namespace, channel=channel) + print(resp) From f5c31b33198288405f209773cd41a5efa1991e50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B9=B2=E5=8A=B2?= Date: Tue, 1 Nov 2022 13:31:25 +0800 Subject: [PATCH 08/18] Add miss init --- .../models/science/unifold/modules/__init__.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 modelscope/models/science/unifold/modules/__init__.py diff --git a/modelscope/models/science/unifold/modules/__init__.py b/modelscope/models/science/unifold/modules/__init__.py new file mode 100644 index 00000000..9821d212 --- /dev/null +++ b/modelscope/models/science/unifold/modules/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Data pipeline for model features.""" From 943478de635393e957bb0bf6ad677fdd189ac5c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B9=B2=E5=8A=B2?= Date: Tue, 1 Nov 2022 13:32:57 +0800 Subject: [PATCH 09/18] Update --- .../models/science/unifold/modules/__init__.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/modelscope/models/science/unifold/modules/__init__.py b/modelscope/models/science/unifold/modules/__init__.py index 9821d212..63aa84ed 100644 --- a/modelscope/models/science/unifold/modules/__init__.py +++ b/modelscope/models/science/unifold/modules/__init__.py @@ -1,14 +1,3 @@ -# Copyright 2021 DeepMind Technologies Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Data pipeline for model features.""" +# The Uni-fold implementation is also open-sourced by the authors under Apache-2.0 license, +# and is publicly available at https://github.com/dptech-corp/Uni-Fold. +"""Unifold Modules.""" From 2759d538bb30c8c82d0dd32ea3b4bcd7606d41d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B9=B2=E5=8A=B2?= Date: Tue, 1 Nov 2022 14:59:45 +0800 Subject: [PATCH 10/18] fix ut level for unifold --- tests/pipelines/test_unifold.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/test_unifold.py b/tests/pipelines/test_unifold.py index df35dc5e..47bb7874 100644 --- a/tests/pipelines/test_unifold.py +++ b/tests/pipelines/test_unifold.py @@ -19,7 +19,7 @@ class UnifoldProteinStructureTest(unittest.TestCase, DemoCompatibilityCheck): self.protein_multimer = 'GAMGLPEEPSSPQESTLKALSLYEAHLSSYIMYLQTFLVKTKQKVNNKNYPEFTLFDTSKLKKDQTLKSIKT' + \ 'NIAALKNHIDKIKPIAMQIYKKYSKNIP' - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_by_direct_model_download(self): model_dir = snapshot_download(self.model_id) mono_pipeline_ins = pipeline(task=self.task, model=model_dir) From 84032f90e3f2b4a183725ceda16a4b1dc204c2f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8F=AD=E6=89=AC?= Date: Tue, 1 Nov 2022 15:34:58 +0800 Subject: [PATCH 11/18] add event tracking --- modelscope/hub/api.py | 20 ++++++++++++++------ modelscope/msdatasets/ms_dataset.py | 16 ++-------------- modelscope/utils/constant.py | 8 ++++++++ requirements/framework.txt | 2 +- 4 files changed, 25 insertions(+), 21 deletions(-) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index 0262fc1d..f2ff822d 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -39,8 +39,8 @@ from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, DEFAULT_MODEL_REVISION, DEFAULT_REPOSITORY_REVISION, MASTER_MODEL_BRANCH, DatasetFormations, - DatasetMetaFormats, DownloadMode, - ModelFile) + DatasetMetaFormats, DownloadChannel, + DownloadMode, ModelFile) from modelscope.utils.logger import get_logger from .utils.utils import (get_endpoint, get_release_datetime, model_id_to_group_owner_name) @@ -646,15 +646,23 @@ class HubApi: def check_local_cookies(self, use_cookies) -> CookieJar: return self._check_cookie(use_cookies=use_cookies) - def count_uv_by_channel(self, dataset_name: str, namespace: str, channel: str): - # todo: 1. check args 2. + def dataset_download_uv(self, dataset_name: str, namespace: str): + if not dataset_name or not namespace: + raise ValueError('dataset_name or namespace cannot be empty!') - url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/uv/{channel}' + # get channel and user_name + channel = DownloadChannel.LOCAL.value + user_name = '' + if MODELSCOPE_ENVIRONMENT in os.environ: + channel = os.environ[MODELSCOPE_ENVIRONMENT] + if MODELSCOPE_USERNAME in os.environ: + user_name = os.environ[MODELSCOPE_USERNAME] + + url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/uv/{channel}?user={user_name}' cookies = ModelScopeConfig.get_cookies() r = requests.post(url, cookies=cookies, headers=self.headers) resp = r.json() raise_on_error(resp) - print(resp) return resp['Message'] diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index a7d29990..5c8ea59f 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -274,6 +274,8 @@ class MsDataset: try: api.on_dataset_download( dataset_name=download_dataset, namespace=namespace) + api.dataset_download_uv( + dataset_name=download_dataset, namespace=namespace) except Exception as e: logger.error(e) @@ -727,17 +729,3 @@ class MsDataset: resp_msg = _delete_manager.delete(object_name=object_name) logger.info(f'Object {object_name} successfully removed!') return resp_msg - - -if __name__ == '__main__': - from modelscope.hub.api import HubApi - api = HubApi() - # api.login('c252d64a-ce7b-4c0c-b583-7bedf628c7da') # online - # api.login('aa14716f-e2de-4f26-bf49-254d81eb8ac6') # test - - channel = 'local' # dsw - dataset_name = 'small_coco_for_test' - namespace = 'wangxingjun778test' - resp = api.count_uv_by_channel( - dataset_name=dataset_name, namespace=namespace, channel=channel) - print(resp) diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 2729b75a..f0a97dbd 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -238,6 +238,14 @@ class DownloadMode(enum.Enum): FORCE_REDOWNLOAD = 'force_redownload' +class DownloadChannel(enum.Enum): + """ Channels of datasets downloading for uv/pv counting. + """ + LOCAL = 'local' + DSW = 'dsw' + EAIS = 'eais' + + class UploadMode(enum.Enum): """ How to upload object to remote. """ diff --git a/requirements/framework.txt b/requirements/framework.txt index 17fbd8a3..e78bc9a9 100644 --- a/requirements/framework.txt +++ b/requirements/framework.txt @@ -1,7 +1,7 @@ addict attrs # version beyond 2.6.0 introduces compatbility issue and is being resolved -datasets<=2.6.0 +datasets<=2.5.2 easydict einops filelock>=3.3.0 From 79c44a68102e182b3194e3b9e6244d4891859274 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8F=AD=E6=89=AC?= Date: Tue, 1 Nov 2022 15:41:01 +0800 Subject: [PATCH 12/18] add event tracking --- requirements/framework.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/framework.txt b/requirements/framework.txt index e78bc9a9..a86c0cc5 100644 --- a/requirements/framework.txt +++ b/requirements/framework.txt @@ -1,6 +1,6 @@ addict attrs -# version beyond 2.6.0 introduces compatbility issue and is being resolved +# version beyond 2.5.2 introduces compatbility issue and is being resolved datasets<=2.5.2 easydict einops From 63a08e7be68bce218eb6ca755ecbc821017d83b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8F=AD=E6=89=AC?= Date: Tue, 1 Nov 2022 15:49:21 +0800 Subject: [PATCH 13/18] add event tracking --- tests/msdatasets/test_dataset_upload.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/msdatasets/test_dataset_upload.py b/tests/msdatasets/test_dataset_upload.py index 3d35d480..b67c2ebb 100644 --- a/tests/msdatasets/test_dataset_upload.py +++ b/tests/msdatasets/test_dataset_upload.py @@ -104,7 +104,11 @@ class DatasetUploadTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_ds_download_dir(self): - test_ds = MsDataset.load(self.dataset_name, self.namespace) + from modelscope.utils.constant import DownloadMode + test_ds = MsDataset.load( + self.dataset_name, + namespace=self.namespace, + download_mode=DownloadMode.FORCE_REDOWNLOAD) assert test_ds.config_kwargs['split_config'].values() @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') From e45ab2c32d66a3ae8014be045d773719b82cb0cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8F=AD=E6=89=AC?= Date: Tue, 1 Nov 2022 15:51:00 +0800 Subject: [PATCH 14/18] add event tracking --- tests/msdatasets/test_dataset_upload.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/msdatasets/test_dataset_upload.py b/tests/msdatasets/test_dataset_upload.py index b67c2ebb..d91f24d7 100644 --- a/tests/msdatasets/test_dataset_upload.py +++ b/tests/msdatasets/test_dataset_upload.py @@ -8,7 +8,8 @@ import zipfile from modelscope.msdatasets import MsDataset from modelscope.msdatasets.utils.dataset_utils import list_dataset_objects from modelscope.utils import logger as logging -from modelscope.utils.constant import DEFAULT_DATASET_REVISION, ModelFile +from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, DownloadMode, + ModelFile) from modelscope.utils.test_utils import test_level logger = logging.get_logger(__name__) @@ -104,7 +105,6 @@ class DatasetUploadTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_ds_download_dir(self): - from modelscope.utils.constant import DownloadMode test_ds = MsDataset.load( self.dataset_name, namespace=self.namespace, From 30c8c27145261a3e5c7606976e11faef733d3f49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B9=B2=E5=8A=B2?= Date: Tue, 1 Nov 2022 17:06:30 +0800 Subject: [PATCH 15/18] up requirements --- requirements/science.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements/science.txt b/requirements/science.txt index 72994f72..c345da99 100644 --- a/requirements/science.txt +++ b/requirements/science.txt @@ -4,3 +4,5 @@ ml_collections scipy tensorboardX tokenizers +biopython +ipdb From 853e5235d56bf35922cde0db843cb62353e19a39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B9=B2=E5=8A=B2?= Date: Tue, 1 Nov 2022 17:32:04 +0800 Subject: [PATCH 16/18] fix requirements --- requirements/science.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements/science.txt b/requirements/science.txt index c345da99..636f98f4 100644 --- a/requirements/science.txt +++ b/requirements/science.txt @@ -1,8 +1,8 @@ -iopath +biopython lmdb ml_collections scipy tensorboardX tokenizers -biopython -ipdb +iopath +ipdb \ No newline at end of file From 420b63f03b55d5c2a591fd69cd060ed3a8141ef4 Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Tue, 1 Nov 2022 17:44:18 +0800 Subject: [PATCH 17/18] fix style issues --- requirements/science.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/science.txt b/requirements/science.txt index 636f98f4..c30ff644 100644 --- a/requirements/science.txt +++ b/requirements/science.txt @@ -1,8 +1,8 @@ biopython +iopath +ipdb lmdb ml_collections scipy tensorboardX tokenizers -iopath -ipdb \ No newline at end of file From aecb88044eba1789a675f22a32cc6f2eed71b91a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B9=B2=E5=8A=B2?= Date: Tue, 1 Nov 2022 17:44:37 +0800 Subject: [PATCH 18/18] up --- requirements/science.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/science.txt b/requirements/science.txt index 636f98f4..c30ff644 100644 --- a/requirements/science.txt +++ b/requirements/science.txt @@ -1,8 +1,8 @@ biopython +iopath +ipdb lmdb ml_collections scipy tensorboardX tokenizers -iopath -ipdb \ No newline at end of file