From 0d3b7b0df210418326295c4cbe1c07152e540af0 Mon Sep 17 00:00:00 2001
From: "zhangzhicheng.zzc" <zhangzhicheng.zzc@alibaba-inc.com>
Date: Mon, 31 Oct 2022 20:52:27 +0800
Subject: [PATCH 01/18] [to #42322933]fix bugs related to token cls
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1.修复token classification preprocessor finetune结果错误问题
2.修复word segmentation output 无用属性
3. 修复nlp preprocessor传use_fast错误
4. 修复torch model exporter bug
5. 修复文档撰写过程中发现trainer相关bug
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10573269
---
 modelscope/exporters/torch_model_exporter.py  |   5 +-
 modelscope/outputs/outputs.py                 |  11 +-
 .../nlp/token_classification_pipeline.py      |   4 +-
 .../nlp/word_segmentation_pipeline.py         |   6 +-
 modelscope/preprocessors/nlp/nlp_base.py      |  17 +-
 .../nlp/token_classification_preprocessor.py  | 148 ++++++++++--------
 .../trainers/nlp/text_generation_trainer.py   |   2 +-
 modelscope/trainers/nlp_trainer.py            |   6 +-
 modelscope/trainers/trainer.py                |   2 +-
 tests/outputs/test_model_outputs.py           |   3 +-
 .../test_finetune_token_classificatin.py      |   2 +-
 11 files changed, 110 insertions(+), 96 deletions(-)

diff --git a/modelscope/exporters/torch_model_exporter.py b/modelscope/exporters/torch_model_exporter.py
index 7bf6c0c0..1d332591 100644
--- a/modelscope/exporters/torch_model_exporter.py
+++ b/modelscope/exporters/torch_model_exporter.py
@@ -128,7 +128,7 @@ class TorchModelExporter(Exporter):
                 args_list = list(args)
             else:
                 args_list = [args]
-            if isinstance(args_list[-1], dict):
+            if isinstance(args_list[-1], Mapping):
                 args_dict = args_list[-1]
                 args_list = args_list[:-1]
             n_nonkeyword = len(args_list)
@@ -284,9 +284,8 @@ class TorchModelExporter(Exporter):
                 'Model property dummy_inputs must be set.')
         dummy_inputs = collate_fn(dummy_inputs, device)
         if isinstance(dummy_inputs, Mapping):
-            dummy_inputs = self._decide_input_format(model, dummy_inputs)
             dummy_inputs_filter = []
-            for _input in dummy_inputs:
+            for _input in self._decide_input_format(model, dummy_inputs):
                 if _input is not None:
                     dummy_inputs_filter.append(_input)
                 else:
diff --git a/modelscope/outputs/outputs.py b/modelscope/outputs/outputs.py
index b7003809..2c6dd85a 100644
--- a/modelscope/outputs/outputs.py
+++ b/modelscope/outputs/outputs.py
@@ -491,17 +491,8 @@ TASK_OUTPUTS = {
     # word segmentation result for single sample
     # {
     #   "output": "今天 天气 不错 ， 适合 出去 游玩"
-    #   "labels": [
-    #     {'word': '今天', 'label': 'PROPN'},
-    #     {'word': '天气', 'label': 'PROPN'},
-    #     {'word': '不错', 'label': 'VERB'},
-    #     {'word': ',', 'label': 'NUM'},
-    #     {'word': '适合', 'label': 'NOUN'},
-    #     {'word': '出去', 'label': 'PART'},
-    #     {'word': '游玩', 'label': 'ADV'},
-    # ]
     # }
-    Tasks.word_segmentation: [OutputKeys.OUTPUT, OutputKeys.LABELS],
+    Tasks.word_segmentation: [OutputKeys.OUTPUT],
 
     # TODO @wenmeng.zwm support list of result check
     # named entity recognition result for single sample
diff --git a/modelscope/pipelines/nlp/token_classification_pipeline.py b/modelscope/pipelines/nlp/token_classification_pipeline.py
index 75bc538d..4af187ee 100644
--- a/modelscope/pipelines/nlp/token_classification_pipeline.py
+++ b/modelscope/pipelines/nlp/token_classification_pipeline.py
@@ -109,13 +109,13 @@ class TokenClassificationPipeline(Pipeline):
             chunk['span'] = text[chunk['start']:chunk['end']]
             chunks.append(chunk)
 
-        # for cws output
+        # for cws outputs
         if len(chunks) > 0 and chunks[0]['type'] == 'cws':
             spans = [
                 chunk['span'] for chunk in chunks if chunk['span'].strip()
             ]
             seg_result = ' '.join(spans)
-            outputs = {OutputKeys.OUTPUT: seg_result, OutputKeys.LABELS: []}
+            outputs = {OutputKeys.OUTPUT: seg_result}
 
         # for ner outputs
         else:
diff --git a/modelscope/pipelines/nlp/word_segmentation_pipeline.py b/modelscope/pipelines/nlp/word_segmentation_pipeline.py
index 0df8f1ad..c57f6b93 100644
--- a/modelscope/pipelines/nlp/word_segmentation_pipeline.py
+++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py
@@ -115,15 +115,15 @@ class WordSegmentationPipeline(Pipeline):
             chunk['span'] = text[chunk['start']:chunk['end']]
             chunks.append(chunk)
 
-        # for cws output
+        # for cws outputs
         if len(chunks) > 0 and chunks[0]['type'] == 'cws':
             spans = [
                 chunk['span'] for chunk in chunks if chunk['span'].strip()
             ]
             seg_result = ' '.join(spans)
-            outputs = {OutputKeys.OUTPUT: seg_result, OutputKeys.LABELS: []}
+            outputs = {OutputKeys.OUTPUT: seg_result}
 
-        # for ner outpus
+        # for ner output
         else:
             outputs = {OutputKeys.OUTPUT: chunks}
         return outputs
diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py
index 48a04d7a..45efc6e7 100644
--- a/modelscope/preprocessors/nlp/nlp_base.py
+++ b/modelscope/preprocessors/nlp/nlp_base.py
@@ -34,6 +34,7 @@ class NLPBasePreprocessor(Preprocessor, ABC):
                  label=None,
                  label2id=None,
                  mode=ModeKeys.INFERENCE,
+                 use_fast=None,
                  **kwargs):
         """The NLP preprocessor base class.
 
@@ -45,14 +46,18 @@ class NLPBasePreprocessor(Preprocessor, ABC):
             label2id: An optional label2id mapping, the class will try to call utils.parse_label_mapping
                 if this mapping is not supplied.
             mode: Run this preprocessor in either 'train'/'eval'/'inference' mode
+            use_fast: use the fast version of tokenizer
+
         """
         self.model_dir = model_dir
         self.first_sequence = first_sequence
         self.second_sequence = second_sequence
         self.label = label
 
-        self.use_fast = kwargs.pop('use_fast', None)
-        if self.use_fast is None and os.path.isfile(
+        self.use_fast = use_fast
+        if self.use_fast is None and model_dir is None:
+            self.use_fast = False
+        elif self.use_fast is None and os.path.isfile(
                 os.path.join(model_dir, 'tokenizer_config.json')):
             with open(os.path.join(model_dir, 'tokenizer_config.json'),
                       'r') as f:
@@ -61,8 +66,8 @@ class NLPBasePreprocessor(Preprocessor, ABC):
         self.use_fast = False if self.use_fast is None else self.use_fast
 
         self.label2id = label2id
-        if self.label2id is None:
-            self.label2id = parse_label_mapping(self.model_dir)
+        if self.label2id is None and model_dir is not None:
+            self.label2id = parse_label_mapping(model_dir)
         super().__init__(mode, **kwargs)
 
     @property
@@ -106,6 +111,7 @@ class NLPTokenizerPreprocessorBase(NLPBasePreprocessor):
                  label: str = 'label',
                  label2id: dict = None,
                  mode: str = ModeKeys.INFERENCE,
+                 use_fast: bool = None,
                  **kwargs):
         """The NLP tokenizer preprocessor base class.
 
@@ -122,11 +128,12 @@ class NLPTokenizerPreprocessorBase(NLPBasePreprocessor):
                 - config.json label2id/id2label
                 - label_mapping.json
             mode: Run this preprocessor in either 'train'/'eval'/'inference' mode, the behavior may be different.
+            use_fast: use the fast version of tokenizer
             kwargs: These kwargs will be directly fed into the tokenizer.
         """
 
         super().__init__(model_dir, first_sequence, second_sequence, label,
-                         label2id, mode)
+                         label2id, mode, use_fast, **kwargs)
         self.model_dir = model_dir
         self.tokenize_kwargs = kwargs
         self.tokenizer = self.build_tokenizer(model_dir)
diff --git a/modelscope/preprocessors/nlp/token_classification_preprocessor.py b/modelscope/preprocessors/nlp/token_classification_preprocessor.py
index 2de0c806..5069048b 100644
--- a/modelscope/preprocessors/nlp/token_classification_preprocessor.py
+++ b/modelscope/preprocessors/nlp/token_classification_preprocessor.py
@@ -2,6 +2,7 @@
 
 from typing import Any, Dict, Tuple, Union
 
+import numpy as np
 import torch
 
 from modelscope.metainfo import Preprocessors
@@ -20,9 +21,7 @@ class WordSegmentationBlankSetToLabelPreprocessor(NLPBasePreprocessor):
     """
 
     def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-        self.first_sequence: str = kwargs.pop('first_sequence',
-                                              'first_sequence')
+        self.first_sequence: str = kwargs.pop('first_sequence', 'tokens')
         self.label = kwargs.pop('label', OutputKeys.LABELS)
 
     def __call__(self, data: str) -> Union[Dict[str, Any], Tuple]:
@@ -80,10 +79,9 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase):
                 'is_split_into_words', False)
         if 'label2id' in kwargs:
             kwargs.pop('label2id')
-        self.tokenize_kwargs = kwargs
 
-    @type_assert(object, str)
-    def __call__(self, data: str) -> Dict[str, Any]:
+    @type_assert(object, (str, dict))
+    def __call__(self, data: Union[dict, str]) -> Dict[str, Any]:
         """process the raw input data
 
         Args:
@@ -99,18 +97,24 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase):
         text = None
         labels_list = None
         if isinstance(data, str):
+            # for inference inputs without label
             text = data
+            self.tokenize_kwargs['add_special_tokens'] = False
         elif isinstance(data, dict):
+            # for finetune inputs with label
             text = data.get(self.first_sequence)
             labels_list = data.get(self.label)
+            if isinstance(text, list):
+                self.tokenize_kwargs['is_split_into_words'] = True
 
         input_ids = []
         label_mask = []
         offset_mapping = []
-        if self.is_split_into_words:
-            for offset, token in enumerate(list(data)):
-                subtoken_ids = self.tokenizer.encode(
-                    token, add_special_tokens=False)
+        token_type_ids = []
+        if self.is_split_into_words and self._mode == ModeKeys.INFERENCE:
+            for offset, token in enumerate(list(text)):
+                subtoken_ids = self.tokenizer.encode(token,
+                                                     **self.tokenize_kwargs)
                 if len(subtoken_ids) == 0:
                     subtoken_ids = [self.tokenizer.unk_token_id]
                 input_ids.extend(subtoken_ids)
@@ -119,10 +123,9 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase):
         else:
             if self.tokenizer.is_fast:
                 encodings = self.tokenizer(
-                    text,
-                    add_special_tokens=False,
-                    return_offsets_mapping=True,
-                    **self.tokenize_kwargs)
+                    text, return_offsets_mapping=True, **self.tokenize_kwargs)
+                attention_mask = encodings['attention_mask']
+                token_type_ids = encodings['token_type_ids']
                 input_ids = encodings['input_ids']
                 word_ids = encodings.word_ids()
                 for i in range(len(word_ids)):
@@ -143,69 +146,80 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase):
                 label_mask, offset_mapping = self.get_label_mask_and_offset_mapping(
                     text)
 
-        if len(input_ids) >= self.sequence_length - 2:
-            input_ids = input_ids[:self.sequence_length - 2]
-            label_mask = label_mask[:self.sequence_length - 2]
-        input_ids = [self.tokenizer.cls_token_id
-                     ] + input_ids + [self.tokenizer.sep_token_id]
-        label_mask = [0] + label_mask + [0]
-        attention_mask = [1] * len(input_ids)
-        offset_mapping = offset_mapping[:sum(label_mask)]
+        if self._mode == ModeKeys.INFERENCE:
+            if len(input_ids) >= self.sequence_length - 2:
+                input_ids = input_ids[:self.sequence_length - 2]
+                label_mask = label_mask[:self.sequence_length - 2]
+            input_ids = [self.tokenizer.cls_token_id
+                         ] + input_ids + [self.tokenizer.sep_token_id]
+            label_mask = [0] + label_mask + [0]
+            attention_mask = [1] * len(input_ids)
+            offset_mapping = offset_mapping[:sum(label_mask)]
 
-        if not self.is_transformer_based_model:
-            input_ids = input_ids[1:-1]
-            attention_mask = attention_mask[1:-1]
-            label_mask = label_mask[1:-1]
+            if not self.is_transformer_based_model:
+                input_ids = input_ids[1:-1]
+                attention_mask = attention_mask[1:-1]
+                label_mask = label_mask[1:-1]
 
-        if self._mode == ModeKeys.INFERENCE:
             input_ids = torch.tensor(input_ids).unsqueeze(0)
             attention_mask = torch.tensor(attention_mask).unsqueeze(0)
             label_mask = torch.tensor(
                 label_mask, dtype=torch.bool).unsqueeze(0)
 
-        # the token classification
-        output = {
-            'text': text,
-            'input_ids': input_ids,
-            'attention_mask': attention_mask,
-            'label_mask': label_mask,
-            'offset_mapping': offset_mapping
-        }
-
-        # align the labels with tokenized text
-        if labels_list is not None:
-            assert self.label2id is not None
-            # Map that sends B-Xxx label to its I-Xxx counterpart
-            b_to_i_label = []
-            label_enumerate_values = [
-                k for k, v in sorted(
-                    self.label2id.items(), key=lambda item: item[1])
-            ]
-            for idx, label in enumerate(label_enumerate_values):
-                if label.startswith('B-') and label.replace(
-                        'B-', 'I-') in label_enumerate_values:
-                    b_to_i_label.append(
-                        label_enumerate_values.index(
-                            label.replace('B-', 'I-')))
-                else:
-                    b_to_i_label.append(idx)
+            # the token classification
+            output = {
+                'text': text,
+                'input_ids': input_ids,
+                'attention_mask': attention_mask,
+                'label_mask': label_mask,
+                'offset_mapping': offset_mapping
+            }
+        else:
+            output = {
+                'input_ids': input_ids,
+                'token_type_ids': token_type_ids,
+                'attention_mask': attention_mask,
+                'label_mask': label_mask,
+            }
 
-            label_row = [self.label2id[lb] for lb in labels_list]
-            previous_word_idx = None
-            label_ids = []
-            for word_idx in word_ids:
-                if word_idx is None:
-                    label_ids.append(-100)
-                elif word_idx != previous_word_idx:
-                    label_ids.append(label_row[word_idx])
-                else:
-                    if self.label_all_tokens:
-                        label_ids.append(b_to_i_label[label_row[word_idx]])
+            # align the labels with tokenized text
+            if labels_list is not None:
+                assert self.label2id is not None
+                # Map that sends B-Xxx label to its I-Xxx counterpart
+                b_to_i_label = []
+                label_enumerate_values = [
+                    k for k, v in sorted(
+                        self.label2id.items(), key=lambda item: item[1])
+                ]
+                for idx, label in enumerate(label_enumerate_values):
+                    if label.startswith('B-') and label.replace(
+                            'B-', 'I-') in label_enumerate_values:
+                        b_to_i_label.append(
+                            label_enumerate_values.index(
+                                label.replace('B-', 'I-')))
                     else:
+                        b_to_i_label.append(idx)
+
+                label_row = [self.label2id[lb] for lb in labels_list]
+                previous_word_idx = None
+                label_ids = []
+                for word_idx in word_ids:
+                    if word_idx is None:
                         label_ids.append(-100)
-                previous_word_idx = word_idx
-            labels = label_ids
-            output['labels'] = labels
+                    elif word_idx != previous_word_idx:
+                        label_ids.append(label_row[word_idx])
+                    else:
+                        if self.label_all_tokens:
+                            label_ids.append(b_to_i_label[label_row[word_idx]])
+                        else:
+                            label_ids.append(-100)
+                    previous_word_idx = word_idx
+                labels = label_ids
+                output['labels'] = labels
+            output = {
+                k: np.array(v) if isinstance(v, list) else v
+                for k, v in output.items()
+            }
         return output
 
     def get_tokenizer_class(self):
diff --git a/modelscope/trainers/nlp/text_generation_trainer.py b/modelscope/trainers/nlp/text_generation_trainer.py
index 0e26f153..f02faf71 100644
--- a/modelscope/trainers/nlp/text_generation_trainer.py
+++ b/modelscope/trainers/nlp/text_generation_trainer.py
@@ -18,7 +18,7 @@ class TextGenerationTrainer(NlpEpochBasedTrainer):
         return tokenizer.decode(tokens.tolist(), skip_special_tokens=True)
 
     def evaluation_step(self, data):
-        model = self.model
+        model = self.model.module if self._dist else self.model
         model.eval()
 
         with torch.no_grad():
diff --git a/modelscope/trainers/nlp_trainer.py b/modelscope/trainers/nlp_trainer.py
index a92a3706..5ff6f62f 100644
--- a/modelscope/trainers/nlp_trainer.py
+++ b/modelscope/trainers/nlp_trainer.py
@@ -586,14 +586,16 @@ class NlpEpochBasedTrainer(EpochBasedTrainer):
             preprocessor_mode=ModeKeys.TRAIN,
             **model_args,
             **self.train_keys,
-            mode=ModeKeys.TRAIN)
+            mode=ModeKeys.TRAIN,
+            use_fast=True)
         eval_preprocessor = Preprocessor.from_pretrained(
             self.model_dir,
             cfg_dict=self.cfg,
             preprocessor_mode=ModeKeys.EVAL,
             **model_args,
             **self.eval_keys,
-            mode=ModeKeys.EVAL)
+            mode=ModeKeys.EVAL,
+            use_fast=True)
         return train_preprocessor, eval_preprocessor
 
 
diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py
index 7478d8e4..3556badf 100644
--- a/modelscope/trainers/trainer.py
+++ b/modelscope/trainers/trainer.py
@@ -876,7 +876,7 @@ class EpochBasedTrainer(BaseTrainer):
         Subclass and override to inject custom behavior.
 
         """
-        model = self.model
+        model = self.model.module if self._dist else self.model
         model.eval()
 
         if is_parallel(model):
diff --git a/tests/outputs/test_model_outputs.py b/tests/outputs/test_model_outputs.py
index 31271869..311ce201 100644
--- a/tests/outputs/test_model_outputs.py
+++ b/tests/outputs/test_model_outputs.py
@@ -21,9 +21,10 @@ class TestModelOutput(unittest.TestCase):
         self.assertEqual(outputs['logits'], torch.Tensor([1]))
         self.assertEqual(outputs[0], torch.Tensor([1]))
         self.assertEqual(outputs.logits, torch.Tensor([1]))
+        outputs.loss = torch.Tensor([2])
         logits, loss = outputs
         self.assertEqual(logits, torch.Tensor([1]))
-        self.assertTrue(loss is None)
+        self.assertTrue(loss is not None)
 
 
 if __name__ == '__main__':
diff --git a/tests/trainers/test_finetune_token_classificatin.py b/tests/trainers/test_finetune_token_classificatin.py
index 9bdab9b7..a92cee7b 100644
--- a/tests/trainers/test_finetune_token_classificatin.py
+++ b/tests/trainers/test_finetune_token_classificatin.py
@@ -87,7 +87,7 @@ class TestFinetuneTokenClassification(unittest.TestCase):
             cfg['dataset'] = {
                 'train': {
                     'labels': label_enumerate_values,
-                    'first_sequence': 'first_sequence',
+                    'first_sequence': 'tokens',
                     'label': 'labels',
                 }
             }

From 3464324f6b5d9d0ef975cd0b0e76870e95b5fa22 Mon Sep 17 00:00:00 2001
From: Yingda Chen <yingda.chen@alibaba-inc.com>
Date: Mon, 31 Oct 2022 22:15:25 +0800
Subject: [PATCH 02/18] [to #42322933] limit datasets version for now

---
 requirements/framework.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements/framework.txt b/requirements/framework.txt
index 2408cda6..17fbd8a3 100644
--- a/requirements/framework.txt
+++ b/requirements/framework.txt
@@ -1,6 +1,7 @@
 addict
 attrs
-datasets
+# version beyond 2.6.0 introduces compatbility issue and is being resolved
+datasets<=2.6.0
 easydict
 einops
 filelock>=3.3.0

From 5302259a0a3fb7cafdce473aa78990e7dc84e676 Mon Sep 17 00:00:00 2001
From: "mulin.lyh" <mulin.lyh@taobao.com>
Date: Mon, 31 Oct 2022 22:46:17 +0800
Subject: [PATCH 03/18] [to #45854437]fix: add user name to user-agent        
 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10584797

---
 modelscope/hub/api.py       | 9 +++++++--
 modelscope/hub/constants.py | 1 +
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py
index dca6d099..7468e5e3 100644
--- a/modelscope/hub/api.py
+++ b/modelscope/hub/api.py
@@ -23,7 +23,8 @@ from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA,
                                       API_RESPONSE_FIELD_MESSAGE,
                                       API_RESPONSE_FIELD_USERNAME,
                                       DEFAULT_CREDENTIALS_PATH,
-                                      MODELSCOPE_ENVIRONMENT, ONE_YEAR_SECONDS,
+                                      MODELSCOPE_ENVIRONMENT,
+                                      MODELSCOPE_USERNAME, ONE_YEAR_SECONDS,
                                       Licenses, ModelVisibility)
 from modelscope.hub.errors import (InvalidParameter, NotExistError,
                                    NotLoginException, NoValidRevisionError,
@@ -760,14 +761,18 @@ class ModelScopeConfig:
         env = 'custom'
         if MODELSCOPE_ENVIRONMENT in os.environ:
             env = os.environ[MODELSCOPE_ENVIRONMENT]
+        user_name = 'unknown'
+        if MODELSCOPE_USERNAME in os.environ:
+            user_name = os.environ[MODELSCOPE_USERNAME]
 
-        ua = 'modelscope/%s; python/%s; session_id/%s; platform/%s; processor/%s; env/%s' % (
+        ua = 'modelscope/%s; python/%s; session_id/%s; platform/%s; processor/%s; env/%s; user/%s' % (
             __version__,
             platform.python_version(),
             ModelScopeConfig.get_user_session_id(),
             platform.platform(),
             platform.processor(),
             env,
+            user_name,
         )
         if isinstance(user_agent, dict):
             ua = '; '.join(f'{k}/{v}' for k, v in user_agent.items())
diff --git a/modelscope/hub/constants.py b/modelscope/hub/constants.py
index 730702c1..373a0cf4 100644
--- a/modelscope/hub/constants.py
+++ b/modelscope/hub/constants.py
@@ -18,6 +18,7 @@ API_RESPONSE_FIELD_EMAIL = 'Email'
 API_RESPONSE_FIELD_MESSAGE = 'Message'
 MODELSCOPE_ENVIRONMENT = 'MODELSCOPE_ENVIRONMENT'
 MODELSCOPE_SDK_DEBUG = 'MODELSCOPE_SDK_DEBUG'
+MODELSCOPE_USERNAME = 'MODELSCOPE_USERNAME'
 ONE_YEAR_SECONDS = 24 * 365 * 60 * 60
 
 

From 06abae4dc6d68e99cba56608c857de5cdabd16b0 Mon Sep 17 00:00:00 2001
From: "zhangzhicheng.zzc" <zhangzhicheng.zzc@alibaba-inc.com>
Date: Tue, 1 Nov 2022 09:56:15 +0800
Subject: [PATCH 04/18] [to #42322933]add token-cls test cases and bug fix     
    Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10585502

---
 .../nlp/token_classification_preprocessor.py              | 3 +--
 tests/pipelines/test_named_entity_recognition.py          | 8 ++++++++
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/modelscope/preprocessors/nlp/token_classification_preprocessor.py b/modelscope/preprocessors/nlp/token_classification_preprocessor.py
index 5069048b..92b7c46b 100644
--- a/modelscope/preprocessors/nlp/token_classification_preprocessor.py
+++ b/modelscope/preprocessors/nlp/token_classification_preprocessor.py
@@ -140,8 +140,7 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase):
                         label_mask.append(1)
                         offset_mapping.append(encodings['offset_mapping'][i])
             else:
-                encodings = self.tokenizer(
-                    text, add_special_tokens=False, **self.tokenize_kwargs)
+                encodings = self.tokenizer(text, **self.tokenize_kwargs)
                 input_ids = encodings['input_ids']
                 label_mask, offset_mapping = self.get_label_mask_and_offset_mapping(
                     text)
diff --git a/tests/pipelines/test_named_entity_recognition.py b/tests/pipelines/test_named_entity_recognition.py
index 3658cf3f..aef4aaed 100644
--- a/tests/pipelines/test_named_entity_recognition.py
+++ b/tests/pipelines/test_named_entity_recognition.py
@@ -19,9 +19,11 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
         self.task = Tasks.named_entity_recognition
         self.model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news'
 
+    english_model_id = 'damo/nlp_raner_named-entity-recognition_english-large-ecom'
     tcrf_model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news'
     lcrf_model_id = 'damo/nlp_lstm_named-entity-recognition_chinese-news'
     sentence = '这与温岭市新河镇的一个神秘的传说有关。'
+    sentence_en = 'pizza shovel'
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_tcrf_by_direct_model_download(self):
@@ -89,6 +91,12 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
             task=Tasks.named_entity_recognition, model=self.lcrf_model_id)
         print(pipeline_ins(input=self.sentence))
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_english_with_model_name(self):
+        pipeline_ins = pipeline(
+            task=Tasks.named_entity_recognition, model=self.english_model_id)
+        print(pipeline_ins(input='pizza shovel'))
+
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
         pipeline_ins = pipeline(task=Tasks.named_entity_recognition)

From 9187103e3a32d4048e79e57d23fa596b2d1bffd5 Mon Sep 17 00:00:00 2001
From: "yichang.zyc" <yichang.zyc@alibaba-inc.com>
Date: Tue, 1 Nov 2022 09:57:31 +0800
Subject: [PATCH 05/18] =?UTF-8?q?[to=20#42322933]=E5=85=BC=E5=AE=B9?=
 =?UTF-8?q?=E6=96=B0=E5=A2=9Eclip=20huge=E6=A8=A1=E5=9E=8B=20=20=20=20=20?=
 =?UTF-8?q?=20=20=20=20Link:=20https://code.alibaba-inc.com/Ali-MaaS/MaaS-?=
 =?UTF-8?q?lib/codereview/10585552?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

    * compatible with ViT-Huge, and set CLIP base as the default mm-embed pipeline
---
 modelscope/models/multi_modal/clip/model.py | 6 ++++--
 modelscope/pipelines/builder.py             | 5 ++---
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/modelscope/models/multi_modal/clip/model.py b/modelscope/models/multi_modal/clip/model.py
index b1c84292..9b82e4a1 100644
--- a/modelscope/models/multi_modal/clip/model.py
+++ b/modelscope/models/multi_modal/clip/model.py
@@ -349,11 +349,13 @@ class CLIP(nn.Module):
         text_num_hidden_layers: int,
         text_type_vocab_size: int,
         tokenizer: FullTokenizer,
+        # vision_head_width, added this param for ViT-H
+        vision_head_width: int = 64,
     ):
         super().__init__()
 
         if isinstance(vision_layers, (tuple, list)):
-            vision_heads = vision_width * 32 // 64
+            vision_heads = vision_width * 32 // vision_head_width
             self.visual = ModifiedResNet(
                 layers=vision_layers,
                 output_dim=embed_dim,
@@ -361,7 +363,7 @@ class CLIP(nn.Module):
                 input_resolution=image_resolution,
                 width=vision_width)
         else:
-            vision_heads = vision_width // 64
+            vision_heads = vision_width // vision_head_width
             self.visual = VisualTransformer(
                 input_resolution=image_resolution,
                 patch_size=vision_patch_size,
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index 498c9ed8..70f8f11c 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -93,9 +93,8 @@ DEFAULT_MODEL_FOR_PIPELINE = {
                           'damo/cv_resnet50_live-category'),
     Tasks.video_category: (Pipelines.video_category,
                            'damo/cv_resnet50_video-category'),
-    Tasks.multi_modal_embedding:
-    (Pipelines.multi_modal_embedding,
-     'damo/multi-modal_clip-vit-large-patch14_zh'),
+    Tasks.multi_modal_embedding: (Pipelines.multi_modal_embedding,
+                                  'damo/multi-modal_clip-vit-base-patch16_zh'),
     Tasks.generative_multi_modal_embedding:
     (Pipelines.generative_multi_modal_embedding,
      'damo/multi-modal_gemm-vit-large-patch14_generative-multi-modal-embedding'

From 40b677095605594d426b9c731687fb834d04b4fc Mon Sep 17 00:00:00 2001
From: "liugao.lg" <liugao.lg@alibaba-inc.com>
Date: Tue, 1 Nov 2022 10:22:11 +0800
Subject: [PATCH 06/18] [to #42322933]fix ocr preprocess & conflict
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

修复ocr预处理逻辑不一致问题
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10581697
---
 modelscope/preprocessors/multi_modal.py         |  1 -
 modelscope/preprocessors/ofa/ocr_recognition.py | 11 ++++++-----
 requirements/multi-modal.txt                    |  2 ++
 tests/trainers/test_ofa_trainer.py              |  2 +-
 4 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py
index 17dffb48..13876058 100644
--- a/modelscope/preprocessors/multi_modal.py
+++ b/modelscope/preprocessors/multi_modal.py
@@ -96,7 +96,6 @@ class OfaPreprocessor(Preprocessor):
             data = input
         else:
             data = self._build_dict(input)
-        data = self._ofa_input_compatibility_conversion(data)
         sample = self.preprocess(data)
         str_data = dict()
         for k, v in data.items():
diff --git a/modelscope/preprocessors/ofa/ocr_recognition.py b/modelscope/preprocessors/ofa/ocr_recognition.py
index 26fff9d2..a0342c14 100644
--- a/modelscope/preprocessors/ofa/ocr_recognition.py
+++ b/modelscope/preprocessors/ofa/ocr_recognition.py
@@ -2,12 +2,12 @@
 from typing import Any, Dict
 
 import torch
-from PIL import Image
+import unicodedata2
 from torchvision import transforms
 from torchvision.transforms import InterpolationMode
 from torchvision.transforms import functional as F
+from zhconv import convert
 
-from modelscope.preprocessors.image import load_image
 from modelscope.utils.constant import ModeKeys
 from .base import OfaBasePreprocessor
 
@@ -98,8 +98,7 @@ class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor):
 
     def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]:
         sample = self._build_infer_sample(data)
-        target = data[self.column_map['text']]
-        target = target.translate(self.transtab).strip()
+        target = sample['label']
         target_token_list = target.strip().split()
         target = ' '.join(target_token_list[:self.max_tgt_length])
         sample['target'] = self.tokenize_text(target, add_bos=False)
@@ -119,5 +118,7 @@ class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor):
             'patch_mask': torch.tensor([True])
         }
         if 'text' in self.column_map and self.column_map['text'] in data:
-            sample['label'] = data[self.column_map['text']]
+            target = data[self.column_map['text']]
+            target = unicodedata2.normalize('NFKC', convert(target, 'zh-hans'))
+            sample['label'] = target
         return sample
diff --git a/requirements/multi-modal.txt b/requirements/multi-modal.txt
index 255f6155..578f0b54 100644
--- a/requirements/multi-modal.txt
+++ b/requirements/multi-modal.txt
@@ -11,3 +11,5 @@ timm
 tokenizers
 torchvision
 transformers>=4.12.0
+unicodedata2
+zhconv
diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py
index 3f68a9fb..85c21881 100644
--- a/tests/trainers/test_ofa_trainer.py
+++ b/tests/trainers/test_ofa_trainer.py
@@ -85,7 +85,7 @@ class TestOfaTrainer(unittest.TestCase):
                 'ocr_fudanvi_zh',
                 subset_name='scene',
                 namespace='modelscope',
-                split='train[:200]',
+                split='train[800:900]',
                 download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS),
             eval_dataset=MsDataset.load(
                 'ocr_fudanvi_zh',

From 4080f8071e96d4dbcc5ae8af10b051e14fea30ac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8F=AD=E6=89=AC?= <xingjun.wxj@alibaba-inc.com>
Date: Tue, 1 Nov 2022 12:57:04 +0800
Subject: [PATCH 07/18] temp

---
 modelscope/hub/api.py               | 11 +++++++++++
 modelscope/msdatasets/ms_dataset.py | 14 ++++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py
index 7468e5e3..0262fc1d 100644
--- a/modelscope/hub/api.py
+++ b/modelscope/hub/api.py
@@ -646,6 +646,17 @@ class HubApi:
     def check_local_cookies(self, use_cookies) -> CookieJar:
         return self._check_cookie(use_cookies=use_cookies)
 
+    def count_uv_by_channel(self, dataset_name: str, namespace: str, channel: str):
+        # todo: 1. check args  2.
+
+        url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/uv/{channel}'
+        cookies = ModelScopeConfig.get_cookies()
+        r = requests.post(url, cookies=cookies, headers=self.headers)
+        resp = r.json()
+        raise_on_error(resp)
+        print(resp)
+        return resp['Message']
+
 
 class ModelScopeConfig:
     path_credential = expanduser(DEFAULT_CREDENTIALS_PATH)
diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py
index 0c537df7..a7d29990 100644
--- a/modelscope/msdatasets/ms_dataset.py
+++ b/modelscope/msdatasets/ms_dataset.py
@@ -727,3 +727,17 @@ class MsDataset:
         resp_msg = _delete_manager.delete(object_name=object_name)
         logger.info(f'Object {object_name} successfully removed!')
         return resp_msg
+
+
+if __name__ == '__main__':
+    from modelscope.hub.api import HubApi
+    api = HubApi()
+    # api.login('c252d64a-ce7b-4c0c-b583-7bedf628c7da')  # online
+    # api.login('aa14716f-e2de-4f26-bf49-254d81eb8ac6')   # test
+
+    channel = 'local'  # dsw
+    dataset_name = 'small_coco_for_test'
+    namespace = 'wangxingjun778test'
+    resp = api.count_uv_by_channel(
+        dataset_name=dataset_name, namespace=namespace, channel=channel)
+    print(resp)

From f5c31b33198288405f209773cd41a5efa1991e50 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B9=B2=E5=8A=B2?= <yuanzheng.yuanzhen@alibaba-inc.com>
Date: Tue, 1 Nov 2022 13:31:25 +0800
Subject: [PATCH 08/18] Add missing __init__.py

---
 .../models/science/unifold/modules/__init__.py     | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100644 modelscope/models/science/unifold/modules/__init__.py

diff --git a/modelscope/models/science/unifold/modules/__init__.py b/modelscope/models/science/unifold/modules/__init__.py
new file mode 100644
index 00000000..9821d212
--- /dev/null
+++ b/modelscope/models/science/unifold/modules/__init__.py
@@ -0,0 +1,14 @@
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Data pipeline for model features."""

From 943478de635393e957bb0bf6ad677fdd189ac5c1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B9=B2=E5=8A=B2?= <yuanzheng.yuanzhen@alibaba-inc.com>
Date: Tue, 1 Nov 2022 13:32:57 +0800
Subject: [PATCH 09/18] Update

---
 .../models/science/unifold/modules/__init__.py  | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/modelscope/models/science/unifold/modules/__init__.py b/modelscope/models/science/unifold/modules/__init__.py
index 9821d212..63aa84ed 100644
--- a/modelscope/models/science/unifold/modules/__init__.py
+++ b/modelscope/models/science/unifold/modules/__init__.py
@@ -1,14 +1,3 @@
-# Copyright 2021 DeepMind Technologies Limited
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Data pipeline for model features."""
+# The Uni-fold implementation is also open-sourced by the authors under Apache-2.0 license,
+# and is publicly available at https://github.com/dptech-corp/Uni-Fold.
+"""Unifold Modules."""

From 2759d538bb30c8c82d0dd32ea3b4bcd7606d41d3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B9=B2=E5=8A=B2?= <yuanzheng.yuanzhen@alibaba-inc.com>
Date: Tue, 1 Nov 2022 14:59:45 +0800
Subject: [PATCH 10/18] fix ut level for unifold

---
 tests/pipelines/test_unifold.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/pipelines/test_unifold.py b/tests/pipelines/test_unifold.py
index df35dc5e..47bb7874 100644
--- a/tests/pipelines/test_unifold.py
+++ b/tests/pipelines/test_unifold.py
@@ -19,7 +19,7 @@ class UnifoldProteinStructureTest(unittest.TestCase, DemoCompatibilityCheck):
         self.protein_multimer = 'GAMGLPEEPSSPQESTLKALSLYEAHLSSYIMYLQTFLVKTKQKVNNKNYPEFTLFDTSKLKKDQTLKSIKT' + \
             'NIAALKNHIDKIKPIAMQIYKKYSKNIP'
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_by_direct_model_download(self):
         model_dir = snapshot_download(self.model_id)
         mono_pipeline_ins = pipeline(task=self.task, model=model_dir)

From 84032f90e3f2b4a183725ceda16a4b1dc204c2f3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8F=AD=E6=89=AC?= <xingjun.wxj@alibaba-inc.com>
Date: Tue, 1 Nov 2022 15:34:58 +0800
Subject: [PATCH 11/18] add event tracking

---
 modelscope/hub/api.py               | 20 ++++++++++++++------
 modelscope/msdatasets/ms_dataset.py | 16 ++--------------
 modelscope/utils/constant.py        |  8 ++++++++
 requirements/framework.txt          |  2 +-
 4 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py
index 0262fc1d..f2ff822d 100644
--- a/modelscope/hub/api.py
+++ b/modelscope/hub/api.py
@@ -39,8 +39,8 @@ from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
                                        DEFAULT_MODEL_REVISION,
                                        DEFAULT_REPOSITORY_REVISION,
                                        MASTER_MODEL_BRANCH, DatasetFormations,
-                                       DatasetMetaFormats, DownloadMode,
-                                       ModelFile)
+                                       DatasetMetaFormats, DownloadChannel,
+                                       DownloadMode, ModelFile)
 from modelscope.utils.logger import get_logger
 from .utils.utils import (get_endpoint, get_release_datetime,
                           model_id_to_group_owner_name)
@@ -646,15 +646,23 @@ class HubApi:
     def check_local_cookies(self, use_cookies) -> CookieJar:
         return self._check_cookie(use_cookies=use_cookies)
 
-    def count_uv_by_channel(self, dataset_name: str, namespace: str, channel: str):
-        # todo: 1. check args  2.
+    def dataset_download_uv(self, dataset_name: str, namespace: str):
+        if not dataset_name or not namespace:
+            raise ValueError('dataset_name or namespace cannot be empty!')
 
-        url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/uv/{channel}'
+        # get channel and user_name
+        channel = DownloadChannel.LOCAL.value
+        user_name = ''
+        if MODELSCOPE_ENVIRONMENT in os.environ:
+            channel = os.environ[MODELSCOPE_ENVIRONMENT]
+        if MODELSCOPE_USERNAME in os.environ:
+            user_name = os.environ[MODELSCOPE_USERNAME]
+
+        url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/uv/{channel}?user={user_name}'
         cookies = ModelScopeConfig.get_cookies()
         r = requests.post(url, cookies=cookies, headers=self.headers)
         resp = r.json()
         raise_on_error(resp)
-        print(resp)
         return resp['Message']
 
 
diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py
index a7d29990..5c8ea59f 100644
--- a/modelscope/msdatasets/ms_dataset.py
+++ b/modelscope/msdatasets/ms_dataset.py
@@ -274,6 +274,8 @@ class MsDataset:
             try:
                 api.on_dataset_download(
                     dataset_name=download_dataset, namespace=namespace)
+                api.dataset_download_uv(
+                    dataset_name=download_dataset, namespace=namespace)
             except Exception as e:
                 logger.error(e)
 
@@ -727,17 +729,3 @@ class MsDataset:
         resp_msg = _delete_manager.delete(object_name=object_name)
         logger.info(f'Object {object_name} successfully removed!')
         return resp_msg
-
-
-if __name__ == '__main__':
-    from modelscope.hub.api import HubApi
-    api = HubApi()
-    # api.login('c252d64a-ce7b-4c0c-b583-7bedf628c7da')  # online
-    # api.login('aa14716f-e2de-4f26-bf49-254d81eb8ac6')   # test
-
-    channel = 'local'  # dsw
-    dataset_name = 'small_coco_for_test'
-    namespace = 'wangxingjun778test'
-    resp = api.count_uv_by_channel(
-        dataset_name=dataset_name, namespace=namespace, channel=channel)
-    print(resp)
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index 2729b75a..f0a97dbd 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -238,6 +238,14 @@ class DownloadMode(enum.Enum):
     FORCE_REDOWNLOAD = 'force_redownload'
 
 
+class DownloadChannel(enum.Enum):
+    """ Channels of datasets downloading for uv/pv counting.
+    """
+    LOCAL = 'local'
+    DSW = 'dsw'
+    EAIS = 'eais'
+
+
 class UploadMode(enum.Enum):
     """ How to upload object to remote.
     """
diff --git a/requirements/framework.txt b/requirements/framework.txt
index 17fbd8a3..e78bc9a9 100644
--- a/requirements/framework.txt
+++ b/requirements/framework.txt
@@ -1,7 +1,7 @@
 addict
 attrs
 # version beyond 2.6.0 introduces compatbility issue and is being resolved
-datasets<=2.6.0
+datasets<=2.5.2
 easydict
 einops
 filelock>=3.3.0

From 79c44a68102e182b3194e3b9e6244d4891859274 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8F=AD=E6=89=AC?= <xingjun.wxj@alibaba-inc.com>
Date: Tue, 1 Nov 2022 15:41:01 +0800
Subject: [PATCH 12/18] add event tracking

---
 requirements/framework.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/framework.txt b/requirements/framework.txt
index e78bc9a9..a86c0cc5 100644
--- a/requirements/framework.txt
+++ b/requirements/framework.txt
@@ -1,6 +1,6 @@
 addict
 attrs
-# version beyond 2.6.0 introduces compatbility issue and is being resolved
+# version beyond 2.5.2 introduces compatbility issue and is being resolved
 datasets<=2.5.2
 easydict
 einops

From 63a08e7be68bce218eb6ca755ecbc821017d83b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8F=AD=E6=89=AC?= <xingjun.wxj@alibaba-inc.com>
Date: Tue, 1 Nov 2022 15:49:21 +0800
Subject: [PATCH 13/18] add event tracking

---
 tests/msdatasets/test_dataset_upload.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/msdatasets/test_dataset_upload.py b/tests/msdatasets/test_dataset_upload.py
index 3d35d480..b67c2ebb 100644
--- a/tests/msdatasets/test_dataset_upload.py
+++ b/tests/msdatasets/test_dataset_upload.py
@@ -104,7 +104,11 @@ class DatasetUploadTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_ds_download_dir(self):
-        test_ds = MsDataset.load(self.dataset_name, self.namespace)
+        from modelscope.utils.constant import DownloadMode
+        test_ds = MsDataset.load(
+            self.dataset_name,
+            namespace=self.namespace,
+            download_mode=DownloadMode.FORCE_REDOWNLOAD)
         assert test_ds.config_kwargs['split_config'].values()
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')

From e45ab2c32d66a3ae8014be045d773719b82cb0cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8F=AD=E6=89=AC?= <xingjun.wxj@alibaba-inc.com>
Date: Tue, 1 Nov 2022 15:51:00 +0800
Subject: [PATCH 14/18] add event tracking

---
 tests/msdatasets/test_dataset_upload.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/msdatasets/test_dataset_upload.py b/tests/msdatasets/test_dataset_upload.py
index b67c2ebb..d91f24d7 100644
--- a/tests/msdatasets/test_dataset_upload.py
+++ b/tests/msdatasets/test_dataset_upload.py
@@ -8,7 +8,8 @@ import zipfile
 from modelscope.msdatasets import MsDataset
 from modelscope.msdatasets.utils.dataset_utils import list_dataset_objects
 from modelscope.utils import logger as logging
-from modelscope.utils.constant import DEFAULT_DATASET_REVISION, ModelFile
+from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, DownloadMode,
+                                       ModelFile)
 from modelscope.utils.test_utils import test_level
 
 logger = logging.get_logger(__name__)
@@ -104,7 +105,6 @@ class DatasetUploadTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_ds_download_dir(self):
-        from modelscope.utils.constant import DownloadMode
         test_ds = MsDataset.load(
             self.dataset_name,
             namespace=self.namespace,

From 30c8c27145261a3e5c7606976e11faef733d3f49 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B9=B2=E5=8A=B2?= <yuanzheng.yuanzhen@alibaba-inc.com>
Date: Tue, 1 Nov 2022 17:06:30 +0800
Subject: [PATCH 15/18] up requirements

---
 requirements/science.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/requirements/science.txt b/requirements/science.txt
index 72994f72..c345da99 100644
--- a/requirements/science.txt
+++ b/requirements/science.txt
@@ -4,3 +4,5 @@ ml_collections
 scipy
 tensorboardX
 tokenizers
+biopython
+ipdb

From 853e5235d56bf35922cde0db843cb62353e19a39 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B9=B2=E5=8A=B2?= <yuanzheng.yuanzhen@alibaba-inc.com>
Date: Tue, 1 Nov 2022 17:32:04 +0800
Subject: [PATCH 16/18] fix requirements

---
 requirements/science.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/requirements/science.txt b/requirements/science.txt
index c345da99..636f98f4 100644
--- a/requirements/science.txt
+++ b/requirements/science.txt
@@ -1,8 +1,8 @@
-iopath
+biopython
 lmdb
 ml_collections
 scipy
 tensorboardX
 tokenizers
-biopython
-ipdb
+iopath
+ipdb
\ No newline at end of file

From 420b63f03b55d5c2a591fd69cd060ed3a8141ef4 Mon Sep 17 00:00:00 2001
From: "mulin.lyh" <mulin.lyh@taobao.com>
Date: Tue, 1 Nov 2022 17:44:18 +0800
Subject: [PATCH 17/18] fix style issues

---
 requirements/science.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements/science.txt b/requirements/science.txt
index 636f98f4..c30ff644 100644
--- a/requirements/science.txt
+++ b/requirements/science.txt
@@ -1,8 +1,8 @@
 biopython
+iopath
+ipdb
 lmdb
 ml_collections
 scipy
 tensorboardX
 tokenizers
-iopath
-ipdb
\ No newline at end of file

From aecb88044eba1789a675f22a32cc6f2eed71b91a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B9=B2=E5=8A=B2?= <yuanzheng.yuanzhen@alibaba-inc.com>
Date: Tue, 1 Nov 2022 17:44:37 +0800
Subject: [PATCH 18/18] up

---
 requirements/science.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements/science.txt b/requirements/science.txt
index 636f98f4..c30ff644 100644
--- a/requirements/science.txt
+++ b/requirements/science.txt
@@ -1,8 +1,8 @@
 biopython
+iopath
+ipdb
 lmdb
 ml_collections
 scipy
 tensorboardX
 tokenizers
-iopath
-ipdb
\ No newline at end of file