diff --git a/docs/source/tutorials/pipeline.md b/docs/source/tutorials/pipeline.md index 2d1f18e2..ebdc06f3 100644 --- a/docs/source/tutorials/pipeline.md +++ b/docs/source/tutorials/pipeline.md @@ -37,9 +37,9 @@ pipeline函数支持传入实例化的预处理对象、模型对象,从而支 1. 首先,创建预处理方法和模型 ```python from modelscope.models import Model -from modelscope.preprocessors import TokenClassifcationPreprocessor +from modelscope.preprocessors import TokenClassificationPreprocessor model = Model.from_pretrained('damo/nlp_structbert_word-segmentation_chinese-base') -tokenizer = TokenClassifcationPreprocessor(model.model_dir) +tokenizer = TokenClassificationPreprocessor(model.model_dir) ``` 2. 使用tokenizer和模型对象创建pipeline diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 1f8440de..555de643 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -106,6 +106,7 @@ class Preprocessors(object): load_image = 'load-image' # nlp preprocessor + sen_sim_tokenizer = 'sen-sim-tokenizer' bert_seq_cls_tokenizer = 'bert-seq-cls-tokenizer' palm_text_gen_tokenizer = 'palm-text-gen-tokenizer' token_cls_tokenizer = 'token-cls-tokenizer' diff --git a/modelscope/pipelines/nlp/sentence_similarity_pipeline.py b/modelscope/pipelines/nlp/sentence_similarity_pipeline.py index 4cccd996..c8484521 100644 --- a/modelscope/pipelines/nlp/sentence_similarity_pipeline.py +++ b/modelscope/pipelines/nlp/sentence_similarity_pipeline.py @@ -6,7 +6,7 @@ import torch from ...metainfo import Pipelines from ...models import Model from ...models.nlp import SbertForSentenceSimilarity -from ...preprocessors import SequenceClassificationPreprocessor +from ...preprocessors import SentenceSimilarityPreprocessor from ...utils.constant import Tasks from ..base import Input, Pipeline from ..builder import PIPELINES @@ -21,7 +21,7 @@ class SentenceSimilarityPipeline(Pipeline): def __init__(self, model: Union[Model, str], - preprocessor: SequenceClassificationPreprocessor = None, + preprocessor: SentenceSimilarityPreprocessor = None, first_sequence='first_sequence', second_sequence='second_sequence', **kwargs): @@ -29,7 +29,7 @@ class SentenceSimilarityPipeline(Pipeline): Args: model (SbertForSentenceSimilarity): a model instance - preprocessor (SequenceClassificationPreprocessor): a preprocessor instance + preprocessor (SentenceSimilarityPreprocessor): a preprocessor instance """ assert isinstance(model, str) or isinstance(model, SbertForSentenceSimilarity), \ 'model must be a single str or SbertForSentenceSimilarity' @@ -37,7 +37,7 @@ class SentenceSimilarityPipeline(Pipeline): model, SbertForSentenceSimilarity) else Model.from_pretrained(model) if preprocessor is None: - preprocessor = SequenceClassificationPreprocessor( + preprocessor = SentenceSimilarityPreprocessor( sc_model.model_dir, first_sequence=first_sequence, second_sequence=second_sequence) diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py index d5e9e58b..d383838e 100644 --- a/modelscope/pipelines/nlp/text_generation_pipeline.py +++ b/modelscope/pipelines/nlp/text_generation_pipeline.py @@ -22,11 +22,11 @@ class TextGenerationPipeline(Pipeline): model: Union[PalmForTextGeneration, str], preprocessor: Optional[TextGenerationPreprocessor] = None, **kwargs): - """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction + """use `model` and `preprocessor` to create a nlp text generation pipeline for prediction Args: - model (SequenceClassificationModel): a model instance - preprocessor 
(SequenceClassificationPreprocessor): a preprocessor instance + model (PalmForTextGeneration): a model instance + preprocessor (TextGenerationPreprocessor): a preprocessor instance """ model = model if isinstance( model, PalmForTextGeneration) else Model.from_pretrained(model) diff --git a/modelscope/pipelines/nlp/word_segmentation_pipeline.py b/modelscope/pipelines/nlp/word_segmentation_pipeline.py index 66b333cb..c220adbb 100644 --- a/modelscope/pipelines/nlp/word_segmentation_pipeline.py +++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py @@ -5,7 +5,7 @@ import torch from ...metainfo import Pipelines from ...models import Model from ...models.nlp import SbertForTokenClassification -from ...preprocessors import TokenClassifcationPreprocessor +from ...preprocessors import TokenClassificationPreprocessor from ...utils.constant import Tasks from ..base import Pipeline, Tensor from ..builder import PIPELINES @@ -18,21 +18,22 @@ __all__ = ['WordSegmentationPipeline'] Tasks.word_segmentation, module_name=Pipelines.word_segmentation) class WordSegmentationPipeline(Pipeline): - def __init__(self, - model: Union[SbertForTokenClassification, str], - preprocessor: Optional[TokenClassifcationPreprocessor] = None, - **kwargs): + def __init__( + self, + model: Union[SbertForTokenClassification, str], + preprocessor: Optional[TokenClassificationPreprocessor] = None, + **kwargs): """use `model` and `preprocessor` to create a nlp word segmentation pipeline for prediction Args: model (StructBertForTokenClassification): a model instance - preprocessor (TokenClassifcationPreprocessor): a preprocessor instance + preprocessor (TokenClassificationPreprocessor): a preprocessor instance """ model = model if isinstance( model, SbertForTokenClassification) else Model.from_pretrained(model) if preprocessor is None: - preprocessor = TokenClassifcationPreprocessor(model.model_dir) + preprocessor = TokenClassificationPreprocessor(model.model_dir) model.eval() super().__init__(model=model, preprocessor=preprocessor, **kwargs) self.tokenizer = preprocessor.tokenizer diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py index 007a3ac1..360d97aa 100644 --- a/modelscope/preprocessors/nlp.py +++ b/modelscope/preprocessors/nlp.py @@ -5,7 +5,8 @@ from typing import Any, Dict, Union from transformers import AutoTokenizer -from ..metainfo import Models, Preprocessors +from ..metainfo import Preprocessors +from ..models import Model from ..utils.constant import Fields, InputFields from ..utils.type_assert import type_assert from .base import Preprocessor @@ -13,9 +14,10 @@ from .builder import PREPROCESSORS __all__ = [ 'Tokenize', 'SequenceClassificationPreprocessor', - 'TextGenerationPreprocessor', 'TokenClassifcationPreprocessor', + 'TextGenerationPreprocessor', 'TokenClassificationPreprocessor', 'NLIPreprocessor', 'SentimentClassificationPreprocessor', - 'FillMaskPreprocessor', 'ZeroShotClassificationPreprocessor' + 'FillMaskPreprocessor', 'SentenceSimilarityPreprocessor', + 'ZeroShotClassificationPreprocessor' ] @@ -33,9 +35,7 @@ class Tokenize(Preprocessor): return data -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.nli_tokenizer) -class NLIPreprocessor(Preprocessor): +class NLPPreprocessorBase(Preprocessor): def __init__(self, model_dir: str, *args, **kwargs): """preprocess the data via the vocab.txt from the `model_dir` path @@ -45,18 +45,19 @@ class NLIPreprocessor(Preprocessor): """ super().__init__(*args, **kwargs) - - from sofa import SbertTokenizer 
self.model_dir: str = model_dir self.first_sequence: str = kwargs.pop('first_sequence', 'first_sequence') self.second_sequence = kwargs.pop('second_sequence', 'second_sequence') - self.sequence_length = kwargs.pop('sequence_length', 128) + self.tokenize_kwargs = kwargs + self.tokenizer = self.build_tokenizer(model_dir) - self.tokenizer = SbertTokenizer.from_pretrained(self.model_dir) + def build_tokenizer(self, model_dir): + from sofa import SbertTokenizer + return SbertTokenizer.from_pretrained(model_dir) - @type_assert(object, tuple) - def __call__(self, data: tuple) -> Dict[str, Any]: + @type_assert(object, object) + def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]: """process the raw input data Args: @@ -70,101 +71,54 @@ class NLIPreprocessor(Preprocessor): Returns: Dict[str, Any]: the preprocessed data """ - sentence1, sentence2 = data - new_data = { - self.first_sequence: sentence1, - self.second_sequence: sentence2 - } - # preprocess the data for the model input - rst = { - 'id': [], - 'input_ids': [], - 'attention_mask': [], - 'token_type_ids': [] - } + text_a, text_b = None, None + if isinstance(data, str): + text_a = data + elif isinstance(data, tuple): + assert len(data) == 2 + text_a, text_b = data + elif isinstance(data, dict): + text_a = data.get(self.first_sequence) + text_b = data.get(self.second_sequence, None) - max_seq_length = self.sequence_length + return self.tokenizer(text_a, text_b, **self.tokenize_kwargs) - text_a = new_data[self.first_sequence] - text_b = new_data[self.second_sequence] - feature = self.tokenizer( - text_a, - text_b, - padding=False, - truncation=True, - max_length=max_seq_length) - rst['id'].append(new_data.get('id', str(uuid.uuid4()))) - rst['input_ids'].append(feature['input_ids']) - rst['attention_mask'].append(feature['attention_mask']) - rst['token_type_ids'].append(feature['token_type_ids']) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.nli_tokenizer) +class NLIPreprocessor(NLPPreprocessorBase): - return rst + def __init__(self, model_dir: str, *args, **kwargs): + kwargs['truncation'] = True + kwargs['padding'] = False + kwargs['return_tensors'] = 'pt' + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + super().__init__(model_dir, *args, **kwargs) @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer) -class SentimentClassificationPreprocessor(Preprocessor): +class SentimentClassificationPreprocessor(NLPPreprocessorBase): def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data via the vocab.txt from the `model_dir` path - - Args: - model_dir (str): model path - """ - - super().__init__(*args, **kwargs) - - from sofa import SbertTokenizer - self.model_dir: str = model_dir - self.first_sequence: str = kwargs.pop('first_sequence', - 'first_sequence') - self.second_sequence = kwargs.pop('second_sequence', 'second_sequence') - self.sequence_length = kwargs.pop('sequence_length', 128) - - self.tokenizer = SbertTokenizer.from_pretrained(self.model_dir) - - @type_assert(object, str) - def __call__(self, data: str) -> Dict[str, Any]: - """process the raw input data + kwargs['truncation'] = True + kwargs['padding'] = 'max_length' + kwargs['return_tensors'] = 'pt' + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + super().__init__(model_dir, *args, **kwargs) - Args: - data (str): a sentence - Example: - 'you are so handsome.' 
- Returns: - Dict[str, Any]: the preprocessed data - """ - - new_data = {self.first_sequence: data} - # preprocess the data for the model input - rst = { - 'id': [], - 'input_ids': [], - 'attention_mask': [], - 'token_type_ids': [] - } - - max_seq_length = self.sequence_length - - text_a = new_data[self.first_sequence] - - text_b = new_data.get(self.second_sequence, None) - feature = self.tokenizer( - text_a, - text_b, - padding='max_length', - truncation=True, - max_length=max_seq_length) - - rst['id'].append(new_data.get('id', str(uuid.uuid4()))) - rst['input_ids'].append(feature['input_ids']) - rst['attention_mask'].append(feature['attention_mask']) - rst['token_type_ids'].append(feature['token_type_ids']) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sen_sim_tokenizer) +class SentenceSimilarityPreprocessor(NLPPreprocessorBase): - return rst + def __init__(self, model_dir: str, *args, **kwargs): + kwargs['truncation'] = True + kwargs['padding'] = False + kwargs['return_tensors'] = 'pt' + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + super().__init__(model_dir, *args, **kwargs) @PREPROCESSORS.register_module( @@ -192,36 +146,7 @@ class SequenceClassificationPreprocessor(Preprocessor): @type_assert(object, (str, tuple, Dict)) def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]: - """process the raw input data - - Args: - data (str or tuple, Dict): - sentence1 (str): a sentence - Example: - 'you are so handsome.' - or - (sentence1, sentence2) - sentence1 (str): a sentence - Example: - 'you are so handsome.' - sentence2 (str): a sentence - Example: - 'you are so beautiful.' - or - {field1: field_value1, field2: field_value2} - field1 (str): field name, default 'first_sequence' - field_value1 (str): a sentence - Example: - 'you are so handsome.' - - field2 (str): field name, default 'second_sequence' - field_value2 (str): a sentence - Example: - 'you are so beautiful.' - - Returns: - Dict[str, Any]: the preprocessed data - """ + feature = super().__call__(data) if isinstance(data, str): new_data = {self.first_sequence: data} elif isinstance(data, tuple): @@ -263,136 +188,55 @@ class SequenceClassificationPreprocessor(Preprocessor): @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.palm_text_gen_tokenizer) -class TextGenerationPreprocessor(Preprocessor): +class TextGenerationPreprocessor(NLPPreprocessorBase): def __init__(self, model_dir: str, tokenizer, *args, **kwargs): - """preprocess the data using the vocab.txt from the `model_dir` path - - Args: - model_dir (str): model path - """ - super().__init__(*args, **kwargs) - - self.model_dir: str = model_dir - self.first_sequence: str = kwargs.pop('first_sequence', - 'first_sequence') - self.second_sequence: str = kwargs.pop('second_sequence', - 'second_sequence') - self.sequence_length: int = kwargs.pop('sequence_length', 128) self.tokenizer = tokenizer + kwargs['truncation'] = True + kwargs['padding'] = 'max_length' + kwargs['return_tensors'] = 'pt' + kwargs['return_token_type_ids'] = False + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + super().__init__(model_dir, *args, **kwargs) - @type_assert(object, str) - def __call__(self, data: str) -> Dict[str, Any]: - """process the raw input data - - Args: - data (str): a sentence - Example: - 'you are so handsome.' 
- - Returns: - Dict[str, Any]: the preprocessed data - """ - import torch - - new_data = {self.first_sequence: data} - # preprocess the data for the model input - - rst = {'input_ids': [], 'attention_mask': []} - - max_seq_length = self.sequence_length - - text_a = new_data.get(self.first_sequence, None) - text_b = new_data.get(self.second_sequence, None) - feature = self.tokenizer( - text_a, - text_b, - padding='max_length', - truncation=True, - max_length=max_seq_length) - - rst['input_ids'].append(feature['input_ids']) - rst['attention_mask'].append(feature['attention_mask']) - return {k: torch.tensor(v) for k, v in rst.items()} + def build_tokenizer(self, model_dir): + return self.tokenizer @PREPROCESSORS.register_module(Fields.nlp) -class FillMaskPreprocessor(Preprocessor): +class FillMaskPreprocessor(NLPPreprocessorBase): def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data via the vocab.txt from the `model_dir` path - - Args: - model_dir (str): model path - """ - super().__init__(*args, **kwargs) - self.model_dir = model_dir - self.first_sequence: str = kwargs.pop('first_sequence', - 'first_sequence') - self.sequence_length = kwargs.pop('sequence_length', 128) - try: - from transformers import AutoTokenizer - self.tokenizer = AutoTokenizer.from_pretrained(model_dir) - except KeyError: - from sofa.utils.backend import AutoTokenizer - self.tokenizer = AutoTokenizer.from_pretrained( - model_dir, use_fast=False) - - @type_assert(object, str) - def __call__(self, data: str) -> Dict[str, Any]: - """process the raw input data - - Args: - data (str): a sentence - Example: - 'you are so handsome.' - - Returns: - Dict[str, Any]: the preprocessed data - """ - import torch - - new_data = {self.first_sequence: data} - # preprocess the data for the model input - - rst = {'input_ids': [], 'attention_mask': [], 'token_type_ids': []} - - max_seq_length = self.sequence_length - - text_a = new_data[self.first_sequence] - feature = self.tokenizer( - text_a, - padding='max_length', - truncation=True, - max_length=max_seq_length, - return_token_type_ids=True) - - rst['input_ids'].append(feature['input_ids']) - rst['attention_mask'].append(feature['attention_mask']) - rst['token_type_ids'].append(feature['token_type_ids']) - - return {k: torch.tensor(v) for k, v in rst.items()} + kwargs['truncation'] = True + kwargs['padding'] = 'max_length' + kwargs['return_tensors'] = 'pt' + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + kwargs['return_token_type_ids'] = True + super().__init__(model_dir, *args, **kwargs) + + def build_tokenizer(self, model_dir): + from ..utils.hub import get_model_type + model_type = get_model_type(model_dir) + if model_type in ['sbert', 'structbert', 'bert']: + from sofa import SbertTokenizer + return SbertTokenizer.from_pretrained(model_dir, use_fast=False) + elif model_type == 'veco': + from sofa import VecoTokenizer + return VecoTokenizer.from_pretrained(model_dir, use_fast=False) + else: + # TODO Only support veco & sbert + raise RuntimeError(f'Unsupported model type: {model_type}') @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.token_cls_tokenizer) -class TokenClassifcationPreprocessor(Preprocessor): +class TokenClassificationPreprocessor(NLPPreprocessorBase): def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data via the vocab.txt from the `model_dir` path - - Args: - model_dir (str): model path - """ - - super().__init__(*args, **kwargs) - - from sofa import SbertTokenizer - self.model_dir: str 
= model_dir - self.tokenizer = SbertTokenizer.from_pretrained(self.model_dir) + super().__init__(model_dir, *args, **kwargs) @type_assert(object, str) - def __call__(self, data: str) -> Dict[str, Any]: + def __call__(self, data: Union[str, Dict]) -> Dict[str, Any]: """process the raw input data Args: @@ -405,7 +249,8 @@ class TokenClassifcationPreprocessor(Preprocessor): """ # preprocess the data for the model input - + if isinstance(data, dict): + data = data[self.first_sequence] text = data.replace(' ', '').strip() tokens = [] for token in text: @@ -425,7 +270,7 @@ class TokenClassifcationPreprocessor(Preprocessor): @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer) -class ZeroShotClassificationPreprocessor(Preprocessor): +class ZeroShotClassificationPreprocessor(NLPPreprocessorBase): def __init__(self, model_dir: str, *args, **kwargs): """preprocess the data via the vocab.txt from the `model_dir` path @@ -433,16 +278,11 @@ class ZeroShotClassificationPreprocessor(Preprocessor): Args: model_dir (str): model path """ - - super().__init__(*args, **kwargs) - - from sofa import SbertTokenizer - self.model_dir: str = model_dir self.sequence_length = kwargs.pop('sequence_length', 512) - self.tokenizer = SbertTokenizer.from_pretrained(self.model_dir) + super().__init__(model_dir, *args, **kwargs) @type_assert(object, str) - def __call__(self, data: str, hypothesis_template: str, + def __call__(self, data, hypothesis_template: str, candidate_labels: list) -> Dict[str, Any]: """process the raw input data @@ -454,6 +294,9 @@ class ZeroShotClassificationPreprocessor(Preprocessor): Returns: Dict[str, Any]: the preprocessed data """ + if isinstance(data, dict): + data = data.get(self.first_sequence) + pairs = [[data, hypothesis_template.format(label)] for label in candidate_labels] diff --git a/modelscope/utils/hub.py b/modelscope/utils/hub.py index 3b7e80ef..f2a3c120 100644 --- a/modelscope/utils/hub.py +++ b/modelscope/utils/hub.py @@ -11,6 +11,9 @@ from modelscope.hub.file_download import model_file_download from modelscope.hub.snapshot_download import snapshot_download from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile +from .logger import get_logger + +logger = get_logger(__name__) def create_model_if_not_exist( @@ -67,3 +70,18 @@ def auto_load(model: Union[str, List[str]]): ] return model + + +def get_model_type(model_dir): + try: + configuration_file = osp.join(model_dir, ModelFile.CONFIGURATION) + config_file = osp.join(model_dir, 'config.json') + if osp.isfile(configuration_file): + cfg = Config.from_file(configuration_file) + return cfg.model.model_type if hasattr(cfg.model, 'model_type') and not hasattr(cfg.model, 'type') \ + else cfg.model.type + elif osp.isfile(config_file): + cfg = Config.from_file(config_file) + return cfg.model_type if hasattr(cfg, 'model_type') else None + except Exception as e: + logger.error(f'parse config file failed with error: {e}') diff --git a/tests/pipelines/test_sentence_similarity.py b/tests/pipelines/test_sentence_similarity.py index df38593f..02edb87f 100644 --- a/tests/pipelines/test_sentence_similarity.py +++ b/tests/pipelines/test_sentence_similarity.py @@ -6,7 +6,7 @@ from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.models.nlp import SbertForSentenceSimilarity from modelscope.pipelines import SentenceSimilarityPipeline, pipeline -from modelscope.preprocessors import SequenceClassificationPreprocessor 
+from modelscope.preprocessors import SentenceSimilarityPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level @@ -19,7 +19,7 @@ class SentenceSimilarityTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run(self): cache_path = snapshot_download(self.model_id) - tokenizer = SequenceClassificationPreprocessor(cache_path) + tokenizer = SentenceSimilarityPreprocessor(cache_path) model = SbertForSentenceSimilarity(cache_path, tokenizer=tokenizer) pipeline1 = SentenceSimilarityPipeline(model, preprocessor=tokenizer) pipeline2 = pipeline( @@ -35,7 +35,7 @@ class SentenceSimilarityTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = SequenceClassificationPreprocessor(model.model_dir) + tokenizer = SentenceSimilarityPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.sentence_similarity, model=model, diff --git a/tests/pipelines/test_word_segmentation.py b/tests/pipelines/test_word_segmentation.py index d33e4bdb..51f14011 100644 --- a/tests/pipelines/test_word_segmentation.py +++ b/tests/pipelines/test_word_segmentation.py @@ -6,7 +6,7 @@ from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.models.nlp import SbertForTokenClassification from modelscope.pipelines import WordSegmentationPipeline, pipeline -from modelscope.preprocessors import TokenClassifcationPreprocessor +from modelscope.preprocessors import TokenClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level @@ -18,7 +18,7 @@ class WordSegmentationTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): cache_path = snapshot_download(self.model_id) - tokenizer = TokenClassifcationPreprocessor(cache_path) + tokenizer = TokenClassificationPreprocessor(cache_path) model = SbertForTokenClassification(cache_path, tokenizer=tokenizer) pipeline1 = WordSegmentationPipeline(model, preprocessor=tokenizer) pipeline2 = pipeline( @@ -31,7 +31,7 @@ class WordSegmentationTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = TokenClassifcationPreprocessor(model.model_dir) + tokenizer = TokenClassificationPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.word_segmentation, model=model, preprocessor=tokenizer) print(pipeline_ins(input=self.sentence))
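
---

As a quick sanity check of the renames in this patch (`TokenClassifcationPreprocessor` → `TokenClassificationPreprocessor`, and the new `SentenceSimilarityPreprocessor` replacing `SequenceClassificationPreprocessor` in the sentence-similarity pipeline), the sketch below mirrors the usage shown in the updated tutorial and tests. It is only an illustration, not part of the patch: it assumes the patched `modelscope` package is installed, the word-segmentation model id is taken from the docs hunk above, and the sentence-similarity model id is a placeholder because this diff does not show the value of `self.model_id` used in `tests/pipelines/test_sentence_similarity.py`.

```python
# Usage sketch for the renamed preprocessors; names follow the diff above.
from modelscope.models import Model
from modelscope.pipelines import pipeline
from modelscope.preprocessors import (SentenceSimilarityPreprocessor,
                                      TokenClassificationPreprocessor)
from modelscope.utils.constant import Tasks

# Word segmentation: model id taken from the docs example in this patch.
ws_model = Model.from_pretrained(
    'damo/nlp_structbert_word-segmentation_chinese-base')
ws_tokenizer = TokenClassificationPreprocessor(ws_model.model_dir)
ws_pipeline = pipeline(
    task=Tasks.word_segmentation, model=ws_model, preprocessor=ws_tokenizer)

# Sentence similarity: '<sentence-similarity-model-id>' is a placeholder,
# since the concrete model id is not shown in this patch.
ss_model = Model.from_pretrained('<sentence-similarity-model-id>')
ss_tokenizer = SentenceSimilarityPreprocessor(ss_model.model_dir)
ss_pipeline = pipeline(
    task=Tasks.sentence_similarity, model=ss_model, preprocessor=ss_tokenizer)
```

Both pipelines can also be built without an explicit preprocessor; with this patch, `WordSegmentationPipeline` and `SentenceSimilarityPipeline` construct `TokenClassificationPreprocessor` and `SentenceSimilarityPreprocessor` from `model.model_dir` when `preprocessor` is `None`.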