From 83b0adf0a2391a8459b28685d843970fcdbcb310 Mon Sep 17 00:00:00 2001 From: pangda Date: Thu, 25 Aug 2022 23:04:14 +0800 Subject: [PATCH] [to #42322933] fix bug for multi-lang text MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 支持多语言tokenize(830模型) Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9900916 --- modelscope/preprocessors/nlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py index 25576667..222a219a 100644 --- a/modelscope/preprocessors/nlp.py +++ b/modelscope/preprocessors/nlp.py @@ -533,7 +533,7 @@ class NERPreprocessor(Preprocessor): self.model_dir: str = model_dir self.sequence_length = kwargs.pop('sequence_length', 512) self.tokenizer = AutoTokenizer.from_pretrained( - model_dir, use_fast=False) + model_dir, use_fast=True) self.is_split_into_words = self.tokenizer.init_kwargs.get( 'is_split_into_words', False)