Browse Source

[to #42322933] fix tokenizer for faq

多语言faq,Tokenizer新增类型判别
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10530690
master
wenshen.xws yingda.chen 3 years ago
parent
commit
2c994ed760
1 changed file with 9 additions and 1 deletion
  1. +9
    -1
      modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py

+ 9
- 1
modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py View File

@@ -18,11 +18,19 @@ class FaqQuestionAnsweringPreprocessor(NLPBasePreprocessor):
def __init__(self, model_dir: str, *args, **kwargs):
    """Build the FAQ question-answering preprocessor.

    Reads the ``preprocessor`` section of the model's configuration file
    to decide which tokenizer to load, so multilingual (XLM-RoBERTa) FAQ
    models are supported alongside the default BERT-based ones.

    Args:
        model_dir: Local model directory containing the tokenizer files
            and the configuration file (``ModelFile.CONFIGURATION``).
    """
    super(FaqQuestionAnsweringPreprocessor, self).__init__(
        model_dir, mode=ModeKeys.INFERENCE, **kwargs)

    preprocessor_config = Config.from_file(
        os.path.join(model_dir, ModelFile.CONFIGURATION)).get(
            ConfigFields.preprocessor, {})

    # Load exactly one tokenizer. The previous code built a BertTokenizer
    # unconditionally and then, for 'XLMRoberta' configs, built a second
    # tokenizer on top of it — a wasted (and potentially failing) load.
    # Imports are kept local so only the needed tokenizer class is pulled in.
    if preprocessor_config.get('tokenizer',
                               'BertTokenizer') == 'XLMRoberta':
        from transformers import XLMRobertaTokenizer
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_dir)
    else:
        from transformers import BertTokenizer
        self.tokenizer = BertTokenizer.from_pretrained(model_dir)

    # Maximum sequence length for tokenization; defaults to 50 when the
    # config does not specify 'max_seq_length'.
    self.MAX_LEN = preprocessor_config.get('max_seq_length', 50)
    # Label mapping is populated later; None until set elsewhere.
    self.label_dict = None



Loading…
Cancel
Save