Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9627744 (branch: master)

Changes: switch the default named-entity-recognition model to the RaNER checkpoint, convert offset_mapping tensors to plain lists in the pipeline postprocess, add a character-level (is_split_into_words) branch to NERPreprocessor, and lower two NER tests from test level 2 to level 1.
@@ -22,7 +22,7 @@ DEFAULT_MODEL_FOR_PIPELINE = {
      'damo/nlp_structbert_word-segmentation_chinese-base'),
     Tasks.named_entity_recognition:
     (Pipelines.named_entity_recognition,
-     'damo/nlp_transformercrf_named-entity-recognition_chinese-base-news'),
+     'damo/nlp_raner_named-entity-recognition_chinese-base-news'),
     Tasks.sentence_similarity:
     (Pipelines.sentence_similarity,
      'damo/nlp_structbert_sentence-similarity_chinese-base'),
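Review note: with this mapping change, a pipeline built without an explicit model now resolves to the RaNER checkpoint. A minimal usage sketch (assuming the standard modelscope factory imports; not part of this diff):

    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    # No `model=` argument: the task default above is consulted, which now
    # points at the RaNER checkpoint instead of the Transformer-CRF one.
    ner = pipeline(task=Tasks.named_entity_recognition)
    print(ner(input='这与温岭市新河镇的一个神秘的传说有关。'))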
@@ -42,7 +42,7 @@ class NamedEntityRecognitionPipeline(Pipeline):
     def postprocess(self, inputs: Dict[str, Any],
                     **postprocess_params) -> Dict[str, str]:
         text = inputs['text']
-        offset_mapping = inputs['offset_mapping']
+        offset_mapping = [x.cpu().tolist() for x in inputs['offset_mapping']]
         labels = [self.id2label[x] for x in inputs['predicts']]
         entities = []
         entity = {}
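Review note: the new list comprehension implies inputs['offset_mapping'] now arrives as torch tensors (e.g., per-token spans left on the model's device) rather than plain tuples. A small illustration of the conversion, with hypothetical tensor values:

    import torch

    # Hypothetical per-token (start, end) character spans, as the model
    # might hand them back after a forward pass.
    spans = [torch.tensor([0, 1]), torch.tensor([1, 2]), torch.tensor([2, 3])]

    # Same conversion as the patched postprocess: move to CPU and unwrap
    # into plain Python ints so `text[start:end]` slicing works cleanly.
    offset_mapping = [x.cpu().tolist() for x in spans]
    assert offset_mapping == [[0, 1], [1, 2], [2, 3]]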
@@ -483,6 +483,8 @@ class NERPreprocessor(Preprocessor):
         self.sequence_length = kwargs.pop('sequence_length', 512)
         self.tokenizer = AutoTokenizer.from_pretrained(
             model_dir, use_fast=True)
+        self.is_split_into_words = self.tokenizer.init_kwargs.get(
+            'is_split_into_words', False)

     @type_assert(object, str)
     def __call__(self, data: str) -> Dict[str, Any]:
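Review note: reading the flag from the tokenizer's saved config lets a model opt in to character-level preprocessing purely through its shipped files. A sketch of the lookup, assuming a fast Hugging Face tokenizer whose tokenizer_config.json carries the flag (the directory name is hypothetical):

    from transformers import AutoTokenizer

    model_dir = './ner_checkpoint'  # hypothetical local model directory
    tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True)

    # init_kwargs mirrors tokenizer_config.json; when the key is absent,
    # the preprocessor falls back to ordinary subword tokenization.
    is_split_into_words = tokenizer.init_kwargs.get('is_split_into_words', False)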
@@ -499,29 +501,51 @@ class NERPreprocessor(Preprocessor):

         # preprocess the data for the model input
         text = data
-        encodings = self.tokenizer(
-            text,
-            add_special_tokens=True,
-            padding=True,
-            truncation=True,
-            max_length=self.sequence_length,
-            return_offsets_mapping=True)
-        input_ids = encodings['input_ids']
-        attention_mask = encodings['attention_mask']
-        word_ids = encodings.word_ids()
-        label_mask = []
-        offset_mapping = []
-        for i in range(len(word_ids)):
-            if word_ids[i] is None:
-                label_mask.append(0)
-            elif word_ids[i] == word_ids[i - 1]:
-                label_mask.append(0)
-                offset_mapping[-1] = (offset_mapping[-1][0],
-                                      encodings['offset_mapping'][i][1])
-            else:
-                label_mask.append(1)
-                offset_mapping.append(encodings['offset_mapping'][i])
+        if self.is_split_into_words:
+            input_ids = []
+            label_mask = []
+            offset_mapping = []
+            for offset, token in enumerate(list(data)):
+                subtoken_ids = self.tokenizer.encode(
+                    token, add_special_tokens=False)
+                if len(subtoken_ids) == 0:
+                    subtoken_ids = [self.tokenizer.unk_token_id]
+                input_ids.extend(subtoken_ids)
+                label_mask.extend([1] + [0] * (len(subtoken_ids) - 1))
+                offset_mapping.extend([(offset, offset + 1)]
+                                      + [(offset + 1, offset + 1)]
+                                      * (len(subtoken_ids) - 1))
+            if len(input_ids) >= self.sequence_length - 2:
+                input_ids = input_ids[:self.sequence_length - 2]
+                label_mask = label_mask[:self.sequence_length - 2]
+                offset_mapping = offset_mapping[:self.sequence_length - 2]
+            input_ids = [self.tokenizer.cls_token_id
+                         ] + input_ids + [self.tokenizer.sep_token_id]
+            label_mask = [0] + label_mask + [0]
+            attention_mask = [1] * len(input_ids)
+        else:
+            encodings = self.tokenizer(
+                text,
+                add_special_tokens=True,
+                padding=True,
+                truncation=True,
+                max_length=self.sequence_length,
+                return_offsets_mapping=True)
+            input_ids = encodings['input_ids']
+            attention_mask = encodings['attention_mask']
+            word_ids = encodings.word_ids()
+            label_mask = []
+            offset_mapping = []
+            for i in range(len(word_ids)):
+                if word_ids[i] is None:
+                    label_mask.append(0)
+                elif word_ids[i] == word_ids[i - 1]:
+                    label_mask.append(0)
+                    offset_mapping[-1] = (offset_mapping[-1][0],
+                                          encodings['offset_mapping'][i][1])
+                else:
+                    label_mask.append(1)
+                    offset_mapping.append(encodings['offset_mapping'][i])
         return {
             'text': text,
             'input_ids': input_ids,
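Review note: the new branch encodes each character on its own, flags only a character's first sub-token in label_mask, substitutes [UNK] for unencodable characters, and reserves two slots for the special tokens before truncating. A self-contained sketch of the same logic outside the class (the checkpoint name is illustrative, not taken from this PR):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
    data, sequence_length = '温岭市', 512

    input_ids, label_mask, offset_mapping = [], [], []
    for offset, char in enumerate(data):
        sub = tokenizer.encode(char, add_special_tokens=False)
        if not sub:  # character the vocab cannot encode -> UNK placeholder
            sub = [tokenizer.unk_token_id]
        input_ids.extend(sub)
        # only the first sub-token of each character carries a label
        label_mask.extend([1] + [0] * (len(sub) - 1))
        # trailing sub-tokens get an empty (offset+1, offset+1) span
        offset_mapping.extend([(offset, offset + 1)]
                              + [(offset + 1, offset + 1)] * (len(sub) - 1))

    # keep room for [CLS]/[SEP], then wrap and build the attention mask
    input_ids = input_ids[:sequence_length - 2]
    label_mask = label_mask[:sequence_length - 2]
    offset_mapping = offset_mapping[:sequence_length - 2]
    input_ids = [tokenizer.cls_token_id] + input_ids + [tokenizer.sep_token_id]
    label_mask = [0] + label_mask + [0]
    attention_mask = [1] * len(input_ids)
    print(len(input_ids), label_mask)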
@@ -12,7 +12,7 @@ from modelscope.utils.test_utils import test_level

 class NamedEntityRecognitionTest(unittest.TestCase):
-    model_id = 'damo/nlp_transformercrf_named-entity-recognition_chinese-base-news'
+    model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news'
     sentence = '这与温岭市新河镇的一个神秘的传说有关。'

     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@@ -32,7 +32,7 @@ class NamedEntityRecognitionTest(unittest.TestCase):
         print()
         print(f'pipeline2: {pipeline2(input=self.sentence)}')

-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
         model = Model.from_pretrained(self.model_id)
         tokenizer = NERPreprocessor(model.model_dir)
@@ -42,7 +42,7 @@ class NamedEntityRecognitionTest(unittest.TestCase):
             preprocessor=tokenizer)
         print(pipeline_ins(input=self.sentence))

-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_model_name(self):
         pipeline_ins = pipeline(
             task=Tasks.named_entity_recognition, model=self.model_id)
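Review note: lowering the gate from test_level() >= 2 to >= 1 pulls these two cases into a more routine test pass. For reference, a minimal sketch of the guard pattern using the same helper the test file already imports (the class and test names here are illustrative):

    import unittest

    from modelscope.utils.test_utils import test_level

    class DemoTest(unittest.TestCase):
        # runs whenever the configured test level is at least 1; higher
        # levels continue to gate the less frequently run cases
        @unittest.skipUnless(test_level() >= 1,
                             'skip test in current test level')
        def test_something(self):
            self.assertTrue(True)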