Browse Source

Merge remote-tracking branch 'origin/master' into ofa/finetune

master
行嗔 3 years ago
parent
commit
44026e0bf8
46 changed files with 284 additions and 335 deletions
  1. +3
    -3
      modelscope/metainfo.py
  2. +0
    -3
      modelscope/models/cv/face_human_hand_detection/one_stage_detector.py
  3. +2
    -2
      modelscope/models/multi_modal/ofa/utils/constant.py
  4. +4
    -4
      modelscope/models/multi_modal/ofa_for_all_tasks.py
  5. +3
    -2
      modelscope/models/nlp/__init__.py
  6. +2
    -0
      modelscope/models/nlp/heads/infromation_extraction_head.py
  7. +2
    -0
      modelscope/models/nlp/task_models/information_extraction.py
  8. +5
    -5
      modelscope/models/nlp/text_ranking.py
  9. +2
    -2
      modelscope/msdatasets/task_datasets/__init__.py
  10. +8
    -8
      modelscope/msdatasets/task_datasets/text_ranking_dataset.py
  11. +2
    -2
      modelscope/outputs.py
  12. +1
    -1
      modelscope/pipeline_inputs.py
  13. +5
    -2
      modelscope/pipelines/builder.py
  14. +2
    -0
      modelscope/pipelines/cv/face_image_generation_pipeline.py
  15. +2
    -0
      modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py
  16. +1
    -1
      modelscope/pipelines/multi_modal/ocr_recognition_pipeline.py
  17. +2
    -2
      modelscope/pipelines/nlp/__init__.py
  18. +2
    -0
      modelscope/pipelines/nlp/information_extraction_pipeline.py
  19. +1
    -1
      modelscope/pipelines/nlp/summarization_pipeline.py
  20. +27
    -24
      modelscope/pipelines/nlp/table_question_answering_pipeline.py
  21. +5
    -5
      modelscope/pipelines/nlp/text_ranking_pipeline.py
  22. +2
    -2
      modelscope/preprocessors/__init__.py
  23. +4
    -4
      modelscope/preprocessors/multi_modal.py
  24. +2
    -2
      modelscope/preprocessors/nlp/__init__.py
  25. +26
    -29
      modelscope/preprocessors/nlp/nlp_base.py
  26. +1
    -1
      modelscope/preprocessors/star3/fields/database.py
  27. +17
    -14
      modelscope/preprocessors/star3/fields/schema_link.py
  28. +2
    -0
      modelscope/preprocessors/star3/table_question_answering_preprocessor.py
  29. +2
    -2
      modelscope/trainers/__init__.py
  30. +2
    -2
      modelscope/trainers/nlp/__init__.py
  31. +6
    -5
      modelscope/trainers/nlp/text_ranking_trainer.py
  32. +2
    -3
      modelscope/utils/constant.py
  33. +1
    -2
      modelscope/utils/registry.py
  34. +1
    -1
      tests/export/test_export_sbert_sequence_classification.py
  35. +2
    -2
      tests/msdatasets/test_ms_dataset.py
  36. +32
    -55
      tests/pipelines/test_automatic_speech_recognition.py
  37. +2
    -2
      tests/pipelines/test_gpt3_text_generation.py
  38. +3
    -3
      tests/pipelines/test_ofa_tasks.py
  39. +5
    -5
      tests/pipelines/test_relation_extraction.py
  40. +49
    -6
      tests/pipelines/test_table_question_answering.py
  41. +0
    -100
      tests/pipelines/test_text_classification.py
  42. +12
    -13
      tests/pipelines/test_text_ranking.py
  43. +2
    -1
      tests/trainers/test_finetune_sequence_classification.py
  44. +9
    -8
      tests/trainers/test_finetune_text_ranking.py
  45. +2
    -2
      tests/trainers/test_image_portrait_enhancement_trainer.py
  46. +17
    -4
      tests/trainers/test_trainer_with_nlp.py

+ 3
- 3
modelscope/metainfo.py View File

@@ -236,7 +236,7 @@ class Pipelines(object):
conversational_text_to_sql = 'conversational-text-to-sql'
table_question_answering_pipeline = 'table-question-answering-pipeline'
sentence_embedding = 'sentence-embedding'
passage_ranking = 'passage-ranking'
text_ranking = 'text-ranking'
relation_extraction = 'relation-extraction'
document_segmentation = 'document-segmentation'
feature_extraction = 'feature-extraction'
@@ -298,7 +298,7 @@ class Trainers(object):
dialog_intent_trainer = 'dialog-intent-trainer'
nlp_base_trainer = 'nlp-base-trainer'
nlp_veco_trainer = 'nlp-veco-trainer'
nlp_passage_ranking_trainer = 'nlp-passage-ranking-trainer'
nlp_text_ranking_trainer = 'nlp-text-ranking-trainer'

# audio trainers
speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k'
@@ -344,7 +344,7 @@ class Preprocessors(object):
zero_shot_cls_tokenizer = 'zero-shot-cls-tokenizer'
text_error_correction = 'text-error-correction'
sentence_embedding = 'sentence-embedding'
passage_ranking = 'passage-ranking'
text_ranking = 'text-ranking'
sequence_labeling_tokenizer = 'sequence-labeling-tokenizer'
word_segment_text_to_label_preprocessor = 'word-segment-text-to-label-preprocessor'
fill_mask = 'fill-mask'


+ 0
- 3
modelscope/models/cv/face_human_hand_detection/one_stage_detector.py View File

@@ -56,9 +56,6 @@ class OneStageDetector(nn.Module):

def inference(self, meta):
with torch.no_grad():
torch.cuda.synchronize()
preds = self(meta['img'])
torch.cuda.synchronize()
results = self.head.post_process(preds, meta)
torch.cuda.synchronize()
return results

+ 2
- 2
modelscope/models/multi_modal/ofa/utils/constant.py View File

@@ -3,9 +3,9 @@ from modelscope.outputs import OutputKeys
from modelscope.utils.constant import Tasks

OFA_TASK_KEY_MAPPING = {
Tasks.ofa_ocr_recognition: OutputKeys.TEXT,
Tasks.ocr_recognition: OutputKeys.TEXT,
Tasks.image_captioning: OutputKeys.CAPTION,
Tasks.summarization: OutputKeys.TEXT,
Tasks.text_summarization: OutputKeys.TEXT,
Tasks.visual_question_answering: OutputKeys.TEXT,
Tasks.visual_grounding: OutputKeys.BOXES,
Tasks.text_classification: OutputKeys.LABELS,


+ 4
- 4
modelscope/models/multi_modal/ofa_for_all_tasks.py View File

@@ -28,13 +28,13 @@ __all__ = ['OfaForAllTasks']


@MODELS.register_module(Tasks.image_captioning, module_name=Models.ofa)
@MODELS.register_module(Tasks.ofa_ocr_recognition, module_name=Models.ofa)
@MODELS.register_module(Tasks.ocr_recognition, module_name=Models.ofa)
@MODELS.register_module(Tasks.visual_grounding, module_name=Models.ofa)
@MODELS.register_module(
Tasks.visual_question_answering, module_name=Models.ofa)
@MODELS.register_module(Tasks.visual_entailment, module_name=Models.ofa)
@MODELS.register_module(Tasks.image_classification, module_name=Models.ofa)
@MODELS.register_module(Tasks.summarization, module_name=Models.ofa)
@MODELS.register_module(Tasks.text_summarization, module_name=Models.ofa)
@MODELS.register_module(Tasks.text_classification, module_name=Models.ofa)
class OfaForAllTasks(TorchModel):

@@ -98,9 +98,9 @@ class OfaForAllTasks(TorchModel):
'traverse': self._traverse_inference,
}
self.task_inference_mapping = {
Tasks.ofa_ocr_recognition: self._text_gen_inference,
Tasks.ocr_recognition: self._text_gen_inference,
Tasks.image_captioning: self._text_gen_inference,
Tasks.summarization: self._text_gen_inference,
Tasks.text_summarization: self._text_gen_inference,
Tasks.visual_grounding: self._visual_grounding_inference,
Tasks.visual_entailment: inference_d[self.gen_type],
Tasks.visual_question_answering: inference_d[self.gen_type],


+ 3
- 2
modelscope/models/nlp/__init__.py View File

@@ -34,8 +34,9 @@ if TYPE_CHECKING:
TaskModelForTextGeneration)
from .token_classification import SbertForTokenClassification
from .sentence_embedding import SentenceEmbedding
from .passage_ranking import PassageRanking
from .text_ranking import TextRanking
from .T5 import T5ForConditionalGeneration

else:
_import_structure = {
'backbones': ['SbertModel'],
@@ -75,7 +76,7 @@ else:
'token_classification': ['SbertForTokenClassification'],
'table_question_answering': ['TableQuestionAnswering'],
'sentence_embedding': ['SentenceEmbedding'],
'passage_ranking': ['PassageRanking'],
'text_ranking': ['TextRanking'],
'T5': ['T5ForConditionalGeneration'],
}



+ 2
- 0
modelscope/models/nlp/heads/infromation_extraction_head.py View File

@@ -10,6 +10,8 @@ from modelscope.utils.constant import Tasks

@HEADS.register_module(
Tasks.information_extraction, module_name=Heads.information_extraction)
@HEADS.register_module(
Tasks.relation_extraction, module_name=Heads.information_extraction)
class InformationExtractionHead(TorchHead):

def __init__(self, **kwargs):


+ 2
- 0
modelscope/models/nlp/task_models/information_extraction.py View File

@@ -16,6 +16,8 @@ __all__ = ['InformationExtractionModel']
@MODELS.register_module(
Tasks.information_extraction,
module_name=TaskModels.information_extraction)
@MODELS.register_module(
Tasks.relation_extraction, module_name=TaskModels.information_extraction)
class InformationExtractionModel(SingleBackboneTaskModelBase):

def __init__(self, model_dir: str, *args, **kwargs):


modelscope/models/nlp/passage_ranking.py → modelscope/models/nlp/text_ranking.py View File

@@ -13,18 +13,18 @@ from modelscope.models.nlp.structbert import SbertPreTrainedModel
from modelscope.outputs import OutputKeys
from modelscope.utils.constant import Tasks

__all__ = ['PassageRanking']
__all__ = ['TextRanking']


@MODELS.register_module(Tasks.passage_ranking, module_name=Models.bert)
class PassageRanking(SbertForSequenceClassification, SbertPreTrainedModel):
@MODELS.register_module(Tasks.text_ranking, module_name=Models.bert)
class TextRanking(SbertForSequenceClassification, SbertPreTrainedModel):
base_model_prefix: str = 'bert'
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r'position_ids']

def __init__(self, config, model_dir, *args, **kwargs):
if hasattr(config, 'base_model_prefix'):
PassageRanking.base_model_prefix = config.base_model_prefix
TextRanking.base_model_prefix = config.base_model_prefix
super().__init__(config, model_dir)
self.train_batch_size = kwargs.get('train_batch_size', 4)
self.register_buffer(
@@ -74,7 +74,7 @@ class PassageRanking(SbertForSequenceClassification, SbertPreTrainedModel):
num_labels = kwargs.get('num_labels', 1)
model_args = {} if num_labels is None else {'num_labels': num_labels}

return super(SbertPreTrainedModel, PassageRanking).from_pretrained(
return super(SbertPreTrainedModel, TextRanking).from_pretrained(
pretrained_model_name_or_path=kwargs.get('model_dir'),
model_dir=kwargs.get('model_dir'),
**model_args)

+ 2
- 2
modelscope/msdatasets/task_datasets/__init__.py View File

@@ -12,14 +12,14 @@ if TYPE_CHECKING:
from .movie_scene_segmentation import MovieSceneSegmentationDataset
from .video_summarization_dataset import VideoSummarizationDataset
from .image_inpainting import ImageInpaintingDataset
from .passage_ranking_dataset import PassageRankingDataset
from .text_ranking_dataset import TextRankingDataset

else:
_import_structure = {
'base': ['TaskDataset'],
'builder': ['TASK_DATASETS', 'build_task_dataset'],
'torch_base_dataset': ['TorchTaskDataset'],
'passage_ranking_dataset': ['PassageRankingDataset'],
'text_ranking_dataset': ['TextRankingDataset'],
'veco_dataset': ['VecoDataset'],
'image_instance_segmentation_coco_dataset':
['ImageInstanceSegmentationCocoDataset'],


modelscope/msdatasets/task_datasets/passage_ranking_dataset.py → modelscope/msdatasets/task_datasets/text_ranking_dataset.py View File

@@ -16,8 +16,8 @@ from .torch_base_dataset import TorchTaskDataset


@TASK_DATASETS.register_module(
group_key=Tasks.passage_ranking, module_name=Models.bert)
class PassageRankingDataset(TorchTaskDataset):
group_key=Tasks.text_ranking, module_name=Models.bert)
class TextRankingDataset(TorchTaskDataset):

def __init__(self,
datasets: Union[Any, List[Any]],
@@ -35,8 +35,8 @@ class PassageRankingDataset(TorchTaskDataset):
'positive_passages')
self.neg_sequence = self.dataset_config.get('neg_sequence',
'negative_passages')
self.passage_text_fileds = self.dataset_config.get(
'passage_text_fileds', ['title', 'text'])
self.text_fileds = self.dataset_config.get('text_fileds',
['title', 'text'])
self.qid_field = self.dataset_config.get('qid_field', 'query_id')
if mode == ModeKeys.TRAIN:
train_config = kwargs.get('train', {})
@@ -58,14 +58,14 @@ class PassageRankingDataset(TorchTaskDataset):

pos_sequences = group[self.pos_sequence]
pos_sequences = [
' '.join([ele[key] for key in self.passage_text_fileds])
' '.join([ele[key] for key in self.text_fileds])
for ele in pos_sequences
]
labels.extend([1] * len(pos_sequences))

neg_sequences = group[self.neg_sequence]
neg_sequences = [
' '.join([ele[key] for key in self.passage_text_fileds])
' '.join([ele[key] for key in self.text_fileds])
for ele in neg_sequences
]

@@ -88,13 +88,13 @@ class PassageRankingDataset(TorchTaskDataset):

pos_sequences = group[self.pos_sequence]
pos_sequences = [
' '.join([ele[key] for key in self.passage_text_fileds])
' '.join([ele[key] for key in self.text_fileds])
for ele in pos_sequences
]

neg_sequences = group[self.neg_sequence]
neg_sequences = [
' '.join([ele[key] for key in self.passage_text_fileds])
' '.join([ele[key] for key in self.text_fileds])
for ele in neg_sequences
]


+ 2
- 2
modelscope/outputs.py View File

@@ -506,7 +506,7 @@ TASK_OUTPUTS = {
# }
Tasks.text_error_correction: [OutputKeys.OUTPUT],
Tasks.sentence_embedding: [OutputKeys.TEXT_EMBEDDING, OutputKeys.SCORES],
Tasks.passage_ranking: [OutputKeys.SCORES],
Tasks.text_ranking: [OutputKeys.SCORES],

# text generation result for single sample
# {
@@ -661,7 +661,7 @@ TASK_OUTPUTS = {
# "caption": "this is an image caption text."
# }
Tasks.image_captioning: [OutputKeys.CAPTION],
Tasks.ofa_ocr_recognition: [OutputKeys.TEXT],
Tasks.ocr_recognition: [OutputKeys.TEXT],

# visual grounding result for single sample
# {


+ 1
- 1
modelscope/pipeline_inputs.py View File

@@ -162,7 +162,7 @@ TASK_INPUTS = {
'source_sentence': InputType.LIST,
'sentences_to_compare': InputType.LIST,
},
Tasks.passage_ranking: (InputType.TEXT, InputType.TEXT),
Tasks.text_ranking: (InputType.TEXT, InputType.TEXT),
Tasks.text_generation:
InputType.TEXT,
Tasks.fill_mask:


+ 5
- 2
modelscope/pipelines/builder.py View File

@@ -20,8 +20,8 @@ DEFAULT_MODEL_FOR_PIPELINE = {
Tasks.sentence_embedding:
(Pipelines.sentence_embedding,
'damo/nlp_corom_sentence-embedding_english-base'),
Tasks.passage_ranking: (Pipelines.passage_ranking,
'damo/nlp_corom_passage-ranking_english-base'),
Tasks.text_ranking: (Pipelines.text_ranking,
'damo/nlp_corom_passage-ranking_english-base'),
Tasks.word_segmentation:
(Pipelines.word_segmentation,
'damo/nlp_structbert_word-segmentation_chinese-base'),
@@ -31,6 +31,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
Tasks.named_entity_recognition:
(Pipelines.named_entity_recognition,
'damo/nlp_raner_named-entity-recognition_chinese-base-news'),
Tasks.relation_extraction:
(Pipelines.relation_extraction,
'damo/nlp_bert_relation-extraction_chinese-base'),
Tasks.information_extraction:
(Pipelines.relation_extraction,
'damo/nlp_bert_relation-extraction_chinese-base'),


+ 2
- 0
modelscope/pipelines/cv/face_image_generation_pipeline.py View File

@@ -61,6 +61,8 @@ class FaceImageGenerationPipeline(Pipeline):
return input

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
if isinstance(input, str):
input = int(input)
assert isinstance(input, int)
torch.manual_seed(input)
torch.cuda.manual_seed(input)


+ 2
- 0
modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py View File

@@ -11,6 +11,8 @@ from modelscope.utils.logger import get_logger
logger = get_logger()


@PIPELINES.register_module(
Tasks.image_text_retrieval, module_name=Pipelines.multi_modal_embedding)
@PIPELINES.register_module(
Tasks.multi_modal_embedding, module_name=Pipelines.multi_modal_embedding)
class MultiModalEmbeddingPipeline(Pipeline):


+ 1
- 1
modelscope/pipelines/multi_modal/ocr_recognition_pipeline.py View File

@@ -16,7 +16,7 @@ logger = get_logger()


@PIPELINES.register_module(
Tasks.ofa_ocr_recognition, module_name=Pipelines.ofa_ocr_recognition)
Tasks.ocr_recognition, module_name=Pipelines.ofa_ocr_recognition)
class OcrRecognitionPipeline(Pipeline):

def __init__(self,


+ 2
- 2
modelscope/pipelines/nlp/__init__.py View File

@@ -17,7 +17,7 @@ if TYPE_CHECKING:
from .fill_mask_ponet_pipeline import FillMaskPonetPipeline
from .information_extraction_pipeline import InformationExtractionPipeline
from .named_entity_recognition_pipeline import NamedEntityRecognitionPipeline
from .passage_ranking_pipeline import PassageRankingPipeline
from .text_ranking_pipeline import TextRankingPipeline
from .sentence_embedding_pipeline import SentenceEmbeddingPipeline
from .sequence_classification_pipeline import SequenceClassificationPipeline
from .summarization_pipeline import SummarizationPipeline
@@ -51,7 +51,7 @@ else:
'information_extraction_pipeline': ['InformationExtractionPipeline'],
'named_entity_recognition_pipeline':
['NamedEntityRecognitionPipeline'],
'passage_ranking_pipeline': ['PassageRankingPipeline'],
'text_ranking_pipeline': ['TextRankingPipeline'],
'sentence_embedding_pipeline': ['SentenceEmbeddingPipeline'],
'sequence_classification_pipeline': ['SequenceClassificationPipeline'],
'summarization_pipeline': ['SummarizationPipeline'],


+ 2
- 0
modelscope/pipelines/nlp/information_extraction_pipeline.py View File

@@ -17,6 +17,8 @@ __all__ = ['InformationExtractionPipeline']

@PIPELINES.register_module(
Tasks.information_extraction, module_name=Pipelines.relation_extraction)
@PIPELINES.register_module(
Tasks.relation_extraction, module_name=Pipelines.relation_extraction)
class InformationExtractionPipeline(Pipeline):

def __init__(self,


+ 1
- 1
modelscope/pipelines/nlp/summarization_pipeline.py View File

@@ -13,7 +13,7 @@ logger = get_logger()


@PIPELINES.register_module(
Tasks.summarization, module_name=Pipelines.text_generation)
Tasks.text_summarization, module_name=Pipelines.text_generation)
class SummarizationPipeline(Pipeline):

def __init__(self,


+ 27
- 24
modelscope/pipelines/nlp/table_question_answering_pipeline.py View File

@@ -72,6 +72,7 @@ class TableQuestionAnsweringPipeline(Pipeline):
action = self.action_ops[result['action']]
headers = table['header_name']
current_sql = result['sql']
current_sql['from'] = [table['table_id']]

if history_sql is None:
return current_sql
@@ -216,10 +217,11 @@ class TableQuestionAnsweringPipeline(Pipeline):
else:
return current_sql

def sql_dict_to_str(self, result, table):
def sql_dict_to_str(self, result, tables):
"""
convert sql struct to string
"""
table = tables[result['sql']['from'][0]]
header_names = table['header_name'] + ['空列']
header_ids = table['header_id'] + ['null']
sql = result['sql']
@@ -279,42 +281,43 @@ class TableQuestionAnsweringPipeline(Pipeline):
"""
result = inputs['result']
history_sql = inputs['history_sql']
result['sql'] = self.post_process_multi_turn(
history_sql=history_sql,
result=result,
table=self.db.tables[result['table_id']])
result['sql']['from'] = [result['table_id']]
sql = self.sql_dict_to_str(
result=result, table=self.db.tables[result['table_id']])
try:
result['sql'] = self.post_process_multi_turn(
history_sql=history_sql,
result=result,
table=self.db.tables[result['table_id']])
except Exception:
result['sql'] = history_sql
sql = self.sql_dict_to_str(result=result, tables=self.db.tables)

# add sqlite
if self.db.is_use_sqlite:
try:
cursor = self.db.connection_obj.cursor().execute(sql.query)
names = [{
'name':
description[0],
'label':
self.db.tables[result['table_id']]['headerid2name'].get(
description[0], description[0])
} for description in cursor.description]
cells = []
header_ids, header_names = [], []
for description in cursor.description:
header_ids.append(self.db.tables[result['table_id']]
['headerid2name'].get(
description[0], description[0]))
header_names.append(description[0])
rows = []
for res in cursor.fetchall():
row = {}
for name, cell in zip(names, res):
row[name['name']] = cell
cells.append(row)
tabledata = {'headers': names, 'cells': cells}
rows.append(list(res))
tabledata = {
'header_id': header_ids,
'header_name': header_names,
'rows': rows
}
except Exception:
tabledata = {'headers': [], 'cells': []}
tabledata = {'header_id': [], 'header_name': [], 'rows': []}
else:
tabledata = {'headers': [], 'cells': []}
tabledata = {'header_id': [], 'header_name': [], 'rows': []}

output = {
OutputKeys.SQL_STRING: sql.string,
OutputKeys.SQL_QUERY: sql.query,
OutputKeys.HISTORY: result['sql'],
OutputKeys.QUERT_RESULT: json.dumps(tabledata, ensure_ascii=False),
OutputKeys.QUERT_RESULT: tabledata,
}

return output


modelscope/pipelines/nlp/passage_ranking_pipeline.py → modelscope/pipelines/nlp/text_ranking_pipeline.py View File

@@ -9,15 +9,15 @@ from modelscope.models import Model
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import PassageRankingPreprocessor, Preprocessor
from modelscope.preprocessors import Preprocessor, TextRankingPreprocessor
from modelscope.utils.constant import Tasks

__all__ = ['PassageRankingPipeline']
__all__ = ['TextRankingPipeline']


@PIPELINES.register_module(
Tasks.passage_ranking, module_name=Pipelines.passage_ranking)
class PassageRankingPipeline(Pipeline):
Tasks.text_ranking, module_name=Pipelines.text_ranking)
class TextRankingPipeline(Pipeline):

def __init__(self,
model: Union[Model, str],
@@ -36,7 +36,7 @@ class PassageRankingPipeline(Pipeline):
Model) else Model.from_pretrained(model)

if preprocessor is None:
preprocessor = PassageRankingPreprocessor(
preprocessor = TextRankingPreprocessor(
model.model_dir if isinstance(model, Model) else model,
sequence_length=kwargs.pop('sequence_length', 128))
model.eval()

+ 2
- 2
modelscope/preprocessors/__init__.py View File

@@ -21,7 +21,7 @@ if TYPE_CHECKING:
FillMaskPoNetPreprocessor,
NLPPreprocessor,
NLPTokenizerPreprocessorBase,
PassageRankingPreprocessor,
TextRankingPreprocessor,
RelationExtractionPreprocessor,
SentenceEmbeddingPreprocessor,
SequenceClassificationPreprocessor,
@@ -62,7 +62,7 @@ else:
'FillMaskPoNetPreprocessor',
'NLPPreprocessor',
'NLPTokenizerPreprocessorBase',
'PassageRankingPreprocessor',
'TextRankingPreprocessor',
'RelationExtractionPreprocessor',
'SentenceEmbeddingPreprocessor',
'SequenceClassificationPreprocessor',


+ 4
- 4
modelscope/preprocessors/multi_modal.py View File

@@ -40,7 +40,7 @@ class OfaPreprocessor(Preprocessor):
"""
super().__init__(*args, **kwargs)
preprocess_mapping = {
Tasks.ofa_ocr_recognition: OfaOcrRecognitionPreprocessor,
Tasks.ocr_recognition: OfaOcrRecognitionPreprocessor,
Tasks.image_captioning: OfaImageCaptioningPreprocessor,
Tasks.visual_grounding: OfaVisualGroundingPreprocessor,
Tasks.visual_question_answering:
@@ -48,14 +48,14 @@ class OfaPreprocessor(Preprocessor):
Tasks.visual_entailment: OfaVisualEntailmentPreprocessor,
Tasks.image_classification: OfaImageClassificationPreprocessor,
Tasks.text_classification: OfaTextClassificationPreprocessor,
Tasks.summarization: OfaSummarizationPreprocessor,
Tasks.text_summarization: OfaSummarizationPreprocessor,
Tasks.text_to_image_synthesis: OfaTextToImageSynthesisPreprocessor
}
input_key_mapping = {
Tasks.ofa_ocr_recognition: ['image'],
Tasks.ocr_recognition: ['image'],
Tasks.image_captioning: ['image'],
Tasks.image_classification: ['image'],
Tasks.summarization: ['text'],
Tasks.text_summarization: ['text'],
Tasks.text_classification: ['text', 'text2'],
Tasks.visual_grounding: ['image', 'text'],
Tasks.visual_question_answering: ['image', 'text'],


+ 2
- 2
modelscope/preprocessors/nlp/__init__.py View File

@@ -11,7 +11,7 @@ if TYPE_CHECKING:
FillMaskPoNetPreprocessor,
NLPPreprocessor,
NLPTokenizerPreprocessorBase,
PassageRankingPreprocessor,
TextRankingPreprocessor,
RelationExtractionPreprocessor,
SentenceEmbeddingPreprocessor,
SequenceClassificationPreprocessor,
@@ -33,7 +33,7 @@ else:
'FillMaskPoNetPreprocessor',
'NLPPreprocessor',
'NLPTokenizerPreprocessorBase',
'PassageRankingPreprocessor',
'TextRankingPreprocessor',
'RelationExtractionPreprocessor',
'SentenceEmbeddingPreprocessor',
'SequenceClassificationPreprocessor',


+ 26
- 29
modelscope/preprocessors/nlp/nlp_base.py View File

@@ -1,9 +1,10 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import os.path as osp
import re
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
from typing import Any, Dict, Optional, Tuple, Union

import json
import numpy as np
import sentencepiece as spm
import torch
@@ -13,8 +14,7 @@ from modelscope.metainfo import Models, Preprocessors
from modelscope.outputs import OutputKeys
from modelscope.preprocessors.base import Preprocessor
from modelscope.preprocessors.builder import PREPROCESSORS
from modelscope.utils.config import (Config, ConfigFields,
use_task_specific_params)
from modelscope.utils.config import Config, ConfigFields
from modelscope.utils.constant import Fields, InputFields, ModeKeys, ModelFile
from modelscope.utils.hub import get_model_type, parse_label_mapping
from modelscope.utils.logger import get_logger
@@ -29,7 +29,7 @@ __all__ = [
'NLPPreprocessor',
'FillMaskPoNetPreprocessor',
'NLPTokenizerPreprocessorBase',
'PassageRankingPreprocessor',
'TextRankingPreprocessor',
'RelationExtractionPreprocessor',
'SentenceEmbeddingPreprocessor',
'SequenceClassificationPreprocessor',
@@ -83,6 +83,15 @@ class NLPTokenizerPreprocessorBase(Preprocessor):

self._mode = mode
self.label = kwargs.pop('label', OutputKeys.LABEL)
self.use_fast = kwargs.pop('use_fast', None)
if self.use_fast is None and os.path.isfile(
os.path.join(model_dir, 'tokenizer_config.json')):
with open(os.path.join(model_dir, 'tokenizer_config.json'),
'r') as f:
json_config = json.load(f)
self.use_fast = json_config.get('use_fast')
self.use_fast = False if self.use_fast is None else self.use_fast

self.label2id = None
if 'label2id' in kwargs:
self.label2id = kwargs.pop('label2id')
@@ -118,32 +127,23 @@ class NLPTokenizerPreprocessorBase(Preprocessor):
if model_type in (Models.structbert, Models.gpt3, Models.palm,
Models.plug):
from modelscope.models.nlp.structbert import SbertTokenizer, SbertTokenizerFast
return SbertTokenizer.from_pretrained(
model_dir
) if self._mode == ModeKeys.INFERENCE else SbertTokenizerFast.from_pretrained(
model_dir)
tokenizer = SbertTokenizerFast if self.use_fast else SbertTokenizer
return tokenizer.from_pretrained(model_dir)
elif model_type == Models.veco:
from modelscope.models.nlp.veco import VecoTokenizer, VecoTokenizerFast
return VecoTokenizer.from_pretrained(
model_dir
) if self._mode == ModeKeys.INFERENCE else VecoTokenizerFast.from_pretrained(
model_dir)
tokenizer = VecoTokenizerFast if self.use_fast else VecoTokenizer
return tokenizer.from_pretrained(model_dir)
elif model_type == Models.deberta_v2:
from modelscope.models.nlp.deberta_v2 import DebertaV2Tokenizer, DebertaV2TokenizerFast
return DebertaV2Tokenizer.from_pretrained(
model_dir
) if self._mode == ModeKeys.INFERENCE else DebertaV2TokenizerFast.from_pretrained(
model_dir)
tokenizer = DebertaV2TokenizerFast if self.use_fast else DebertaV2Tokenizer
return tokenizer.from_pretrained(model_dir)
elif not self.is_transformer_based_model:
from transformers import BertTokenizer, BertTokenizerFast
return BertTokenizer.from_pretrained(
model_dir
) if self._mode == ModeKeys.INFERENCE else BertTokenizerFast.from_pretrained(
model_dir)
tokenizer = BertTokenizerFast if self.use_fast else BertTokenizer
return tokenizer.from_pretrained(model_dir)
else:
return AutoTokenizer.from_pretrained(
model_dir,
use_fast=False if self._mode == ModeKeys.INFERENCE else True)
model_dir, use_fast=self.use_fast)

def __call__(self, data: Union[str, Tuple, Dict]) -> Dict[str, Any]:
"""process the raw input data
@@ -245,9 +245,9 @@ class NLPPreprocessor(NLPTokenizerPreprocessorBase):


@PREPROCESSORS.register_module(
Fields.nlp, module_name=Preprocessors.passage_ranking)
class PassageRankingPreprocessor(NLPTokenizerPreprocessorBase):
"""The tokenizer preprocessor used in passage ranking model.
Fields.nlp, module_name=Preprocessors.text_ranking)
class TextRankingPreprocessor(NLPTokenizerPreprocessorBase):
"""The tokenizer preprocessor used in text-ranking model.
"""

def __init__(self,
@@ -593,9 +593,6 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase):
else:
self.is_split_into_words = self.tokenizer.init_kwargs.get(
'is_split_into_words', False)
if 'label2id' in kwargs:
kwargs.pop('label2id')
self.tokenize_kwargs = kwargs

@type_assert(object, str)
def __call__(self, data: str) -> Dict[str, Any]:


+ 1
- 1
modelscope/preprocessors/star3/fields/database.py View File

@@ -13,7 +13,7 @@ class Database:
tokenizer,
table_file_path,
syn_dict_file_path,
is_use_sqlite=False):
is_use_sqlite=True):
self.tokenizer = tokenizer
self.is_use_sqlite = is_use_sqlite
if self.is_use_sqlite:


+ 17
- 14
modelscope/preprocessors/star3/fields/schema_link.py View File

@@ -293,6 +293,7 @@ class SchemaLinker:
nlu_t,
tables,
col_syn_dict,
table_id=None,
history_sql=None):
"""
get linking between question and schema column
@@ -300,6 +301,9 @@ class SchemaLinker:
typeinfos = []
numbers = re.findall(r'[-]?\d*\.\d+|[-]?\d+|\d+', nlu)

if table_id is not None and table_id in tables:
tables = {table_id: tables[table_id]}

# search schema link in every table
search_result_list = []
for tablename in tables:
@@ -411,26 +415,25 @@ class SchemaLinker:
# get the match score of each table
match_score = self.get_table_match_score(nlu_t, schema_link)

# cal table_score
if history_sql is not None and 'from' in history_sql:
table_score = int(table['table_id'] == history_sql['from'][0])
else:
table_score = 0

search_result = {
'table_id':
table['table_id'],
'question_knowledge':
final_question,
'header_knowledge':
final_header,
'schema_link':
schema_link,
'match_score':
match_score,
'table_score':
int(table['table_id'] == history_sql['from'][0])
if history_sql is not None else 0
'table_id': table['table_id'],
'question_knowledge': final_question,
'header_knowledge': final_header,
'schema_link': schema_link,
'match_score': match_score,
'table_score': table_score
}
search_result_list.append(search_result)

search_result_list = sorted(
search_result_list,
key=lambda x: (x['match_score'], x['table_score']),
reverse=True)[0:4]
reverse=True)[0:1]

return search_result_list

+ 2
- 0
modelscope/preprocessors/star3/table_question_answering_preprocessor.py View File

@@ -95,6 +95,7 @@ class TableQuestionAnsweringPreprocessor(Preprocessor):

# tokenize question
question = data['question']
table_id = data.get('table_id', None)
history_sql = data.get('history_sql', None)
nlu = question.lower()
nlu_t = self.tokenizer.tokenize(nlu)
@@ -106,6 +107,7 @@ class TableQuestionAnsweringPreprocessor(Preprocessor):
nlu_t=nlu_t,
tables=self.db.tables,
col_syn_dict=self.db.syn_dict,
table_id=table_id,
history_sql=history_sql)

# collect data


+ 2
- 2
modelscope/trainers/__init__.py View File

@@ -11,7 +11,7 @@ if TYPE_CHECKING:
ImagePortraitEnhancementTrainer,
MovieSceneSegmentationTrainer, ImageInpaintingTrainer)
from .multi_modal import CLIPTrainer
from .nlp import SequenceClassificationTrainer, PassageRankingTrainer
from .nlp import SequenceClassificationTrainer, TextRankingTrainer
from .nlp_trainer import NlpEpochBasedTrainer, VecoTrainer
from .trainer import EpochBasedTrainer

@@ -26,7 +26,7 @@ else:
'ImageInpaintingTrainer'
],
'multi_modal': ['CLIPTrainer'],
'nlp': ['SequenceClassificationTrainer', 'PassageRankingTrainer'],
'nlp': ['SequenceClassificationTrainer', 'TextRankingTrainer'],
'nlp_trainer': ['NlpEpochBasedTrainer', 'VecoTrainer'],
'trainer': ['EpochBasedTrainer']
}


+ 2
- 2
modelscope/trainers/nlp/__init__.py View File

@@ -6,12 +6,12 @@ from modelscope.utils.import_utils import LazyImportModule
if TYPE_CHECKING:
from .sequence_classification_trainer import SequenceClassificationTrainer
from .csanmt_translation_trainer import CsanmtTranslationTrainer
from .passage_ranking_trainer import PassageRankingTranier
from .text_ranking_trainer import TextRankingTranier
else:
_import_structure = {
'sequence_classification_trainer': ['SequenceClassificationTrainer'],
'csanmt_translation_trainer': ['CsanmtTranslationTrainer'],
'passage_ranking_trainer': ['PassageRankingTrainer']
'text_ranking_trainer': ['TextRankingTrainer']
}

import sys


modelscope/trainers/nlp/passage_ranking_trainer.py → modelscope/trainers/nlp/text_ranking_trainer.py View File

@@ -8,6 +8,7 @@ import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

from modelscope.metainfo import Trainers
from modelscope.models.base import Model, TorchModel
@@ -42,8 +43,8 @@ class GroupCollator():
return batch


@TRAINERS.register_module(module_name=Trainers.nlp_passage_ranking_trainer)
class PassageRankingTrainer(NlpEpochBasedTrainer):
@TRAINERS.register_module(module_name=Trainers.nlp_text_ranking_trainer)
class TextRankingTrainer(NlpEpochBasedTrainer):

def __init__(
self,
@@ -117,7 +118,7 @@ class PassageRankingTrainer(NlpEpochBasedTrainer):
Example:
{"accuracy": 0.5091743119266054, "f1": 0.673780487804878}
"""
from modelscope.models.nlp import PassageRanking
from modelscope.models.nlp import TextRanking
# get the raw online dataset
self.eval_dataloader = self._build_dataloader_with_dataset(
self.eval_dataset,
@@ -126,7 +127,7 @@ class PassageRankingTrainer(NlpEpochBasedTrainer):
# generate a standard dataloader
# generate a model
if checkpoint_path is not None:
model = PassageRanking.from_pretrained(checkpoint_path)
model = TextRanking.from_pretrained(checkpoint_path)
else:
model = self.model

@@ -141,7 +142,7 @@ class PassageRankingTrainer(NlpEpochBasedTrainer):
total_spent_time = 0.0
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model.to(device)
for _step, batch in enumerate(self.eval_dataloader):
for _step, batch in enumerate(tqdm(self.eval_dataloader)):
try:
batch = {
key:

+ 2
- 3
modelscope/utils/constant.py View File

@@ -103,7 +103,7 @@ class NLPTasks(object):
sentence_similarity = 'sentence-similarity'
text_classification = 'text-classification'
sentence_embedding = 'sentence-embedding'
passage_ranking = 'passage-ranking'
text_ranking = 'text-ranking'
relation_extraction = 'relation-extraction'
zero_shot = 'zero-shot'
translation = 'translation'
@@ -117,7 +117,7 @@ class NLPTasks(object):
table_question_answering = 'table-question-answering'
sentence_embedding = 'sentence-embedding'
fill_mask = 'fill-mask'
summarization = 'summarization'
text_summarization = 'text-summarization'
question_answering = 'question-answering'
zero_shot_classification = 'zero-shot-classification'
backbone = 'backbone'
@@ -151,7 +151,6 @@ class MultiModalTasks(object):
visual_entailment = 'visual-entailment'
video_multi_modal_embedding = 'video-multi-modal-embedding'
image_text_retrieval = 'image-text-retrieval'
ofa_ocr_recognition = 'ofa-ocr-recognition'


class TasksIODescriptions(object):


+ 1
- 2
modelscope/utils/registry.py View File

@@ -196,8 +196,7 @@ def build_from_cfg(cfg,
raise KeyError(
f'{obj_type} is not in the {registry.name}'
f' registry group {group_key}. Please make'
f' sure the correct version of 1qqQModelScope library is used.'
)
f' sure the correct version of ModelScope library is used.')
obj_cls.group_key = group_key
elif inspect.isclass(obj_type) or inspect.isfunction(obj_type):
obj_cls = obj_type


+ 1
- 1
tests/export/test_export_sbert_sequence_classification.py View File

@@ -22,7 +22,7 @@ class TestExportSbertSequenceClassification(unittest.TestCase):
shutil.rmtree(self.tmp_dir)
super().tearDown()

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
@unittest.skip
def test_export_sbert_sequence_classification(self):
model = Model.from_pretrained(self.model_id)
print(


+ 2
- 2
tests/msdatasets/test_ms_dataset.py View File

@@ -71,7 +71,7 @@ class MsDatasetTest(unittest.TestCase):
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
@require_torch
def test_to_torch_dataset_text(self):
model_id = 'damo/bert-base-sst2'
model_id = 'damo/nlp_structbert_sentence-similarity_chinese-tiny'
nlp_model = Model.from_pretrained(model_id)
preprocessor = SequenceClassificationPreprocessor(
nlp_model.model_dir,
@@ -93,7 +93,7 @@ class MsDatasetTest(unittest.TestCase):
def test_to_tf_dataset_text(self):
import tensorflow as tf
tf.compat.v1.enable_eager_execution()
model_id = 'damo/bert-base-sst2'
model_id = 'damo/nlp_structbert_sentence-similarity_chinese-tiny'
nlp_model = Model.from_pretrained(model_id)
preprocessor = SequenceClassificationPreprocessor(
nlp_model.model_dir,


+ 32
- 55
tests/pipelines/test_automatic_speech_recognition.py View File

@@ -80,164 +80,141 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase,

all_models_info = [
{
'model_group': 'damo',
'model_id':
'speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1',
'damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1',
'wav_path': 'data/test/audios/asr_example.wav'
},
{
'model_group': 'damo',
'model_id': 'speech_paraformer_asr_nat-aishell1-pytorch',
'model_id': 'damo/speech_paraformer_asr_nat-aishell1-pytorch',
'wav_path': 'data/test/audios/asr_example.wav'
},
{
'model_id': 'damo/speech_paraformer_asr_nat-aishell2-pytorch',
'wav_path': 'data/test/audios/asr_example.wav'
},
{
'model_group': 'damo',
'model_id':
'speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1',
'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1',
'wav_path': 'data/test/audios/asr_example.wav'
},
{
'model_group': 'damo',
'model_id':
'speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1',
'damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1',
'wav_path': 'data/test/audios/asr_example_8K.wav'
},
{
'model_group': 'damo',
'model_id':
'speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-online',
'damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-online',
'wav_path': 'data/test/audios/asr_example.wav'
},
{
'model_group': 'damo',
'model_id':
'speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline',
'damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline',
'wav_path': 'data/test/audios/asr_example.wav'
},
{
'model_group': 'damo',
'model_id':
'speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-online',
'damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-online',
'wav_path': 'data/test/audios/asr_example_8K.wav'
},
{
'model_group': 'damo',
'model_id':
'speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline',
'damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline',
'wav_path': 'data/test/audios/asr_example_8K.wav'
},
{
'model_group': 'damo',
'model_id':
'speech_UniASR-large_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline',
'damo/speech_UniASR-large_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline',
'wav_path': 'data/test/audios/asr_example.wav'
},
{
'model_group': 'damo',
'model_id':
'speech_UniASR_asr_2pass-cn-en-moe-16k-vocab8358-tensorflow1-online',
'damo/speech_UniASR_asr_2pass-cn-en-moe-16k-vocab8358-tensorflow1-online',
'wav_path': 'data/test/audios/asr_example_cn_en.wav'
},
{
'model_group': 'damo',
'model_id':
'speech_UniASR_asr_2pass-cn-en-moe-16k-vocab8358-tensorflow1-offline',
'damo/speech_UniASR_asr_2pass-cn-en-moe-16k-vocab8358-tensorflow1-offline',
'wav_path': 'data/test/audios/asr_example_cn_en.wav'
},
{
'model_group': 'damo',
'model_id':
'speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-online',
'damo/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-online',
'wav_path': 'data/test/audios/asr_example_cn_dialect.wav'
},
{
'model_group': 'damo',
'model_id':
'speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline',
'damo/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline',
'wav_path': 'data/test/audios/asr_example_cn_dialect.wav'
},
{
'model_group': 'damo',
'model_id':
'speech_paraformer_asr_nat-zh-cn-16k-common-vocab3444-tensorflow1-online',
'damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab3444-tensorflow1-online',
'wav_path': 'data/test/audios/asr_example.wav'
},
{
'model_group': 'damo',
'model_id':
'speech_paraformer_asr_nat-zh-cn-8k-common-vocab3444-tensorflow1-online',
'damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab3444-tensorflow1-online',
'wav_path': 'data/test/audios/asr_example_8K.wav'
},
{
'model_group': 'damo',
'model_id':
'speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline',
'damo/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline',
'wav_path': 'data/test/audios/asr_example_en.wav'
},
{
'model_group': 'damo',
'model_id':
'speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-online',
'damo/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-online',
'wav_path': 'data/test/audios/asr_example_en.wav'
},
{
'model_group': 'damo',
'model_id':
'speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline',
'damo/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline',
'wav_path': 'data/test/audios/asr_example_ru.wav'
},
{
'model_group': 'damo',
'model_id':
'speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-online',
'damo/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-online',
'wav_path': 'data/test/audios/asr_example_ru.wav'
},
{
'model_group': 'damo',
'model_id':
'speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline',
'damo/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline',
'wav_path': 'data/test/audios/asr_example_es.wav'
},
{
'model_group': 'damo',
'model_id':
'speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-online',
'damo/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-online',
'wav_path': 'data/test/audios/asr_example_es.wav'
},
{
'model_group': 'damo',
'model_id':
'speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline',
'damo/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline',
'wav_path': 'data/test/audios/asr_example_ko.wav'
},
{
'model_group': 'damo',
'model_id':
'speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-online',
'damo/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-online',
'wav_path': 'data/test/audios/asr_example_ko.wav'
},
{
'model_group': 'damo',
'model_id':
'speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-online',
'damo/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-online',
'wav_path': 'data/test/audios/asr_example_ja.wav'
},
{
'model_group': 'damo',
'model_id':
'speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline',
'damo/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline',
'wav_path': 'data/test/audios/asr_example_ja.wav'
},
{
'model_group': 'damo',
'model_id':
'speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-online',
'damo/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-online',
'wav_path': 'data/test/audios/asr_example_id.wav'
},
{
'model_group': 'damo',
'model_id':
'speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline',
'damo/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline',
'wav_path': 'data/test/audios/asr_example_id.wav'
},
]
@@ -404,7 +381,7 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase,
logger.info('Run ASR test with all models')

for item in self.all_models_info:
model_id = item['model_group'] + '/' + item['model_id']
model_id = item['model_id']
wav_path = item['wav_path']
rec_result = self.run_pipeline(
model_id=model_id, audio_in=wav_path)


+ 2
- 2
tests/pipelines/test_gpt3_text_generation.py View File

@@ -17,12 +17,12 @@ class TextGPT3GenerationTest(unittest.TestCase):
self.model_dir_13B = snapshot_download(self.model_id_13B)
self.input = '好的'

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@unittest.skip('distributed gpt3 1.3B, skipped')
def test_gpt3_1_3B(self):
pipe = pipeline(Tasks.text_generation, model=self.model_id_1_3B)
print(pipe(self.input))

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@unittest.skip('distributed gpt3 2.7B, skipped')
def test_gpt3_2_7B(self):
pipe = pipeline(Tasks.text_generation, model=self.model_id_2_7B)
print(pipe(self.input))


+ 3
- 3
tests/pipelines/test_ofa_tasks.py View File

@@ -48,7 +48,7 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck):
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_with_ocr_recognize_with_name(self):
ocr_recognize = pipeline(
Tasks.ofa_ocr_recognition,
Tasks.ocr_recognition,
model='damo/ofa_ocr-recognition_scene_base_zh')
result = ocr_recognize('data/test/images/image_ocr_recognition.jpg')
print(result[OutputKeys.TEXT])
@@ -75,7 +75,7 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck):
def test_run_with_summarization_with_model(self):
model = Model.from_pretrained(
'damo/ofa_summarization_gigaword_large_en')
ofa_pipe = pipeline(Tasks.summarization, model=model)
ofa_pipe = pipeline(Tasks.text_summarization, model=model)
text = 'five-time world champion michelle kwan withdrew' + \
'from the #### us figure skating championships on wednesday ,' + \
' but will petition us skating officials for the chance to ' + \
@@ -87,7 +87,7 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck):
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_with_summarization_with_name(self):
ofa_pipe = pipeline(
Tasks.summarization,
Tasks.text_summarization,
model='damo/ofa_summarization_gigaword_large_en')
text = 'five-time world champion michelle kwan withdrew' + \
'from the #### us figure skating championships on wednesday ,' + \


+ 5
- 5
tests/pipelines/test_relation_extraction.py View File

@@ -15,7 +15,7 @@ from modelscope.utils.test_utils import test_level
class RelationExtractionTest(unittest.TestCase, DemoCompatibilityCheck):

def setUp(self) -> None:
self.task = Tasks.information_extraction
self.task = Tasks.relation_extraction
self.model_id = 'damo/nlp_bert_relation-extraction_chinese-base'

sentence = '高捷,祖籍江苏,本科毕业于东南大学'
@@ -28,7 +28,7 @@ class RelationExtractionTest(unittest.TestCase, DemoCompatibilityCheck):
pipeline1 = InformationExtractionPipeline(
model, preprocessor=tokenizer)
pipeline2 = pipeline(
Tasks.information_extraction, model=model, preprocessor=tokenizer)
Tasks.relation_extraction, model=model, preprocessor=tokenizer)
print(f'sentence: {self.sentence}\n'
f'pipeline1:{pipeline1(input=self.sentence)}')
print()
@@ -39,7 +39,7 @@ class RelationExtractionTest(unittest.TestCase, DemoCompatibilityCheck):
model = Model.from_pretrained(self.model_id)
tokenizer = RelationExtractionPreprocessor(model.model_dir)
pipeline_ins = pipeline(
task=Tasks.information_extraction,
task=Tasks.relation_extraction,
model=model,
preprocessor=tokenizer)
print(pipeline_ins(input=self.sentence))
@@ -47,12 +47,12 @@ class RelationExtractionTest(unittest.TestCase, DemoCompatibilityCheck):
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_run_with_model_name(self):
pipeline_ins = pipeline(
task=Tasks.information_extraction, model=self.model_id)
task=Tasks.relation_extraction, model=self.model_id)
print(pipeline_ins(input=self.sentence))

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run_with_default_model(self):
pipeline_ins = pipeline(task=Tasks.information_extraction)
pipeline_ins = pipeline(task=Tasks.relation_extraction)
print(pipeline_ins(input=self.sentence))

@unittest.skip('demo compatibility test is only enabled on a needed-basis')


+ 49
- 6
tests/pipelines/test_table_question_answering.py View File

@@ -43,7 +43,7 @@ def tableqa_tracking_and_print_results_with_history(
print('sql text:', output_dict[OutputKeys.SQL_STRING])
print('sql query:', output_dict[OutputKeys.SQL_QUERY])
print('query result:', output_dict[OutputKeys.QUERT_RESULT])
print('json dumps', json.dumps(output_dict))
print('json dumps', json.dumps(output_dict, ensure_ascii=False))
print()
historical_queries = output_dict[OutputKeys.HISTORY]

@@ -66,10 +66,42 @@ def tableqa_tracking_and_print_results_without_history(
print('sql text:', output_dict[OutputKeys.SQL_STRING])
print('sql query:', output_dict[OutputKeys.SQL_QUERY])
print('query result:', output_dict[OutputKeys.QUERT_RESULT])
print('json dumps', json.dumps(output_dict))
print('json dumps', json.dumps(output_dict, ensure_ascii=False))
print()


def tableqa_tracking_and_print_results_with_tableid(
pipelines: List[TableQuestionAnsweringPipeline]):
test_case = {
'utterance': [
['有哪些风险类型?', 'fund'],
['风险类型有多少种?', 'reservoir'],
['珠江流域的小(2)型水库的库容总量是多少?', 'reservoir'],
['那平均值是多少?', 'reservoir'],
['那水库的名称呢?', 'reservoir'],
['换成中型的呢?', 'reservoir'],
['枣庄营业厅的电话', 'business'],
['那地址呢?', 'business'],
['枣庄营业厅的电话和地址', 'business'],
],
}
for p in pipelines:
historical_queries = None
for question, table_id in test_case['utterance']:
output_dict = p({
'question': question,
'table_id': table_id,
'history_sql': historical_queries
})
print('question', question)
print('sql text:', output_dict[OutputKeys.SQL_STRING])
print('sql query:', output_dict[OutputKeys.SQL_QUERY])
print('query result:', output_dict[OutputKeys.QUERT_RESULT])
print('json dumps', json.dumps(output_dict, ensure_ascii=False))
print()
historical_queries = output_dict[OutputKeys.HISTORY]


class TableQuestionAnswering(unittest.TestCase):

def setUp(self) -> None:
@@ -93,15 +125,27 @@ class TableQuestionAnswering(unittest.TestCase):
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_run_with_model_from_modelhub(self):
model = Model.from_pretrained(self.model_id)
self.tokenizer = BertTokenizer(
os.path.join(model.model_dir, ModelFile.VOCAB_FILE))
db = Database(
tokenizer=self.tokenizer,
table_file_path=[
os.path.join(model.model_dir, 'databases', fname)
for fname in os.listdir(
os.path.join(model.model_dir, 'databases'))
],
syn_dict_file_path=os.path.join(model.model_dir, 'synonym.txt'),
is_use_sqlite=False)
preprocessor = TableQuestionAnsweringPreprocessor(
model_dir=model.model_dir)
model_dir=model.model_dir, db=db)
pipelines = [
pipeline(
Tasks.table_question_answering,
model=model,
preprocessor=preprocessor)
preprocessor=preprocessor,
db=db)
]
tableqa_tracking_and_print_results_with_history(pipelines)
tableqa_tracking_and_print_results_with_tableid(pipelines)

@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_with_model_from_task(self):
@@ -132,7 +176,6 @@ class TableQuestionAnswering(unittest.TestCase):
db=db)
]
tableqa_tracking_and_print_results_without_history(pipelines)
tableqa_tracking_and_print_results_with_history(pipelines)


if __name__ == '__main__':


+ 0
- 100
tests/pipelines/test_text_classification.py View File

@@ -1,100 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import unittest

from modelscope.models import Model
from modelscope.msdatasets import MsDataset
from modelscope.pipelines import pipeline
from modelscope.pipelines.nlp import SequenceClassificationPipeline
from modelscope.preprocessors import SequenceClassificationPreprocessor
from modelscope.utils.constant import Tasks
from modelscope.utils.demo_utils import DemoCompatibilityCheck
from modelscope.utils.test_utils import test_level


class SequenceClassificationTest(unittest.TestCase, DemoCompatibilityCheck):
sentence1 = 'i like this wonderful place'

def setUp(self) -> None:
self.model_id = 'damo/bert-base-sst2'
self.task = Tasks.text_classification

def predict(self, pipeline_ins: SequenceClassificationPipeline):
from easynlp.appzoo import load_dataset

set = load_dataset('glue', 'sst2')
data = set['test']['sentence'][:3]

results = pipeline_ins(data[0])
print(results)
results = pipeline_ins(data[1])
print(results)

print(data)

def printDataset(self, dataset: MsDataset):
for i, r in enumerate(dataset):
if i > 10:
break
print(r)

# @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
@unittest.skip('nlp model does not support tensor input, skipped')
def test_run_with_model_from_modelhub(self):
model = Model.from_pretrained(self.model_id)
preprocessor = SequenceClassificationPreprocessor(
model.model_dir, first_sequence='sentence', second_sequence=None)
pipeline_ins = pipeline(
task=Tasks.text_classification,
model=model,
preprocessor=preprocessor)
print(f'sentence1: {self.sentence1}\n'
f'pipeline1:{pipeline_ins(input=self.sentence1)}')

# @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
@unittest.skip('nlp model does not support tensor input, skipped')
def test_run_with_model_name(self):
text_classification = pipeline(
task=Tasks.text_classification, model=self.model_id)
result = text_classification(
MsDataset.load(
'xcopa',
subset_name='translation-et',
namespace='damotest',
split='test',
target='premise'))
self.printDataset(result)

# @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@unittest.skip('nlp model does not support tensor input, skipped')
def test_run_with_default_model(self):
text_classification = pipeline(task=Tasks.text_classification)
result = text_classification(
MsDataset.load(
'xcopa',
subset_name='translation-et',
namespace='damotest',
split='test',
target='premise'))
self.printDataset(result)

# @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
@unittest.skip('nlp model does not support tensor input, skipped')
def test_run_with_modelscope_dataset(self):
text_classification = pipeline(task=Tasks.text_classification)
# loaded from modelscope dataset
dataset = MsDataset.load(
'xcopa',
subset_name='translation-et',
namespace='damotest',
split='test',
target='premise')
result = text_classification(dataset)
self.printDataset(result)

@unittest.skip('demo compatibility test is only enabled on a needed-basis')
def test_demo_compatibility(self):
self.compatibility_check()


if __name__ == '__main__':
unittest.main()

tests/pipelines/test_passage_ranking.py → tests/pipelines/test_text_ranking.py View File

@@ -4,15 +4,15 @@ import unittest

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.models import Model
from modelscope.models.nlp import PassageRanking
from modelscope.models.nlp import TextRanking
from modelscope.pipelines import pipeline
from modelscope.pipelines.nlp import PassageRankingPipeline
from modelscope.preprocessors import PassageRankingPreprocessor
from modelscope.pipelines.nlp import TextRankingPipeline
from modelscope.preprocessors import TextRankingPreprocessor
from modelscope.utils.constant import Tasks
from modelscope.utils.test_utils import test_level


class PassageRankingTest(unittest.TestCase):
class TextRankingTest(unittest.TestCase):
model_id = 'damo/nlp_corom_passage-ranking_english-base'
inputs = {
'source_sentence': ["how long it take to get a master's degree"],
@@ -27,11 +27,11 @@ class PassageRankingTest(unittest.TestCase):
@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run_by_direct_model_download(self):
cache_path = snapshot_download(self.model_id)
tokenizer = PassageRankingPreprocessor(cache_path)
model = PassageRanking.from_pretrained(cache_path)
pipeline1 = PassageRankingPipeline(model, preprocessor=tokenizer)
tokenizer = TextRankingPreprocessor(cache_path)
model = TextRanking.from_pretrained(cache_path)
pipeline1 = TextRankingPipeline(model, preprocessor=tokenizer)
pipeline2 = pipeline(
Tasks.passage_ranking, model=model, preprocessor=tokenizer)
Tasks.text_ranking, model=model, preprocessor=tokenizer)
print(f'sentence: {self.inputs}\n'
f'pipeline1:{pipeline1(input=self.inputs)}')
print()
@@ -40,20 +40,19 @@ class PassageRankingTest(unittest.TestCase):
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_with_model_from_modelhub(self):
model = Model.from_pretrained(self.model_id)
tokenizer = PassageRankingPreprocessor(model.model_dir)
tokenizer = TextRankingPreprocessor(model.model_dir)
pipeline_ins = pipeline(
task=Tasks.passage_ranking, model=model, preprocessor=tokenizer)
task=Tasks.text_ranking, model=model, preprocessor=tokenizer)
print(pipeline_ins(input=self.inputs))

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_run_with_model_name(self):
pipeline_ins = pipeline(
task=Tasks.passage_ranking, model=self.model_id)
pipeline_ins = pipeline(task=Tasks.text_ranking, model=self.model_id)
print(pipeline_ins(input=self.inputs))

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run_with_default_model(self):
pipeline_ins = pipeline(task=Tasks.passage_ranking)
pipeline_ins = pipeline(task=Tasks.text_ranking)
print(pipeline_ins(input=self.inputs))



+ 2
- 1
tests/trainers/test_finetune_sequence_classification.py View File

@@ -38,7 +38,8 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
shutil.rmtree(self.tmp_dir)
super().tearDown()

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
@unittest.skip(
'Skip testing trainer repeatable, because it\'s unstable in daily UT')
def test_trainer_repeatable(self):
import torch # noqa



tests/trainers/test_finetune_passage_ranking.py → tests/trainers/test_finetune_text_ranking.py View File

@@ -41,7 +41,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
model_id,
train_dataset,
eval_dataset,
name=Trainers.nlp_passage_ranking_trainer,
name=Trainers.nlp_text_ranking_trainer,
cfg_modify_fn=None,
**kwargs):
kwargs = dict(
@@ -61,8 +61,8 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
def test_finetune_msmarco(self):

def cfg_modify_fn(cfg):
cfg.task = 'passage-ranking'
cfg['preprocessor'] = {'type': 'passage-ranking'}
cfg.task = 'text-ranking'
cfg['preprocessor'] = {'type': 'text-ranking'}
cfg.train.optimizer.lr = 2e-5
cfg['dataset'] = {
'train': {
@@ -105,7 +105,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
}, {
'type': 'EvaluationHook',
'by_epoch': False,
'interval': 3000
'interval': 15
}]
return cfg

@@ -114,18 +114,19 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
train_ds = ds['train'].to_hf_dataset()
dev_ds = ds['train'].to_hf_dataset()

model_id = 'damo/nlp_corom_passage-ranking_english-base'
self.finetune(
model_id='damo/nlp_corom_passage-ranking_english-base',
model_id=model_id,
train_dataset=train_ds,
eval_dataset=dev_ds,
cfg_modify_fn=cfg_modify_fn)

output_dir = os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR)
self.pipeline_passage_ranking(output_dir)
self.pipeline_text_ranking(output_dir)

def pipeline_passage_ranking(self, model_dir):
def pipeline_text_ranking(self, model_dir):
model = Model.from_pretrained(model_dir)
pipeline_ins = pipeline(task=Tasks.passage_ranking, model=model)
pipeline_ins = pipeline(task=Tasks.text_ranking, model=model)
print(pipeline_ins(input=self.inputs))



+ 2
- 2
tests/trainers/test_image_portrait_enhancement_trainer.py View File

@@ -37,13 +37,13 @@ class TestImagePortraitEnhancementTrainer(unittest.TestCase):
namespace='modelscope',
subset_name='default',
split='test',
download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds
download_mode=DownloadMode.FORCE_REDOWNLOAD)._hf_ds
dataset_val = MsDataset.load(
'image-portrait-enhancement-dataset',
namespace='modelscope',
subset_name='default',
split='test',
download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds
download_mode=DownloadMode.FORCE_REDOWNLOAD)._hf_ds

self.dataset_train = ImagePortraitEnhancementDataset(
dataset_train, is_train=True)


+ 17
- 4
tests/trainers/test_trainer_with_nlp.py View File

@@ -169,11 +169,25 @@ class TestTrainerWithNlp(unittest.TestCase):
cfg.preprocessor.label = 'label'
cfg.preprocessor.train['label2id'] = {'0': 0, '1': 1}
cfg.preprocessor.val['label2id'] = {'0': 0, '1': 1}
cfg.train.dataloader.batch_size_per_gpu = 2
cfg.train.hooks = [{
'type': 'CheckpointHook',
'interval': 3,
'by_epoch': False,
}, {
'type': 'TextLoggerHook',
'interval': 1
}, {
'type': 'IterTimerHook'
}, {
'type': 'EvaluationHook',
'interval': 1
}]
cfg.train.work_dir = self.tmp_dir
cfg_file = os.path.join(self.tmp_dir, 'config.json')
cfg.dump(cfg_file)
dataset = MsDataset.load('clue', subset_name='afqmc', split='train')
dataset = dataset.to_hf_dataset().select(range(128))
dataset = dataset.to_hf_dataset().select(range(4))
kwargs = dict(
model=model_id,
train_dataset=dataset,
@@ -190,7 +204,7 @@ class TestTrainerWithNlp(unittest.TestCase):
PRIORITY = Priority.VERY_LOW

def after_iter(self, trainer):
if trainer.iter == 12:
if trainer.iter == 3:
raise MsRegressTool.EarlyStopError('Test finished.')

if 'EarlyStopHook' not in [
@@ -207,12 +221,11 @@ class TestTrainerWithNlp(unittest.TestCase):

results_files = os.listdir(self.tmp_dir)
self.assertIn(f'{trainer.timestamp}.log.json', results_files)

trainer = build_trainer(default_args=kwargs)
regress_tool = MsRegressTool(baseline=False)
with regress_tool.monitor_ms_train(
trainer, 'trainer_continue_train', level='strict'):
trainer.train(os.path.join(self.tmp_dir, 'iter_12.pth'))
trainer.train(os.path.join(self.tmp_dir, 'iter_3.pth'))

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_trainer_with_model_and_args(self):


Loading…
Cancel
Save