diff --git a/modelscope/models/multi_modal/ofa_for_all_tasks.py b/modelscope/models/multi_modal/ofa_for_all_tasks.py
index 860b68d3..05950378 100644
--- a/modelscope/models/multi_modal/ofa_for_all_tasks.py
+++ b/modelscope/models/multi_modal/ofa_for_all_tasks.py
@@ -152,8 +152,8 @@ class OfaForAllTasks(TorchModel):
         region_tensor[:, ::2] /= input['w_resize_ratios']
         region_tensor[:, 1::2] /= input['h_resize_ratios']
         return {
-            OutputKeys.BOXES: move_to_device(region_tensor,
-                                             torch.device('cpu')),
+            OutputKeys.BOXES:
+            move_to_device(region_tensor, torch.device('cpu')).tolist(),
             OutputKeys.SCORES: [1.0] * region_tensor.shape[0]
         }
 
diff --git a/modelscope/pipelines/audio/linear_aec_pipeline.py b/modelscope/pipelines/audio/linear_aec_pipeline.py
index b59bc475..0e73b697 100644
--- a/modelscope/pipelines/audio/linear_aec_pipeline.py
+++ b/modelscope/pipelines/audio/linear_aec_pipeline.py
@@ -51,7 +51,7 @@ class LinearAECPipeline(Pipeline):
 
     When invoke the class with pipeline.__call__(), you should provide two params:
         Dict[str, Any]
-            the path of wav files,eg:{
+            the path of wav files, eg:{
                 "nearend_mic": "/your/data/near_end_mic_audio.wav",
                 "farend_speech": "/your/data/far_end_speech_audio.wav"}
         output_path (str, optional): "/your/output/audio_after_aec.wav"
diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py
index 9873a62c..342ba6b5 100644
--- a/modelscope/preprocessors/multi_modal.py
+++ b/modelscope/preprocessors/multi_modal.py
@@ -8,6 +8,7 @@ from PIL import Image
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.metainfo import Preprocessors
 from modelscope.pipelines.base import Input
+from modelscope.preprocessors import load_image
 from modelscope.utils.config import Config
 from modelscope.utils.constant import Fields, ModeKeys, ModelFile, Tasks
 from .base import Preprocessor
@@ -137,7 +138,7 @@ class MPlugPreprocessor(Preprocessor):
     def image_open(self, path: str) -> Tuple[Image.Image, int]:
         if path not in self._image_map:
             index = len(self._image_map)
-            self._image_map[path] = (Image.open(path), index)
+            self._image_map[path] = (load_image(path), index)
         return self._image_map[path]
 
     def __call__(
diff --git a/modelscope/utils/demo_utils.py b/modelscope/utils/demo_utils.py
index 0f8378cd..93535c1e 100644
--- a/modelscope/utils/demo_utils.py
+++ b/modelscope/utils/demo_utils.py
@@ -236,7 +236,7 @@ def postprocess(req, resp):
         _, img_encode = cv2.imencode('.' + file_type, content)
         img_bytes = img_encode.tobytes()
         return type(img_bytes)
-    elif file_type == 'wav':
+    else:
         out_mem_file = io.BytesIO()
         out_mem_file.write(new_resp.get(output_key))
         return type(out_mem_file)
diff --git a/tests/pipelines/test_automatic_speech_recognition.py b/tests/pipelines/test_automatic_speech_recognition.py
index e475c3cd..303fb6b9 100644
--- a/tests/pipelines/test_automatic_speech_recognition.py
+++ b/tests/pipelines/test_automatic_speech_recognition.py
@@ -22,6 +22,9 @@ URL_FILE = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audi
 
 LITTLE_TESTSETS_FILE = 'data_aishell.tar.gz'
 LITTLE_TESTSETS_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/datasets/data_aishell.tar.gz'
 
+TFRECORD_TESTSETS_FILE = 'tfrecord.tar.gz'
+TFRECORD_TESTSETS_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/datasets/tfrecord.tar.gz'
+
 
 class AutomaticSpeechRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
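The practical effect of the ofa_for_all_tasks.py hunk is that OutputKeys.BOXES becomes a plain Python list instead of a torch.Tensor, which (unlike a tensor) can be passed straight to json.dumps by downstream demo/service code. A minimal sketch of that distinction, with illustrative box values that are not part of the patch:

import json

import torch

# Boxes as a CPU tensor (the old return value) vs. a plain list (the new one).
boxes = torch.tensor([[10.0, 20.0, 110.0, 220.0]])

try:
    json.dumps({'boxes': boxes})  # raises TypeError: Tensor is not JSON serializable
except TypeError as err:
    print('tensor fails:', err)

# The .tolist() form serializes cleanly.
print('list works:', json.dumps({'boxes': boxes.tolist()}))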