diff --git a/modelscope/outputs/outputs.py b/modelscope/outputs/outputs.py
index 949a91b5..30361b5d 100644
--- a/modelscope/outputs/outputs.py
+++ b/modelscope/outputs/outputs.py
@@ -435,9 +435,11 @@ TASK_OUTPUTS = {
 
     # referring video object segmentation result for a single video
     # {
-    #   "masks": [np.array # 2D array with shape [height, width]]
+    #   "masks": [np.array # 3D array with shape [frame_num, height, width]]
+    #   "timestamps": ["hh:mm:ss", "hh:mm:ss", "hh:mm:ss"]
     # }
-    Tasks.referring_video_object_segmentation: [OutputKeys.MASKS],
+    Tasks.referring_video_object_segmentation:
+    [OutputKeys.MASKS, OutputKeys.TIMESTAMPS],
 
     # ============ nlp tasks ===================
 
@@ -698,8 +700,9 @@ TASK_OUTPUTS = {
     #     "img_embedding": np.array with shape [1, D],
     #     "text_embedding": np.array with shape [1, D]
     # }
-    Tasks.multi_modal_embedding:
-    [OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING],
+    Tasks.multi_modal_embedding: [
+        OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING
+    ],
 
     # generative multi-modal embedding result for single sample
     # {
diff --git a/modelscope/pipelines/cv/referring_video_object_segmentation_pipeline.py b/modelscope/pipelines/cv/referring_video_object_segmentation_pipeline.py
index f0a717a5..dcbb5de0 100644
--- a/modelscope/pipelines/cv/referring_video_object_segmentation_pipeline.py
+++ b/modelscope/pipelines/cv/referring_video_object_segmentation_pipeline.py
@@ -52,17 +52,16 @@ class ReferringVideoObjectSegmentationPipeline(Pipeline):
         """
         assert isinstance(input, tuple) and len(
             input
-        ) == 4, 'error - input type must be tuple and input length must be 4'
-        self.input_video_pth, text_queries, start_pt, end_pt = input
+        ) == 2, 'error - input type must be tuple and input length must be 2'
+        self.input_video_pth, text_queries = input
 
-        assert 0 < end_pt - start_pt <= 10, 'error - the subclip length must be 0-10 seconds long'
         assert 1 <= len(
             text_queries) <= 2, 'error - 1-2 input text queries are expected'
 
         # extract the relevant subclip:
         self.input_clip_pth = 'input_clip.mp4'
         with VideoFileClip(self.input_video_pth) as video:
-            subclip = video.subclip(start_pt, end_pt)
+            subclip = video.subclip()
             subclip.write_videofile(self.input_clip_pth)
 
         self.window_length = 24  # length of window during inference
@@ -191,7 +190,16 @@ class ReferringVideoObjectSegmentationPipeline(Pipeline):
                 output_clip_path, fps=self.meta['video_fps'], audio=True)
             del masked_video
 
-        result = {OutputKeys.MASKS: inputs}
+        masks = [mask.squeeze(1) for mask in inputs]
+
+        fps = self.meta['video_fps']
+        output_timestamps = []
+        for frame_idx in range(self.video.shape[0]):
+            output_timestamps.append(timestamp_format(seconds=frame_idx / fps))
+        result = {
+            OutputKeys.MASKS: masks,
+            OutputKeys.TIMESTAMPS: output_timestamps
+        }
 
         return result
 
@@ -201,3 +209,10 @@ def apply_mask(image, mask, color, transparency=0.7):
     color_matrix = np.ones(image.shape, dtype=np.float) * color
     out_image = color_matrix * mask + image * (1.0 - mask)
     return out_image
+
+
+def timestamp_format(seconds):
+    m, s = divmod(seconds, 60)
+    h, m = divmod(m, 60)
+    time = '%02d:%02d:%06.3f' % (h, m, s)
+    return time
diff --git a/tests/pipelines/test_referring_video_object_segmentation.py b/tests/pipelines/test_referring_video_object_segmentation.py
index 3e81d9c3..509e9317 100644
--- a/tests/pipelines/test_referring_video_object_segmentation.py
+++ b/tests/pipelines/test_referring_video_object_segmentation.py
@@ -21,8 +21,7 @@ class ReferringVideoObjectSegmentationTest(unittest.TestCase,
             'guy in black performing tricks on a bike',
             'a black bike used to perform tricks'
         ]
-        start_pt, end_pt = 4, 14
-        input_tuple = (input_location, text_queries, start_pt, end_pt)
+        input_tuple = (input_location, text_queries)
         pp = pipeline(
             Tasks.referring_video_object_segmentation, model=self.model_id)
         result = pp(input_tuple)
@@ -38,8 +37,7 @@ class ReferringVideoObjectSegmentationTest(unittest.TestCase,
             'guy in black performing tricks on a bike',
             'a black bike used to perform tricks'
        ]
-        start_pt, end_pt = 4, 14
-        input_tuple = (input_location, text_queries, start_pt, end_pt)
+        input_tuple = (input_location, text_queries)
         pp = pipeline(Tasks.referring_video_object_segmentation)
         result = pp(input_tuple)
         if result:
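
For reference, a minimal sketch of how the pipeline is invoked after this change. This is not part of the patch: the video path and query string are placeholders, and it assumes a default model is registered for the task (as the second test above relies on).

```python
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Placeholder inputs; any readable video file and 1-2 text queries work.
input_location = 'movie.mp4'
text_queries = ['guy in black performing tricks on a bike']

# The input is now a 2-tuple (video_path, text_queries); the old
# start_pt/end_pt arguments are gone and the whole clip is processed.
pp = pipeline(Tasks.referring_video_object_segmentation)
result = pp((input_location, text_queries))

# Per the updated TASK_OUTPUTS contract: one 3D array per query with
# shape [frame_num, height, width], plus one timestamp per frame.
masks = result[OutputKeys.MASKS]
timestamps = result[OutputKeys.TIMESTAMPS]
print(len(timestamps), masks[0].shape)
```

Note that `timestamp_format` emits fractional seconds via `'%02d:%02d:%06.3f'`, e.g. `timestamp_format(125.5)` returns `'00:02:05.500'`, so the timestamps carry more precision than the `"hh:mm:ss"` shape sketched in the `TASK_OUTPUTS` comment.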