Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10873454master^2
| @@ -435,9 +435,11 @@ TASK_OUTPUTS = { | |||||
| # referring video object segmentation result for a single video | # referring video object segmentation result for a single video | ||||
| # { | # { | ||||
| # "masks": [np.array # 2D array with shape [height, width]] | |||||
| # "masks": [np.array # 3D array with shape [frame_num, height, width]] | |||||
| # "timestamps": ["hh:mm:ss", "hh:mm:ss", "hh:mm:ss"] | |||||
| # } | # } | ||||
| Tasks.referring_video_object_segmentation: [OutputKeys.MASKS], | |||||
| Tasks.referring_video_object_segmentation: | |||||
| [OutputKeys.MASKS, OutputKeys.TIMESTAMPS], | |||||
| # ============ nlp tasks =================== | # ============ nlp tasks =================== | ||||
| @@ -698,8 +700,9 @@ TASK_OUTPUTS = { | |||||
| # "img_embedding": np.array with shape [1, D], | # "img_embedding": np.array with shape [1, D], | ||||
| # "text_embedding": np.array with shape [1, D] | # "text_embedding": np.array with shape [1, D] | ||||
| # } | # } | ||||
| Tasks.multi_modal_embedding: | |||||
| [OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING], | |||||
| Tasks.multi_modal_embedding: [ | |||||
| OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING | |||||
| ], | |||||
| # generative multi-modal embedding result for single sample | # generative multi-modal embedding result for single sample | ||||
| # { | # { | ||||
| @@ -52,17 +52,16 @@ class ReferringVideoObjectSegmentationPipeline(Pipeline): | |||||
| """ | """ | ||||
| assert isinstance(input, tuple) and len( | assert isinstance(input, tuple) and len( | ||||
| input | input | ||||
| ) == 4, 'error - input type must be tuple and input length must be 4' | |||||
| self.input_video_pth, text_queries, start_pt, end_pt = input | |||||
| ) == 2, 'error - input type must be tuple and input length must be 2' | |||||
| self.input_video_pth, text_queries = input | |||||
| assert 0 < end_pt - start_pt <= 10, 'error - the subclip length must be 0-10 seconds long' | |||||
| assert 1 <= len( | assert 1 <= len( | ||||
| text_queries) <= 2, 'error - 1-2 input text queries are expected' | text_queries) <= 2, 'error - 1-2 input text queries are expected' | ||||
| # extract the relevant subclip: | # extract the relevant subclip: | ||||
| self.input_clip_pth = 'input_clip.mp4' | self.input_clip_pth = 'input_clip.mp4' | ||||
| with VideoFileClip(self.input_video_pth) as video: | with VideoFileClip(self.input_video_pth) as video: | ||||
| subclip = video.subclip(start_pt, end_pt) | |||||
| subclip = video.subclip() | |||||
| subclip.write_videofile(self.input_clip_pth) | subclip.write_videofile(self.input_clip_pth) | ||||
| self.window_length = 24 # length of window during inference | self.window_length = 24 # length of window during inference | ||||
| @@ -191,7 +190,16 @@ class ReferringVideoObjectSegmentationPipeline(Pipeline): | |||||
| output_clip_path, fps=self.meta['video_fps'], audio=True) | output_clip_path, fps=self.meta['video_fps'], audio=True) | ||||
| del masked_video | del masked_video | ||||
| result = {OutputKeys.MASKS: inputs} | |||||
| masks = [mask.squeeze(1) for mask in inputs] | |||||
| fps = self.meta['video_fps'] | |||||
| output_timestamps = [] | |||||
| for frame_idx in range(self.video.shape[0]): | |||||
| output_timestamps.append(timestamp_format(seconds=frame_idx / fps)) | |||||
| result = { | |||||
| OutputKeys.MASKS: masks, | |||||
| OutputKeys.TIMESTAMPS: output_timestamps | |||||
| } | |||||
| return result | return result | ||||
| @@ -201,3 +209,10 @@ def apply_mask(image, mask, color, transparency=0.7): | |||||
| color_matrix = np.ones(image.shape, dtype=np.float) * color | color_matrix = np.ones(image.shape, dtype=np.float) * color | ||||
| out_image = color_matrix * mask + image * (1.0 - mask) | out_image = color_matrix * mask + image * (1.0 - mask) | ||||
| return out_image | return out_image | ||||
def timestamp_format(seconds):
    """Render a duration given in seconds as an ``HH:MM:SS.mmm`` string.

    Args:
        seconds: Elapsed time in seconds (int or float); fractional part
            is kept with millisecond precision.

    Returns:
        str: Zero-padded timestamp, e.g. ``'01:01:01.250'``.
    """
    minutes, secs = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return '%02d:%02d:%06.3f' % (hours, minutes, secs)
| @@ -21,8 +21,7 @@ class ReferringVideoObjectSegmentationTest(unittest.TestCase, | |||||
| 'guy in black performing tricks on a bike', | 'guy in black performing tricks on a bike', | ||||
| 'a black bike used to perform tricks' | 'a black bike used to perform tricks' | ||||
| ] | ] | ||||
| start_pt, end_pt = 4, 14 | |||||
| input_tuple = (input_location, text_queries, start_pt, end_pt) | |||||
| input_tuple = (input_location, text_queries) | |||||
| pp = pipeline( | pp = pipeline( | ||||
| Tasks.referring_video_object_segmentation, model=self.model_id) | Tasks.referring_video_object_segmentation, model=self.model_id) | ||||
| result = pp(input_tuple) | result = pp(input_tuple) | ||||
| @@ -38,8 +37,7 @@ class ReferringVideoObjectSegmentationTest(unittest.TestCase, | |||||
| 'guy in black performing tricks on a bike', | 'guy in black performing tricks on a bike', | ||||
| 'a black bike used to perform tricks' | 'a black bike used to perform tricks' | ||||
| ] | ] | ||||
| start_pt, end_pt = 4, 14 | |||||
| input_tuple = (input_location, text_queries, start_pt, end_pt) | |||||
| input_tuple = (input_location, text_queries) | |||||
| pp = pipeline(Tasks.referring_video_object_segmentation) | pp = pipeline(Tasks.referring_video_object_segmentation) | ||||
| result = pp(input_tuple) | result = pp(input_tuple) | ||||
| if result: | if result: | ||||