diff --git a/modelscope/models/cv/movie_scene_segmentation/model.py b/modelscope/models/cv/movie_scene_segmentation/model.py
index 1232d427..8117961a 100644
--- a/modelscope/models/cv/movie_scene_segmentation/model.py
+++ b/modelscope/models/cv/movie_scene_segmentation/model.py
@@ -67,7 +67,6 @@ class MovieSceneSegmentationModel(TorchModel):
                 mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
         ])
-        self.infer_result = {'vid': [], 'sid': [], 'pred': []}
 
         sampling_method = self.cfg.dataset.sampling_method.name
         self.neighbor_size = self.cfg.dataset.sampling_method.params[
             sampling_method].neighbor_size
@@ -104,6 +103,8 @@ class MovieSceneSegmentationModel(TorchModel):
         shot_num = len(sids)
         cnt = shot_num // bs + 1
+        infer_sid, infer_pred = [], []
+        infer_result = {}
 
         for i in range(cnt):
             start = i * bs
             end = (i + 1) * bs if (i + 1) * bs < shot_num else shot_num
@@ -112,13 +113,14 @@ class MovieSceneSegmentationModel(TorchModel):
             input_ = torch.stack(input_)
             outputs = self.shared_step(input_)  # shape [b,2]
             prob = F.softmax(outputs, dim=1)
-            self.infer_result['sid'].extend(sid_.cpu().detach().numpy())
-            self.infer_result['pred'].extend(prob[:, 1].cpu().detach().numpy())
-        self.infer_result['pred'] = np.stack(self.infer_result['pred'])
+            infer_sid.extend(sid_.cpu().detach().numpy())
+            infer_pred.extend(prob[:, 1].cpu().detach().numpy())
+        infer_result.update({'pred': np.stack(infer_pred)})
+        infer_result.update({'sid': infer_sid})
 
-        assert len(self.infer_result['sid']) == len(sids)
-        assert len(self.infer_result['pred']) == len(inputs)
-        return self.infer_result
+        assert len(infer_result['sid']) == len(sids)
+        assert len(infer_result['pred']) == len(inputs)
+        return infer_result
 
     def shared_step(self, inputs):
         with torch.no_grad():
@@ -162,11 +164,12 @@ class MovieSceneSegmentationModel(TorchModel):
         thres = self.cfg.pipeline.save_threshold
         anno_dict = get_pred_boundary(pred_dict, thres)
 
-        scene_dict_lst, scene_list = pred2scene(self.shot2keyf, anno_dict)
+        scene_dict_lst, scene_list, shot_num, shot_dict_lst = pred2scene(
+            self.shot2keyf, anno_dict)
         if self.cfg.pipeline.save_split_scene:
             re_dir = scene2video(inputs['input_video_pth'], scene_list, thres)
             print(f'Split scene video saved to {re_dir}')
-        return len(scene_list), scene_dict_lst
+        return len(scene_list), scene_dict_lst, shot_num, shot_dict_lst
 
     def preprocess(self, inputs):
         logger.info('Begin shot detect......')
diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py b/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py
index b350ff13..3339e1a3 100644
--- a/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py
+++ b/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py
@@ -22,15 +22,23 @@ def pred2scene(shot2keyf, anno_dict):
 
     scene_list, pair_list = get_demo_scene_list(shot2keyf, anno_dict)
 
     scene_dict_lst = []
+    shot_num = len(shot2keyf)
+    shot_dict_lst = []
+    for item in shot2keyf:
+        tmp = item.split(' ')
+        shot_dict_lst.append({
+            'frame': [tmp[0], tmp[1]],
+            'timestamps': [tmp[-2], tmp[-1]]
+        })
     assert len(scene_list) == len(pair_list)
     for scene_ind, scene_item in enumerate(scene_list):
         scene_dict_lst.append({
             'shot': pair_list[scene_ind],
             'frame': scene_item[0],
-            'timestamp': scene_item[1]
+            'timestamps': scene_item[1]
         })
-    return scene_dict_lst, scene_list
+    return scene_dict_lst, scene_list, shot_num, shot_dict_lst
 
 
 def scene2video(source_movie_fn, scene_list, thres):
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index d8d2458a..717ff4dd 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -38,8 +38,10 @@ class OutputKeys(object):
     KWS_LIST = 'kws_list'
     HISTORY = 'history'
     TIMESTAMPS = 'timestamps'
-    SPLIT_VIDEO_NUM = 'split_video_num'
-    SPLIT_META_LIST = 'split_meta_list'
+    SHOT_NUM = 'shot_num'
+    SCENE_NUM = 'scene_num'
+    SCENE_META_LIST = 'scene_meta_list'
+    SHOT_META_LIST = 'shot_meta_list'
 
 
 TASK_OUTPUTS = {
@@ -309,19 +311,30 @@ TASK_OUTPUTS = {
     Tasks.shop_segmentation: [OutputKeys.MASKS],
 
     # movide scene segmentation result for a single video
     # {
-    #   "split_video_num":3,
-    #   "split_meta_list":
+    #   "shot_num":15,
+    #   "shot_meta_list":
+    #   [
+    #       {
+    #           "frame": [start_frame, end_frame],
+    #           "timestamps": [start_timestamp, end_timestamp]   # ['00:00:01.133', '00:00:02.245']
+    #
+    #       }
+    #   ]
+    #   "scene_num":3,
+    #   "scene_meta_list":
     #   [
     #       {
     #           "shot": [0,1,2],
     #           "frame": [start_frame, end_frame],
-    #           "timestamp": [start_timestamp, end_timestamp]   # ['00:00:01.133', '00:00:02.245']
+    #           "timestamps": [start_timestamp, end_timestamp]   # ['00:00:01.133', '00:00:02.245']
     #       }
     #   ]
     #
     # }
-    Tasks.movie_scene_segmentation:
-    [OutputKeys.SPLIT_VIDEO_NUM, OutputKeys.SPLIT_META_LIST],
+    Tasks.movie_scene_segmentation: [
+        OutputKeys.SHOT_NUM, OutputKeys.SHOT_META_LIST, OutputKeys.SCENE_NUM,
+        OutputKeys.SCENE_META_LIST
+    ],
 
     # ============ nlp tasks ===================
diff --git a/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py b/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py
index 6704e4c0..3fffc546 100644
--- a/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py
+++ b/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py
@@ -60,9 +60,12 @@ class MovieSceneSegmentationPipeline(Pipeline):
 
     def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
         data = {'input_video_pth': self.input_video_pth, 'feat': inputs}
-        video_num, meta_lst = self.model.postprocess(data)
+        scene_num, scene_meta_lst, shot_num, shot_meta_lst = self.model.postprocess(
+            data)
         result = {
-            OutputKeys.SPLIT_VIDEO_NUM: video_num,
-            OutputKeys.SPLIT_META_LIST: meta_lst
+            OutputKeys.SHOT_NUM: shot_num,
+            OutputKeys.SHOT_META_LIST: shot_meta_lst,
+            OutputKeys.SCENE_NUM: scene_num,
+            OutputKeys.SCENE_META_LIST: scene_meta_lst
         }
         return result
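
Usage note (not part of the diff): a minimal sketch of how a caller could read the renamed and newly added output keys once this change lands. It assumes the standard modelscope pipeline entry point; the model ID string is a placeholder, not confirmed by this patch.

# Minimal usage sketch for the new movie-scene-segmentation output keys.
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Placeholder model ID; substitute the actual hub model ID.
segmenter = pipeline(
    task=Tasks.movie_scene_segmentation,
    model='damo/cv_resnet50-bert_video-scene-segmentation_movienet')
result = segmenter('path/to/movie.mp4')

# Shot-level metadata is now returned alongside the scene-level metadata.
print(result[OutputKeys.SHOT_NUM])            # e.g. 15
print(result[OutputKeys.SHOT_META_LIST][0])   # {'frame': [...], 'timestamps': [...]}
print(result[OutputKeys.SCENE_NUM])           # e.g. 3
print(result[OutputKeys.SCENE_META_LIST][0])  # {'shot': [...], 'frame': [...], 'timestamps': [...]}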