Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10328357master
| @@ -67,7 +67,6 @@ class MovieSceneSegmentationModel(TorchModel): | |||
| mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) | |||
| ]) | |||
| self.infer_result = {'vid': [], 'sid': [], 'pred': []} | |||
| sampling_method = self.cfg.dataset.sampling_method.name | |||
| self.neighbor_size = self.cfg.dataset.sampling_method.params[ | |||
| sampling_method].neighbor_size | |||
| @@ -104,6 +103,8 @@ class MovieSceneSegmentationModel(TorchModel): | |||
| shot_num = len(sids) | |||
| cnt = shot_num // bs + 1 | |||
| infer_sid, infer_pred = [], [] | |||
| infer_result = {} | |||
| for i in range(cnt): | |||
| start = i * bs | |||
| end = (i + 1) * bs if (i + 1) * bs < shot_num else shot_num | |||
| @@ -112,13 +113,14 @@ class MovieSceneSegmentationModel(TorchModel): | |||
| input_ = torch.stack(input_) | |||
| outputs = self.shared_step(input_) # shape [b,2] | |||
| prob = F.softmax(outputs, dim=1) | |||
| self.infer_result['sid'].extend(sid_.cpu().detach().numpy()) | |||
| self.infer_result['pred'].extend(prob[:, 1].cpu().detach().numpy()) | |||
| self.infer_result['pred'] = np.stack(self.infer_result['pred']) | |||
| infer_sid.extend(sid_.cpu().detach().numpy()) | |||
| infer_pred.extend(prob[:, 1].cpu().detach().numpy()) | |||
| infer_result.update({'pred': np.stack(infer_pred)}) | |||
| infer_result.update({'sid': infer_sid}) | |||
| assert len(self.infer_result['sid']) == len(sids) | |||
| assert len(self.infer_result['pred']) == len(inputs) | |||
| return self.infer_result | |||
| assert len(infer_result['sid']) == len(sids) | |||
| assert len(infer_result['pred']) == len(inputs) | |||
| return infer_result | |||
| def shared_step(self, inputs): | |||
| with torch.no_grad(): | |||
| @@ -162,11 +164,12 @@ class MovieSceneSegmentationModel(TorchModel): | |||
| thres = self.cfg.pipeline.save_threshold | |||
| anno_dict = get_pred_boundary(pred_dict, thres) | |||
| scene_dict_lst, scene_list = pred2scene(self.shot2keyf, anno_dict) | |||
| scene_dict_lst, scene_list, shot_num, shot_dict_lst = pred2scene( | |||
| self.shot2keyf, anno_dict) | |||
| if self.cfg.pipeline.save_split_scene: | |||
| re_dir = scene2video(inputs['input_video_pth'], scene_list, thres) | |||
| print(f'Split scene video saved to {re_dir}') | |||
| return len(scene_list), scene_dict_lst | |||
| return len(scene_list), scene_dict_lst, shot_num, shot_dict_lst | |||
| def preprocess(self, inputs): | |||
| logger.info('Begin shot detect......') | |||
| @@ -22,15 +22,23 @@ def pred2scene(shot2keyf, anno_dict): | |||
| scene_list, pair_list = get_demo_scene_list(shot2keyf, anno_dict) | |||
| scene_dict_lst = [] | |||
| shot_num = len(shot2keyf) | |||
| shot_dict_lst = [] | |||
| for item in shot2keyf: | |||
| tmp = item.split(' ') | |||
| shot_dict_lst.append({ | |||
| 'frame': [tmp[0], tmp[1]], | |||
| 'timestamps': [tmp[-2], tmp[-1]] | |||
| }) | |||
| assert len(scene_list) == len(pair_list) | |||
| for scene_ind, scene_item in enumerate(scene_list): | |||
| scene_dict_lst.append({ | |||
| 'shot': pair_list[scene_ind], | |||
| 'frame': scene_item[0], | |||
| 'timestamp': scene_item[1] | |||
| 'timestamps': scene_item[1] | |||
| }) | |||
| return scene_dict_lst, scene_list | |||
| return scene_dict_lst, scene_list, shot_num, shot_dict_lst | |||
| def scene2video(source_movie_fn, scene_list, thres): | |||
| @@ -38,8 +38,10 @@ class OutputKeys(object): | |||
| KWS_LIST = 'kws_list' | |||
| HISTORY = 'history' | |||
| TIMESTAMPS = 'timestamps' | |||
| SPLIT_VIDEO_NUM = 'split_video_num' | |||
| SPLIT_META_LIST = 'split_meta_list' | |||
| SHOT_NUM = 'shot_num' | |||
| SCENE_NUM = 'scene_num' | |||
| SCENE_META_LIST = 'scene_meta_list' | |||
| SHOT_META_LIST = 'shot_meta_list' | |||
| TASK_OUTPUTS = { | |||
| @@ -309,19 +311,30 @@ TASK_OUTPUTS = { | |||
| Tasks.shop_segmentation: [OutputKeys.MASKS], | |||
| # movie scene segmentation result for a single video | |||
| # { | |||
| # "split_video_num":3, | |||
| # "split_meta_list": | |||
| # "shot_num":15, | |||
| # "shot_meta_list": | |||
| # [ | |||
| # { | |||
| # "frame": [start_frame, end_frame], | |||
| # "timestamps": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245'] | |||
| # | |||
| # } | |||
| # ] | |||
| # "scene_num":3, | |||
| # "scene_meta_list": | |||
| # [ | |||
| # { | |||
| # "shot": [0,1,2], | |||
| # "frame": [start_frame, end_frame], | |||
| # "timestamp": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245'] | |||
| # "timestamps": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245'] | |||
| # } | |||
| # ] | |||
| # | |||
| # } | |||
| Tasks.movie_scene_segmentation: | |||
| [OutputKeys.SPLIT_VIDEO_NUM, OutputKeys.SPLIT_META_LIST], | |||
| Tasks.movie_scene_segmentation: [ | |||
| OutputKeys.SHOT_NUM, OutputKeys.SHOT_META_LIST, OutputKeys.SCENE_NUM, | |||
| OutputKeys.SCENE_META_LIST | |||
| ], | |||
| # ============ nlp tasks =================== | |||
| @@ -60,9 +60,12 @@ class MovieSceneSegmentationPipeline(Pipeline): | |||
| def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | |||
| data = {'input_video_pth': self.input_video_pth, 'feat': inputs} | |||
| video_num, meta_lst = self.model.postprocess(data) | |||
| scene_num, scene_meta_lst, shot_num, shot_meta_lst = self.model.postprocess( | |||
| data) | |||
| result = { | |||
| OutputKeys.SPLIT_VIDEO_NUM: video_num, | |||
| OutputKeys.SPLIT_META_LIST: meta_lst | |||
| OutputKeys.SHOT_NUM: shot_num, | |||
| OutputKeys.SHOT_META_LIST: shot_meta_lst, | |||
| OutputKeys.SCENE_NUM: scene_num, | |||
| OutputKeys.SCENE_META_LIST: scene_meta_lst | |||
| } | |||
| return result | |||