Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10328357master
| @@ -67,7 +67,6 @@ class MovieSceneSegmentationModel(TorchModel): | |||||
| mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) | mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) | ||||
| ]) | ]) | ||||
| self.infer_result = {'vid': [], 'sid': [], 'pred': []} | |||||
| sampling_method = self.cfg.dataset.sampling_method.name | sampling_method = self.cfg.dataset.sampling_method.name | ||||
| self.neighbor_size = self.cfg.dataset.sampling_method.params[ | self.neighbor_size = self.cfg.dataset.sampling_method.params[ | ||||
| sampling_method].neighbor_size | sampling_method].neighbor_size | ||||
| @@ -104,6 +103,8 @@ class MovieSceneSegmentationModel(TorchModel): | |||||
| shot_num = len(sids) | shot_num = len(sids) | ||||
| cnt = shot_num // bs + 1 | cnt = shot_num // bs + 1 | ||||
| infer_sid, infer_pred = [], [] | |||||
| infer_result = {} | |||||
| for i in range(cnt): | for i in range(cnt): | ||||
| start = i * bs | start = i * bs | ||||
| end = (i + 1) * bs if (i + 1) * bs < shot_num else shot_num | end = (i + 1) * bs if (i + 1) * bs < shot_num else shot_num | ||||
| @@ -112,13 +113,14 @@ class MovieSceneSegmentationModel(TorchModel): | |||||
| input_ = torch.stack(input_) | input_ = torch.stack(input_) | ||||
| outputs = self.shared_step(input_) # shape [b,2] | outputs = self.shared_step(input_) # shape [b,2] | ||||
| prob = F.softmax(outputs, dim=1) | prob = F.softmax(outputs, dim=1) | ||||
| self.infer_result['sid'].extend(sid_.cpu().detach().numpy()) | |||||
| self.infer_result['pred'].extend(prob[:, 1].cpu().detach().numpy()) | |||||
| self.infer_result['pred'] = np.stack(self.infer_result['pred']) | |||||
| infer_sid.extend(sid_.cpu().detach().numpy()) | |||||
| infer_pred.extend(prob[:, 1].cpu().detach().numpy()) | |||||
| infer_result.update({'pred': np.stack(infer_pred)}) | |||||
| infer_result.update({'sid': infer_sid}) | |||||
| assert len(self.infer_result['sid']) == len(sids) | |||||
| assert len(self.infer_result['pred']) == len(inputs) | |||||
| return self.infer_result | |||||
| assert len(infer_result['sid']) == len(sids) | |||||
| assert len(infer_result['pred']) == len(inputs) | |||||
| return infer_result | |||||
| def shared_step(self, inputs): | def shared_step(self, inputs): | ||||
| with torch.no_grad(): | with torch.no_grad(): | ||||
| @@ -162,11 +164,12 @@ class MovieSceneSegmentationModel(TorchModel): | |||||
| thres = self.cfg.pipeline.save_threshold | thres = self.cfg.pipeline.save_threshold | ||||
| anno_dict = get_pred_boundary(pred_dict, thres) | anno_dict = get_pred_boundary(pred_dict, thres) | ||||
| scene_dict_lst, scene_list = pred2scene(self.shot2keyf, anno_dict) | |||||
| scene_dict_lst, scene_list, shot_num, shot_dict_lst = pred2scene( | |||||
| self.shot2keyf, anno_dict) | |||||
| if self.cfg.pipeline.save_split_scene: | if self.cfg.pipeline.save_split_scene: | ||||
| re_dir = scene2video(inputs['input_video_pth'], scene_list, thres) | re_dir = scene2video(inputs['input_video_pth'], scene_list, thres) | ||||
| print(f'Split scene video saved to {re_dir}') | print(f'Split scene video saved to {re_dir}') | ||||
| return len(scene_list), scene_dict_lst | |||||
| return len(scene_list), scene_dict_lst, shot_num, shot_dict_lst | |||||
| def preprocess(self, inputs): | def preprocess(self, inputs): | ||||
| logger.info('Begin shot detect......') | logger.info('Begin shot detect......') | ||||
| @@ -22,15 +22,23 @@ def pred2scene(shot2keyf, anno_dict): | |||||
| scene_list, pair_list = get_demo_scene_list(shot2keyf, anno_dict) | scene_list, pair_list = get_demo_scene_list(shot2keyf, anno_dict) | ||||
| scene_dict_lst = [] | scene_dict_lst = [] | ||||
| shot_num = len(shot2keyf) | |||||
| shot_dict_lst = [] | |||||
| for item in shot2keyf: | |||||
| tmp = item.split(' ') | |||||
| shot_dict_lst.append({ | |||||
| 'frame': [tmp[0], tmp[1]], | |||||
| 'timestamps': [tmp[-2], tmp[-1]] | |||||
| }) | |||||
| assert len(scene_list) == len(pair_list) | assert len(scene_list) == len(pair_list) | ||||
| for scene_ind, scene_item in enumerate(scene_list): | for scene_ind, scene_item in enumerate(scene_list): | ||||
| scene_dict_lst.append({ | scene_dict_lst.append({ | ||||
| 'shot': pair_list[scene_ind], | 'shot': pair_list[scene_ind], | ||||
| 'frame': scene_item[0], | 'frame': scene_item[0], | ||||
| 'timestamp': scene_item[1] | |||||
| 'timestamps': scene_item[1] | |||||
| }) | }) | ||||
| return scene_dict_lst, scene_list | |||||
| return scene_dict_lst, scene_list, shot_num, shot_dict_lst | |||||
| def scene2video(source_movie_fn, scene_list, thres): | def scene2video(source_movie_fn, scene_list, thres): | ||||
| @@ -38,8 +38,10 @@ class OutputKeys(object): | |||||
| KWS_LIST = 'kws_list' | KWS_LIST = 'kws_list' | ||||
| HISTORY = 'history' | HISTORY = 'history' | ||||
| TIMESTAMPS = 'timestamps' | TIMESTAMPS = 'timestamps' | ||||
| SPLIT_VIDEO_NUM = 'split_video_num' | |||||
| SPLIT_META_LIST = 'split_meta_list' | |||||
| SHOT_NUM = 'shot_num' | |||||
| SCENE_NUM = 'scene_num' | |||||
| SCENE_META_LIST = 'scene_meta_list' | |||||
| SHOT_META_LIST = 'shot_meta_list' | |||||
| TASK_OUTPUTS = { | TASK_OUTPUTS = { | ||||
| @@ -309,19 +311,30 @@ TASK_OUTPUTS = { | |||||
| Tasks.shop_segmentation: [OutputKeys.MASKS], | Tasks.shop_segmentation: [OutputKeys.MASKS], | ||||
| # movie scene segmentation result for a single video | # movie scene segmentation result for a single video | ||||
| # { | # { | ||||
| # "split_video_num":3, | |||||
| # "split_meta_list": | |||||
| # "shot_num":15, | |||||
| # "shot_meta_list": | |||||
| # [ | |||||
| # { | |||||
| # "frame": [start_frame, end_frame], | |||||
| # "timestamps": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245'] | |||||
| # | |||||
| # } | |||||
| # ], | |||||
| # "scene_num":3, | |||||
| # "scene_meta_list": | |||||
| # [ | # [ | ||||
| # { | # { | ||||
| # "shot": [0,1,2], | # "shot": [0,1,2], | ||||
| # "frame": [start_frame, end_frame], | # "frame": [start_frame, end_frame], | ||||
| # "timestamp": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245'] | |||||
| # "timestamps": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245'] | |||||
| # } | # } | ||||
| # ] | # ] | ||||
| # | # | ||||
| # } | # } | ||||
| Tasks.movie_scene_segmentation: | |||||
| [OutputKeys.SPLIT_VIDEO_NUM, OutputKeys.SPLIT_META_LIST], | |||||
| Tasks.movie_scene_segmentation: [ | |||||
| OutputKeys.SHOT_NUM, OutputKeys.SHOT_META_LIST, OutputKeys.SCENE_NUM, | |||||
| OutputKeys.SCENE_META_LIST | |||||
| ], | |||||
| # ============ nlp tasks =================== | # ============ nlp tasks =================== | ||||
| @@ -60,9 +60,12 @@ class MovieSceneSegmentationPipeline(Pipeline): | |||||
| def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | ||||
| data = {'input_video_pth': self.input_video_pth, 'feat': inputs} | data = {'input_video_pth': self.input_video_pth, 'feat': inputs} | ||||
| video_num, meta_lst = self.model.postprocess(data) | |||||
| scene_num, scene_meta_lst, shot_num, shot_meta_lst = self.model.postprocess( | |||||
| data) | |||||
| result = { | result = { | ||||
| OutputKeys.SPLIT_VIDEO_NUM: video_num, | |||||
| OutputKeys.SPLIT_META_LIST: meta_lst | |||||
| OutputKeys.SHOT_NUM: shot_num, | |||||
| OutputKeys.SHOT_META_LIST: shot_meta_lst, | |||||
| OutputKeys.SCENE_NUM: scene_num, | |||||
| OutputKeys.SCENE_META_LIST: scene_meta_lst | |||||
| } | } | ||||
| return result | return result | ||||