[to #42322933] format outputs for movie scene segmentation demo

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10328357
3 years ago · 38a399cf38
--- a/modelscope/models/cv/movie_scene_segmentation/model.py
+++ b/modelscope/models/cv/movie_scene_segmentation/model.py
@@ -67,7 +67,6 @@ class MovieSceneSegmentationModel(TorchModel):
                mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        self.infer_result = {'vid': [], 'sid': [], 'pred': []}
        sampling_method = self.cfg.dataset.sampling_method.name
        self.neighbor_size = self.cfg.dataset.sampling_method.params[
            sampling_method].neighbor_size
@@ -104,6 +103,8 @@ class MovieSceneSegmentationModel(TorchModel):
        shot_num = len(sids)
        cnt = shot_num // bs + 1
        infer_sid, infer_pred = [], []
        infer_result = {}
        for i in range(cnt):
            start = i * bs
            end = (i + 1) * bs if (i + 1) * bs < shot_num else shot_num
@@ -112,13 +113,14 @@ class MovieSceneSegmentationModel(TorchModel):
            input_ = torch.stack(input_)
            outputs = self.shared_step(input_)  # shape [b,2]
            prob = F.softmax(outputs, dim=1)
            self.infer_result['sid'].extend(sid_.cpu().detach().numpy())
            self.infer_result['pred'].extend(prob[:, 1].cpu().detach().numpy())
        self.infer_result['pred'] = np.stack(self.infer_result['pred'])
            infer_sid.extend(sid_.cpu().detach().numpy())
            infer_pred.extend(prob[:, 1].cpu().detach().numpy())
        infer_result.update({'pred': np.stack(infer_pred)})
        infer_result.update({'sid': infer_sid})
        assert len(self.infer_result['sid']) == len(sids)
        assert len(self.infer_result['pred']) == len(inputs)
        return self.infer_result
        assert len(infer_result['sid']) == len(sids)
        assert len(infer_result['pred']) == len(inputs)
        return infer_result
    def shared_step(self, inputs):
        with torch.no_grad():
@@ -162,11 +164,12 @@ class MovieSceneSegmentationModel(TorchModel):
        thres = self.cfg.pipeline.save_threshold
        anno_dict = get_pred_boundary(pred_dict, thres)
        scene_dict_lst, scene_list = pred2scene(self.shot2keyf, anno_dict)
        scene_dict_lst, scene_list, shot_num, shot_dict_lst = pred2scene(
            self.shot2keyf, anno_dict)
        if self.cfg.pipeline.save_split_scene:
            re_dir = scene2video(inputs['input_video_pth'], scene_list, thres)
            print(f'Split scene video saved to {re_dir}')
        return len(scene_list), scene_dict_lst
        return len(scene_list), scene_dict_lst, shot_num, shot_dict_lst
    def preprocess(self, inputs):
        logger.info('Begin shot detect......')
--- a/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py
+++ b/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py
@@ -22,15 +22,23 @@ def pred2scene(shot2keyf, anno_dict):
    scene_list, pair_list = get_demo_scene_list(shot2keyf, anno_dict)
    scene_dict_lst = []
    shot_num = len(shot2keyf)
    shot_dict_lst = []
    for item in shot2keyf:
        tmp = item.split(' ')
        shot_dict_lst.append({
            'frame': [tmp[0], tmp[1]],
            'timestamps': [tmp[-2], tmp[-1]]
        })
    assert len(scene_list) == len(pair_list)
    for scene_ind, scene_item in enumerate(scene_list):
        scene_dict_lst.append({
            'shot': pair_list[scene_ind],
            'frame': scene_item[0],
            'timestamp': scene_item[1]
            'timestamps': scene_item[1]
        })
    return scene_dict_lst, scene_list
    return scene_dict_lst, scene_list, shot_num, shot_dict_lst
 def scene2video(source_movie_fn, scene_list, thres):
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -38,8 +38,10 @@ class OutputKeys(object):
    KWS_LIST = 'kws_list'
    HISTORY = 'history'
    TIMESTAMPS = 'timestamps'
    SPLIT_VIDEO_NUM = 'split_video_num'
    SPLIT_META_LIST = 'split_meta_list'
    SHOT_NUM = 'shot_num'
    SCENE_NUM = 'scene_num'
    SCENE_META_LIST = 'scene_meta_list'
    SHOT_META_LIST = 'shot_meta_list'
 TASK_OUTPUTS = {
@@ -309,19 +311,30 @@ TASK_OUTPUTS = {
    Tasks.shop_segmentation: [OutputKeys.MASKS],
    # movie scene segmentation result for a single video
    # {
    #        "split_video_num":3,
    #        "split_meta_list":
    #        "shot_num":15,
    #        "shot_meta_list":
    #        [
    #           {
    #               "frame": [start_frame, end_frame],
    #               "timestamps": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245']
    #
    #           }
    #        ],
    #        "scene_num":3,
    #        "scene_meta_list":
    #        [
    #           {
    #               "shot": [0,1,2],
    #               "frame": [start_frame, end_frame],
    #               "timestamp": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245']
    #               "timestamps": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245']
    #           }
    #        ]
    #
    # }
    Tasks.movie_scene_segmentation:
    [OutputKeys.SPLIT_VIDEO_NUM, OutputKeys.SPLIT_META_LIST],
    Tasks.movie_scene_segmentation: [
        OutputKeys.SHOT_NUM, OutputKeys.SHOT_META_LIST, OutputKeys.SCENE_NUM,
        OutputKeys.SCENE_META_LIST
    ],
    # ============ nlp tasks ===================
--- a/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py
+++ b/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py
@@ -60,9 +60,12 @@ class MovieSceneSegmentationPipeline(Pipeline):
    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        data = {'input_video_pth': self.input_video_pth, 'feat': inputs}
        video_num, meta_lst = self.model.postprocess(data)
        scene_num, scene_meta_lst, shot_num, shot_meta_lst = self.model.postprocess(
            data)
        result = {
            OutputKeys.SPLIT_VIDEO_NUM: video_num,
            OutputKeys.SPLIT_META_LIST: meta_lst
            OutputKeys.SHOT_NUM: shot_num,
            OutputKeys.SHOT_META_LIST: shot_meta_lst,
            OutputKeys.SCENE_NUM: scene_num,
            OutputKeys.SCENE_META_LIST: scene_meta_lst
        }
        return result