diff --git a/modelscope/outputs/outputs.py b/modelscope/outputs/outputs.py
index 949a91b5..30361b5d 100644
--- a/modelscope/outputs/outputs.py
+++ b/modelscope/outputs/outputs.py
@@ -435,9 +435,11 @@ TASK_OUTPUTS = {
 
     # referring video object segmentation result for a single video
     # {
-    #   "masks": [np.array # 2D array with shape [height, width]]
+    #   "masks": [np.array # 3D array with shape [frame_num, height, width]]
+    #   "timestamps": ["hh:mm:ss", "hh:mm:ss", "hh:mm:ss"]
     # }
-    Tasks.referring_video_object_segmentation: [OutputKeys.MASKS],
+    Tasks.referring_video_object_segmentation:
+    [OutputKeys.MASKS, OutputKeys.TIMESTAMPS],
 
     # ============ nlp tasks ===================
 
@@ -698,8 +700,9 @@ TASK_OUTPUTS = {
     #     "img_embedding": np.array with shape [1, D],
     #     "text_embedding": np.array with shape [1, D]
     # }
-    Tasks.multi_modal_embedding:
-    [OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING],
+    Tasks.multi_modal_embedding: [
+        OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING
+    ],
 
     # generative multi-modal embedding result for single sample
     # {
diff --git a/modelscope/pipelines/cv/referring_video_object_segmentation_pipeline.py b/modelscope/pipelines/cv/referring_video_object_segmentation_pipeline.py
index f0a717a5..dcbb5de0 100644
--- a/modelscope/pipelines/cv/referring_video_object_segmentation_pipeline.py
+++ b/modelscope/pipelines/cv/referring_video_object_segmentation_pipeline.py
@@ -52,17 +52,16 @@ class ReferringVideoObjectSegmentationPipeline(Pipeline):
         """
         assert isinstance(input, tuple) and len(
             input
-        ) == 4, 'error - input type must be tuple and input length must be 4'
-        self.input_video_pth, text_queries, start_pt, end_pt = input
+        ) == 2, 'error - input type must be tuple and input length must be 2'
+        self.input_video_pth, text_queries = input
 
-        assert 0 < end_pt - start_pt <= 10, 'error - the subclip length must be 0-10 seconds long'
         assert 1 <= len(
             text_queries) <= 2, 'error - 1-2 input text queries are expected'
 
         # extract the relevant subclip:
         self.input_clip_pth = 'input_clip.mp4'
         with VideoFileClip(self.input_video_pth) as video:
-            subclip = video.subclip(start_pt, end_pt)
+            subclip = video.subclip()
             subclip.write_videofile(self.input_clip_pth)
 
         self.window_length = 24  # length of window during inference
@@ -191,7 +190,16 @@ class ReferringVideoObjectSegmentationPipeline(Pipeline):
                 output_clip_path, fps=self.meta['video_fps'], audio=True)
             del masked_video
 
-        result = {OutputKeys.MASKS: inputs}
+        masks = [mask.squeeze(1) for mask in inputs]
+
+        fps = self.meta['video_fps']
+        output_timestamps = []
+        for frame_idx in range(self.video.shape[0]):
+            output_timestamps.append(timestamp_format(seconds=frame_idx / fps))
+        result = {
+            OutputKeys.MASKS: masks,
+            OutputKeys.TIMESTAMPS: output_timestamps
+        }
 
         return result
 
@@ -201,3 +209,10 @@ def apply_mask(image, mask, color, transparency=0.7):
     color_matrix = np.ones(image.shape, dtype=np.float) * color
     out_image = color_matrix * mask + image * (1.0 - mask)
     return out_image
+
+
+def timestamp_format(seconds):
+    m, s = divmod(seconds, 60)
+    h, m = divmod(m, 60)
+    time = '%02d:%02d:%06.3f' % (h, m, s)
+    return time
diff --git a/tests/pipelines/test_referring_video_object_segmentation.py b/tests/pipelines/test_referring_video_object_segmentation.py
index 3e81d9c3..509e9317 100644
--- a/tests/pipelines/test_referring_video_object_segmentation.py
+++ b/tests/pipelines/test_referring_video_object_segmentation.py
@@ -21,8 +21,7 @@ class ReferringVideoObjectSegmentationTest(unittest.TestCase,
             'guy in black performing tricks on a bike',
             'a black bike used to perform tricks'
         ]
-        start_pt, end_pt = 4, 14
-        input_tuple = (input_location, text_queries, start_pt, end_pt)
+        input_tuple = (input_location, text_queries)
         pp = pipeline(
             Tasks.referring_video_object_segmentation, model=self.model_id)
         result = pp(input_tuple)
@@ -38,8 +37,7 @@ class ReferringVideoObjectSegmentationTest(unittest.TestCase,
             'guy in black performing tricks on a bike',
             'a black bike used to perform tricks'
        ]
-        start_pt, end_pt = 4, 14
-        input_tuple = (input_location, text_queries, start_pt, end_pt)
+        input_tuple = (input_location, text_queries)
         pp = pipeline(Tasks.referring_video_object_segmentation)
         result = pp(input_tuple)
         if result:
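
For reference, a minimal sketch of how the pipeline is invoked after this change. This is not part of the patch: the video path and query string are placeholders, and it assumes a default model is registered for the task (as the second test above relies on).

```python
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Placeholder inputs; any readable video file and 1-2 text queries work.
input_location = 'movie.mp4'
text_queries = ['guy in black performing tricks on a bike']

# The input is now a 2-tuple (video_path, text_queries); the old
# start_pt/end_pt arguments are gone and the whole clip is processed.
pp = pipeline(Tasks.referring_video_object_segmentation)
result = pp((input_location, text_queries))

# Per the updated TASK_OUTPUTS contract: one 3D array per query with
# shape [frame_num, height, width], plus one timestamp per frame.
masks = result[OutputKeys.MASKS]
timestamps = result[OutputKeys.TIMESTAMPS]
print(len(timestamps), masks[0].shape)
```

Note that `timestamp_format` emits fractional seconds via `'%02d:%02d:%06.3f'`, e.g. `timestamp_format(125.5)` returns `'00:02:05.500'`, so the timestamps carry more precision than the `"hh:mm:ss"` shape sketched in the `TASK_OUTPUTS` comment.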