diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py
index 342ba6b5..f38ff8ae 100644
--- a/modelscope/preprocessors/multi_modal.py
+++ b/modelscope/preprocessors/multi_modal.py
@@ -159,7 +159,8 @@ class MPlugPreprocessor(Preprocessor):
         image = image.convert('RGB')
         image = self.patch_resize_transform(image)
         question = '' if self.cfg.task == Tasks.image_captioning \
-            else data[1 if isinstance(data, tuple) else 'question']
+            else data[1 if isinstance(data, tuple)
+                      else ('text' if 'text' in data else 'question')]
         question = self.tokenizer(
             question.lower(),
             padding='max_length',
diff --git a/tests/pipelines/test_mplug_tasks.py b/tests/pipelines/test_mplug_tasks.py
index 273d3105..a3ace62d 100644
--- a/tests/pipelines/test_mplug_tasks.py
+++ b/tests/pipelines/test_mplug_tasks.py
@@ -44,8 +44,8 @@ class MplugTasksTest(unittest.TestCase, DemoCompatibilityCheck):
             'damo/mplug_visual-question-answering_coco_large_en')
         pipeline_vqa = pipeline(Tasks.visual_question_answering, model=model)
         image = Image.open('data/test/images/image_mplug_vqa.jpg')
-        question = 'What is the woman doing?'
-        input = {'image': image, 'question': question}
+        text = 'What is the woman doing?'
+        input = {'image': image, 'text': text}
         result = pipeline_vqa(input)
         print(result)
@@ -54,8 +54,8 @@ class MplugTasksTest(unittest.TestCase, DemoCompatibilityCheck):
         model = 'damo/mplug_visual-question-answering_coco_large_en'
         pipeline_vqa = pipeline(Tasks.visual_question_answering, model=model)
         image = Image.open('data/test/images/image_mplug_vqa.jpg')
-        question = 'What is the woman doing?'
-        input = {'image': image, 'question': question}
+        text = 'What is the woman doing?'
+        input = {'image': image, 'text': text}
         result = pipeline_vqa(input)
         print(result)
@@ -65,8 +65,8 @@ class MplugTasksTest(unittest.TestCase, DemoCompatibilityCheck):
             'damo/mplug_image-text-retrieval_flickr30k_large_en')
         pipeline_retrieval = pipeline(Tasks.image_text_retrieval, model=model)
         image = Image.open('data/test/images/image-text-retrieval.jpg')
-        question = 'Two young guys with shaggy hair look at their hands while hanging out in the yard.'
-        input = {'image': image, 'question': question}
+        text = 'Two young guys with shaggy hair look at their hands while hanging out in the yard.'
+        input = {'image': image, 'text': text}
         result = pipeline_retrieval(input)
         print(result)
@@ -75,8 +75,8 @@ class MplugTasksTest(unittest.TestCase, DemoCompatibilityCheck):
         model = 'damo/mplug_image-text-retrieval_flickr30k_large_en'
         pipeline_retrieval = pipeline(Tasks.image_text_retrieval, model=model)
         image = Image.open('data/test/images/image-text-retrieval.jpg')
-        question = 'Two young guys with shaggy hair look at their hands while hanging out in the yard.'
-        input = {'image': image, 'question': question}
+        text = 'Two young guys with shaggy hair look at their hands while hanging out in the yard.'
+        input = {'image': image, 'text': text}
        result = pipeline_retrieval(input)
         print(result)
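
Usage note (not part of the patch): the preprocessor change above makes dict inputs carrying a 'text' key work for MPLUG VQA and retrieval, while falling back to 'question' if 'text' is absent. Below is a minimal sketch mirroring the updated test; the import paths for pipeline and Tasks are assumed from the ModelScope library layout, and the model id and image path are taken from the tests above.

    from PIL import Image

    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    # Build the VQA pipeline with the same model used in the tests.
    pipeline_vqa = pipeline(
        Tasks.visual_question_answering,
        model='damo/mplug_visual-question-answering_coco_large_en')
    image = Image.open('data/test/images/image_mplug_vqa.jpg')

    # New-style input: the question is passed under the 'text' key;
    # the old 'question' key still works via the fallback in the preprocessor.
    result = pipeline_vqa({'image': image, 'text': 'What is the woman doing?'})
    print(result)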