diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
index bab94aee26..15a44efa2d 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
@@ -38,7 +38,7 @@ int ConcatOpenCLKernel::RunAxis0() {
   auto dst_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
   cl::Image2D *out_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(dst_data));
   for (int i = 0; i < in_tensors_.size(); i++) {
-    auto src_data = in_tensors_[i]->data_c();
+    auto src_data = inputs_weight_ptrs_.at(i) == nullptr ? in_tensors_[i]->data_c() : inputs_weight_ptrs_.at(i);
     allocator_->GetImageSize(src_data, &img_size);
     auto src_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
     auto region = cl::array<cl::size_type, 3U>{img_size[0], img_size[1], 1};
@@ -160,10 +160,76 @@ void ConcatOpenCLKernel::SetGlobalLocal() {
   OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);
 }
 
+// Packs one const NHWC input into a zeroed NHWC4 host buffer of DstT and uploads it via allocator->Malloc.
+// Returns the pointer Malloc yields, or nullptr when the host buffer cannot be allocated or the tensor's data
+// type is neither float32 nor float16. data_size is the byte size used per element when zeroing the buffer.
+template <typename DstT, typename AllocatorT, typename ShapeT>
+static void *PackConstToImage(lite::Tensor *tensor, const AllocatorT &allocator, const std::vector<size_t> &img_size,
+                              const ShapeT &nhwc_shape, size_t data_size) {
+  size_t pack_weight_size = img_size[0] * img_size[1] * C4NUM;
+  int plane = nhwc_shape[1] * nhwc_shape[2];
+  int channel = nhwc_shape[3];
+  int batch = nhwc_shape[0];
+  DstT *weight = new (std::nothrow) DstT[pack_weight_size];
+  if (weight == nullptr) {
+    MS_LOG(ERROR) << "Malloc buffer failed!";
+    return nullptr;
+  }
+  memset(weight, 0x00, pack_weight_size * data_size);
+  if (tensor->data_type() == kNumberTypeFloat32) {
+    std::function<DstT(float)> to_dtype = [](float x) -> DstT { return static_cast<DstT>(x); };
+    PackNHWCToNHWC4<float, DstT>(tensor->data_c(), weight, batch, plane, channel, to_dtype);
+  } else if (tensor->data_type() == kNumberTypeFloat16) {
+    std::function<DstT(float16_t)> to_dtype = [](float16_t x) -> DstT { return static_cast<DstT>(x); };
+    PackNHWCToNHWC4<float16_t, DstT>(tensor->data_c(), weight, batch, plane, channel, to_dtype);
+  } else {
+    MS_LOG(ERROR) << "Unsupported data type of const input!";
+    delete[] weight;
+    return nullptr;
+  }
+  if (batch * plane * channel == 1) {
+    // scalar: replicate the single value into the remaining three lanes
+    weight[3] = weight[2] = weight[1] = weight[0];
+  }
+  void *weight_ptr = allocator->Malloc(pack_weight_size, img_size, weight);
+  delete[] weight;
+  return weight_ptr;
+}
+
+// Appends one entry per input to inputs_weight_ptrs: nullptr for a non-const input, otherwise the packed image.
+// Returns RET_ERROR at the first const input that cannot be converted; the list is then left partially filled,
+// so the caller must check the result before indexing it.
+int ConcatOpenCLKernel::ConvertWeightToTensor(const std::vector<lite::Tensor *> &in_tensors,
+                                              std::vector<void *> *inputs_weight_ptrs, bool fp16_enable,
+                                              size_t data_size) {
+  for (auto in_tensor_ : in_tensors) {
+    auto nhwc_shape = GetNHWCShape(in_tensor_->shape());
+    if (!in_tensor_->IsConst()) {
+      (*inputs_weight_ptrs).push_back(nullptr);
+      continue;
+    }
+    auto allocator = ocl_runtime_->GetAllocator();
+    std::vector<size_t> img_size = GetImage2dShapeFromNHWC(nhwc_shape, schema::Format_NHWC4);
+    img_size.push_back(fp16_enable ? CL_HALF_FLOAT : CL_FLOAT);
+    void *image = fp16_enable ? PackConstToImage<float16_t>(in_tensor_, allocator, img_size, nhwc_shape, data_size)
+                              : PackConstToImage<float>(in_tensor_, allocator, img_size, nhwc_shape, data_size);
+    // nullptr is reserved for non-const inputs, so a failed conversion is reported instead of stored.
+    if (image == nullptr) {
+      MS_LOG(ERROR) << "Convert const input to image failed!";
+      return RET_ERROR;
+    }
+    (*inputs_weight_ptrs).push_back(image);
+  }
+  return RET_OK;
+}
+
 int ConcatOpenCLKernel::Prepare() {
+  enable_fp16_ = ocl_runtime_->GetFp16Enable();
+  auto data_size = enable_fp16_ ? sizeof(float16_t) : sizeof(float);
+  ConvertWeightToTensor(in_tensors_, &inputs_weight_ptrs_, enable_fp16_, data_size);
   if (axis_ == 0) {
     for (int i = 0; i < in_tensors_.size(); ++i) {
-      if (in_tensors_.at(0)->shape().size() != 1) {
+      if (in_tensors_.at(i)->shape().size() != 1) {
         return RET_OK;
       }
     }
@@ -175,7 +241,7 @@ int ConcatOpenCLKernel::Prepare() {
       Align_ = false;
     }
   }
-  enable_fp16_ = ocl_runtime_->GetFp16Enable();
+
   std::string kernel_name = "Concat";
   if (axis_ == 3 && !Align_) {
     kernel_name += "Input" + std::to_string(in_tensors_.size()) + "UnAlign";
@@ -202,7 +268,8 @@ int ConcatOpenCLKernel::Run() {
   }
   int arg_cn = 0;
   for (int i = 0; i < in_tensors_.size(); ++i) {
-    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[i]->data_c());
+    auto input_ptr = inputs_weight_ptrs_.at(i) == nullptr ? in_tensors_[i]->data_c() : inputs_weight_ptrs_.at(i);
+    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_ptr);
   }
   if (axis_ == 3 && !Align_) {
     ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h
index dd5960d4ac..9660d6cadc 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h
@@ -43,6 +43,7 @@ class ConcatOpenCLKernel : public OpenCLKernel {
   uint32_t OC = {1};
   std::vector<size_t> global;
   bool Align_{true};
+  std::vector<void *> inputs_weight_ptrs_;
   bool enable_fp16_{false};
   cl_int stride_w{1};
   cl_int4 in_shape_{};
@@ -51,6 +52,8 @@ class ConcatOpenCLKernel : public OpenCLKernel {
 
  private:
   int RunAxis0();
+  int ConvertWeightToTensor(const std::vector<lite::Tensor *> &in_tensors, std::vector<void *> *inputs_weight_ptrs,
+                            bool fp16_enable, size_t data_size);
 };
 
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.cc
index 5be0ad12a6..24d01091df 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.cc
@@ -60,7 +60,7 @@ void StrassenOpenCLKernel::AllocatorMemoryForStrassen(int NumA, int NumB) {
   img_size.push_back(UP_DIV(NumA, C4NUM));
   img_size.push_back(NumA);
   size_t img_dtype = enable_fp16_ ? CL_HALF_FLOAT : CL_FLOAT;
-  size_t dtype_size = enable_fp16_ ? sizeof(CL_HALF_FLOAT) : sizeof(CL_FLOAT);
+  size_t dtype_size = enable_fp16_ ? sizeof(cl_half) : sizeof(cl_float);
   img_size.push_back(img_dtype);
   auto allocator = ocl_runtime_->GetAllocator();
   size_t memA = NumA * NumA;
@@ -178,29 +178,6 @@ void StrassenOpenCLKernel::SetConstArgs() {
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, shape_offset);
 }
 
-// OriginSize = N*H*W*C  typesize = sizeof(type data)  width = W * UP_DIV(C,C4NUM)  size = N
-void StrassenOpenCLKernel::PrintImage2d(void *IMGData, size_t typesize, size_t width, size_t size) {
-  auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
-  int alignment = runtime_wrapper.GetInstance()->GetImagePitchAlignment();
-  auto runtime = runtime_wrapper.GetInstance();
-  runtime->SyncCommandQueue();
-  MS_ASSERT(alignment);
-  size_t row_pitch = UP_ROUND(width, alignment) * typesize * C4NUM;
-  size_t OriginSize = size * size * typesize;
-  std::vector<char> data(OriginSize);
-  auto row_size = width * typesize * C4NUM;
-
-  for (int i = 0; i < size; ++i) {
-    memcpy(reinterpret_cast<char *>(data.data()) + i * row_size, static_cast<char *>(IMGData) + i * row_pitch,
-           row_size);
-  }
-  for (int i = 0; i < size * size; ++i) {
-    if ((i + 1) % size == 0) {
-      std::cout << std::endl;
-    }
-  }
-}
-
 void StrassenOpenCLKernel::StrassenDataFilled(cl::Kernel *kernel, void *input, void *output, const int size,
                                               cl_int2 offset, lite::opencl::MemType mem_type) {
   if (input == nullptr || output == nullptr) {
@@ -344,7 +321,7 @@ void StrassenOpenCLKernel::DoStrassen(void *data, void *weight, void *result, co
 
 int StrassenOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running!";
-  int threshold = 0;
+  int threshold;
   const int up_bound = 1024;
   const int down_bound = 256;
   if (in_tensors_.at(0)->shape()[0] >= up_bound) {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.h
index db7432d100..0aa9893f2e 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.h
@@ -48,7 +48,6 @@ class StrassenOpenCLKernel : public MatMulOpenCLKernel {
   void StrassenBackResult(cl::Kernel *kernel, void *input1, void *input2, void *input3, void *input4, void *input5,
                           void *input6, void *input7, void *output, const int size);
   void StrassenRunMmatmul(void *input, void *weight, void *output, const int size);
-  void PrintImage2d(void *IMGData, size_t typesize, size_t width, size_t size);
   cl::Kernel kernel_IMG_add_sub_2;
   cl::Kernel MatMul_StrassenBUFFilled;
   cl::Kernel MatMul_StrassenIMGFilled;