diff --git a/mindspore/lite/schema/model.fbs b/mindspore/lite/schema/model.fbs index 733fc28d76..0c0ad360b6 100644 --- a/mindspore/lite/schema/model.fbs +++ b/mindspore/lite/schema/model.fbs @@ -192,7 +192,8 @@ union PrimitiveType { SpaceToBatchND, TopKV2, Return, - MakeTuple + MakeTuple, + ToFormat, } enum QuantType: int { diff --git a/mindspore/lite/schema/ops.fbs b/mindspore/lite/schema/ops.fbs index e6584dc059..2e93983c16 100644 --- a/mindspore/lite/schema/ops.fbs +++ b/mindspore/lite/schema/ops.fbs @@ -881,5 +881,10 @@ table TopKV2 { table MakeTuple { } +table ToFormat { + srcT: int; + dstT: int; +} + table Return { } \ No newline at end of file diff --git a/mindspore/lite/src/lite_kernel.h b/mindspore/lite/src/lite_kernel.h index d278b27351..36b90701d0 100644 --- a/mindspore/lite/src/lite_kernel.h +++ b/mindspore/lite/src/lite_kernel.h @@ -131,6 +131,10 @@ class LiteKernel { void AddOutKernel(LiteKernel *kernel) { this->out_kernels_.emplace_back(kernel); } + void SetInKernel(const std::vector &kernel) { this->in_kernels_ = kernel; } + + void SetOutKernel(const std::vector &kernel) { this->out_kernels_ = kernel; } + std::vector &in_kernels() { return this->in_kernels_; } std::vector &out_kernels() { return this->out_kernels_; } @@ -167,16 +171,14 @@ class SubGraphKernel : public LiteKernel { public: explicit SubGraphKernel(const std::vector &inputs, const std::vector &outputs, - const std::vector &inKernels, - const std::vector &outKernels, + const std::vector &in_kernels, + const std::vector &out_kernels, const std::vector &nodes, const lite::Context *ctx, const lite::Primitive *primitive) - : LiteKernel(nullptr, inputs, outputs, ctx, primitive), - inputs_(inputs), - outputs_(outputs), - inkernels_(inKernels), - outkernels_(outKernels), - nodes_(nodes) {} + : LiteKernel(nullptr, inputs, outputs, ctx, primitive), nodes_(nodes) { + in_kernels_ = in_kernels; + out_kernels_ = out_kernels; + } virtual int Init() { return -1; } virtual int InferShape() { return -1; 
} @@ -184,10 +186,6 @@ class SubGraphKernel : public LiteKernel { virtual int Run() { return -1; } protected: - std::vector inputs_; - std::vector outputs_; - std::vector inkernels_; - std::vector outkernels_; std::vector nodes_; }; diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl index 6438f24e6b..33d6143f9d 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl @@ -49,9 +49,6 @@ __kernel void ElementDiv(__read_only image2d_t input_a, __read_only image2d_t in float4 a = read_imagef(input_a, smp_none, (int2)(X, Y)); float4 b = read_imagef(input_b, smp_none, (int2)(X, Y)); - if (b == 0) { - return; - } write_imagef(output, (int2)(X, Y), a / b); } diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/to_format.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/to_format.cl new file mode 100644 index 0000000000..0d877b778c --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/to_format.cl @@ -0,0 +1,235 @@ +#define FLT float +#define FLT4 float4 +#define READ_IMAGE read_imagef +#define WRITE_IMAGE write_imagef +// enum Format { +// Format_NCHW = 0, +// Format_NHWC = 1, +// Format_NHWC4 = 2, +// Format_HWKC = 3, +// Format_HWCK = 4, +// Format_KCHW = 5, +// Format_CKHW = 6, +// Format_KHWC = 7, +// Format_CHWK = 8, +// Format_NC4HW4 = 100, +// Format_NUM_OF_FORMAT = 101, +// Format_MIN = Format_NCHW, +// Format_MAX = Format_NUM_OF_FORMAT +//}; +__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; +__kernel void to_format_NCHW_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + // 
WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); +} +__kernel void to_format_NHWC_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + int offset = (X * shape.z + Y) * shape.w + Z * 4; + __global FLT *src_addr = (__global FLT *)src_data; + src_addr += offset; + FLT4 data = (FLT4)(0.f); + if ((Z + 1) * 4 <= shape.w) { + data = ((__global FLT4 *)src_addr)[0]; + } else { + if ((shape.w - Z * 4) >= 1) { + data.x = src_addr[0]; + } + if ((shape.w - Z * 4) >= 2) { + data.y = src_addr[1]; + } + if ((shape.w - Z * 4) >= 3) { + data.z = src_addr[2]; + } + } + WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data); +} +__kernel void to_format_NHWC4_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); +} +__kernel void to_format_NC4HW4_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); +} +__kernel void to_format_NCHW_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), 
READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); +} +__kernel void to_format_NHWC_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); +} +__kernel void to_format_NHWC4_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); +} +__kernel void to_format_NC4HW4_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + // FLT4 src_final = src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))]; + WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), src_data[(Y * size.z + Z) * size.x + X]); +} + +__kernel void to_format_NCHW_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + dst_data[(Z * size.y + Y) * size.x + X] = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.x + X, Z)); +} +__kernel void to_format_NHWC_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + // WRITE_IMAGE(dst_data, 
(int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); +} +__kernel void to_format_NHWC4_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); +} +__kernel void to_format_NC4HW4_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); +} +__kernel void to_format_NCHW_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); +} +__kernel void to_format_NHWC_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); +} +__kernel void to_format_NHWC4_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + FLT4 data = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z 
+ Z, X)); + int offset = (X * shape.z + Y) * shape.w + Z * 4; + __global FLT *dst_addr = (__global FLT *)dst_data; + dst_addr += offset; + if ((Z + 1) * 4 <= shape.w) { + ((__global FLT4 *)dst_addr)[0] = data; + } else { + if (shape.w - Z * 4 >= 1) { + dst_addr[0] = data.x; + } + if (shape.w - Z * 4 >= 2) { + dst_addr[1] = data.y; + } + if (shape.w - Z * 4 >= 3) { + dst_addr[2] = data.z; + } + } +} +__kernel void to_format_NC4HW4_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); +} +__kernel void to_format_NC4HW4_to_NC4HW4_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + dst_data[(Y * size.z + Z) * size.x + X] = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)); +} +__kernel void to_format_NHWC4_to_NHWC4_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + dst_data[(Y * size.z + Z) * size.x + X] = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)); +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc index daddfe73a9..726dc0ac36 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc @@ -329,7 +329,8 @@ kernel::LiteKernel *OpenCLConvolutionKernelCreator(const std::vector &outputs, OpParameter
*opParameter, const lite::Context *ctx, const kernel::KernelKey &desc, const lite::Primitive *primitive) { - auto *kernel = new ConvolutionOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); + auto *kernel = + new (std::nothrow) ConvolutionOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); if (kernel == nullptr) { MS_LOG(ERROR) << "Create OpenCL Convolution kernel failed!"; return nullptr; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc index 184dd5cd9e..99607ff43e 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc @@ -145,7 +145,7 @@ kernel::LiteKernel *OpenCLPooling2dKernelCreator(const std::vector &outputs, OpParameter *opParameter, const lite::Context *ctx, const kernel::KernelKey &desc, const lite::Primitive *primitive) { - auto *kernel = new (std::nothrow)PoolingOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); + auto *kernel = new (std::nothrow) PoolingOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); if (kernel == nullptr) { MS_LOG(ERROR) << "Create OpenCL Pooling kernel failed!"; return nullptr; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc index 242a27b2a6..1396cae004 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc @@ -158,7 +158,7 @@ kernel::LiteKernel *OpenCLSoftMaxKernelCreator(const std::vector &outputs, OpParameter *opParameter, const lite::Context *ctx, const kernel::KernelKey &desc, const lite::Primitive *primitive) { - auto *kernel = new (std::nothrow)SoftmaxOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); + auto *kernel = new (std::nothrow) SoftmaxOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); if (kernel == nullptr) { 
MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr."; return nullptr; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc new file mode 100644 index 0000000000..44d1cf8dde --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc @@ -0,0 +1,167 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/kernel/opencl/kernel/to_format.h" +#include +#include +#include +#include +#include "include/errorcode.h" +#include "src/kernel_registry.h" +#include "src/runtime/opencl/opencl_runtime.h" +#include "src/runtime/kernel/opencl/cl/fp32/to_format.cl.inc" + +using mindspore::kernel::KERNEL_ARCH::kGPU; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_ToFormat; + +namespace mindspore::kernel { + +int ToFormatOpenCLKernel::Init() { + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + auto parameter = reinterpret_cast(op_parameter_); + out_mem_type_ = parameter->out_mem_type; + std::string program_name = "to_format"; + std::map format_str{{schema::Format_NCHW, "NCHW"}, + {schema::Format_NHWC, "NHWC"}, + {schema::Format_NC4HW4, "NC4HW4"}, + {schema::Format_NHWC4, "NHWC4"}}; + std::string kernel_name = + "to_format_" + format_str[in_tensors_[0]->GetFormat()] + "_to_" 
+ format_str[out_tensors_[0]->GetFormat()]; + if (out_mem_type_ == OpenCLMemType::IMG) { + kernel_name += "_IMG"; + } else { + kernel_name += "_BUF"; + } + +#ifdef PROGRAM_WITH_IL + ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name); +#else + std::set build_options; +#ifdef ENABLE_FP16 + std::string source = to_format_source_fp16; +#else + std::string source = to_format_source_fp32; +#endif + ocl_runtime->LoadSource(program_name, source); + ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options); +#endif + MS_LOG(DEBUG) << kernel_name << " Init Done!"; + return RET_OK; +} + +int ToFormatOpenCLKernel::ReSize() { return RET_OK; } + +int ToFormatOpenCLKernel::GetGlobalSize(size_t idx, std::vector *global_size) { + std::vector shapex = out_tensors_[0]->shape(); + if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4 || out_tensors_[0]->GetFormat() == schema::Format_NHWC) { + int h = shapex[1]; + int w = shapex[2]; + int c = shapex[3]; + int c4 = UP_DIV(c, C4NUM); + std::vector vec = {(size_t)h, (size_t)w, (size_t)c4}; + *global_size = std::move(vec); + } else if (out_tensors_[0]->GetFormat() == + schema::Format_NC4HW4) { + int h = shapex[2]; + int w = shapex[3]; + int c = shapex[1]; + int c4 = UP_DIV(c, C4NUM); + std::vector vec = {(size_t)c4, (size_t)h, (size_t)w}; + *global_size = std::move(vec); + } else if (out_tensors_[0]->GetFormat() == schema::Format_NCHW) { + int h = shapex[2]; + int w = shapex[3]; + int c = shapex[1]; + int w4 = UP_DIV(w, C4NUM); + std::vector vec = {(size_t)w4, (size_t)h, (size_t)c}; + *global_size = std::move(vec); + } + return RET_OK; +} +int ToFormatOpenCLKernel::GetLocalSize(size_t idx, const std::vector &global_size, + std::vector *local_size) { + return RET_OK; +} + +int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) { + size_t im_dst_x, im_dst_y; + std::vector shapex = out_tensors_[0]->shape(); + if
(out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) { + int h = shapex[1]; + int w = shapex[2]; + int c = shapex[3]; + im_dst_y = UP_DIV(h * c, C4NUM); + im_dst_x = w; + } else if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) { + int h = shapex[2]; + int w = shapex[3]; + int c = shapex[1]; + im_dst_x = UP_DIV(w * c, C4NUM); + im_dst_y = h; + } else { + MS_LOG(ERROR) << "Unsupported format. " << out_tensors_[0]->GetFormat(); + } + img_size->clear(); +#ifdef ENABLE_FP16 + size_t img_dtype = CL_HALF_FLOAT; +#else + size_t img_dtype = CL_FLOAT; +#endif + std::vector vec{im_dst_x, im_dst_y, img_dtype}; + *img_size = vec; + return RET_OK; +} + +int ToFormatOpenCLKernel::Run() { + MS_LOG(DEBUG) << "ToFormat" << " Running!"; + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + std::vector local = {}; + std::vector global; + GetGlobalSize(0, &global); + auto shapex = in_tensors_[0]->shape(); + cl_int4 shape{shapex.size() > 0 ? shapex[0] : 1, shapex.size() > 1 ? shapex[1] : 1, shapex.size() > 2 ? shapex[2] : 1, + shapex.size() > 3 ? 
shapex[3] : 1}; + cl_int4 gsize{(cl_int)global[0], (cl_int)global[1], (cl_int)global[2], 1}; + ocl_runtime->SetKernelArg(kernel_, 0, in_tensors_[0]->Data()); + ocl_runtime->SetKernelArg(kernel_, 1, out_tensors_[0]->Data()); + ocl_runtime->SetKernelArg(kernel_, 2, gsize); + ocl_runtime->SetKernelArg(kernel_, 3, shape); + ocl_runtime->RunKernel(kernel_, global, local, nullptr); + return RET_OK; +} + +kernel::LiteKernel *OpenCLToFormatKernelCreator(const std::vector &inputs, + const std::vector &outputs, + OpParameter *opParameter, const lite::Context *ctx, + const kernel::KernelKey &desc, const lite::Primitive *primitive) { + auto *kernel = new (std::nothrow) ToFormatOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); + if (kernel == nullptr) { + MS_LOG(ERROR) << "kernel " << opParameter->name_ << " create failed."; + return nullptr; + } + auto ret = kernel->Init(); + if (ret != RET_OK) { + delete kernel; + return nullptr; + } + return kernel; +} + +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_ToFormat, OpenCLToFormatKernelCreator) +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h new file mode 100644 index 0000000000..09d31b7454 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h @@ -0,0 +1,46 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_SRC_BACKEND_OPENCL_TO_FORMAT_H_ +#define MINDSPORE_LITE_SRC_BACKEND_OPENCL_TO_FORMAT_H_ + +#include + +#include "src/lite_kernel.h" +#include "src/runtime/opencl/opencl_runtime.h" +#include "src/runtime/kernel/opencl/opencl_kernel.h" + +namespace mindspore::kernel { +class ToFormatOpenCLKernel : public OpenCLKernel { + public: + explicit ToFormatOpenCLKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs) + : OpenCLKernel(parameter, inputs, outputs) {} + ~ToFormatOpenCLKernel() override{}; + + int Init() override; + int ReSize() override; + int Run() override; + int GetImageSize(size_t idx, std::vector *img_size) override; + int GetGlobalSize(size_t idx, std::vector *global_size) override; + int GetLocalSize(size_t idx, const std::vector &global_size, std::vector *local_size) override; + + private: + cl::Kernel kernel_; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_BACKEND_OPENCL_RESHAPE_H_ diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc index b0052e0f5b..c76920c47f 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc @@ -109,7 +109,8 @@ kernel::LiteKernel *OpenCLTransposeKernelCreator(const std::vector &outputs, OpParameter *opParameter, const lite::Context *ctx, const kernel::KernelKey &desc, const lite::Primitive *primitive) { - auto *kernel = new (std::nothrow)TransposeOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); + auto *kernel = + new (std::nothrow) TransposeOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); if (kernel == nullptr) { MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr."; return nullptr; diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h index 10dabfea71..d1facd0bc1 
100644 --- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h +++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h @@ -21,21 +21,37 @@ #include "src/lite_kernel.h" namespace mindspore::kernel { + +enum class OpenCLMemType { BUF, IMG }; + +struct OpenCLToFormatParameter { + OpParameter op_parameter; + schema::Format src_format{schema::Format_NHWC}; + schema::Format dst_format{schema::Format_NHWC4}; + OpenCLMemType out_mem_type{OpenCLMemType::IMG}; +}; + class OpenCLKernel : public LiteKernel { public: explicit OpenCLKernel(OpParameter *parameter, const std::vector &inputs, const std::vector &outputs) - : LiteKernel(parameter, inputs, outputs, nullptr, nullptr) {} + : LiteKernel(parameter, inputs, outputs, nullptr, nullptr) {} virtual int Init() { return -1; } virtual int Prepare() { return -1; } virtual int InferShape() { return -1; } virtual int ReSize() { return -1; } virtual int Run() { return -1; } - virtual int GetImageSize(size_t idx, std::vector* img_size) { return -1; } - virtual int GetGlobalSize(size_t idx, std::vector* global_size) { return -1; } - virtual int GetLocalSize(size_t idx, const std::vector& global_size, - std::vector* local_size) { return -1; } + virtual int GetImageSize(size_t idx, std::vector *img_size) { return -1; } + virtual int GetGlobalSize(size_t idx, std::vector *global_size) { return -1; } + virtual int GetLocalSize(size_t idx, const std::vector &global_size, std::vector *local_size) { + return -1; + } + OpenCLMemType GetMemType() { return out_mem_type_; } + void SetMemType(OpenCLMemType mem_type) { out_mem_type_ = mem_type; } + + protected: + OpenCLMemType out_mem_type_{OpenCLMemType::IMG}; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc index 7c495cf68f..05514815d7 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc +++ 
b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc @@ -17,22 +17,117 @@ #include "src/runtime/kernel/opencl/subgraph_opencl_kernel.h" #include "src/runtime/opencl/opencl_executor.h" #include "src/runtime/opencl/opencl_runtime.h" +#include "src/runtime/kernel/opencl/utils.h" +#include "include/errorcode.h" +#include "src/common/utils.h" namespace mindspore::kernel { SubGraphOpenCLKernel::~SubGraphOpenCLKernel() { UnInit(); } +int SubGraphOpenCLKernel::GenToFormatOp(const std::vector &in_tensors, + const std::vector in_kernels, + std::vector *out_tensors, + std::vector *out_parameters, + std::vector *out_convert_ops, OpenCLMemType mem_type) { + out_tensors->clear(); + out_parameters->clear(); + out_convert_ops->clear(); + for (size_t i = 0; i < in_tensors.size(); ++i) { + lite::tensor::Tensor *new_tensor = new (std::nothrow) lite::tensor::Tensor(); + MS_ASSERT(new_tensor); + if (new_tensor == nullptr) { + MS_LOG(ERROR) << "SubGraphOpenCLKernel new tensor failed!"; + return RET_ERROR; + } + new_tensor->CopyTensor(*in_tensors[i]); + auto dst_format = + (mem_type == OpenCLMemType::IMG) ? in_kernels.back()->out_tensors()[0]->GetFormat() : in_tensors[i]->GetFormat(); + auto src_format = + (mem_type == OpenCLMemType::IMG) ? 
in_tensors[i]->GetFormat() : in_kernels.front()->out_tensors()[0]->GetFormat(); + if ((dst_format == schema::Format_NCHW || dst_format == schema::Format_NC4HW4) && + (src_format == schema::Format_NHWC || src_format == schema::Format_NHWC4)) { + auto &shape = new_tensor->shape(); + std::vector dst_shape{shape[0], shape[3], shape[1], shape[2]}; + new_tensor->set_shape(dst_shape); + } + if ((dst_format == schema::Format_NHWC || dst_format == schema::Format_NHWC4) && + (src_format == schema::Format_NCHW || src_format == schema::Format_NC4HW4)) { + auto &shape = new_tensor->shape(); + std::vector dst_shape{shape[0], shape[2], shape[3], shape[1]}; + new_tensor->set_shape(dst_shape); + } + new_tensor->SetFormat(dst_format); + out_tensors->emplace_back(new_tensor); +#ifdef ENABLE_FP16 + KernelKey desc{kGPU, kNumberTypeFloat16, schema::PrimitiveType_ToFormat}; +#else + KernelKey desc{kGPU, kNumberTypeFloat32, schema::PrimitiveType_ToFormat}; +#endif + OpenCLToFormatParameter *parameter = new (std::nothrow) OpenCLToFormatParameter; + MS_ASSERT(parameter); + if (parameter == nullptr) { + MS_LOG(ERROR) << "SubGraphOpenCLKernel new parameter failed!"; + return RET_ERROR; + } + parameter->src_format = src_format; + parameter->dst_format = dst_format; + parameter->out_mem_type = mem_type; + out_parameters->emplace_back(parameter); + LiteKernel *in_convert_op; + if (mem_type == OpenCLMemType::IMG) { + in_convert_op = + lite::GetOpenCLKernel({in_tensors[i]}, {new_tensor}, reinterpret_cast(parameter), nullptr, desc); + } else { + in_convert_op = + lite::GetOpenCLKernel({new_tensor}, {in_tensors[i]}, reinterpret_cast(parameter), nullptr, desc); + } + MS_ASSERT(in_convert_op); + if (in_convert_op == nullptr) { + MS_LOG(ERROR) << "SubGraphOpenCLKernel create op failed!"; + return RET_ERROR; + } + auto in_opencl_op = reinterpret_cast(in_convert_op); + if (mem_type == OpenCLMemType::IMG) { + in_opencl_op->AddOutKernel(in_kernels[i]); +
reinterpret_cast(in_kernels[i])->SetInKernel({in_convert_op}); + reinterpret_cast(in_kernels[i])->set_in_tensors({new_tensor}); + } else { + reinterpret_cast(in_kernels[i])->SetOutKernel({in_convert_op}); + reinterpret_cast(in_kernels[i])->set_out_tensors({new_tensor}); + in_convert_op->AddInKernel(in_kernels[i]); + } + out_convert_ops->emplace_back(in_convert_op); + } + return RET_OK; +} + int SubGraphOpenCLKernel::Init() { allocator_ = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator(); - MS_LOG(DEBUG) << "input num=" << inputs_.size() << ", output num=" << outputs_.size(); - for (const auto tensor : inputs_) { + MS_LOG(DEBUG) << "input num=" << in_tensors_.size() << ", output num=" << out_tensors_.size(); + for (const auto tensor : in_tensors_) { tensor->set_allocator(allocator_); } - for (const auto tensor : outputs_) { + for (const auto tensor : out_tensors_) { tensor->set_allocator(allocator_); } + int ret = GenToFormatOp(in_tensors_, in_kernels_, &in_convert_tensors_, &in_parameters_, &in_convert_ops_, + OpenCLMemType::IMG); + if (ret != RET_OK) { + return RET_ERROR; + } + nodes_.insert(nodes_.begin(), in_convert_ops_.begin(), in_convert_ops_.end()); + ret = GenToFormatOp(out_tensors_, out_kernels_, &out_convert_tensors_, &out_parameters_, &out_convert_ops_, + OpenCLMemType::BUF); + if (ret != RET_OK) { + return RET_ERROR; + } + nodes_.insert(nodes_.end(), out_convert_ops_.begin(), out_convert_ops_.end()); + + MallocTensorWithReuse(); + // Map buffer for write, it is not necessary for fine-grained - for (auto &tensor : inputs_) { + for (auto &tensor : in_tensors_) { void *data = tensor->Data(); // It is required with coarse-grained SVM if (data != nullptr) { @@ -42,44 +137,118 @@ int SubGraphOpenCLKernel::Init() { MS_LOG(ERROR) << "SubGraphOpenCLKernel input nullptr!"; } } - return 0; + return RET_OK; +} +int SubGraphOpenCLKernel::MallocTensorWithReuse() { + kernel::LiteKernelUtil::InitTensorRefCount(nodes_); + for (auto *kernel : nodes_) { + 
MS_ASSERT(nullptr != kernel); + kernel::OpenCLKernel *op_kernel = reinterpret_cast(kernel); + auto &outputs = kernel->out_tensors(); + for (auto i = 0; i < outputs.size(); ++i) { + auto *output = outputs.at(i); + MS_ASSERT(nullptr != output); + if (op_kernel->GetMemType() == OpenCLMemType::IMG) { + std::vector img_size; + op_kernel->GetImageSize(i, &img_size); + auto data_ptr = allocator_->Malloc(output->Size(), img_size); + output->SetData(data_ptr); + } else { + output->MallocData(allocator_); + } + output->set_allocator(allocator_); + } + for (auto input_kernel : kernel->in_kernels()) { + MS_EXCEPTION_IF_NULL(input_kernel); + auto ret = input_kernel->DecOutTensorRefCount(); + if (0 != ret) { + MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed"; + } + } + } + for (auto kernel : out_kernels_) { + MS_EXCEPTION_IF_NULL(kernel); + auto ret = kernel->DecOutTensorRefCount(); + if (0 != ret) { + MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed"; + } + } + for (auto kernel : in_convert_ops_) { + MS_EXCEPTION_IF_NULL(kernel); + auto ret = kernel->DecOutTensorRefCount(); + if (0 != ret) { + MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed"; + } + } + for (auto kernel : out_convert_ops_) { + MS_EXCEPTION_IF_NULL(kernel); + auto ret = kernel->DecOutTensorRefCount(); + if (0 != ret) { + MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed"; + } + } + return RET_OK; } int SubGraphOpenCLKernel::UnInit() { - for (auto &tensor : outputs_) { + for (auto &tensor : out_tensors_) { + allocator_->UnmapBuffer(tensor->Data()); + } + for (const auto tensor : in_tensors_) { + if (tensor != nullptr) { + tensor->FreeData(); + } + } + for (const auto tensor : out_tensors_) { + if (tensor != nullptr) { + tensor->FreeData(); + } + } + for (auto &tensor : out_tensors_) { allocator_->UnmapBuffer(tensor->Data()); } - for (const auto tensor : inputs_) { + for (const 
auto tensor : in_convert_tensors_) { if (tensor != nullptr) { tensor->FreeData(); + delete tensor; } } - for (const auto tensor : outputs_) { + for (const auto tensor : out_convert_tensors_) { if (tensor != nullptr) { tensor->FreeData(); + delete tensor; } } - return 0; + for (const auto parameter : in_parameters_) { + if (parameter != nullptr) { + delete parameter; + } + } + for (const auto op : in_convert_ops_) { + if (op != nullptr) { + delete op; + } + } + return RET_OK; } -int SubGraphOpenCLKernel::InferShape() { return 0; } +int SubGraphOpenCLKernel::InferShape() { return RET_OK; } -int SubGraphOpenCLKernel::ReSize() { return 0; } +int SubGraphOpenCLKernel::ReSize() { return RET_OK; } int SubGraphOpenCLKernel::Run() { - auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); - for (auto &tensor : inputs_) { + for (auto &tensor : in_tensors_) { allocator_->UnmapBuffer(tensor->Data()); } lite::opencl::OpenCLExecutor executor; - executor.Run(inputs_, outputs_, nodes_, allocator_); - ocl_runtime->SyncCommandQueue(); - for (auto &tensor : outputs_) { + executor.Run(in_tensors_, out_tensors_, nodes_, allocator_); + + for (auto &tensor : out_tensors_) { void *data = allocator_->MapBuffer(tensor->Data(), CL_MAP_READ, nullptr, true); tensor->SetData(data); } - return 0; + return RET_OK; } } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h index d965c2e7bd..cd926f0ab9 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h +++ b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h @@ -36,7 +36,7 @@ class SubGraphOpenCLKernel : public SubGraphKernel { const std::vector inKernels, const std::vector outKernels, const std::vector nodes) - : SubGraphKernel(inputs, outputs, inKernels, outKernels, nodes, nullptr, nullptr) {} + : SubGraphKernel(inputs, outputs, inKernels, outKernels, nodes, nullptr, nullptr) 
{} ~SubGraphOpenCLKernel() override; int Init() override; @@ -45,11 +45,24 @@ class SubGraphOpenCLKernel : public SubGraphKernel { int Run() override; int UnInit(); + protected: + int MallocTensorWithReuse(); + int GenToFormatOp(const std::vector &in_tensors, + const std::vector in_kernels, + std::vector *out_tensors, + std::vector *out_parameters, std::vector *out_convert_ops, + OpenCLMemType mem_type); + private: SubGraphOpenCLParameter *subgraph_ocl_parameter_; lite::opencl::OpenCLAllocator *allocator_; + std::vector in_convert_tensors_; + std::vector out_convert_tensors_; + std::vector in_parameters_; + std::vector out_parameters_; + std::vector in_convert_ops_; + std::vector out_convert_ops_; }; } // namespace mindspore::kernel #endif // MINDSPORE_LITE_SRC_BACKEND_OPENCL_SUBGRAPH_OPENCL_KERNEL_H_ - diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.cc b/mindspore/lite/src/runtime/kernel/opencl/utils.cc index aab9795734..41ed6df6cf 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/utils.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/utils.cc @@ -18,6 +18,22 @@ #include #include #include +#include "src/kernel_registry.h" + +using mindspore::lite::KernelRegistrar; + +namespace mindspore::lite { +kernel::LiteKernel *GetOpenCLKernel(const std::vector &in_tensors, + const std::vector &out_tensors, OpParameter *parameter, + const Context *ctx, const kernel::KernelKey &key) { + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + if (creator != nullptr) { + auto kernel = creator(in_tensors, out_tensors, parameter, nullptr, key, nullptr); + return kernel; + } + return nullptr; +} +} // namespace mindspore::lite namespace mindspore { namespace kernel { diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.h b/mindspore/lite/src/runtime/kernel/opencl/utils.h index f34135c29c..d1c0d130f3 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/utils.h +++ b/mindspore/lite/src/runtime/kernel/opencl/utils.h @@ -22,6 +22,13 @@ #include 
"CL/cl2.hpp" #include "utils/log_adapter.h" #include "src/runtime/kernel/arm/nnacl/op_base.h" +#include "src/lite_kernel.h" + +namespace mindspore::lite { +kernel::LiteKernel *GetOpenCLKernel(const std::vector &in_tensors, + const std::vector &out_tensors, OpParameter *parameter, + const Context *ctx, const kernel::KernelKey &key); +} namespace mindspore::kernel { diff --git a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc index ab11ff06ae..8c34e2737f 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc @@ -16,10 +16,10 @@ #include "src/runtime/opencl/opencl_allocator.h" #include -#include "utils/log_adapter.h" #include "src/runtime/opencl/opencl_runtime.h" -#include "include/errorcode.h" #include "src/runtime/kernel/opencl/utils.h" +#include "utils/log_adapter.h" +#include "include/errorcode.h" namespace mindspore::lite::opencl { @@ -61,7 +61,7 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector &img_size) } Lock(); auto iter = free_list_.lower_bound(size); - if (iter != free_list_.end() && (iter->second->size_ >= size) && (iter->second->size_ < (size << shift_factor_))) { + while (iter != free_list_.end() && (iter->second->size_ >= size) && (iter->second->size_ < (size << shift_factor_))) { auto mem_buf = iter->second; bool is_match{mem_buf->img_size.size() == img_size.size()}; for (int i = 0; i < img_size.size() && is_match; ++i) { @@ -75,6 +75,7 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector &img_size) << ", host addr: " << mem_buf->host_ptr_ << ", device addr: " << mem_buf->device_ptr_; return mem_buf->host_ptr_; } + ++iter; } void *host_ptr = nullptr; void *device_ptr = nullptr; @@ -136,7 +137,7 @@ void *OpenCLAllocator::CreateImageFromHost(void *data, size_t size, const std::v auto ocl_runtime = opencl::OpenCLRuntime::GetInstance(); Lock(); auto iter = free_list_.lower_bound(size); - if (iter != 
free_list_.end() && (iter->second->size_ >= size) && (iter->second->size_ < (size << shift_factor_))) { + while (iter != free_list_.end() && (iter->second->size_ >= size) && (iter->second->size_ < (size << shift_factor_))) { auto mem_buf = iter->second; bool is_match{mem_buf->img_size.size() == img_size.size()}; for (int i = 0; i < img_size.size() && is_match; ++i) { @@ -150,6 +151,7 @@ void *OpenCLAllocator::CreateImageFromHost(void *data, size_t size, const std::v << ", host addr: " << mem_buf->host_ptr_ << ", device addr: " << mem_buf->device_ptr_; return mem_buf->host_ptr_; } + ++iter; } void *host_ptr = nullptr; void *device_ptr = nullptr; @@ -198,10 +200,13 @@ void OpenCLAllocator::Free(void *buf) { allocated_list_.erase(iter); free_list_.insert(std::make_pair(mem_buf->size_, mem_buf)); UnLock(); + MS_LOG(DEBUG) << "Free a new Image2D. size: " << mem_buf->size_ << ", host addr: " << mem_buf->host_ptr_ + << ", device addr: " << mem_buf->device_ptr_ << ", image addr: " << mem_buf->image_ptr_; return; } UnLock(); free(buf); + MS_LOG(DEBUG) << "Free host ptr: " << buf; } size_t OpenCLAllocator::GetTotalSize() { diff --git a/mindspore/lite/src/runtime/opencl/opencl_executor.cc b/mindspore/lite/src/runtime/opencl/opencl_executor.cc index 2dc619afe2..38841a067b 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_executor.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_executor.cc @@ -24,41 +24,9 @@ namespace mindspore::lite::opencl { int OpenCLExecutor::Run(std::vector &inputs, std::vector &outputs, std::vector &kernels, Allocator *allocator, const session::KernelCallBack &before, const session::KernelCallBack &after) { - MS_ASSERT(nullptr != allocator); - for (auto &inTensor : inputs) { - if (inTensor == nullptr) { - MS_LOG(ERROR) << "Graph input tensor is nullptr"; - return RET_ERROR; - } - if (inTensor->GetFormat() != schema::Format_NHWC4 && inTensor->GetFormat() != schema::Format_NC4HW4 && - inTensor->GetFormat() != schema::Format_NHWC) { - MS_LOG(ERROR) << 
"input should be NHWC/NHWC4/NC4HW4, actual is " << schema::EnumNameFormat(inTensor->GetFormat()); - return RET_ERROR; - } else { - TransformTensorLayout(inTensor, inTensor->GetFormat(), schema::Format_NHWC4, true); - // TransformTensorLayout(inTensor, inTensor->GetFormat(), schema::Format_NC4HW4, true); - } - } kernel::LiteKernelUtil::InitTensorRefCount(kernels); - OpenCLAllocator* op_allocator = reinterpret_cast(allocator); for (auto *kernel : kernels) { MS_ASSERT(nullptr != kernel); - kernel::OpenCLKernel *op_kernel = reinterpret_cast(kernel); - auto &outputs = kernel->out_tensors(); - for (auto i = 0; i < outputs.size(); ++i) { - auto *output = outputs.at(i); - MS_ASSERT(nullptr != output); - if (is_image2d_out_) { - std::vector img_size; - op_kernel->GetImageSize(i, &img_size); - auto data_ptr = op_allocator->Malloc(output->Size(), img_size); - - output->SetData(data_ptr); - } else { - output->MallocData(allocator); - } - output->set_allocator(allocator); - } session::CallBackParam callbackParam; callbackParam.name_callback_param = kernel->name(); @@ -67,6 +35,21 @@ int OpenCLExecutor::Run(std::vector &inputs, std::vectorname(); } } + kernel::OpenCLKernel *op_kernel = reinterpret_cast(kernel); + auto &cur_outputs = kernel->out_tensors(); + for (auto i = 0; i < cur_outputs.size(); ++i) { + auto *output = cur_outputs.at(i); + MS_ASSERT(nullptr != output); + if (op_kernel->GetMemType() == kernel::OpenCLMemType::IMG) { + std::vector img_size; + op_kernel->GetImageSize(i, &img_size); + auto data_ptr = allocator_->Malloc(output->Size(), img_size); + output->SetData(data_ptr); + } else { + output->MallocData(allocator_); + } + } + auto ret = kernel->Run(); if (0 != ret) { MS_LOG(ERROR) << "run kernel failed, name: " << kernel->name(); @@ -86,21 +69,11 @@ int OpenCLExecutor::Run(std::vector &inputs, std::vectorGetFormat() != schema::Format_NHWC) { - TransformTensorLayout(outTensor, outTensor->GetFormat(), schema::Format_NHWC, false); - } - } return RET_OK; } -int 
OpenCLExecutor::TransformTensorLayout(tensor::Tensor *tensor, schema::Format src_format, - schema::Format dst_format, bool trans_dir) { +int OpenCLExecutor::TransformTensorLayout(tensor::Tensor *tensor, schema::Format src_format, schema::Format dst_format, + bool trans_dir) { MS_ASSERT(nullptr != tensor); MS_ASSERT(4 == tensor->shape().size()); auto data_type = tensor->data_type(); @@ -114,11 +87,10 @@ int OpenCLExecutor::TransformTensorLayout(tensor::Tensor *tensor, schema::Format << schema::EnumNameFormat(dst_format); return RET_ERROR; } - return RET_OK; } int OpenCLExecutor::TransformTensorLayoutFp32(tensor::Tensor *tensor, schema::Format src_format, - schema::Format dst_format, bool trans_dir) { + schema::Format dst_format, bool trans_dir) { MS_ASSERT(nullptr != tensor); MS_ASSERT(nullptr != allocator_); MS_ASSERT(4 == tensor->shape().size()); @@ -138,11 +110,11 @@ int OpenCLExecutor::TransformTensorLayoutFp32(tensor::Tensor *tensor, schema::Fo } int OpenCLExecutor::TransformTensorLayoutToBuffer(tensor::Tensor *tensor, schema::Format src_format, - schema::Format dst_format) { + schema::Format dst_format) { if (dst_format == schema::Format_NHWC4) { auto *src_data = tensor->Data(); size_t C4 = UP_DIV(tensor->Channel(), C4NUM); - std::vector img_size{tensor->Width() * C4, (size_t) tensor->Height(), CL_FLOAT}; + std::vector img_size{tensor->Width() * C4, (size_t)tensor->Height(), CL_FLOAT}; if (src_format == schema::Format_NHWC) { auto *dst_data = allocator_->Malloc(tensor->Size(), img_size); if (dst_data == nullptr) { @@ -168,7 +140,7 @@ int OpenCLExecutor::TransformTensorLayoutToBuffer(tensor::Tensor *tensor, schema } int OpenCLExecutor::TransformTensorLayoutToImage(tensor::Tensor *tensor, schema::Format src_format, - schema::Format dst_format) { + schema::Format dst_format) { if (dst_format == schema::Format_NHWC4) { tensor->SetFormat(schema::Format_NHWC4); // convert to nhwc4 @@ -202,15 +174,15 @@ int OpenCLExecutor::TransformTensorLayoutToImage(tensor::Tensor 
*tensor, schema: } int OpenCLExecutor::TransformTensorLayoutFromImage(tensor::Tensor *tensor, schema::Format src_format, - schema::Format dst_format) { + schema::Format dst_format) { if (dst_format == schema::Format_NHWC) { auto src_data = tensor->Data(); auto dst_data = allocator_->Malloc(tensor->Size()); cl::Image2D *out_mem = reinterpret_cast(allocator_->GetImage(src_data)); std::vector img_size; allocator_->GetImageSize(src_data, &img_size); - auto origin = cl::array < cl::size_type, 3U > {0, 0, 0}; - auto region = cl::array < cl::size_type, 3U > {img_size[0], img_size[1], 1}; + auto origin = cl::array{0, 0, 0}; + auto region = cl::array{img_size[0], img_size[1], 1}; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->GetDefaultCommandQueue()->enqueueReadImage(*out_mem, CL_TRUE, origin, region, 0, 0, dst_data); tensor->SetData(dst_data); @@ -224,7 +196,7 @@ int OpenCLExecutor::TransformTensorLayoutFromImage(tensor::Tensor *tensor, schem } int OpenCLExecutor::TransformTensorLayoutUint8(tensor::Tensor *tensor, schema::Format src_format, - schema::Format dst_format, bool is_image) { + schema::Format dst_format, bool is_image) { MS_ASSERT(nullptr != tensor); MS_ASSERT(4 == tensor->shape().size()); // auto src_format = tensor->GetFormat(); @@ -234,4 +206,3 @@ int OpenCLExecutor::TransformTensorLayoutUint8(tensor::Tensor *tensor, schema::F return RET_ERROR; } } // namespace mindspore::lite::opencl - diff --git a/mindspore/lite/test/CMakeLists.txt b/mindspore/lite/test/CMakeLists.txt index eaae765d67..b6b961868d 100644 --- a/mindspore/lite/test/CMakeLists.txt +++ b/mindspore/lite/test/CMakeLists.txt @@ -148,6 +148,7 @@ if (SUPPORT_GPU) ${LITE_DIR}/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc ${LITE_DIR}/src/runtime/kernel/opencl/kernel/transpose.cc ${LITE_DIR}/src/runtime/kernel/opencl/kernel/reshape.cc + ${LITE_DIR}/src/runtime/kernel/opencl/kernel/to_format.cc ) endif() ### minddata lite @@ -323,6 +324,8 @@ if (SUPPORT_GPU) 
${TEST_DIR}/ut/src/runtime/kernel/opencl/transpose_tests.cc ${TEST_DIR}/ut/src/runtime/kernel/opencl/convolution_tests.cc ${TEST_DIR}/ut/src/runtime/kernel/opencl/activation_tests.cc + #${TEST_DIR}/ut/src/runtime/kernel/opencl/leakyrelu_tests.cc + ${TEST_DIR}/ut/src/runtime/kernel/opencl/to_format_tests.cc ) endif() diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/to_format_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/to_format_tests.cc new file mode 100644 index 0000000000..475183bb7f --- /dev/null +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/to_format_tests.cc @@ -0,0 +1,79 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include "mindspore/core/utils/log_adapter.h" +#include "common/common_test.h" +#include "mindspore/lite/src/common/file_utils.h" +#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" +#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" +#include "mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h" + +namespace mindspore { +class TestToFormatOpenCL : public mindspore::CommonTest { + public: + TestToFormatOpenCL() {} +}; + +TEST_F(TestToFormatOpenCL, TransposeFp32) { + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + ocl_runtime->Init(); + auto allocator = ocl_runtime->GetAllocator(); + int h = 64; + int w = 1; + int c = 7360; + size_t input_size; + std::string input_path = "./test_data/transpose/transpose_fp32_input.bin"; + auto input_data = reinterpret_cast(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); + + lite::tensor::Tensor *tensor_x = + new lite::tensor::Tensor(TypeId(kNumberTypeFloat32), {1, h, w, c}, schema::Format_NHWC4); + + lite::tensor::Tensor *tensor_out = new lite::tensor::Tensor(TypeId(kNumberTypeFloat32), {1, c, h, w}); + std::vector inputs{tensor_x}; + std::vector outputs{tensor_out}; + auto *arith_kernel = new kernel::TransposeOpenCLKernel(nullptr, inputs, outputs); + arith_kernel->Init(); + + inputs[0]->MallocData(allocator); + + std::vector kernels{arith_kernel}; + auto *pGraph = new kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); + pGraph->Init(); + memcpy(inputs[0]->Data(), input_data, input_size); + pGraph->Run(); + + size_t output_size; + std::string output_path = "./test_data/transpose/transpose_fp32_output.bin"; + auto correct_data = reinterpret_cast(mindspore::lite::ReadFile(output_path.c_str(), &output_size)); + printf("==================output data=================\n"); + float *output_data = reinterpret_cast(tensor_out->Data()); + std::cout << std::endl; + int size_n = h * w * c; + size_n = size_n > 100 ? 
100 : size_n; + for (int i = 0; i < size_n; i++) { + std::cout << output_data[i] << " "; + if ((i + 1) % c == 0) { + std::cout << std::endl; + } + } + std::cout << std::endl; + + // compare + CompareOutputData(output_data, correct_data, h * w * c, 0.00001); + MS_LOG(INFO) << "TestMatMulFp32 passed"; +} +} // namespace mindspore