Fix OpenCL programs getting stuck on some devices

5 years ago · c018938e9a
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
@@ -55,8 +55,7 @@ int ActivationOpenClKernel::Init() {
    c = in_tensors_[0]->shape()[3];
  }
  nhwc_shape_ = {n, h, w, c};
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  enable_fp16_ = ocl_runtime->GetFp16Enable();
  enable_fp16_ = ocl_runtime_->GetFp16Enable();
  fp_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float);
  if (in_size_ != 2 && in_size_ != 4) {
    MS_LOG(ERROR) << "Activate fun only support dim=4 or 2, but your dim=" << in_size_;
@@ -75,9 +74,9 @@ int ActivationOpenClKernel::Init() {

  std::string source = activation_source;
  std::set<std::string> build_options;
  ocl_runtime->LoadSource(Program_Kernel[type_][0], source);
  ocl_runtime_->LoadSource(Program_Kernel[type_][0], source);
  std::string kernel_name = Program_Kernel[type_][1];
  ocl_runtime->BuildKernel(kernel_, Program_Kernel[type_][0], kernel_name, build_options);
  ocl_runtime_->BuildKernel(kernel_, Program_Kernel[type_][0], kernel_name, build_options);
  in_ori_format_ = in_tensors_[0]->GetFormat();
  out_ori_format_ = out_tensors_[0]->GetFormat();
  in_tensors_[0]->SetFormat(op_format_);
@@ -89,17 +88,16 @@ int ActivationOpenClKernel::Init() {
 int ActivationOpenClKernel::Run() {
  MS_LOG(DEBUG) << op_parameter_->name_ << " begin running!";
  cl_int4 img2d_shape = GetImg2dShape();
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  int arg_idx = 0;
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, img2d_shape);
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, img2d_shape);
  if (type_ == ActivationType_LEAKY_RELU) {
    ocl_runtime->SetKernelArg(kernel_, arg_idx++, alpha_);
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, alpha_);
  }
  std::vector<size_t> local = {};
  std::vector<size_t> global = {static_cast<size_t>(img2d_shape.s[1]), static_cast<size_t>(img2d_shape.s[2])};
  auto ret = ocl_runtime->RunKernel(kernel_, global, local, nullptr);
  auto ret = ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Run kernel:" << op_parameter_->name_ << " fail.";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h
@@ -19,7 +19,6 @@

 #include <vector>

 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/opencl_kernel.h"
 #include "nnacl/fp32/activation.h"

--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
@@ -34,7 +34,7 @@ namespace mindspore::kernel {

 ArithmeticOpenCLKernel::~ArithmeticOpenCLKernel() {
  if (weight_ptr_ != nullptr) {
    auto allocator = runtime_->GetAllocator();
    auto allocator = ocl_runtime_->GetAllocator();
    allocator->Free(weight_ptr_);
    weight_ptr_ = nullptr;
  }
@@ -106,7 +106,7 @@ int ArithmeticOpenCLKernel::InitBuffer() {
  const ArithmeticParameter *arithmetic_parameter = reinterpret_cast<const ArithmeticParameter *>(op_parameter_);
  if (!arithmetic_parameter->broadcasting_) {
    if (in_tensors_[1]->category() == lite::Tensor::Category::CONST && in_tensors_[1]->data_c() != nullptr) {
      auto allocator = runtime_->GetAllocator();
      auto allocator = ocl_runtime_->GetAllocator();
      std::vector<size_t> img_size;
      GetImageSize(0, &img_size);
      int pack_weight_size = in_tensors_[1]->ElementsC4Num();
@@ -194,7 +194,6 @@ int ArithmeticOpenCLKernel::InitBuffer() {
 }

 int ArithmeticOpenCLKernel::Init() {
  runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
  std::string kernel_name;

  const ArithmeticParameter *arithmetic_parameter = reinterpret_cast<const ArithmeticParameter *>(op_parameter_);
@@ -265,7 +264,7 @@ int ArithmeticOpenCLKernel::Init() {

  lite::STATUS error_code = RET_OK;
 #ifdef PROGRAM_WITH_IL
  kernel_ = runtime_->GetKernelFromBinary(kernel_name);
  kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
  if (out_mem_type_ == OpenCLMemType::IMG) {
    kernel_name += "_IMG";
@@ -275,8 +274,8 @@ int ArithmeticOpenCLKernel::Init() {
  std::string program_name = "Arithmetic";
  std::set<std::string> build_options;
  std::string source = arithmetic_source;
  runtime_->LoadSource(program_name, source);
  error_code = runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
  ocl_runtime_->LoadSource(program_name, source);
  error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
  if (error_code != RET_OK) {
    return error_code;
@@ -302,10 +301,10 @@ int ArithmeticOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running!";

  int arg_idx = 0;
  runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
  if (element_flag_) {
    void *weight = weight_ptr_ == nullptr ? in_tensors_[1]->data_c() : weight_ptr_;
    runtime_->SetKernelArg(kernel_, arg_idx++, weight);
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, weight);
  } else {
    float weight = 0.f;
    if (in_tensors_[1]->data_type() == kNumberTypeFloat32) {
@@ -316,9 +315,9 @@ int ArithmeticOpenCLKernel::Run() {
      MS_LOG(ERROR) << "Unsupport data type " << in_tensors_[1]->data_type();
      return RET_ERROR;
    }
    runtime_->SetKernelArg(kernel_, arg_idx++, weight);
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, weight);
  }
  runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());

  int H = 0;
  int W = 0;
@@ -336,8 +335,8 @@ int ArithmeticOpenCLKernel::Run() {
    return RET_ERROR;
  }
  cl_int2 output_shape{W, H};
  runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
  runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
  ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
  return RET_OK;
 }

--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h
@@ -19,7 +19,6 @@

 #include <vector>
 #include "src/runtime/kernel/arm/fp32/arithmetic.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/opencl_kernel.h"

 namespace mindspore::kernel {
@@ -42,7 +41,6 @@ class ArithmeticOpenCLKernel : public OpenCLKernel {
  int InitBuffer();

  cl::Kernel kernel_;
  lite::opencl::OpenCLRuntime *runtime_;
  bool element_flag_{true};
  void *weight_ptr_{nullptr};

--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc
@@ -17,7 +17,6 @@
 #include <algorithm>
 #include <set>
 #include "src/kernel_registry.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/kernel/arithmetic_self.h"
 #include "src/runtime/kernel/opencl/utils.h"
 #include "src/runtime/kernel/opencl/cl/arithmeticself.cl.inc"
@@ -51,8 +50,7 @@ int ArithmeticSelfOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *im
    im_dst_x = out_tensors_[0]->Width();
  }
  size_t img_dtype = CL_FLOAT;
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  auto enable_fp16_ = ocl_runtime->GetFp16Enable();
  auto enable_fp16_ = ocl_runtime_->GetFp16Enable();
  if (enable_fp16_) {
    img_dtype = CL_HALF_FLOAT;
  }
@@ -136,9 +134,8 @@ int ArithmeticSelfOpenCLKernel::Init() {
  std::set<std::string> build_options;
  std::string source = arithmeticself_source;
  std::string program_name = "ArithmeticSelf";
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  ocl_runtime->LoadSource(program_name, source);
  ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
  ocl_runtime_->LoadSource(program_name, source);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);

  return RET_OK;
 }
@@ -162,7 +159,6 @@ void ArithmeticSelfGetWorkGroup(const std::vector<size_t> &global, std::vector<s
 int ArithmeticSelfOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running! ";

  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  auto output_shape = out_tensors_[0]->shape();
  cl_int4 output_shape_ = {output_shape[0], output_shape[1], output_shape[2], UP_DIV(output_shape[3], C4NUM)};

@@ -170,17 +166,17 @@ int ArithmeticSelfOpenCLKernel::Run() {
  uint32_t OW = output_shape[2];
  uint32_t OC = UP_DIV(output_shape[3], C4NUM);

  const std::vector<size_t> &max_global = ocl_runtime->GetWorkItemSize();
  const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
  std::vector<size_t> local = {1, 1, 1};  // init local
  std::vector<size_t> global = {OH, OW, OC};
  ArithmeticSelfGetWorkGroup(global, &local, max_global[0]);

  int arg_cn = 0;
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape_);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);

  ocl_runtime->RunKernel(kernel_, global, local, nullptr);
  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);

  return RET_OK;
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.h
@@ -21,7 +21,6 @@
 #include <string>
 #include "ir/anf.h"
 #include "src/runtime/kernel/opencl/opencl_kernel.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "nnacl/arithmetic_self_parameter.h"

 namespace mindspore::kernel {
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc
@@ -18,7 +18,6 @@
 #include <set>
 #include <string>
 #include "src/kernel_registry.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/kernel/batchnorm.h"
 #include "src/runtime/kernel/opencl/utils.h"
 #include "src/runtime/kernel/opencl/cl/batchnorm.cl.inc"
@@ -40,8 +39,7 @@ int BatchNormOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_siz
    im_dst_x = out_tensors_[0]->Width();
  }
  size_t img_dtype = CL_FLOAT;
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  auto enable_fp16_ = ocl_runtime->GetFp16Enable();
  auto enable_fp16_ = ocl_runtime_->GetFp16Enable();
  if (enable_fp16_) {
    img_dtype = CL_HALF_FLOAT;
  }
@@ -72,9 +70,8 @@ int BatchNormOpenCLKernel::Init() {
  std::set<std::string> build_options;
  std::string source = batchnorm_source;
  std::string program_name = "Batch_normalization";
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  ocl_runtime->LoadSource(program_name, source);
  ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
  ocl_runtime_->LoadSource(program_name, source);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);

  return RET_OK;
 }
@@ -98,7 +95,6 @@ void BatchNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t
 int BatchNormOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running! ";
  auto param = reinterpret_cast<BatchNormParameter *>(this->op_parameter_);
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  auto input0_shape = in_tensors_[0]->shape();
  auto output_shape = out_tensors_[0]->shape();
  cl_int4 input_shape_ = {input0_shape[0], input0_shape[1], input0_shape[2], UP_DIV(input0_shape[3], C4NUM)};
@@ -107,20 +103,20 @@ int BatchNormOpenCLKernel::Run() {
  uint32_t OW = output_shape[2];
  uint32_t OC = UP_DIV(output_shape[3], C4NUM);

  const std::vector<size_t> &max_global = ocl_runtime->GetWorkItemSize();
  const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
  std::vector<size_t> local = {1, 1, 1};  // init local
  std::vector<size_t> global = {OH, OW, OC};
  BatchNormGetWorkGroup(global, &local, max_global[0]);
  int arg_cn = 0;
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());   // input tensor
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c());   // scale
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[2]->data_c());   // offest
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[3]->data_c());   // mean
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[4]->data_c());   // variance
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());  // out tensor
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape_);
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, param->epsilon_);
  ocl_runtime->RunKernel(kernel_, global, local, nullptr);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());   // input tensor
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c());   // scale
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[2]->data_c());   // offest
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[3]->data_c());   // mean
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[4]->data_c());   // variance
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());  // out tensor
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->epsilon_);
  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);

  return RET_OK;
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.h
@@ -20,7 +20,6 @@
 #include <vector>
 #include "ir/anf.h"
 #include "src/runtime/kernel/opencl/opencl_kernel.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "nnacl/fp32/batchnorm.h"

 namespace mindspore::kernel {
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.cc
@@ -16,6 +16,7 @@
 * limitations under the License.
 */

 #include "src/runtime/kernel/opencl/kernel/biasadd.h"
 #include <string>
 #include <map>
 #include <set>
@@ -23,7 +24,6 @@

 #include "src/kernel_registry.h"
 #include "include/errorcode.h"
 #include "src/runtime/kernel/opencl/kernel/biasadd.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/cl/biasadd.cl.inc"

@@ -38,7 +38,7 @@ namespace mindspore::kernel {
 void BiasAddOpenCLKernel::InitBuffer() {
  int C = in_tensors_[1]->shape()[0];
  int div_ci = UP_DIV(C, C4NUM);
  auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
  auto allocator = ocl_runtime_->GetAllocator();
  size_t img_dtype = CL_FLOAT;
  if (enable_fp16_) {
    img_dtype = CL_HALF_FLOAT;
@@ -57,8 +57,7 @@ int BiasAddOpenCLKernel::Init() {
  for (int i = 0; i < in_size_; ++i) {
    input_shape_.s[i + 4 - in_size_] = in_tensors_[0]->shape()[i];
  }
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  enable_fp16_ = ocl_runtime->GetFp16Enable();
  enable_fp16_ = ocl_runtime_->GetFp16Enable();
  fp_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float);
  if (in_size_ != 4 && in_size_ != 2) {
    MS_LOG(ERROR) << "BiasAdd only support dim=4 or 2, but your dim=" << in_size_;
@@ -75,8 +74,8 @@ int BiasAddOpenCLKernel::Init() {
  std::string source = biasadd_source;
  std::string program_name = "BiasAdd";
  std::string kernel_name = "BiasAdd";
  ocl_runtime->LoadSource(program_name, source);
  ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
  ocl_runtime_->LoadSource(program_name, source);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);

  in_ori_format_ = in_tensors_[0]->GetFormat();
  out_ori_format_ = out_tensors_[0]->GetFormat();
@@ -89,18 +88,17 @@ int BiasAddOpenCLKernel::Init() {
 int BiasAddOpenCLKernel::Run() {
  cl_int4 global_size = GetGlobalshape();
  MS_LOG(DEBUG) << op_parameter_->name_ << " Running!";
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  int arg_idx = 0;
  std::map<schema::Format, int> data_type{
    {schema::Format::Format_NC4, 1}, {schema::Format::Format_NHWC4, 2}, {schema::Format::Format_NC4HW4, 3}};
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, input_shape_);
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, BiasAdd_);
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, data_type[op_format_]);
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_shape_);
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, BiasAdd_);
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, data_type[op_format_]);
  std::vector<size_t> local = {1, 1};
  std::vector<size_t> global = {static_cast<size_t>(global_size.s[1]), static_cast<size_t>(global_size.s[2])};
  auto ret = ocl_runtime->RunKernel(kernel_, global, local, nullptr);
  auto ret = ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Run kernel " << op_parameter_->name_ << " error.";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.h
@@ -23,7 +23,6 @@
 #include "src/tensor.h"
 #include "src/runtime/kernel/opencl/opencl_kernel.h"
 #include "schema/model_generated.h"
 #include "src/runtime/opencl/opencl_runtime.h"

 namespace mindspore::kernel {

--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
@@ -13,13 +13,13 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "src/runtime/kernel/opencl/kernel/concat.h"
 #include <cstring>
 #include <string>
 #include <algorithm>
 #include <set>
 #include "src/kernel_registry.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/kernel/concat.h"
 #include "src/runtime/kernel/opencl/utils.h"
 #include "src/runtime/kernel/opencl/cl/concat.cl.inc"

@@ -40,8 +40,7 @@ int ConcatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size)
    im_dst_x = out_tensors_[0]->Width();
  }
  size_t img_dtype = CL_FLOAT;
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  auto enable_fp16_ = ocl_runtime->GetFp16Enable();
  auto enable_fp16_ = ocl_runtime_->GetFp16Enable();
  if (enable_fp16_) {
    img_dtype = CL_HALF_FLOAT;
  }
@@ -52,8 +51,7 @@ int ConcatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size)
 }

 int ConcatOpenCLKernel::RunAxis0() {
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  auto allocator_ = ocl_runtime->GetAllocator();
  auto allocator_ = ocl_runtime_->GetAllocator();
  std::vector<size_t> img_size;
  auto dst_data = out_tensors_[0]->data_c();
  auto dst_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
@@ -64,7 +62,7 @@ int ConcatOpenCLKernel::RunAxis0() {
    auto src_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
    auto region = cl::array<cl::size_type, 3U>{img_size[0], img_size[1], 1};
    cl::Image2D *input_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(src_data));
    ocl_runtime->GetDefaultCommandQueue()->enqueueCopyImage(*input_image, *out_image, src_origin, dst_origin, region);
    ocl_runtime_->GetDefaultCommandQueue()->enqueueCopyImage(*input_image, *out_image, src_origin, dst_origin, region);
    dst_origin[1] += region[1];
  }
  return RET_OK;
@@ -112,9 +110,8 @@ int ConcatOpenCLKernel::Init() {
  std::set<std::string> build_options;
  std::string source = concat_source;
  std::string program_name = "Concat";
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  ocl_runtime->LoadSource(program_name, source);
  ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
  ocl_runtime_->LoadSource(program_name, source);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);

  return RET_OK;
 }
@@ -155,7 +152,6 @@ int ConcatOpenCLKernel::Run() {
    return RunAxis0();
  }

  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  auto input1_shape = in_tensors_[0]->shape();
  auto input2_shape = in_tensors_[1]->shape();
  auto output_shape = out_tensors_[0]->shape();
@@ -168,7 +164,7 @@ int ConcatOpenCLKernel::Run() {
  uint32_t OW = output_shape[2];
  uint32_t OC = UP_DIV(output_shape[3], C4NUM);

  const std::vector<size_t> &max_global = ocl_runtime->GetWorkItemSize();
  const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
  std::vector<size_t> local = {1, 1, 1};  // init local
  std::vector<size_t> global = {OH, OW, OC};
  ConcatGetWorkGroup(global, &local, max_global[0]);
@@ -176,48 +172,48 @@ int ConcatOpenCLKernel::Run() {

  int arg_cn = 0;
  if (in_tensors_.size() == 2) {
    ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
    ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c());
    ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
    ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape1_);
    ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape2_);
    ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape_);
    ocl_runtime->SetKernelArg(kernel_, arg_cn++, param->axis_);
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c());
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape1_);
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape2_);
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->axis_);
  } else if (in_tensors_.size() == 3) {
    auto input3_shape = in_tensors_[2]->shape();
    cl_int4 input_shape3_ = {input3_shape[0], input3_shape[1], input3_shape[2], UP_DIV(input3_shape[3], C4NUM)};

    ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
    ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c());
    ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[2]->data_c());
    ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
    ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape1_);
    ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape2_);
    ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape3_);
    ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape_);
    ocl_runtime->SetKernelArg(kernel_, arg_cn++, param->axis_);
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c());
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[2]->data_c());
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape1_);
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape2_);
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape3_);
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->axis_);
  } else if (in_tensors_.size() == 4) {
    auto input3_shape = in_tensors_[2]->shape();
    auto input4_shape = in_tensors_[3]->shape();
    cl_int4 input_shape3_ = {input3_shape[0], input3_shape[1], input3_shape[2], UP_DIV(input3_shape[3], C4NUM)};
    cl_int4 input_shape4_ = {input4_shape[0], input4_shape[1], input4_shape[2], UP_DIV(input4_shape[3], C4NUM)};

    ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
    ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c());
    ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[2]->data_c());
    ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[3]->data_c());
    ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
    ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape1_);
    ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape2_);
    ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape3_);
    ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape4_);
    ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape_);
    ocl_runtime->SetKernelArg(kernel_, arg_cn++, param->axis_);
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c());
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[2]->data_c());
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[3]->data_c());
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape1_);
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape2_);
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape3_);
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape4_);
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->axis_);
  } else {
    MS_LOG(ERROR) << " input sizes must 2 or 3 or 4";
    return RET_ERROR;
  }
  ocl_runtime->RunKernel(kernel_, global, local, nullptr);
  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);

  return RET_OK;
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h
@@ -20,7 +20,6 @@
 #include <vector>
 #include "ir/anf.h"
 #include "src/runtime/kernel/opencl/opencl_kernel.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/arm/base/concat_base.h"

 namespace mindspore::kernel {
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
@@ -14,12 +14,11 @@
 * limitations under the License.
 */

 #include "src/runtime/kernel/opencl/kernel/conv2d_transpose.h"
 #include <string>
 #include <set>
 #include "nnacl/fp32/common_func.h"
 #include "src/kernel_registry.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/kernel/conv2d_transpose.h"
 #ifndef PROGRAM_WITH_IL
 #include "src/runtime/kernel/opencl/cl/conv2d_transpose2x2.cl.inc"
 #endif
@@ -41,16 +40,15 @@ int Conv2dTransposeOpenCLKernel::Init() {
    return RET_ERROR;
  }
  std::string kernel_name = "conv2d_transpose2x2_" + std::string(EnumNameFormat(op_format_));
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  enable_fp16_ = ocl_runtime->GetFp16Enable();
  enable_fp16_ = ocl_runtime_->GetFp16Enable();
 #ifdef PROGRAM_WITH_IL
  kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
  kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
  std::string source = conv2d_transpose2x2_source;
  std::set<std::string> build_options;
  std::string program_name = "conv2d_transpose2x2";
  ocl_runtime->LoadSource(program_name, source);
  ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
  ocl_runtime_->LoadSource(program_name, source);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
  PadWeight();
  in_ori_format_ = in_tensors_[0]->GetFormat();
@@ -71,7 +69,7 @@ void Conv2dTransposeOpenCLKernel::PadWeight() {
  int kw = param->kernel_w_;
  int div_ci = UP_DIV(ci, C4NUM);
  int div_co = UP_DIV(co, C4NUM);
  auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
  auto allocator = ocl_runtime_->GetAllocator();
  auto data_size = enable_fp16_ ? sizeof(int16_t) : sizeof(float);

  // IHWO to OHWI4(I)4(O)(converter format is IHWO)
@@ -188,7 +186,6 @@ int Conv2dTransposeOpenCLKernel::Run() {
  int ow = out_tensors_[0]->shape()[2];
  int h = in_tensors_[0]->shape()[1];
  int w = in_tensors_[0]->shape()[2];
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  // local size should less than MAX_GROUP_SIZE
  std::vector<size_t> local = {16, 1, 16};
  std::vector<size_t> global = {UP_ROUND((size_t)UP_ROUND(oh / 2, 2), local[0]),
@@ -200,16 +197,16 @@ int Conv2dTransposeOpenCLKernel::Run() {
  cl_int4 src_size = {h, w, UP_DIV(ci, C4NUM), 1};
  cl_int4 dst_size = {oh, ow, UP_DIV(co, C4NUM), 1};
  int arg_cnt = 0;
  ocl_runtime->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->data_c());
  ocl_runtime->SetKernelArg(kernel_, arg_cnt++, padWeight_, lite::opencl::MemType::BUF);
  ocl_runtime->SetKernelArg(kernel_, arg_cnt++, bias_);
  ocl_runtime->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->data_c());
  ocl_runtime->SetKernelArg(kernel_, arg_cnt++, kernel_size);
  ocl_runtime->SetKernelArg(kernel_, arg_cnt++, stride);
  ocl_runtime->SetKernelArg(kernel_, arg_cnt++, padding);
  ocl_runtime->SetKernelArg(kernel_, arg_cnt++, src_size);
  ocl_runtime->SetKernelArg(kernel_, arg_cnt++, dst_size);
  ocl_runtime->RunKernel(kernel_, global, local, nullptr);
  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padWeight_, lite::opencl::MemType::BUF);
  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, bias_);
  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, kernel_size);
  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, stride);
  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padding);
  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size);
  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
  return RET_OK;
 }

--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.h
@@ -21,7 +21,6 @@

 #include "src/lite_kernel.h"
 #include "nnacl/conv_parameter.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/opencl_kernel.h"

 namespace mindspore::kernel {
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
@@ -39,12 +39,11 @@ constexpr size_t CO_TILE = C4NUM;

 int ConvolutionOpenCLKernel::Init() {
  static int init_count = 0;
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  auto allocator = ocl_runtime->GetAllocator();
  auto allocator = ocl_runtime_->GetAllocator();
  auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
  std::set<std::string> build_options;
  init_count++;
  use_fp16_ = ocl_runtime->GetFp16Enable();
  use_fp16_ = ocl_runtime_->GetFp16Enable();

  if (op_format_ != Format_NHWC4 && op_format_ != Format_NC4HW4) {
    MS_LOG(ERROR) << "op_format_ " << op_format_ << " not support!";
@@ -76,21 +75,21 @@ int ConvolutionOpenCLKernel::Init() {
    MS_LOG(DEBUG) << "use winograd";
    std::string program_name;
    program_name = "Winograd4x4To36" + std::to_string(init_count);
    ocl_runtime->LoadSource(program_name, CodeGenWinograd4x4To36());
    ocl_runtime->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36", build_options);
    ocl_runtime_->LoadSource(program_name, CodeGenWinograd4x4To36());
    ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36", build_options);

    program_name = "WinogradConvolution" + std::to_string(init_count);
    ocl_runtime->LoadSource(program_name, CodeGenWinogradConvolution());
    ocl_runtime->BuildKernel(kernel_conv_, program_name, "WinogradConvolution", build_options);
    ocl_runtime_->LoadSource(program_name, CodeGenWinogradConvolution());
    ocl_runtime_->BuildKernel(kernel_conv_, program_name, "WinogradConvolution", build_options);

    program_name = "Winograd36To4x4" + std::to_string(init_count);
    ocl_runtime->LoadSource(program_name, CodeGenWinograd36To4x4());
    ocl_runtime->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4", build_options);
    ocl_runtime_->LoadSource(program_name, CodeGenWinograd36To4x4());
    ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4", build_options);
  } else {
    std::string program_name = "convolution" + std::to_string(init_count);
    std::string source = op_format_ == Format_NHWC4 ? CodeGenConvolutionNHWC4() : CodeGenConvolutionNC4HW4();
    ocl_runtime->LoadSource(program_name, source);
    ocl_runtime->BuildKernel(kernel_conv_, program_name, "Convolution", build_options);
    ocl_runtime_->LoadSource(program_name, source);
    ocl_runtime_->BuildKernel(kernel_conv_, program_name, "Convolution", build_options);
  }

  // allocate winograd memory
@@ -167,7 +166,7 @@ int ConvolutionOpenCLKernel::GenerateWinogradWeight() {
 }

 int ConvolutionOpenCLKernel::InitWeight() {
  auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
  auto allocator = ocl_runtime_->GetAllocator();

  // allocate memory
  size_t packed_weight_size;
@@ -205,8 +204,7 @@ int ConvolutionOpenCLKernel::InitWeight() {
 }

 int ConvolutionOpenCLKernel::InitBias() {
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  auto allocator = ocl_runtime->GetAllocator();
  auto allocator = ocl_runtime_->GetAllocator();

  // align bias from C to C4
  auto bias_tensor = in_tensors_[2];
@@ -272,57 +270,56 @@ int ConvolutionOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_s

 int ConvolutionOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running!";
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();

  int arg_cn = 0;
  if (use_winograd_) {
    arg_cn = 0;
    cl_int4 _4x4to36_in_shape = {1, IH_, IW_, CI_SLICES_};
    cl_int4 _4x4to36_out_shape = {1, 36, TILES_XY_, CI_SLICES_};
    ocl_runtime->SetKernelArg(kernel_4x4to36_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
    ocl_runtime->SetKernelArg(kernel_4x4to36_, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG);
    ocl_runtime->SetKernelArg(kernel_4x4to36_, arg_cn++, _4x4to36_in_shape);
    ocl_runtime->SetKernelArg(kernel_4x4to36_, arg_cn++, _4x4to36_out_shape);
    ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
    ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG);
    ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, _4x4to36_in_shape);
    ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, _4x4to36_out_shape);

    arg_cn = 0;
    cl_int4 conv_in_shape = {1, 36, TILES_XY_, CI_SLICES_};
    cl_int4 conv_out_shape = {1, 36, TILES_XY_, CO_SLICES_};
    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG);
    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, conv_in_shape);
    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, conv_out_shape);
    ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG);
    ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
    ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
    ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, conv_in_shape);
    ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, conv_out_shape);

    arg_cn = 0;
    cl_int4 _36to4x4_in_shape = {1, 16, TILES_XY_, CO_SLICES_};
    cl_int4 _36to4x4_out_shape = {1, OH_, OW_, CO_SLICES_};
    ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
    ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
    ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
    ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_in_shape);
    ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_out_shape);
    ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
    ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
    ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
    ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_in_shape);
    ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_out_shape);
  } else {
    arg_cn = 0;
    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
    ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
    ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
    ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
    ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
    if (op_format_ == Format_NC4HW4) {
      cl_int4 input_shape = {1, IH_, IW_, CI_SLICES_};
      cl_int4 output_shape = {1, OH_, OW_, CO_SLICES_};
      ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, input_shape);
      ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, output_shape);
      ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, input_shape);
      ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, output_shape);
    }
  }

  if (use_winograd_) {
    ocl_runtime->RunKernel(kernel_4x4to36_, {size_t(TILES_XY_), 6, size_t(CI_SLICES_)}, {8, 6, 4}, nullptr);
    ocl_runtime->RunKernel(kernel_conv_, {size_t(TILES_XY_ / 2), 36, size_t(CO_SLICES_ / 2)}, {8, 6, 2}, nullptr);
    ocl_runtime->RunKernel(kernel_36to4x4_, {size_t(TILES_XY_), 4, size_t(CO_SLICES_)}, {32, 4, 2}, nullptr);
    ocl_runtime_->RunKernel(kernel_4x4to36_, {size_t(TILES_XY_), 6, size_t(CI_SLICES_)}, {8, 6, 4}, nullptr);
    ocl_runtime_->RunKernel(kernel_conv_, {size_t(TILES_XY_ / 2), 36, size_t(CO_SLICES_ / 2)}, {8, 6, 2}, nullptr);
    ocl_runtime_->RunKernel(kernel_36to4x4_, {size_t(TILES_XY_), 4, size_t(CO_SLICES_)}, {32, 4, 2}, nullptr);
  } else {
    std::vector<size_t> global, local;
    SetGlobalLocalConv(&global, &local);
    ocl_runtime->RunKernel(kernel_conv_, global, local, nullptr);
    ocl_runtime_->RunKernel(kernel_conv_, global, local, nullptr);
  }

  return RET_OK;
@@ -819,10 +816,9 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
 }

 int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std::vector<size_t> *local) {
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  constexpr size_t work_group_size[] = {4, 4, 1};
  auto max_work_item_sizes = ocl_runtime->GetWorkItemSize();
  size_t max_work_group_size = ocl_runtime->GetKernelMaxWorkGroupSize(kernel_conv_(), (*ocl_runtime->Device())());
  auto max_work_item_sizes = ocl_runtime_->GetWorkItemSize();
  size_t max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_conv_(), (*ocl_runtime_->Device())());
  const size_t max_z_size = std::min<size_t>(16, max_work_item_sizes[2]);

  size_t global_h = UP_DIV(OH_, work_group_size[0]) * work_group_size[0];
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
@@ -22,7 +22,6 @@
 #include "src/tensor.h"
 #include "src/runtime/kernel/opencl/opencl_kernel.h"
 #include "schema/model_generated.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "nnacl/conv_parameter.h"

 namespace mindspore::kernel {
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
@@ -21,7 +21,6 @@
 #include <map>
 #include <utility>
 #include "src/kernel_registry.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/utils.h"
 #include "nnacl/fp32/common_func.h"
 #include "nnacl/op_base.h"
@@ -42,7 +41,6 @@ using mindspore::schema::PrimitiveType_DepthwiseConv2D;
 namespace mindspore::kernel {

 int DepthwiseConv2dOpenCLKernel::Init() {
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  std::string kernel_name = "DepthwiseConv2d";
  auto in_format = op_format_;
  in_ori_format_ = in_tensors_[0]->GetFormat();
@@ -69,13 +67,13 @@ int DepthwiseConv2dOpenCLKernel::Init() {
    kernel_name += "_1x1";
  }
 #ifdef PROGRAM_WITH_IL
  kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
  kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
  std::string program_name = "DepthwiseConv2d";
  std::set<std::string> build_options;
  std::string source = depthwise_conv2d_source;
  ocl_runtime->LoadSource(program_name, source);
  ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
  ocl_runtime_->LoadSource(program_name, source);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
  this->InitBuffer();
  MS_LOG(DEBUG) << kernel_name << " Init Done! mem type=" << static_cast<int>(out_mem_type_);
@@ -84,9 +82,8 @@ int DepthwiseConv2dOpenCLKernel::Init() {

 int DepthwiseConv2dOpenCLKernel::InitBuffer() {
  auto parameter = reinterpret_cast<ConvParameter *>(op_parameter_);
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  auto allocator = ocl_runtime->GetAllocator();
  bool is_fp16 = ocl_runtime->GetFp16Enable();
  auto allocator = ocl_runtime_->GetAllocator();
  bool is_fp16 = ocl_runtime_->GetFp16Enable();

  // weight: o, h, w, i; o == group, i == 1
  void *origin_weight = in_tensors_.at(kWeightIndex)->data_c();
@@ -162,7 +159,7 @@ int DepthwiseConv2dOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *i
    im_dst_x = out_tensors_[0]->Width();
  }
  size_t img_dtype = CL_FLOAT;
  if (lite::opencl::OpenCLRuntime::GetInstance()->GetFp16Enable()) {
  if (ocl_runtime_->GetFp16Enable()) {
    img_dtype = CL_HALF_FLOAT;
  }
  img_size->clear();
@@ -189,7 +186,6 @@ int DepthwiseConv2dOpenCLKernel::GetLocalSize(size_t idx, const std::vector<size
 int DepthwiseConv2dOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running!";
  auto parameter = reinterpret_cast<ConvParameter *>(op_parameter_);
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
  size_t CI4 = UP_DIV(in_tensors_[0]->Channel(), C4NUM);
  std::vector<size_t> global = {(size_t)out_tensors_[0]->Width(), (size_t)out_tensors_[0]->Height(), CO4};
@@ -207,19 +203,19 @@ int DepthwiseConv2dOpenCLKernel::Run() {
                      (cl_int)out_tensors_[0]->Batch()};

  int arg_cnt = 0;
  ocl_runtime->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->data_c());
  ocl_runtime->SetKernelArg(kernel_, arg_cnt++, packed_weight_, lite::opencl::MemType::BUF);
  ocl_runtime->SetKernelArg(kernel_, arg_cnt++, bias_data_, lite::opencl::MemType::BUF);
  ocl_runtime->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->data_c());
  ocl_runtime->SetKernelArg(kernel_, arg_cnt++, kernel_size);
  ocl_runtime->SetKernelArg(kernel_, arg_cnt++, stride);
  ocl_runtime->SetKernelArg(kernel_, arg_cnt++, padding);
  ocl_runtime->SetKernelArg(kernel_, arg_cnt++, dilation);
  ocl_runtime->SetKernelArg(kernel_, arg_cnt++, src_size);
  ocl_runtime->SetKernelArg(kernel_, arg_cnt++, dst_size);
  ocl_runtime->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].first);
  ocl_runtime->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].second);
  ocl_runtime->RunKernel(kernel_, global, local, nullptr);
  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, packed_weight_, lite::opencl::MemType::BUF);
  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, bias_data_, lite::opencl::MemType::BUF);
  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, kernel_size);
  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, stride);
  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padding);
  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dilation);
  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size);
  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].first);
  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].second);
  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
  return RET_OK;
 }

--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h
@@ -20,7 +20,6 @@
 #include <vector>
 #include "src/runtime/kernel/opencl/opencl_kernel.h"
 #include "nnacl/conv_parameter.h"
 #include "src/runtime/opencl/opencl_runtime.h"

 namespace mindspore::kernel {

--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc
@@ -19,7 +19,6 @@
 #include <set>
 #include <utility>
 #include "src/kernel_registry.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/kernel/gather.h"
 #include "src/runtime/kernel/opencl/cl/gather.cl.inc"

@@ -49,9 +48,8 @@ int GatherOpenCLKernel::Init() {
  std::set<std::string> build_options;
  std::string source = gather_source;
  std::string program_name = "gather";
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  ocl_runtime->LoadSource(program_name, source);
  ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
  ocl_runtime_->LoadSource(program_name, source);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
  // init indices_data_
  auto indices_tensor = in_tensors_.at(1);
  int indices_num = indices_tensor->ElementsNum();
@@ -104,8 +102,7 @@ int GatherOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size)
    im_dst_x = out_tensors_[0]->Width();
  }
  size_t img_dtype = CL_FLOAT;
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  auto enable_fp16_ = ocl_runtime->GetFp16Enable();
  auto enable_fp16_ = ocl_runtime_->GetFp16Enable();
  if (enable_fp16_) {
    img_dtype = CL_HALF_FLOAT;
  }
@@ -117,7 +114,6 @@ int GatherOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size)
 int GatherOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running! ";
  auto param = reinterpret_cast<GatherParameter *>(this->op_parameter_);
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();

  if (InitBuffer() != RET_OK) {
    return RET_ERROR;
@@ -134,14 +130,14 @@ int GatherOpenCLKernel::Run() {
  std::vector<size_t> local = {1, 1, 1};
  std::vector<size_t> global = {(size_t)out_tensors_[0]->Width(), (size_t)out_tensors_[0]->Height(), CO4};
  int arg_cn = 0;
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, indices_data_, lite::opencl::MemType::BUF);
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, src_size);
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, dst_size);
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, indices_num);
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, param->axis_);
  ocl_runtime->RunKernel(kernel_, global, local, nullptr);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, indices_data_, lite::opencl::MemType::BUF);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, src_size);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, dst_size);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, indices_num);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->axis_);
  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);

  return RET_OK;
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.h
@@ -20,7 +20,6 @@
 #include <vector>
 #include "ir/anf.h"
 #include "src/runtime/kernel/opencl/opencl_kernel.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "nnacl/gather_parameter.h"

 namespace mindspore::kernel {
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
@@ -19,7 +19,6 @@
 #include <map>
 #include "nnacl/fp32/common_func.h"
 #include "src/kernel_registry.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/kernel/matmul.h"
 #ifndef PROGRAM_WITH_IL
 #include "src/runtime/kernel/opencl/cl/matmul.cl.inc"
@@ -35,7 +34,6 @@ namespace mindspore::kernel {
 int MatMulOpenCLKernel::Init() {
  std::string kernel_name = "MatMul";
  kernel_name += "_" + std::string(EnumNameFormat(op_format_));
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  auto param = reinterpret_cast<MatMulParameter *>(op_parameter_);
  transposeA = param->a_transpose_;
  if (transposeA) {
@@ -43,7 +41,7 @@ int MatMulOpenCLKernel::Init() {
    return RET_ERROR;
  }
  transposeB = param->b_transpose_;
  enable_fp16_ = ocl_runtime->GetFp16Enable();
  enable_fp16_ = ocl_runtime_->GetFp16Enable();
  if (in_tensors_[0]->shape().size() != out_tensors_[0]->shape().size() ||
      (in_tensors_[0]->shape().size() != 2 && in_tensors_[0]->shape().size() != 4)) {
    MS_LOG(ERROR) << "matmul only support input shape size=2 or 4.";
@@ -57,13 +55,13 @@ int MatMulOpenCLKernel::Init() {
  std::map<int, std::string> dims2str = {{2, "_2d"}, {4, "_4d"}};
  kernel_name += dims2str[dims];
 #ifdef PROGRAM_WITH_IL
  kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
  kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
  std::set<std::string> build_options;
  std::string source = matmul_source;
  std::string program_name = "MatMul";
  ocl_runtime->LoadSource(program_name, source);
  ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
  ocl_runtime_->LoadSource(program_name, source);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif

  PadWeight();
@@ -79,7 +77,7 @@ int MatMulOpenCLKernel::ReSize() { return RET_OK; }

 void MatMulOpenCLKernel::PadWeight() {
  // ABMCI @ ABCICO = ABMCO
  auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
  auto allocator = ocl_runtime_->GetAllocator();
  int ci = inShape[3];
  int ci4 = UP_DIV(ci, C4NUM);
  int co = outShape[3];
@@ -201,7 +199,6 @@ int MatMulOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size)

 int MatMulOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running!";
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  // local size should less than MAX_GROUP_SIZE
  std::vector<size_t> local = {32, 4, 1};
  std::vector<size_t> global = {UP_DIV(static_cast<size_t>(outShape[3]), C4NUM),
@@ -210,14 +207,14 @@ int MatMulOpenCLKernel::Run() {
  int arg_count = 0;
  cl_int4 in_shape = {inShape[0], inShape[1], inShape[2], inShape[3]};
  cl_int4 out_shape = {outShape[0], outShape[1], outShape[2], outShape[3]};
  ocl_runtime->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c());
  ocl_runtime->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF);
  ocl_runtime->SetKernelArg(kernel_, arg_count++, bias_);
  ocl_runtime->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c());
  ocl_runtime->SetKernelArg(kernel_, arg_count++, in_shape);
  ocl_runtime->SetKernelArg(kernel_, arg_count++, out_shape);
  ocl_runtime->SetKernelArg(kernel_, arg_count++, hasBias_ ? 1 : 0);
  ocl_runtime->RunKernel(kernel_, global, local, nullptr);
  ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF);
  ocl_runtime_->SetKernelArg(kernel_, arg_count++, bias_);
  ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape);
  ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape);
  ocl_runtime_->SetKernelArg(kernel_, arg_count++, hasBias_ ? 1 : 0);
  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
  return RET_OK;
 }

--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h
@@ -21,7 +21,6 @@

 #include "src/runtime/kernel/opencl/opencl_kernel.h"
 #include "nnacl/matmul_parameter.h"
 #include "src/runtime/opencl/opencl_runtime.h"

 namespace mindspore::kernel {

--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
@@ -20,8 +20,6 @@
 #include "include/errorcode.h"
 #include "src/kernel_registry.h"
 #include "src/runtime/kernel/opencl/utils.h"
 #include "src/runtime/opencl/opencl_wrapper.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/image_format.h"
 #ifndef PROGRAM_WITH_IL
 #include "src/runtime/kernel/opencl/cl/avg_pool2d.cl.inc"
@@ -59,10 +57,9 @@ int PoolingOpenCLKernel::Init() {
    MS_LOG(ERROR) << "Init `Pooling2d` kernel failed!";
    return RET_INVALID_OP_NAME;
  }
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  enable_fp16_ = ocl_runtime->GetFp16Enable();
  enable_fp16_ = ocl_runtime_->GetFp16Enable();
 #ifdef PROGRAM_WITH_IL
  kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
  kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
  kernel_name += "_" + std::string(EnumNameFormat(op_format_));
  if (out_mem_type_ == OpenCLMemType::BUF) {
@@ -72,8 +69,8 @@ int PoolingOpenCLKernel::Init() {
    kernel_name += "_IMG";
  }
  std::set<std::string> build_options;
  ocl_runtime->LoadSource(program_name, source);
  ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
  ocl_runtime_->LoadSource(program_name, source);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
  in_ori_format_ = in_tensors_[0]->GetFormat();
  out_ori_format_ = out_tensors_[0]->GetFormat();
@@ -124,7 +121,6 @@ int PoolingOpenCLKernel::ReSize() { return RET_OK; }

 int PoolingOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running!";
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();

  int slices = UP_DIV(out_tensors_[0]->shape()[3], C4NUM);
  cl_int4 input_shape = {in_tensors_[0]->shape()[1], in_tensors_[0]->shape()[2], in_tensors_[0]->shape()[3], slices};
@@ -135,21 +131,21 @@ int PoolingOpenCLKernel::Run() {
  cl_int2 padding = {parameter_->pad_u_, parameter_->pad_l_};

  int arg_idx = 0;
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, input_shape);
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, output_shape);
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, stride);
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, kernel_size);
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, padding);
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_shape);
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, stride);
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, kernel_size);
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, padding);

  std::vector<size_t> local_size;
  std::vector<size_t> global_size = InitGlobalSize();
  int max_work_group_size = ocl_runtime->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime->Device())());
  int max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime_->Device())());
  local_size = GetCommonLocalSize(global_size, max_work_group_size);
  global_size = GetCommonGlobalSize(local_size, global_size);

  ocl_runtime->RunKernel(kernel_, global_size, local_size, nullptr);
  ocl_runtime_->RunKernel(kernel_, global_size, local_size, nullptr);
  return RET_OK;
 }

--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h
@@ -21,7 +21,6 @@

 #include "src/runtime/kernel/opencl/opencl_kernel.h"
 #include "nnacl/fp32/pooling.h"
 #include "src/runtime/opencl/opencl_runtime.h"

 namespace mindspore::kernel {

--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc
@@ -24,7 +24,6 @@
 #include "include/errorcode.h"
 #include "nnacl/fp32/common_func.h"
 #include "src/runtime/kernel/opencl/kernel/prelu.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/cl/prelu.cl.inc"

 using mindspore::kernel::KERNEL_ARCH::kGPU;
@@ -36,7 +35,7 @@ using mindspore::schema::PrimitiveType_PReLU;
 namespace mindspore::kernel {

 void PReluOpenCLKernel::InitBuffer() {
  auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
  auto allocator = ocl_runtime_->GetAllocator();
  int elem_num = in_tensors_[0]->shape().size() == 2 ? in_tensors_[0]->shape()[1] : in_tensors_[0]->shape()[3];
  int elem_num_c4 = UP_DIV(elem_num, C4NUM);
  size_t img_dtype = CL_FLOAT;
@@ -91,12 +90,11 @@ int PReluOpenCLKernel::Init() {
  std::string source = prelu_source;
  std::string program_name = "PRelu";
  std::string kernel_name = "PRelu";
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  enable_fp16_ = ocl_runtime->GetFp16Enable();
  enable_fp16_ = ocl_runtime_->GetFp16Enable();
  fp_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float);
  InitBuffer();
  ocl_runtime->LoadSource(program_name, source);
  ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
  ocl_runtime_->LoadSource(program_name, source);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
  in_ori_format_ = in_tensors_[0]->GetFormat();
  in_tensors_[0]->SetFormat(op_format_);
  out_ori_format_ = out_tensors_[0]->GetFormat();
@@ -107,18 +105,17 @@ int PReluOpenCLKernel::Init() {

 int PReluOpenCLKernel::Run() {
  MS_LOG(DEBUG) << op_parameter_->name_ << " Running!";
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  std::map<schema::Format, int> data_type{{schema::Format::Format_NHWC4, 1}, {schema::Format::Format_NC4HW4, 2}};
  int arg_idx = 0;
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, input_shape_);
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, PReluWeight_);
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, data_type[op_format_]);
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, reinterpret_cast<int>(in_tensors_[1]->shape()[0]));
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_shape_);
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, PReluWeight_);
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, data_type[op_format_]);
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, reinterpret_cast<int>(in_tensors_[1]->shape()[0]));
  std::vector<size_t> local = {1, 1};
  std::vector<size_t> global = {static_cast<size_t>(global_shape_.s[1]), static_cast<size_t>(global_shape_.s[2])};
  auto ret = ocl_runtime->RunKernel(kernel_, global, local, nullptr);
  auto ret = ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Run kernel " << op_parameter_->name_ << " error.";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.h
@@ -22,7 +22,6 @@
 #include "src/tensor.h"
 #include "src/runtime/kernel/opencl/opencl_kernel.h"
 #include "schema/model_generated.h"
 #include "src/runtime/opencl/opencl_runtime.h"

 namespace mindspore::kernel {

--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc
@@ -19,7 +19,6 @@
 #include <map>
 #include "include/errorcode.h"
 #include "src/kernel_registry.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/kernel/reduce.h"
 #include "src/runtime/kernel/opencl/cl/reduce.cl.inc"

@@ -59,8 +58,7 @@ int ReduceOpenCLKernel::Init() {
  }
  std::string kernel_name = reduce_type2str.at(reduce_param->mode_);
  kernel_name += "_" + std::string(EnumNameFormat(op_format_));
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  enable_fp16_ = ocl_runtime->GetFp16Enable();
  enable_fp16_ = ocl_runtime_->GetFp16Enable();

  if (in_tensors_[0]->shape().back() != out_tensors_[0]->shape().back()) {
    MS_LOG(ERROR) << "Reduce input channel " << in_tensors_[0]->shape().back() << " should equal output channel"
@@ -68,12 +66,12 @@ int ReduceOpenCLKernel::Init() {
    return RET_ERROR;
  }
 #ifdef PROGRAM_WITH_IL
  kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
  kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
  std::set<std::string> build_options;
  std::string source = reduce_source;
  ocl_runtime->LoadSource(kernel_name, source);
  ocl_runtime->BuildKernel(kernel_, kernel_name, kernel_name, build_options);
  ocl_runtime_->LoadSource(kernel_name, source);
  ocl_runtime_->BuildKernel(kernel_, kernel_name, kernel_name, build_options);
 #endif
  in_ori_format_ = in_tensors_[0]->GetFormat();
  out_ori_format_ = out_tensors_[0]->GetFormat();
@@ -130,15 +128,14 @@ int ReduceOpenCLKernel::Run() {
  int w = shapex[2];
  int c = shapex[3];
  int c4 = UP_DIV(c, C4NUM);
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  std::vector<size_t> local = {};
  std::vector<size_t> global = {static_cast<size_t>(c4)};
  cl_int4 size = {h, w, c4, 1};
  int arg_idx = 0;
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, size);
  ocl_runtime->RunKernel(kernel_, global, local, nullptr);
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, size);
  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
  return RET_OK;
 }

--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h
@@ -20,7 +20,6 @@
 #include <vector>

 #include "src/lite_kernel.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/opencl_kernel.h"
 #include "nnacl/reduce_parameter.h"

--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
@@ -18,7 +18,6 @@
 #include <string>
 #include "include/errorcode.h"
 #include "src/kernel_registry.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/kernel/reshape.h"
 #include "src/runtime/kernel/opencl/cl/reshape.cl.inc"

@@ -34,8 +33,7 @@ namespace mindspore::kernel {
 int ReshapeOpenCLKernel::Init() {
  std::string kernel_name = "reshape";
  kernel_name += "_" + std::string(EnumNameFormat(op_format_));
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  enable_fp16_ = ocl_runtime->GetFp16Enable();
  enable_fp16_ = ocl_runtime_->GetFp16Enable();
  if (out_tensors_[0]->shape().size() != 2 && out_tensors_[0]->shape().size() != 4) {
    MS_LOG(ERROR) << "Reshape output size should in 2,4";
    return RET_ERROR;
@@ -46,13 +44,13 @@ int ReshapeOpenCLKernel::Init() {
    return RET_ERROR;
  }
 #ifdef PROGRAM_WITH_IL
  kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
  kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
  std::set<std::string> build_options;
  std::string source = reshape_source;
  std::string program_name = "reshape";
  ocl_runtime->LoadSource(program_name, source);
  ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
  ocl_runtime_->LoadSource(program_name, source);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
  in_ori_format_ = in_tensors_[0]->GetFormat();
  out_ori_format_ = out_tensors_[0]->GetFormat();
@@ -112,17 +110,16 @@ int ReshapeOpenCLKernel::Run() {
    oh = out_tensors_[0]->shape()[1];
    ow = out_tensors_[0]->shape()[2];
  }
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  std::vector<size_t> local = {};
  std::vector<size_t> global = {(size_t)oh, (size_t)ow, (size_t)c4};
  cl_int4 size = {h, w, c4, 1};
  cl_int4 size_out = {oh, ow, c4, 1};
  int arg_idx = 0;
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, size);
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, size_out);
  ocl_runtime->RunKernel(kernel_, global, local, nullptr);
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, size);
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, size_out);
  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
  return RET_OK;
 }

--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.h
@@ -20,7 +20,6 @@
 #include <vector>

 #include "src/lite_kernel.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/opencl_kernel.h"

 namespace mindspore::kernel {
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc
@@ -245,7 +245,6 @@ int ScaleOpenCLKernel::InitBuffer() {
 }

 int ScaleOpenCLKernel::Init() {
  ocl_runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
  std::string kernel_name;

  const ScaleParameter *scale_param = reinterpret_cast<const ScaleParameter *>(op_parameter_);
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h
@@ -19,7 +19,6 @@

 #include <vector>
 #include "nnacl/scale.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/opencl_kernel.h"

 namespace mindspore::kernel {
@@ -42,7 +41,6 @@ class ScaleOpenCLKernel : public OpenCLKernel {
  int InitBuffer();

  cl::Kernel kernel_;
  lite::opencl::OpenCLRuntime *ocl_runtime_;
  bool element_flag_{true};
  void *scale_ptr_{nullptr};
  void *offset_ptr_{nullptr};
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/slice.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/slice.cc
@@ -18,7 +18,6 @@
 #include <algorithm>
 #include <set>
 #include "src/kernel_registry.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/kernel/slice.h"
 #include "src/runtime/kernel/opencl/utils.h"
 #include "src/runtime/kernel/opencl/cl/slice.cl.inc"
@@ -40,8 +39,7 @@ int SliceOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
    im_dst_x = out_tensors_[0]->Width();
  }
  size_t img_dtype = CL_FLOAT;
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  auto enable_fp16_ = ocl_runtime->GetFp16Enable();
  auto enable_fp16_ = ocl_runtime_->GetFp16Enable();
  if (enable_fp16_) {
    img_dtype = CL_HALF_FLOAT;
  }
@@ -71,9 +69,8 @@ int SliceOpenCLKernel::Init() {
  std::set<std::string> build_options;
  std::string source = slice_source;
  std::string program_name = "slice";
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  ocl_runtime->LoadSource(program_name, source);
  ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
  ocl_runtime_->LoadSource(program_name, source);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
  return RET_OK;
 }

@@ -96,7 +93,6 @@ void SlcieGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *l
 int SliceOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running! ";
  auto param = reinterpret_cast<SliceParameter *>(this->op_parameter_);
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  auto input_shape = in_tensors_[0]->shape();
  cl_int4 input_shape_ = {input_shape[0], input_shape[1], input_shape[2], UP_DIV(input_shape[3], C4NUM)};
  cl_int4 size_ = {param->size_[0], param->size_[1], param->size_[2], UP_DIV(param->size_[3], C4NUM)};
@@ -105,18 +101,18 @@ int SliceOpenCLKernel::Run() {
  uint32_t OH = param->size_[1];
  uint32_t OW = param->size_[2];

  const std::vector<size_t> &max_global = ocl_runtime->GetWorkItemSize();
  const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
  std::vector<size_t> local = {1, 1, 1};  // init local
  std::vector<size_t> global = {1, OH, OW};
  SlcieGetWorkGroup(global, &local, max_global[0]);
  int arg_cn = 0;
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());   // input tensor
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());  // out tensor
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape_);
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, size_);
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, begin_);
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, sharedNoUpdiv);
  ocl_runtime->RunKernel(kernel_, global, local, nullptr);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());   // input tensor
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());  // out tensor
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, size_);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, begin_);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, sharedNoUpdiv);
  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);

  return RET_OK;
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/slice.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/slice.h
@@ -20,7 +20,6 @@
 #include <vector>
 #include "ir/anf.h"
 #include "src/runtime/kernel/opencl/opencl_kernel.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "nnacl/fp32/slice.h"

 namespace mindspore::kernel {
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
@@ -19,7 +19,6 @@
 #include <set>
 #include "include/errorcode.h"
 #include "src/kernel_registry.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/utils.h"
 #ifndef PROGRAM_WITH_IL
 #include "src/runtime/kernel/opencl/cl/softmax.cl.inc"
@@ -51,7 +50,7 @@ int SoftmaxOpenCLKernel::InitGlobalSize() {
 int SoftmaxOpenCLKernel::SetWorkGroupSize() {
  // set work group size
  InitGlobalSize();
  int max_work_group_size = runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*runtime_->Device())());
  int max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime_->Device())());
  local_size_ = GetCommonLocalSize(global_size_, max_work_group_size);
  global_size_ = GetCommonGlobalSize(local_size_, global_size_);
  return lite::RET_OK;
@@ -101,8 +100,7 @@ int SoftmaxOpenCLKernel::Init() {
  std::string program_name = "SoftMax";

  std::string source = softmax_source;
  runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
  enable_fp16_ = runtime_->GetFp16Enable();
  enable_fp16_ = ocl_runtime_->GetFp16Enable();
  // framework not set this param yet! just use default.
  if (in_tensors_[0]->shape().size() == 4) {
    // support 4d tensor
@@ -133,8 +131,8 @@ int SoftmaxOpenCLKernel::Init() {
    program_name += "_IMG";
  }
  std::set<std::string> build_options;
  runtime_->LoadSource(program_name, source);
  runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
  ocl_runtime_->LoadSource(program_name, source);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
  in_ori_format_ = in_tensors_[0]->GetFormat();
  out_ori_format_ = out_tensors_[0]->GetFormat();
@@ -158,32 +156,32 @@ int SoftmaxOpenCLKernel::Run() {
    auto mask_ = GetMaskForLastChannel(channel_size);
    cl_float4 mask = {mask_[0], mask_[1], mask_[2], mask_[3]};

    runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
    if (is_image_out_) {
      runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
      ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
    } else {
      runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
      ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
    }
    runtime_->SetKernelArg(kernel_, arg_idx++, mask);
    runtime_->SetKernelArg(kernel_, arg_idx++, slices);
    runtime_->SetKernelArg(kernel_, arg_idx, slices_x32);
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, mask);
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, slices);
    ocl_runtime_->SetKernelArg(kernel_, arg_idx, slices_x32);
    SetWorkGroupSize1x1();
  } else {
    int slices = UP_DIV(out_tensors_[0]->shape()[3], C4NUM);
    cl_int4 input_shape = {in_tensors_[0]->shape()[1], in_tensors_[0]->shape()[2], in_tensors_[0]->shape()[3], slices};

    runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
    if (is_image_out_) {
      runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
      ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
    } else {
      runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
      ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
    }
    runtime_->SetKernelArg(kernel_, arg_idx, input_shape);
    ocl_runtime_->SetKernelArg(kernel_, arg_idx, input_shape);
    SetWorkGroupSize();
  }

  // run opengl kernel
  runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
  ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
  return lite::RET_OK;
 }

--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h
@@ -21,7 +21,6 @@

 #include "src/runtime/kernel/opencl/opencl_kernel.h"
 #include "nnacl/fp32/softmax.h"
 #include "src/runtime/opencl/opencl_runtime.h"

 namespace mindspore::kernel {

@@ -46,7 +45,6 @@ class SoftmaxOpenCLKernel : public OpenCLKernel {
 private:
  cl::Kernel kernel_;
  SoftmaxParameter *parameter_;
  lite::opencl::OpenCLRuntime *runtime_;

  bool onexone_flag_{false};
  std::vector<size_t> local_size_;
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
@@ -21,7 +21,6 @@
 #include <utility>
 #include "include/errorcode.h"
 #include "src/kernel_registry.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/cl/to_format.cl.inc"

 using mindspore::kernel::KERNEL_ARCH::kGPU;
@@ -33,7 +32,6 @@ using mindspore::schema::PrimitiveType_ToFormat;
 namespace mindspore::kernel {

 int ToFormatOpenCLKernel::Init() {
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  auto parameter = reinterpret_cast<OpenCLToFormatParameter *>(op_parameter_);
  out_mem_type_ = parameter->out_mem_type;
  std::string program_name = "to_format";
@@ -53,12 +51,12 @@ int ToFormatOpenCLKernel::Init() {

  this->set_name(kernel_name);
 #ifdef PROGRAM_WITH_IL
  kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
  kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
  std::set<std::string> build_options;
  std::string source = to_format_source;
  ocl_runtime->LoadSource(program_name, source);
  ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
  ocl_runtime_->LoadSource(program_name, source);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
  InitNHWCShape();
  MS_LOG(DEBUG) << kernel_name << " Init Done!";
@@ -147,7 +145,7 @@ int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size
    return RET_ERROR;
  }
  img_size->clear();
  auto enable_fp16_ = lite::opencl::OpenCLRuntime::GetInstance()->GetFp16Enable();
  auto enable_fp16_ = ocl_runtime_->GetFp16Enable();
  size_t img_dtype = CL_FLOAT;
  if (enable_fp16_) {
    img_dtype = CL_HALF_FLOAT;
@@ -158,7 +156,6 @@ int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size
 }
 int ToFormatOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running!";
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  std::vector<size_t> local = {};
  std::vector<size_t> global;
  GetGlobalSize(0, &global);
@@ -167,11 +164,11 @@ int ToFormatOpenCLKernel::Run() {
  cl_int4 gsize{(cl_int)global[0], (cl_int)global[1], (cl_int)global[2], 1};
  auto src_mem_type = (out_mem_type_ == OpenCLMemType::IMG) ? lite::opencl::MemType::BUF : lite::opencl::MemType::IMG;
  auto dst_mem_type = (out_mem_type_ == OpenCLMemType::IMG) ? lite::opencl::MemType::IMG : lite::opencl::MemType::BUF;
  ocl_runtime->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), src_mem_type);
  ocl_runtime->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c(), dst_mem_type);
  ocl_runtime->SetKernelArg(kernel_, 2, gsize);
  ocl_runtime->SetKernelArg(kernel_, 3, shape);
  ocl_runtime->RunKernel(kernel_, global, local, nullptr);
  ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), src_mem_type);
  ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c(), dst_mem_type);
  ocl_runtime_->SetKernelArg(kernel_, 2, gsize);
  ocl_runtime_->SetKernelArg(kernel_, 3, shape);
  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
  return RET_OK;
 }

--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h
@@ -20,7 +20,6 @@
 #include <vector>

 #include "src/lite_kernel.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/opencl_kernel.h"

 namespace mindspore::kernel {
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc
@@ -18,7 +18,6 @@
 #include <string>
 #include "include/errorcode.h"
 #include "src/kernel_registry.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/kernel/transpose.h"
 #ifndef PROGRAM_WITH_IL
 #include "src/runtime/kernel/opencl/cl/transpose.cl.inc"
@@ -34,8 +33,7 @@ namespace mindspore::kernel {

 int TransposeOpenCLKernel::Init() {
  std::string kernel_name = "transpose";
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  enable_fp16_ = ocl_runtime->GetFp16Enable();
  enable_fp16_ = ocl_runtime_->GetFp16Enable();
  auto param = reinterpret_cast<TransposeParameter *>(op_parameter_);
  if (param->num_axes_ == 4 && param->perm_[0] == 0 && param->perm_[1] == 3 && param->perm_[2] == 1 &&
      param->perm_[3] == 2) {
@@ -52,13 +50,13 @@ int TransposeOpenCLKernel::Init() {
    kernel_name += "_IMG";
  }
 #ifdef PROGRAM_WITH_IL
  kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
  kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
  std::set<std::string> build_options;
  std::string source = transpose_source;
  std::string program_name = "transpose";
  ocl_runtime->LoadSource(program_name, source);
  ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
  ocl_runtime_->LoadSource(program_name, source);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
  if ((in_tensors_[0]->shape()[1] * in_tensors_[0]->shape()[2]) % 4 != 0) {
    MS_LOG(ERROR) << "input H * W % 4 != 0 not support!";
@@ -114,24 +112,23 @@ int TransposeOpenCLKernel::Run() {
  int c = shapex[3];
  int c4 = UP_DIV(c, 4);
  int hw4 = UP_DIV(h * w, 4);
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  std::vector<size_t> local = {16, 16};
  std::vector<size_t> global = {UP_ROUND(hw4, local[0]), UP_ROUND(c4, local[1])};

  cl_int2 HW = {h * w, hw4};
  cl_int2 C = {c, c4};
  int arg_idx = 0;
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
  if (out_mem_type_ == OpenCLMemType::BUF) {
    ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
  } else {
    ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
  }
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, HW);
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, C);
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, w);
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, h);
  ocl_runtime->RunKernel(kernel_, global, local, nullptr);
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, HW);
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, C);
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, w);
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, h);
  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
  return RET_OK;
 }

--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h
@@ -21,7 +21,6 @@

 #include "src/lite_kernel.h"
 #include "nnacl/transpose.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/opencl_kernel.h"

 namespace mindspore::kernel {
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
@@ -20,6 +20,7 @@
 #include <vector>
 #include "src/lite_kernel.h"
 #include "include/errorcode.h"
 #include "src/runtime/opencl/opencl_runtime.h"

 namespace mindspore::kernel {

@@ -36,7 +37,16 @@ class OpenCLKernel : public LiteKernel {
 public:
  explicit OpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                        const std::vector<lite::Tensor *> &outputs)
      : LiteKernel(parameter, inputs, outputs, nullptr, nullptr) {}
      : LiteKernel(parameter, inputs, outputs, nullptr, nullptr) {
    ocl_runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
  }

  ~OpenCLKernel() {
    if (ocl_runtime_ != nullptr) {
      lite::opencl::OpenCLRuntime::DeleteInstance();
      ocl_runtime_ = nullptr;
    }
  }

  virtual int Init() { return RET_ERROR; }
  virtual int Prepare() { return RET_ERROR; }
@@ -59,6 +69,7 @@ class OpenCLKernel : public LiteKernel {
  schema::Format in_ori_format_{schema::Format::Format_NHWC};
  schema::Format out_ori_format_{schema::Format::Format_NHWC4};
  schema::Format op_format_{schema::Format::Format_NHWC4};
  lite::opencl::OpenCLRuntime *ocl_runtime_{nullptr};
 };
 }  // namespace mindspore::kernel

--- a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc
@@ -99,7 +99,7 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_te

    out_tensors->emplace_back(new_tensor);
    KernelKey desc{kGPU, kNumberTypeFloat32, schema::PrimitiveType_ToFormat};
    if (mem_type == OpenCLMemType::IMG && lite::opencl::OpenCLRuntime::GetInstance()->GetFp16Enable()) {
    if (mem_type == OpenCLMemType::IMG && ocl_runtime_->GetFp16Enable()) {
      desc.data_type = kNumberTypeFloat16;
      new_tensor->set_data_type(kNumberTypeFloat16);
    }
@@ -160,7 +160,8 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_te
 }

 int SubGraphOpenCLKernel::Init() {
  allocator_ = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
  ocl_runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
  allocator_ = ocl_runtime_->GetAllocator();
  MS_LOG(DEBUG) << "input num=" << in_tensors_.size() << ", output num=" << out_tensors_.size();
  for (const auto tensor : in_tensors_) {
    tensor->set_allocator(allocator_);
@@ -195,8 +196,7 @@ int SubGraphOpenCLKernel::Init() {
 }

 int SubGraphOpenCLKernel::UpdateTensorDataType() {
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  bool is_fp16 = ocl_runtime->GetFp16Enable();
  bool is_fp16 = ocl_runtime_->GetFp16Enable();
  if (is_fp16 && (in_tensors_[0]->data_type() == kNumberTypeFloat32)) {
    std::set<lite::Tensor *> out_set;
    out_set.insert(in_tensors_.begin(), in_tensors_.end());
@@ -292,16 +292,25 @@ int SubGraphOpenCLKernel::UnInit() {
      delete tensor;
    }
  }
  in_convert_tensors_.clear();
  for (const auto &tensor : out_convert_tensors_) {
    if (tensor != nullptr) {
      delete tensor;
    }
  }
  for (const auto &op : in_convert_ops_) {
  out_convert_tensors_.clear();
  for (const auto &op : nodes_) {
    if (op != nullptr) {
      delete op;
    }
  }
  nodes_.clear();
  in_convert_ops_.clear();
  out_convert_ops_.clear();
  if (ocl_runtime_ != nullptr) {
    lite::opencl::OpenCLRuntime::DeleteInstance();
    ocl_runtime_ = nullptr;
  }
  return RET_OK;
 }

@@ -310,14 +319,13 @@ int SubGraphOpenCLKernel::InferShape() { return RET_OK; }
 int SubGraphOpenCLKernel::ReSize() { return RET_OK; }

 int SubGraphOpenCLKernel::Run() {
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  for (auto &tensor : in_tensors_) {
    allocator_->UnmapBuffer(tensor->data_c());
  }

  lite::opencl::OpenCLExecutor executor;
  executor.Run(in_tensors_, out_tensors_, nodes_, allocator_);
  ocl_runtime->SyncCommandQueue();
  ocl_runtime_->SyncCommandQueue();

  return RET_OK;
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h
@@ -64,6 +64,7 @@ class SubGraphOpenCLKernel : public SubGraphKernel {
  std::vector<OpenCLToFormatParameter *> out_parameters_;
  std::vector<LiteKernel *> in_convert_ops_;
  std::vector<LiteKernel *> out_convert_ops_;
  lite::opencl::OpenCLRuntime *ocl_runtime_{nullptr};
 };
 }  // namespace mindspore::kernel

--- a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc
+++ b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc
@@ -23,8 +23,6 @@

 namespace mindspore::lite::opencl {

 OpenCLAllocator::OpenCLAllocator() {}

 OpenCLAllocator::OpenCLAllocator(OpenCLRuntime *ocl_runtime) : ocl_runtime_(ocl_runtime) {}

 OpenCLAllocator::~OpenCLAllocator() { Clear(); }
@@ -49,9 +47,6 @@ void OpenCLAllocator::UnLock() {
 void *OpenCLAllocator::Malloc(size_t size) { return Malloc(size, std::vector<size_t>{}); }

 void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t> &img_size) {
  if (ocl_runtime_ == nullptr) {
    ocl_runtime_ = opencl::OpenCLRuntime::GetInstance();
  }
  auto svm_capabilities = ocl_runtime_->GetSVMCapabilities();

  size_t img_pitch = 0;
@@ -144,9 +139,6 @@ void *OpenCLAllocator::CreateImageFromHost(void *data, size_t size, const std::v
    MS_LOG(ERROR) << "MallocData out of max_size, size: " << size;
    return nullptr;
  }
  if (ocl_runtime_ == nullptr) {
    ocl_runtime_ = opencl::OpenCLRuntime::GetInstance();
  }
  Lock();
  auto iter = free_list_.lower_bound(size);
  while (iter != free_list_.end() && (iter->second->size_ >= size) && (iter->second->size_ < (size << shift_factor_))) {
@@ -258,9 +250,6 @@ void *OpenCLAllocator::GetBuffer(void *buffer) {

 void OpenCLAllocator::Clear() {
  Lock();
  if (ocl_runtime_ == nullptr) {
    ocl_runtime_ = opencl::OpenCLRuntime::GetInstance();
  }
  auto svm_capabilities = ocl_runtime_->GetSVMCapabilities();
  for (auto it = allocated_list_.begin(); it != allocated_list_.end(); it++) {
    if (svm_capabilities) {
@@ -306,9 +295,6 @@ void OpenCLAllocator::Clear() {
 }

 void *OpenCLAllocator::MapBuffer(void *host_ptr, int flags, void *command_queue, bool sync) {
  if (ocl_runtime_ == nullptr) {
    ocl_runtime_ = opencl::OpenCLRuntime::GetInstance();
  }
  auto svm_capabilities = ocl_runtime_->GetSVMCapabilities();
  if (svm_capabilities) {
    if (!(svm_capabilities & CL_DEVICE_SVM_FINE_GRAIN_BUFFER)) {
@@ -362,9 +348,6 @@ void *OpenCLAllocator::MapBuffer(void *host_ptr, int flags, void *command_queue,
 }

 int OpenCLAllocator::UnmapBuffer(void *host_ptr, void *command_queue) {
  if (ocl_runtime_ == nullptr) {
    ocl_runtime_ = opencl::OpenCLRuntime::GetInstance();
  }
  auto svm_capabilities = ocl_runtime_->GetSVMCapabilities();
  if (svm_capabilities) {
    if (!(svm_capabilities & CL_DEVICE_SVM_FINE_GRAIN_BUFFER)) {
--- a/mindspore/lite/src/runtime/opencl/opencl_allocator.h
+++ b/mindspore/lite/src/runtime/opencl/opencl_allocator.h
@@ -45,7 +45,6 @@ enum class MemType : char { SVM, BUF, IMG };

 class OpenCLAllocator : public Allocator {
 public:
  OpenCLAllocator();
  explicit OpenCLAllocator(OpenCLRuntime *ocl_runtime);
  ~OpenCLAllocator() override;
  void SetContext(const AllocatorContext &ctx) override;
--- a/mindspore/lite/src/runtime/opencl/opencl_executor.h
+++ b/mindspore/lite/src/runtime/opencl/opencl_executor.h
@@ -27,7 +27,11 @@
 namespace mindspore::lite::opencl {
 class OpenCLExecutor : Executor {
 public:
  OpenCLExecutor() : Executor() { allocator_ = OpenCLRuntime::GetInstance()->GetAllocator(); }
  OpenCLExecutor() : Executor() {
    auto ocl_runtime = OpenCLRuntime::GetInstance();
    allocator_ = ocl_runtime->GetAllocator();
    OpenCLRuntime::DeleteInstance();
  }

  int Prepare(const std::vector<kernel::LiteKernel *> &kernels);

--- a/mindspore/lite/src/scheduler.cc
+++ b/mindspore/lite/src/scheduler.cc
@@ -244,7 +244,7 @@ kernel::LiteKernel *Scheduler::ScheduleNode(const std::vector<Tensor *> &in_tens
  TypeId data_type = GetFirstFp32Fp16OrInt8Type(in_tensors);
  kernel::KernelKey desc{kernel::KERNEL_ARCH::kCPU, data_type, static_cast<schema::PrimitiveType>(primitive->Type())};
 #if SUPPORT_GPU
  if (context_->device_type_ == DT_GPU && lite::opencl::OpenCLRuntime::GetInstance()->IsInitOK()) {
  if (context_->device_type_ == DT_GPU) {
    desc.arch = kernel::KERNEL_ARCH::kGPU;
    auto *kernel = KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, primitive, context_, desc);
    if (kernel != nullptr) {
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/activation_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/activation_tests.cc
@@ -157,7 +157,6 @@ TEST_F(TestActivationOpenCL, ReluFp_dim4) {
  ret = sub_graph->Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Init sub_graph error.";
    delete kernel;
    delete param;
    delete input_tensor;
    delete output_tensor;
@@ -167,7 +166,6 @@ TEST_F(TestActivationOpenCL, ReluFp_dim4) {
  MS_LOG(INFO) << "Run SubGraphOpenCLKernel.";
  ret = sub_graph->Run();
  if (ret != RET_OK) {
    delete kernel;
    delete param;
    delete input_tensor;
    delete output_tensor;
@@ -182,7 +180,6 @@ TEST_F(TestActivationOpenCL, ReluFp_dim4) {
    printf_tensor<float>("ReluFp32--output data--", outputs[0]);
    CompareRes<float>(output_tensor, out_file);
  }
  delete kernel;
  delete param;
  delete input_tensor;
  delete output_tensor;
@@ -271,7 +268,6 @@ TEST_F(TestActivationOpenCL, Relu6Fp_dim4) {
  ret = sub_graph->Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Init sub_graph error.";
    delete kernel;
    delete param;
    delete input_tensor;
    delete output_tensor;
@@ -281,7 +277,6 @@ TEST_F(TestActivationOpenCL, Relu6Fp_dim4) {
  MS_LOG(INFO) << "Run SubGraphOpenCLKernel.";
  ret = sub_graph->Run();
  if (ret != RET_OK) {
    delete kernel;
    delete param;
    delete input_tensor;
    delete output_tensor;
@@ -297,7 +292,6 @@ TEST_F(TestActivationOpenCL, Relu6Fp_dim4) {
    printf_tensor<float>("Relu6:FP32--output data---", outputs[0]);
    CompareRes<float>(output_tensor, out_file);
  }
  delete kernel;
  delete param;
  delete input_tensor;
  delete output_tensor;
@@ -386,7 +380,6 @@ TEST_F(TestActivationOpenCL, SigmoidFp_dim4) {
  ret = sub_graph->Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Init sub_graph error.";
    delete kernel;
    delete param;
    delete input_tensor;
    delete output_tensor;
@@ -396,7 +389,6 @@ TEST_F(TestActivationOpenCL, SigmoidFp_dim4) {
  MS_LOG(INFO) << "Run SubGraphOpenCLKernel.";
  ret = sub_graph->Run();
  if (ret != RET_OK) {
    delete kernel;
    delete param;
    delete input_tensor;
    delete output_tensor;
@@ -412,7 +404,6 @@ TEST_F(TestActivationOpenCL, SigmoidFp_dim4) {
    printf_tensor<float>("Sigmoid:FP32--output data---", outputs[0]);
    CompareRes<float>(output_tensor, out_file);
  }
  delete kernel;
  delete param;
  delete input_tensor;
  delete output_tensor;
@@ -502,7 +493,6 @@ TEST_F(TestActivationOpenCL, LeakyReluFp_dim4) {
  ret = sub_graph->Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Init sub_graph error.";
    delete kernel;
    delete param;
    delete input_tensor;
    delete output_tensor;
@@ -512,7 +502,6 @@ TEST_F(TestActivationOpenCL, LeakyReluFp_dim4) {
  MS_LOG(INFO) << "Run SubGraphOpenCLKernel.";
  ret = sub_graph->Run();
  if (ret != RET_OK) {
    delete kernel;
    delete param;
    delete input_tensor;
    delete output_tensor;
@@ -527,7 +516,6 @@ TEST_F(TestActivationOpenCL, LeakyReluFp_dim4) {
    printf_tensor<float>("Leaky Relu:FP32--output data---", outputs[0]);
    CompareRes<float>(output_tensor, out_file);
  }
  delete kernel;
  delete param;
  delete input_tensor;
  delete output_tensor;
@@ -616,7 +604,6 @@ TEST_F(TestActivationOpenCLTanh, TanhFp_dim4) {
  ret = sub_graph->Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Init sub_graph error.";
    delete kernel;
    delete param;
    delete input_tensor;
    delete output_tensor;
@@ -626,7 +613,6 @@ TEST_F(TestActivationOpenCLTanh, TanhFp_dim4) {
  MS_LOG(INFO) << "Run SubGraphOpenCLKernel.";
  ret = sub_graph->Run();
  if (ret != RET_OK) {
    delete kernel;
    delete param;
    delete input_tensor;
    delete output_tensor;
@@ -642,7 +628,6 @@ TEST_F(TestActivationOpenCLTanh, TanhFp_dim4) {
    printf_tensor<float>("Tanh:FP32--output data---", outputs[0]);
    CompareRes<float>(output_tensor, out_file);
  }
  delete kernel;
  delete param;
  delete input_tensor;
  delete output_tensor;
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_self_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_self_tests.cc
@@ -127,7 +127,6 @@ TEST_F(TestArithmeticSelfOpenCLfp16, ArithmeticSelfOpenCLFp16) {
    delete tensor;
  }
  delete param;
  delete arithmeticself_kernel;
  delete sub_graph;
 }
 }  // namespace mindspore
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc
@@ -203,7 +203,6 @@ static void TestCase(const std::vector<int> &shape_a, const std::vector<int> &sh
  delete[] data_c_ocl;

  delete kernel;
  delete arith_kernel;
  delete param;
  for (auto tensor : inputs) {
    delete tensor;
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/batchnorm_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/batchnorm_tests.cc
@@ -147,7 +147,6 @@ TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) {
    delete tensor;
  }
  delete param;
  delete batchnorm_kernel;
  delete sub_graph;
 }
 TEST_F(TestBatchnormOpenCLfp32, Batchnormfp32input_dim4) {
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/biasadd_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/biasadd_tests.cc
@@ -174,7 +174,6 @@ TEST_F(TestBiasAddOpenCL, BiasAddFp32_dim4) {
    delete weight_tensor;
    delete sub_graph;
    delete param;
    delete biasadd_kernel;
    return;
  }
  MS_LOG(INFO) << "Sub graph begin running!";
@@ -186,7 +185,6 @@ TEST_F(TestBiasAddOpenCL, BiasAddFp32_dim4) {
    delete weight_tensor;
    delete sub_graph;
    delete param;
    delete biasadd_kernel;
    return;
  }

@@ -202,7 +200,6 @@ TEST_F(TestBiasAddOpenCL, BiasAddFp32_dim4) {
  delete output_tensor;
  delete sub_graph;
  delete param;
  delete biasadd_kernel;
  lite::opencl::OpenCLRuntime::DeleteInstance();
 }
 }  // namespace mindspore
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc
@@ -164,7 +164,6 @@ TEST_F(TestConcatOpenCLfp16, ConcatFp16_2input_dim4_axis3) {
    delete tensor;
  }
  delete param;
  delete concat_kernel;
  delete sub_graph;
 }

@@ -284,7 +283,6 @@ TEST_F(TestConcatOpenCLfp32, ConcatFp32_2input_dim4_axis3) {
    delete tensor;
  }
  delete param;
  delete concat_kernel;
  delete sub_graph;
 }
 }  // namespace mindspore
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/gather_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/gather_tests.cc
@@ -78,7 +78,6 @@ void test_main_gather(void *input_data, void *correct_data, const std::vector<in
  std::cout << "==================output data================" << std::endl;
  auto *output_data = reinterpret_cast<T *>(outputs[0]->data_c());
  CommonTest::CompareOutputData<T>(output_data, static_cast<T*>(correct_data), outputs[0]->ElementsNum(), 0.0001);
  delete pkernel;
  delete sub_graph;
 }
 TEST_F(TestGatherOpenCL, Axis1Fp32) {
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/prelu_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/prelu_tests.cc
@@ -167,7 +167,6 @@ TEST_F(TestPReluOpenCL, PReluFp32_dim4) {
    delete output_tensor;
    delete weight_tensor;
    delete param;
    delete prelu_kernel;
    delete sub_graph;
    return;
  }
@@ -179,7 +178,6 @@ TEST_F(TestPReluOpenCL, PReluFp32_dim4) {
    delete output_tensor;
    delete weight_tensor;
    delete param;
    delete prelu_kernel;
    delete sub_graph;
    return;
  }
@@ -195,7 +193,6 @@ TEST_F(TestPReluOpenCL, PReluFp32_dim4) {
  delete output_tensor;
  delete weight_tensor;
  delete param;
  delete prelu_kernel;
  delete sub_graph;
  lite::opencl::OpenCLRuntime::DeleteInstance();
 }
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/scale_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/scale_tests.cc
@@ -223,7 +223,6 @@ static void TestCase(const std::vector<int> &shape_a, const std::vector<int> &sh
  delete[] data_out_ocl;

  delete kernel;
  delete scale_kernel;
  delete param;
  for (auto tensor : inputs) {
    delete tensor;
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/slice_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/slice_tests.cc
@@ -143,7 +143,6 @@ TEST_F(TestSliceOpenCLfp32, Slicefp32input_dim4) {
  for (auto tensor : outputs) {
    delete tensor;
  }
  delete slice_kernel;
  delete sub_graph;
 }
 TEST_F(TestSliceOpenCLfp16, Slicefp16input_dim4) {
@@ -251,7 +250,6 @@ TEST_F(TestSliceOpenCLfp16, Slicefp16input_dim4) {
  for (auto tensor : outputs) {
    delete tensor;
  }
  delete slice_kernel;
  delete sub_graph;
 }
 }  // namespace mindspore