@@ -1,146 +0,0 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define INT2 int2
#define INT4 int4
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
__kernel void slice_NHWC4(__read_only image2d_t input, __write_only image2d_t output, INT4 input_shape, INT4 out_shape,
                          INT4 begin, INT2 sharedNoUpdiv) {
  int X = get_global_id(1);  // H
  int Y = get_global_id(2);  // W
  if (X >= out_shape.y || Y >= out_shape.z) {
    return;
  }
  FLT4 result;
  if (sharedNoUpdiv.x % 4 == 0) {
    for (int i = 0; i < out_shape.w; i++) {
      result = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z) * input_shape.w + (i + begin.w), (X + begin.y)));
      WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + i, (X)), result);
    }
  } else {
    int begin_postion = sharedNoUpdiv.x % 4;
    FLT4 first = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z) * input_shape.w + begin.w, (X + begin.y)));
    if (begin_postion == 1) {
      for (int i = 1; i <= out_shape.w; i++) {
        FLT4 second = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z) * input_shape.w + (begin.w + i), (X + begin.y)));
        result.x = first.y;
        result.y = first.z;
        result.z = first.w;
        result.w = second.x;
        WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + i - 1, (X)), result);
        first.y = second.y;
        first.z = second.z;
        first.w = second.w;
      }
    } else if (begin_postion == 2) {
      for (int i = 1; i <= out_shape.w; i++) {
        FLT4 second = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z) * input_shape.w + (begin.w + i), (X + begin.y)));
        result.x = first.z;
        result.y = first.w;
        result.z = second.x;
        result.w = second.y;
        WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + i - 1, (X)), result);
        first.z = second.z;
        first.w = second.w;
      }
    } else {
      for (int i = 1; i <= out_shape.w; i++) {
        FLT4 second = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z) * input_shape.w + (begin.w + i), (X + begin.y)));
        result.x = first.w;
        result.y = second.x;
        result.z = second.y;
        result.w = second.z;
        WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + i - 1, (X)), result);
        first.w = second.w;
      }
    }
  }
  // judge the line of size
  int size = sharedNoUpdiv.y % 4;
  FLT4 result_fill0;
  if (size == 1) {
    result_fill0.x = result.x;
    result_fill0.y = 0;
    result_fill0.z = 0;
    result_fill0.w = 0;
    WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + out_shape.w - 1, (X)), result_fill0);
  } else if (size == 2) {
    result_fill0.x = result.x;
    result_fill0.y = result.y;
    result_fill0.z = 0;
    result_fill0.w = 0;
    WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + out_shape.w - 1, (X)), result_fill0);
  } else if (size == 3) {
    result_fill0.x = result.x;
    result_fill0.y = result.y;
    result_fill0.z = result.z;
    result_fill0.w = 0;
    WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + out_shape.w - 1, (X)), result_fill0);
  }
}
__kernel void slice_NC4HW4(__read_only image2d_t input, __write_only image2d_t output, INT4 input_shape, INT4 out_shape,
                           INT4 begin, INT2 sharedNoUpdiv) {
  int X = get_global_id(1);  // H
  int Y = get_global_id(2);  // W
  if (X >= out_shape.y || Y >= out_shape.z) {
    return;
  }
  FLT4 result;
  if (sharedNoUpdiv.x % 4 == 0) {
    for (int i = 0; i < out_shape.w; i++) {
      result = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z), (i + begin.w) * input_shape.y + (X + begin.y)));
      WRITE_IMAGE(output, (INT2)((Y), (i * out_shape.y + X)), result);
    }
  } else {
    int begin_postion = sharedNoUpdiv.x % 4;
    FLT4 first = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z), (begin.w) * input_shape.y + (X + begin.y)));
    if (begin_postion == 1) {
      for (int i = 1; i <= out_shape.w; i++) {
        FLT4 second = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z), (i + begin.w) * input_shape.y + (X + begin.y)));
        result.x = first.y;
        result.y = first.z;
        result.z = first.w;
        result.w = second.x;
        WRITE_IMAGE(output, (INT2)((Y), ((i - 1) * out_shape.y + X)), result);
        first.y = second.y;
        first.z = second.z;
        first.w = second.w;
      }
    } else if (begin_postion == 2) {
      for (int i = 1; i <= out_shape.w; i++) {
        FLT4 second = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z), (i + begin.w) * input_shape.y + (X + begin.y)));
        result.x = first.z;
        result.y = first.w;
        result.z = second.x;
        result.w = second.y;
        WRITE_IMAGE(output, (INT2)((Y), ((i - 1) * out_shape.y + X)), result);
        first.z = second.z;
        first.w = second.w;
      }
    } else {
      for (int i = 1; i <= out_shape.w; i++) {
        FLT4 second = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z), (i + begin.w) * input_shape.y + (X + begin.y)));
        result.x = first.w;
        result.y = second.x;
        result.z = second.y;
        result.w = second.z;
        WRITE_IMAGE(output, (INT2)((Y), ((i - 1) * out_shape.y + X)), result);
        first.w = second.w;
      }
    }
  }
  // judge the line of size
  int size = sharedNoUpdiv.y % 4;
  FLT4 result_fill0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
  if (size == 1) {
    result_fill0.x = result.x;
    WRITE_IMAGE(output, (INT2)((Y), ((out_shape.w - 1) * out_shape.y + X)), result_fill0);
  } else if (size == 2) {
    result_fill0.x = result.x;
    result_fill0.y = result.y;
    WRITE_IMAGE(output, (INT2)((Y), ((out_shape.w - 1) * out_shape.y + X)), result_fill0);
  } else if (size == 3) {
    result_fill0.x = result.x;
    result_fill0.y = result.y;
    result_fill0.z = result.z;
    WRITE_IMAGE(output, (INT2)((Y), ((out_shape.w - 1) * out_shape.y + X)), result_fill0);
  }
}
@@ -0,0 +1,59 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
__kernel void strided_slice(__read_only image2d_t input, __write_only image2d_t output, int4 input_shape,
                            int4 output_shape, int2 io_slices, int4 begin, int4 stride, int4 size) {
  int IN = input_shape.x, IH = input_shape.y, IW = input_shape.z, CI = input_shape.w;
  int ON = output_shape.x, OH = output_shape.y, OW = output_shape.z, CO = output_shape.w;
  int CI_SLICES = io_slices.x, CO_SLICES = io_slices.y;
  int on_oh = get_global_id(0);
  int ow = get_global_id(1);
  int co_slice = get_global_id(2);
  int on = on_oh / OH;
  int oh = on_oh % OH;
  if (on >= ON || oh >= OH || ow >= OW || co_slice >= CO_SLICES) {
    return;
  }
  FLT tmp[4];
  for (int i = 0; i < 4; ++i) {
    // output_shape idx -> size idx, because squeeze(output_shape) == squeeze(size)
    // for example:
    // python code: B = A[1, 1:16, 2:16, 3:16]
    // input_shape  = [16, 16, 16, 16]
    // begin        = [ 1,  1,  2,  3]
    // end          = [ 2, 16, 16, 16]
    // stride       = [ 1,  1,  1,  1]
    // size         = [ 1, 15, 14, 13] = ceil((end - begin) / stride)
    // output_shape = [15, 14, 13]
    int idx = ((on * OH + oh) * OW + ow) * CO + co_slice * 4 + i;
    int co_ = idx % size.w;
    idx /= size.w;
    int ow_ = idx % size.z;
    idx /= size.z;
    int oh_ = idx % size.y;
    idx /= size.y;
    int on_ = idx;
    int in = begin.x + stride.x * on_;
    int ih = begin.y + stride.y * oh_;
    int iw = begin.z + stride.z * ow_;
    int ci = begin.w + stride.w * co_;
    FLT4 src = READ_IMAGE(input, smp_none, (int2)(iw * CI_SLICES + ci / 4, in * IH + ih));
    int offset = ci % 4;
    if (offset == 0) {
      tmp[i] = src.x;
    } else if (offset == 1) {
      tmp[i] = src.y;
    } else if (offset == 2) {
      tmp[i] = src.z;
    } else {
      tmp[i] = src.w;
    }
  }
  FLT4 out = (FLT4)(tmp[0], tmp[1], tmp[2], tmp[3]);
  WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, on_oh), out);
}
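Editor's note: the index mapping documented in the kernel comments can be checked host-side. A minimal C++ sketch (the helper name `OutputIdxToInputCoord` is hypothetical, not part of this patch) mirroring the kernel's flat-index decomposition:

    #include <array>

    // Decompose a flat NHWC output index into (n, h, w, c) using `size`, then map
    // into input coordinates via `begin` and `stride` -- the same arithmetic the
    // kernel performs for each of the four channel values it gathers.
    std::array<int, 4> OutputIdxToInputCoord(int idx, const int size[4], const int begin[4], const int stride[4]) {
      int co = idx % size[3];
      idx /= size[3];
      int ow = idx % size[2];
      idx /= size[2];
      int oh = idx % size[1];
      idx /= size[1];
      int on = idx;
      return {begin[0] + stride[0] * on, begin[1] + stride[1] * oh,
              begin[2] + stride[2] * ow, begin[3] + stride[3] * co};
    }

    // For B = A[1, 1:16, 2:16, 3:16]: size = {1, 15, 14, 13}, begin = {1, 1, 2, 3},
    // stride = {1, 1, 1, 1}; flat index 0 maps to input coordinate (1, 1, 2, 3).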
@@ -1,106 +0,0 @@
/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <cstring>
#include <string>
#include <algorithm>
#include <set>
#include "src/kernel_registry.h"
#include "src/runtime/kernel/opencl/kernel/slice.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "src/runtime/kernel/opencl/cl/slice.cl.inc"
using mindspore::kernel::KERNEL_ARCH::kGPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Slice;
namespace mindspore::kernel {
int SliceOpenCLKernel::Init() {
  std::set<std::string> build_options;
  std::string source = slice_source;
  std::string program_name = "slice";
  std::string kernel_name = "slice_NHWC4";
  ocl_runtime_->LoadSource(program_name, source);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
  MS_LOG(DEBUG) << kernel_name << " Init Done!";
  return RET_OK;
}
void SlcieGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
  const int max_divider = 8;
  const int max_x = 4, max_y = 8;
  int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
  int yz = max_size / x;
  int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y);
  int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2)));
  local->clear();
  local->push_back(x);
  local->push_back(y);
  local->push_back(z);
}
int SliceOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running! ";
  auto param = reinterpret_cast<SliceParameter *>(this->op_parameter_);
  auto input_shape = in_tensors_[0]->shape();
  cl_int4 input_shape_ = {input_shape[0], input_shape[1], input_shape[2], UP_DIV(input_shape[3], C4NUM)};
  cl_int4 size_ = {param->size_[0], param->size_[1], param->size_[2], UP_DIV(param->size_[3], C4NUM)};
  cl_int4 begin_ = {param->begin_[0], param->begin_[1], param->begin_[2], param->begin_[3] / 4};
  cl_int2 sharedNoUpdiv = {param->begin_[3], param->size_[3]};
  uint32_t OH = param->size_[1];
  uint32_t OW = param->size_[2];
  const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
  std::vector<size_t> local = {1, 1, 1};  // init local
  std::vector<size_t> global = {1, OH, OW};
  SlcieGetWorkGroup(global, &local, max_global[0]);
  int arg_cn = 0;
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());   // input tensor
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());  // out tensor
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, size_);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, begin_);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, sharedNoUpdiv);
  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
  return RET_OK;
}
kernel::LiteKernel *OpenCLSliceKernelCreator(const std::vector<lite::Tensor *> &inputs,
                                             const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                             const lite::InnerContext *ctx, const kernel::KernelKey &desc,
                                             const mindspore::lite::PrimitiveC *primitive) {
  auto *kernel = new (std::nothrow) SliceOpenCLKernel(opParameter, inputs, outputs);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << " new SliceOpenCLKernel failed ";
    free(opParameter);
    return nullptr;
  }
  auto ret = kernel->Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << " Init kernel failed, name: Slice ";
    delete kernel;
    return nullptr;
  }
  return kernel;
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Slice, OpenCLSliceKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Slice, OpenCLSliceKernelCreator);
}  // namespace mindspore::kernel
@@ -0,0 +1,192 @@
/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <cstring>
#include <cmath>     // std::ceil, used below in InitConstArgs()
#include <deque>
#include <iterator>  // std::back_inserter, used below in the shape check
#include <string>
#include <algorithm>
#include <set>
| #include "src/kernel_registry.h" | |||
| #include "src/runtime/kernel/opencl/kernel/strided_slice.h" | |||
| #include "src/runtime/kernel/opencl/utils.h" | |||
| #include "src/runtime/kernel/opencl/cl/strided_slice.cl.inc" | |||
| #include "nnacl/strided_slice.h" | |||
| using mindspore::kernel::KERNEL_ARCH::kGPU; | |||
| using mindspore::lite::KernelRegistrar; | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_OK; | |||
| using mindspore::schema::PrimitiveType_Slice; | |||
| using mindspore::schema::PrimitiveType_StridedSlice; | |||
| namespace mindspore::kernel { | |||
| int SliceOpenCLKernel::CheckSpecs() { | |||
| const std::string kernel_name = op_parameter_->type_ == PrimitiveType_Slice ? "Slice" : "StridedSlice"; | |||
| if (in_tensors_.size() != 1) { | |||
| MS_LOG(ERROR) << kernel_name + " only supports 1 input Tensor."; | |||
| return RET_ERROR; | |||
| } | |||
| if (out_tensors_.size() != 1) { | |||
| MS_LOG(ERROR) << kernel_name + " only supports 1 output Tensor."; | |||
| return RET_ERROR; | |||
| } | |||
| auto in_ndim = in_tensors_.front()->shape().size(); | |||
| if (in_ndim == 0 || in_ndim > 4) { | |||
| MS_LOG(ERROR) << kernel_name + " only supports 1D-4D input tensor"; | |||
| return RET_ERROR; | |||
| } | |||
| auto out_ndim = out_tensors_.front()->shape().size(); | |||
| if (out_ndim > 4) { | |||
| MS_LOG(ERROR) << kernel_name + " only supports 0D-4D output tensor"; | |||
| return RET_ERROR; | |||
| } | |||
| if (InitConstArgs() != RET_OK) { | |||
| MS_LOG(ERROR) << "call SliceOpenCLKernel::InitConstArgs() failed"; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int SliceOpenCLKernel::Prepare() { | |||
| std::set<std::string> build_options; | |||
| std::string program_name = "strided_slice"; | |||
| ocl_runtime_->LoadSource(program_name, strided_slice_source); | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, "strided_slice", build_options); | |||
| SetConstArgs(); | |||
| SetGlobalLocal(); | |||
| return RET_OK; | |||
| } | |||
| int SliceOpenCLKernel::InitConstArgs() { | |||
| auto input_info = Image2DInfo(in_tensors_.front()); | |||
| auto output_info = Image2DInfo(out_tensors_.front()); | |||
| input_shape_ = {static_cast<cl_int>(input_info.N), static_cast<cl_int>(input_info.H), | |||
| static_cast<cl_int>(input_info.W), static_cast<cl_int>(input_info.C)}; | |||
| output_shape_ = {static_cast<cl_int>(output_info.N), static_cast<cl_int>(output_info.H), | |||
| static_cast<cl_int>(output_info.W), static_cast<cl_int>(output_info.C)}; | |||
| io_slices_ = {static_cast<cl_int>(input_info.Slice), static_cast<cl_int>(output_info.Slice)}; | |||
| if (op_parameter_->type_ == PrimitiveType_Slice) { | |||
| auto param = reinterpret_cast<SliceParameter *>(op_parameter_); | |||
| Broadcast2GpuShape(param->begin_, begin_.s, param->param_length_, 0); | |||
| Broadcast2GpuShape(param->size_, size_.s, param->param_length_, -1); | |||
| for (int i = 0; i < 4; ++i) { | |||
| if (begin_.s[i] < 0) { | |||
| begin_.s[i] += input_shape_.s[i]; | |||
| } | |||
| if (begin_.s[i] < 0 || begin_.s[i] >= input_shape_.s[i]) { | |||
| MS_LOG(ERROR) << "Slice kernel only supports 0<=begin<input_shape but begin[i]=" << begin_.s[i] | |||
| << " input_shape[i]=" << input_shape_.s[i]; | |||
| return RET_ERROR; | |||
| } | |||
| if (size_.s[i] < -1 || size_.s[i] == 0) { | |||
| MS_LOG(ERROR) << "Slice kernel only supports size=-1 or size>0 but size[i]=" << size_.s[i]; | |||
| return RET_ERROR; | |||
| } | |||
| if (size_.s[i] == -1 || begin_.s[i] + size_.s[i] > input_shape_.s[i]) { | |||
| size_.s[i] = input_shape_.s[i] - begin_.s[i]; | |||
| } | |||
| } | |||
| } else { | |||
| auto param = reinterpret_cast<StridedSliceParameter *>(op_parameter_); | |||
| cl_int4 end = input_shape_; | |||
| Broadcast2GpuShape(param->begins_, begin_.s, param->num_axes_, 0); | |||
| Broadcast2GpuShape(param->strides_, stride_.s, param->num_axes_, 1); | |||
| Broadcast2GpuShape(param->ends_, end.s, param->num_axes_); | |||
| for (int i = 0; i < 4; ++i) { | |||
| // begin is negative | |||
| if (begin_.s[i] < 0) { | |||
| begin_.s[i] += input_shape_.s[i]; | |||
| } | |||
| // avoid begin is out of range | |||
| begin_.s[i] = std::clamp(begin_.s[i], 0, input_shape_.s[i] - 1); | |||
| // end is negative | |||
| if (end.s[i] < 0) { | |||
| end.s[i] += input_shape_.s[i]; | |||
| } | |||
| // avoid end is out of range | |||
| end.s[i] = std::clamp(end.s[i], -1, input_shape_.s[i]); | |||
| // check stride begin end | |||
| if (stride_.s[i] > 0) { | |||
| if (begin_.s[i] >= end.s[i]) { | |||
| MS_LOG(ERROR) << "StridedSlice kernel only supports begin_<end when stride>0"; | |||
| return RET_ERROR; | |||
| } | |||
| } else if (stride_.s[i] < 0) { | |||
| if (begin_.s[i] <= end.s[i]) { | |||
| MS_LOG(ERROR) << "StridedSlice kernel only supports begin_>end when stride<0"; | |||
| return RET_ERROR; | |||
| } | |||
| } else { | |||
| MS_LOG(ERROR) << "StridedSlice kernel only supports stride!=0"; | |||
| return RET_ERROR; | |||
| } | |||
| size_.s[i] = std::ceil(static_cast<float>(end.s[i] - begin_.s[i]) / static_cast<float>(stride_.s[i])); | |||
| } | |||
| } | |||
| // check size | |||
| std::vector<int> shape_not_1; | |||
| std::vector<int> size_not_1; | |||
  // append through back_inserter: shape_not_1/size_not_1 start empty, so writing
  // through their begin() iterators would be undefined behavior
  auto out_shape = out_tensors_.front()->shape();
  std::copy_if(out_shape.begin(), out_shape.end(), std::back_inserter(shape_not_1), [](int x) { return x > 1; });
  std::copy_if(size_.s, size_.s + 4, std::back_inserter(size_not_1), [](int x) { return x > 1; });
  if (shape_not_1 != size_not_1) {
    MS_LOG(ERROR) << "Slice/StridedSlice kernel output shape infer error";
    return RET_ERROR;
  }
  return RET_OK;
}
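Editor's note: per axis, the normalization above reduces to the following self-contained C++ sketch (same formulas as InitConstArgs(); the helper name `AxisSize` is illustrative and the error paths are omitted):

    #include <algorithm>
    #include <cmath>

    // Normalize one axis of a strided slice: wrap negative indices, clamp to the
    // valid range, then size = ceil((end - begin) / stride).
    int AxisSize(int begin, int end, int stride, int dim) {
      if (begin < 0) begin += dim;
      begin = std::clamp(begin, 0, dim - 1);
      if (end < 0) end += dim;
      end = std::clamp(end, -1, dim);
      return static_cast<int>(std::ceil(static_cast<float>(end - begin) / static_cast<float>(stride)));
    }

    // e.g. the In1D_OutOfRangeBeginNegativeStride test below: AxisSize(5, 0, -1, 4)
    // clamps begin to 3, keeps end at 0, and yields ceil(-3 / -1) = 3 elements.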
void SliceOpenCLKernel::SetConstArgs() {
  int arg_cn = 2;
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, io_slices_);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, begin_);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn, size_);
}
void SliceOpenCLKernel::SetGlobalLocal() {
  auto output_info = Image2DInfo(out_tensors_.front());
  std::vector<size_t> global = {output_info.N * output_info.H, output_info.W, output_info.Slice};
  const int max_divider = 8;
  auto max_work_group_size = ocl_runtime_->DeviceMaxWorkGroupSize();
  size_t local_c = GetMaxDivisorStrategy0(global[2], max_divider);
  size_t local_hw = max_work_group_size / local_c;
  size_t local_h = std::min(UP_DIV(global[0], 2), local_hw);
  size_t local_w = std::min(local_hw / local_h, global[1]);
  std::vector<size_t> local = {local_h, local_w, local_c};
  AlignGlobalLocal(global, local);
}
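Editor's note: AlignGlobalLocal is assumed here to round each global dimension up to a multiple of the chosen local size so the NDRange divides evenly; the padded work-items fall through the kernel's bounds check. A sketch of that rounding, under this assumption:

    #include <cstddef>

    // Round global up to a multiple of local; surplus work-items exit early in the
    // kernel via its `if (... >= ...) return;` guard.
    size_t AlignUp(size_t global, size_t local) { return (global + local - 1) / local * local; }

    // e.g. AlignUp(19, 4) == 20, so one extra work-item per row is launched and discarded.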
int SliceOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running! ";
  ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c());
  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
  return RET_OK;
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Slice, OpenCLKernelCreator<SliceOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Slice, OpenCLKernelCreator<SliceOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_StridedSlice, OpenCLKernelCreator<SliceOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_StridedSlice, OpenCLKernelCreator<SliceOpenCLKernel>);
}  // namespace mindspore::kernel
@@ -14,8 +14,8 @@
 * limitations under the License.
 */
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_SLICE_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_SLICE_H_
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_STRIDED_SLICE_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_STRIDED_SLICE_H_
#include <vector>
#include "src/runtime/kernel/opencl/opencl_kernel.h"
@@ -31,12 +31,23 @@ class SliceOpenCLKernel : public OpenCLKernel {
  ~SliceOpenCLKernel() override = default;
  int Init() override;
  int Prepare() override;
  int Run() override;
  int CheckSpecs() override;
  void SetConstArgs() override;
  void SetGlobalLocal() override;
 private:
  int InitConstArgs();
  cl::Kernel kernel_;
  cl_int4 input_shape_{};
  cl_int4 output_shape_{};
  cl_int2 io_slices_{};
  cl_int4 begin_{};
  cl_int4 stride_{{1, 1, 1, 1}};
  cl_int4 size_{};
};
}  // namespace mindspore::kernel
@@ -34,33 +34,67 @@ struct OpenCLToFormatParameter {
  lite::opencl::MemType out_mem_type{lite::opencl::MemType::IMG};
};
template <typename SrcT, typename DstT>
void Broadcast2GpuShape(const SrcT *src, DstT *dst, int src_num) {
  auto *N = dst;
  auto *H = dst + 1;
  auto *W = dst + 2;
  auto *C = dst + 3;
  if (src_num == 1) {
    *N = src[0];
  } else if (src_num == 2) {
    *N = src[0];
    *C = src[1];
  } else if (src_num == 3) {
    *N = src[0];
    *W = src[1];
    *C = src[2];
  } else if (src_num == 4) {
    *N = src[0];
    *H = src[1];
    *W = src[2];
    *C = src[3];
  } else if (src_num >= 5) {
    MS_LOG(ERROR) << "GPU doesn't support ndim>=" << src_num;
  }
}
template <typename SrcT, typename DstT>
void Broadcast2GpuShape(const SrcT *src, DstT *dst, int src_num, DstT default_value) {
  for (int i = 0; i < 4; ++i) {
    dst[i] = default_value;
  }
  Broadcast2GpuShape(src, dst, src_num);
}
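Editor's note: a usage sketch of the overload above (values chosen for illustration only):

    // Broadcast2GpuShape maps a low-rank shape into fixed NHWC slots:
    // 1D -> N;  2D -> N, C;  3D -> N, W, C;  4D -> N, H, W, C.
    //   int src[3] = {2, 5, 8};              // a 3D shape
    //   int dst[4];
    //   Broadcast2GpuShape(src, dst, 3, 1);  // unfilled slots take the default 1
    //   // dst == {2, 1, 5, 8}, i.e. N = 2, H = 1, W = 5, C = 8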
struct Image2DInfo {
  explicit Image2DInfo(const lite::Tensor *tensor) {
    if (tensor == nullptr) {
      return;
    }
    auto shape = tensor->shape();
    if (shape.size() == 1) {
    auto ndim = shape.size();
    if (ndim == 1) {
      N = shape[0];
    } else if (shape.size() == 2) {
    } else if (ndim == 2) {
      N = shape[0];
      C = shape[1];
    } else if (shape.size() == 3) {
    } else if (ndim == 3) {
      N = shape[0];
      W = shape[1];
      C = shape[2];
    } else if (shape.size() == 4) {
    } else if (ndim == 4) {
      N = shape[0];
      H = shape[1];
      W = shape[2];
      C = shape[3];
    } else if (shape.size() >= 5) {
      MS_LOG(ERROR) << "GPU dont't support Tensor with dim=" << shape.size();
    } else if (ndim >= 5) {
      MS_LOG(ERROR) << "GPU doesn't support Tensor with ndim>=" << ndim;
    }
    Slice = UP_DIV(C, C4NUM);
    FLT_size = tensor->data_type() == kNumberTypeFloat16 ? sizeof(cl_half) : sizeof(cl_float);
    FLT4_size = FLT_size * 4;
    Slice = UP_DIV(C, C4NUM);
    if (W * Slice <= MAX_IMAGE2D_SIZE) {
      height = N * H;
      width = W * Slice;
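Editor's note: a worked example of the layout arithmetic above, using the {1, 19, 19, 96} tensor from the tests below:

    // For an NHWC tensor {1, 19, 19, 96} mapped onto an image2d:
    //   Slice  = UP_DIV(96, C4NUM) = 24      // channels packed four per texel
    //   width  = W * Slice = 19 * 24 = 456   // the W * Slice <= MAX_IMAGE2D_SIZE branch
    //   height = N * H = 1 * 19 = 19
    // Each texel holds one FLT4, so FLT4_size = 4 * FLT_size bytes.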
| @@ -13,357 +13,151 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <iostream> | |||
| #include <memory> | |||
| #include "src/common/log_adapter.h" | |||
| #include "common/common_test.h" | |||
| #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" | |||
| #include "mindspore/lite/src/common/file_utils.h" | |||
| #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" | |||
| #include "mindspore/lite/src/runtime/kernel/opencl/kernel/slice.h" | |||
| #include "nnacl/slice_parameter.h" | |||
| #include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h" | |||
| namespace mindspore { | |||
| class TestSliceOpenCLfp32 : public mindspore::CommonTest { | |||
| public: | |||
| TestSliceOpenCLfp32() {} | |||
| }; | |||
| class TestSliceOpenCLfp16 : public mindspore::CommonTest { | |||
| public: | |||
| TestSliceOpenCLfp16() {} | |||
| }; | |||
| template <typename T> | |||
| void CompareOutputData1(T *output_data, T *correct_data, int size, float err_bound) { | |||
| for (size_t i = 0; i < size; i++) { | |||
| T abs = fabs(output_data[i] - correct_data[i]); | |||
| ASSERT_LE(abs, err_bound); | |||
| } | |||
| } | |||
| TEST_F(TestSliceOpenCLfp32, Slicefp32CI) { | |||
| MS_LOG(INFO) << " begin test "; | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance(); | |||
| ocl_runtime->Init(); | |||
| auto allocator = ocl_runtime->GetAllocator(); | |||
| MS_LOG(INFO) << " Read tensors from .bin "; | |||
| std::vector<int> input_shape = {1, 2, 2, 8}; | |||
| std::vector<int> output_shape = {1, 2, 2, 5}; | |||
| std::vector<int> begin = {0, 0, 0, 2}; | |||
| std::vector<int> size = {1, 2, 2, 5}; | |||
| auto data_type = kNumberTypeFloat32; | |||
| auto tensor_type = lite::Tensor::CONST_TENSOR; | |||
| float input_data[] = {-0.45816937, 0.92391545, -0.9135602, -1.4002057, 1.1080881, 0.40712625, -0.28128958, | |||
| 0.09470133, 0.19801073, 0.04927751, -1.2808367, 0.1470597, 0.03393711, -0.33282498, | |||
| -1.0433807, -1.3678077, -0.6423931, 0.5584889, 0.28965706, 0.5343769, 0.75480366, | |||
| -1.9328151, -0.48714373, 1.711132, -1.8871949, -0.2987629, -0.14000037, -0.080552, | |||
| 0.95056856, -0.06886655, 0.5316237, 0.05787678}; | |||
| float correct_data[] = {-0.9135602, -1.4002057, 1.1080881, 0.40712625, -0.28128958, -1.2808367, 0.1470597, | |||
| 0.03393711, -0.33282498, -1.0433807, 0.28965706, 0.5343769, 0.75480366, -1.9328151, | |||
| -0.48714373, -0.14000037, -0.080552, 0.95056856, -0.06886655, 0.5316237}; | |||
| MS_LOG(INFO) << " construct tensors "; | |||
| lite::Tensor *tensor_data = new (std::nothrow) lite::Tensor(data_type, input_shape, schema::Format_NHWC, tensor_type); | |||
| if (tensor_data == nullptr) { | |||
| MS_LOG(INFO) << " init tensor failed "; | |||
| return; | |||
| } | |||
| auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, output_shape, schema::Format_NHWC, tensor_type); | |||
| if (output_tensor == nullptr) { | |||
| delete tensor_data; | |||
| MS_LOG(INFO) << " init tensor failed "; | |||
| return; | |||
| } | |||
| std::vector<lite::Tensor *> inputs = {tensor_data}; | |||
| std::vector<lite::Tensor *> outputs = {output_tensor}; | |||
| class TestSliceOpenCL : public mindspore::CommonTest {}; | |||
| MS_LOG(INFO) << "setting SliceParameter "; | |||
| auto param = reinterpret_cast<SliceParameter *>(malloc(sizeof(SliceParameter))); | |||
| OpParameter *GetSliceParameter(const std::vector<int> &begin, const std::vector<int> &size) { | |||
| auto param = static_cast<SliceParameter *>(malloc(sizeof(SliceParameter))); | |||
| if (param == nullptr) { | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||
| for (auto tensor : outputs) { | |||
| delete tensor; | |||
| } | |||
| MS_LOG(INFO) << "new SliceParameter failed "; | |||
| return; | |||
| MS_LOG(ERROR) << "SliceParameter create error."; | |||
| return nullptr; | |||
| } | |||
| for (int i = 0; i < input_shape.size(); i++) { | |||
| param->op_parameter_.type_ = schema::PrimitiveType_Slice; | |||
| param->param_length_ = begin.size(); | |||
| for (int i = 0; i < begin.size(); ++i) { | |||
| param->begin_[i] = begin[i]; | |||
| param->size_[i] = size[i]; | |||
| } | |||
| auto *slice_kernel = | |||
| new (std::nothrow) kernel::SliceOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||
| if (slice_kernel == nullptr) { | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||
| for (auto tensor : outputs) { | |||
| delete tensor; | |||
| } | |||
| delete param; | |||
| MS_LOG(INFO) << "new kernel::slice_kernel failed "; | |||
| return; | |||
| } | |||
| slice_kernel->Init(); | |||
| // to do allocate memory for inputs and outputs | |||
| for (auto &input_tensor : inputs) { | |||
| input_tensor->MallocData(allocator); | |||
| } | |||
| MS_LOG(INFO) << " initialize sub_graph "; | |||
| std::vector<kernel::LiteKernel *> kernels{slice_kernel}; | |||
| auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); | |||
| if (sub_graph == nullptr) { | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||
| for (auto tensor : outputs) { | |||
| delete tensor; | |||
| } | |||
| delete param; | |||
| delete slice_kernel; | |||
| MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed "; | |||
| return; | |||
| } | |||
| sub_graph->Init(); | |||
| MS_LOG(INFO) << " init tensors "; | |||
| memcpy(inputs[0]->data_c(), input_data, sizeof(input_data)); | |||
| std::cout << "==================output data================" << std::endl; | |||
| sub_graph->Run(); | |||
| auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->data_c()); | |||
| CompareOutputData1(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.0001); | |||
| for (auto tensor : inputs) { | |||
| tensor->set_data(nullptr); | |||
| delete tensor; | |||
| } | |||
| for (auto tensor : outputs) { | |||
| tensor->set_data(nullptr); | |||
| delete tensor; | |||
| } | |||
| delete sub_graph; | |||
| return reinterpret_cast<OpParameter *>(param); | |||
| } | |||
| TEST_F(TestSliceOpenCLfp32, Slicefp32input_dim4) { | |||
| MS_LOG(INFO) << " begin test "; | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance(); | |||
| ocl_runtime->Init(); | |||
| auto allocator = ocl_runtime->GetAllocator(); | |||
| MS_LOG(INFO) << " Read tensors from .bin "; | |||
| std::vector<int> input_shape = {1, 19, 19, 96}; | |||
| std::vector<int> output_shape = {1, 10, 10, 13}; | |||
| std::vector<int> begin = {0, 2, 3, 4}; | |||
| std::vector<int> size = {1, 10, 10, 13}; | |||
| auto data_type = kNumberTypeFloat32; | |||
| auto tensor_type = lite::Tensor::CONST_TENSOR; | |||
| // get the input from .bin | |||
| size_t input_size, output_size; | |||
| std::string input_path = "./test_data/in_slicefp32.bin"; | |||
| std::string output_path = "./test_data/out_slicefp32.bin"; | |||
| auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); | |||
| auto correct_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(output_path.c_str(), &output_size)); | |||
| MS_LOG(INFO) << " construct tensors "; | |||
| lite::Tensor *tensor_data = new (std::nothrow) lite::Tensor(data_type, input_shape, schema::Format_NHWC, tensor_type); | |||
| if (tensor_data == nullptr) { | |||
| MS_LOG(INFO) << " init tensor failed "; | |||
| return; | |||
| } | |||
| auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, output_shape, schema::Format_NHWC, tensor_type); | |||
| if (output_tensor == nullptr) { | |||
| delete tensor_data; | |||
| MS_LOG(INFO) << " init tensor failed "; | |||
| return; | |||
| } | |||
| std::vector<lite::Tensor *> inputs = {tensor_data}; | |||
| std::vector<lite::Tensor *> outputs = {output_tensor}; | |||
| MS_LOG(INFO) << "setting SliceParameter "; | |||
| auto param = reinterpret_cast<SliceParameter *>(malloc(sizeof(SliceParameter))); | |||
| if (param == nullptr) { | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||
| for (auto tensor : outputs) { | |||
| delete tensor; | |||
| } | |||
| MS_LOG(INFO) << "new SliceParameter failed "; | |||
| return; | |||
| } | |||
| for (int i = 0; i < input_shape.size(); i++) { | |||
| param->begin_[i] = begin[i]; | |||
| param->size_[i] = size[i]; | |||
| } | |||
| auto *slice_kernel = | |||
| new (std::nothrow) kernel::SliceOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||
| if (slice_kernel == nullptr) { | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||
| for (auto tensor : outputs) { | |||
| delete tensor; | |||
| } | |||
| delete param; | |||
| MS_LOG(INFO) << "new kernel::slice_kernel failed "; | |||
| return; | |||
| } | |||
| slice_kernel->Init(); | |||
| // to do allocate memory for inputs and outputs | |||
| for (auto &input_tensor : inputs) { | |||
| input_tensor->MallocData(allocator); | |||
| } | |||
| MS_LOG(INFO) << " initialize sub_graph "; | |||
| std::vector<kernel::LiteKernel *> kernels{slice_kernel}; | |||
| auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); | |||
| if (sub_graph == nullptr) { | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||
| for (auto tensor : outputs) { | |||
| delete tensor; | |||
| } | |||
| delete param; | |||
| delete slice_kernel; | |||
| MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed "; | |||
| return; | |||
| } | |||
| sub_graph->Init(); | |||
| MS_LOG(INFO) << " init tensors "; | |||
| memcpy(inputs[0]->data_c(), input_data, input_size); | |||
| std::cout << "==================output data================" << std::endl; | |||
| sub_graph->Run(); | |||
| auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->data_c()); | |||
| CompareOutputData1(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.0001); | |||
| for (auto tensor : inputs) { | |||
| tensor->set_data(nullptr); | |||
| delete tensor; | |||
| } | |||
| for (auto tensor : outputs) { | |||
| tensor->set_data(nullptr); | |||
| delete tensor; | |||
| } | |||
| delete sub_graph; | |||
| TEST_F(TestSliceOpenCL, 4D) { | |||
| float input_data[] = {-0.45816937, 0.92391545, -0.9135602, -1.4002057, 1.1080881, 0.40712625, -0.28128958, | |||
| 0.09470133, 0.19801073, 0.04927751, -1.2808367, 0.1470597, 0.03393711, -0.33282498, | |||
| -1.0433807, -1.3678077, -0.6423931, 0.5584889, 0.28965706, 0.5343769, 0.75480366, | |||
| -1.9328151, -0.48714373, 1.711132, -1.8871949, -0.2987629, -0.14000037, -0.080552, | |||
| 0.95056856, -0.06886655, 0.5316237, 0.05787678}; | |||
| float expect_data[] = {-0.9135602, -1.4002057, 1.1080881, 0.40712625, -0.28128958, -1.2808367, 0.1470597, | |||
| 0.03393711, -0.33282498, -1.0433807, 0.28965706, 0.5343769, 0.75480366, -1.9328151, | |||
| -0.48714373, -0.14000037, -0.080552, 0.95056856, -0.06886655, 0.5316237}; | |||
| auto param = GetSliceParameter({0, 0, 0, 2}, {1, 2, 2, 5}); | |||
| TestMain({{{1, 2, 2, 8}, input_data, Tensor::Category::VAR}}, {{1, 2, 2, 5}, expect_data}, param, false); | |||
| } | |||
| TEST_F(TestSliceOpenCLfp16, Slicefp16input_dim4) { | |||
| MS_LOG(INFO) << " begin test "; | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance(); | |||
| ocl_runtime->SetFp16Enable(true); | |||
| ocl_runtime->Init(); | |||
| auto allocator = ocl_runtime->GetAllocator(); | |||
| MS_LOG(INFO) << " Read tensors from .bin "; | |||
| std::vector<int> input_shape = {1, 25, 25, 48}; | |||
| std::vector<int> output_shape = {1, 24, 24, 15}; | |||
| std::vector<int> begin = {0, 1, 1, 7}; | |||
| std::vector<int> size = {1, 24, 24, 15}; | |||
| auto data_type = kNumberTypeFloat16; | |||
| auto tensor_type = lite::Tensor::CONST_TENSOR; | |||
| // get the input from .bin | |||
| size_t input_size, output_size; | |||
| std::string input_path = "./test_data/in_slicefp16.bin"; | |||
| std::string output_path = "./test_data/out_slicefp16.bin"; | |||
| auto input_data = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); | |||
| auto correct_data = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(output_path.c_str(), &output_size)); | |||
| MS_LOG(INFO) << " construct tensors "; | |||
| lite::Tensor *tensor_data = new (std::nothrow) lite::Tensor(data_type, input_shape, schema::Format_NHWC, tensor_type); | |||
| if (tensor_data == nullptr) { | |||
| MS_LOG(INFO) << " init tensor failed "; | |||
| return; | |||
| } | |||
| auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, output_shape, schema::Format_NHWC4, tensor_type); | |||
| if (output_tensor == nullptr) { | |||
| delete tensor_data; | |||
| MS_LOG(INFO) << " init tensor failed "; | |||
| return; | |||
| TEST_F(TestSliceOpenCL, tflite_cpu) { | |||
| std::vector<std::tuple<std::string, std::vector<int>, std::vector<int>, std::vector<float>, std::vector<float>, | |||
| std::vector<int>, std::vector<int>>> | |||
| cases = {{"In1D", {4}, {2}, {1, 2, 3, 4}, {2, 3}, {1}, {2}}, | |||
| {"In2D", {2, 3}, {1, 2}, {1, 2, 3, 4, 5, 6}, {4, 5}, {1, 0}, {1, 2}}, | |||
| {"In3D", | |||
| {2, 3, 2}, | |||
| {2, 3, 2}, | |||
| {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, | |||
| {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, | |||
| {0, 0, 0}, | |||
| {2, 3, 2}}, | |||
| {"InputFloat", {4, 1, 1, 1}, {3, 1, 1, 1}, {1, 2, 3, 4}, {2, 3, 4}, {1, 0, 0, 0}, {3, 1, 1, 1}}, | |||
| {"IndexInt64", {4, 1, 1, 1}, {3, 1, 1, 1}, {1, 2, 3, 4}, {2, 3, 4}, {1, 0, 0, 0}, {3, 1, 1, 1}}, | |||
| {"InputInteger1", | |||
| {3, 2, 3, 1}, | |||
| {1, 1, 3, 1}, | |||
| {1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6}, | |||
| {3, 3, 3}, | |||
| {1, 0, 0, 0}, | |||
| {1, 1, 3, 1}}, | |||
| {"InputInteger2", | |||
| {3, 2, 3, 1}, | |||
| {1, 2, 3, 1}, | |||
| {1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6}, | |||
| {3, 3, 3, 4, 4, 4}, | |||
| {1, 0, 0, 0}, | |||
| {1, 2, 3, 1}}, | |||
| {"InputInteger3", | |||
| {3, 2, 3, 1}, | |||
| {2, 1, 3, 1}, | |||
| {1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6}, | |||
| {3, 3, 3, 5, 5, 5}, | |||
| {1, 0, 0, 0}, | |||
| {2, 1, 3, 1}}, | |||
| {"SizeMinus1", | |||
| {3, 2, 3, 1}, | |||
| {2, 1, 3, 1}, | |||
| {1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6}, | |||
| {3, 3, 3, 5, 5, 5}, | |||
| {1, 0, 0, 0}, | |||
| {2, 1, -1, 1}}, | |||
| {"BeginNonZeroSizeMinus1Axis1", | |||
| {3, 3, 2, 1}, | |||
| {2, 2, 1, 1}, | |||
| {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9}, | |||
| {5, 6, 8, 9}, | |||
| {1, 1, 0, 0}, | |||
| {2, -1, 1, 1}}, | |||
| {"BeginNonZeroSizeMinus1Axis2", | |||
| {3, 2, 3, 1}, | |||
| {2, 1, 2, 1}, | |||
| {1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6}, | |||
| {3, 3, 5, 5}, | |||
| {1, 0, 1, 0}, | |||
| {2, 1, -1, 1}}, | |||
| {"BeginNonZeroSizeMinus1Axis3", | |||
| {3, 1, 2, 3}, | |||
| {2, 1, 1, 2}, | |||
| {1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6}, | |||
| {3, 3, 5, 5}, | |||
| {1, 0, 0, 1}, | |||
| {2, 1, 1, -1}}, | |||
| {"SliceUint8", | |||
| {3, 2, 3, 1}, | |||
| {2, 1, 3, 1}, | |||
| {1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6}, | |||
| {3, 3, 3, 5, 5, 5}, | |||
| {1, 0, 0, 0}, | |||
| {2, 1, -1, 1}}, | |||
| {"SliceInt8", | |||
| {3, 2, 3, 1}, | |||
| {2, 1, 3, 1}, | |||
| {1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6}, | |||
| {3, 3, 3, 5, 5, 5}, | |||
| {1, 0, 0, 0}, | |||
| {2, 1, -1, 1}}, | |||
| {"SliceInt16", | |||
| {3, 2, 3, 1}, | |||
| {2, 1, 3, 1}, | |||
| {1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6}, | |||
| {3, 3, 3, 5, 5, 5}, | |||
| {1, 0, 0, 0}, | |||
| {2, 1, -1, 1}}, | |||
| {"SliceInt64", | |||
| {3, 2, 3, 1}, | |||
| {2, 1, 3, 1}, | |||
| {1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6}, | |||
| {3, 3, 3, 5, 5, 5}, | |||
| {1, 0, 0, 0}, | |||
| {2, 1, -1, 1}}}; | |||
| for (auto &case_ : cases) { | |||
| auto &name = std::get<0>(case_); | |||
| auto &input_shape = std::get<1>(case_); | |||
| auto &output_shape = std::get<2>(case_); | |||
| auto &input_data = std::get<3>(case_); | |||
| auto &expect_data = std::get<4>(case_); | |||
| auto &begin = std::get<5>(case_); | |||
| auto &size = std::get<6>(case_); | |||
| std::cout << name << std::endl; | |||
| auto *param = GetSliceParameter(begin, size); | |||
| TestMain({{input_shape, input_data.data(), Tensor::Category::VAR}}, {output_shape, expect_data.data()}, param, | |||
| false); | |||
| param = GetSliceParameter(begin, size); | |||
| TestMain({{input_shape, input_data.data(), Tensor::Category::VAR}}, {output_shape, expect_data.data()}, param, | |||
| true); | |||
| } | |||
| std::vector<lite::Tensor *> inputs = {tensor_data}; | |||
| std::vector<lite::Tensor *> outputs = {output_tensor}; | |||
| MS_LOG(INFO) << " setting SliceParameter "; | |||
| auto param = reinterpret_cast<SliceParameter *>(malloc(sizeof(SliceParameter))); | |||
| if (param == nullptr) { | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||
| for (auto tensor : outputs) { | |||
| delete tensor; | |||
| } | |||
| MS_LOG(INFO) << " new SliceParameter failed "; | |||
| return; | |||
| } | |||
| for (int i = 0; i < input_shape.size(); i++) { | |||
| param->begin_[i] = begin[i]; | |||
| param->size_[i] = size[i]; | |||
| } | |||
| auto *slice_kernel = | |||
| new (std::nothrow) kernel::SliceOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||
| if (slice_kernel == nullptr) { | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||
| for (auto tensor : outputs) { | |||
| delete tensor; | |||
| } | |||
| delete param; | |||
| MS_LOG(INFO) << " new kernel::slice_kernel failed "; | |||
| return; | |||
| } | |||
| slice_kernel->Init(); | |||
| // to do allocate memory for inputs and outputs | |||
| for (auto &input_tensor : inputs) { | |||
| input_tensor->MallocData(allocator); | |||
| } | |||
| MS_LOG(INFO) << " initialize sub_graph "; | |||
| std::vector<kernel::LiteKernel *> kernels{slice_kernel}; | |||
| auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); | |||
| if (sub_graph == nullptr) { | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||
| for (auto tensor : outputs) { | |||
| delete tensor; | |||
| } | |||
| delete param; | |||
| delete slice_kernel; | |||
| MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed "; | |||
| return; | |||
| } | |||
| sub_graph->Init(); | |||
| MS_LOG(INFO) << " init tensors "; | |||
| memcpy(inputs[0]->data_c(), input_data, input_size); | |||
| } // namespace mindspore | |||
| std::cout << "==================output data================" << std::endl; | |||
| sub_graph->Run(); | |||
| auto *output_data_gpu = reinterpret_cast<float16_t *>(output_tensor->data_c()); | |||
| CompareOutputData1(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.0001); | |||
| for (auto tensor : inputs) { | |||
| tensor->set_data(nullptr); | |||
| delete tensor; | |||
| } | |||
| for (auto tensor : outputs) { | |||
| tensor->set_data(nullptr); | |||
| delete tensor; | |||
| } | |||
| delete sub_graph; | |||
| } | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,317 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "common/common_test.h" | |||
| #include "nnacl/strided_slice.h" | |||
| #include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h" | |||
| namespace mindspore { | |||
| class TestStridedSliceOpenCL : public mindspore::CommonTest {}; | |||
| OpParameter *GetStridedSliceParameter(const std::vector<int> &begins, const std::vector<int> &ends, | |||
| const std::vector<int> &strides) { | |||
| auto param = static_cast<StridedSliceParameter *>(malloc(sizeof(StridedSliceParameter))); | |||
| if (param == nullptr) { | |||
| MS_LOG(ERROR) << "create StridedSliceParameter error."; | |||
| return nullptr; | |||
| } | |||
| param->op_parameter_.type_ = schema::PrimitiveType_StridedSlice; | |||
| param->num_axes_ = begins.size(); | |||
| for (int i = 0; i < begins.size(); ++i) { | |||
| param->begins_[i] = begins[i]; | |||
| param->ends_[i] = ends[i]; | |||
| param->strides_[i] = strides[i]; | |||
| } | |||
| return reinterpret_cast<OpParameter *>(param); | |||
| } | |||
| TEST_F(TestStridedSliceOpenCL, 1D) { | |||
| float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, | |||
| 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; | |||
| float expect_data[] = {3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33}; | |||
| auto *param = GetStridedSliceParameter({3}, {36}, {3}); | |||
| TestMain({{{36}, input_data, Tensor::Category::VAR}}, {{11}, expect_data}, param, false); | |||
| } | |||
| TEST_F(TestStridedSliceOpenCL, 2D) { | |||
| float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, | |||
| 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; | |||
| float expect_data[] = {11, 14}; | |||
| auto *param = GetStridedSliceParameter({1, 2}, {3, 8}, {2, 3}); | |||
| TestMain({{{4, 9}, input_data, Tensor::Category::VAR}}, {{1, 2}, expect_data}, param, false); | |||
| } | |||
| TEST_F(TestStridedSliceOpenCL, 3D) { | |||
| float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, | |||
| 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; | |||
| float expect_data[] = {11, 14}; | |||
| auto *param = GetStridedSliceParameter({0, 1, 2}, {1, 3, 8}, {1, 2, 3}); | |||
| TestMain({{{1, 4, 9}, input_data, Tensor::Category::VAR}}, {{1, 1, 2}, expect_data}, param, false); | |||
| } | |||
| TEST_F(TestStridedSliceOpenCL, 4D) { | |||
| float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, | |||
| 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; | |||
| float expect_data0[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, | |||
| 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; | |||
| auto *param = GetStridedSliceParameter({0, 0, 0, 0}, {2, 2, 3, 3}, {1, 1, 1, 1}); | |||
| TestMain({{{2, 2, 3, 3}, input_data, Tensor::Category::VAR}}, {{2, 2, 3, 3}, expect_data0}, param, false); | |||
| param = GetStridedSliceParameter({0, 0, 0, 0}, {2, 2, 3, 3}, {1, 1, 1, 1}); | |||
| TestMain({{{2, 2, 3, 3}, input_data, Tensor::Category::VAR}}, {{2, 2, 3, 3}, expect_data0}, param, true); | |||
| float expect_data1[] = {18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; | |||
| param = GetStridedSliceParameter({1, 0, 0, 0}, {2, 2, 3, 3}, {1, 1, 1, 1}); | |||
| TestMain({{{2, 2, 3, 3}, input_data, Tensor::Category::VAR}}, {{1, 2, 3, 3}, expect_data1}, param, false); | |||
| float expect_data2[] = {27, 28, 29, 30, 31, 32, 33, 34, 35}; | |||
| param = GetStridedSliceParameter({1, 1, 0, 0}, {2, 2, 3, 3}, {1, 1, 1, 1}); | |||
| TestMain({{{2, 2, 3, 3}, input_data, Tensor::Category::VAR}}, {{1, 1, 3, 3}, expect_data2}, param, false); | |||
| float expect_data3[] = {33, 34, 35}; | |||
| param = GetStridedSliceParameter({1, 1, 2, 0}, {2, 2, 3, 3}, {1, 1, 1, 1}); | |||
| TestMain({{{2, 2, 3, 3}, input_data, Tensor::Category::VAR}}, {{1, 1, 1, 3}, expect_data3}, param, false); | |||
| float expect_data4[] = {34}; | |||
| param = GetStridedSliceParameter({1, 1, 2, 1}, {2, 2, 3, 2}, {1, 1, 1, 1}); | |||
| TestMain({{{2, 2, 3, 3}, input_data, Tensor::Category::VAR}}, {{1, 1, 1, 1}, expect_data4}, param, false); | |||
| } | |||
| TEST_F(TestStridedSliceOpenCL, 4D_stride2) { | |||
| float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, | |||
| 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; | |||
| float expect_data[] = {13, 14, 31, 32}; | |||
| auto *param = GetStridedSliceParameter({0, 1, 1, 1}, {1, 4, 3, 3}, {2, 2, 2, 1}); | |||
| TestMain({{{1, 4, 3, 3}, input_data, Tensor::Category::VAR}}, {{1, 2, 1, 2}, expect_data}, param, false); | |||
| } | |||
| TEST_F(TestStridedSliceOpenCL, 4D_to_3D) { | |||
| float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, | |||
| 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; | |||
| float expect_data[] = {18, 20, 21, 23, 27, 29, 30, 32}; | |||
| auto *param = GetStridedSliceParameter({1, 0, 0, 0}, {2, 2, 2, 3}, {1, 1, 1, 2}); | |||
| TestMain({{{2, 2, 3, 3}, input_data, Tensor::Category::VAR}}, {{2, 2, 2}, expect_data}, param, false); | |||
| } | |||
| TEST_F(TestStridedSliceOpenCL, In1D_OutOfRangeBeginNegativeStride) { | |||
| float input_data[] = {1, 2, 3, 4}; | |||
| float expect_data[] = {4, 3, 2}; | |||
| auto *param = GetStridedSliceParameter({5}, {0}, {-1}); | |||
| TestMain({{{4}, input_data, Tensor::Category::VAR}}, {{3}, expect_data}, param, false); | |||
| } | |||
| TEST_F(TestStridedSliceOpenCL, tflite_cpu) { | |||
| std::vector<float> values(32768); | |||
| for (int i = 0; i < values.size(); ++i) { | |||
| values[i] = i % 1000; | |||
| } | |||
| std::vector<std::tuple<std::string, std::vector<int>, std::vector<int>, std::vector<float>, std::vector<float>, | |||
| std::vector<int>, std::vector<int>, std::vector<int>>> | |||
| cases = {{"In1D", {4}, {2}, {1, 2, 3, 4}, {2, 3}, {1}, {3}, {1}}, | |||
| {"In1D_Int32End", {32768}, {32768}, values, values, {0}, {32768}, {1}}, | |||
| {"In1D_NegativeBegin", {4}, {2}, {1, 2, 3, 4}, {2, 3}, {-3}, {3}, {1}}, | |||
| {"In1D_OutOfRangeBegin", {4}, {3}, {1, 2, 3, 4}, {1, 2, 3}, {-5}, {3}, {1}}, | |||
| {"In1D_NegativeEnd", {4}, {1}, {1, 2, 3, 4}, {2}, {1}, {-2}, {1}}, | |||
| {"In1D_OutOfRangeEnd", {4}, {3}, {1, 2, 3, 4}, {2, 3, 4}, {-3}, {5}, {1}}, | |||
| {"In1D_NegativeBeginNegativeStride", {4}, {1}, {1, 2, 3, 4}, {3}, {-2}, {-3}, {-1}}, | |||
| {"In1D_OutOfRangeBeginNegativeStride", {4}, {1}, {1, 2, 3, 4}, {4}, {5}, {2}, {-1}}, | |||
| {"In1D_NegativeEndNegativeStride", {4}, {2}, {1, 2, 3, 4}, {3, 2}, {2}, {-4}, {-1}}, | |||
| {"In1D_OutOfRangeEndNegativeStride", {4}, {2}, {1, 2, 3, 4}, {2, 1}, {-3}, {-5}, {-1}}, | |||
| {"In1D_NegStride", {3}, {3}, {1, 2, 3}, {3, 2, 1}, {-1}, {-4}, {-1}}, | |||
| {"In1D_EvenLenStride2", {2}, {1}, {1, 2}, {1}, {0}, {2}, {2}}, | |||
| {"In1D_OddLenStride2", {3}, {2}, {1, 2, 3}, {1, 3}, {0}, {3}, {2}}, | |||
| {"In2D_Identity", {2, 3}, {2, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6}, {0, 0}, {2, 3}, {1, 1}}, | |||
| {"In2D", {2, 3}, {1, 2}, {1, 2, 3, 4, 5, 6}, {4, 5}, {1, 0}, {2, 2}, {1, 1}}, | |||
| {"In2D_Stride2", {2, 3}, {1, 2}, {1, 2, 3, 4, 5, 6}, {1, 3}, {0, 0}, {2, 3}, {2, 2}}, | |||
| {"In2D_NegStride", {2, 3}, {1, 3}, {1, 2, 3, 4, 5, 6}, {6, 5, 4}, {1, -1}, {2, -4}, {2, -1}}, | |||
| {"In2D_BeginMask", {2, 3}, {2, 2}, {1, 2, 3, 4, 5, 6}, {1, 2, 4, 5}, {0, 0}, {2, 2}, {1, 1}}, | |||
| {"In2D_EndMask", {2, 3}, {1, 3}, {1, 2, 3, 4, 5, 6}, {4, 5, 6}, {1, 0}, {2, 3}, {1, 1}}, | |||
| {"In2D_NegStrideBeginMask", {2, 3}, {1, 3}, {1, 2, 3, 4, 5, 6}, {6, 5, 4}, {1, -1}, {2, -4}, {1, -1}}, | |||
| {"In2D_NegStrideEndMask", {2, 3}, {1, 2}, {1, 2, 3, 4, 5, 6}, {6, 5}, {1, -1}, {2, 0}, {1, -1}}, | |||
| {"In3D_Identity", | |||
| {2, 3, 2}, | |||
| {2, 3, 2}, | |||
| {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, | |||
| {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, | |||
| {0, 0, 0}, | |||
| {2, 3, 2}, | |||
| {1, 1, 1}}, | |||
| {"In3D_NegStride", | |||
| {2, 3, 2}, | |||
| {2, 3, 2}, | |||
| {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, | |||
| {12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}, | |||
| {-1, -1, -1}, | |||
| {-3, -4, -3}, | |||
| {-1, -1, -1}}, | |||
| {"In3D_Strided2", | |||
| {2, 3, 2}, | |||
| {1, 2, 1}, | |||
| {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, | |||
| {1, 5}, | |||
| {0, 0, 0}, | |||
| {2, 3, 2}, | |||
| {2, 2, 2}}, | |||
| {"In1D_ShrinkAxisMask1", {4}, {1}, {1, 2, 3, 4}, {2}, {1}, {2}, {1}}, | |||
| {"In1D_ShrinkAxisMask1_NegativeSlice", {4}, {1}, {0, 1, 2, 3}, {3}, {-1}, {4}, {1}}, | |||
| {"In2D_ShrinkAxis3_NegativeSlice", {4, 1}, {1}, {0, 1, 2, 3}, {2}, {-2, -1}, {3, 1}, {1, 1}}, | |||
| {"In2D_ShrinkAxis2_BeginEndAxis1_NegativeSlice", | |||
| {4, 1}, | |||
| {4}, | |||
| {0, 1, 2, 3}, | |||
| {0, 1, 2, 3}, | |||
| {0, -1}, | |||
| {4, 1}, | |||
| {1, 1}}, | |||
| {"In1D_BeginMaskShrinkAxisMask1", {4}, {1}, {1, 2, 3, 4}, {1}, {0}, {1}, {1}}, | |||
| {"In2D_ShrinkAxisMask1", {2, 3}, {3}, {1, 2, 3, 4, 5, 6}, {1, 2, 3}, {0, 0}, {1, 3}, {1, 1}}, | |||
| {"In2D_ShrinkAxisMask2", {2, 3}, {2}, {1, 2, 3, 4, 5, 6}, {1, 4}, {0, 0}, {2, 1}, {1, 1}}, | |||
| {"In2D_ShrinkAxisMask3", {2, 3}, {1}, {1, 2, 3, 4, 5, 6}, {1}, {0, 0}, {1, 1}, {1, 1}}, | |||
| {"In3D_IdentityShrinkAxis1", | |||
| {2, 3, 2}, | |||
| {3, 2}, | |||
| {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, | |||
| {1, 2, 3, 4, 5, 6}, | |||
| {0, 0, 0}, | |||
| {1, 3, 2}, | |||
| {1, 1, 1}}, | |||
| {"In3D_IdentityShrinkAxis2", | |||
| {2, 3, 2}, | |||
| {2, 2}, | |||
| {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, | |||
| {1, 2, 7, 8}, | |||
| {0, 0, 0}, | |||
| {2, 1, 2}, | |||
| {1, 1, 1}}, | |||
| {"In3D_IdentityShrinkAxis3", | |||
| {2, 3, 2}, | |||
| {2}, | |||
| {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, | |||
| {1, 2}, | |||
| {0, 0, 0}, | |||
| {1, 1, 2}, | |||
| {1, 1, 1}}, | |||
| {"In3D_IdentityShrinkAxis4", | |||
| {2, 3, 2}, | |||
| {2, 3}, | |||
| {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, | |||
| {1, 3, 5, 7, 9, 11}, | |||
| {0, 0, 0}, | |||
| {2, 3, 1}, | |||
| {1, 1, 1}}, | |||
| {"In3D_IdentityShrinkAxis5", | |||
| {2, 3, 2}, | |||
| {3}, | |||
| {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, | |||
| {1, 3, 5}, | |||
| {0, 0, 0}, | |||
| {1, 3, 1}, | |||
| {1, 1, 1}}, | |||
| {"In3D_IdentityShrinkAxis6", | |||
| {2, 3, 2}, | |||
| {2}, | |||
| {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, | |||
| {1, 7}, | |||
| {0, 0, 0}, | |||
| {2, 1, 1}, | |||
| {1, 1, 1}}, | |||
| {"In3D_IdentityShrinkAxis7", | |||
| {2, 3, 2}, | |||
| {1}, | |||
| {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, | |||
| {1}, | |||
| {0, 0, 0}, | |||
| {1, 1, 1}, | |||
| {1, 1, 1}}, | |||
| {"In3D_IdentityShrinkAxis1Uint8", | |||
| {2, 3, 2}, | |||
| {3, 2}, | |||
| {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, | |||
| {1, 2, 3, 4, 5, 6}, | |||
| {0, 0, 0}, | |||
| {1, 3, 2}, | |||
| {1, 1, 1}}, | |||
| {"In3D_IdentityShrinkAxis1int8", | |||
| {2, 3, 2}, | |||
| {3, 2}, | |||
| {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, | |||
| {1, 2, 3, 4, 5, 6}, | |||
| {0, 0, 0}, | |||
| {1, 3, 2}, | |||
| {1, 1, 1}}, | |||
| {"In5D_Identity", | |||
| {2, 2, 2, 2}, | |||
| {2, 1, 2, 2}, | |||
| {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, | |||
| {1, 2, 3, 4, 9, 10, 11, 12}, | |||
| {0, 0, 0, 0}, | |||
| {2, 1, 2, 2}, | |||
| {1, 1, 1, 1}}, | |||
| {"In5D_IdentityShrinkAxis1", | |||
| {2, 2, 2, 2}, | |||
| {1, 2, 2}, | |||
| {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, | |||
| {1, 2, 3, 4}, | |||
| {0, 0, 0, 0}, | |||
| {1, 1, 2, 2}, | |||
| {1, 1, 1, 1}}, | |||
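| // SmallBegin: begin/end/stride shorter than the input rank apply to the leading dims only | |||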
| {"In3D_SmallBegin", | |||
| {2, 3, 2}, | |||
| {1, 3, 2}, | |||
| {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, | |||
| {1, 2, 3, 4, 5, 6}, | |||
| {0}, | |||
| {1}, | |||
| {1}}, | |||
| {"In3D_SmallBeginWithhrinkAxis1", | |||
| {2, 3, 2}, | |||
| {3, 2}, | |||
| {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, | |||
| {1, 2, 3, 4, 5, 6}, | |||
| {0}, | |||
| {1}, | |||
| {1}}}; | |||
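| // case tuple layout: {name, input_shape, output_shape, input_data, expect_data, begin, end, stride} | |||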
| for (auto &case_ : cases) { | |||
| auto &name = std::get<0>(case_); | |||
| auto &input_shape = std::get<1>(case_); | |||
| auto &output_shape = std::get<2>(case_); | |||
| auto &input_data = std::get<3>(case_); | |||
| auto &expect_data = std::get<4>(case_); | |||
| auto &begin = std::get<5>(case_); | |||
| auto &end = std::get<6>(case_); | |||
| auto &stride = std::get<7>(case_); | |||
| std::cout << name << std::endl; | |||
| auto *param = GetStridedSliceParameter(begin, end, stride); | |||
| TestMain({{input_shape, input_data.data(), Tensor::Category::VAR}}, {output_shape, expect_data.data()}, param, | |||
| false); | |||
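| // the first TestMain consumed param, so build a fresh parameter for the fp16 pass | |||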
| param = GetStridedSliceParameter(begin, end, stride); | |||
| TestMain({{input_shape, input_data.data(), Tensor::Category::VAR}}, {output_shape, expect_data.data()}, param, | |||
| true); | |||
| } | |||
| } | |||
| TEST_F(TestStridedSliceOpenCL, tflite_opencl) { | |||
| float input_data[] = {0.1f, 0.2f, 0.3f, 0.4f, 1.1f, 1.2f, 1.3f, 1.4f, 10.1f, 10.2f, 10.3f, 10.4f, | |||
| 11.1f, 11.2f, 11.3f, 11.4f, 20.1f, 20.2f, 20.3f, 20.4f, 21.1f, 21.2f, 21.3f, 21.4f}; | |||
| float expect_data[] = {10.2f, 10.4f, 20.2f, 20.4f}; | |||
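| // slicing NHWC {1,3,2,4} with begin={0,1,0,1}, end={1,3,2,4}, stride={1,1,2,2} keeps h={1,2}, w=0, c={1,3} | |||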
| auto *param = GetStridedSliceParameter({0, 1, 0, 1}, {1, 3, 2, 4}, {1, 1, 2, 2}); | |||
| TestMain({{{1, 3, 2, 4}, input_data, Tensor::Category::VAR}}, {{1, 2, 1, 2}, expect_data}, param, false); | |||
| } | |||
| } // namespace mindspore | |||
| @@ -15,11 +15,17 @@ | |||
| */ | |||
| #include <string> | |||
| #include "src/common/log_adapter.h" | |||
| #include "mindspore/lite/src/common/file_utils.h" | |||
| #include "common/common_test.h" | |||
| #include "src/kernel_registry.h" | |||
| #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" | |||
| #include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h" | |||
| using mindspore::kernel::LiteKernel; | |||
| using mindspore::kernel::SubGraphOpenCLKernel; | |||
| using mindspore::lite::KernelRegistry; | |||
| using mindspore::lite::Tensor; | |||
| using mindspore::schema::Format::Format_NHWC; | |||
| namespace mindspore { | |||
| void LoadTestData(void *dst, size_t dst_size, const std::string &file_path) { | |||
| @@ -35,4 +41,80 @@ void LoadTestData(void *dst, size_t dst_size, const std::string &file_path) { | |||
| } | |||
| } | |||
| void TestMain(const std::vector<std::tuple<std::vector<int>, float *, Tensor::Category>> &input_infos, | |||
| std::tuple<std::vector<int>, float *> output_info, OpParameter *op_parameter, bool fp16_enable, | |||
| float atol, bool print_output) { | |||
| MS_LOG(DEBUG) << "initialize OpenCLRuntime and OpenCLAllocator"; | |||
| auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper(); | |||
| auto ocl_runtime = runtime_wrapper.GetInstance(); | |||
| EXPECT_TRUE(ocl_runtime->Init() == RET_OK); | |||
| ocl_runtime->SetFp16Enable(fp16_enable); | |||
| auto allocator = ocl_runtime->GetAllocator(); | |||
| MS_LOG(DEBUG) << "create Tensors & init weight data"; | |||
| std::vector<Tensor> tensors; | |||
| std::vector<Tensor *> kernel_inputs; | |||
| std::vector<Tensor *> subgraph_inputs; | |||
| std::map<Tensor *, float *> subgraph_inputs_data; | |||
| for (auto input_info : input_infos) { | |||
| const std::vector<int> &shape = std::get<0>(input_info); | |||
| auto *input_data = std::get<1>(input_info); | |||
| const Tensor::Category category = std::get<2>(input_info); | |||
| tensors.emplace_back(kNumberTypeFloat32, shape, Format_NHWC, category); | |||
| auto *new_tensor = &tensors.back(); | |||
| kernel_inputs.push_back(new_tensor); | |||
| if (category != Tensor::Category::VAR) { | |||
| memcpy(new_tensor->MutableData(), input_data, new_tensor->Size()); | |||
| } else { | |||
| subgraph_inputs.push_back(new_tensor); | |||
| subgraph_inputs_data[new_tensor] = input_data; | |||
| } | |||
| } | |||
| const std::vector<int> &output_shape = std::get<0>(output_info); | |||
| float *expect_data = std::get<1>(output_info); | |||
| auto output = Tensor(kNumberTypeFloat32, output_shape, Format_NHWC, Tensor::Category::VAR); | |||
| MS_LOG(DEBUG) << "create OpenCL Kernel"; | |||
| auto primitive_type = static_cast<schema::PrimitiveType>(op_parameter->type_); | |||
| kernel::KernelKey key{kernel::kGPU, kernel_inputs.front()->data_type(), primitive_type}; | |||
| auto creator = KernelRegistry::GetInstance()->GetCreator(key); | |||
| if (creator == nullptr) { | |||
| std::cerr << "get kernel registry function error: " << schema::EnumNamePrimitiveType(primitive_type) << std::endl; | |||
| free(op_parameter); | |||
| FAIL(); | |||
| } | |||
| auto *kernel = creator(kernel_inputs, {&output}, op_parameter, nullptr, key, nullptr); | |||
| if (kernel == nullptr) { | |||
| std::cerr << "call kernel registry function error: " << schema::EnumNamePrimitiveType(primitive_type) << std::endl; | |||
| free(op_parameter); | |||
| FAIL(); | |||
| } | |||
| MS_LOG(DEBUG) << "create SubGraph & init input data"; | |||
| std::vector<LiteKernel *> kernels{kernel}; | |||
| auto sub_graph = new (std::nothrow) SubGraphOpenCLKernel(subgraph_inputs, {&output}, kernels, kernels, kernels); | |||
| if (sub_graph == nullptr) { | |||
| std::cerr << "create SubGraphOpenCLKernel error" << std::endl; | |||
| delete kernel;  // don't leak the kernel on allocation failure | |||
| FAIL(); | |||
| } | |||
| for (auto input : subgraph_inputs) { | |||
| EXPECT_TRUE(input->MallocData(allocator) == RET_OK); | |||
| } | |||
| EXPECT_TRUE(sub_graph->Init() == RET_OK); | |||
| for (auto input : subgraph_inputs) { | |||
| memcpy(input->data_c(), subgraph_inputs_data[input], input->Size()); | |||
| } | |||
| MS_LOG(DEBUG) << "run SubGraph & compare result"; | |||
| EXPECT_TRUE(sub_graph->Run() == RET_OK); | |||
| if (print_output) { | |||
| for (int i = 0; i < output.ElementsNum(); ++i) { | |||
| printf("%d: expect=%.3f output=%.3f\n", i, expect_data[i], reinterpret_cast<float *>(output.data_c())[i]); | |||
| } | |||
| } | |||
| CommonTest::CompareOutputData(reinterpret_cast<float *>(output.data_c()), expect_data, output.ElementsNum(), atol); | |||
| MS_LOG(DEBUG) << "release resources"; | |||
| delete sub_graph; | |||
| } | |||
| } // namespace mindspore | |||
| @@ -14,16 +14,18 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef TESTS_UT_OPENCL_KERNEL_TESTS_UTILS_H_ | |||
| #define TESTS_UT_OPENCL_KERNEL_TESTS_UTILS_H_ | |||
| #include <string> | |||
| #include <iostream> | |||
| #include "tests/ut/cpp/common/common_test.h" | |||
| #include "src/common/log_adapter.h" | |||
| #include <vector> | |||
| #include <tuple> | |||
| #include <map> | |||
| #include "mindspore/lite/src/tensor.h" | |||
| #include "mindspore/lite/src/common/file_utils.h" | |||
| #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" | |||
| #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" | |||
| using mindspore::lite::Tensor; | |||
| namespace mindspore { | |||
| @@ -63,6 +65,10 @@ void CompareOutput(lite::Tensor *output_tensor, const std::string &file_path, T | |||
| CompareOutput(output_tensor->data_c(), expect_data, output_tensor->ElementsNum(), atol, rtol); | |||
| } | |||
| void TestMain(const std::vector<std::tuple<std::vector<int>, float *, Tensor::Category>> &input_infos, | |||
| std::tuple<std::vector<int>, float *> output_info, OpParameter *op_parameter, bool fp16_enable = false, | |||
| float atol = 1e-8f, bool print_output = false); | |||
| } // namespace mindspore | |||
| #endif // TESTS_UT_OPENCL_KERNEL_TESTS_UTILS_H_ | |||