diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/stack.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/stack.cl
index 3620334414..4bc72b9f6e 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/stack.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/stack.cl
@@ -1,5 +1,6 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #define INT4 int4
+#define C4NUM 4
 __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
 #define CHECK_IDX_FOR_STACK                                                                 \
   int X = get_global_id(0);                                                                 \
@@ -10,128 +11,95 @@ __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE |
   }                                                                                         \
   FLT4 result;
 
-__kernel void stack8inputaxis1(__read_only image2d_t input0, __read_only image2d_t input1, __read_only image2d_t input2,
-                               __read_only image2d_t input3, __read_only image2d_t input4, __read_only image2d_t input5,
-                               __read_only image2d_t input6, __read_only image2d_t input7,
-                               __write_only image2d_t output, int4 input_shape0, int4 input_shape1, int4 input_shape2,
-                               int4 input_shape3, int4 input_shape4, int4 input_shape5, int4 input_shape6,
-                               int4 input_shape7, int4 output_shape) {
+// input -1D
+__kernel void stack_2input_3axis_1inshape(__read_only image2d_t input0, __read_only image2d_t input1,
+                                          __write_only image2d_t output, int4 input_shape, int4 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x * output_shape.y || Y >= output_shape.z) {
+    return;
+  }
+  int coordinate_x_out = output_shape.w;
+  FLT4 result1 = READ_IMAGE(input0, smp_none, (int2)(0, (X)));
+  FLT4 result2 = READ_IMAGE(input1, smp_none, (int2)(0, (X)));
+  FLT4 result = {result1.x, result2.x, 0, 0};
+  WRITE_IMAGE(output, (int2)(coordinate_x_out, (X)), result);
+}
+
+// input -2D -axis = 1
+__kernel void stack_2input_1axis_2inshape(__read_only image2d_t input0, __read_only image2d_t input1,
+                                          __write_only image2d_t output, int4 input_shape, int4 output_shape) {
   CHECK_IDX_FOR_STACK;
-  if (X < input_shape0.y) {
-    result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X)));
-  } else if (X < (input_shape0.y + input_shape1.y)) {
-    result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z, (X - input_shape0.y)));
-  } else if (X < (input_shape0.y + input_shape1.y + input_shape2.y)) {
-    result = READ_IMAGE(input2, smp_none, (int2)((Y)*input_shape2.w + Z, (X - input_shape0.y - input_shape1.y)));
-  } else if (X < (input_shape0.y + input_shape1.y + input_shape2.y + input_shape3.y)) {
-    result = READ_IMAGE(input3, smp_none,
-                        (int2)((Y)*input_shape3.w + Z, (X - input_shape0.y - input_shape1.y - input_shape2.y)));
-  } else if (X < (input_shape0.y + input_shape1.y + input_shape2.y + input_shape3.y + input_shape4.y)) {
-    result = READ_IMAGE(
-      input4, smp_none,
-      (int2)((Y)*input_shape4.w + Z, (X - input_shape0.y - input_shape1.y - input_shape2.y - input_shape3.y)));
-  } else if (X <
-             (input_shape0.y + input_shape1.y + input_shape2.y + input_shape3.y + input_shape4.y + input_shape5.y)) {
-    result = READ_IMAGE(input5, smp_none,
-                        (int2)((Y)*input_shape5.w + Z, (X - input_shape0.y - input_shape1.y - input_shape2.y -
-                                                        input_shape3.y - input_shape4.y)));
-  } else if (X < (input_shape0.y + input_shape1.y + input_shape2.y + input_shape3.y + input_shape4.y + input_shape5.y +
-                  input_shape6.y)) {
-    result = READ_IMAGE(input6, smp_none,
-                        (int2)((Y)*input_shape6.w + Z, (X - input_shape0.y - input_shape1.y - input_shape2.y -
-                                                        input_shape3.y - input_shape4.y - input_shape5.y)));
+  int IN = X / output_shape.y;
+  int IH = X % output_shape.y;
+  int boundary0 = input_shape.z;
+  if (Y < boundary0) {
+    int coordinate_x = Y * input_shape.w + Z;
+    int coordinate_y = IN * input_shape.y + IH;
+    result = READ_IMAGE(input0, smp_none, (int2)(coordinate_x, coordinate_y));
   } else {
-    result =
-      READ_IMAGE(input7, smp_none,
-                 (int2)((Y)*input_shape7.w + Z, (X - input_shape0.y - input_shape1.y - input_shape2.y - input_shape3.y -
-                                                 input_shape4.y - input_shape5.y - input_shape6.y)));
+    int coordinate_x = (Y - boundary0) * input_shape.w + Z;
+    int coordinate_y = IN * input_shape.y + IH;
+    result = READ_IMAGE(input1, smp_none, (int2)(coordinate_x, coordinate_y));
   }
-  WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
+  WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (IN * output_shape.y + IH)), result);
 }
 
-__kernel void stack8inputaxis2(__read_only image2d_t input0, __read_only image2d_t input1, __read_only image2d_t input2,
-                               __read_only image2d_t input3, __read_only image2d_t input4, __read_only image2d_t input5,
-                               __read_only image2d_t input6, __read_only image2d_t input7,
-                               __write_only image2d_t output, int4 input_shape0, int4 input_shape1, int4 input_shape2,
-                               int4 input_shape3, int4 input_shape4, int4 input_shape5, int4 input_shape6,
-                               int4 input_shape7, int4 output_shape) {
+// input -3D -axis = 1
+__kernel void stack_2input_1axis_3inshape(__read_only image2d_t input0, __read_only image2d_t input1,
+                                          __write_only image2d_t output, int4 input_shape, int4 output_shape) {
   CHECK_IDX_FOR_STACK;
-  if (Y < input_shape0.z) {
-    result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X)));
-  } else if (Y < (input_shape0.z + input_shape1.z)) {
-    result = READ_IMAGE(input1, smp_none, (int2)((Y - input_shape0.z) * input_shape1.w + Z, (X)));
-  } else if (Y < (input_shape0.z + input_shape1.z + input_shape2.z)) {
-    result = READ_IMAGE(input2, smp_none, (int2)((Y - input_shape0.z - input_shape1.z) * input_shape2.w + Z, (X)));
-  } else if (Y < (input_shape0.z + input_shape1.z + input_shape2.z + input_shape3.z)) {
-    result = READ_IMAGE(input3, smp_none,
-                        (int2)((Y - input_shape0.z - input_shape1.z - input_shape2.z) * input_shape3.w + Z, (X)));
-  } else if (Y < (input_shape0.z + input_shape1.z + input_shape2.z + input_shape3.z + input_shape4.z)) {
-    result = READ_IMAGE(
-      input4, smp_none,
-      (int2)((Y - input_shape0.z - input_shape1.z - input_shape2.z - input_shape3.z) * input_shape4.w + Z, (X)));
-  } else if (Y <
-             (input_shape0.z + input_shape1.z + input_shape2.z + input_shape3.z + input_shape4.z + input_shape5.z)) {
-    result = READ_IMAGE(
-      input5, smp_none,
-      (int2)(
-        (Y - input_shape0.z - input_shape1.z - input_shape2.z - input_shape3.z - input_shape4.z) * input_shape5.w + Z,
-        (X)));
-  } else if (Y < (input_shape0.z + input_shape1.z + input_shape2.z + input_shape3.z + input_shape4.z + input_shape5.z +
-                  input_shape6.z)) {
-    result = READ_IMAGE(
-      input6, smp_none,
-      (int2)((Y - input_shape0.z - input_shape1.z - input_shape2.z - input_shape3.z - input_shape4.z - input_shape5.z) *
-                 input_shape6.w +
-               Z,
-             (X)));
+  int IN = X / output_shape.y;
+  int IH = X % output_shape.y;
+  int boundary0 = input_shape.y;
+  if (IH < boundary0) {
+    int coordinate_x = Y * input_shape.w + Z;
+    int coordinate_y = IN * input_shape.y + IH;
+    result = READ_IMAGE(input0, smp_none, (int2)(coordinate_x, coordinate_y));
   } else {
-    result = READ_IMAGE(input7, smp_none,
-                        (int2)((Y - input_shape0.z - input_shape1.z - input_shape2.z - input_shape3.z - input_shape4.z -
-                                input_shape5.z - input_shape6.z) *
-                                   input_shape7.w +
-                                 Z,
-                               (X)));
+    int coordinate_x = Y * input_shape.w + Z;
+    int coordinate_y = IN * input_shape.y + IH - boundary0;
+    result = READ_IMAGE(input1, smp_none, (int2)(coordinate_x, coordinate_y));
   }
-  WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
+  WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (IN * output_shape.y + IH)), result);
 }
 
-__kernel void stack8inputaxis3(__read_only image2d_t input0, __read_only image2d_t input1, __read_only image2d_t input2,
-                               __read_only image2d_t input3, __read_only image2d_t input4, __read_only image2d_t input5,
-                               __read_only image2d_t input6, __read_only image2d_t input7,
-                               __write_only image2d_t output, int4 input_shape0, int4 input_shape1, int4 input_shape2,
-                               int4 input_shape3, int4 input_shape4, int4 input_shape5, int4 input_shape6,
-                               int4 input_shape7, int4 output_shape) {
+// input -3D -axis = 2
+__kernel void stack_2input_2axis_3inshape(__read_only image2d_t input0, __read_only image2d_t input1,
+                                          __write_only image2d_t output, int4 input_shape, int4 output_shape) {
   CHECK_IDX_FOR_STACK;
-  if (Z < input_shape0.w) {
-    result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X)));
-  } else if (Z < (input_shape0.w + input_shape1.w)) {
-    result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z - input_shape0.w, (X)));
-  } else if (Z < (input_shape0.w + input_shape1.w + input_shape2.w)) {
-    result = READ_IMAGE(input2, smp_none, (int2)((Y)*input_shape2.w + Z - input_shape0.w - input_shape1.w, (X)));
-  } else if (Z < (input_shape0.w + input_shape1.w + input_shape2.w + input_shape3.w)) {
-    result = READ_IMAGE(input3, smp_none,
-                        (int2)((Y)*input_shape3.w + Z - input_shape0.w - input_shape1.w - input_shape2.w, (X)));
-  } else if (Z < (input_shape0.w + input_shape1.w + input_shape2.w + input_shape3.w + input_shape4.w)) {
-    result = READ_IMAGE(
-      input4, smp_none,
-      (int2)((Y)*input_shape4.w + Z - input_shape0.w - input_shape1.w - input_shape2.w - input_shape3.w, (X)));
-  } else if (Z <
-             (input_shape0.w + input_shape1.w + input_shape2.w + input_shape3.w + input_shape4.w + input_shape5.w)) {
-    result = READ_IMAGE(input5, smp_none,
-                        (int2)((Y)*input_shape5.w + Z - input_shape0.w - input_shape1.w - input_shape2.w -
-                                 input_shape3.w - input_shape4.w,
-                               (X)));
-  } else if (Z < (input_shape0.w + input_shape1.w + input_shape2.w + input_shape3.w + input_shape4.w + input_shape5.w +
-                  input_shape6.w)) {
-    result = READ_IMAGE(input6, smp_none,
-                        (int2)((Y)*input_shape6.w + Z - input_shape0.w - input_shape1.w - input_shape2.w -
-                                 input_shape3.w - input_shape4.w - input_shape5.w,
-                               (X)));
+  int boundary0 = input_shape.y;
+  int IN = X / output_shape.y;
+  int IW = X % output_shape.y;
+  int IC = Z;
+  int coordinate_x = IW * input_shape.w + IC;
+  int coordinate_y = IN * input_shape.y;
+  if (Y < boundary0) {
+    result = READ_IMAGE(input0, smp_none, (int2)(coordinate_x, coordinate_y));
   } else {
-    result = READ_IMAGE(input7, smp_none,
-                        (int2)((Y)*input_shape7.w + Z - input_shape0.w - input_shape1.w - input_shape2.w -
-                                 input_shape3.w - input_shape4.w - input_shape5.w - input_shape6.w,
-                               (X)));
+    result = READ_IMAGE(input1, smp_none, (int2)(coordinate_x, coordinate_y));
+  }
+  WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, IN * output_shape.y + IW), result);
+}
+
+// input -3D -axis = 3  and input -2D -axis = 2  boundary stack
+__kernel void stack_2input_boundary(__global float *input0, __global float *input1, __global float *output,
+                                    int4 input_shape, int4 output_shape, int2 stride_w) {
+  int X = get_global_id(0);  // N
+  int Y = get_global_id(1);  // H
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  int IW = output_shape.z;
+  int Align_out = output_shape.w * C4NUM;
+  int Align_in = input_shape.w * C4NUM;
+  int index_out = X * output_shape.y * stride_w.x + Y * stride_w.x;
+  int index_in = X * input_shape.y * stride_w.y + Y * Align_in;
+  for (int iw = 0; iw < IW; iw++) {
+    int index_out_tmp = index_out + iw * Align_out;
+    int index_in_tmp = index_in + iw;
+    output[index_out_tmp] = input0[index_in_tmp];
+    index_out_tmp++;
+    output[index_out_tmp] = input1[index_in_tmp];
   }
-  WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
 }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc
index b5639a34f0..55b75c6db2 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc
@@ -47,40 +47,6 @@ int StackOpenCLKernel::RunAxis0() {
   return RET_OK;
 }
 
-int StackOpenCLKernel::Init() {
-  if (in_tensors_[0]->shape().size() > 4 || in_tensors_[0]->shape().size() <= 0) {
-    MS_LOG(ERROR) << " only support dim <= 4 ";
-    return RET_ERROR;
-  }
-  auto param = reinterpret_cast<StackParameter *>(this->op_parameter_);
-  axis_ = param->axis_;
-  axis_ = axis_ < 0 ? axis_ + in_tensors_[0]->shape().size() + 1 : axis_;
-  if (in_tensors_[0]->shape().size() != 4) {
-    if (in_tensors_[0]->shape().size() == 2) {
-      axis_ = axis_ + 2;
-    }
-  }
-  if (param->axis_ < -3 || param->axis_ > 3) {
-    MS_LOG(ERROR) << " only support axis >= -3 and axis <= 3 ";
-    return RET_ERROR;
-  }
-
-  std::string kernel_name = "stack";
-  if (in_tensors_.size() == 8) {
-    kernel_name += "8inputaxis" + std::to_string(axis_);
-  } else {
-    MS_LOG(ERROR) << " input must be 8";
-    return RET_ERROR;
-  }
-  MS_LOG(DEBUG) << "kernel_name=: " << kernel_name;
-  std::string source = stack_source;
-  std::string program_name = "stack";
-  ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
-
-  return RET_OK;
-}
-
 int StackOpenCLKernel::ReSize() { return RET_OK; }
 
 void StackGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
@@ -97,114 +63,123 @@ void StackGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *l
   local->push_back(z);
 }
 
-int StackOpenCLKernel::InferInTensorShapeTo4D(int *arg_cn) {
-  if (in_tensors_.size() == 8) {
-    int size = in_tensors_[0]->shape().size();
-    switch (size) {
-      case 1:
-        for (int i = 0; i < in_tensors_.size(); ++i) {
-          ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, in_tensors_[i]->data_c());
-        }
-        ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, out_tensors_[0]->data_c());
-        for (int i = 0; i < in_tensors_.size(); ++i) {
-          cl_int4 temp = {in_tensors_[i]->shape()[0], 1, 1, 1};
-          ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, temp);
-        }
-        break;
-      case 2:
-        for (int i = 0; i < in_tensors_.size(); ++i) {
-          ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, in_tensors_[i]->data_c());
-        }
-        ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, out_tensors_[0]->data_c());
-        for (int i = 0; i < in_tensors_.size(); ++i) {
-          cl_int4 temp = {in_tensors_[i]->shape()[0], 1, 1, UP_DIV(in_tensors_[i]->shape()[1], C4NUM)};
-          ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, temp);
-        }
-        break;
-      case 3:
-        for (int i = 0; i < in_tensors_.size(); ++i) {
-          ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, in_tensors_[i]->data_c());
-        }
-        ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, out_tensors_[0]->data_c());
-        for (int i = 0; i < in_tensors_.size(); ++i) {
-          cl_int4 temp = {in_tensors_[i]->shape()[0], 1, in_tensors_[i]->shape()[1],
-                          UP_DIV(in_tensors_[i]->shape()[2], C4NUM)};
-          ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, temp);
-        }
-        break;
-      default:
-        MS_LOG(ERROR) << "unsupported input size > 3 or size <= 0 :" << in_tensors_.size();
-        return RET_ERROR;
-    }
-  } else {
-    MS_LOG(ERROR) << "unsupported input size :" << in_tensors_.size();
+int StackOpenCLKernel::CheckSpecs() {
+  if (in_tensors_[0]->shape().size() > 2 && (axis_ != 0)) {
+    MS_LOG(ERROR) << " only support input size = 2 ";
+    return RET_ERROR;
+  }
+  if (in_tensors_[0]->shape().size() > 4 || in_tensors_[0]->shape().size() <= 0) {
+    MS_LOG(ERROR) << " only support dim <= 4 ";
+    return RET_ERROR;
+  }
+  auto param = reinterpret_cast<StackParameter *>(this->op_parameter_);
+  axis_ = param->axis_;
+  axis_ = axis_ < 0 ? axis_ + in_tensors_[0]->shape().size() : axis_;
+  if (axis_ > 3) {
+    MS_LOG(ERROR) << " only support  axis <= 3 ";
+    return RET_ERROR;
+  }
+  if (axis_ > in_tensors_[0]->shape().size()) {
+    MS_LOG(ERROR) << " stack  axis must been <= in_tensors_[0]->shape().size() ";
     return RET_ERROR;
   }
   return RET_OK;
 }
 
-int StackOpenCLKernel::InferOutTensorShapeTo4D(cl_int4 *output_shape) {
-  std::vector<int> out_shape = out_tensors_[0]->shape();
-  if (out_shape.size() == 3) {
-    N_ = out_shape[0];
-    C_ = out_shape[1] * UP_DIV(out_shape[2], C4NUM);
-  } else if (out_shape.size() == 4) {
-    if (axis_ == 1) {
-      N_ = out_shape[0];
-      H_ = out_shape[1];
-      W_ = out_shape[2];
-      C_ = UP_DIV(out_shape[3], C4NUM);
-    } else {
-      MS_LOG(ERROR) << "Unsupported out_shape.size=: " << out_shape.size() << " axis=: " << axis_;
-      return RET_ERROR;
-    }
+void StackOpenCLKernel::SetConstArgs() {
+  int arg_cn = in_tensors_.size() + 1;
+  cl_int4 inshape_tmp = {}, outshape_tmp = {};
+  for (int i = 0; i < in_tensors_[0]->shape().size(); ++i) {
+    inshape_tmp.s[i] = in_tensors_[0]->shape()[i];
+  }
+  Broadcast2GpuShape(in_shape_.s, inshape_tmp.s, in_tensors_[0]->shape().size(), 1);
+  for (int i = 0; i < out_tensors_[0]->shape().size(); ++i) {
+    outshape_tmp.s[i] = out_tensors_[0]->shape()[i];
+  }
+  Broadcast2GpuShape(out_shape_.s, outshape_tmp.s, out_tensors_[0]->shape().size(), 1);
+  in_shape_.s[3] = UP_DIV(in_shape_.s[3], C4NUM);
+  out_shape_.s[3] = UP_DIV(out_shape_.s[3], C4NUM);
+  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_shape_);
+  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape_);
+  if (buffer_button_) {
+    GpuTensorInfo img_info_out(out_tensors_[0]);
+    GpuTensorInfo img_info_in(in_tensors_[0]);
+    size_t dtype = enable_fp16_ ? sizeof(cl_half) : sizeof(cl_float);
+    stride_w_out = img_info_out.RowPitch() / dtype;
+    stride_w_in = img_info_in.RowPitch() / dtype;
+    cl_int2 stride_w = {stride_w_out, stride_w_in};
+    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_w);
   }
-  OH_ = N_ * H_;
-  OW_ = W_;
-  OC_ = C_;
-  output_shape->s[0] = N_;
-  output_shape->s[1] = H_;
-  output_shape->s[2] = W_;
-  output_shape->s[3] = C_;
-  return RET_OK;
 }
 
-int StackOpenCLKernel::Run() {
-  MS_LOG(DEBUG) << this->name() << " Running! ";
-  if (axis_ == 0) {
-    return RunAxis0();
+void StackOpenCLKernel::SetGlobalLocal() {
+  if (((in_tensors_[0]->shape().size() == 2 || in_tensors_[0]->shape().size() == 3) && axis_ == 1) ||
+      (in_tensors_[0]->shape().size() == 3 && axis_ == 2)) {
+    OH_ = out_shape_.s[0] * out_shape_.s[1];
+    OW_ = out_shape_.s[2];
+    OC_ = out_shape_.s[3];
+  } else if (in_tensors_[0]->shape().size() == 1) {
+    OH_ = out_shape_.s[0] * out_shape_.s[1];
+    OW_ = out_shape_.s[2];
+  } else {
+    OH_ = out_shape_.s[0];
+    OW_ = out_shape_.s[1];
   }
-  cl_int4 output_shape = {1, 1, 1, 1};
   const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
   std::vector<size_t> local = {1, 1, 1};
-  int arg_cn = 0;
-  InferInTensorShapeTo4D(&arg_cn);
-  InferOutTensorShapeTo4D(&output_shape);
   std::vector<size_t> global = {OH_, OW_, OC_};
   StackGetWorkGroup(global, &local, max_global[0]);
-  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape);
-  ocl_runtime_->RunKernel(kernel_, global, local);
+  OpenCLKernel::AlignGlobalLocal(global, local);
+}
+
+int StackOpenCLKernel::Prepare() {
+  enable_fp16_ = ocl_runtime_->GetFp16Enable();
+
+  if (in_tensors_[0]->shape().size() == 1 && axis_ == 1) {
+    axis_ += 2;
+  } else if (in_tensors_[0]->shape().size() == axis_) {
+    buffer_button_ = true;  // boundary stack judge
+  }
+  std::string kernel_name = "stack_";
+  if (!buffer_button_) {
+    kernel_name += std::to_string(in_tensors_.size()) + "input_" + std::to_string(axis_) + "axis_" +
+                   std::to_string(in_tensors_[0]->shape().size()) + "inshape";
+  } else {
+    kernel_name += std::to_string(in_tensors_.size()) + "input_" + "boundary";
+  }
+
+  MS_LOG(DEBUG) << "kernel_name=: " << kernel_name;
+  std::string source = stack_source;
+  std::string program_name = "stack";
+  ocl_runtime_->LoadSource(program_name, source);
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
+  SetConstArgs();
+  SetGlobalLocal();
+
   return RET_OK;
 }
 
-kernel::LiteKernel *OpenCLStackKernelCreator(const std::vector<lite::Tensor *> &inputs,
-                                             const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
-                                             const lite::InnerContext *ctx, const kernel::KernelKey &desc,
-                                             const mindspore::lite::PrimitiveC *primitive) {
-  auto *kernel = new (std::nothrow) StackOpenCLKernel(opParameter, inputs, outputs);
-  if (kernel == nullptr) {
-    MS_LOG(ERROR) << " new StackOpenCLKernel failed ";
-    return nullptr;
+int StackOpenCLKernel::Run() {
+  MS_LOG(DEBUG) << this->name() << " Running! ";
+  if (axis_ == 0) {
+    return RunAxis0();
   }
-  auto ret = kernel->Init();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << " Init kernel failed, name: Stack ";
-    delete kernel;
-    return nullptr;
+  int arg_cn = 0;
+  if (buffer_button_) {
+    for (int i = 0; i < in_tensors_.size(); ++i) {
+      ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[i]->data_c(), lite::opencl::MemType::BUF);
+    }
+    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
+  } else {
+    for (int i = 0; i < in_tensors_.size(); ++i) {
+      ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[i]->data_c());
+    }
+    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
   }
-  return kernel;
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
+  return RET_OK;
 }
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Stack, OpenCLKernelCreator<StackOpenCLKernel>);
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Stack, OpenCLKernelCreator<StackOpenCLKernel>);
 
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Stack, OpenCLStackKernelCreator);
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Stack, OpenCLStackKernelCreator);
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.h
index 9fc809b988..85bb66881b 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.h
@@ -30,8 +30,10 @@ class StackOpenCLKernel : public OpenCLKernel {
       : OpenCLKernel(parameter, inputs, outputs) {}
 
   ~StackOpenCLKernel() override{};
-
-  int Init() override;
+  int Prepare() override;
+  int CheckSpecs() override;
+  void SetConstArgs() override;
+  void SetGlobalLocal() override;
 
   int ReSize() override;
 
@@ -40,18 +42,17 @@ class StackOpenCLKernel : public OpenCLKernel {
  private:
   int RunAxis0();
 
-  int InferInTensorShapeTo4D(int *arg_cn);
-
-  int InferOutTensorShapeTo4D(cl_int4 *output_shape);
-
+  cl::Kernel kernel_;
   int axis_{0};
-  size_t N_{1};
-  size_t H_{1};
-  size_t W_{1};
-  size_t C_{1};
   size_t OH_{1};
   size_t OW_{1};
   size_t OC_{1};
+  bool buffer_button_{false};
+  bool enable_fp16_{false};
+  cl_int stride_w_in{1};
+  cl_int stride_w_out{1};
+  cl_int4 in_shape_ = {};
+  cl_int4 out_shape_ = {};
 };
 
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/stack_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/stack_tests.cc
index 298cab43b2..49a0328772 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/stack_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/stack_tests.cc
@@ -29,34 +29,155 @@ OpParameter *CreateParameter(int axis) {
 }
 }  // namespace
 
-TEST_F(TestOpenCL_Stack, input8_ndim3_axis0) {
-  constexpr int INPUT_NUM = 8;
-  int axis = 0;
-  std::vector<int> input_shapes[INPUT_NUM] = {{1, 1, 8}, {1, 1, 8}, {1, 1, 8}, {1, 1, 8},
-                                              {1, 1, 8}, {1, 1, 8}, {1, 1, 8}, {1, 1, 8}};
-  std::vector<int> output_shape = {8, 1, 1, 8};
-  float input_datas[INPUT_NUM][8] = {
-    {0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.03, 0.37},  {0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.47},
-    {0.31, 0.63, 0.84, 0.43, 0.56, 0.79, 0.12, 0.57}, {0.35, 0.26, 0.17, 0.33, 0.66, 0.89, 0.93, 0.77},
-    {0.57, 0.6, 0.84, 0.83, 0.48, 0.78, 0.63, 0.87},  {0.66, 0.56, 0.64, 0.63, 0.56, 0.59, 0.73, 0.37},
-    {0.35, 0.26, 0.54, 0.33, 0.76, 0.59, 0.73, 0.34}, {0.15, 0.36, 0.44, 0.73, 0.56, 0.49, 0.93, 0.37}};
-  float output_data[] = {0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.03, 0.37, 0.5,  0.6,  0.74, 0.23, 0.46,
-                         0.69, 0.13, 0.47, 0.31, 0.63, 0.84, 0.43, 0.56, 0.79, 0.12, 0.57, 0.35, 0.26,
-                         0.17, 0.33, 0.66, 0.89, 0.93, 0.77, 0.57, 0.6,  0.84, 0.83, 0.48, 0.78, 0.63,
-                         0.87, 0.66, 0.56, 0.64, 0.63, 0.56, 0.59, 0.73, 0.37, 0.35, 0.26, 0.54, 0.33,
-                         0.76, 0.59, 0.73, 0.34, 0.15, 0.36, 0.44, 0.73, 0.56, 0.49, 0.93, 0.37};
+// stack test cases
+TEST_F(TestOpenCL_Stack, input2_ndim1_axis1) {
+  constexpr int INPUT_NUM = 2;
+  int axis = 1;
+  std::vector<int> input_shapes[INPUT_NUM] = {{8}, {8}};
+  std::vector<int> output_shape = {8, 2};
+  float input_datas[INPUT_NUM][8] = {{0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.03, 0.37},
+                                     {0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.47}};
+  float output_data[] = {0.75, 0.5, 0.06, 0.6, 0.74, 0.74, 0.30, 0.23, 0.9, 0.46, 0.59, 0.69, 0.03, 0.13, 0.37, 0.47};
+
+  for (auto fp16_enable : {true}) {
+    auto *param = CreateParameter(axis);
+    TestMain({{input_shapes[0], input_datas[0], VAR}, {input_shapes[1], input_datas[1], VAR}},
+             {output_shape, output_data}, param, fp16_enable, fp16_enable ? 1e-3 : 1e-9);
+  }
+}
+
+TEST_F(TestOpenCL_Stack, input2_ndim2_axis1) {
+  constexpr int INPUT_NUM = 2;
+  int axis = 1;
+  std::vector<int> input_shapes[INPUT_NUM] = {{3, 4}, {3, 4}};
+  std::vector<int> output_shape = {3, 2, 4};
+  float input_datas[INPUT_NUM][12] = {
+    {1.317, -2.094, -1.892, -0.4612, -0.884, -0.524, 0.4504, 0.0284, 3.227, -0.4673, -1.115, -0.1572},
+    {-0.0677, -1.289, 0.0685, 0.889, 0.8145, 1.6455, 0.6587, -0.236, 0.3625, 0.7393, -1.393, 0.2534}};
+  float output_data[] = {1.317,  -2.094,  -1.892, -0.4612, -0.0677, -1.289, 0.0685, 0.889,
+                         -0.884, -0.524,  0.4504, 0.0284,  0.8145,  1.6455, 0.6587, -0.236,
+                         3.227,  -0.4673, -1.115, -0.1572, 0.3625,  0.7393, -1.393, 0.2534};
+
+  for (auto fp16_enable : {true}) {
+    auto *param = CreateParameter(axis);
+    TestMain({{input_shapes[0], input_datas[0], VAR}, {input_shapes[1], input_datas[1], VAR}},
+             {output_shape, output_data}, param, fp16_enable, fp16_enable ? 1e-3 : 1e-9);
+  }
+}
+
+TEST_F(TestOpenCL_Stack, input2_ndim3_axis1) {
+  constexpr int INPUT_NUM = 2;
+  int axis = 1;
+  std::vector<int> input_shapes[INPUT_NUM] = {{3, 4, 5}, {3, 4, 5}};
+  std::vector<int> output_shape = {3, 2, 4, 5};
+  size_t input1_size, input2_size, output_size;
+  std::string input1Ppath = "./test_data/stackfp32_input1.bin";
+  std::string input2Ppath = "./test_data/stackfp32_input2.bin";
+  std::string correctOutputPath = "./test_data/stackfp32_output.bin";
+  auto input_data1 = reinterpret_cast<float *>(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size));
+  auto input_data2 = reinterpret_cast<float *>(mindspore::lite::ReadFile(input2Ppath.c_str(), &input2_size));
+  auto output_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size));
+  for (auto fp16_enable : {false}) {
+    auto *param = CreateParameter(axis);
+    TestMain({{input_shapes[0], input_data1, VAR}, {input_shapes[1], input_data2, VAR}}, {output_shape, output_data},
+             param, fp16_enable, fp16_enable ? 1e-3 : 1e-9);
+  }
+}
 
+TEST_F(TestOpenCL_Stack, input2_ndim3_axis2) {
+  constexpr int INPUT_NUM = 2;
+  int axis = 2;
+  std::vector<int> input_shapes[INPUT_NUM] = {{3, 4, 5}, {3, 4, 5}};
+  std::vector<int> output_shape = {3, 4, 2, 5};
+  size_t input1_size, input2_size, output_size;
+  std::string input1Ppath = "./test_data/stackfp32_input1.bin";
+  std::string input2Ppath = "./test_data/stackfp32_input2.bin";
+  std::string correctOutputPath = "./test_data/stackfp32_output.bin";
+  auto input_data1 = reinterpret_cast<float *>(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size));
+  auto input_data2 = reinterpret_cast<float *>(mindspore::lite::ReadFile(input2Ppath.c_str(), &input2_size));
+  auto output_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size));
   for (auto fp16_enable : {false}) {
     auto *param = CreateParameter(axis);
-    TestMain({{input_shapes[0], input_datas[0], VAR},
-              {input_shapes[1], input_datas[1], VAR},
-              {input_shapes[2], input_datas[2], VAR},
-              {input_shapes[3], input_datas[3], VAR},
-              {input_shapes[4], input_datas[4], VAR},
-              {input_shapes[5], input_datas[5], VAR},
-              {input_shapes[6], input_datas[6], VAR},
-              {input_shapes[7], input_datas[7], VAR}},
-             {output_shape, output_data}, param, fp16_enable);
+    TestMain({{input_shapes[0], input_data1, VAR}, {input_shapes[1], input_data2, VAR}}, {output_shape, output_data},
+             param, fp16_enable, fp16_enable ? 1e-3 : 1e-9);
+  }
+}
+
+TEST_F(TestOpenCL_Stack, input2_ndim2_axis2) {
+  constexpr int INPUT_NUM = 2;
+  int axis = 2;
+  std::vector<int> input_shapes[INPUT_NUM] = {{1, 96}, {1, 96}};
+  std::vector<int> output_shape = {1, 96, 2};
+  size_t input1_size, input2_size, output_size;
+  std::string input1Ppath = "./test_data/stackfp32_input1.bin";
+  std::string input2Ppath = "./test_data/stackfp32_input2.bin";
+  std::string correctOutputPath = "./test_data/stackfp32_output.bin";
+  auto input_data1 = reinterpret_cast<float *>(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size));
+  auto input_data2 = reinterpret_cast<float *>(mindspore::lite::ReadFile(input2Ppath.c_str(), &input2_size));
+  auto output_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size));
+  for (auto fp16_enable : {false}) {
+    auto *param = CreateParameter(axis);
+    TestMain({{input_shapes[0], input_data1, VAR}, {input_shapes[1], input_data2, VAR}}, {output_shape, output_data},
+             param, fp16_enable, fp16_enable ? 1e-3 : 1e-9);
+  }
+}
+
+TEST_F(TestOpenCL_Stack, input2_ndim3_axis3) {
+  constexpr int INPUT_NUM = 2;
+  int axis = 3;
+  std::vector<int> input_shapes[INPUT_NUM] = {{3, 4, 6}, {3, 4, 6}};
+  std::vector<int> output_shape = {3, 4, 6, 2};
+  size_t input1_size, input2_size, output_size;
+  std::string input1Ppath = "./test_data/stackfp32_input1.bin";
+  std::string input2Ppath = "./test_data/stackfp32_input2.bin";
+  std::string correctOutputPath = "./test_data/stackfp32_output.bin";
+  auto input_data1 = reinterpret_cast<float *>(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size));
+  auto input_data2 = reinterpret_cast<float *>(mindspore::lite::ReadFile(input2Ppath.c_str(), &input2_size));
+  auto output_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size));
+  for (auto fp16_enable : {false}) {
+    auto *param = CreateParameter(axis);
+    TestMain({{input_shapes[0], input_data1, VAR}, {input_shapes[1], input_data2, VAR}}, {output_shape, output_data},
+             param, fp16_enable, fp16_enable ? 1e-3 : 1e-9);
+  }
+}
+
+TEST_F(TestOpenCL_Stack, input6_ndim3_axis0) {
+  constexpr int INPUT_NUM = 8;
+  int axis = 0;
+  std::vector<int> input_shapes[INPUT_NUM] = {{1, 17, 18}, {1, 17, 18}, {1, 17, 18}, {1, 17, 18},
+                                              {1, 17, 18}, {1, 17, 18}, {1, 17, 18}, {1, 17, 18}};
+  std::vector<int> output_shape = {8, 1, 17, 18};
+  size_t input1_size, input2_size, input3_size, input4_size, input5_size, input6_size, input7_size, input8_size,
+    output_size;
+  std::string input1Ppath = "./test_data/stackfp32_input1.bin";
+  std::string input2Ppath = "./test_data/stackfp32_input2.bin";
+  std::string input3Ppath = "./test_data/stackfp32_input3.bin";
+  std::string input4Ppath = "./test_data/stackfp32_input4.bin";
+  std::string input5Ppath = "./test_data/stackfp32_input5.bin";
+  std::string input6Ppath = "./test_data/stackfp32_input6.bin";
+  std::string input7Ppath = "./test_data/stackfp32_input7.bin";
+  std::string input8Ppath = "./test_data/stackfp32_input8.bin";
+  std::string correctOutputPath = "./test_data/stackfp32_output.bin";
+  auto input_data1 = reinterpret_cast<float *>(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size));
+  auto input_data2 = reinterpret_cast<float *>(mindspore::lite::ReadFile(input2Ppath.c_str(), &input2_size));
+  auto input_data3 = reinterpret_cast<float *>(mindspore::lite::ReadFile(input3Ppath.c_str(), &input3_size));
+  auto input_data4 = reinterpret_cast<float *>(mindspore::lite::ReadFile(input4Ppath.c_str(), &input4_size));
+  auto input_data5 = reinterpret_cast<float *>(mindspore::lite::ReadFile(input5Ppath.c_str(), &input5_size));
+  auto input_data6 = reinterpret_cast<float *>(mindspore::lite::ReadFile(input6Ppath.c_str(), &input6_size));
+  auto input_data7 = reinterpret_cast<float *>(mindspore::lite::ReadFile(input7Ppath.c_str(), &input7_size));
+  auto input_data8 = reinterpret_cast<float *>(mindspore::lite::ReadFile(input8Ppath.c_str(), &input8_size));
+  auto output_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size));
+  for (auto fp16_enable : {true}) {
+    auto *param = CreateParameter(axis);
+    TestMain({{input_shapes[0], input_data1, VAR},
+              {input_shapes[1], input_data2, VAR},
+              {input_shapes[2], input_data3, VAR},
+              {input_shapes[3], input_data4, VAR},
+              {input_shapes[4], input_data5, VAR},
+              {input_shapes[5], input_data6, VAR},
+              {input_shapes[6], input_data7, VAR},
+              {input_shapes[7], input_data8, VAR}},
+             {output_shape, output_data}, param, fp16_enable, fp16_enable ? 1e-3 : 1e-9);
   }
 }
 
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim2_axis2/stackfp32_input1.bin b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim2_axis2/stackfp32_input1.bin
new file mode 100644
index 0000000000..c9b0b8922a
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim2_axis2/stackfp32_input1.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim2_axis2/stackfp32_input2.bin b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim2_axis2/stackfp32_input2.bin
new file mode 100644
index 0000000000..b0e9490bcc
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim2_axis2/stackfp32_input2.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim2_axis2/stackfp32_output.bin b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim2_axis2/stackfp32_output.bin
new file mode 100644
index 0000000000..43d0e107ea
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim2_axis2/stackfp32_output.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis1/stackfp32_input1.bin b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis1/stackfp32_input1.bin
new file mode 100644
index 0000000000..7c9d8a905d
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis1/stackfp32_input1.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis1/stackfp32_input2.bin b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis1/stackfp32_input2.bin
new file mode 100644
index 0000000000..08359107b7
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis1/stackfp32_input2.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis1/stackfp32_output.bin b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis1/stackfp32_output.bin
new file mode 100644
index 0000000000..4c6479cc84
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis1/stackfp32_output.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis2/stackfp32_input1.bin b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis2/stackfp32_input1.bin
new file mode 100644
index 0000000000..3f2624a069
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis2/stackfp32_input1.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis2/stackfp32_input2.bin b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis2/stackfp32_input2.bin
new file mode 100644
index 0000000000..7c3bd3916b
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis2/stackfp32_input2.bin
@@ -0,0 +1,2 @@
+����=��t$�,�0��>���5�>ƕ8=;��?�p����D>	B�?Aȼ;^f?��S�ILz�P������l��=��ƾ�b���i	�i�?v��>񟌿?��>�ϛ����?�5Y?@�>�]>�5��,s�"��=ŭ��X_�?G
+��Q�ֿ�B��C�=Pti�9R>�Z�?����>��xA�>g&�?��{�0�)?����E�M^���><�j=[Fۿ�߼?u��a��JȘ�"^����?
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis2/stackfp32_output.bin b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis2/stackfp32_output.bin
new file mode 100644
index 0000000000..10b863c048
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis2/stackfp32_output.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis3/stackfp32_input1.bin b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis3/stackfp32_input1.bin
new file mode 100644
index 0000000000..40570cde91
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis3/stackfp32_input1.bin
@@ -0,0 +1,2 @@
+�K����%><�?��O�aV�>X2G��,��s�=|\�|l�>(���c(?�����<�%��?f9�������XG���?��>�RL��]ҽ�?b'@�7�,�>Al&@Rg�?���~^���R�?9+6���B?��)>��?�腿��=(p�����h\�?�b�?�LF>��?�ܽe���'@{(�>އn?�F����~=y{�c�Ͼh��=��뽼V���K?��俈�=����\3"��?��E����S���)M�B˴?m�?	���
+s�?�!�
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis3/stackfp32_input2.bin b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis3/stackfp32_input2.bin
new file mode 100644
index 0000000000..de836a1da5
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis3/stackfp32_input2.bin
@@ -0,0 +1,3 @@
+_T?�˿`?;�q��R���?�^�?���������=�Ü���>{;B��<쾣��>�/?q���[?��	��1�	A�_�?a�J?q���.��?�3r��.'�4;��pk�����>L2?���>X6����=q8��ZW�?����9�>�#?��;=7�v�՝�/[���Y�>lɽ����K?��S���^?�灿�$��n�	?ҰK���=A�s>�
+	?���y�)��~>��g��i�?f�I����	�� 4#?_���ѵ��֮>��?
+D��@�=!g>?
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis3/stackfp32_output.bin b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis3/stackfp32_output.bin
new file mode 100644
index 0000000000..0dfc0087b4
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis3/stackfp32_output.bin
@@ -0,0 +1,4 @@
+�K��_T?��%>�˿<�?`?��O�;�q�aV�>�R�X2G���?�,��^�?�s�=����|\����|l�>��=(���Ü��c(?��>���{;B���<�<쾬%����>�?�/?f9��q�������[?�XG���	���?�1���>	A��RL�_�?�]ҽa�J?�?q���b'@.��?�7��3r�,�>�.'�Al&@4;��Rg�?pk��������>~^��L2?�R�?���>9+6�X6����B?��=��)>q8����?ZW�?�腿������=9�>(p��#?������;=h\�?7�v��b�?՝��LF>/[����?�Y�>�ܽlɽe�������'@K?��{(�>S��އn?�^?�F���灿��~=�$��y{�n�	?c�ϾҰK�h��=��=���A�s>�V��
+	?��K?������y�)���=�~>������g�\3"��i�?�?��f�I�E�������	��S�� 4#?�)M�_���B˴?ѵ�m�?�֮>	����?�
+D�
+s�?�@�=�!�!g>?
\ No newline at end of file