diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/stack.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/stack.cl index 3620334414..4bc72b9f6e 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/stack.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/stack.cl @@ -1,5 +1,6 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable #define INT4 int4 +#define C4NUM 4 __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; #define CHECK_IDX_FOR_STACK \ int X = get_global_id(0); \ @@ -10,128 +11,95 @@ __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | } \ FLT4 result; -__kernel void stack8inputaxis1(__read_only image2d_t input0, __read_only image2d_t input1, __read_only image2d_t input2, - __read_only image2d_t input3, __read_only image2d_t input4, __read_only image2d_t input5, - __read_only image2d_t input6, __read_only image2d_t input7, - __write_only image2d_t output, int4 input_shape0, int4 input_shape1, int4 input_shape2, - int4 input_shape3, int4 input_shape4, int4 input_shape5, int4 input_shape6, - int4 input_shape7, int4 output_shape) { +// input -1D +__kernel void stack_2input_3axis_1inshape(__read_only image2d_t input0, __read_only image2d_t input1, + __write_only image2d_t output, int4 input_shape, int4 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z) { + return; + } + int coordinate_x_out = output_shape.w; + FLT4 result1 = READ_IMAGE(input0, smp_none, (int2)(0, (X))); + FLT4 result2 = READ_IMAGE(input1, smp_none, (int2)(0, (X))); + FLT4 result = {result1.x, result2.x, 0, 0}; + WRITE_IMAGE(output, (int2)(coordinate_x_out, (X)), result); +} + +// input -2D -axis = 1 +__kernel void stack_2input_1axis_2inshape(__read_only image2d_t input0, __read_only image2d_t input1, + __write_only image2d_t output, int4 input_shape, int4 output_shape) { CHECK_IDX_FOR_STACK; - if (X < input_shape0.y) { - result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X))); - } else if (X < (input_shape0.y + input_shape1.y)) { - result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z, (X - input_shape0.y))); - } else if (X < (input_shape0.y + input_shape1.y + input_shape2.y)) { - result = READ_IMAGE(input2, smp_none, (int2)((Y)*input_shape2.w + Z, (X - input_shape0.y - input_shape1.y))); - } else if (X < (input_shape0.y + input_shape1.y + input_shape2.y + input_shape3.y)) { - result = READ_IMAGE(input3, smp_none, - (int2)((Y)*input_shape3.w + Z, (X - input_shape0.y - input_shape1.y - input_shape2.y))); - } else if (X < (input_shape0.y + input_shape1.y + input_shape2.y + input_shape3.y + input_shape4.y)) { - result = READ_IMAGE( - input4, smp_none, - (int2)((Y)*input_shape4.w + Z, (X - input_shape0.y - input_shape1.y - input_shape2.y - input_shape3.y))); - } else if (X < - (input_shape0.y + input_shape1.y + input_shape2.y + input_shape3.y + input_shape4.y + input_shape5.y)) { - result = READ_IMAGE(input5, smp_none, - (int2)((Y)*input_shape5.w + Z, (X - input_shape0.y - input_shape1.y - input_shape2.y - - input_shape3.y - input_shape4.y))); - } else if (X < (input_shape0.y + input_shape1.y + input_shape2.y + input_shape3.y + input_shape4.y + input_shape5.y + - input_shape6.y)) { - result = READ_IMAGE(input6, smp_none, - (int2)((Y)*input_shape6.w + Z, (X - input_shape0.y - input_shape1.y - input_shape2.y - - input_shape3.y - input_shape4.y - input_shape5.y))); + int IN = X / output_shape.y; + int IH = X % output_shape.y; + int boundary0 = input_shape.z; + if (Y < boundary0) { + int coordinate_x = Y * input_shape.w + Z; + int coordinate_y = IN * input_shape.y + IH; + result = READ_IMAGE(input0, smp_none, (int2)(coordinate_x, coordinate_y)); } else { - result = - READ_IMAGE(input7, smp_none, - (int2)((Y)*input_shape7.w + Z, (X - input_shape0.y - input_shape1.y - input_shape2.y - input_shape3.y - - input_shape4.y - input_shape5.y - input_shape6.y))); + int coordinate_x = (Y - boundary0) * input_shape.w + Z; + int coordinate_y = IN * input_shape.y + IH; + result = READ_IMAGE(input1, smp_none, (int2)(coordinate_x, coordinate_y)); } - WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (IN * output_shape.y + IH)), result); } -__kernel void stack8inputaxis2(__read_only image2d_t input0, __read_only image2d_t input1, __read_only image2d_t input2, - __read_only image2d_t input3, __read_only image2d_t input4, __read_only image2d_t input5, - __read_only image2d_t input6, __read_only image2d_t input7, - __write_only image2d_t output, int4 input_shape0, int4 input_shape1, int4 input_shape2, - int4 input_shape3, int4 input_shape4, int4 input_shape5, int4 input_shape6, - int4 input_shape7, int4 output_shape) { +// input -3D -axis = 1 +__kernel void stack_2input_1axis_3inshape(__read_only image2d_t input0, __read_only image2d_t input1, + __write_only image2d_t output, int4 input_shape, int4 output_shape) { CHECK_IDX_FOR_STACK; - if (Y < input_shape0.z) { - result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X))); - } else if (Y < (input_shape0.z + input_shape1.z)) { - result = READ_IMAGE(input1, smp_none, (int2)((Y - input_shape0.z) * input_shape1.w + Z, (X))); - } else if (Y < (input_shape0.z + input_shape1.z + input_shape2.z)) { - result = READ_IMAGE(input2, smp_none, (int2)((Y - input_shape0.z - input_shape1.z) * input_shape2.w + Z, (X))); - } else if (Y < (input_shape0.z + input_shape1.z + input_shape2.z + input_shape3.z)) { - result = READ_IMAGE(input3, smp_none, - (int2)((Y - input_shape0.z - input_shape1.z - input_shape2.z) * input_shape3.w + Z, (X))); - } else if (Y < (input_shape0.z + input_shape1.z + input_shape2.z + input_shape3.z + input_shape4.z)) { - result = READ_IMAGE( - input4, smp_none, - (int2)((Y - input_shape0.z - input_shape1.z - input_shape2.z - input_shape3.z) * input_shape4.w + Z, (X))); - } else if (Y < - (input_shape0.z + input_shape1.z + input_shape2.z + input_shape3.z + input_shape4.z + input_shape5.z)) { - result = READ_IMAGE( - input5, smp_none, - (int2)( - (Y - input_shape0.z - input_shape1.z - input_shape2.z - input_shape3.z - input_shape4.z) * input_shape5.w + Z, - (X))); - } else if (Y < (input_shape0.z + input_shape1.z + input_shape2.z + input_shape3.z + input_shape4.z + input_shape5.z + - input_shape6.z)) { - result = READ_IMAGE( - input6, smp_none, - (int2)((Y - input_shape0.z - input_shape1.z - input_shape2.z - input_shape3.z - input_shape4.z - input_shape5.z) * - input_shape6.w + - Z, - (X))); + int IN = X / output_shape.y; + int IH = X % output_shape.y; + int boundary0 = input_shape.y; + if (IH < boundary0) { + int coordinate_x = Y * input_shape.w + Z; + int coordinate_y = IN * input_shape.y + IH; + result = READ_IMAGE(input0, smp_none, (int2)(coordinate_x, coordinate_y)); } else { - result = READ_IMAGE(input7, smp_none, - (int2)((Y - input_shape0.z - input_shape1.z - input_shape2.z - input_shape3.z - input_shape4.z - - input_shape5.z - input_shape6.z) * - input_shape7.w + - Z, - (X))); + int coordinate_x = Y * input_shape.w + Z; + int coordinate_y = IN * input_shape.y + IH - boundary0; + result = READ_IMAGE(input1, smp_none, (int2)(coordinate_x, coordinate_y)); } - WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (IN * output_shape.y + IH)), result); } -__kernel void stack8inputaxis3(__read_only image2d_t input0, __read_only image2d_t input1, __read_only image2d_t input2, - __read_only image2d_t input3, __read_only image2d_t input4, __read_only image2d_t input5, - __read_only image2d_t input6, __read_only image2d_t input7, - __write_only image2d_t output, int4 input_shape0, int4 input_shape1, int4 input_shape2, - int4 input_shape3, int4 input_shape4, int4 input_shape5, int4 input_shape6, - int4 input_shape7, int4 output_shape) { +// input -3D -axis = 2 +__kernel void stack_2input_2axis_3inshape(__read_only image2d_t input0, __read_only image2d_t input1, + __write_only image2d_t output, int4 input_shape, int4 output_shape) { CHECK_IDX_FOR_STACK; - if (Z < input_shape0.w) { - result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X))); - } else if (Z < (input_shape0.w + input_shape1.w)) { - result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z - input_shape0.w, (X))); - } else if (Z < (input_shape0.w + input_shape1.w + input_shape2.w)) { - result = READ_IMAGE(input2, smp_none, (int2)((Y)*input_shape2.w + Z - input_shape0.w - input_shape1.w, (X))); - } else if (Z < (input_shape0.w + input_shape1.w + input_shape2.w + input_shape3.w)) { - result = READ_IMAGE(input3, smp_none, - (int2)((Y)*input_shape3.w + Z - input_shape0.w - input_shape1.w - input_shape2.w, (X))); - } else if (Z < (input_shape0.w + input_shape1.w + input_shape2.w + input_shape3.w + input_shape4.w)) { - result = READ_IMAGE( - input4, smp_none, - (int2)((Y)*input_shape4.w + Z - input_shape0.w - input_shape1.w - input_shape2.w - input_shape3.w, (X))); - } else if (Z < - (input_shape0.w + input_shape1.w + input_shape2.w + input_shape3.w + input_shape4.w + input_shape5.w)) { - result = READ_IMAGE(input5, smp_none, - (int2)((Y)*input_shape5.w + Z - input_shape0.w - input_shape1.w - input_shape2.w - - input_shape3.w - input_shape4.w, - (X))); - } else if (Z < (input_shape0.w + input_shape1.w + input_shape2.w + input_shape3.w + input_shape4.w + input_shape5.w + - input_shape6.w)) { - result = READ_IMAGE(input6, smp_none, - (int2)((Y)*input_shape6.w + Z - input_shape0.w - input_shape1.w - input_shape2.w - - input_shape3.w - input_shape4.w - input_shape5.w, - (X))); + int boundary0 = input_shape.y; + int IN = X / output_shape.y; + int IW = X % output_shape.y; + int IC = Z; + int coordinate_x = IW * input_shape.w + IC; + int coordinate_y = IN * input_shape.y; + if (Y < boundary0) { + result = READ_IMAGE(input0, smp_none, (int2)(coordinate_x, coordinate_y)); } else { - result = READ_IMAGE(input7, smp_none, - (int2)((Y)*input_shape7.w + Z - input_shape0.w - input_shape1.w - input_shape2.w - - input_shape3.w - input_shape4.w - input_shape5.w - input_shape6.w, - (X))); + result = READ_IMAGE(input1, smp_none, (int2)(coordinate_x, coordinate_y)); + } + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, IN * output_shape.y + IW), result); +} + +// input -3D -axis = 3 and input -2D -axis = 2 boundary stack +__kernel void stack_2input_boundary(__global float *input0, __global float *input1, __global float *output, + int4 input_shape, int4 output_shape, int2 stride_w) { + int X = get_global_id(0); // N + int Y = get_global_id(1); // H + if (X >= output_shape.x || Y >= output_shape.y) { + return; + } + int IW = output_shape.z; + int Align_out = output_shape.w * C4NUM; + int Align_in = input_shape.w * C4NUM; + int index_out = X * output_shape.y * stride_w.x + Y * stride_w.x; + int index_in = X * input_shape.y * stride_w.y + Y * Align_in; + for (int iw = 0; iw < IW; iw++) { + int index_out_tmp = index_out + iw * Align_out; + int index_in_tmp = index_in + iw; + output[index_out_tmp] = input0[index_in_tmp]; + index_out_tmp++; + output[index_out_tmp] = input1[index_in_tmp]; } - WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc index b5639a34f0..55b75c6db2 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc @@ -47,40 +47,6 @@ int StackOpenCLKernel::RunAxis0() { return RET_OK; } -int StackOpenCLKernel::Init() { - if (in_tensors_[0]->shape().size() > 4 || in_tensors_[0]->shape().size() <= 0) { - MS_LOG(ERROR) << " only support dim <= 4 "; - return RET_ERROR; - } - auto param = reinterpret_cast(this->op_parameter_); - axis_ = param->axis_; - axis_ = axis_ < 0 ? axis_ + in_tensors_[0]->shape().size() + 1 : axis_; - if (in_tensors_[0]->shape().size() != 4) { - if (in_tensors_[0]->shape().size() == 2) { - axis_ = axis_ + 2; - } - } - if (param->axis_ < -3 || param->axis_ > 3) { - MS_LOG(ERROR) << " only support axis >= -3 and axis <= 3 "; - return RET_ERROR; - } - - std::string kernel_name = "stack"; - if (in_tensors_.size() == 8) { - kernel_name += "8inputaxis" + std::to_string(axis_); - } else { - MS_LOG(ERROR) << " input must be 8"; - return RET_ERROR; - } - MS_LOG(DEBUG) << "kernel_name=: " << kernel_name; - std::string source = stack_source; - std::string program_name = "stack"; - ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); - - return RET_OK; -} - int StackOpenCLKernel::ReSize() { return RET_OK; } void StackGetWorkGroup(const std::vector &global, std::vector *local, int max_size) { @@ -97,114 +63,123 @@ void StackGetWorkGroup(const std::vector &global, std::vector *l local->push_back(z); } -int StackOpenCLKernel::InferInTensorShapeTo4D(int *arg_cn) { - if (in_tensors_.size() == 8) { - int size = in_tensors_[0]->shape().size(); - switch (size) { - case 1: - for (int i = 0; i < in_tensors_.size(); ++i) { - ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, in_tensors_[i]->data_c()); - } - ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, out_tensors_[0]->data_c()); - for (int i = 0; i < in_tensors_.size(); ++i) { - cl_int4 temp = {in_tensors_[i]->shape()[0], 1, 1, 1}; - ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, temp); - } - break; - case 2: - for (int i = 0; i < in_tensors_.size(); ++i) { - ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, in_tensors_[i]->data_c()); - } - ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, out_tensors_[0]->data_c()); - for (int i = 0; i < in_tensors_.size(); ++i) { - cl_int4 temp = {in_tensors_[i]->shape()[0], 1, 1, UP_DIV(in_tensors_[i]->shape()[1], C4NUM)}; - ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, temp); - } - break; - case 3: - for (int i = 0; i < in_tensors_.size(); ++i) { - ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, in_tensors_[i]->data_c()); - } - ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, out_tensors_[0]->data_c()); - for (int i = 0; i < in_tensors_.size(); ++i) { - cl_int4 temp = {in_tensors_[i]->shape()[0], 1, in_tensors_[i]->shape()[1], - UP_DIV(in_tensors_[i]->shape()[2], C4NUM)}; - ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, temp); - } - break; - default: - MS_LOG(ERROR) << "unsupported input size > 3 or size <= 0 :" << in_tensors_.size(); - return RET_ERROR; - } - } else { - MS_LOG(ERROR) << "unsupported input size :" << in_tensors_.size(); +int StackOpenCLKernel::CheckSpecs() { + if (in_tensors_[0]->shape().size() > 2 && (axis_ != 0)) { + MS_LOG(ERROR) << " only support input size = 2 "; + return RET_ERROR; + } + if (in_tensors_[0]->shape().size() > 4 || in_tensors_[0]->shape().size() <= 0) { + MS_LOG(ERROR) << " only support dim <= 4 "; + return RET_ERROR; + } + auto param = reinterpret_cast(this->op_parameter_); + axis_ = param->axis_; + axis_ = axis_ < 0 ? axis_ + in_tensors_[0]->shape().size() : axis_; + if (axis_ > 3) { + MS_LOG(ERROR) << " only support axis <= 3 "; + return RET_ERROR; + } + if (axis_ > in_tensors_[0]->shape().size()) { + MS_LOG(ERROR) << " stack axis must been <= in_tensors_[0]->shape().size() "; return RET_ERROR; } return RET_OK; } -int StackOpenCLKernel::InferOutTensorShapeTo4D(cl_int4 *output_shape) { - std::vector out_shape = out_tensors_[0]->shape(); - if (out_shape.size() == 3) { - N_ = out_shape[0]; - C_ = out_shape[1] * UP_DIV(out_shape[2], C4NUM); - } else if (out_shape.size() == 4) { - if (axis_ == 1) { - N_ = out_shape[0]; - H_ = out_shape[1]; - W_ = out_shape[2]; - C_ = UP_DIV(out_shape[3], C4NUM); - } else { - MS_LOG(ERROR) << "Unsupported out_shape.size=: " << out_shape.size() << " axis=: " << axis_; - return RET_ERROR; - } +void StackOpenCLKernel::SetConstArgs() { + int arg_cn = in_tensors_.size() + 1; + cl_int4 inshape_tmp = {}, outshape_tmp = {}; + for (int i = 0; i < in_tensors_[0]->shape().size(); ++i) { + inshape_tmp.s[i] = in_tensors_[0]->shape()[i]; + } + Broadcast2GpuShape(in_shape_.s, inshape_tmp.s, in_tensors_[0]->shape().size(), 1); + for (int i = 0; i < out_tensors_[0]->shape().size(); ++i) { + outshape_tmp.s[i] = out_tensors_[0]->shape()[i]; + } + Broadcast2GpuShape(out_shape_.s, outshape_tmp.s, out_tensors_[0]->shape().size(), 1); + in_shape_.s[3] = UP_DIV(in_shape_.s[3], C4NUM); + out_shape_.s[3] = UP_DIV(out_shape_.s[3], C4NUM); + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_shape_); + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape_); + if (buffer_button_) { + GpuTensorInfo img_info_out(out_tensors_[0]); + GpuTensorInfo img_info_in(in_tensors_[0]); + size_t dtype = enable_fp16_ ? sizeof(cl_half) : sizeof(cl_float); + stride_w_out = img_info_out.RowPitch() / dtype; + stride_w_in = img_info_in.RowPitch() / dtype; + cl_int2 stride_w = {stride_w_out, stride_w_in}; + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_w); } - OH_ = N_ * H_; - OW_ = W_; - OC_ = C_; - output_shape->s[0] = N_; - output_shape->s[1] = H_; - output_shape->s[2] = W_; - output_shape->s[3] = C_; - return RET_OK; } -int StackOpenCLKernel::Run() { - MS_LOG(DEBUG) << this->name() << " Running! "; - if (axis_ == 0) { - return RunAxis0(); +void StackOpenCLKernel::SetGlobalLocal() { + if (((in_tensors_[0]->shape().size() == 2 || in_tensors_[0]->shape().size() == 3) && axis_ == 1) || + (in_tensors_[0]->shape().size() == 3 && axis_ == 2)) { + OH_ = out_shape_.s[0] * out_shape_.s[1]; + OW_ = out_shape_.s[2]; + OC_ = out_shape_.s[3]; + } else if (in_tensors_[0]->shape().size() == 1) { + OH_ = out_shape_.s[0] * out_shape_.s[1]; + OW_ = out_shape_.s[2]; + } else { + OH_ = out_shape_.s[0]; + OW_ = out_shape_.s[1]; } - cl_int4 output_shape = {1, 1, 1, 1}; const std::vector &max_global = ocl_runtime_->GetWorkItemSize(); std::vector local = {1, 1, 1}; - int arg_cn = 0; - InferInTensorShapeTo4D(&arg_cn); - InferOutTensorShapeTo4D(&output_shape); std::vector global = {OH_, OW_, OC_}; StackGetWorkGroup(global, &local, max_global[0]); - ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape); - ocl_runtime_->RunKernel(kernel_, global, local); + OpenCLKernel::AlignGlobalLocal(global, local); +} + +int StackOpenCLKernel::Prepare() { + enable_fp16_ = ocl_runtime_->GetFp16Enable(); + + if (in_tensors_[0]->shape().size() == 1 && axis_ == 1) { + axis_ += 2; + } else if (in_tensors_[0]->shape().size() == axis_) { + buffer_button_ = true; // boundary stack judge + } + std::string kernel_name = "stack_"; + if (!buffer_button_) { + kernel_name += std::to_string(in_tensors_.size()) + "input_" + std::to_string(axis_) + "axis_" + + std::to_string(in_tensors_[0]->shape().size()) + "inshape"; + } else { + kernel_name += std::to_string(in_tensors_.size()) + "input_" + "boundary"; + } + + MS_LOG(DEBUG) << "kernel_name=: " << kernel_name; + std::string source = stack_source; + std::string program_name = "stack"; + ocl_runtime_->LoadSource(program_name, source); + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + SetConstArgs(); + SetGlobalLocal(); + return RET_OK; } -kernel::LiteKernel *OpenCLStackKernelCreator(const std::vector &inputs, - const std::vector &outputs, OpParameter *opParameter, - const lite::InnerContext *ctx, const kernel::KernelKey &desc, - const mindspore::lite::PrimitiveC *primitive) { - auto *kernel = new (std::nothrow) StackOpenCLKernel(opParameter, inputs, outputs); - if (kernel == nullptr) { - MS_LOG(ERROR) << " new StackOpenCLKernel failed "; - return nullptr; +int StackOpenCLKernel::Run() { + MS_LOG(DEBUG) << this->name() << " Running! "; + if (axis_ == 0) { + return RunAxis0(); } - auto ret = kernel->Init(); - if (ret != RET_OK) { - MS_LOG(ERROR) << " Init kernel failed, name: Stack "; - delete kernel; - return nullptr; + int arg_cn = 0; + if (buffer_button_) { + for (int i = 0; i < in_tensors_.size(); ++i) { + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[i]->data_c(), lite::opencl::MemType::BUF); + } + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF); + } else { + for (int i = 0; i < in_tensors_.size(); ++i) { + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[i]->data_c()); + } + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); } - return kernel; + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); + return RET_OK; } +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Stack, OpenCLKernelCreator); +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Stack, OpenCLKernelCreator); -REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Stack, OpenCLStackKernelCreator); -REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Stack, OpenCLStackKernelCreator); } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.h index 9fc809b988..85bb66881b 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.h @@ -30,8 +30,10 @@ class StackOpenCLKernel : public OpenCLKernel { : OpenCLKernel(parameter, inputs, outputs) {} ~StackOpenCLKernel() override{}; - - int Init() override; + int Prepare() override; + int CheckSpecs() override; + void SetConstArgs() override; + void SetGlobalLocal() override; int ReSize() override; @@ -40,18 +42,17 @@ class StackOpenCLKernel : public OpenCLKernel { private: int RunAxis0(); - int InferInTensorShapeTo4D(int *arg_cn); - - int InferOutTensorShapeTo4D(cl_int4 *output_shape); - + cl::Kernel kernel_; int axis_{0}; - size_t N_{1}; - size_t H_{1}; - size_t W_{1}; - size_t C_{1}; size_t OH_{1}; size_t OW_{1}; size_t OC_{1}; + bool buffer_button_{false}; + bool enable_fp16_{false}; + cl_int stride_w_in{1}; + cl_int stride_w_out{1}; + cl_int4 in_shape_ = {}; + cl_int4 out_shape_ = {}; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/stack_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/stack_tests.cc index 298cab43b2..49a0328772 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/stack_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/stack_tests.cc @@ -29,34 +29,155 @@ OpParameter *CreateParameter(int axis) { } } // namespace -TEST_F(TestOpenCL_Stack, input8_ndim3_axis0) { - constexpr int INPUT_NUM = 8; - int axis = 0; - std::vector input_shapes[INPUT_NUM] = {{1, 1, 8}, {1, 1, 8}, {1, 1, 8}, {1, 1, 8}, - {1, 1, 8}, {1, 1, 8}, {1, 1, 8}, {1, 1, 8}}; - std::vector output_shape = {8, 1, 1, 8}; - float input_datas[INPUT_NUM][8] = { - {0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.03, 0.37}, {0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.47}, - {0.31, 0.63, 0.84, 0.43, 0.56, 0.79, 0.12, 0.57}, {0.35, 0.26, 0.17, 0.33, 0.66, 0.89, 0.93, 0.77}, - {0.57, 0.6, 0.84, 0.83, 0.48, 0.78, 0.63, 0.87}, {0.66, 0.56, 0.64, 0.63, 0.56, 0.59, 0.73, 0.37}, - {0.35, 0.26, 0.54, 0.33, 0.76, 0.59, 0.73, 0.34}, {0.15, 0.36, 0.44, 0.73, 0.56, 0.49, 0.93, 0.37}}; - float output_data[] = {0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.03, 0.37, 0.5, 0.6, 0.74, 0.23, 0.46, - 0.69, 0.13, 0.47, 0.31, 0.63, 0.84, 0.43, 0.56, 0.79, 0.12, 0.57, 0.35, 0.26, - 0.17, 0.33, 0.66, 0.89, 0.93, 0.77, 0.57, 0.6, 0.84, 0.83, 0.48, 0.78, 0.63, - 0.87, 0.66, 0.56, 0.64, 0.63, 0.56, 0.59, 0.73, 0.37, 0.35, 0.26, 0.54, 0.33, - 0.76, 0.59, 0.73, 0.34, 0.15, 0.36, 0.44, 0.73, 0.56, 0.49, 0.93, 0.37}; +// stack test cases +TEST_F(TestOpenCL_Stack, input2_ndim1_axis1) { + constexpr int INPUT_NUM = 2; + int axis = 1; + std::vector input_shapes[INPUT_NUM] = {{8}, {8}}; + std::vector output_shape = {8, 2}; + float input_datas[INPUT_NUM][8] = {{0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.03, 0.37}, + {0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.47}}; + float output_data[] = {0.75, 0.5, 0.06, 0.6, 0.74, 0.74, 0.30, 0.23, 0.9, 0.46, 0.59, 0.69, 0.03, 0.13, 0.37, 0.47}; + + for (auto fp16_enable : {true}) { + auto *param = CreateParameter(axis); + TestMain({{input_shapes[0], input_datas[0], VAR}, {input_shapes[1], input_datas[1], VAR}}, + {output_shape, output_data}, param, fp16_enable, fp16_enable ? 1e-3 : 1e-9); + } +} + +TEST_F(TestOpenCL_Stack, input2_ndim2_axis1) { + constexpr int INPUT_NUM = 2; + int axis = 1; + std::vector input_shapes[INPUT_NUM] = {{3, 4}, {3, 4}}; + std::vector output_shape = {3, 2, 4}; + float input_datas[INPUT_NUM][12] = { + {1.317, -2.094, -1.892, -0.4612, -0.884, -0.524, 0.4504, 0.0284, 3.227, -0.4673, -1.115, -0.1572}, + {-0.0677, -1.289, 0.0685, 0.889, 0.8145, 1.6455, 0.6587, -0.236, 0.3625, 0.7393, -1.393, 0.2534}}; + float output_data[] = {1.317, -2.094, -1.892, -0.4612, -0.0677, -1.289, 0.0685, 0.889, + -0.884, -0.524, 0.4504, 0.0284, 0.8145, 1.6455, 0.6587, -0.236, + 3.227, -0.4673, -1.115, -0.1572, 0.3625, 0.7393, -1.393, 0.2534}; + + for (auto fp16_enable : {true}) { + auto *param = CreateParameter(axis); + TestMain({{input_shapes[0], input_datas[0], VAR}, {input_shapes[1], input_datas[1], VAR}}, + {output_shape, output_data}, param, fp16_enable, fp16_enable ? 1e-3 : 1e-9); + } +} + +TEST_F(TestOpenCL_Stack, input2_ndim3_axis1) { + constexpr int INPUT_NUM = 2; + int axis = 1; + std::vector input_shapes[INPUT_NUM] = {{3, 4, 5}, {3, 4, 5}}; + std::vector output_shape = {3, 2, 4, 5}; + size_t input1_size, input2_size, output_size; + std::string input1Ppath = "./test_data/stackfp32_input1.bin"; + std::string input2Ppath = "./test_data/stackfp32_input2.bin"; + std::string correctOutputPath = "./test_data/stackfp32_output.bin"; + auto input_data1 = reinterpret_cast(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size)); + auto input_data2 = reinterpret_cast(mindspore::lite::ReadFile(input2Ppath.c_str(), &input2_size)); + auto output_data = reinterpret_cast(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size)); + for (auto fp16_enable : {false}) { + auto *param = CreateParameter(axis); + TestMain({{input_shapes[0], input_data1, VAR}, {input_shapes[1], input_data2, VAR}}, {output_shape, output_data}, + param, fp16_enable, fp16_enable ? 1e-3 : 1e-9); + } +} +TEST_F(TestOpenCL_Stack, input2_ndim3_axis2) { + constexpr int INPUT_NUM = 2; + int axis = 2; + std::vector input_shapes[INPUT_NUM] = {{3, 4, 5}, {3, 4, 5}}; + std::vector output_shape = {3, 4, 2, 5}; + size_t input1_size, input2_size, output_size; + std::string input1Ppath = "./test_data/stackfp32_input1.bin"; + std::string input2Ppath = "./test_data/stackfp32_input2.bin"; + std::string correctOutputPath = "./test_data/stackfp32_output.bin"; + auto input_data1 = reinterpret_cast(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size)); + auto input_data2 = reinterpret_cast(mindspore::lite::ReadFile(input2Ppath.c_str(), &input2_size)); + auto output_data = reinterpret_cast(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size)); for (auto fp16_enable : {false}) { auto *param = CreateParameter(axis); - TestMain({{input_shapes[0], input_datas[0], VAR}, - {input_shapes[1], input_datas[1], VAR}, - {input_shapes[2], input_datas[2], VAR}, - {input_shapes[3], input_datas[3], VAR}, - {input_shapes[4], input_datas[4], VAR}, - {input_shapes[5], input_datas[5], VAR}, - {input_shapes[6], input_datas[6], VAR}, - {input_shapes[7], input_datas[7], VAR}}, - {output_shape, output_data}, param, fp16_enable); + TestMain({{input_shapes[0], input_data1, VAR}, {input_shapes[1], input_data2, VAR}}, {output_shape, output_data}, + param, fp16_enable, fp16_enable ? 1e-3 : 1e-9); + } +} + +TEST_F(TestOpenCL_Stack, input2_ndim2_axis2) { + constexpr int INPUT_NUM = 2; + int axis = 2; + std::vector input_shapes[INPUT_NUM] = {{1, 96}, {1, 96}}; + std::vector output_shape = {1, 96, 2}; + size_t input1_size, input2_size, output_size; + std::string input1Ppath = "./test_data/stackfp32_input1.bin"; + std::string input2Ppath = "./test_data/stackfp32_input2.bin"; + std::string correctOutputPath = "./test_data/stackfp32_output.bin"; + auto input_data1 = reinterpret_cast(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size)); + auto input_data2 = reinterpret_cast(mindspore::lite::ReadFile(input2Ppath.c_str(), &input2_size)); + auto output_data = reinterpret_cast(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size)); + for (auto fp16_enable : {false}) { + auto *param = CreateParameter(axis); + TestMain({{input_shapes[0], input_data1, VAR}, {input_shapes[1], input_data2, VAR}}, {output_shape, output_data}, + param, fp16_enable, fp16_enable ? 1e-3 : 1e-9); + } +} + +TEST_F(TestOpenCL_Stack, input2_ndim3_axis3) { + constexpr int INPUT_NUM = 2; + int axis = 3; + std::vector input_shapes[INPUT_NUM] = {{3, 4, 6}, {3, 4, 6}}; + std::vector output_shape = {3, 4, 6, 2}; + size_t input1_size, input2_size, output_size; + std::string input1Ppath = "./test_data/stackfp32_input1.bin"; + std::string input2Ppath = "./test_data/stackfp32_input2.bin"; + std::string correctOutputPath = "./test_data/stackfp32_output.bin"; + auto input_data1 = reinterpret_cast(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size)); + auto input_data2 = reinterpret_cast(mindspore::lite::ReadFile(input2Ppath.c_str(), &input2_size)); + auto output_data = reinterpret_cast(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size)); + for (auto fp16_enable : {false}) { + auto *param = CreateParameter(axis); + TestMain({{input_shapes[0], input_data1, VAR}, {input_shapes[1], input_data2, VAR}}, {output_shape, output_data}, + param, fp16_enable, fp16_enable ? 1e-3 : 1e-9); + } +} + +TEST_F(TestOpenCL_Stack, input6_ndim3_axis0) { + constexpr int INPUT_NUM = 8; + int axis = 0; + std::vector input_shapes[INPUT_NUM] = {{1, 17, 18}, {1, 17, 18}, {1, 17, 18}, {1, 17, 18}, + {1, 17, 18}, {1, 17, 18}, {1, 17, 18}, {1, 17, 18}}; + std::vector output_shape = {8, 1, 17, 18}; + size_t input1_size, input2_size, input3_size, input4_size, input5_size, input6_size, input7_size, input8_size, + output_size; + std::string input1Ppath = "./test_data/stackfp32_input1.bin"; + std::string input2Ppath = "./test_data/stackfp32_input2.bin"; + std::string input3Ppath = "./test_data/stackfp32_input3.bin"; + std::string input4Ppath = "./test_data/stackfp32_input4.bin"; + std::string input5Ppath = "./test_data/stackfp32_input5.bin"; + std::string input6Ppath = "./test_data/stackfp32_input6.bin"; + std::string input7Ppath = "./test_data/stackfp32_input7.bin"; + std::string input8Ppath = "./test_data/stackfp32_input8.bin"; + std::string correctOutputPath = "./test_data/stackfp32_output.bin"; + auto input_data1 = reinterpret_cast(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size)); + auto input_data2 = reinterpret_cast(mindspore::lite::ReadFile(input2Ppath.c_str(), &input2_size)); + auto input_data3 = reinterpret_cast(mindspore::lite::ReadFile(input3Ppath.c_str(), &input3_size)); + auto input_data4 = reinterpret_cast(mindspore::lite::ReadFile(input4Ppath.c_str(), &input4_size)); + auto input_data5 = reinterpret_cast(mindspore::lite::ReadFile(input5Ppath.c_str(), &input5_size)); + auto input_data6 = reinterpret_cast(mindspore::lite::ReadFile(input6Ppath.c_str(), &input6_size)); + auto input_data7 = reinterpret_cast(mindspore::lite::ReadFile(input7Ppath.c_str(), &input7_size)); + auto input_data8 = reinterpret_cast(mindspore::lite::ReadFile(input8Ppath.c_str(), &input8_size)); + auto output_data = reinterpret_cast(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size)); + for (auto fp16_enable : {true}) { + auto *param = CreateParameter(axis); + TestMain({{input_shapes[0], input_data1, VAR}, + {input_shapes[1], input_data2, VAR}, + {input_shapes[2], input_data3, VAR}, + {input_shapes[3], input_data4, VAR}, + {input_shapes[4], input_data5, VAR}, + {input_shapes[5], input_data6, VAR}, + {input_shapes[6], input_data7, VAR}, + {input_shapes[7], input_data8, VAR}}, + {output_shape, output_data}, param, fp16_enable, fp16_enable ? 1e-3 : 1e-9); } } diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim2_axis2/stackfp32_input1.bin b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim2_axis2/stackfp32_input1.bin new file mode 100644 index 0000000000..c9b0b8922a Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim2_axis2/stackfp32_input1.bin differ diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim2_axis2/stackfp32_input2.bin b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim2_axis2/stackfp32_input2.bin new file mode 100644 index 0000000000..b0e9490bcc Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim2_axis2/stackfp32_input2.bin differ diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim2_axis2/stackfp32_output.bin b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim2_axis2/stackfp32_output.bin new file mode 100644 index 0000000000..43d0e107ea Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim2_axis2/stackfp32_output.bin differ diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis1/stackfp32_input1.bin b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis1/stackfp32_input1.bin new file mode 100644 index 0000000000..7c9d8a905d Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis1/stackfp32_input1.bin differ diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis1/stackfp32_input2.bin b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis1/stackfp32_input2.bin new file mode 100644 index 0000000000..08359107b7 Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis1/stackfp32_input2.bin differ diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis1/stackfp32_output.bin b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis1/stackfp32_output.bin new file mode 100644 index 0000000000..4c6479cc84 Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis1/stackfp32_output.bin differ diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis2/stackfp32_input1.bin b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis2/stackfp32_input1.bin new file mode 100644 index 0000000000..3f2624a069 Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis2/stackfp32_input1.bin differ diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis2/stackfp32_input2.bin b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis2/stackfp32_input2.bin new file mode 100644 index 0000000000..7c3bd3916b --- /dev/null +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis2/stackfp32_input2.bin @@ -0,0 +1,2 @@ +=t$,0>5>ƕ8=;?pD> B?Aȼ;^f?SILzPl=ƾbi i?v>񟌿?>ϛ?5Y?@>]>5,s"=ŭX_?G +QֿBC=Pti9R>Z?>xA>g&?{0)? EM^ ><?OaV>X2G,s=|\|l>(c(?<%?f9XG?>RL]ҽ?b' @7,>Al&@Rg?~^R?9+6B?)>?腿=(p h\?b?LF>?ܽe'@{(>އn?F~=y{cϾh=뽼VK?俈=\3"?ES)MB˴?m?   +s?! \ No newline at end of file diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis3/stackfp32_input2.bin b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis3/stackfp32_input2.bin new file mode 100644 index 0000000000..de836a1da5 --- /dev/null +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis3/stackfp32_input2.bin @@ -0,0 +1,3 @@ +_T?˿`?;qR ?^? =Ü>{;B<쾣>/?q[? 1 A_?aJ?q.?3r.'4;pk>L2?>X6=q8ZW?9>#?;=7v՝/[Y>lɽK?S^?灿$n ?ҰK=As> + ?y)~>gi?fI 4#?_ѵ֮>? +D@=!g>? \ No newline at end of file diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis3/stackfp32_output.bin b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis3/stackfp32_output.bin new file mode 100644 index 0000000000..0dfc0087b4 --- /dev/null +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/test_data/stack/input2_ndim3_axis3/stackfp32_output.bin @@ -0,0 +1,4 @@ +K_T?%>˿<?`?O;qaV>RX2G ?,^?s=|\ |l> =(Üc(?>{;B<<쾬%>?/?f9q[?XG ?1> ARL_?]ҽaJ??qb' @.?73r,>.'Al&@4;Rg?pk>~^L2?R?>9+6X6B?=)>q8?ZW?腿=9>(p #?;=h\?7vb?՝LF>/[?Y>ܽlɽe'@K?{(>Sއn?^?F灿~=$y{n ?cϾҰKh==As>V + ?K?y)=~>g\3"i??fIE  S 4#?)M_B˴?ѵm?֮> ?  +D +s?@=!!g>? \ No newline at end of file