diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl index a163fb9175..c22970b98b 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl @@ -1,24 +1,70 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable +#define C4NUM 4 +#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; -__kernel void reshape_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size, int4 size_out) { +__kernel void reshape_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 src_size, + int4 dst_size) { int X = get_global_id(0); - if (X >= size_out.x * size_out.y * size_out.z * size_out.w) { + int Y = get_global_id(1); + int CO4 = UP_DIV(dst_size.z, C4NUM); + int CO4_rem = dst_size.z % C4NUM; + if (X >= dst_size.x || Y > dst_size.y) { return; } - int in_img_x = size.z * size.w; - int out_img_x = size_out.z * size_out.w; - WRITE_IMAGE(dst_data, (int2)(X % out_img_x, X / out_img_x), - READ_IMAGE(src_data, smp_zero, (int2)(X % in_img_x, X / in_img_x))); + int CI4 = UP_DIV(src_size.x, C4NUM); + int CI4_rem = src_size.x % C4NUM; + int in_img_x = CI4 * src_size.y; + FLT4 res = (FLT4)(0.0f); + FLT tmp[4]; + FLT res_tmp[4]; + int gcnt = 0; + int start = 0; + int i = 0; + int j = 0; + int n = 0; + int cond = (((int)(CO4_rem > 0)) << 1) | (CI4_rem > 0); + switch (cond) { + case 1: + start = ((X / CO4 * dst_size.z + min(dst_size.z, (X % CO4) * C4NUM)) + dst_size.w * Y); + gcnt = start / src_size.x * CI4 + (start % src_size.x) / C4NUM; + start = (CI4 > 1 && gcnt < CI4) ? 0 : ((X + Y * dst_size.x) * C4NUM) % src_size.x % C4NUM; + for (i = 0, n = 0, j = start; i < 4; ++n, j = 0) { + int X_src = (gcnt + n) % in_img_x; + res = READ_IMAGE(src_data, smp_zero, (int2)(X_src, (gcnt + n) / in_img_x)); + tmp[0] = res.x; + tmp[1] = res.y; + tmp[2] = res.z; + tmp[3] = res.w; + int k = (X_src % CI4) == (CI4 - 1) ? CI4_rem : 4; + for (; j < k && i < 4; ++j, ++i) { + res_tmp[i] = tmp[j]; + } + } + res.x = res_tmp[0]; + res.y = res_tmp[1]; + res.z = res_tmp[2]; + res.w = res_tmp[3]; + WRITE_IMAGE(dst_data, (int2)(X, Y), res); + break; + default: + gcnt = X + dst_size.x * Y; + res = READ_IMAGE(src_data, smp_zero, (int2)(gcnt % in_img_x, gcnt / in_img_x)); + WRITE_IMAGE(dst_data, (int2)(X, Y), res); + } } -__kernel void reshape_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size, - int4 size_out) { +__kernel void reshape_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 src_size, + int4 dst_size) { int X = get_global_id(0); - if (X >= size_out.x * size_out.y * size_out.z * size_out.w) { + int Y = get_global_id(1); + int CO4 = UP_DIV(dst_size.z, C4NUM); + int CO4_rem = dst_size.z % C4NUM; + if (X >= dst_size.x || Y > dst_size.y) { return; } - int in_img_x = size.z; - int out_img_x = size_out.z; - WRITE_IMAGE(dst_data, (int2)(X % out_img_x, X / out_img_x), - READ_IMAGE(src_data, smp_zero, (int2)(X % in_img_x, X / in_img_x))); + int CI4 = UP_DIV(src_size.x, C4NUM); + int CI4_rem = src_size.x % C4NUM; + int in_img_x = CI4 * src_size.y; + int gcnt = X + dst_size.x * Y; + WRITE_IMAGE(dst_data, (int2)(X, Y), READ_IMAGE(src_data, smp_zero, (int2)(gcnt % in_img_x, gcnt / in_img_x))); } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc index 935a154f67..b5555b672a 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc @@ -76,6 +76,7 @@ int ConcatOpenCLKernel::Init() { auto param = reinterpret_cast(this->op_parameter_); MS_LOG(DEBUG) << " concat at axis=: " << param->axis_; + param->axis_ = (param->axis_ == -1) ? (in_tensors_[0]->shape().size() - 1) : param->axis_; if (param->axis_ < 0 || param->axis_ > 3) { MS_LOG(ERROR) << " only support axis >= 0 and axis <= 3 "; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc index eaead966ac..63d0ca4f24 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc @@ -38,12 +38,6 @@ int ReshapeOpenCLKernel::Init() { MS_LOG(ERROR) << "Reshape output size should in 2,4"; return RET_ERROR; } - if ((in_tensors_[0]->shape().back() % 4 != 0 || out_tensors_[0]->shape().back() % 4 != 0) && - in_tensors_[0]->shape().back() != out_tensors_[0]->shape().back()) { - MS_LOG(ERROR) << "Reshape input channel align 4 should equal output channel, cin:" << in_tensors_[0]->shape().back() - << " cout:" << out_tensors_[0]->shape().back(); - return RET_ERROR; - } if (in_tensors_[0]->shape().size() == 2) { inShape = {in_tensors_[0]->shape()[0], 1, 1, in_tensors_[0]->shape()[1]}; } else { @@ -81,6 +75,10 @@ int ReshapeOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) int h = outShape[1]; int w = outShape[2]; int c = outShape[3]; + if (img_size_.size() == OpenCLImageSizeIndex::IDX_NUM) { + *img_size = img_size_; + return RET_OK; + } if (op_format_ == schema::Format::Format_NHWC4) { im_dst_x = w * UP_DIV(c, C4NUM); im_dst_y = n * h; @@ -98,6 +96,7 @@ int ReshapeOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) img_size->clear(); std::vector vec{im_dst_x, im_dst_y, img_dtype}; *img_size = vec; + img_size_ = vec; return RET_OK; } @@ -105,15 +104,15 @@ int ReshapeOpenCLKernel::Run() { MS_LOG(DEBUG) << this->name() << " Running!"; std::vector local = {}; - std::vector global = { - static_cast(outShape[0] * outShape[1] * outShape[2] * UP_DIV(outShape[3], C4NUM))}; - cl_int4 size = {inShape[0], inShape[1], inShape[2], UP_DIV(inShape[3], C4NUM)}; - cl_int4 size_out = {outShape[0], outShape[1], outShape[2], UP_DIV(outShape[3], C4NUM)}; + std::vector global{img_size_[0], img_size_[1]}; + cl_int4 src_size = {inShape[3], inShape[2], inShape[1], inShape[0]}; + cl_int4 dst_size = {static_cast(img_size_[0]), static_cast(img_size_[1]), outShape[3], + outShape[3] * outShape[2]}; int arg_idx = 0; ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, size); - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, size_out); + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, src_size); + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, dst_size); ocl_runtime_->RunKernel(kernel_, global, local, nullptr); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h index a3f36c5c66..17519d547e 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h +++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h @@ -25,6 +25,7 @@ namespace mindspore::kernel { enum class OpenCLMemType { BUF, IMG }; +enum OpenCLImageSizeIndex { IDX_X = 0, IDX_Y, IDX_DTYPE, IDX_NUM }; struct OpenCLToFormatParameter { OpParameter op_parameter; @@ -66,6 +67,7 @@ class OpenCLKernel : public LiteKernel { schema::Format op_format_{schema::Format::Format_NHWC4}; lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_; lite::opencl::OpenCLRuntime *ocl_runtime_; + std::vector img_size_; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc index 91b5b733ca..dcbfcf4fdb 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc @@ -29,33 +29,23 @@ class TestReshapeOpenCL : public mindspore::CommonTest { TestReshapeOpenCL() {} }; -void RunTestCaseReshape(const std::vector &shape, void *input_data, void *output_data, bool enable_fp16, - bool is_output_2d) { +void RunTestCaseReshape(const std::vector &shape_in, const std::vector &shape_out, void *input_data, + void *output_data, bool enable_fp16) { auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance(); ocl_runtime->Init(); size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float); ocl_runtime->SetFp16Enable(enable_fp16); auto allocator = ocl_runtime->GetAllocator(); - int n = shape[0]; - int h = shape[1]; - int w = shape[2]; - int c = shape[3]; - int oh = shape[4]; - int ow = shape[5]; - std::vector input_shape = {n, h, w, c}; auto tensor_x_ptr = std::make_unique(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), - input_shape, schema::Format_NHWC); + shape_in, schema::Format_NHWC); auto tensor_x = tensor_x_ptr.get(); if (tensor_x == nullptr) { MS_LOG(ERROR) << "tensor_x create error."; return; } - std::vector out_shape = {n, oh, ow, c}; - if (is_output_2d) { - out_shape = {n, h * w * c}; - } + bool is_output_2d = shape_out.size() == 2; auto tensor_out_ptr = - std::make_unique(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), out_shape, + std::make_unique(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), shape_out, is_output_2d ? schema::Format_NC : schema::Format_NHWC); auto tensor_out = tensor_out_ptr.get(); if (tensor_out == nullptr) { @@ -102,74 +92,108 @@ void RunTestCaseReshape(const std::vector &shape, void *input_data, void *o } TEST_F(TestReshapeOpenCL, ReshapeFp32) { - int n = 1; - int h = 1; - int w = 1; - int c = 7; - int oh = 1; - int ow = 1; - std::vector shape = {n, h, w, c, oh, ow}; + std::vector shape_in = {1, 1, 1, 7}; + std::vector shape_out = {1, 7}; std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; std::vector output_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; - RunTestCaseReshape(shape, input_data.data(), output_data.data(), false, true); + RunTestCaseReshape(shape_in, shape_out, input_data.data(), output_data.data(), false); } TEST_F(TestReshapeOpenCL, ReshapeFp16) { - int n = 1; - int h = 1; - int w = 1; - int c = 7; - int oh = 1; - int ow = 1; - std::vector shape = {n, h, w, c, oh, ow}; + std::vector shape_in = {1, 1, 1, 7}; + std::vector shape_out = {1, 7}; std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; std::vector output_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; - RunTestCaseReshape(shape, input_data.data(), output_data.data(), true, true); + RunTestCaseReshape(shape_in, shape_out, input_data.data(), output_data.data(), true); } TEST_F(TestReshapeOpenCL, Reshape4DFp32) { - int n = 1; - int h = 2; - int w = 2; - int c = 3; - int oh = 1; - int ow = 4; - std::vector shape = {n, h, w, c, oh, ow}; + std::vector shape_in = {1, 2, 2, 3}; + std::vector shape_out = {1, 1, 4, 3}; std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f}; std::vector output_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f}; - RunTestCaseReshape(shape, input_data.data(), output_data.data(), false, false); + RunTestCaseReshape(shape_in, shape_out, input_data.data(), output_data.data(), false); } TEST_F(TestReshapeOpenCL, Reshape4DFp16) { - int n = 1; - int h = 2; - int w = 2; - int c = 3; - int oh = 1; - int ow = 4; - std::vector shape = {n, h, w, c, oh, ow}; + std::vector shape_in = {1, 2, 2, 3}; + std::vector shape_out = {1, 1, 4, 3}; std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f}; std::vector output_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f}; - RunTestCaseReshape(shape, input_data.data(), output_data.data(), true, false); + RunTestCaseReshape(shape_in, shape_out, input_data.data(), output_data.data(), true); } TEST_F(TestReshapeOpenCL, Reshape4D2DFp32) { - int n = 1; - int h = 2; - int w = 2; - int c = 4; - int oh = 2; - int ow = 2; - std::vector shape = {n, h, w, c, oh, ow}; + std::vector shape_in = {1, 2, 2, 4}; + std::vector shape_out = {4, 4}; std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f}; std::vector output_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f}; - RunTestCaseReshape(shape, input_data.data(), output_data.data(), false, true); + RunTestCaseReshape(shape_in, shape_out, input_data.data(), output_data.data(), false); +} +TEST_F(TestReshapeOpenCL, Reshape4DFp32Rem10) { + std::vector shape_in = {1, 3, 2, 4}; + std::vector shape_out = {1, 4, 2, 3}; + std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, + 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f}; + std::vector output_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, + 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f}; + + RunTestCaseReshape(shape_in, shape_out, input_data.data(), output_data.data(), false); +} +TEST_F(TestReshapeOpenCL, Reshape4DFp32Rem01Test0) { + std::vector shape_in = {1, 4, 2, 3}; + std::vector shape_out = {1, 3, 2, 4}; + std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, + 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f}; + std::vector output_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, + 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f}; + + RunTestCaseReshape(shape_in, shape_out, input_data.data(), output_data.data(), false); +} +TEST_F(TestReshapeOpenCL, Reshape4DFp32Rem01Test1) { + std::vector shape_in = {1, 2, 2, 5}; + std::vector shape_out = {1, 1, 5, 4}; + std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, + 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f}; + std::vector output_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, + 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f}; + + RunTestCaseReshape(shape_in, shape_out, input_data.data(), output_data.data(), false); } +TEST_F(TestReshapeOpenCL, Reshape4DFp32Rem01Test2) { + std::vector shape_in = {1, 4, 2, 5}; + std::vector shape_out = {1, 2, 5, 4}; + std::vector input_data = { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, + 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, + 28.0f, 29.0f, 30.0f, 31.0f, 32.0f, 33.0f, 34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f, + }; + std::vector output_data = { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, + 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, + 28.0f, 29.0f, 30.0f, 31.0f, 32.0f, 33.0f, 34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f, + }; + + RunTestCaseReshape(shape_in, shape_out, input_data.data(), output_data.data(), false); +} +TEST_F(TestReshapeOpenCL, Reshape4DFp32Rem11) { + std::vector shape_in = {1, 3, 2, 5}; + std::vector shape_out = {1, 5, 2, 3}; + std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, + 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, + 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f}; + std::vector output_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, + 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, + 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f}; + + RunTestCaseReshape(shape_in, shape_out, input_data.data(), output_data.data(), false); +} + } // namespace mindspore