From 0be2bb24282c138afe51d4eef69c5a2342c093aa Mon Sep 17 00:00:00 2001 From: chenzupeng Date: Tue, 1 Dec 2020 14:58:05 +0800 Subject: [PATCH] transpose support any perm --- .../src/runtime/kernel/opencl/cl/transpose.cl | 34 +++++++ .../runtime/kernel/opencl/kernel/transpose.cc | 93 +++++++++++++------ .../runtime/kernel/opencl/kernel/transpose.h | 6 +- .../src/runtime/kernel/opencl/opencl_kernel.h | 7 +- .../runtime/kernel/opencl/transpose_tests.cc | 49 ++++++++++ 5 files changed, 157 insertions(+), 32 deletions(-) diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl index 66ffe8efd1..3e05aabc39 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl @@ -189,3 +189,37 @@ __kernel void transpose_0231_NC4HW4(__read_only image2d_t src_data, __write_only WRITE_IMAGE(dst_data, (int2)(4 * Y + 3, Z * shape.y + X), dst3); } } + +typedef union FLT4_array { + FLT c_array[4]; + FLT4 vector; +} FLT4_array; + +__kernel void transpose_general_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 out_shape, + int4 de_perm, int4 in_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= out_shape.y * out_shape.x || Y >= out_shape.z || 4 * Z >= out_shape.w) { + return; + } + int N = X / out_shape.y; + int H = X % out_shape.y; + int CI4_SIZE = UP_DIV(in_shape.w, 4); + FLT4_array result_tmp; + result_tmp.vector = (FLT4)(0.f); + FLT *result_ptr = result_tmp.c_array; + for (int i = 0; i < 4; i++) { + if (Z * 4 + i < out_shape.w) { + int out_index[4] = {N, H, Y, Z * 4 + i}; + FLT4 src = READ_IMAGE(src_data, smp_zero, + (int2)(out_index[de_perm.z] * CI4_SIZE + out_index[de_perm.w] / 4, + out_index[de_perm.x] * in_shape.y + out_index[de_perm.y])); + FLT4_array src_tmp; + src_tmp.vector = src; + result_tmp.c_array[i] = src_tmp.c_array[out_index[de_perm.w] % 4]; + } + } + int CO4_SIZE = UP_DIV(in_shape.w, 4); + WRITE_IMAGE(dst_data, (int2)(Y * CO4_SIZE + Z, X), result_tmp.vector); +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc index 6a3c90cded..f67116124b 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc @@ -34,32 +34,58 @@ using mindspore::schema::PrimitiveType_Transpose; namespace mindspore::kernel { int TransposeOpenCLKernel::CheckSpecs() { - auto param = reinterpret_cast(op_parameter_); - if (in_tensors_[0]->shape().size() != 4 || in_tensors_[0]->shape()[0] > 1) { - MS_LOG(ERROR) << "Transpose only support 4d tensor and n = 1 yet."; - return mindspore::lite::RET_ERROR; + if ((in_tensors_.size() != 1 && in_tensors_.size() != 2) || out_tensors_.size() != 1) { + MS_LOG(ERROR) << "Transpose input output size unsupported."; + return RET_ERROR; } - if (param->num_axes_ == 4 && param->perm_[0] == 0 && param->perm_[1] == 3 && param->perm_[2] == 1 && - param->perm_[3] == 2) { - type = TransposeType::AXIS0312; - } else if (param->num_axes_ == 4 && param->perm_[0] == 0 && param->perm_[1] == 2 && param->perm_[2] == 3 && - param->perm_[3] == 1) { - type = TransposeType::AXIS0231; - } else { - MS_LOG(ERROR) << "unsupported transpose axes."; - return mindspore::lite::RET_ERROR; + tensor_size_ = GpuTensorInfo(out_tensors_[0]); + if (tensor_size_.NDim > 4) { + MS_LOG(ERROR) << "Transpose don't support 5d tensor or higher."; + return RET_ERROR; } return RET_OK; } int TransposeOpenCLKernel::Prepare() { + auto param = reinterpret_cast(op_parameter_); + if (tensor_size_.NDim == 2) { + perm_4d_[0] = tensor_size_.AlignAxis(param->perm_[0]); + perm_4d_[1] = 1; + perm_4d_[2] = 2; + perm_4d_[3] = tensor_size_.AlignAxis(param->perm_[1]); + } else if (tensor_size_.NDim == 3) { + perm_4d_[0] = tensor_size_.AlignAxis(param->perm_[0]); + perm_4d_[1] = 1; + perm_4d_[2] = tensor_size_.AlignAxis(param->perm_[1]); + perm_4d_[3] = tensor_size_.AlignAxis(param->perm_[2]); + } else if (tensor_size_.NDim == 4) { + perm_4d_[0] = tensor_size_.AlignAxis(param->perm_[0]); + perm_4d_[1] = tensor_size_.AlignAxis(param->perm_[1]); + perm_4d_[2] = tensor_size_.AlignAxis(param->perm_[2]); + perm_4d_[3] = tensor_size_.AlignAxis(param->perm_[3]); + } else { + perm_4d_[0] = 0; + perm_4d_[0] = 1; + perm_4d_[0] = 2; + perm_4d_[0] = 3; + } + if (tensor_size_.N == 1 && perm_4d_[0] == 0 && perm_4d_[1] == 3 && perm_4d_[2] == 1 && perm_4d_[3] == 2) { + type_ = TransposeType::AXIS0312; + } else if (tensor_size_.N == 1 && perm_4d_[0] == 0 && perm_4d_[1] == 2 && perm_4d_[2] == 3 && perm_4d_[3] == 1) { + type_ = TransposeType::AXIS0231; + } else { + type_ = TransposeType::GENERAL; + } std::string kernel_name = "transpose"; - if (type == TransposeType::AXIS0312) { + if (type_ == TransposeType::AXIS0312) { kernel_name += "_0312"; - } else if (type == TransposeType::AXIS0231) { + } else if (type_ == TransposeType::AXIS0231) { kernel_name += "_0231"; + } else { + kernel_name += "_general"; } - if (in_tensors_[0]->shape()[2] * UP_DIV(in_tensors_[0]->shape()[3], C4NUM) > MAX_IMAGE2D_SIZE) { + if (in_tensors_[0]->shape().size() == 4 && + in_tensors_[0]->shape()[2] * UP_DIV(in_tensors_[0]->shape()[3], C4NUM) > MAX_IMAGE2D_SIZE) { // just for input kernel_name += "_oversize"; } @@ -80,27 +106,40 @@ int TransposeOpenCLKernel::Prepare() { } void TransposeOpenCLKernel::SetConstArgs() { - std::vector shapex = out_tensors_[0]->shape(); - size_t n = shapex[0]; // n=1 - size_t h = shapex[1]; - size_t w = shapex[2]; - size_t c = shapex[3]; + size_t n = tensor_size_.N; + size_t h = tensor_size_.H; + size_t w = tensor_size_.W; + size_t c = tensor_size_.C; int arg_idx = 2; cl_int4 shape = {static_cast(n), static_cast(h), static_cast(w), static_cast(c)}; ocl_runtime_->SetKernelArg(kernel_, arg_idx++, shape); + if (type_ == TransposeType::GENERAL) { + int de_perm[4]; // output to input perm + for (int i = 0; i < 4; i++) { + de_perm[perm_4d_[i]] = i; + } + cl_int4 de_perm_cl = {de_perm[0], de_perm[1], de_perm[2], de_perm[3]}; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, de_perm_cl); + GpuTensorInfo in_shape = GpuTensorInfo(in_tensors_[0]); + cl_int4 out_shape = {static_cast(in_shape.N), static_cast(in_shape.H), + static_cast(in_shape.W), static_cast(in_shape.C)}; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_shape); + } } void TransposeOpenCLKernel::SetGlobalLocal() { - std::vector shapex = out_tensors_[0]->shape(); - size_t h = shapex[1]; - size_t w = shapex[2]; - size_t c = shapex[3]; + size_t n = tensor_size_.N; + size_t h = tensor_size_.H; + size_t w = tensor_size_.W; + size_t c = tensor_size_.C; size_t c4 = UP_DIV(c, 4); local_size_ = {}; - if (type == TransposeType::AXIS0312) { // NHWC -> NCHW + if (type_ == TransposeType::AXIS0312) { // NHWC -> NCHW global_size_ = {UP_DIV(h, C4NUM), w, c4}; - } else if (type == TransposeType::AXIS0231) { // NCHW -> NHWC + } else if (type_ == TransposeType::AXIS0231) { // NCHW -> NHWC global_size_ = {h, UP_DIV(w, C4NUM), c4}; + } else { // general + global_size_ = {n * h, w, c4}; } AlignGlobalLocal(global_size_, local_size_); } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h index 8b6f3c80d9..cb44101f75 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h @@ -25,7 +25,7 @@ namespace mindspore::kernel { -enum class TransposeType { AXIS0312, AXIS0231 }; +enum class TransposeType { AXIS0312, AXIS0231, GENERAL }; class TransposeOpenCLKernel : public OpenCLKernel { public: @@ -41,7 +41,9 @@ class TransposeOpenCLKernel : public OpenCLKernel { void SetGlobalLocal() override; private: - TransposeType type{TransposeType::AXIS0312}; + TransposeType type_{TransposeType::AXIS0312}; + GpuTensorInfo tensor_size_{GpuTensorInfo(nullptr)}; + int perm_4d_[4]; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h index f155942986..70bbd03b4c 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h +++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h @@ -113,8 +113,8 @@ struct GpuTensorInfo { } int AlignAxis(int oriAxis) const { - if (NDim == 0) { - return 0; + if (NDim == 0 || NDim == 1) { + return 3; } int no_neg_axis = static_cast((oriAxis + NDim) % NDim); if (no_neg_axis == 0) { @@ -231,6 +231,7 @@ class OpenCLKernel : public LiteKernel { return tuning_params; } BaseTuningParameter default_tuning_param = BaseTuningParameter(); + default_tuning_param.local_size = local_size_; tuning_params.push_back(default_tuning_param); std::vector max_work_items = ocl_runtime_->GetWorkItemSize(); size_t max_workgroup_size = ocl_runtime_->GetMaxWorkGroupSize(kernel_); @@ -263,7 +264,7 @@ class OpenCLKernel : public LiteKernel { return tuning_params; } - virtual int AssignTuningParam(const BaseTuningParameter param) { + virtual int AssignTuningParam(const BaseTuningParameter ¶m) { std::vector local_size_tmp = param.local_size; if (local_size_tmp.size() > global_size_.size()) { local_size_tmp = std::vector(local_size_tmp.begin(), local_size_tmp.begin() + global_size_.size()); diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc index 104605962f..7163e447cb 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc @@ -66,4 +66,53 @@ TEST_F(TestOpenCL_Transpose, NCHW2NHWC) { } } +TEST_F(TestOpenCL_Transpose, NHWC2NWHC) { + std::vector input_shape = {1, 2, 3, 4}; + std::vector perm = {0, 2, 1, 3}; + std::vector output_shape; + for (int axis : perm) { + output_shape.push_back(input_shape[axis]); + } + float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}; + float output_data[] = {0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 16, 17, 18, 19, 8, 9, 10, 11, 20, 21, 22, 23}; + + for (auto fp16_enable : {false, true}) { + auto *param = CreateParameter(perm); + TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable); + } +} + +TEST_F(TestOpenCL_Transpose, NWC2CWN) { + std::vector input_shape = {1, 2, 3}; + std::vector perm = {2, 1, 0}; + std::vector output_shape; + for (int axis : perm) { + output_shape.push_back(input_shape[axis]); + } + float input_data[] = {0, 1, 2, 3, 4, 5}; + float output_data[] = {0, 3, 1, 4, 2, 5}; + + for (auto fp16_enable : {false, true}) { + auto *param = CreateParameter(perm); + TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable); + } +} + +TEST_F(TestOpenCL_Transpose, NWC2WNC) { + std::vector input_shape = {2, 3, 5}; + std::vector perm = {1, 0, 2}; + std::vector output_shape; + for (int axis : perm) { + output_shape.push_back(input_shape[axis]); + } + float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29}; + float output_data[] = {0, 1, 2, 3, 4, 15, 16, 17, 18, 19, 5, 6, 7, 8, 9, + 20, 21, 22, 23, 24, 10, 11, 12, 13, 14, 25, 26, 27, 28, 29}; + + for (auto fp16_enable : {false, true}) { + auto *param = CreateParameter(perm); + TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable); + } +} } // namespace mindspore::lite::opencl::test