From: @chenzupeng
Reviewed-by: @ddwsky, @zhang_xue_tong
Signed-off-by: @ddwsky
Tags: v1.1.0
| @@ -189,3 +189,37 @@ __kernel void transpose_0231_NC4HW4(__read_only image2d_t src_data, __write_only | |||||
| WRITE_IMAGE(dst_data, (int2)(4 * Y + 3, Z * shape.y + X), dst3); | WRITE_IMAGE(dst_data, (int2)(4 * Y + 3, Z * shape.y + X), dst3); | ||||
| } | } | ||||
| } | } | ||||
// Reinterpret a FLT4 vector as four scalar lanes so that individual channel
// values can be gathered when the output channel order differs from the input.
typedef union FLT4_array {
  FLT c_array[4];
  FLT4 vector;
} FLT4_array;

// General transpose for NHWC4-layout image tensors.
// Each work-item produces one FLT4 (four consecutive output channels) at the
// output position (N, H, W) = (X / out_shape.y, X % out_shape.y, Y).
// de_perm maps each input axis to the output axis that supplies its
// coordinate (the inverse of the forward permutation) — see SetConstArgs.
__kernel void transpose_general_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 out_shape,
                                      int4 de_perm, int4 in_shape) {
  int X = get_global_id(0);  // N * H
  int Y = get_global_id(1);  // W
  int Z = get_global_id(2);  // C / 4
  if (X >= out_shape.x * out_shape.y || Y >= out_shape.z || 4 * Z >= out_shape.w) {
    return;
  }
  int N = X / out_shape.y;
  int H = X % out_shape.y;
  int CI4_SIZE = UP_DIV(in_shape.w, 4);
  FLT4_array result_tmp;
  result_tmp.vector = (FLT4)(0.f);
  for (int i = 0; i < 4; i++) {
    if (Z * 4 + i < out_shape.w) {
      // Output coordinate of this channel lane; de_perm routes each input
      // axis back to the output axis holding its coordinate.
      int out_index[4] = {N, H, Y, Z * 4 + i};
      FLT4 src = READ_IMAGE(src_data, smp_zero,
                            (int2)(out_index[de_perm.z] * CI4_SIZE + out_index[de_perm.w] / 4,
                                   out_index[de_perm.x] * in_shape.y + out_index[de_perm.y]));
      FLT4_array src_tmp;
      src_tmp.vector = src;
      result_tmp.c_array[i] = src_tmp.c_array[out_index[de_perm.w] % 4];
    }
  }
  // BUGFIX: the destination image row is padded by the OUTPUT channel count,
  // not the input one; UP_DIV(in_shape.w, 4) mis-addresses the write when the
  // aligned input/output channel dimensions differ.
  int CO4_SIZE = UP_DIV(out_shape.w, 4);
  WRITE_IMAGE(dst_data, (int2)(Y * CO4_SIZE + Z, X), result_tmp.vector);
}
| @@ -34,32 +34,58 @@ using mindspore::schema::PrimitiveType_Transpose; | |||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| int TransposeOpenCLKernel::CheckSpecs() { | int TransposeOpenCLKernel::CheckSpecs() { | ||||
| auto param = reinterpret_cast<TransposeParameter *>(op_parameter_); | |||||
| if (in_tensors_[0]->shape().size() != 4 || in_tensors_[0]->shape()[0] > 1) { | |||||
| MS_LOG(ERROR) << "Transpose only support 4d tensor and n = 1 yet."; | |||||
| return mindspore::lite::RET_ERROR; | |||||
| if ((in_tensors_.size() != 1 && in_tensors_.size() != 2) || out_tensors_.size() != 1) { | |||||
| MS_LOG(ERROR) << "Transpose input output size unsupported."; | |||||
| return RET_ERROR; | |||||
| } | } | ||||
| if (param->num_axes_ == 4 && param->perm_[0] == 0 && param->perm_[1] == 3 && param->perm_[2] == 1 && | |||||
| param->perm_[3] == 2) { | |||||
| type = TransposeType::AXIS0312; | |||||
| } else if (param->num_axes_ == 4 && param->perm_[0] == 0 && param->perm_[1] == 2 && param->perm_[2] == 3 && | |||||
| param->perm_[3] == 1) { | |||||
| type = TransposeType::AXIS0231; | |||||
| } else { | |||||
| MS_LOG(ERROR) << "unsupported transpose axes."; | |||||
| return mindspore::lite::RET_ERROR; | |||||
| tensor_size_ = GpuTensorInfo(out_tensors_[0]); | |||||
| if (tensor_size_.NDim > 4) { | |||||
| MS_LOG(ERROR) << "Transpose don't support 5d tensor or higher."; | |||||
| return RET_ERROR; | |||||
| } | } | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| int TransposeOpenCLKernel::Prepare() { | int TransposeOpenCLKernel::Prepare() { | ||||
| auto param = reinterpret_cast<TransposeParameter *>(op_parameter_); | |||||
| if (tensor_size_.NDim == 2) { | |||||
| perm_4d_[0] = tensor_size_.AlignAxis(param->perm_[0]); | |||||
| perm_4d_[1] = 1; | |||||
| perm_4d_[2] = 2; | |||||
| perm_4d_[3] = tensor_size_.AlignAxis(param->perm_[1]); | |||||
| } else if (tensor_size_.NDim == 3) { | |||||
| perm_4d_[0] = tensor_size_.AlignAxis(param->perm_[0]); | |||||
| perm_4d_[1] = 1; | |||||
| perm_4d_[2] = tensor_size_.AlignAxis(param->perm_[1]); | |||||
| perm_4d_[3] = tensor_size_.AlignAxis(param->perm_[2]); | |||||
| } else if (tensor_size_.NDim == 4) { | |||||
| perm_4d_[0] = tensor_size_.AlignAxis(param->perm_[0]); | |||||
| perm_4d_[1] = tensor_size_.AlignAxis(param->perm_[1]); | |||||
| perm_4d_[2] = tensor_size_.AlignAxis(param->perm_[2]); | |||||
| perm_4d_[3] = tensor_size_.AlignAxis(param->perm_[3]); | |||||
} else {
    // BUGFIX: the original assigned perm_4d_[0] four times, leaving elements
    // 1..3 uninitialized. Fill the identity permutation across all slots for
    // the 0-/1-D (aligned) case.
    perm_4d_[0] = 0;
    perm_4d_[1] = 1;
    perm_4d_[2] = 2;
    perm_4d_[3] = 3;
  }
| if (tensor_size_.N == 1 && perm_4d_[0] == 0 && perm_4d_[1] == 3 && perm_4d_[2] == 1 && perm_4d_[3] == 2) { | |||||
| type_ = TransposeType::AXIS0312; | |||||
| } else if (tensor_size_.N == 1 && perm_4d_[0] == 0 && perm_4d_[1] == 2 && perm_4d_[2] == 3 && perm_4d_[3] == 1) { | |||||
| type_ = TransposeType::AXIS0231; | |||||
| } else { | |||||
| type_ = TransposeType::GENERAL; | |||||
| } | |||||
| std::string kernel_name = "transpose"; | std::string kernel_name = "transpose"; | ||||
| if (type == TransposeType::AXIS0312) { | |||||
| if (type_ == TransposeType::AXIS0312) { | |||||
| kernel_name += "_0312"; | kernel_name += "_0312"; | ||||
| } else if (type == TransposeType::AXIS0231) { | |||||
| } else if (type_ == TransposeType::AXIS0231) { | |||||
| kernel_name += "_0231"; | kernel_name += "_0231"; | ||||
| } else { | |||||
| kernel_name += "_general"; | |||||
| } | } | ||||
| if (in_tensors_[0]->shape()[2] * UP_DIV(in_tensors_[0]->shape()[3], C4NUM) > MAX_IMAGE2D_SIZE) { | |||||
| if (in_tensors_[0]->shape().size() == 4 && | |||||
| in_tensors_[0]->shape()[2] * UP_DIV(in_tensors_[0]->shape()[3], C4NUM) > MAX_IMAGE2D_SIZE) { | |||||
| // just for input | // just for input | ||||
| kernel_name += "_oversize"; | kernel_name += "_oversize"; | ||||
| } | } | ||||
| @@ -80,27 +106,40 @@ int TransposeOpenCLKernel::Prepare() { | |||||
| } | } | ||||
| void TransposeOpenCLKernel::SetConstArgs() { | void TransposeOpenCLKernel::SetConstArgs() { | ||||
| std::vector<int> shapex = out_tensors_[0]->shape(); | |||||
| size_t n = shapex[0]; // n=1 | |||||
| size_t h = shapex[1]; | |||||
| size_t w = shapex[2]; | |||||
| size_t c = shapex[3]; | |||||
| size_t n = tensor_size_.N; | |||||
| size_t h = tensor_size_.H; | |||||
| size_t w = tensor_size_.W; | |||||
| size_t c = tensor_size_.C; | |||||
| int arg_idx = 2; | int arg_idx = 2; | ||||
| cl_int4 shape = {static_cast<int>(n), static_cast<int>(h), static_cast<int>(w), static_cast<int>(c)}; | cl_int4 shape = {static_cast<int>(n), static_cast<int>(h), static_cast<int>(w), static_cast<int>(c)}; | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, shape); | ocl_runtime_->SetKernelArg(kernel_, arg_idx++, shape); | ||||
| if (type_ == TransposeType::GENERAL) { | |||||
| int de_perm[4]; // output to input perm | |||||
| for (int i = 0; i < 4; i++) { | |||||
| de_perm[perm_4d_[i]] = i; | |||||
| } | |||||
| cl_int4 de_perm_cl = {de_perm[0], de_perm[1], de_perm[2], de_perm[3]}; | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, de_perm_cl); | |||||
| GpuTensorInfo in_shape = GpuTensorInfo(in_tensors_[0]); | |||||
| cl_int4 out_shape = {static_cast<cl_int>(in_shape.N), static_cast<cl_int>(in_shape.H), | |||||
| static_cast<cl_int>(in_shape.W), static_cast<cl_int>(in_shape.C)}; | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_shape); | |||||
| } | |||||
| } | } | ||||
| void TransposeOpenCLKernel::SetGlobalLocal() { | void TransposeOpenCLKernel::SetGlobalLocal() { | ||||
| std::vector<int> shapex = out_tensors_[0]->shape(); | |||||
| size_t h = shapex[1]; | |||||
| size_t w = shapex[2]; | |||||
| size_t c = shapex[3]; | |||||
| size_t n = tensor_size_.N; | |||||
| size_t h = tensor_size_.H; | |||||
| size_t w = tensor_size_.W; | |||||
| size_t c = tensor_size_.C; | |||||
| size_t c4 = UP_DIV(c, 4); | size_t c4 = UP_DIV(c, 4); | ||||
| local_size_ = {}; | local_size_ = {}; | ||||
| if (type == TransposeType::AXIS0312) { // NHWC -> NCHW | |||||
| if (type_ == TransposeType::AXIS0312) { // NHWC -> NCHW | |||||
| global_size_ = {UP_DIV(h, C4NUM), w, c4}; | global_size_ = {UP_DIV(h, C4NUM), w, c4}; | ||||
| } else if (type == TransposeType::AXIS0231) { // NCHW -> NHWC | |||||
| } else if (type_ == TransposeType::AXIS0231) { // NCHW -> NHWC | |||||
| global_size_ = {h, UP_DIV(w, C4NUM), c4}; | global_size_ = {h, UP_DIV(w, C4NUM), c4}; | ||||
| } else { // general | |||||
| global_size_ = {n * h, w, c4}; | |||||
| } | } | ||||
| AlignGlobalLocal(global_size_, local_size_); | AlignGlobalLocal(global_size_, local_size_); | ||||
| } | } | ||||
| @@ -25,7 +25,7 @@ | |||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
// Kernel dispatch selector: two fused fast paths plus a GENERAL fallback
// that handles any remaining 4D-aligned permutation.
enum class TransposeType { AXIS0312, AXIS0231, GENERAL };
| class TransposeOpenCLKernel : public OpenCLKernel { | class TransposeOpenCLKernel : public OpenCLKernel { | ||||
| public: | public: | ||||
| @@ -41,7 +41,9 @@ class TransposeOpenCLKernel : public OpenCLKernel { | |||||
| void SetGlobalLocal() override; | void SetGlobalLocal() override; | ||||
| private: | private: | ||||
| TransposeType type{TransposeType::AXIS0312}; | |||||
| TransposeType type_{TransposeType::AXIS0312}; | |||||
| GpuTensorInfo tensor_size_{GpuTensorInfo(nullptr)}; | |||||
| int perm_4d_[4]; | |||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -113,8 +113,8 @@ struct GpuTensorInfo { | |||||
| } | } | ||||
| int AlignAxis(int oriAxis) const { | int AlignAxis(int oriAxis) const { | ||||
| if (NDim == 0) { | |||||
| return 0; | |||||
| if (NDim == 0 || NDim == 1) { | |||||
| return 3; | |||||
| } | } | ||||
| int no_neg_axis = static_cast<int>((oriAxis + NDim) % NDim); | int no_neg_axis = static_cast<int>((oriAxis + NDim) % NDim); | ||||
| if (no_neg_axis == 0) { | if (no_neg_axis == 0) { | ||||
| @@ -231,6 +231,7 @@ class OpenCLKernel : public LiteKernel { | |||||
| return tuning_params; | return tuning_params; | ||||
| } | } | ||||
| BaseTuningParameter default_tuning_param = BaseTuningParameter(); | BaseTuningParameter default_tuning_param = BaseTuningParameter(); | ||||
| default_tuning_param.local_size = local_size_; | |||||
| tuning_params.push_back(default_tuning_param); | tuning_params.push_back(default_tuning_param); | ||||
| std::vector<size_t> max_work_items = ocl_runtime_->GetWorkItemSize(); | std::vector<size_t> max_work_items = ocl_runtime_->GetWorkItemSize(); | ||||
| size_t max_workgroup_size = ocl_runtime_->GetMaxWorkGroupSize(kernel_); | size_t max_workgroup_size = ocl_runtime_->GetMaxWorkGroupSize(kernel_); | ||||
| @@ -263,7 +264,7 @@ class OpenCLKernel : public LiteKernel { | |||||
| return tuning_params; | return tuning_params; | ||||
| } | } | ||||
| virtual int AssignTuningParam(const BaseTuningParameter param) { | |||||
| virtual int AssignTuningParam(const BaseTuningParameter ¶m) { | |||||
| std::vector<size_t> local_size_tmp = param.local_size; | std::vector<size_t> local_size_tmp = param.local_size; | ||||
| if (local_size_tmp.size() > global_size_.size()) { | if (local_size_tmp.size() > global_size_.size()) { | ||||
| local_size_tmp = std::vector<size_t>(local_size_tmp.begin(), local_size_tmp.begin() + global_size_.size()); | local_size_tmp = std::vector<size_t>(local_size_tmp.begin(), local_size_tmp.begin() + global_size_.size()); | ||||
| @@ -66,4 +66,53 @@ TEST_F(TestOpenCL_Transpose, NCHW2NHWC) { | |||||
| } | } | ||||
| } | } | ||||
| TEST_F(TestOpenCL_Transpose, NHWC2NWHC) { | |||||
| std::vector<int> input_shape = {1, 2, 3, 4}; | |||||
| std::vector<int> perm = {0, 2, 1, 3}; | |||||
| std::vector<int> output_shape; | |||||
| for (int axis : perm) { | |||||
| output_shape.push_back(input_shape[axis]); | |||||
| } | |||||
| float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}; | |||||
| float output_data[] = {0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 16, 17, 18, 19, 8, 9, 10, 11, 20, 21, 22, 23}; | |||||
| for (auto fp16_enable : {false, true}) { | |||||
| auto *param = CreateParameter(perm); | |||||
| TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable); | |||||
| } | |||||
| } | |||||
| TEST_F(TestOpenCL_Transpose, NWC2CWN) { | |||||
| std::vector<int> input_shape = {1, 2, 3}; | |||||
| std::vector<int> perm = {2, 1, 0}; | |||||
| std::vector<int> output_shape; | |||||
| for (int axis : perm) { | |||||
| output_shape.push_back(input_shape[axis]); | |||||
| } | |||||
| float input_data[] = {0, 1, 2, 3, 4, 5}; | |||||
| float output_data[] = {0, 3, 1, 4, 2, 5}; | |||||
| for (auto fp16_enable : {false, true}) { | |||||
| auto *param = CreateParameter(perm); | |||||
| TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable); | |||||
| } | |||||
| } | |||||
| TEST_F(TestOpenCL_Transpose, NWC2WNC) { | |||||
| std::vector<int> input_shape = {2, 3, 5}; | |||||
| std::vector<int> perm = {1, 0, 2}; | |||||
| std::vector<int> output_shape; | |||||
| for (int axis : perm) { | |||||
| output_shape.push_back(input_shape[axis]); | |||||
| } | |||||
| float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, | |||||
| 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29}; | |||||
| float output_data[] = {0, 1, 2, 3, 4, 15, 16, 17, 18, 19, 5, 6, 7, 8, 9, | |||||
| 20, 21, 22, 23, 24, 10, 11, 12, 13, 14, 25, 26, 27, 28, 29}; | |||||
| for (auto fp16_enable : {false, true}) { | |||||
| auto *param = CreateParameter(perm); | |||||
| TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable); | |||||
| } | |||||
| } | |||||
| } // namespace mindspore::lite::opencl::test | } // namespace mindspore::lite::opencl::test | ||||