Merge pull request !7978 from chenzupeng/master-lite (tags/v1.1.0)
@@ -84,3 +84,94 @@ __kernel void sum_local_NHWC4(__read_only image2d_t src_data, __write_only image
   }
   WRITE_IMAGE(dst_data, (int2)(X, 0), TO_FLT4(result));
 }
+__kernel void mean_WC_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
+  int X = get_global_id(0);  // H
+  if (X >= size.x) {
+    return;
+  }
+  float4 result = (float4)0.f;
+  for (int w = 0; w < size.y; w++) {
+    for (int c = 0; c < size.z; c++) {
+      result += convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + c, X)));
+    }
+  }
+  result /= size.y * size.w;
+  FLT4 result2 = (FLT4)(0.f);
+  result2.x = dot(TO_FLT4(result), (FLT4)(1.f));
+  WRITE_IMAGE(dst_data, (int2)(0, X), result2);
+}
+__kernel void mean_WC_local_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
+  int X = get_global_id(0);  // H
+  int localy = get_local_id(1);
+  int localz = get_local_id(2);
+  if (X >= size.x) return;
+  __local float4 temp[LOCAL_CACHE_THREAD][LOCAL_CACHE_THREAD];
+  temp[localy][localz] = (float4)0.f;
+  for (int w = localy; w < size.y; w += LOCAL_CACHE_THREAD) {
+    for (int c = localz; c < size.z; c += LOCAL_CACHE_THREAD) {
+      temp[localy][localz] += convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + c, X)));
+    }
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  if (localz == 0) {
+    for (int i = 1; i < LOCAL_CACHE_THREAD; i++) {
+      temp[localy][0] += temp[localy][i];
+    }
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  float4 result = temp[0][0];
+  for (int i = 1; i < LOCAL_CACHE_THREAD; i++) {
+    result += temp[i][0];
+  }
+  result /= size.y * size.w;
+  FLT4 result2 = (FLT4)(0.f);
+  result2.x = dot(TO_FLT4(result), (FLT4)(1.f));
+  WRITE_IMAGE(dst_data, (int2)(0, X), result2);
+}
+__kernel void sum_WC_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
+  int X = get_global_id(0);  // H
+  if (X >= size.x) {
+    return;
+  }
+  FLT4 result = (FLT4)0.f;
+  for (int w = 0; w < size.y; w++) {
+    for (int c = 0; c < size.z; c++) {
+      result += READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + c, X));
+    }
+  }
+  FLT4 result2 = (FLT4)(0.f);
+  result2.x = dot(TO_FLT4(result), (FLT4)(1.f));
+  WRITE_IMAGE(dst_data, (int2)(0, X), result2);
+}
+__kernel void sum_WC_local_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
+  int X = get_global_id(0);  // H
+  int localy = get_local_id(1);
+  int localz = get_local_id(2);
+  if (X >= size.x) return;
+  __local float4 temp[LOCAL_CACHE_THREAD][LOCAL_CACHE_THREAD];
+  temp[localy][localz] = (float4)0.f;
+  for (int w = localy; w < size.y; w += LOCAL_CACHE_THREAD) {
+    for (int c = localz; c < size.z; c += LOCAL_CACHE_THREAD) {
+      temp[localy][localz] += convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + c, X)));
+    }
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  if (localz == 0) {
+    for (int i = 1; i < LOCAL_CACHE_THREAD; i++) {
+      temp[localy][0] += temp[localy][i];
+    }
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  float4 result = temp[0][0];
+  for (int i = 1; i < LOCAL_CACHE_THREAD; i++) {
+    result += temp[i][0];
+  }
+  FLT4 result2 = (FLT4)(0.f);
+  result2.x = dot(TO_FLT4(result), (FLT4)(1.f));
+  WRITE_IMAGE(dst_data, (int2)(0, X), result2);
+}
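For readers checking the new kernels, a minimal host-side sketch of what the WC reduce computes (illustrative only, not part of the patch; the function name and dense-NHWC layout with n == 1 are assumptions): every H row is accumulated over its W and C elements, and the mean variant divides by w * c, mirroring result /= size.y * size.w above.

// Illustrative CPU reference for the WC reduce kernels (hypothetical helper).
#include <vector>

std::vector<float> ReduceWC(const std::vector<float> &src, int h, int w, int c, bool mean) {
  std::vector<float> dst(h, 0.0f);
  for (int y = 0; y < h; ++y) {
    // Accumulate every element of row y across the W and C axes.
    for (int x = 0; x < w; ++x) {
      for (int k = 0; k < c; ++k) {
        dst[y] += src[(y * w + x) * c + k];
      }
    }
    if (mean) {
      dst[y] /= static_cast<float>(w * c);  // mirrors result /= size.y * size.w
    }
  }
  return dst;
}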
@@ -51,24 +51,34 @@ int ReduceOpenCLKernel::Init() {
     MS_LOG(ERROR) << "not supported reduce type:" << reduce_param->mode_;
     return RET_PARAM_INVALID;
   }
-  if (reduce_param->num_axes_ != 2 || ((reduce_param->axes_[0] != 1 || reduce_param->axes_[1] != 2) &&
-                                       (reduce_param->axes_[0] != 2 || reduce_param->axes_[1] != 1))) {
-    MS_LOG(ERROR) << "reduce op only support axes HW";
+  if (reduce_param->num_axes_ != 2) {
+    MS_LOG(ERROR) << "reduce op only support axes=2";
+    return RET_PARAM_INVALID;
+  }
+  bool hw_reduce = (reduce_param->axes_[0] == 1 && reduce_param->axes_[1] == 2) ||
+                   (reduce_param->axes_[0] == 2 && reduce_param->axes_[1] == 1);
+  wc_reduce_ = (reduce_param->axes_[0] == 2 && reduce_param->axes_[1] == 3) ||
+               (reduce_param->axes_[0] == 3 && reduce_param->axes_[1] == 2);
+  if (!hw_reduce && !wc_reduce_) {
+    MS_LOG(ERROR) << "reduce op only support axis (1,2) or (2,3)";
+    return RET_PARAM_INVALID;
+  }
+  if (wc_reduce_ && reduce_param->keep_dims_ == false) {
+    MS_LOG(ERROR) << "reduce axis (2,3) should keep dims";
     return RET_PARAM_INVALID;
   }
   std::string kernel_name = reduce_type2str.at(reduce_param->mode_);
-  if (in_tensors_[0]->shape()[1] >= LOCAL_CACHE_THREAD || in_tensors_[0]->shape()[2] >= LOCAL_CACHE_THREAD) {
+  if (wc_reduce_) {
+    kernel_name += "_WC";
+  }
+  if (in_tensors_[0]->shape()[reduce_param->axes_[0]] >= LOCAL_CACHE_THREAD ||
+      in_tensors_[0]->shape()[reduce_param->axes_[1]] >= LOCAL_CACHE_THREAD) {
     use_local_ = true;
     kernel_name += "_local";
   }
   kernel_name += "_NHWC4";
   enable_fp16_ = ocl_runtime_->GetFp16Enable();
+  if (in_tensors_[0]->shape().back() != out_tensors_[0]->shape().back()) {
+    MS_LOG(ERROR) << "Reduce input channel " << in_tensors_[0]->shape().back() << " should equal output channel "
+                  << out_tensors_[0]->shape().back();
+    return mindspore::lite::RET_ERROR;
+  }
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
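Taken together, the Init() changes assemble the kernel name from the reduce mode plus optional suffixes: for example, a mean reduce over axes (2,3) whose reduced dimensions reach LOCAL_CACHE_THREAD resolves to mean_WC_local_NHWC4, one of the kernels added above.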
@@ -109,7 +119,10 @@ int ReduceOpenCLKernel::Run() {
     local = {1, LOCAL_CACHE_THREAD, LOCAL_CACHE_THREAD};
   }
   std::vector<size_t> global = {static_cast<size_t>(c4), 1, 1};
-  cl_int4 size = {h, w, c4, 1};
+  if (wc_reduce_) {
+    global = {static_cast<size_t>(h), 1, 1};
+  }
+  cl_int4 size = {h, w, c4, c};
   int arg_idx = 0;
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
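Two Run() adjustments support the WC path: global shrinks to one work-item per H row, and the fourth component of size now carries the real channel count c, which the mean kernels use in the divisor (size.y * size.w) so the average is taken over the w * c original elements rather than over padded C4 slots.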
@@ -40,6 +40,7 @@ class ReduceOpenCLKernel : public OpenCLKernel {
   bool enable_fp16_{false};
   std::vector<size_t> nhwc_shape_;
   bool use_local_{false};
+  bool wc_reduce_{false};
   static const size_t LOCAL_CACHE_THREAD{16};
 };
 }  // namespace mindspore::kernel
@@ -79,6 +79,7 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_te
     new_tensor = nullptr;
     return RET_ERROR;
   }
+  parameter->op_parameter.type_ = mindspore::schema::PrimitiveType_ToFormat;
   parameter->src_format = src_format;
   parameter->dst_format = dst_format;
   parameter->out_mem_type = mem_type;
@@ -30,7 +30,7 @@ class TestReduceOpenCL : public mindspore::CommonTest {
 };
 void RunTestCaseReduce(const std::vector<int> &shape, void *input_data, void *output_data, bool enable_fp16,
-                       int reduce_mode) {
+                       int reduce_mode, bool WC = false) {
   auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
   ocl_runtime->Init();
   size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float);
@@ -43,6 +43,11 @@ void RunTestCaseReduce(const std::vector<int> &shape, void *input_data, void *ou
   }
   param->axes_[0] = 1;
   param->axes_[1] = 2;
+  if (WC) {
+    param->axes_[0] = 2;
+    param->axes_[1] = 3;
+    param->keep_dims_ = true;
+  }
   param->num_axes_ = 2;
   param->mode_ = reduce_mode;
   int n = shape[0];
@@ -58,8 +63,11 @@ void RunTestCaseReduce(const std::vector<int> &shape, void *input_data, void *ou
     return;
   }
   std::vector<int> out_shape = {n, c};
+  if (WC) {
+    out_shape = {n, h, 1, 1};
+  }
   auto tensor_out_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
-                                                       out_shape, schema::Format_NC);
+                                                       out_shape, WC ? schema::Format_NHWC : schema::Format_NC);
   auto tensor_out = tensor_out_ptr.get();
   if (tensor_out == nullptr) {
     MS_LOG(ERROR) << "tensor_out create error.";
@@ -152,4 +160,28 @@ TEST_F(TestReduceOpenCL, ReduceSumFp16) {
   RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceSum);
 }
+TEST_F(TestReduceOpenCL, ReduceMeanWCFp32) {
+  int n = 1;
+  int h = 3;
+  int w = 2;
+  int c = 2;
+  std::vector<int> shape = {n, h, w, c};
+  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
+  std::vector<float> output_data = {1.5f, 5.5f, 9.5f};
+  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMean, true);
+}
+TEST_F(TestReduceOpenCL, ReduceSumWCFp32) {
+  int n = 1;
+  int h = 3;
+  int w = 2;
+  int c = 2;
+  std::vector<int> shape = {n, h, w, c};
+  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
+  std::vector<float> output_data = {6.0f, 22.0f, 38.0f};
+  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceSum, true);
+}
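The expected outputs can be checked by hand: each H row holds w * c = 4 consecutive values, so row 0 = {0, 1, 2, 3} gives sum 6 and mean 1.5, row 1 = {4, 5, 6, 7} gives 22 and 5.5, and row 2 = {8, 9, 10, 11} gives 38 and 9.5.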
}  // namespace mindspore