Merge pull request !7978 from chenzupeng/master-lite
@@ -84,3 +84,94 @@ __kernel void sum_local_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
  }
  WRITE_IMAGE(dst_data, (int2)(X, 0), TO_FLT4(result));
}
__kernel void mean_WC_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
  int X = get_global_id(0);  // H
  if (X >= size.x) {
    return;
  }
  // size = (H, W, C4, C): accumulate over W and every C4 slice for this H row.
  float4 result = (float4)0.f;
  for (int w = 0; w < size.y; w++) {
    for (int c = 0; c < size.z; c++) {
      result += convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + c, X)));
    }
  }
  // Divide by the true element count W * C carried in size.w, not by C4 * 4.
  result /= size.y * size.w;
  FLT4 result2 = (FLT4)(0.f);
  result2.x = dot(TO_FLT4(result), (FLT4)(1.f));  // dot with (1,1,1,1) sums the four channel lanes
  WRITE_IMAGE(dst_data, (int2)(0, X), result2);
}
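For intuition, here is a minimal host-side sketch of the same W x C mean on plain float NHWC data with batch 1; MeanWCReference is a hypothetical name, not part of this PR:

#include <vector>

// Hypothetical reference, not part of the PR: the same W x C mean on plain
// float NHWC data with batch 1, indexed as nhwc[(h * W + w) * C + c].
std::vector<float> MeanWCReference(const std::vector<float> &nhwc, int H, int W, int C) {
  std::vector<float> out(H, 0.0f);  // one scalar per H row, like the (0, X) pixel
  for (int h = 0; h < H; ++h) {
    float sum = 0.0f;
    for (int w = 0; w < W; ++w) {
      for (int c = 0; c < C; ++c) {
        sum += nhwc[(h * W + w) * C + c];
      }
    }
    out[h] = sum / (W * C);  // same divisor as size.y * size.w in the kernel
  }
  return out;
}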
__kernel void mean_WC_local_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
  int X = get_global_id(0);  // H
  int localy = get_local_id(1);
  int localz = get_local_id(2);
  if (X >= size.x) return;
  __local float4 temp[LOCAL_CACHE_THREAD][LOCAL_CACHE_THREAD];
  // Each work-item accumulates a strided subset of the W x C4 plane into its
  // own slot of the local cache.
  temp[localy][localz] = (float4)0.f;
  for (int w = localy; w < size.y; w += LOCAL_CACHE_THREAD) {
    for (int c = localz; c < size.z; c += LOCAL_CACHE_THREAD) {
      temp[localy][localz] += convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + c, X)));
    }
  }
  barrier(CLK_LOCAL_MEM_FENCE);
  // Stage 1: fold each row into column 0.
  if (localz == 0) {
    for (int i = 1; i < LOCAL_CACHE_THREAD; i++) {
      temp[localy][0] += temp[localy][i];
    }
  }
  barrier(CLK_LOCAL_MEM_FENCE);
  // Stage 2: fold column 0 into a single value.
  float4 result = temp[0][0];
  for (int i = 1; i < LOCAL_CACHE_THREAD; i++) {
    result += temp[i][0];
  }
  result /= size.y * size.w;
  FLT4 result2 = (FLT4)(0.f);
  result2.x = dot(TO_FLT4(result), (FLT4)(1.f));
  WRITE_IMAGE(dst_data, (int2)(0, X), result2);
}
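The local variant is a two-stage tile reduction. A CPU sketch of the fold, with a plain array standing in for the __local buffer (hypothetical helper, not in the PR):

#include <array>

constexpr int TILE = 16;  // stands in for LOCAL_CACHE_THREAD

float ReduceTile(std::array<std::array<float, TILE>, TILE> &temp) {
  // Stage 1: the localz == 0 lanes fold each row into column 0.
  for (int y = 0; y < TILE; ++y) {
    for (int i = 1; i < TILE; ++i) {
      temp[y][0] += temp[y][i];
    }
  }
  // Stage 2: fold column 0 into the final scalar (done redundantly by every
  // work-item in the kernel, which is harmless since all write the same pixel).
  float result = temp[0][0];
  for (int i = 1; i < TILE; ++i) {
    result += temp[i][0];
  }
  return result;
}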
__kernel void sum_WC_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
  int X = get_global_id(0);  // H
  if (X >= size.x) {
    return;
  }
  // Note: accumulates in FLT4 (half precision when fp16 is enabled), unlike
  // the mean kernels, which accumulate in float4.
  FLT4 result = (FLT4)0.f;
  for (int w = 0; w < size.y; w++) {
    for (int c = 0; c < size.z; c++) {
      result += READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + c, X));
    }
  }
  FLT4 result2 = (FLT4)(0.f);
  result2.x = dot(TO_FLT4(result), (FLT4)(1.f));
  WRITE_IMAGE(dst_data, (int2)(0, X), result2);
}
__kernel void sum_WC_local_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
  int X = get_global_id(0);  // H
  int localy = get_local_id(1);
  int localz = get_local_id(2);
  if (X >= size.x) return;
  __local float4 temp[LOCAL_CACHE_THREAD][LOCAL_CACHE_THREAD];
  temp[localy][localz] = (float4)0.f;
  for (int w = localy; w < size.y; w += LOCAL_CACHE_THREAD) {
    for (int c = localz; c < size.z; c += LOCAL_CACHE_THREAD) {
      temp[localy][localz] += convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + c, X)));
    }
  }
  barrier(CLK_LOCAL_MEM_FENCE);
  if (localz == 0) {
    for (int i = 1; i < LOCAL_CACHE_THREAD; i++) {
      temp[localy][0] += temp[localy][i];
    }
  }
  barrier(CLK_LOCAL_MEM_FENCE);
  // Same two-stage fold as mean_WC_local_NHWC4, without the final division.
  float4 result = temp[0][0];
  for (int i = 1; i < LOCAL_CACHE_THREAD; i++) {
    result += temp[i][0];
  }
  FLT4 result2 = (FLT4)(0.f);
  result2.x = dot(TO_FLT4(result), (FLT4)(1.f));
  WRITE_IMAGE(dst_data, (int2)(0, X), result2);
}
@@ -51,24 +51,34 @@ int ReduceOpenCLKernel::Init() {
    MS_LOG(ERROR) << "not supported reduce type:" << reduce_param->mode_;
    return RET_PARAM_INVALID;
  }
  if (reduce_param->num_axes_ != 2 || ((reduce_param->axes_[0] != 1 || reduce_param->axes_[1] != 2) &&
                                       (reduce_param->axes_[0] != 2 || reduce_param->axes_[1] != 1))) {
    MS_LOG(ERROR) << "reduce op only support axes HW";
  if (reduce_param->num_axes_ != 2) {
    MS_LOG(ERROR) << "reduce op only support axes=2";
    return RET_PARAM_INVALID;
  }
  bool hw_reduce = (reduce_param->axes_[0] == 1 && reduce_param->axes_[1] == 2) ||
                   (reduce_param->axes_[0] == 2 && reduce_param->axes_[1] == 1);
  wc_reduce_ = (reduce_param->axes_[0] == 2 && reduce_param->axes_[1] == 3) ||
               (reduce_param->axes_[0] == 3 && reduce_param->axes_[1] == 2);
  if (!hw_reduce && !wc_reduce_) {
    MS_LOG(ERROR) << "reduce op only support axis (1,2) or (2,3)";
    return RET_PARAM_INVALID;
  }
  if (wc_reduce_ && reduce_param->keep_dims_ == false) {
    MS_LOG(ERROR) << "reduce axis (2,3) should keep dims";
    return RET_PARAM_INVALID;
  }
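As a reading aid, the accepted axis pairs under the NHWC convention (N = 0, H = 1, W = 2, C = 3) can be sketched with a hypothetical helper, not part of the PR:

#include <algorithm>

// Hypothetical helper: the axis pairs the checks above accept, under the
// NHWC convention N = 0, H = 1, W = 2, C = 3.
enum class ReduceKind { kHW, kWC, kUnsupported };

ReduceKind ClassifyAxes(int a0, int a1) {
  const int lo = std::min(a0, a1);
  const int hi = std::max(a0, a1);
  if (lo == 1 && hi == 2) return ReduceKind::kHW;  // reduce over H and W
  if (lo == 2 && hi == 3) return ReduceKind::kWC;  // reduce over W and C; keep_dims required
  return ReduceKind::kUnsupported;
}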
  std::string kernel_name = reduce_type2str.at(reduce_param->mode_);
  if (in_tensors_[0]->shape()[1] >= LOCAL_CACHE_THREAD || in_tensors_[0]->shape()[2] >= LOCAL_CACHE_THREAD) {
  if (wc_reduce_) {
    kernel_name += "_WC";
  }
  if (in_tensors_[0]->shape()[reduce_param->axes_[0]] >= LOCAL_CACHE_THREAD ||
      in_tensors_[0]->shape()[reduce_param->axes_[1]] >= LOCAL_CACHE_THREAD) {
    use_local_ = true;
    kernel_name += "_local";
  }
  kernel_name += "_NHWC4";
  enable_fp16_ = ocl_runtime_->GetFp16Enable();
  if (in_tensors_[0]->shape().back() != out_tensors_[0]->shape().back()) {
    MS_LOG(ERROR) << "Reduce input channel " << in_tensors_[0]->shape().back() << " should equal output channel "
                  << out_tensors_[0]->shape().back();
    return mindspore::lite::RET_ERROR;
  }
#ifdef PROGRAM_WITH_IL
  kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
#else
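For a concrete trace of the name composition above: a WC mean over large reduced axes selects the mean_WC_local_NHWC4 kernel defined earlier. A sketch, assuming reduce_type2str maps ReduceMean to "mean":

#include <string>

// Assumed mapping: reduce_type2str.at(ReduceMode_ReduceMean) == "mean".
std::string BuildWCLocalMeanName() {
  std::string kernel_name = "mean";
  kernel_name += "_WC";     // wc_reduce_ is true
  kernel_name += "_local";  // a reduced axis is >= LOCAL_CACHE_THREAD (16)
  kernel_name += "_NHWC4";
  return kernel_name;       // "mean_WC_local_NHWC4", defined in reduce.cl above
}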
@@ -109,7 +119,10 @@ int ReduceOpenCLKernel::Run() {
    local = {1, LOCAL_CACHE_THREAD, LOCAL_CACHE_THREAD};
  }
  std::vector<size_t> global = {static_cast<size_t>(c4), 1, 1};
  cl_int4 size = {h, w, c4, 1};
  if (wc_reduce_) {
    global = {static_cast<size_t>(h), 1, 1};
  }
  cl_int4 size = {h, w, c4, c};
  int arg_idx = 0;
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
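The fourth component of size changed from a constant 1 to the real channel count c. A small check of why that matters for the WC mean, assuming the image holds c4 = UP_DIV(c, 4) zero-padded float4 slices:

#include <cstdio>

int main() {
  const int w = 1, c = 6;                     // 6 real channels
  const int c4 = (c + 3) / 4;                 // 2 float4 slices, 2 lanes zero-padded
  const float total = 1 + 2 + 3 + 4 + 5 + 6;  // 21.0f; padded lanes add nothing
  printf("%f\n", total / (w * c));            // 3.500000: divisor w * c, as size.w == c gives
  printf("%f\n", total / (w * c4 * 4));       // 2.625000: the skew if size.w were c4 * 4
  return 0;
}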
@@ -40,6 +40,7 @@ class ReduceOpenCLKernel : public OpenCLKernel {
  bool enable_fp16_{false};
  std::vector<size_t> nhwc_shape_;
  bool use_local_{false};
  bool wc_reduce_{false};
  static const size_t LOCAL_CACHE_THREAD{16};
};
}  // namespace mindspore::kernel
@@ -79,6 +79,7 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_tensors,
    new_tensor = nullptr;
    return RET_ERROR;
  }
  parameter->op_parameter.type_ = mindspore::schema::PrimitiveType_ToFormat;
  parameter->src_format = src_format;
  parameter->dst_format = dst_format;
  parameter->out_mem_type = mem_type;
@@ -30,7 +30,7 @@ class TestReduceOpenCL : public mindspore::CommonTest {
};

void RunTestCaseReduce(const std::vector<int> &shape, void *input_data, void *output_data, bool enable_fp16,
                       int reduce_mode) {
                       int reduce_mode, bool WC = false) {
  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
  ocl_runtime->Init();
  size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float);
@@ -43,6 +43,11 @@ void RunTestCaseReduce(const std::vector<int> &shape, void *input_data, void *output_data,
  }
  param->axes_[0] = 1;
  param->axes_[1] = 2;
  if (WC) {
    param->axes_[0] = 2;
    param->axes_[1] = 3;
    param->keep_dims_ = true;
  }
  param->num_axes_ = 2;
  param->mode_ = reduce_mode;
  int n = shape[0];
@@ -58,8 +63,11 @@ void RunTestCaseReduce(const std::vector<int> &shape, void *input_data, void *output_data,
    return;
  }
  std::vector<int> out_shape = {n, c};
  if (WC) {
    out_shape = {n, h, 1, 1};
  }
  auto tensor_out_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
                                                       out_shape, schema::Format_NC);
                                                       out_shape, WC ? schema::Format_NHWC : schema::Format_NC);
  auto tensor_out = tensor_out_ptr.get();
  if (tensor_out == nullptr) {
    MS_LOG(ERROR) << "tensor_out create error.";
@@ -152,4 +160,28 @@ TEST_F(TestReduceOpenCL, ReduceSumFp16) {
  RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceSum);
}

TEST_F(TestReduceOpenCL, ReduceMeanWCFp32) {
  int n = 1;
  int h = 3;
  int w = 2;
  int c = 2;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
  std::vector<float> output_data = {1.5f, 5.5f, 9.5f};
  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMean, true);
}

TEST_F(TestReduceOpenCL, ReduceSumWCFp32) {
  int n = 1;
  int h = 3;
  int w = 2;
  int c = 2;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
  std::vector<float> output_data = {6.0f, 22.0f, 38.0f};
  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceSum, true);
}
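As a hand check of the expected values in both tests, each H row of the {1, 3, 2, 2} input spans W * C = 4 consecutive elements; a standalone sketch, not part of the test file:

#include <cassert>

int main() {
  const float expected_sum[3] = {6.0f, 22.0f, 38.0f};
  const float expected_mean[3] = {1.5f, 5.5f, 9.5f};
  for (int h = 0; h < 3; ++h) {
    float sum = 0.0f;
    for (int i = 0; i < 4; ++i) {
      sum += static_cast<float>(h * 4 + i);  // inputs 0..11 in NHWC order
    }
    assert(sum == expected_sum[h]);          // ReduceSumWCFp32 expectations
    assert(sum / 4.0f == expected_mean[h]);  // ReduceMeanWCFp32 expectations
  }
  return 0;
}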
}  // namespace mindspore