Merge pull request !7890 from chenzupeng/master-lite (tags/v1.1.0)
| @@ -2,46 +2,14 @@ | |||
| #pragma OPENCL EXTENSION cl_khr_fp16 : enable | |||
| #endif | |||
| #define divide_no_check(a, b) (a / b) | |||
// Average pooling over NHWC4 buffers: one work-item per output element.
// X = output height index, Y = output width index, Z = channel-slice (C4) index.
__kernel void AvgPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, const int4 input_shape,
                               const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
  // axis to dst tensor coordinate
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  // boundary check
  if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
    return;
  }
  FLT4 r = (FLT4)(0.0f);
  FLT window_size = 0.0f;
  // top-left corner of the pooling window in input coordinates (may be
  // negative because of padding)
  int xs = X * stride.x - padding.x;
  int ys = Y * stride.y - padding.y;
  for (int kx = 0; kx < kernel_size.x; ++kx) {
    int x_c = xs + kx;
    bool outside_x = x_c < 0 || x_c >= input_shape.x;
    for (int ky = 0; ky < kernel_size.y; ++ky) {
      int y_c = ys + ky;
      bool outside = outside_x || y_c < 0 || y_c >= input_shape.y;
      // BUGFIX: address the input buffer with its own channel-slice stride
      // (input_shape.w), not output_shape.w. The two are equal for pooling,
      // but the input layout must be indexed by the input shape — this now
      // matches MaxPooling2d_BUF below.
      r += !outside ? input[(input_shape.y * x_c + y_c) * input_shape.w + Z] : (FLT4)(0.0f);
      // only in-bounds taps contribute to the averaging denominator
      window_size += !outside ? 1.0f : 0.0f;
    }
  }
  // Use divide_no_check for consistency with the image-based averaging
  // kernels; window_size > 0 whenever the window overlaps the input.
  FLT4 result = TO_FLT4(divide_no_check(r, window_size));
  output[(output_shape.y * X + Y) * output_shape.w + Z] = result;
}
| __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | |||
| __kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, | |||
| const int4 output_shape, const int2 stride, const int2 kernel_size, | |||
| const int2 padding) { | |||
| // axis to dst tensor coordinate | |||
| int X = get_global_id(0); | |||
| int X = get_global_id(2); | |||
| int Y = get_global_id(1); | |||
| int Z = get_global_id(2); | |||
| int Z = get_global_id(0); | |||
| // boundary check | |||
| if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { | |||
| @@ -66,35 +34,3 @@ __kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only i | |||
| FLT4 result = TO_FLT4(divide_no_check(r, window_size)); | |||
| WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), result); | |||
| } | |||
// Average pooling over NC4HW4 images: one work-item per output element.
// X = output height index, Y = output width index, Z = channel-slice index.
__kernel void AvgPooling2d_NC4HW4_IMG(__read_only image2d_t input, __write_only image2d_t output,
                                      const int4 input_shape, const int4 output_shape, const int2 stride,
                                      const int2 kernel_size, const int2 padding) {
  // axis to dst tensor coordinate
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  // drop work-items that fall outside the output tensor
  if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
    return;
  }
  FLT4 acc = (FLT4)(0.0f);
  FLT tap_count = 0.0f;
  // window origin in input coordinates (may be negative because of padding)
  const int x0 = X * stride.x - padding.x;
  const int y0 = Y * stride.y - padding.y;
  for (int ky = 0; ky < kernel_size.y; ++ky) {
    const int iy = y0 + ky;
    const bool row_outside = iy < 0 || iy >= input_shape.y;
    for (int kx = 0; kx < kernel_size.x; ++kx) {
      const int ix = x0 + kx;
      if (row_outside || ix < 0 || ix >= input_shape.x) {
        continue;  // padded taps contribute neither to the sum nor the count
      }
      acc += READ_IMAGE(input, smp_zero, (int2)(iy, Z * input_shape.x + ix));
      tap_count += 1.0f;
    }
  }
  // average over in-bounds taps only
  FLT4 result = TO_FLT4(divide_no_check(acc, tap_count));
  WRITE_IMAGE(output, (int2)(Y, Z * output_shape.x + X), result);
}
| @@ -1,48 +1,14 @@ | |||
| #ifdef cl_khr_fp16 | |||
| #pragma OPENCL EXTENSION cl_khr_fp16 : enable | |||
| #endif | |||
// Max pooling over NHWC4 buffers: one work-item per output element.
// X = output height index, Y = output width index, Z = channel-slice (C4) index.
__kernel void MaxPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, const int4 input_shape,
                               const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
  // axis to dst tensor coordinate
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  // boundary check
  if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
    return;
  }
  // NOTE(review): -10000.0f stands in for "minus infinity"; inputs below
  // -10000 would pool incorrectly. Presumably chosen to stay well inside the
  // fp16 range (cl_khr_fp16 path) — confirm, or switch to the precision's
  // true lowest value.
  FLT4 maximum = (FLT4)(-10000.0f);
  // top-left corner of the pooling window in input coordinates (may be
  // negative because of padding)
  int xs = X * stride.x - padding.x;
  int ys = Y * stride.y - padding.y;
  for (int kx = 0; kx < kernel_size.x; ++kx) {
    int x_c = xs + kx;
    if (x_c < 0 || x_c >= input_shape.x) {
      continue;  // padded rows never contribute to the max
    }
    for (int ky = 0; ky < kernel_size.y; ++ky) {
      int y_c = ys + ky;
      if (y_c < 0 || y_c >= input_shape.y) {
        continue;  // padded columns never contribute to the max
      }
      FLT4 src = input[(input_shape.y * x_c + y_c) * input_shape.w + Z];
      maximum = max(src, maximum);
    }
  }
  output[(output_shape.y * X + Y) * output_shape.w + Z] = maximum;
}
| __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; | |||
| __kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, | |||
| const int4 output_shape, const int2 stride, const int2 kernel_size, | |||
| const int2 padding) { | |||
| // axis to dst tensor coordinate | |||
| int X = get_global_id(0); | |||
| int X = get_global_id(2); | |||
| int Y = get_global_id(1); | |||
| int Z = get_global_id(2); | |||
| int Z = get_global_id(0); | |||
| // boundary check | |||
| if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { | |||
| @@ -69,9 +35,9 @@ __kernel void MaxPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_o | |||
| const int4 input_shape, const int4 output_shape, const int2 stride, | |||
| const int2 kernel_size, const int2 padding) { | |||
| // axis to dst tensor coordinate | |||
| int X = get_global_id(0); | |||
| int X = get_global_id(2); | |||
| int Y = get_global_id(1); | |||
| int Z = get_global_id(2); | |||
| int Z = get_global_id(0); | |||
| // boundary check | |||
| if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { | |||
| @@ -93,32 +59,3 @@ __kernel void MaxPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_o | |||
| } | |||
| WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), max(maximum, (FLT4)(0.f))); | |||
| } | |||
// Max pooling over NC4HW4 images: one work-item per output element.
// X = output height index, Y = output width index, Z = channel-slice index.
__kernel void MaxPooling2d_NC4HW4_IMG(__read_only image2d_t input, __write_only image2d_t output,
                                      const int4 input_shape, const int4 output_shape, const int2 stride,
                                      const int2 kernel_size, const int2 padding) {
  // axis to dst tensor coordinate
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  // boundary check
  if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
    return;
  }
  // NOTE(review): -10000.0f stands in for "minus infinity"; inputs below
  // -10000 would pool incorrectly (same sentinel as MaxPooling2d_BUF).
  FLT4 maximum = (FLT4)(-10000.0f);
  // top-left corner of the pooling window in input coordinates (may be
  // negative because of padding)
  int xs = X * stride.x - padding.x;
  int ys = Y * stride.y - padding.y;
  for (int ky = 0; ky < kernel_size.y; ++ky) {
    int y_c = ys + ky;
    if (y_c < 0 || y_c >= input_shape.y) continue;  // skip padded columns
    for (int kx = 0; kx < kernel_size.x; ++kx) {
      int x_c = xs + kx;
      if (x_c < 0 || x_c >= input_shape.x) continue;  // skip padded rows
      // NC4HW4 image coordinates: x = width index, y = slice * H + height
      FLT4 src = READ_IMAGE(input, smp_none, (int2)(y_c, Z * input_shape.x + x_c));
      maximum = max(src, maximum);
    }
  }
  WRITE_IMAGE(output, (int2)(Y, Z * output_shape.x + X), maximum);
}
| @@ -52,16 +52,23 @@ std::vector<size_t> ArithmeticOpenCLKernel::InitGlobalSize() const { | |||
| } | |||
| void ArithmeticOpenCLKernel::Image2dGetWorkGroupSize() { | |||
| local_size_ = {16, 16}; | |||
| auto out_shape = out_tensors_[0]->shape(); | |||
| if (out_shape.size() == 2) { | |||
| size_t H = out_shape[0]; | |||
| size_t W = UP_DIV(out_shape[1], C4NUM); | |||
| global_size_ = {W, H}; | |||
| if (element_flag_) { | |||
| local_size_ = {16, 16}; | |||
| auto out_shape = out_tensors_[0]->shape(); | |||
| if (out_shape.size() == 2) { | |||
| size_t H = out_shape[0]; | |||
| size_t W = UP_DIV(out_shape[1], C4NUM); | |||
| global_size_ = {W, H}; | |||
| } else { | |||
| size_t H = out_shape[0] * out_shape[1]; | |||
| size_t W = out_shape[2] * UP_DIV(out_shape[3], C4NUM); | |||
| global_size_ = {W, H}; | |||
| } | |||
| } else { | |||
| size_t H = out_shape[0] * out_shape[1]; | |||
| size_t W = out_shape[2] * UP_DIV(out_shape[3], C4NUM); | |||
| global_size_ = {W, H}; | |||
| local_size_ = {}; | |||
| auto out_shape = GetNHWCShape(out_tensors_[0]->shape()); | |||
| global_size_ = {static_cast<size_t>(UP_DIV(out_shape[3], C4NUM)), static_cast<size_t>(out_shape[2]), | |||
| static_cast<size_t>(out_shape[1] * out_shape[0])}; | |||
| } | |||
| } | |||
| @@ -129,6 +136,27 @@ int ArithmeticOpenCLKernel::InitBuffer() { | |||
| return RET_OK; | |||
| } | |||
// Binds the kernel arguments that do not change between invocations (shapes
// and activation clamp bounds). Arguments 0-2 are the input/output data
// pointers bound elsewhere, hence arg_idx starts at 3.
// Returns RET_OK unconditionally.
int ArithmeticOpenCLKernel::SetArgs() {
  int arg_idx = 3;
  if (!element_flag_) {
    // Non element-wise (presumably broadcast) case: pass full NHWC4 shapes of
    // both inputs and the output, with the channel dimension rounded up to
    // C4NUM-sized slices.
    cl_int4 input0_shape = {inputs_nhwc_shapes_[0][0], inputs_nhwc_shapes_[0][1], inputs_nhwc_shapes_[0][2],
                            UP_DIV(inputs_nhwc_shapes_[0][3], C4NUM)};
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input0_shape);
    cl_int4 input1_shape = {inputs_nhwc_shapes_[1][0], inputs_nhwc_shapes_[1][1], inputs_nhwc_shapes_[1][2],
                            UP_DIV(inputs_nhwc_shapes_[1][3], C4NUM)};
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input1_shape);
    auto out_shape = GetNHWCShape(out_tensors_[0]->shape());
    cl_int4 output_shape{out_shape[0], out_shape[1], out_shape[2], UP_DIV(out_shape[3], C4NUM)};
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
  } else {
    // Element-wise case: the 2-D image extent is enough; global_size_ holds
    // the {W, H} computed for the dispatch — TODO confirm it is set before
    // SetArgs() runs.
    cl_int2 output_shape{static_cast<int>(global_size_[0]), static_cast<int>(global_size_[1])};
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
  }
  // Activation clamp bounds are passed in both cases.
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_);
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_);
  return RET_OK;
}
| int ArithmeticOpenCLKernel::Init() { | |||
| std::string kernel_name; | |||
| auto *arithmetic_parameter = reinterpret_cast<const ArithmeticParameter *>(op_parameter_); | |||
| @@ -237,6 +265,7 @@ int ArithmeticOpenCLKernel::Init() { | |||
| Image2dGetWorkGroupSize(); | |||
| InitBuffer(); | |||
| SetArgs(); | |||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | |||
| return RET_OK; | |||
| } | |||
| @@ -250,29 +279,7 @@ int ArithmeticOpenCLKernel::Run() { | |||
| auto input_1_ptr = inputs_weight_ptrs_[1] == nullptr ? in_tensors_[1]->data_c() : inputs_weight_ptrs_[1]; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_1_ptr); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | |||
| if (!element_flag_) { | |||
| cl_int4 input0_shape = {inputs_nhwc_shapes_[0][0], inputs_nhwc_shapes_[0][1], inputs_nhwc_shapes_[0][2], | |||
| UP_DIV(inputs_nhwc_shapes_[0][3], C4NUM)}; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input0_shape); | |||
| cl_int4 input1_shape = {inputs_nhwc_shapes_[1][0], inputs_nhwc_shapes_[1][1], inputs_nhwc_shapes_[1][2], | |||
| UP_DIV(inputs_nhwc_shapes_[1][3], C4NUM)}; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input1_shape); | |||
| auto out_shape = GetNHWCShape(out_tensors_[0]->shape()); | |||
| cl_int4 output_shape{out_shape[0], out_shape[1], out_shape[2], UP_DIV(out_shape[3], C4NUM)}; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_); | |||
| ocl_runtime_->RunKernel(kernel_, | |||
| {static_cast<size_t>(UP_DIV(out_shape[3], C4NUM)), static_cast<size_t>(out_shape[2]), | |||
| static_cast<size_t>(out_shape[1] * out_shape[0])}, | |||
| {}, nullptr); | |||
| } else { | |||
| cl_int2 output_shape{static_cast<int>(global_size_[0]), static_cast<int>(global_size_[1])}; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_); | |||
| ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); | |||
| } | |||
| ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); | |||
| return RET_OK; | |||
| } | |||
| @@ -33,6 +33,7 @@ class ArithmeticOpenCLKernel : public OpenCLKernel { | |||
| int Init() override; | |||
| int Run() override; | |||
| int InitBuffer() override; | |||
| int SetArgs(); | |||
| private: | |||
| std::vector<size_t> InitGlobalSize() const; | |||
| @@ -83,17 +83,20 @@ int PoolingOpenCLKernel::Init() { | |||
| ocl_runtime_->LoadSource(program_name, source); | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); | |||
| #endif | |||
| InitGlobalSize(); | |||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| std::vector<size_t> PoolingOpenCLKernel::InitGlobalSize() const { | |||
| void PoolingOpenCLKernel::InitGlobalSize() { | |||
| const size_t global_x = out_tensors_[0]->shape()[1]; | |||
| const size_t global_y = out_tensors_[0]->shape()[2]; | |||
| const size_t global_z = UP_DIV(out_tensors_[0]->shape()[3], C4NUM); | |||
| std::vector<size_t> global = {global_x, global_y, global_z}; | |||
| return global; | |||
| global_size_ = {global_z, global_y, global_x}; | |||
| int max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime_->Device())()); | |||
| local_size_ = GetCommonLocalSize(global_size_, max_work_group_size); | |||
| global_size_ = GetCommonGlobalSize(local_size_, global_size_); | |||
| } | |||
| int PoolingOpenCLKernel::Run() { | |||
| @@ -116,13 +119,7 @@ int PoolingOpenCLKernel::Run() { | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, kernel_size); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, padding); | |||
| std::vector<size_t> local_size; | |||
| std::vector<size_t> global_size = InitGlobalSize(); | |||
| int max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime_->Device())()); | |||
| local_size = GetCommonLocalSize(global_size, max_work_group_size); | |||
| global_size = GetCommonGlobalSize(local_size, global_size); | |||
| ocl_runtime_->RunKernel(kernel_, global_size, local_size, nullptr); | |||
| ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| @@ -35,10 +35,12 @@ class PoolingOpenCLKernel : public OpenCLKernel { | |||
| int Run() override; | |||
| private: | |||
| std::vector<size_t> InitGlobalSize() const; | |||
| void InitGlobalSize(); | |||
| PoolingParameter *parameter_; | |||
| cl::Kernel kernel_; | |||
| bool enable_fp16_{false}; | |||
| std::vector<size_t> local_size_; | |||
| std::vector<size_t> global_size_; | |||
| }; | |||
| } // namespace mindspore::kernel | |||