Merge pull request !7890 from chenzupeng/master-litetags/v1.1.0
| @@ -2,46 +2,14 @@ | |||||
| #pragma OPENCL EXTENSION cl_khr_fp16 : enable | #pragma OPENCL EXTENSION cl_khr_fp16 : enable | ||||
| #endif | #endif | ||||
| #define divide_no_check(a, b) (a / b) | #define divide_no_check(a, b) (a / b) | ||||
| __kernel void AvgPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, const int4 input_shape, | |||||
| const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { | |||||
| // axis to dst tensor coordinate | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| // boundary check | |||||
| if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { | |||||
| return; | |||||
| } | |||||
| FLT4 r = (FLT4)(0.0f); | |||||
| FLT window_size = 0.0f; | |||||
| int xs = X * stride.x - padding.x; | |||||
| int ys = Y * stride.y - padding.y; | |||||
| for (int kx = 0; kx < kernel_size.x; ++kx) { | |||||
| int x_c = xs + kx; | |||||
| bool outside_x = x_c < 0 || x_c >= input_shape.x; | |||||
| for (int ky = 0; ky < kernel_size.y; ++ky) { | |||||
| int y_c = ys + ky; | |||||
| bool outside = outside_x || y_c < 0 || y_c >= input_shape.y; | |||||
| r += !outside ? input[(input_shape.y * x_c + y_c) * output_shape.w + Z] : (FLT4)(0.0f); | |||||
| window_size += !outside ? 1.0f : 0.0f; | |||||
| } | |||||
| } | |||||
| FLT4 result = TO_FLT4(r / window_size); | |||||
| output[(output_shape.y * X + Y) * output_shape.w + Z] = result; | |||||
| } | |||||
| __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | ||||
| __kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, | __kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, | ||||
| const int4 output_shape, const int2 stride, const int2 kernel_size, | const int4 output_shape, const int2 stride, const int2 kernel_size, | ||||
| const int2 padding) { | const int2 padding) { | ||||
| // axis to dst tensor coordinate | // axis to dst tensor coordinate | ||||
| int X = get_global_id(0); | |||||
| int X = get_global_id(2); | |||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| int Z = get_global_id(2); | |||||
| int Z = get_global_id(0); | |||||
| // boundary check | // boundary check | ||||
| if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { | if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { | ||||
| @@ -66,35 +34,3 @@ __kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only i | |||||
| FLT4 result = TO_FLT4(divide_no_check(r, window_size)); | FLT4 result = TO_FLT4(divide_no_check(r, window_size)); | ||||
| WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), result); | WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), result); | ||||
| } | } | ||||
| __kernel void AvgPooling2d_NC4HW4_IMG(__read_only image2d_t input, __write_only image2d_t output, | |||||
| const int4 input_shape, const int4 output_shape, const int2 stride, | |||||
| const int2 kernel_size, const int2 padding) { | |||||
| // axis to dst tensor coordinate | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| // boundary check | |||||
| if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { | |||||
| return; | |||||
| } | |||||
| FLT4 r = (FLT4)(0.0f); | |||||
| FLT window_size = 0.0f; | |||||
| int xs = X * stride.x - padding.x; | |||||
| int ys = Y * stride.y - padding.y; | |||||
| for (int ky = 0; ky < kernel_size.y; ++ky) { | |||||
| int y_c = ys + ky; | |||||
| bool outside_y = y_c < 0 || y_c >= input_shape.y; | |||||
| for (int kx = 0; kx < kernel_size.x; ++kx) { | |||||
| int x_c = xs + kx; | |||||
| bool outside = outside_y || x_c < 0 || x_c >= input_shape.x; | |||||
| r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c, Z * input_shape.x + x_c)) : (FLT4)(0.0f); | |||||
| window_size += !outside ? 1.0f : 0.0f; | |||||
| } | |||||
| } | |||||
| FLT4 result = TO_FLT4(divide_no_check(r, window_size)); | |||||
| WRITE_IMAGE(output, (int2)(Y, Z * output_shape.x + X), result); | |||||
| } | |||||
| @@ -1,48 +1,14 @@ | |||||
| #ifdef cl_khr_fp16 | #ifdef cl_khr_fp16 | ||||
| #pragma OPENCL EXTENSION cl_khr_fp16 : enable | #pragma OPENCL EXTENSION cl_khr_fp16 : enable | ||||
| #endif | #endif | ||||
| __kernel void MaxPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, const int4 input_shape, | |||||
| const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { | |||||
| // axis to dst tensor coordinate | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| // boundary check | |||||
| if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { | |||||
| return; | |||||
| } | |||||
| FLT4 maximum = (FLT4)(-10000.0f); | |||||
| int xs = X * stride.x - padding.x; | |||||
| int ys = Y * stride.y - padding.y; | |||||
| for (int kx = 0; kx < kernel_size.x; ++kx) { | |||||
| int x_c = xs + kx; | |||||
| if (x_c < 0 || x_c >= input_shape.x) { | |||||
| continue; | |||||
| } | |||||
| for (int ky = 0; ky < kernel_size.y; ++ky) { | |||||
| int y_c = ys + ky; | |||||
| if (y_c < 0 || y_c >= input_shape.y) { | |||||
| continue; | |||||
| } | |||||
| FLT4 src = input[(input_shape.y * x_c + y_c) * input_shape.w + Z]; | |||||
| maximum = max(src, maximum); | |||||
| } | |||||
| } | |||||
| output[(output_shape.y * X + Y) * output_shape.w + Z] = maximum; | |||||
| } | |||||
| __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; | __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; | ||||
| __kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, | __kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, | ||||
| const int4 output_shape, const int2 stride, const int2 kernel_size, | const int4 output_shape, const int2 stride, const int2 kernel_size, | ||||
| const int2 padding) { | const int2 padding) { | ||||
| // axis to dst tensor coordinate | // axis to dst tensor coordinate | ||||
| int X = get_global_id(0); | |||||
| int X = get_global_id(2); | |||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| int Z = get_global_id(2); | |||||
| int Z = get_global_id(0); | |||||
| // boundary check | // boundary check | ||||
| if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { | if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { | ||||
| @@ -69,9 +35,9 @@ __kernel void MaxPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_o | |||||
| const int4 input_shape, const int4 output_shape, const int2 stride, | const int4 input_shape, const int4 output_shape, const int2 stride, | ||||
| const int2 kernel_size, const int2 padding) { | const int2 kernel_size, const int2 padding) { | ||||
| // axis to dst tensor coordinate | // axis to dst tensor coordinate | ||||
| int X = get_global_id(0); | |||||
| int X = get_global_id(2); | |||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| int Z = get_global_id(2); | |||||
| int Z = get_global_id(0); | |||||
| // boundary check | // boundary check | ||||
| if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { | if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { | ||||
| @@ -93,32 +59,3 @@ __kernel void MaxPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_o | |||||
| } | } | ||||
| WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), max(maximum, (FLT4)(0.f))); | WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), max(maximum, (FLT4)(0.f))); | ||||
| } | } | ||||
| __kernel void MaxPooling2d_NC4HW4_IMG(__read_only image2d_t input, __write_only image2d_t output, | |||||
| const int4 input_shape, const int4 output_shape, const int2 stride, | |||||
| const int2 kernel_size, const int2 padding) { | |||||
| // axis to dst tensor coordinate | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| // boundary check | |||||
| if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { | |||||
| return; | |||||
| } | |||||
| FLT4 maximum = (FLT4)(-10000.0f); | |||||
| int xs = X * stride.x - padding.x; | |||||
| int ys = Y * stride.y - padding.y; | |||||
| for (int ky = 0; ky < kernel_size.y; ++ky) { | |||||
| int y_c = ys + ky; | |||||
| if (y_c < 0 || y_c >= input_shape.y) continue; | |||||
| for (int kx = 0; kx < kernel_size.x; ++kx) { | |||||
| int x_c = xs + kx; | |||||
| if (x_c < 0 || x_c >= input_shape.x) continue; | |||||
| FLT4 src = READ_IMAGE(input, smp_none, (int2)(y_c, Z * input_shape.x + x_c)); | |||||
| maximum = max(src, maximum); | |||||
| } | |||||
| } | |||||
| WRITE_IMAGE(output, (int2)(Y, Z * output_shape.x + X), maximum); | |||||
| } | |||||
| @@ -52,16 +52,23 @@ std::vector<size_t> ArithmeticOpenCLKernel::InitGlobalSize() const { | |||||
| } | } | ||||
| void ArithmeticOpenCLKernel::Image2dGetWorkGroupSize() { | void ArithmeticOpenCLKernel::Image2dGetWorkGroupSize() { | ||||
| local_size_ = {16, 16}; | |||||
| auto out_shape = out_tensors_[0]->shape(); | |||||
| if (out_shape.size() == 2) { | |||||
| size_t H = out_shape[0]; | |||||
| size_t W = UP_DIV(out_shape[1], C4NUM); | |||||
| global_size_ = {W, H}; | |||||
| if (element_flag_) { | |||||
| local_size_ = {16, 16}; | |||||
| auto out_shape = out_tensors_[0]->shape(); | |||||
| if (out_shape.size() == 2) { | |||||
| size_t H = out_shape[0]; | |||||
| size_t W = UP_DIV(out_shape[1], C4NUM); | |||||
| global_size_ = {W, H}; | |||||
| } else { | |||||
| size_t H = out_shape[0] * out_shape[1]; | |||||
| size_t W = out_shape[2] * UP_DIV(out_shape[3], C4NUM); | |||||
| global_size_ = {W, H}; | |||||
| } | |||||
| } else { | } else { | ||||
| size_t H = out_shape[0] * out_shape[1]; | |||||
| size_t W = out_shape[2] * UP_DIV(out_shape[3], C4NUM); | |||||
| global_size_ = {W, H}; | |||||
| local_size_ = {}; | |||||
| auto out_shape = GetNHWCShape(out_tensors_[0]->shape()); | |||||
| global_size_ = {static_cast<size_t>(UP_DIV(out_shape[3], C4NUM)), static_cast<size_t>(out_shape[2]), | |||||
| static_cast<size_t>(out_shape[1] * out_shape[0])}; | |||||
| } | } | ||||
| } | } | ||||
| @@ -129,6 +136,27 @@ int ArithmeticOpenCLKernel::InitBuffer() { | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| int ArithmeticOpenCLKernel::SetArgs() { | |||||
| int arg_idx = 3; | |||||
| if (!element_flag_) { | |||||
| cl_int4 input0_shape = {inputs_nhwc_shapes_[0][0], inputs_nhwc_shapes_[0][1], inputs_nhwc_shapes_[0][2], | |||||
| UP_DIV(inputs_nhwc_shapes_[0][3], C4NUM)}; | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input0_shape); | |||||
| cl_int4 input1_shape = {inputs_nhwc_shapes_[1][0], inputs_nhwc_shapes_[1][1], inputs_nhwc_shapes_[1][2], | |||||
| UP_DIV(inputs_nhwc_shapes_[1][3], C4NUM)}; | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input1_shape); | |||||
| auto out_shape = GetNHWCShape(out_tensors_[0]->shape()); | |||||
| cl_int4 output_shape{out_shape[0], out_shape[1], out_shape[2], UP_DIV(out_shape[3], C4NUM)}; | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape); | |||||
| } else { | |||||
| cl_int2 output_shape{static_cast<int>(global_size_[0]), static_cast<int>(global_size_[1])}; | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape); | |||||
| } | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_); | |||||
| return RET_OK; | |||||
| } | |||||
| int ArithmeticOpenCLKernel::Init() { | int ArithmeticOpenCLKernel::Init() { | ||||
| std::string kernel_name; | std::string kernel_name; | ||||
| auto *arithmetic_parameter = reinterpret_cast<const ArithmeticParameter *>(op_parameter_); | auto *arithmetic_parameter = reinterpret_cast<const ArithmeticParameter *>(op_parameter_); | ||||
| @@ -237,6 +265,7 @@ int ArithmeticOpenCLKernel::Init() { | |||||
| Image2dGetWorkGroupSize(); | Image2dGetWorkGroupSize(); | ||||
| InitBuffer(); | InitBuffer(); | ||||
| SetArgs(); | |||||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | MS_LOG(DEBUG) << kernel_name << " Init Done!"; | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -250,29 +279,7 @@ int ArithmeticOpenCLKernel::Run() { | |||||
| auto input_1_ptr = inputs_weight_ptrs_[1] == nullptr ? in_tensors_[1]->data_c() : inputs_weight_ptrs_[1]; | auto input_1_ptr = inputs_weight_ptrs_[1] == nullptr ? in_tensors_[1]->data_c() : inputs_weight_ptrs_[1]; | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_1_ptr); | ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_1_ptr); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | ||||
| if (!element_flag_) { | |||||
| cl_int4 input0_shape = {inputs_nhwc_shapes_[0][0], inputs_nhwc_shapes_[0][1], inputs_nhwc_shapes_[0][2], | |||||
| UP_DIV(inputs_nhwc_shapes_[0][3], C4NUM)}; | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input0_shape); | |||||
| cl_int4 input1_shape = {inputs_nhwc_shapes_[1][0], inputs_nhwc_shapes_[1][1], inputs_nhwc_shapes_[1][2], | |||||
| UP_DIV(inputs_nhwc_shapes_[1][3], C4NUM)}; | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input1_shape); | |||||
| auto out_shape = GetNHWCShape(out_tensors_[0]->shape()); | |||||
| cl_int4 output_shape{out_shape[0], out_shape[1], out_shape[2], UP_DIV(out_shape[3], C4NUM)}; | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_); | |||||
| ocl_runtime_->RunKernel(kernel_, | |||||
| {static_cast<size_t>(UP_DIV(out_shape[3], C4NUM)), static_cast<size_t>(out_shape[2]), | |||||
| static_cast<size_t>(out_shape[1] * out_shape[0])}, | |||||
| {}, nullptr); | |||||
| } else { | |||||
| cl_int2 output_shape{static_cast<int>(global_size_[0]), static_cast<int>(global_size_[1])}; | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_); | |||||
| ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); | |||||
| } | |||||
| ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -33,6 +33,7 @@ class ArithmeticOpenCLKernel : public OpenCLKernel { | |||||
| int Init() override; | int Init() override; | ||||
| int Run() override; | int Run() override; | ||||
| int InitBuffer() override; | int InitBuffer() override; | ||||
| int SetArgs(); | |||||
| private: | private: | ||||
| std::vector<size_t> InitGlobalSize() const; | std::vector<size_t> InitGlobalSize() const; | ||||
| @@ -83,17 +83,20 @@ int PoolingOpenCLKernel::Init() { | |||||
| ocl_runtime_->LoadSource(program_name, source); | ocl_runtime_->LoadSource(program_name, source); | ||||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); | ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); | ||||
| #endif | #endif | ||||
| InitGlobalSize(); | |||||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | MS_LOG(DEBUG) << kernel_name << " Init Done!"; | ||||
| return mindspore::lite::RET_OK; | return mindspore::lite::RET_OK; | ||||
| } | } | ||||
| std::vector<size_t> PoolingOpenCLKernel::InitGlobalSize() const { | |||||
| void PoolingOpenCLKernel::InitGlobalSize() { | |||||
| const size_t global_x = out_tensors_[0]->shape()[1]; | const size_t global_x = out_tensors_[0]->shape()[1]; | ||||
| const size_t global_y = out_tensors_[0]->shape()[2]; | const size_t global_y = out_tensors_[0]->shape()[2]; | ||||
| const size_t global_z = UP_DIV(out_tensors_[0]->shape()[3], C4NUM); | const size_t global_z = UP_DIV(out_tensors_[0]->shape()[3], C4NUM); | ||||
| std::vector<size_t> global = {global_x, global_y, global_z}; | |||||
| return global; | |||||
| global_size_ = {global_z, global_y, global_x}; | |||||
| int max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime_->Device())()); | |||||
| local_size_ = GetCommonLocalSize(global_size_, max_work_group_size); | |||||
| global_size_ = GetCommonGlobalSize(local_size_, global_size_); | |||||
| } | } | ||||
| int PoolingOpenCLKernel::Run() { | int PoolingOpenCLKernel::Run() { | ||||
| @@ -116,13 +119,7 @@ int PoolingOpenCLKernel::Run() { | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, kernel_size); | ocl_runtime_->SetKernelArg(kernel_, arg_idx++, kernel_size); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, padding); | ocl_runtime_->SetKernelArg(kernel_, arg_idx++, padding); | ||||
| std::vector<size_t> local_size; | |||||
| std::vector<size_t> global_size = InitGlobalSize(); | |||||
| int max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime_->Device())()); | |||||
| local_size = GetCommonLocalSize(global_size, max_work_group_size); | |||||
| global_size = GetCommonGlobalSize(local_size, global_size); | |||||
| ocl_runtime_->RunKernel(kernel_, global_size, local_size, nullptr); | |||||
| ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); | |||||
| return mindspore::lite::RET_OK; | return mindspore::lite::RET_OK; | ||||
| } | } | ||||
| @@ -35,10 +35,12 @@ class PoolingOpenCLKernel : public OpenCLKernel { | |||||
| int Run() override; | int Run() override; | ||||
| private: | private: | ||||
| std::vector<size_t> InitGlobalSize() const; | |||||
| void InitGlobalSize(); | |||||
| PoolingParameter *parameter_; | PoolingParameter *parameter_; | ||||
| cl::Kernel kernel_; | cl::Kernel kernel_; | ||||
| bool enable_fp16_{false}; | bool enable_fp16_{false}; | ||||
| std::vector<size_t> local_size_; | |||||
| std::vector<size_t> global_size_; | |||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||