Merge pull request !7913 from chenzupeng/master-litetags/v1.1.0
| @@ -1,15 +1,12 @@ | |||||
| #pragma OPENCL EXTENSION cl_khr_fp16 : enable | #pragma OPENCL EXTENSION cl_khr_fp16 : enable | ||||
| #define SLICES 4 | |||||
| #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) | |||||
| #define MIN(X, Y) (X < Y ? X : Y) | |||||
| __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | ||||
| __kernel void LeakyRelu(__read_only image2d_t input, __write_only image2d_t output, const int4 img_shape, | |||||
| __kernel void LeakyRelu(__read_only image2d_t input, __write_only image2d_t output, const int2 img_shape, | |||||
| const float alpha) { | const float alpha) { | ||||
| int Y = get_global_id(0); // H | |||||
| int X = get_global_id(1); // W C4 | |||||
| if (X >= img_shape.z || Y >= img_shape.y) return; | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| if (X >= img_shape.x || Y >= img_shape.y) return; | |||||
| FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X, Y)); | FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X, Y)); | ||||
| FLT4 tmp; | FLT4 tmp; | ||||
| FLT alpha_f = TO_FLT(alpha); | FLT alpha_f = TO_FLT(alpha); | ||||
| @@ -20,53 +17,40 @@ __kernel void LeakyRelu(__read_only image2d_t input, __write_only image2d_t outp | |||||
| WRITE_IMAGE(output, (int2)(X, Y), tmp); | WRITE_IMAGE(output, (int2)(X, Y), tmp); | ||||
| } | } | ||||
| __kernel void Relu(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape) { | |||||
| int Y = get_global_id(0); | |||||
| int X = get_global_id(1); | |||||
| if (X >= input_shape.z || Y >= input_shape.y) return; | |||||
| __kernel void Relu(__read_only image2d_t input, __write_only image2d_t output, const int2 img_shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| if (X >= img_shape.x || Y >= img_shape.y) return; | |||||
| FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X, Y)); | FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X, Y)); | ||||
| FLT4 tmp; | |||||
| tmp.x = in_c4.x > 0.0f ? in_c4.x : 0.0f; | |||||
| tmp.y = in_c4.y > 0.0f ? in_c4.y : 0.0f; | |||||
| tmp.z = in_c4.z > 0.0f ? in_c4.z : 0.0f; | |||||
| tmp.w = in_c4.w > 0.0f ? in_c4.w : 0.0f; | |||||
| WRITE_IMAGE(output, (int2)(X, Y), tmp); | |||||
| in_c4 = max(in_c4, (FLT)(0.f)); | |||||
| WRITE_IMAGE(output, (int2)(X, Y), in_c4); | |||||
| } | } | ||||
| __kernel void Relu6(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape) { | |||||
| int Y = get_global_id(0); | |||||
| int X = get_global_id(1); | |||||
| if (X >= input_shape.z || Y >= input_shape.y) return; | |||||
| __kernel void Relu6(__read_only image2d_t input, __write_only image2d_t output, const int2 img_shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| if (X >= img_shape.x || Y >= img_shape.y) return; | |||||
| FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X, Y)); | FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X, Y)); | ||||
| FLT4 tmp; | |||||
| tmp.x = in_c4.x > 0.0f ? MIN(in_c4.x, 6.0f) : 0.0f; | |||||
| tmp.y = in_c4.y > 0.0f ? MIN(in_c4.y, 6.0f) : 0.0f; | |||||
| tmp.z = in_c4.z > 0.0f ? MIN(in_c4.z, 6.0f) : 0.0f; | |||||
| tmp.w = in_c4.w > 0.0f ? MIN(in_c4.w, 6.0f) : 0.0f; | |||||
| WRITE_IMAGE(output, (int2)(X, Y), tmp); | |||||
| in_c4 = clamp(in_c4, (FLT)(0.f), (FLT)(6.f)); | |||||
| WRITE_IMAGE(output, (int2)(X, Y), in_c4); | |||||
| } | } | ||||
| __kernel void Sigmoid(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape) { | |||||
| int Y = get_global_id(0); | |||||
| int X = get_global_id(1); | |||||
| if (X >= input_shape.z || Y >= input_shape.y) return; | |||||
| __kernel void Sigmoid(__read_only image2d_t input, __write_only image2d_t output, const int2 img_shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| if (X >= img_shape.x || Y >= img_shape.y) return; | |||||
| FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X, Y)); | FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X, Y)); | ||||
| FLT4 tmp; | |||||
| tmp.x = 1.0f / (1.0f + exp(-in_c4.x)); | |||||
| tmp.y = 1.0f / (1.0f + exp(-in_c4.y)); | |||||
| tmp.z = 1.0f / (1.0f + exp(-in_c4.z)); | |||||
| tmp.w = 1.0f / (1.0f + exp(-in_c4.w)); | |||||
| WRITE_IMAGE(output, (int2)(X, Y), tmp); | |||||
| in_c4 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-in_c4)); | |||||
| WRITE_IMAGE(output, (int2)(X, Y), in_c4); | |||||
| } | } | ||||
| __kernel void Tanh(__read_only image2d_t input, __write_only image2d_t output, int4 input_shape) { | |||||
| int Y = get_global_id(0); | |||||
| int X = get_global_id(1); | |||||
| if (X >= input_shape.z || Y >= input_shape.y) return; | |||||
| __kernel void Tanh(__read_only image2d_t input, __write_only image2d_t output, const int2 img_shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| if (X >= img_shape.x || Y >= img_shape.y) return; | |||||
| FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X, Y)); | FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X, Y)); | ||||
| in_c4.x = (exp(in_c4.x) - exp(-in_c4.x)) / (exp(in_c4.x) + exp(-in_c4.x)); | |||||
| in_c4.y = (exp(in_c4.y) - exp(-in_c4.y)) / (exp(in_c4.y) + exp(-in_c4.y)); | |||||
| in_c4.z = (exp(in_c4.z) - exp(-in_c4.z)) / (exp(in_c4.z) + exp(-in_c4.z)); | |||||
| in_c4.w = (exp(in_c4.w) - exp(-in_c4.w)) / (exp(in_c4.w) + exp(-in_c4.w)); | |||||
| FLT4 exp0 = exp(in_c4); | |||||
| FLT4 exp1 = exp(-in_c4); | |||||
| in_c4 = (exp0 - exp1) / (exp0 + exp1); | |||||
| WRITE_IMAGE(output, (int2)(X, Y), in_c4); | WRITE_IMAGE(output, (int2)(X, Y), in_c4); | ||||
| } | } | ||||
| @@ -41,26 +41,6 @@ using mindspore::schema::PrimitiveType_Activation; | |||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| int ActivationOpenClKernel::Init() { | int ActivationOpenClKernel::Init() { | ||||
| in_size_ = in_tensors_[0]->shape().size(); | |||||
| out_size_ = out_tensors_[0]->shape().size(); | |||||
| size_t n, h, w, c; | |||||
| if (in_size_ == 2) { | |||||
| n = in_tensors_[0]->shape()[0]; | |||||
| c = in_tensors_[0]->shape()[1]; | |||||
| h = w = 1; | |||||
| } else { | |||||
| n = in_tensors_[0]->shape()[0]; | |||||
| h = in_tensors_[0]->shape()[1]; | |||||
| w = in_tensors_[0]->shape()[2]; | |||||
| c = in_tensors_[0]->shape()[3]; | |||||
| } | |||||
| nhwc_shape_ = {n, h, w, c}; | |||||
| enable_fp16_ = ocl_runtime_->GetFp16Enable(); | |||||
| fp_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float); | |||||
| if (in_size_ != 2 && in_size_ != 4) { | |||||
| MS_LOG(ERROR) << "Activate fun only support dim=4 or 2, but your dim=" << in_size_; | |||||
| return mindspore::lite::RET_ERROR; | |||||
| } | |||||
| std::map<int, std::string> kernel_names{{ActivationType_LEAKY_RELU, "LeakyRelu"}, | std::map<int, std::string> kernel_names{{ActivationType_LEAKY_RELU, "LeakyRelu"}, | ||||
| {ActivationType_RELU, "Relu"}, | {ActivationType_RELU, "Relu"}, | ||||
| {ActivationType_SIGMOID, "Sigmoid"}, | {ActivationType_SIGMOID, "Sigmoid"}, | ||||
| @@ -70,45 +50,43 @@ int ActivationOpenClKernel::Init() { | |||||
| MS_LOG(ERROR) << "schema::ActivationType:" << type_ << "not found"; | MS_LOG(ERROR) << "schema::ActivationType:" << type_ << "not found"; | ||||
| return mindspore::lite::RET_ERROR; | return mindspore::lite::RET_ERROR; | ||||
| } | } | ||||
| outShape = Image2DInfo(out_tensors_[0]); | |||||
| local_size_ = {}; | |||||
| global_size_ = {outShape.width, outShape.height}; | |||||
| std::string source = activation_source; | std::string source = activation_source; | ||||
| std::set<std::string> build_options; | std::set<std::string> build_options; | ||||
| std::string program_name = "Activation"; | std::string program_name = "Activation"; | ||||
| ocl_runtime_->LoadSource(program_name, source); | ocl_runtime_->LoadSource(program_name, source); | ||||
| std::string kernel_name = kernel_names[type_]; | std::string kernel_name = kernel_names[type_]; | ||||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); | ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); | ||||
| MS_LOG(DEBUG) << op_parameter_->name_ << " init Done!"; | |||||
| SetArgs(); | |||||
| MS_LOG(DEBUG) << kernel_name << " init Done!"; | |||||
| return mindspore::lite::RET_OK; | return mindspore::lite::RET_OK; | ||||
| } | } | ||||
| int ActivationOpenClKernel::SetArgs() { | |||||
| int arg_idx = 2; | |||||
| cl_int2 image_size = {static_cast<int>(outShape.width), static_cast<int>(outShape.height)}; | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, image_size); | |||||
| if (type_ == ActivationType_LEAKY_RELU) { | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, alpha_); | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int ActivationOpenClKernel::Run() { | int ActivationOpenClKernel::Run() { | ||||
| MS_LOG(DEBUG) << op_parameter_->name_ << " begin running!"; | |||||
| cl_int4 img2d_shape = GetImg2dShape(); | |||||
| MS_LOG(DEBUG) << this->name() << " begin running!"; | |||||
| int arg_idx = 0; | int arg_idx = 0; | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, img2d_shape); | |||||
| if (type_ == ActivationType_LEAKY_RELU) { | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, alpha_); | |||||
| } | |||||
| std::vector<size_t> local = {}; | |||||
| std::vector<size_t> global = {static_cast<size_t>(img2d_shape.s[1]), static_cast<size_t>(img2d_shape.s[2])}; | |||||
| auto ret = ocl_runtime_->RunKernel(kernel_, global, local, nullptr); | |||||
| auto ret = ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); | |||||
| if (ret != mindspore::lite::RET_OK) { | if (ret != mindspore::lite::RET_OK) { | ||||
| MS_LOG(ERROR) << "Run kernel:" << op_parameter_->name_ << " fail."; | |||||
| MS_LOG(ERROR) << "Run kernel:" << this->name() << " fail."; | |||||
| return mindspore::lite::RET_ERROR; | return mindspore::lite::RET_ERROR; | ||||
| } | } | ||||
| return mindspore::lite::RET_OK; | return mindspore::lite::RET_OK; | ||||
| } | } | ||||
| cl_int4 ActivationOpenClKernel::GetImg2dShape() { | |||||
| cl_int4 img2d_shape = {1, 1, 1, 1}; | |||||
| img2d_shape.s[1] = nhwc_shape_[1]; | |||||
| img2d_shape.s[2] = nhwc_shape_[2] * UP_DIV(nhwc_shape_[3], C4NUM); | |||||
| img2d_shape.s[3] = C4NUM; | |||||
| return img2d_shape; | |||||
| } | |||||
| kernel::LiteKernel *OpenClActivationKernelCreator(const std::vector<lite::Tensor *> &inputs, | kernel::LiteKernel *OpenClActivationKernelCreator(const std::vector<lite::Tensor *> &inputs, | ||||
| const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | ||||
| const lite::InnerContext *ctx, const kernel::KernelKey &desc, | const lite::InnerContext *ctx, const kernel::KernelKey &desc, | ||||
| @@ -37,16 +37,11 @@ class ActivationOpenClKernel : public OpenCLKernel { | |||||
| int Run() override; | int Run() override; | ||||
| private: | private: | ||||
| cl_int4 GetImg2dShape(); | |||||
| int SetArgs(); | |||||
| cl::Kernel kernel_; | cl::Kernel kernel_; | ||||
| int type_; | int type_; | ||||
| float alpha_; | float alpha_; | ||||
| int in_size_{}; | |||||
| int out_size_{}; | |||||
| size_t fp_size{}; | |||||
| bool enable_fp16_{false}; | |||||
| std::vector<size_t> nhwc_shape_; | |||||
| Image2DInfo outShape = Image2DInfo(nullptr); | |||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||