| @@ -3,8 +3,8 @@ __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | |||||
| #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) | #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) | ||||
| #define C4NUM 4 | #define C4NUM 4 | ||||
| __kernel void ComputeMeanVarDim1NHWC4(__read_only image2d_t src_data, __global FLT *mean_, __global FLT *variance_, | |||||
| int4 in_shape, int normalized_shape_size) { | |||||
| __kernel void ComputeMeanVarAxis3NHWC4(__read_only image2d_t src_data, __global FLT *mean_, __global FLT *variance_, | |||||
| int4 in_shape, int normalized_shape_size) { | |||||
| int X = get_global_id(0); // n*h | int X = get_global_id(0); // n*h | ||||
| int Y = get_global_id(1); // w | int Y = get_global_id(1); // w | ||||
| if (X > in_shape.x * in_shape.y || Y > in_shape.z || in_shape.y == 0 || normalized_shape_size == 0) { | if (X > in_shape.x * in_shape.y || Y > in_shape.z || in_shape.y == 0 || normalized_shape_size == 0) { | ||||
| @@ -50,15 +50,14 @@ __kernel void ComputeMeanVarDim1NHWC4(__read_only image2d_t src_data, __global F | |||||
| var = (var_temp.x + var_temp.y + var_temp.z + var_temp.w) / normalized_shape_size; | var = (var_temp.x + var_temp.y + var_temp.z + var_temp.w) / normalized_shape_size; | ||||
| // write result to dst | // write result to dst | ||||
| int postion = (n * in_shape.y + h) * in_shape.z + w; | |||||
| mean_[postion] = mean; | |||||
| variance_[postion] = var; | |||||
| int position = (n * in_shape.y + h) * in_shape.z + w; | |||||
| mean_[position] = mean; | |||||
| variance_[position] = var; | |||||
| } | } | ||||
| __kernel void LayerNormalization_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, | __kernel void LayerNormalization_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, | ||||
| __global FLT *mean_, __global FLT *variance_, __global FLT *gamma_, | __global FLT *mean_, __global FLT *variance_, __global FLT *gamma_, | ||||
| __global FLT *beta_, int4 in_shape, float epsilon_, int normalized_dims_, | |||||
| int elementwise_affine_) { | |||||
| __global FLT *beta_, int4 in_shape, float epsilon_, int begin_params_axis_) { | |||||
| int X = get_global_id(0); // n*h | int X = get_global_id(0); // n*h | ||||
| int Y = get_global_id(1); // w | int Y = get_global_id(1); // w | ||||
| int Z = get_global_id(2); // c4 | int Z = get_global_id(2); // c4 | ||||
| @@ -72,32 +71,25 @@ __kernel void LayerNormalization_NHWC4(__read_only image2d_t src_data, __write_o | |||||
| int ci4 = UP_DIV(in_shape.w, C4NUM); | int ci4 = UP_DIV(in_shape.w, C4NUM); | ||||
| int postion_mv = 0; | int postion_mv = 0; | ||||
| int postion_gb = 0; | int postion_gb = 0; | ||||
| if (normalized_dims_ == 1) { | |||||
| postion_mv = (n * in_shape.y + h) * in_shape.z + w; | |||||
| postion_gb = c * C4NUM; | |||||
| } else if (normalized_dims_ == 2) { | |||||
| postion_mv = n * in_shape.y + h; | |||||
| postion_gb = w * ci4 * C4NUM + c * C4NUM; | |||||
| } else if (normalized_dims_ == 3) { | |||||
| if (begin_params_axis_ == 1) { | |||||
| postion_mv = n; | postion_mv = n; | ||||
| postion_gb = (h * in_shape.z + w) * ci4 * C4NUM + c * C4NUM; | postion_gb = (h * in_shape.z + w) * ci4 * C4NUM + c * C4NUM; | ||||
| } else if (begin_params_axis_ == 2) { | |||||
| postion_mv = n * in_shape.y + h; | |||||
| postion_gb = w * ci4 * C4NUM + c * C4NUM; | |||||
| } else if (begin_params_axis_ == 3) { | |||||
| postion_mv = (n * in_shape.y + h) * in_shape.z + w; | |||||
| postion_gb = c * C4NUM; | |||||
| } | } | ||||
| FLT4 result = {0.0f, 0.0f, 0.0f, 0.0f}; | FLT4 result = {0.0f, 0.0f, 0.0f, 0.0f}; | ||||
| FLT4 result_in = READ_IMAGE(src_data, smp_none, (int2)(w * ci4 + c, n * in_shape.y + h)); | FLT4 result_in = READ_IMAGE(src_data, smp_none, (int2)(w * ci4 + c, n * in_shape.y + h)); | ||||
| if (elementwise_affine_) { | |||||
| result.x = ((result_in.x - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_)) * gamma_[postion_gb] + | |||||
| beta_[postion_gb]; | |||||
| result.y = ((result_in.y - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_)) * gamma_[postion_gb + 1] + | |||||
| beta_[postion_gb + 1]; | |||||
| result.z = ((result_in.z - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_)) * gamma_[postion_gb + 2] + | |||||
| beta_[postion_gb + 2]; | |||||
| result.w = ((result_in.w - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_)) * gamma_[postion_gb + 3] + | |||||
| beta_[postion_gb + 3]; | |||||
| } else { | |||||
| result.x = ((result_in.x - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_)); | |||||
| result.y = ((result_in.y - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_)); | |||||
| result.z = ((result_in.z - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_)); | |||||
| result.w = ((result_in.w - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_)); | |||||
| } | |||||
| result.x = ((result_in.x - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_)) * gamma_[postion_gb] + | |||||
| beta_[postion_gb]; | |||||
| result.y = ((result_in.y - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_)) * gamma_[postion_gb + 1] + | |||||
| beta_[postion_gb + 1]; | |||||
| result.z = ((result_in.z - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_)) * gamma_[postion_gb + 2] + | |||||
| beta_[postion_gb + 2]; | |||||
| result.w = ((result_in.w - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_)) * gamma_[postion_gb + 3] + | |||||
| beta_[postion_gb + 3]; | |||||
| WRITE_IMAGE(dst_data, (int2)((w * ci4 + c), (n * in_shape.y + h)), result); | WRITE_IMAGE(dst_data, (int2)((w * ci4 + c), (n * in_shape.y + h)), result); | ||||
| } | } | ||||
| @@ -106,7 +106,7 @@ struct FusionEltwiseParameter { | |||||
| Node_(bool is_leaf, FusionEltwiseParameter *value, std::string value_name) | Node_(bool is_leaf, FusionEltwiseParameter *value, std::string value_name) | ||||
| : is_leaf_(is_leaf), value_(value), name_(std::move(value_name)) {} | : is_leaf_(is_leaf), value_(value), name_(std::move(value_name)) {} | ||||
| }; | }; | ||||
| OpParameter op_parameter_{}; | |||||
| OpParameter op_parameter_{"FusionEltwiseParameter", PrimitiveType_FusionEltwise, 1}; | |||||
| EltwiseOperator operator_; | EltwiseOperator operator_; | ||||
| std::string name_; | std::string name_; | ||||
| std::vector<Node_> inputs_; | std::vector<Node_> inputs_; | ||||
| @@ -115,8 +115,6 @@ struct FusionEltwiseParameter { | |||||
| const std::vector<lite::Tensor *> &in_tensors, | const std::vector<lite::Tensor *> &in_tensors, | ||||
| const std::map<lite::Tensor *, FusionEltwiseParameter *> &replace_map = {}) | const std::map<lite::Tensor *, FusionEltwiseParameter *> &replace_map = {}) | ||||
| : operator_(operator_init), name_(std::move(kernel_name)) { | : operator_(operator_init), name_(std::move(kernel_name)) { | ||||
| op_parameter_.type_ = PrimitiveType_FusionEltwise; | |||||
| snprintf(op_parameter_.name_, strlen("FusionEltwiseParameter"), "FusionEltwiseParameter"); | |||||
| for (int i = 0; i < in_tensors.size(); ++i) { | for (int i = 0; i < in_tensors.size(); ++i) { | ||||
| auto *in_tensor = in_tensors[i]; | auto *in_tensor = in_tensors[i]; | ||||
| if (replace_map.count(in_tensor)) { | if (replace_map.count(in_tensor)) { | ||||
| @@ -33,13 +33,22 @@ namespace mindspore::kernel { | |||||
| int LayerNormOpenCLKernel::CheckSpecs() { | int LayerNormOpenCLKernel::CheckSpecs() { | ||||
| auto param = reinterpret_cast<LayerNormParameter *>(this->op_parameter_); | auto param = reinterpret_cast<LayerNormParameter *>(this->op_parameter_); | ||||
| if (in_tensors_.at(0)->shape().size() != 4 || out_tensors_.size() != 1) { | |||||
| MS_LOG(ERROR) << "UnSupported in_tensors_.shape.size: " << in_tensors_.at(0)->shape().size() | |||||
| if (in_tensors_.size() != 3 || out_tensors_.size() != 1) { | |||||
| MS_LOG(ERROR) << "UnSupported in_tensors_.size: " << in_tensors_.size() | |||||
| << " out_tensors_.size(): " << out_tensors_.size(); | << " out_tensors_.size(): " << out_tensors_.size(); | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| if (param->normalized_dims_ != 1) { | |||||
| MS_LOG(ERROR) << "UnSupported normalized_shape_ size: " << param->normalized_dims_; | |||||
| if (in_tensors_.at(0)->shape().size() != 4) { | |||||
| MS_LOG(ERROR) << "UnSupported in_tensors_.shape.size: " << in_tensors_.at(0)->shape().size(); | |||||
| return RET_ERROR; | |||||
| } | |||||
| normalized_axis_ = param->begin_params_axis_; | |||||
| epsilon_ = param->epsilon_; | |||||
| if (normalized_axis_ < 0) { | |||||
| normalized_axis_ += in_tensors_.at(0)->shape().size(); | |||||
| } | |||||
| if (normalized_axis_ != 3) { | |||||
| MS_LOG(ERROR) << "UnSupported normalized_axis_ : " << param->normalized_dims_; | |||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| return RET_OK; | return RET_OK; | ||||
| @@ -61,15 +70,11 @@ void LayerNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t | |||||
| void LayerNormOpenCLKernel::SetConstArgs() { | void LayerNormOpenCLKernel::SetConstArgs() { | ||||
| int arg_cn = 6; | int arg_cn = 6; | ||||
| GpuTensorInfo img_info(in_tensors_.at(0)); | |||||
| in_shape_.s[0] = img_info.N, in_shape_.s[1] = img_info.H, in_shape_.s[2] = img_info.W, in_shape_.s[3] = img_info.C; | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_shape_); | ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_shape_); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, epsilon_); | ocl_runtime_->SetKernelArg(kernel_, arg_cn++, epsilon_); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, normalized_dims_); | |||||
| if (elementwise_affine_) { | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, 1); | |||||
| } else { | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, 0); | |||||
| } | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, normalized_axis_); | |||||
| ocl_runtime_->SetKernelArg(kernel_mean_var_, 3, in_shape_); | ocl_runtime_->SetKernelArg(kernel_mean_var_, 3, in_shape_); | ||||
| ocl_runtime_->SetKernelArg(kernel_mean_var_, 4, normalized_shape_size_); | ocl_runtime_->SetKernelArg(kernel_mean_var_, 4, normalized_shape_size_); | ||||
| } | } | ||||
| @@ -91,32 +96,13 @@ void LayerNormOpenCLKernel::SetGlobalLocal() { | |||||
| const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize(); | const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize(); | ||||
| LayerNormGetWorkGroup(global_size_, &local_size_, max_global[0]); | LayerNormGetWorkGroup(global_size_, &local_size_, max_global[0]); | ||||
| OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); | OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); | ||||
| if (normalized_dims_ != in_tensors_.at(0)->shape().size()) { | |||||
| if (normalized_dims_ == 1) { | |||||
| OH = in_shape_.s[0] * in_shape_.s[1]; | |||||
| OW = in_shape_.s[2]; | |||||
| OC = 1; | |||||
| } else if (normalized_dims_ == 2) { | |||||
| OH = in_shape_.s[0] * in_shape_.s[1]; | |||||
| OW = 1; | |||||
| OC = 1; | |||||
| } else { | |||||
| OH = in_shape_.s[0]; | |||||
| OW = 1; | |||||
| OC = 1; | |||||
| } | |||||
| } else { | |||||
| OH = 1; | |||||
| OW = 1; | |||||
| OC = 1; | |||||
| } | |||||
| AlignMeanVarGlobalLocal({static_cast<int>(OH), static_cast<int>(OW), static_cast<int>(OC)}, {1, 1, 1}, | |||||
| &global_mean_var_, &local_mean_var_); | |||||
| AlignMeanVarGlobalLocal({static_cast<int>(OH), static_cast<int>(OW), 1}, {1, 1, 1}, &global_mean_var_, | |||||
| &local_mean_var_); | |||||
| } | } | ||||
| int LayerNormOpenCLKernel::Initweight() { | int LayerNormOpenCLKernel::Initweight() { | ||||
| auto allocator = ocl_runtime_->GetAllocator(); | auto allocator = ocl_runtime_->GetAllocator(); | ||||
| GpuTensorInfo img_info(in_tensors_.at(1)); // gamma | |||||
| GpuTensorInfo img_info(in_tensors_.at(1)); | |||||
| auto weight_tensor = in_tensors_.at(1); | auto weight_tensor = in_tensors_.at(1); | ||||
| size_t weight_size = img_info.Image2DSize; | size_t weight_size = img_info.Image2DSize; | ||||
| // allocated memory for weight and init value | // allocated memory for weight and init value | ||||
| @@ -165,40 +151,28 @@ int LayerNormOpenCLKernel::Initweight() { | |||||
| int LayerNormOpenCLKernel::Prepare() { | int LayerNormOpenCLKernel::Prepare() { | ||||
| use_fp16_enable_ = ocl_runtime_->GetFp16Enable(); | use_fp16_enable_ = ocl_runtime_->GetFp16Enable(); | ||||
| auto param = reinterpret_cast<LayerNormParameter *>(this->op_parameter_); | |||||
| elementwise_affine_ = true; // param->elementwise_mode_; | |||||
| normalized_dims_ = param->normalized_dims_; | |||||
| epsilon_ = param->epsilon_; | |||||
| if (elementwise_affine_) { | |||||
| int ret = Initweight(); | |||||
| if (ret) { | |||||
| MS_LOG(ERROR) << "Initweight failed "; | |||||
| return RET_ERROR; | |||||
| } | |||||
| int ret = Initweight(); | |||||
| if (ret) { | |||||
| MS_LOG(ERROR) << "Initweight failed "; | |||||
| return RET_ERROR; | |||||
| } | } | ||||
| normalized_shape_size_ = in_tensors_.at(0)->shape().at(normalized_axis_); | |||||
| auto allocator = ocl_runtime_->GetAllocator(); | auto allocator = ocl_runtime_->GetAllocator(); | ||||
| size_t mean_size = 1; | size_t mean_size = 1; | ||||
| size_t size = in_tensors_.at(0)->shape().size() - normalized_dims_; | |||||
| for (int i = 0; i < size; ++i) { | |||||
| for (int i = 0; i < normalized_axis_; ++i) { | |||||
| mean_size *= in_tensors_.at(0)->shape()[i]; | mean_size *= in_tensors_.at(0)->shape()[i]; | ||||
| } | } | ||||
| size_t size_dtype = use_fp16_enable_ ? sizeof(float16_t) : sizeof(float); | size_t size_dtype = use_fp16_enable_ ? sizeof(float16_t) : sizeof(float); | ||||
| mean_size *= size_dtype; | mean_size *= size_dtype; | ||||
| mean_ = allocator->Malloc(mean_size); | mean_ = allocator->Malloc(mean_size); | ||||
| var_ = allocator->Malloc(mean_size); | var_ = allocator->Malloc(mean_size); | ||||
| GpuTensorInfo img_info(in_tensors_.at(0)); | |||||
| in_shape_.s[0] = img_info.N, in_shape_.s[1] = img_info.H, in_shape_.s[2] = img_info.W, in_shape_.s[3] = img_info.C; | |||||
| for (int i = 0; i < normalized_dims_; ++i) { | |||||
| normalized_shape_size_ *= param->normalized_shape_[i]; | |||||
| } | |||||
| std::string kernel_name = "LayerNormalization_NHWC4"; | std::string kernel_name = "LayerNormalization_NHWC4"; | ||||
| std::string kernel_name_mean_var = "ComputeMeanVar"; | std::string kernel_name_mean_var = "ComputeMeanVar"; | ||||
| std::string source = layer_norm_source; | std::string source = layer_norm_source; | ||||
| std::string program_name = "LayerNormalization"; | std::string program_name = "LayerNormalization"; | ||||
| ocl_runtime_->LoadSource(program_name, source); | ocl_runtime_->LoadSource(program_name, source); | ||||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); | ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); | ||||
| kernel_name_mean_var += "Dim" + std::to_string(normalized_dims_) + "NHWC4"; | |||||
| kernel_name_mean_var += "Axis" + std::to_string(normalized_axis_) + "NHWC4"; | |||||
| ocl_runtime_->BuildKernel(kernel_mean_var_, program_name, kernel_name_mean_var); | ocl_runtime_->BuildKernel(kernel_mean_var_, program_name, kernel_name_mean_var); | ||||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | MS_LOG(DEBUG) << kernel_name << " Init Done!"; | ||||
| SetConstArgs(); | SetConstArgs(); | ||||
| @@ -48,8 +48,7 @@ class LayerNormOpenCLKernel : public OpenCLKernel { | |||||
| void *var_{nullptr}; | void *var_{nullptr}; | ||||
| void *beta_{nullptr}; | void *beta_{nullptr}; | ||||
| cl_int4 in_shape_{}; | cl_int4 in_shape_{}; | ||||
| int elementwise_affine_; | |||||
| int32_t normalized_dims_{1}; | |||||
| int32_t normalized_axis_{3}; // default is C | |||||
| int normalized_shape_size_{1}; | int normalized_shape_size_{1}; | ||||
| float epsilon_{0.0f}; | float epsilon_{0.0f}; | ||||
| cl::Kernel kernel_; | cl::Kernel kernel_; | ||||
| @@ -22,20 +22,19 @@ class TestOpenCL_LayerNorm : public CommonTest {}; | |||||
| namespace { | namespace { | ||||
| // PrimitiveType_Stack: src/ops/populate/stack_populate.cc | // PrimitiveType_Stack: src/ops/populate/stack_populate.cc | ||||
| OpParameter *CreateParameter(float epsilon, int normalized_dims_, std::vector<int> normalizedShape) { | |||||
| OpParameter *CreateParameter(float epsilon, int begin_norm_axis_, int begin_param_axis_) { | |||||
| auto *param = test::CreateParameter<LayerNormParameter>(schema::PrimitiveType_LayerNorm); | auto *param = test::CreateParameter<LayerNormParameter>(schema::PrimitiveType_LayerNorm); | ||||
| param->epsilon_ = epsilon; | param->epsilon_ = epsilon; | ||||
| param->normalized_dims_ = normalized_dims_; | |||||
| for (int i = 0; i < normalizedShape.size() && i < normalized_dims_; ++i) { | |||||
| param->normalized_shape_[i] = normalizedShape[i]; | |||||
| } | |||||
| param->begin_norm_axis_ = begin_norm_axis_; | |||||
| param->begin_params_axis_ = begin_param_axis_; | |||||
| return reinterpret_cast<OpParameter *>(param); | return reinterpret_cast<OpParameter *>(param); | ||||
| } | } | ||||
| } // namespace | } // namespace | ||||
| TEST_F(TestOpenCL_LayerNorm, test1) { | TEST_F(TestOpenCL_LayerNorm, test1) { | ||||
| float epsilon = 1e-5; | float epsilon = 1e-5; | ||||
| int normalized_dims_ = 1; | |||||
| int begin_norm_axis_ = 3; | |||||
| int begin_param_axis_ = 3; | |||||
| std::vector<int> normalizedShape = {5}; | std::vector<int> normalizedShape = {5}; | ||||
| std::vector<int> input_shape = {2, 3, 4, 5}; | std::vector<int> input_shape = {2, 3, 4, 5}; | ||||
| std::vector<int> gamma_shape = {1, 1, 1, 5}; | std::vector<int> gamma_shape = {1, 1, 1, 5}; | ||||
| @@ -51,11 +50,10 @@ TEST_F(TestOpenCL_LayerNorm, test1) { | |||||
| auto beta_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(betaPpath.c_str(), &beta_size)); | auto beta_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(betaPpath.c_str(), &beta_size)); | ||||
| auto output_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size)); | auto output_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size)); | ||||
| for (auto fp16_enable : {false}) { | for (auto fp16_enable : {false}) { | ||||
| auto *param = CreateParameter(epsilon, normalized_dims_, normalizedShape); | |||||
| auto *param = CreateParameter(epsilon, begin_norm_axis_, begin_param_axis_); | |||||
| TestMain( | TestMain( | ||||
| {{input_shape, input_data, VAR}, {gamma_shape, gamma_data, CONST_TENSOR}, {beta_shape, beta_data, CONST_TENSOR}}, | {{input_shape, input_data, VAR}, {gamma_shape, gamma_data, CONST_TENSOR}, {beta_shape, beta_data, CONST_TENSOR}}, | ||||
| {output_shape, output_data}, param, fp16_enable, fp16_enable ? 1e-3 : 1e-6); | |||||
| {output_shape, output_data}, param, fp16_enable, fp16_enable ? 1e-3 : 1e-5); | |||||
| } | } | ||||
| } | } | ||||
| } // namespace mindspore::lite::opencl::test | } // namespace mindspore::lite::opencl::test | ||||