From 21ebb6ebb30c2c91256f70631536d40064038ac6 Mon Sep 17 00:00:00 2001 From: Pengyongrong Date: Tue, 26 Jan 2021 17:28:24 -0800 Subject: [PATCH] modify layernorm 2021/1/27 --- .../runtime/kernel/opencl/cl/layer_norm.cl | 50 +++++------- .../kernel/opencl/kernel/fusion_eltwise.h | 4 +- .../kernel/opencl/kernel/layer_norm.cc | 78 +++++++------------ .../runtime/kernel/opencl/kernel/layer_norm.h | 3 +- .../runtime/kernel/opencl/layer_norm_tests.cc | 16 ++-- 5 files changed, 56 insertions(+), 95 deletions(-) diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/layer_norm.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/layer_norm.cl index 73bb16666f..8b5c96f3ef 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/layer_norm.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/layer_norm.cl @@ -3,8 +3,8 @@ __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) #define C4NUM 4 -__kernel void ComputeMeanVarDim1NHWC4(__read_only image2d_t src_data, __global FLT *mean_, __global FLT *variance_, - int4 in_shape, int normalized_shape_size) { +__kernel void ComputeMeanVarAxis3NHWC4(__read_only image2d_t src_data, __global FLT *mean_, __global FLT *variance_, + int4 in_shape, int normalized_shape_size) { int X = get_global_id(0); // n*h int Y = get_global_id(1); // w if (X > in_shape.x * in_shape.y || Y > in_shape.z || in_shape.y == 0 || normalized_shape_size == 0) { @@ -50,15 +50,14 @@ __kernel void ComputeMeanVarDim1NHWC4(__read_only image2d_t src_data, __global F var = (var_temp.x + var_temp.y + var_temp.z + var_temp.w) / normalized_shape_size; // write result to dst - int postion = (n * in_shape.y + h) * in_shape.z + w; - mean_[postion] = mean; - variance_[postion] = var; + int position = (n * in_shape.y + h) * in_shape.z + w; + mean_[position] = mean; + variance_[position] = var; } __kernel void LayerNormalization_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, __global 
FLT *mean_, __global FLT *variance_, __global FLT *gamma_, - __global FLT *beta_, int4 in_shape, float epsilon_, int normalized_dims_, - int elementwise_affine_) { + __global FLT *beta_, int4 in_shape, float epsilon_, int begin_params_axis_) { int X = get_global_id(0); // n*h int Y = get_global_id(1); // w int Z = get_global_id(2); // c4 @@ -72,32 +71,25 @@ __kernel void LayerNormalization_NHWC4(__read_only image2d_t src_data, __write_o int ci4 = UP_DIV(in_shape.w, C4NUM); int postion_mv = 0; int postion_gb = 0; - if (normalized_dims_ == 1) { - postion_mv = (n * in_shape.y + h) * in_shape.z + w; - postion_gb = c * C4NUM; - } else if (normalized_dims_ == 2) { - postion_mv = n * in_shape.y + h; - postion_gb = w * ci4 * C4NUM + c * C4NUM; - } else if (normalized_dims_ == 3) { + if (begin_params_axis_ == 1) { postion_mv = n; postion_gb = (h * in_shape.z + w) * ci4 * C4NUM + c * C4NUM; + } else if (begin_params_axis_ == 2) { + postion_mv = n * in_shape.y + h; + postion_gb = w * ci4 * C4NUM + c * C4NUM; + } else if (begin_params_axis_ == 3) { + postion_mv = (n * in_shape.y + h) * in_shape.z + w; + postion_gb = c * C4NUM; } FLT4 result = {0.0f, 0.0f, 0.0f, 0.0f}; FLT4 result_in = READ_IMAGE(src_data, smp_none, (int2)(w * ci4 + c, n * in_shape.y + h)); - if (elementwise_affine_) { - result.x = ((result_in.x - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_)) * gamma_[postion_gb] + - beta_[postion_gb]; - result.y = ((result_in.y - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_)) * gamma_[postion_gb + 1] + - beta_[postion_gb + 1]; - result.z = ((result_in.z - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_)) * gamma_[postion_gb + 2] + - beta_[postion_gb + 2]; - result.w = ((result_in.w - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_)) * gamma_[postion_gb + 3] + - beta_[postion_gb + 3]; - } else { - result.x = ((result_in.x - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_)); - result.y = ((result_in.y - 
mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_)); - result.z = ((result_in.z - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_)); - result.w = ((result_in.w - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_)); - } + result.x = ((result_in.x - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_)) * gamma_[postion_gb] + + beta_[postion_gb]; + result.y = ((result_in.y - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_)) * gamma_[postion_gb + 1] + + beta_[postion_gb + 1]; + result.z = ((result_in.z - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_)) * gamma_[postion_gb + 2] + + beta_[postion_gb + 2]; + result.w = ((result_in.w - mean_[postion_mv]) / sqrt(variance_[postion_mv] + epsilon_)) * gamma_[postion_gb + 3] + + beta_[postion_gb + 3]; WRITE_IMAGE(dst_data, (int2)((w * ci4 + c), (n * in_shape.y + h)), result); } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/fusion_eltwise.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/fusion_eltwise.h index 6d0d3c58d0..2974aecd66 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fusion_eltwise.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fusion_eltwise.h @@ -106,7 +106,7 @@ struct FusionEltwiseParameter { Node_(bool is_leaf, FusionEltwiseParameter *value, std::string value_name) : is_leaf_(is_leaf), value_(value), name_(std::move(value_name)) {} }; - OpParameter op_parameter_{}; + OpParameter op_parameter_{"FusionEltwiseParameter", PrimitiveType_FusionEltwise, 1}; EltwiseOperator operator_; std::string name_; std::vector inputs_; @@ -115,8 +115,6 @@ struct FusionEltwiseParameter { const std::vector &in_tensors, const std::map &replace_map = {}) : operator_(operator_init), name_(std::move(kernel_name)) { - op_parameter_.type_ = PrimitiveType_FusionEltwise; - snprintf(op_parameter_.name_, strlen("FusionEltwiseParameter"), "FusionEltwiseParameter"); for (int i = 0; i < in_tensors.size(); ++i) { auto *in_tensor = in_tensors[i]; if 
(replace_map.count(in_tensor)) { diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.cc index b304f9729c..115161c871 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.cc @@ -33,13 +33,22 @@ namespace mindspore::kernel { int LayerNormOpenCLKernel::CheckSpecs() { auto param = reinterpret_cast(this->op_parameter_); - if (in_tensors_.at(0)->shape().size() != 4 || out_tensors_.size() != 1) { - MS_LOG(ERROR) << "UnSupported in_tensors_.shape.size: " << in_tensors_.at(0)->shape().size() + if (in_tensors_.size() != 3 || out_tensors_.size() != 1) { + MS_LOG(ERROR) << "UnSupported in_tensors_.size: " << in_tensors_.size() << " out_tensors_.size(): " << out_tensors_.size(); return RET_ERROR; } - if (param->normalized_dims_ != 1) { - MS_LOG(ERROR) << "UnSupported normalized_shape_ size: " << param->normalized_dims_; + if (in_tensors_.at(0)->shape().size() != 4) { + MS_LOG(ERROR) << "UnSupported in_tensors_.shape.size: " << in_tensors_.at(0)->shape().size(); + return RET_ERROR; + } + normalized_axis_ = param->begin_params_axis_; + epsilon_ = param->epsilon_; + if (normalized_axis_ < 0) { + normalized_axis_ += in_tensors_.at(0)->shape().size(); + } + if (normalized_axis_ != 3) { + MS_LOG(ERROR) << "UnSupported normalized_axis_ : " << normalized_axis_; + return RET_ERROR; } return RET_OK; } @@ -61,15 +70,11 @@ void LayerNormGetWorkGroup(const std::vector &global, std::vectorSetKernelArg(kernel_, arg_cn++, in_shape_); ocl_runtime_->SetKernelArg(kernel_, arg_cn++, epsilon_); - ocl_runtime_->SetKernelArg(kernel_, arg_cn++, normalized_dims_); - if (elementwise_affine_) { - ocl_runtime_->SetKernelArg(kernel_, arg_cn++, 1); - } else { - ocl_runtime_->SetKernelArg(kernel_, arg_cn++, 0); - } - + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, normalized_axis_); ocl_runtime_->SetKernelArg(kernel_mean_var_, 3, 
in_shape_); ocl_runtime_->SetKernelArg(kernel_mean_var_, 4, normalized_shape_size_); } @@ -91,32 +96,13 @@ void LayerNormOpenCLKernel::SetGlobalLocal() { const std::vector &max_global = ocl_runtime_->GetWorkItemSize(); LayerNormGetWorkGroup(global_size_, &local_size_, max_global[0]); OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); - if (normalized_dims_ != in_tensors_.at(0)->shape().size()) { - if (normalized_dims_ == 1) { - OH = in_shape_.s[0] * in_shape_.s[1]; - OW = in_shape_.s[2]; - OC = 1; - } else if (normalized_dims_ == 2) { - OH = in_shape_.s[0] * in_shape_.s[1]; - OW = 1; - OC = 1; - } else { - OH = in_shape_.s[0]; - OW = 1; - OC = 1; - } - } else { - OH = 1; - OW = 1; - OC = 1; - } - AlignMeanVarGlobalLocal({static_cast(OH), static_cast(OW), static_cast(OC)}, {1, 1, 1}, - &global_mean_var_, &local_mean_var_); + AlignMeanVarGlobalLocal({static_cast(OH), static_cast(OW), 1}, {1, 1, 1}, &global_mean_var_, + &local_mean_var_); } int LayerNormOpenCLKernel::Initweight() { auto allocator = ocl_runtime_->GetAllocator(); - GpuTensorInfo img_info(in_tensors_.at(1)); // gamma + GpuTensorInfo img_info(in_tensors_.at(1)); auto weight_tensor = in_tensors_.at(1); size_t weight_size = img_info.Image2DSize; // allocated memory for weight and init value @@ -165,40 +151,28 @@ int LayerNormOpenCLKernel::Initweight() { int LayerNormOpenCLKernel::Prepare() { use_fp16_enable_ = ocl_runtime_->GetFp16Enable(); - auto param = reinterpret_cast(this->op_parameter_); - elementwise_affine_ = true; // param->elementwise_mode_; - normalized_dims_ = param->normalized_dims_; - epsilon_ = param->epsilon_; - if (elementwise_affine_) { - int ret = Initweight(); - if (ret) { - MS_LOG(ERROR) << "Initweight failed "; - return RET_ERROR; - } + int ret = Initweight(); + if (ret) { + MS_LOG(ERROR) << "Initweight failed "; + return RET_ERROR; } + normalized_shape_size_ = in_tensors_.at(0)->shape().at(normalized_axis_); auto allocator = ocl_runtime_->GetAllocator(); size_t mean_size = 1; 
- size_t size = in_tensors_.at(0)->shape().size() - normalized_dims_; - for (int i = 0; i < size; ++i) { + for (int i = 0; i < normalized_axis_; ++i) { mean_size *= in_tensors_.at(0)->shape()[i]; } size_t size_dtype = use_fp16_enable_ ? sizeof(float16_t) : sizeof(float); mean_size *= size_dtype; mean_ = allocator->Malloc(mean_size); var_ = allocator->Malloc(mean_size); - GpuTensorInfo img_info(in_tensors_.at(0)); - in_shape_.s[0] = img_info.N, in_shape_.s[1] = img_info.H, in_shape_.s[2] = img_info.W, in_shape_.s[3] = img_info.C; - - for (int i = 0; i < normalized_dims_; ++i) { - normalized_shape_size_ *= param->normalized_shape_[i]; - } std::string kernel_name = "LayerNormalization_NHWC4"; std::string kernel_name_mean_var = "ComputeMeanVar"; std::string source = layer_norm_source; std::string program_name = "LayerNormalization"; ocl_runtime_->LoadSource(program_name, source); ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); - kernel_name_mean_var += "Dim" + std::to_string(normalized_dims_) + "NHWC4"; + kernel_name_mean_var += "Axis" + std::to_string(normalized_axis_) + "NHWC4"; ocl_runtime_->BuildKernel(kernel_mean_var_, program_name, kernel_name_mean_var); MS_LOG(DEBUG) << kernel_name << " Init Done!"; SetConstArgs(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.h index 8ff86721ab..66def9480e 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.h @@ -48,8 +48,7 @@ class LayerNormOpenCLKernel : public OpenCLKernel { void *var_{nullptr}; void *beta_{nullptr}; cl_int4 in_shape_{}; - int elementwise_affine_; - int32_t normalized_dims_{1}; + int32_t normalized_axis_{3}; // default is C int normalized_shape_size_{1}; float epsilon_{0.0f}; cl::Kernel kernel_; diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/layer_norm_tests.cc 
b/mindspore/lite/test/ut/src/runtime/kernel/opencl/layer_norm_tests.cc index 0c99e11cd9..634fb84489 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/layer_norm_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/layer_norm_tests.cc @@ -22,20 +22,19 @@ class TestOpenCL_LayerNorm : public CommonTest {}; namespace { // PrimitiveType_Stack: src/ops/populate/stack_populate.cc -OpParameter *CreateParameter(float epsilon, int normalized_dims_, std::vector normalizedShape) { +OpParameter *CreateParameter(float epsilon, int begin_norm_axis_, int begin_param_axis_) { auto *param = test::CreateParameter(schema::PrimitiveType_LayerNorm); param->epsilon_ = epsilon; - param->normalized_dims_ = normalized_dims_; - for (int i = 0; i < normalizedShape.size() && i < normalized_dims_; ++i) { - param->normalized_shape_[i] = normalizedShape[i]; - } + param->begin_norm_axis_ = begin_norm_axis_; + param->begin_params_axis_ = begin_param_axis_; return reinterpret_cast(param); } } // namespace TEST_F(TestOpenCL_LayerNorm, test1) { float epsilon = 1e-5; - int normalized_dims_ = 1; + int begin_norm_axis_ = 3; + int begin_param_axis_ = 3; std::vector normalizedShape = {5}; std::vector input_shape = {2, 3, 4, 5}; std::vector gamma_shape = {1, 1, 1, 5}; @@ -51,11 +50,10 @@ TEST_F(TestOpenCL_LayerNorm, test1) { auto beta_data = reinterpret_cast(mindspore::lite::ReadFile(betaPpath.c_str(), &beta_size)); auto output_data = reinterpret_cast(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size)); for (auto fp16_enable : {false}) { - auto *param = CreateParameter(epsilon, normalized_dims_, normalizedShape); - + auto *param = CreateParameter(epsilon, begin_norm_axis_, begin_param_axis_); TestMain( {{input_shape, input_data, VAR}, {gamma_shape, gamma_data, CONST_TENSOR}, {beta_shape, beta_data, CONST_TENSOR}}, - {output_shape, output_data}, param, fp16_enable, fp16_enable ? 1e-3 : 1e-6); + {output_shape, output_data}, param, fp16_enable, fp16_enable ? 
1e-3 : 1e-5); } } } // namespace mindspore::lite::opencl::test