@@ -192,6 +192,7 @@ OpParameter *PopulateBatchNorm(const mindspore::lite::PrimitiveC *primitive) {
   }
   batch_norm_param->op_parameter_.type_ = primitive->Type();
   batch_norm_param->epsilon_ = param->GetEpsilon();
+  batch_norm_param->fused_ = false;
   return reinterpret_cast<OpParameter *>(batch_norm_param);
 }
@@ -648,6 +649,7 @@ OpParameter *PopulateFusedBatchNorm(const mindspore::lite::PrimitiveC *primitive
   batch_norm_param->op_parameter_.type_ = primitive->Type();
   auto param = dynamic_cast<const mindspore::lite::FusedBatchNorm *>(primitive);
   batch_norm_param->epsilon_ = param->GetEpsilon();
+  batch_norm_param->fused_ = true;
   return reinterpret_cast<OpParameter *>(batch_norm_param);
 }
@@ -27,6 +27,7 @@ using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_BatchNorm;
+using mindspore::schema::PrimitiveType_FusedBatchNorm;

 namespace mindspore::kernel {
 BatchnormInt8CPUKernel::~BatchnormInt8CPUKernel() {
@@ -82,22 +83,86 @@ int BatchnormInt8CPUKernel::InitConstTensor() {
   return RET_OK;
 }

+int BatchnormInt8CPUKernel::InitFusedConstTensor() {
+  auto input = in_tensors_[0];
+  auto scale = in_tensors_[1];
+  auto offset = in_tensors_[2];
+  auto mean = in_tensors_[3];
+  auto variance = in_tensors_[4];
+  auto output = out_tensors_[0];
+  auto scale_ptr = reinterpret_cast<int8_t *>(scale->Data());
+  auto offset_ptr = reinterpret_cast<int8_t *>(offset->Data());
+  auto mean_ptr = reinterpret_cast<int8_t *>(mean->Data());
+  auto var_ptr = reinterpret_cast<int8_t *>(variance->Data());
+  alpha_addr_ = reinterpret_cast<float *>(malloc(mean->ElementsNum() * sizeof(float)));
+  if (alpha_addr_ == nullptr) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;
+  }
+  beta_addr_ = reinterpret_cast<float *>(malloc(variance->ElementsNum() * sizeof(float)));
+  if (beta_addr_ == nullptr) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;
+  }
+  // Compute per-channel alpha (A) and beta (B); S = scale, Z = zero point, q = quantized value,
+  // indices: 1 = input, 2 = scale, 3 = offset, 4 = mean, 5 = variance, 6 = output.
+  // 0. tmp = S6 * sqrt(eps + S5 * (q5 - Z5));
+  // 1. A = S1 * S2 * (q2 - Z2) / tmp;
+  // 2. B = Z6 + S3 * (q3 - Z3) / S6 - A * Z1 - S2 * S4 * (q2 - Z2) * (q4 - Z4) / tmp;
+  auto eps = batchnorm_param_->epsilon_;
+  auto zp_in = input->GetQuantParams().front().zeroPoint;
+  auto zp_scale = scale->GetQuantParams().front().zeroPoint;
+  auto zp_offset = offset->GetQuantParams().front().zeroPoint;
+  auto zp_mean = mean->GetQuantParams().front().zeroPoint;
+  auto zp_var = variance->GetQuantParams().front().zeroPoint;
+  auto zp_out = output->GetQuantParams().front().zeroPoint;
+  auto s_in = input->GetQuantParams().front().scale;
+  auto s_scale = scale->GetQuantParams().front().scale;
+  auto s_offset = offset->GetQuantParams().front().scale;
+  auto s_mean = mean->GetQuantParams().front().scale;
+  auto s_var = variance->GetQuantParams().front().scale;
+  auto s_out = output->GetQuantParams().front().scale;
+  float mul_12 = s_in * s_scale;
+  float mul_24 = s_scale * s_mean;
+  float div_36 = s_offset / s_out;
+  for (int i = 0; i < batchnorm_param_->channel_; ++i) {
+    float tmp = s_out * sqrt(eps + s_var * (var_ptr[i] - zp_var));
+    float tmp_a = (mul_12 * (scale_ptr[i] - zp_scale)) / tmp;
+    float tmp_b = zp_out + div_36 * (offset_ptr[i] - zp_offset) - tmp_a * zp_in -
+                  (mul_24 * (scale_ptr[i] - zp_scale) * (mean_ptr[i] - zp_mean)) / tmp;
+    alpha_addr_[i] = tmp_a;
+    beta_addr_[i] = tmp_b;
+  }
+  return RET_OK;
+}
+
 int BatchnormInt8CPUKernel::Init() {
   auto input_shapes = in_tensors_[0]->shape();
   auto n_dim = input_shapes.size();
   batchnorm_param_->channel_ = input_shapes[n_dim - 1];
-  batchnorm_param_->unit_ = 1;
+  batchnorm_param_->units_ = 1;
   for (int i = 0; i < n_dim - 1; i++) {
-    batchnorm_param_->unit_ *= input_shapes[i];
+    batchnorm_param_->units_ *= input_shapes[i];
   }
   batchnorm_param_->op_parameter_.thread_num_ =
     MSMIN(batchnorm_param_->op_parameter_.thread_num_, batchnorm_param_->channel_);
-  auto ret = InitConstTensor();
-  if (ret != 0) {
-    MS_LOG(ERROR) << "Batchnorm fp32 InitConstTensor failed.";
-    return RET_ERROR;
+  batchnorm_param_->unit_ = UP_DIV(batchnorm_param_->units_, batchnorm_param_->op_parameter_.thread_num_);
+  if (batchnorm_param_->fused_) {
+    auto ret = InitFusedConstTensor();
+    if (ret != 0) {
+      MS_LOG(ERROR) << "FusedBatchnorm int8 InitFusedConstTensor failed.";
+      return RET_ERROR;
+    }
+  } else {
+    auto ret = InitConstTensor();
+    if (ret != 0) {
+      MS_LOG(ERROR) << "Batchnorm int8 InitConstTensor failed.";
+      return RET_ERROR;
+    }
   }
   return RET_OK;
 }
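For readers checking the quantization math: the per-channel alpha/beta filled in by InitFusedConstTensor come from folding dequantize -> float batch norm -> requantize into one affine step per channel. Using the S/Z/q naming from the comment (tensor t dequantizes to S_t * (q_t - Z_t); 1 = input, 2 = scale, 3 = offset, 4 = mean, 5 = variance, 6 = output):

  y  = S2(q2 - Z2) * (S1(q1 - Z1) - S4(q4 - Z4)) / sqrt(eps + S5(q5 - Z5)) + S3(q3 - Z3)
  q6 = Z6 + y / S6 = A * q1 + B,   with tmp = S6 * sqrt(eps + S5(q5 - Z5)),
  A  = S1 * S2 * (q2 - Z2) / tmp
  B  = Z6 + S3(q3 - Z3) / S6 - A * Z1 - S2 * S4 * (q2 - Z2)(q4 - Z4) / tmp

so BatchNormInt8 only has to evaluate round(A * q1 + B) per element and clamp to int8. The sketch below is illustrative only (FusedBatchNormRefElem is a made-up helper, not part of this patch or of the MindSpore API); it computes the same element the unfused way:

  #include <math.h>
  #include <stdint.h>

  // Reference-only: dequantize, run float batch norm, requantize one element.
  static int8_t FusedBatchNormRefElem(int8_t q1, int8_t q2, int8_t q3, int8_t q4, int8_t q5,
                                      float S1, int32_t Z1, float S2, int32_t Z2, float S3, int32_t Z3,
                                      float S4, int32_t Z4, float S5, int32_t Z5, float S6, int32_t Z6,
                                      float eps) {
    float x = S1 * (q1 - Z1);      // input
    float gamma = S2 * (q2 - Z2);  // scale
    float beta = S3 * (q3 - Z3);   // offset
    float mean = S4 * (q4 - Z4);
    float var = S5 * (q5 - Z5);
    float y = gamma * (x - mean) / sqrtf(var + eps) + beta;
    int32_t q6 = (int32_t)roundf(y / S6 + Z6);  // requantize to the output scale
    return (int8_t)(q6 > 127 ? 127 : (q6 < -128 ? -128 : q6));
  }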
@@ -165,4 +230,5 @@ kernel::LiteKernel *CpuBatchnormInt8KernelCreator(const std::vector<lite::tensor
 }

 REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_BatchNorm, CpuBatchnormInt8KernelCreator)
+REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_FusedBatchNorm, CpuBatchnormInt8KernelCreator)
 } // namespace mindspore::kernel
@@ -40,6 +40,7 @@ class BatchnormInt8CPUKernel : public LiteKernel {
   int ReSize() override;
   int Run() override;
   int InitConstTensor();
+  int InitFusedConstTensor();
   int DoExecute(int tid);

  private:
@@ -23,7 +23,9 @@ typedef struct BatchNormParameter {
   OpParameter op_parameter_;
   float epsilon_;
   int unit_;
+  int units_;
   int channel_;
+  bool fused_;
 } BatchNormParameter;

 #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_BATCHNORM_PARAMETER_H_
@@ -20,8 +20,10 @@
 void BatchNormInt8(int8_t *output_ptr, const int8_t *input_ptr, const float *alpha_ptr, const float *beta_ptr,
                    int task_id, BatchNormParameter *param) {
-  for (int c = task_id; c < param->channel_; c += param->op_parameter_.thread_num_) {
-    for (int u = 0; u < param->unit_; u++) {
+  int unit_st = task_id * param->unit_;
+  int unit_end = MSMIN((task_id + 1) * param->unit_, param->units_);
+  for (int u = unit_st; u < unit_end; u++) {
+    for (int c = 0; c < param->channel_; c++) {
       int32_t output_tmp = round(input_ptr[u * param->channel_ + c] * alpha_ptr[c] + beta_ptr[c]);
       output_tmp = output_tmp > 127 ? 127 : output_tmp;
       output_tmp = output_tmp < -128 ? -128 : output_tmp;
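Note on the threading change above: the old kernel strided over channels, so at most channel_ tasks could do useful work, while the new kernel gives each task a contiguous block of unit_ rows out of units_ total. With the setup used by the tests below (input shape {1, 1, 6, 2}, ctx.thread_num_ = 3), Init() clamps thread_num_ to MSMIN(3, channel_ = 2) = 2 and sets unit_ = UP_DIV(6, 2) = 3, so task 0 processes rows 0-2 and task 1 processes rows 3-5; the MSMIN in unit_end keeps the last task in range when units_ is not a multiple of unit_.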
@@ -27,6 +27,104 @@ class TestBatchnormInt8 : public mindspore::CommonTest {
   TestBatchnormInt8() {}
 };

+TEST_F(TestBatchnormInt8, FusedTest) {
+  std::vector<int8_t> in_data = {11, 41, 21, 51, 31, 61, -11, -41, -21, -51, -31, -61};
+  std::vector<int8_t> in_data1 = {4, 4};
+  std::vector<int8_t> in_data2 = {8, 33};
+  std::vector<int8_t> in_data3 = {35, 55};
+  std::vector<int8_t> in_data4 = {2, 3};
+  std::vector<lite::tensor::Tensor *> inputs_tensor;
+  std::vector<lite::tensor::Tensor *> outputs_tensor;
+
+  BatchNormParameter op_param;
+  op_param.op_parameter_.type_ = schema::PrimitiveType_FusedBatchNorm;
+  op_param.epsilon_ = 0.001f;
+  op_param.fused_ = true;
+  std::vector<int> shape = {1, 1, 6, 2};
+
+  lite::tensor::QuantArg input_quant_arg;
+  input_quant_arg.scale = 0.1;
+  input_quant_arg.zeroPoint = 1;
+  lite::tensor::QuantArg input_quant_arg_1;
+  input_quant_arg_1.scale = 0.5;
+  input_quant_arg_1.zeroPoint = 2;
+  lite::tensor::QuantArg input_quant_arg_2;
+  input_quant_arg_2.scale = 0.02;
+  input_quant_arg_2.zeroPoint = 3;
+  lite::tensor::QuantArg input_quant_arg_3;
+  input_quant_arg_3.scale = 0.5;
+  input_quant_arg_3.zeroPoint = 15;
+  lite::tensor::QuantArg input_quant_arg_4;
+  input_quant_arg_4.scale = 0.25;
+  input_quant_arg_4.zeroPoint = 1;
+  lite::tensor::QuantArg output_quant_arg;
+  output_quant_arg.scale = 0.8;
+  output_quant_arg.zeroPoint = 0;
+
+  lite::tensor::Tensor input0_tensor;
+  lite::tensor::Tensor input1_tensor;
+  lite::tensor::Tensor input2_tensor;
+  lite::tensor::Tensor input3_tensor;
+  lite::tensor::Tensor input4_tensor;
+  inputs_tensor.push_back(&input0_tensor);
+  inputs_tensor.push_back(&input1_tensor);
+  inputs_tensor.push_back(&input2_tensor);
+  inputs_tensor.push_back(&input3_tensor);
+  inputs_tensor.push_back(&input4_tensor);
+  input0_tensor.SetData(in_data.data());
+  input1_tensor.SetData(in_data1.data());
+  input2_tensor.SetData(in_data2.data());
+  input3_tensor.SetData(in_data3.data());
+  input4_tensor.SetData(in_data4.data());
+  input0_tensor.set_shape(shape);
+  input1_tensor.set_shape({2});
+  input2_tensor.set_shape({2});
+  input3_tensor.set_shape({2});
+  input4_tensor.set_shape({2});
+  input0_tensor.AddQuantParam(input_quant_arg);
+  input1_tensor.AddQuantParam(input_quant_arg_1);
+  input2_tensor.AddQuantParam(input_quant_arg_2);
+  input3_tensor.AddQuantParam(input_quant_arg_3);
+  input4_tensor.AddQuantParam(input_quant_arg_4);
+
+  std::vector<int8_t> output(12);
+  // std::vector<int8_t> corr_out = {-18, -22, -16, -21, -14, -19, -22, -34, -24, -35, -26, -36 };
+  std::vector<int8_t> corr_out = {-22, -28, -20, -26, -17, -24, -28, -42, -30, -44, -33, -46};
+
+  lite::tensor::Tensor output0_tensor;
+  outputs_tensor.push_back(&output0_tensor);
+  output0_tensor.SetData(output.data());
+  output0_tensor.set_shape(shape);
+  output0_tensor.AddQuantParam(output_quant_arg);
+
+  kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeInt8, schema::PrimitiveType_FusedBatchNorm};
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  ASSERT_NE(creator, nullptr);
+  lite::Context ctx;
+  ctx.thread_num_ = 3;
+  kernel::LiteKernel *kernel =
+    creator(inputs_tensor, outputs_tensor, reinterpret_cast<OpParameter *>(&op_param), &ctx, desc, nullptr);
+  ASSERT_NE(kernel, nullptr);
+
+  auto output_tensor_shape = output0_tensor.shape();
+  kernel->Run();
+
+  printf("==================output data=================\n");
+  for (int i = 0; i < output0_tensor.ElementsNum(); i++) {
+    printf("%d, ", output[i]);
+  }
+  std::cout << std::endl;
+  CompareOutputData(output.data(), corr_out.data(), output0_tensor.ElementsNum(), 0.001);
+
+  input0_tensor.SetData(nullptr);
+  input1_tensor.SetData(nullptr);
+  input2_tensor.SetData(nullptr);
+  input3_tensor.SetData(nullptr);
+  input4_tensor.SetData(nullptr);
+  output0_tensor.SetData(nullptr);
| MS_LOG(INFO) << "TestBathNormFp32 accuracy passed"; | |||||
+}
+
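Sanity check on the FusedTest expectations (values rounded): for channel 0, tmp = 0.8 * sqrt(0.001 + 0.25 * (2 - 1)) ≈ 0.4008, A = 0.1 * 0.5 * (4 - 2) / 0.4008 ≈ 0.2495, and B = 0 + 0.02 * (8 - 3) / 0.8 - 0.2495 * 1 - 0.5 * 0.5 * (4 - 2) * (35 - 15) / 0.4008 ≈ -25.07, so the first input value 11 maps to round(0.2495 * 11 - 25.07) = -22, which matches corr_out[0].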
 TEST_F(TestBatchnormInt8, BNTest) {
   std::vector<int8_t> in_data = {11, 41, 21, 51, 31, 61, -11, -41, -21, -51, -31, -61};
   std::vector<int8_t> in_data1 = {4, 14};
@@ -37,6 +135,7 @@ TEST_F(TestBatchnormInt8, BNTest) {
   BatchNormParameter op_param;
   op_param.op_parameter_.type_ = schema::PrimitiveType_BatchNorm;
   op_param.epsilon_ = 0.001f;
+  op_param.fused_ = false;
   std::vector<int> shape = {1, 1, 6, 2};
@@ -50,7 +149,7 @@ TEST_F(TestBatchnormInt8, BNTest) {
   input_quant_arg_2.scale = 0.1;
   input_quant_arg_2.zeroPoint = -1;
   lite::tensor::QuantArg output_quant_arg;
-  output_quant_arg.scale = 1;
+  output_quant_arg.scale = 0.5;
   output_quant_arg.zeroPoint = 0;

   lite::tensor::Tensor input0_tensor;
@@ -70,8 +169,7 @@ TEST_F(TestBatchnormInt8, BNTest) {
   input2_tensor.AddQuantParam(input_quant_arg_2);

   std::vector<int8_t> output(12);
-  // std::vector<int8_t> corr_out1 = {5, 17, 11, 22, 17, 27, -6, -23, -12, -28, -18, -33};
-  std::vector<int8_t> corr_out = {1, 2, 1, 2, 2, 3, -1, -2, -1, -3, -2, -3};
+  std::vector<int8_t> corr_out = {1, 3, 2, 4, 3, 5, -2, -5, -3, -6, -4, -7};

   lite::tensor::Tensor output0_tensor;
   outputs_tensor.push_back(&output0_tensor);
@@ -87,6 +185,7 @@ TEST_F(TestBatchnormInt8, BNTest) {
   kernel::LiteKernel *kernel =
     creator(inputs_tensor, outputs_tensor, reinterpret_cast<OpParameter *>(&op_param), &ctx, desc, nullptr);
   ASSERT_NE(kernel, nullptr);
   auto output_tensor_shape = output0_tensor.shape();
   kernel->Run();