diff --git a/mindspore/lite/nnacl/fp16/pooling_fp16.c b/mindspore/lite/nnacl/fp16/pooling_fp16.c
index c4a89e1269..9f5598f285 100644
--- a/mindspore/lite/nnacl/fp16/pooling_fp16.c
+++ b/mindspore/lite/nnacl/fp16/pooling_fp16.c
@@ -17,7 +17,8 @@
 #include <float.h>
 #include "nnacl/errorcode.h"
 
-int AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, int task_id) {
+int AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, int task_id,
+                   float16_t min, float16_t max) {
   int stride_w = pooling_param->stride_w_;
   int stride_h = pooling_param->stride_h_;
   int pad_w = pooling_param->pad_l_;
@@ -40,6 +41,12 @@ int AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPar
   int thread_num = pooling_param->thread_num_;
   // input channel is equal to output channel
 
+#ifdef ENABLE_NEON
+  float16x8_t min_value = vdupq_n_f16(min);
+  float16x8_t max_value = vdupq_n_f16(max);
+  float16x4_t min_value2 = vdup_n_f16(min);
+  float16x4_t max_value2 = vdup_n_f16(max);
+#endif
   for (int batch = 0; batch < output_batch; batch++) {
     int in_batch_offset = batch * in_h * in_w * channel;
     int out_batch_offset = batch * output_h * output_w * channel;
@@ -88,10 +95,16 @@ int AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPar
           return NNACL_ERR;
         }
 #ifdef ENABLE_NEON
-        vst1q_f16(output_ptr + out_channel_offset, tmp_avg / vdupq_n_f16(real_count));
+        tmp_avg = vdivq_f16(tmp_avg, vdupq_n_f16(real_count));
+        tmp_avg = vmaxq_f16(tmp_avg, min_value);
+        tmp_avg = vminq_f16(tmp_avg, max_value);
+        vst1q_f16(output_ptr + out_channel_offset, tmp_avg);
 #else
         for (int t = 0; t < C8NUM; ++t) {
-          *(output_ptr + out_channel_offset + t) = tmp_avg[t] / (float16_t)real_count;
+          float16_t tmp_value = tmp_avg[t] / (float16_t)real_count;
+          tmp_value = fmax(tmp_value, min);
+          tmp_value = fmin(tmp_value, max);
+          output_ptr[out_channel_offset + t] = tmp_value;
         }
 #endif
       }  // c8 loop
@@ -126,10 +139,16 @@ int AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPar
           return NNACL_ERR;
         }
 #ifdef ENABLE_NEON
-        vst1_f16(output_ptr + out_channel_offset, tmp_avg / vdup_n_f16(real_count));
+        tmp_avg = vdiv_f16(tmp_avg, vdup_n_f16(real_count));
+        tmp_avg = vmax_f16(tmp_avg, min_value2);
+        tmp_avg = vmin_f16(tmp_avg, max_value2);
+        vst1_f16(output_ptr + out_channel_offset, tmp_avg);
 #else
         for (int t = 0; t < C4NUM; ++t) {
-          *(output_ptr + out_channel_offset + t) = tmp_avg[t] / (float16_t)real_count;
+          float16_t tmp_value = tmp_avg[t] / (float16_t)real_count;
+          tmp_value = fmax(tmp_value, min);
+          tmp_value = fmin(tmp_value, max);
+          output_ptr[out_channel_offset + t] = tmp_value;
         }
 #endif
       }  // c4 loop
@@ -150,7 +169,10 @@ int AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPar
         if (real_count == 0) {
           return NNACL_ERR;
         }
-        *(output_ptr + out_channel_offset) = tmp_avg / (float16_t)real_count;
+        float16_t tmp_value = tmp_avg / (float16_t)real_count;
+        tmp_value = fmax(tmp_value, min);
+        tmp_value = fmin(tmp_value, max);
+        output_ptr[out_channel_offset] = tmp_value;
       }  // channel_res loop
     }    // real_cal_num loop
   }      // out_plane loop
@@ -158,7 +180,8 @@ int AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPar
   return NNACL_OK;
 }
 
-void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, int task_id) {
+void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, int task_id,
+                    float16_t min, float16_t max) {
   int stride_w = pooling_param->stride_w_;
   int stride_h = pooling_param->stride_h_;
   int pad_w = pooling_param->pad_l_;
@@ -177,6 +200,12 @@ void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa
   int c8 = channel / C8NUM;
   int c8_res = channel % C8NUM;
   int c4 = c8_res / C4NUM;
+#ifdef ENABLE_NEON
+  float16x8_t min_value = vdupq_n_f16(min);
+  float16x8_t max_value = vdupq_n_f16(max);
+  float16x4_t min_value2 = vdup_n_f16(min);
+  float16x4_t max_value2 = vdup_n_f16(max);
+#endif
   // input channel is equal to output channel
 
   for (int batch = 0; batch < output_batch; batch++) {
@@ -219,9 +248,13 @@ void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa
           }  // win_w loop
         }    // win_h loop
 #ifdef ENABLE_NEON
+        tmp_max = vmaxq_f16(tmp_max, min_value);
+        tmp_max = vminq_f16(tmp_max, max_value);
         vst1q_f16(output_ptr + out_channel_offset, tmp_max);
 #else
         for (int l = 0; l < C8NUM; ++l) {
+          tmp_max[l] = fmax(tmp_max[l], min);
+          tmp_max[l] = fmin(tmp_max[l], max);
           *(output_ptr + out_channel_offset + l) = tmp_max[l];
         }
 #endif
@@ -249,10 +282,14 @@ void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa
           }  // win_w loop
         }    // win_h loop
 #ifdef ENABLE_NEON
+        tmp_max = vmax_f16(tmp_max, min_value2);
+        tmp_max = vmin_f16(tmp_max, max_value2);
         vst1_f16(output_ptr + out_channel_offset, tmp_max);
 #else
         for (int l = 0; l < C4NUM; ++l) {
-          *(output_ptr + out_channel_offset + l) = tmp_max[l];
+          tmp_max[l] = fmax(tmp_max[l], min);
+          tmp_max[l] = fmin(tmp_max[l], max);
+          output_ptr[out_channel_offset + l] = tmp_max[l];
         }
 #endif
       }  // c4 loop
@@ -268,7 +305,9 @@ void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPa
           tmp_max = fmax(tmp_max, *(input_ptr + in_offset));
         }  // win_w loop
       }    // win_h loop
-      *(output_ptr + out_channel_offset) = tmp_max;
+      tmp_max = fmax(tmp_max, min);
+      tmp_max = fmin(tmp_max, max);
+      output_ptr[out_channel_offset] = tmp_max;
     }  // channel_res loop
   }    // real_cal_num loop
 }      // out_plane loop
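The clamp added above has the same shape at every store site: the activation bounds are broadcast into vector registers once per call, then applied with vmaxq_f16/vminq_f16 (or scalar fmax/fmin on the non-NEON path) before each result is written out, so ReLU/ReLU6 stays fused into the pooling pass instead of costing a second traversal of the output. Below is a minimal, self-contained sketch of that pattern; it assumes an ARM toolchain with the __fp16 extension and the fp16 NEON intrinsics (the same assumption ENABLE_NEON encodes in nnacl), and the helper name ClampFp16 is illustrative, not part of the nnacl API.

```c
#include <math.h>

#ifdef ENABLE_NEON
#include <arm_neon.h> /* provides float16_t plus the f16 vector intrinsics */
#else
typedef __fp16 float16_t; /* assumes a compiler with the __fp16 extension */
#endif

/* Hypothetical helper, not part of nnacl: clamp a buffer to [min, max],
 * vectorized in blocks of eight half floats with a scalar tail, mirroring
 * the broadcast-bounds-then-vmax/vmin shape used in the diff above. */
void ClampFp16(float16_t *data, int count, float16_t min, float16_t max) {
  int i = 0;
#ifdef ENABLE_NEON
  float16x8_t min_value = vdupq_n_f16(min); /* broadcast the bounds once */
  float16x8_t max_value = vdupq_n_f16(max);
  for (; i <= count - 8; i += 8) {
    float16x8_t v = vld1q_f16(data + i);
    v = vmaxq_f16(v, min_value); /* lower bound, e.g. 0 for ReLU  */
    v = vminq_f16(v, max_value); /* upper bound, e.g. 6 for ReLU6 */
    vst1q_f16(data + i, v);
  }
#endif
  for (; i < count; ++i) { /* scalar tail, and the whole non-NEON path */
    float16_t value = data[i];
    value = fmax(value, min);
    value = fmin(value, max);
    data[i] = value;
  }
}
```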
diff --git a/mindspore/lite/nnacl/fp16/pooling_fp16.h b/mindspore/lite/nnacl/fp16/pooling_fp16.h
index 8ee04680f4..5ae395f46e 100644
--- a/mindspore/lite/nnacl/fp16/pooling_fp16.h
+++ b/mindspore/lite/nnacl/fp16/pooling_fp16.h
@@ -24,9 +24,11 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-int AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, int task_id);
+int AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, int task_id,
+                   float16_t min, float16_t max);
 
-void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, int task_id);
+void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, int task_id,
+                    float16_t min, float16_t max);
 #ifdef __cplusplus
 }
 #endif
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/pad_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/pad_fp16.cc
index d1316348f6..32aeb8cb21 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/pad_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/pad_fp16.cc
@@ -28,10 +28,7 @@ using mindspore::schema::PrimitiveType_Pad;
 
 namespace mindspore::kernel {
 int PadFp16CPUKernel::RunImpl(int task_id) {
-  auto input_data = reinterpret_cast<float16_t *>(in_tensors_.at(0)->MutableData());
-  auto output_data = reinterpret_cast<float16_t *>(out_tensors_.at(0)->MutableData());
-
-  PadFp16(input_data, output_data, in_, out_, pad_param_->paddings_, task_id, context_->thread_num_);
+  PadFp16(input_, output_, in_, out_, pad_param_->paddings_, task_id, context_->thread_num_);
   return RET_OK;
 }
 
@@ -54,7 +51,13 @@ int PadFp16CPUKernel::Run() {
     return RET_ERROR;
   }
 
-  memset(output_, 0, output_tensor->ElementsNum() * sizeof(float16_t));
+  if (pad_param_->constant_value_ - 0.0f < 1e-5) {
+    memset(output_, 0, output_tensor->ElementsNum() * sizeof(float16_t));
+  } else {
+    for (int i = 0; i < output_tensor->ElementsNum(); ++i) {
+      output_[i] = pad_param_->constant_value_;
+    }
+  }
   ret = ParallelLaunch(this->context_->thread_pool_, PadImpl, this, op_parameter_->thread_num_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "BatchnormRun error error_code[" << ret << "]";
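The Run() change above stops hard-coding a zero pad: memset is only correct when constant_value_ is (near) zero, because fp16 0.0 happens to be an all-zero byte pattern, while any other pad constant has to be written element by element. Below is a minimal sketch of that pre-fill decision, assuming a __fp16-capable toolchain; the helper name FillPadOutput is hypothetical, and it spells the near-zero test with fabsf(), whereas the diff compares the signed difference constant_value_ - 0.0f directly.

```c
#include <math.h>
#include <string.h>

typedef __fp16 float16_t; /* assumes a compiler with the __fp16 extension */

/* Hypothetical helper mirroring the pre-fill above: zero takes the fast
 * memset path, every other pad constant is broadcast element by element. */
void FillPadOutput(float16_t *output, int element_num, float constant_value) {
  if (fabsf(constant_value - 0.0f) < 1e-5f) {
    memset(output, 0, element_num * sizeof(float16_t));
  } else {
    for (int i = 0; i < element_num; ++i) {
      output[i] = (float16_t)constant_value;
    }
  }
}
```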
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc
index 1a226aa853..7834f8f4d4 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc
@@ -53,10 +53,18 @@ int PoolingFp16CPUKernel::ReSize() {
 }
 
 int PoolingFp16CPUKernel::RunImpl(int task_id) {
+  float16_t minf = -FLT_MAX;
+  float16_t maxf = FLT_MAX;
+  if (pooling_param_->act_type_ == ActType_Relu) {
+    minf = 0.f;
+  } else if (pooling_param_->act_type_ == ActType_Relu6) {
+    minf = 0.f;
+    maxf = 6.f;
+  }
   if (pooling_param_->pool_mode_ == PoolMode_MaxPool) {
-    MaxPoolingFp16(fp16_input_, fp16_output_, pooling_param_, task_id);
+    MaxPoolingFp16(fp16_input_, fp16_output_, pooling_param_, task_id, minf, maxf);
   } else {
-    auto ret = AvgPoolingFp16(fp16_input_, fp16_output_, pooling_param_, task_id);
+    auto ret = AvgPoolingFp16(fp16_input_, fp16_output_, pooling_param_, task_id, minf, maxf);
     if (ret != RET_OK) {
       MS_LOG(ERROR) << "AvgPooling run failed.";
       return ret;
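For reference, the bounds the kernel feeds into MaxPoolingFp16/AvgPoolingFp16 follow directly from the fused activation: no activation leaves the window effectively unbounded, ReLU raises the floor to 0, and ReLU6 additionally caps the ceiling at 6. A small sketch of that mapping is below; the enum is trimmed to the values the diff references and the helper name is illustrative. Note that ±FLT_MAX is out of range for float16_t and saturates to ±infinity on conversion, so in the no-activation case the clamp is effectively a no-op.

```c
#include <float.h>

typedef __fp16 float16_t; /* assumes a compiler with the __fp16 extension */

/* Trimmed to the ActType values the diff references. */
typedef enum { ActType_No, ActType_Relu, ActType_Relu6 } ActType;

/* Hypothetical helper: derive the clamp window from the activation type,
 * mirroring the branch added to PoolingFp16CPUKernel::RunImpl above. */
void GetPoolingBounds(ActType act_type, float16_t *minf, float16_t *maxf) {
  *minf = -FLT_MAX; /* saturates to -inf in fp16: no effective lower bound */
  *maxf = FLT_MAX;  /* saturates to +inf in fp16: no effective upper bound */
  if (act_type == ActType_Relu) {
    *minf = 0.f;
  } else if (act_type == ActType_Relu6) {
    *minf = 0.f;
    *maxf = 6.f;
  }
}
```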