diff --git a/mindspore/lite/nnacl/fp16/pack_fp16.c b/mindspore/lite/nnacl/fp16/pack_fp16.c index 12e824dc46..7ad87da47e 100644 --- a/mindspore/lite/nnacl/fp16/pack_fp16.c +++ b/mindspore/lite/nnacl/fp16/pack_fp16.c @@ -256,6 +256,7 @@ void PackNHWCToNHWC4Fp16(const void *src, void *dst, int batch, int plane, int c void PackNHWCToNHWC8Fp16(const void *src, void *dst, int batch, int plane, int channel) { int ic8 = UP_DIV(channel, C8NUM); + int c8_channel = ic8 * C8NUM; int nhwc8_batch_unit_offset = ic8 * C8NUM * plane; int ic_remainder_ = channel % C8NUM; if (ic_remainder_ != 0) { @@ -263,8 +264,11 @@ void PackNHWCToNHWC8Fp16(const void *src, void *dst, int batch, int plane, int c for (int b = 0; b < batch; b++) { int batch_offset = b * channel * plane; for (int i = 0; i < plane; i++) { - memcpy((float16_t *)dst + nhwc8_batch_offset + i * ic8 * C8NUM, (float16_t *)src + batch_offset + i * channel, - channel * sizeof(float16_t)); + float16_t *dst_per_plane = (float16_t *)dst + nhwc8_batch_offset + i * c8_channel; + memcpy(dst_per_plane, (float16_t *)src + batch_offset + i * channel, channel * sizeof(float16_t)); + for (int j = channel; j < c8_channel; ++j) { + dst_per_plane[j] = 0; + } } nhwc8_batch_offset += nhwc8_batch_unit_offset; } diff --git a/mindspore/lite/nnacl/pack.c b/mindspore/lite/nnacl/pack.c index e142a176f5..d3bc1513fa 100644 --- a/mindspore/lite/nnacl/pack.c +++ b/mindspore/lite/nnacl/pack.c @@ -641,21 +641,38 @@ void PackInputToC8Int8(const int8_t *input_data, int16_t *packed_input, ConvPara int in_h = conv_param->input_h_; int in_w = conv_param->input_w_; int ic8 = UP_DIV(in_channel, C8NUM); + int ic8_minus = ic8 - 1; for (int b = 0; b < in_batch; b++) { int src_batch_offset = b * in_channel * in_h * in_w; int dst_batch_offset = b * ic8 * C8NUM * in_h * in_w; - for (int c = 0; c < in_channel; c++) { - int ic8_block = c / C8NUM; - int ic8_res = c % C8NUM; - int src_c_offset = src_batch_offset + c; - int dst_c_offset = dst_batch_offset + ic8_block * C8NUM * in_h * in_w + ic8_res; - for (int k = 0; k < in_w * in_h; k++) { - int src_plane_offset = src_c_offset + k * in_channel; - int dst_plane_offset = dst_c_offset + k * C8NUM; - (packed_input + dst_plane_offset)[0] = (int16_t)(input_data + src_plane_offset)[0]; - } - } + for (int k = 0; k < in_w * in_h; k++) { + int src_plane_offset = src_batch_offset + k * in_channel; + int dst_plane_offset = dst_batch_offset + k * C8NUM; + for (int i = 0; i < ic8_minus; ++i) { + int src_c_offset = src_plane_offset + i * C8NUM; + int dst_c_offset = dst_plane_offset + i * C8NUM * in_h * in_w; +#ifdef ENABLE_ARM + vst1q_s16(packed_input + dst_c_offset, vmovl_s8(vld1_s8(input_data + src_c_offset))); +#else + for (int j = 0; j < C8NUM; ++j) { + (packed_input + dst_c_offset)[j] = (int16_t)(input_data + src_c_offset)[j]; + } +#endif + } // ic8_minus loop + int tmp_ic = ic8_minus * C8NUM; + int res_c = in_channel - tmp_ic; + int tmp_ic_offset = tmp_ic * in_h * in_w; + for (int l = 0; l < res_c; ++l) { + int src_c_offset = src_plane_offset + tmp_ic + l; + int dst_c_offset = dst_plane_offset + tmp_ic_offset + l; + (packed_input + dst_c_offset)[l] = (int16_t)(input_data + src_c_offset)[l]; + } // res ic loop + for (int l = res_c; l < C8NUM; ++l) { + int dst_c_offset = dst_plane_offset + tmp_ic_offset + l; + (packed_input + dst_c_offset)[l] = 0; + } // res ic loop + } // kh * kw loop } } @@ -692,17 +709,30 @@ void PackWeightToC8Int8(const int8_t *origin_weight_data, int16_t *packed_weight void PackNHWCToNC4HW4Fp32(const void *src, void *dst, int batch, int plane, int channel) { int c4 = UP_DIV(channel, C4NUM); + int c4_minus = c4 - 1; for (int b = 0; b < batch; b++) { int src_oc_offset = b * plane * channel; int dst_oc_offset = b * plane * c4 * C4NUM; for (int k = 0; k < plane; k++) { int src_kernel_offset = src_oc_offset + k * channel; int dst_kernel_offset = dst_oc_offset + k * C4NUM; - for (int i = 0; i < channel; i++) { - int c4_block_num = i / C4NUM; - int c4_block_rem = i % C4NUM; - int src_ic_offset = src_kernel_offset + i; - int dst_ic_offset = dst_kernel_offset + c4_block_num * plane * C4NUM + c4_block_rem; + for (int j = 0; j < c4_minus; ++j) { + int src_ic_offset = src_kernel_offset + j * C4NUM; + int dst_ic_offset = dst_kernel_offset + j * plane * C4NUM; +#ifdef ENABLE_ARM + vst1q_f32((float *)dst + dst_ic_offset, vld1q_f32((float *)src + src_ic_offset)); +#else + for (int i = 0; i < C4NUM; ++i) { + ((float *)dst + dst_ic_offset)[i] = ((float *)src + src_ic_offset)[i]; + } +#endif + } + int tmp_c = c4_minus * C4NUM; + int tmp_c_offset = tmp_c * plane; + int res_c = channel - tmp_c; + for (int l = 0; l < res_c; ++l) { + int src_ic_offset = src_kernel_offset + tmp_c + l; + int dst_ic_offset = dst_kernel_offset + tmp_c_offset + l; ((float *)dst + dst_ic_offset)[0] = ((float *)src + src_ic_offset)[0]; } } @@ -956,6 +986,7 @@ void PackNHWCToC8HWN8Fp32(const void *src, void *dst, int batch, int plane, int void PackNHWCToNHWC4Int8(const void *src, void *dst, int batch, int plane, int channel) { int c4 = UP_DIV(channel, C4NUM); + int c4_channel = c4 * C4NUM; int nhwc4_batch_unit_offset = c4 * C4NUM * plane; int ic_remainder_ = channel % C4NUM; if (ic_remainder_ != 0) { @@ -963,8 +994,11 @@ void PackNHWCToNHWC4Int8(const void *src, void *dst, int batch, int plane, int c for (int b = 0; b < batch; b++) { int batch_offset = b * channel * plane; for (int i = 0; i < plane; i++) { - memcpy((int8_t *)dst + nhwc4_batch_offset + i * c4 * C4NUM, (int8_t *)src + batch_offset + i * channel, - channel); + int8_t *dst_per_plane = (int8_t *)dst + nhwc4_batch_offset + i * c4_channel; + memcpy(dst_per_plane, (int8_t *)src + batch_offset + i * channel, channel); + for (int j = channel; j < c4_channel; ++j) { + dst_per_plane[j] = 0; + } } nhwc4_batch_offset += nhwc4_batch_unit_offset; } diff --git a/mindspore/lite/nnacl/winograd_transform.c b/mindspore/lite/nnacl/winograd_transform.c index 5e5ee1ca6c..3391e01d63 100644 --- a/mindspore/lite/nnacl/winograd_transform.c +++ b/mindspore/lite/nnacl/winograd_transform.c @@ -379,50 +379,50 @@ void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4 float32x4_t dst01 = g01; float32x4_t dst02 = g02; - float32x4_t dst10 = vaddq_f32(vmulq_n_f32(g00, 0.5), vmulq_n_f32(g10, 0.5)); - dst10 = vaddq_f32(dst10, vmulq_n_f32(g20, 0.5)); - float32x4_t dst11 = vaddq_f32(vmulq_n_f32(g01, 0.5), vmulq_n_f32(g11, 0.5)); - dst11 = vaddq_f32(dst11, vmulq_n_f32(g21, 0.5)); - float32x4_t dst12 = vaddq_f32(vmulq_n_f32(g02, 0.5), vmulq_n_f32(g12, 0.5)); - dst12 = vaddq_f32(dst12, vmulq_n_f32(g22, 0.5)); - - float32x4_t dst20 = vsubq_f32(vmulq_n_f32(g00, 0.5), vmulq_n_f32(g10, 0.5)); - dst20 = vaddq_f32(dst20, vmulq_n_f32(g20, 0.5)); - float32x4_t dst21 = vsubq_f32(vmulq_n_f32(g01, 0.5), vmulq_n_f32(g11, 0.5)); - dst21 = vaddq_f32(dst21, vmulq_n_f32(g21, 0.5)); - float32x4_t dst22 = vsubq_f32(vmulq_n_f32(g02, 0.5), vmulq_n_f32(g12, 0.5)); - dst22 = vaddq_f32(dst22, vmulq_n_f32(g22, 0.5)); + float32x4_t dst10 = vaddq_f32(vaddq_f32(g00, g10), g20); + dst10 = vmulq_n_f32(dst10, 0.5); + float32x4_t dst11 = vaddq_f32(vaddq_f32(g01, g11), g21); + dst11 = vmulq_n_f32(dst11, 0.5); + float32x4_t dst12 = vaddq_f32(vaddq_f32(g02, g12), g22); + dst12 = vmulq_n_f32(dst12, 0.5); + + float32x4_t dst20 = vaddq_f32(vsubq_f32(g00, g10), g20); + dst20 = vmulq_n_f32(dst20, 0.5); + float32x4_t dst21 = vaddq_f32(vsubq_f32(g01, g11), g21); + dst21 = vmulq_n_f32(dst21, 0.5); + float32x4_t dst22 = vaddq_f32(vsubq_f32(g02, g12), g22); + dst22 = vmulq_n_f32(dst22, 0.5); float32x4_t dst30 = g20; float32x4_t dst31 = g21; float32x4_t dst32 = g22; float32x4_t m00 = dst00; - float32x4_t m01 = vaddq_f32(vmulq_n_f32(dst00, 0.5), vmulq_n_f32(dst01, 0.5)); - m01 = vaddq_f32(m01, vmulq_n_f32(dst02, 0.5)); - float32x4_t m02 = vsubq_f32(vmulq_n_f32(dst00, 0.5), vmulq_n_f32(dst01, 0.5)); - m02 = vaddq_f32(m02, vmulq_n_f32(dst02, 0.5)); + float32x4_t m01 = vaddq_f32(vaddq_f32(dst00, dst01), dst02); + m01 = vmulq_n_f32(m01, 0.5); + float32x4_t m02 = vaddq_f32(vsubq_f32(dst00, dst01), dst02); + m02 = vmulq_n_f32(m02, 0.5); float32x4_t m03 = dst02; float32x4_t m10 = dst10; - float32x4_t m11 = vaddq_f32(vmulq_n_f32(dst10, 0.5), vmulq_n_f32(dst11, 0.5)); - m11 = vaddq_f32(m11, vmulq_n_f32(dst12, 0.5)); - float32x4_t m12 = vsubq_f32(vmulq_n_f32(dst10, 0.5), vmulq_n_f32(dst11, 0.5)); - m12 = vaddq_f32(m12, vmulq_n_f32(dst12, 0.5)); + float32x4_t m11 = vaddq_f32(vaddq_f32(dst10, dst11), dst12); + m11 = vmulq_n_f32(m11, 0.5); + float32x4_t m12 = vaddq_f32(vsubq_f32(dst10, dst11), dst12); + m12 = vmulq_n_f32(m12, 0.5); float32x4_t m13 = dst12; float32x4_t m20 = dst20; - float32x4_t m21 = vaddq_f32(vmulq_n_f32(dst20, 0.5), vmulq_n_f32(dst21, 0.5)); - m21 = vaddq_f32(m21, vmulq_n_f32(dst22, 0.5)); - float32x4_t m22 = vsubq_f32(vmulq_n_f32(dst20, 0.5), vmulq_n_f32(dst21, 0.5)); - m22 = vaddq_f32(m22, vmulq_n_f32(dst22, 0.5)); + float32x4_t m21 = vaddq_f32(vaddq_f32(dst20, dst21), dst22); + m21 = vmulq_n_f32(m21, 0.5); + float32x4_t m22 = vaddq_f32(vsubq_f32(dst20, dst21), dst22); + m22 = vmulq_n_f32(m22, 0.5); float32x4_t m23 = dst22; float32x4_t m30 = dst30; - float32x4_t m31 = vaddq_f32(vmulq_n_f32(dst30, 0.5), vmulq_n_f32(dst31, 0.5)); - m31 = vaddq_f32(m31, vmulq_n_f32(dst32, 0.5)); - float32x4_t m32 = vsubq_f32(vmulq_n_f32(dst30, 0.5), vmulq_n_f32(dst31, 0.5)); - m32 = vaddq_f32(m32, vmulq_n_f32(dst32, 0.5)); + float32x4_t m31 = vaddq_f32(vaddq_f32(dst30, dst31), dst32); + m31 = vmulq_n_f32(m31, 0.5); + float32x4_t m32 = vaddq_f32(vsubq_f32(dst30, dst31), dst32); + m32 = vmulq_n_f32(m32, 0.5); float32x4_t m33 = dst32; dst_ic4_ptr[0] = m00[0]; diff --git a/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.h b/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.h index 9f54577d17..d824990f09 100644 --- a/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.h +++ b/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.h @@ -68,7 +68,7 @@ class ConvolutionBaseCPUKernel : public LiteKernel { int thread_count_; ConvParameter *conv_param_; ConvQuantArg *conv_quant_arg_; - LayoutConvertor convert_func_; + LayoutConvertor convert_func_ = nullptr; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc index 6ffa2e9fa0..33bb6cb80d 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc @@ -98,8 +98,16 @@ int Convolution3x3FP16CPUKernel::InitTmpBuffer() { int iC8 = UP_DIV(conv_param_->input_channel_, C8NUM); MS_ASSERT(ctx_->allocator != nullptr); + size_t nhwc8_input_size = + iC8 * C8NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t); + nhwc4_input_ = ctx_->allocator->Malloc(nhwc8_input_size); + if (nhwc4_input_ == nullptr) { + MS_LOG(ERROR) << "malloc nhwc4_input_ failed."; + return RET_ERROR; + } + size_t tile_buffer_size = thread_count_ * tile_num * k_plane * iC8 * C8NUM * sizeof(float16_t); - tile_buffer_ = reinterpret_cast(malloc(tile_buffer_size)); + tile_buffer_ = reinterpret_cast(ctx_->allocator->Malloc(tile_buffer_size)); if (tile_buffer_ == nullptr) { MS_LOG(ERROR) << "malloc tile_buffer_ failed."; return RET_ERROR; @@ -160,27 +168,11 @@ int Convolution3x3FP16CPUKernel::ReSize() { return ret; } - if (nhwc4_input_ != nullptr) { - free(nhwc4_input_); - nhwc4_input_ = nullptr; - } - ret = ConvolutionBaseCPUKernel::Init(); if (ret != RET_OK) { MS_LOG(ERROR) << "ConvolutionBase init failed."; return ret; } - - int iC8 = UP_DIV(conv_param_->input_channel_, C8NUM); - size_t nhwc8_input_size = - iC8 * C8NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t); - nhwc4_input_ = malloc(nhwc8_input_size); - if (nhwc4_input_ == nullptr) { - MS_LOG(ERROR) << "malloc nhwc4_input_ failed."; - return RET_ERROR; - } - memset(nhwc4_input_, 0, nhwc8_input_size); - return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h index 613e2a5931..25021e2d10 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h @@ -52,6 +52,10 @@ class Convolution3x3FP16CPUKernel : public ConvolutionBaseFP16CPUKernel { private: void FreeTmpBuffer() { + if (nhwc4_input_ != nullptr) { + ctx_->allocator->Free(nhwc4_input_); + nhwc4_input_ = nullptr; + } if (tile_buffer_ != nullptr) { ctx_->allocator->Free(tile_buffer_); tile_buffer_ = nullptr; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc index f8203a7951..1237d90b34 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc @@ -105,6 +105,15 @@ int ConvolutionSWFP16CPUKernel::InitTmpBuffer() { int out_channel = conv_param_->output_channel_; int oc4 = UP_DIV(out_channel, C4NUM); + int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM); + size_t nhwc4_input_size = + ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t); + nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size); + if (nhwc4_input_ == nullptr) { + MS_LOG(ERROR) << "malloc nhwc4_input_ failed."; + return RET_ERROR; + } + tmp_output_block_ = reinterpret_cast(ctx_->allocator->Malloc( conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * oc4 * C4NUM * sizeof(float16_t))); if (tmp_output_block_ == nullptr) { @@ -145,10 +154,6 @@ int ConvolutionSWFP16CPUKernel::ReSize() { return ret; } - if (nhwc4_input_ != nullptr) { - free(nhwc4_input_); - nhwc4_input_ = nullptr; - } if (slidingWindow_param_ != nullptr) { delete slidingWindow_param_; slidingWindow_param_ = nullptr; @@ -160,16 +165,6 @@ int ConvolutionSWFP16CPUKernel::ReSize() { return ret; } - int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM); - size_t nhwc4_input_size = - ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t); - nhwc4_input_ = malloc(nhwc4_input_size); - if (nhwc4_input_ == nullptr) { - MS_LOG(ERROR) << "malloc nhwc4_input_ failed."; - return RET_ERROR; - } - memset(nhwc4_input_, 0, nhwc4_input_size); - // init sliding window param slidingWindow_param_ = new (std::nothrow) SlidingWindowParam; if (slidingWindow_param_ == nullptr) { diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.h index d1ed0f6300..c35d46eb31 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.h @@ -54,6 +54,10 @@ class ConvolutionSWFP16CPUKernel : public ConvolutionBaseFP16CPUKernel { private: void FreeTmpBuffer() { + if (nhwc4_input_ != nullptr) { + ctx_->allocator->Free(nhwc4_input_); + nhwc4_input_ = nullptr; + } if (tmp_output_block_ != nullptr) { ctx_->allocator->Free(tmp_output_block_); tmp_output_block_ = nullptr; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc index b15296b9c7..14d8e73a00 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc @@ -236,6 +236,14 @@ int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() { int oc8 = UP_DIV(channel_out, C8NUM); int ic8 = UP_DIV(conv_param_->input_channel_, C8NUM); + size_t nhwc8_input_size = + ic8 * C8NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t); + nhwc4_input_ = ctx_->allocator->Malloc(nhwc8_input_size); + if (nhwc4_input_ == nullptr) { + MS_LOG(ERROR) << "malloc nhwc4_input_ failed."; + return RET_ERROR; + } + size_t tile_buffer_size = thread_count_ * cal_num * input_unit_ * input_unit_ * ic8 * C8NUM * sizeof(float16_t); trans_input_ = reinterpret_cast(ctx_->allocator->Malloc(tile_buffer_size)); if (trans_input_ == nullptr) { @@ -303,11 +311,6 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() { return ret; } - if (nhwc4_input_ != nullptr) { - free(nhwc4_input_); - nhwc4_input_ = nullptr; - } - ret = ConvolutionBaseCPUKernel::Init(); if (ret != RET_OK) { MS_LOG(ERROR) << "ConvolutionBase init failed."; @@ -318,17 +321,6 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() { conv_param_->input_unit_ = input_unit_; conv_param_->output_unit_ = output_unit_; - int channel_in = conv_param_->input_channel_; - int ic8 = UP_DIV(channel_in, C8NUM); - size_t nhwc8_input_size = - ic8 * C8NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t); - nhwc4_input_ = malloc(nhwc8_input_size); - if (nhwc4_input_ == nullptr) { - MS_LOG(ERROR) << "malloc nhwc4_input_ failed."; - return RET_ERROR; - } - memset(nhwc4_input_, 0, nhwc8_input_size); - ret = ConfigInputOutput(); if (ret != RET_OK) { MS_LOG(ERROR) << "ConfigInputOutput failed."; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h index db576525d9..f025450d60 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h @@ -59,6 +59,10 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel { private: void FreeTmpBuffer() { + if (nhwc4_input_ != nullptr) { + ctx_->allocator->Free(nhwc4_input_); + nhwc4_input_ = nullptr; + } if (trans_input_ != nullptr) { ctx_->allocator->Free(trans_input_); trans_input_ = nullptr; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc index 2186311679..ce8c9392d9 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc @@ -100,6 +100,14 @@ int Convolution3x3CPUKernel::InitTmpBuffer() { #else int tile_num = 12; #endif + size_t nhwc4_input_size = + ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float); + nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size); + if (nhwc4_input_ == nullptr) { + MS_LOG(ERROR) << "malloc nhwc4_input_ failed."; + return RET_ERROR; + } + size_t tile_buffer_size = thread_count_ * tile_num * C16NUM * ic4 * C4NUM * sizeof(float); tile_buffer_ = reinterpret_cast(ctx_->allocator->Malloc(tile_buffer_size)); if (tile_buffer_ == nullptr) { @@ -174,27 +182,11 @@ int Convolution3x3CPUKernel::ReSize() { return ret; } - if (nhwc4_input_ != nullptr) { - free(nhwc4_input_); - nhwc4_input_ = nullptr; - } - ret = ConvolutionBaseCPUKernel::Init(); if (ret != RET_OK) { MS_LOG(ERROR) << "ConvolutionBase init failed.ret: " << ret; return RET_ERROR; } - - int iC4 = UP_DIV(conv_param_->input_channel_, C4NUM); - size_t nhwc4_input_size = - iC4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float); - nhwc4_input_ = malloc(nhwc4_input_size); - if (nhwc4_input_ == nullptr) { - MS_LOG(ERROR) << "malloc nhwc4_input_ failed."; - return RET_ERROR; - } - memset(nhwc4_input_, 0, nhwc4_input_size); - return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h index 07ac6dce03..198bb11746 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h @@ -45,6 +45,10 @@ class Convolution3x3CPUKernel : public ConvolutionBaseCPUKernel { private: void FreeTmpBuffer() { + if (nhwc4_input_ != nullptr) { + ctx_->allocator->Free(nhwc4_input_); + nhwc4_input_ = nullptr; + } if (tile_buffer_ != nullptr) { ctx_->allocator->Free(tile_buffer_); tile_buffer_ = nullptr; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.cc index 0e037c4b6c..975061fd20 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.cc @@ -79,6 +79,14 @@ int ConvolutionSWCPUKernel::InitTmpBuffer() { int out_channel = conv_param_->output_channel_; int oc4 = UP_DIV(out_channel, C4NUM); MS_ASSERT(ctx_->allocator != nullptr); + int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM); + size_t nhwc4_input_size = + ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float); + nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size); + if (nhwc4_input_ == nullptr) { + MS_LOG(ERROR) << "malloc nhwc4 input failed."; + return RET_ERROR; + } tmp_output_block_ = reinterpret_cast(ctx_->allocator->Malloc( conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * oc4 * C4NUM * sizeof(float))); @@ -116,10 +124,6 @@ int ConvolutionSWCPUKernel::ReSize() { return ret; } - if (nhwc4_input_ != nullptr) { - free(nhwc4_input_); - nhwc4_input_ = nullptr; - } if (slidingWindow_param_ != nullptr) { delete slidingWindow_param_; slidingWindow_param_ = nullptr; @@ -131,16 +135,6 @@ int ConvolutionSWCPUKernel::ReSize() { return RET_ERROR; } - int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM); - size_t nhwc4_input_size = - ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float); - nhwc4_input_ = malloc(nhwc4_input_size); - if (nhwc4_input_ == nullptr) { - MS_LOG(ERROR) << "malloc nhwc4 input failed."; - return RET_ERROR; - } - memset(nhwc4_input_, 0, nhwc4_input_size); - // init sliding window param slidingWindow_param_ = new (std::nothrow) SlidingWindowParam; if (slidingWindow_param_ == nullptr) { diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.h index d2421f224d..d23a51732e 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.h @@ -53,6 +53,10 @@ class ConvolutionSWCPUKernel : public ConvolutionBaseCPUKernel { private: void FreeTmpBuffer() { + if (nhwc4_input_ != nullptr) { + ctx_->allocator->Free(nhwc4_input_); + nhwc4_input_ = nullptr; + } if (tmp_output_block_ != nullptr) { ctx_->allocator->Free(tmp_output_block_); tmp_output_block_ = nullptr; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc index d51c6dcea5..98caadc6be 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc @@ -157,6 +157,14 @@ int ConvolutionWinogradCPUKernel::InitTmpBuffer() { #endif MS_ASSERT(ctx_->allocator != nullptr); + size_t nhwc4_input_size = + ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float); + nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size); + if (nhwc4_input_ == nullptr) { + MS_LOG(ERROR) << "malloc nhwc4_input_ failed."; + return RET_ERROR; + } + size_t tile_buffer_size = thread_count_ * tile_num * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float); trans_input_ = reinterpret_cast(ctx_->allocator->Malloc(tile_buffer_size)); if (trans_input_ == nullptr) { @@ -249,11 +257,6 @@ int ConvolutionWinogradCPUKernel::ReSize() { return ret; } - if (nhwc4_input_ != nullptr) { - free(nhwc4_input_); - nhwc4_input_ = nullptr; - } - ret = ConvolutionBaseCPUKernel::Init(); if (ret != RET_OK) { MS_LOG(ERROR) << "ConvolutionBase init failed."; @@ -265,16 +268,6 @@ int ConvolutionWinogradCPUKernel::ReSize() { conv_param_->input_unit_ = input_unit_; conv_param_->output_unit_ = output_unit_; - int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM); - size_t nhwc4_input_size = - ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float); - nhwc4_input_ = malloc(nhwc4_input_size); - if (nhwc4_input_ == nullptr) { - MS_LOG(ERROR) << "malloc nhwc4_input_ failed."; - return RET_ERROR; - } - memset(nhwc4_input_, 0, nhwc4_input_size); - ret = ConfigInputOutput(); if (ret != RET_OK) { MS_LOG(ERROR) << "ConfigInputOutput failed."; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h index 3c164cb43b..61c7a1f118 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h @@ -50,6 +50,10 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel { private: void FreeTmpBuffer() { + if (nhwc4_input_ != nullptr) { + ctx_->allocator->Free(nhwc4_input_); + nhwc4_input_ = nullptr; + } if (trans_input_ != nullptr) { ctx_->allocator->Free(trans_input_); trans_input_ = nullptr; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_3x3_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_3x3_int8.cc index 16460af883..53642ac863 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_3x3_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_3x3_int8.cc @@ -44,6 +44,10 @@ void ProcessFilterUint8(int8_t *origin_weight, int16_t *dst_weight, ConvParamete } void Convolution3x3Int8CPUKernel::FreeTmpBuffer() { + if (input_data_ != nullptr) { + ctx_->allocator->Free(input_data_); + input_data_ = nullptr; + } if (tile_buffer_ != nullptr) { ctx_->allocator->Free(tile_buffer_); tile_buffer_ = nullptr; @@ -67,10 +71,6 @@ Convolution3x3Int8CPUKernel::~Convolution3x3Int8CPUKernel() { free(transformed_filter_addr_); transformed_filter_addr_ = nullptr; } - if (input_data_ != nullptr) { - free(input_data_); - input_data_ = nullptr; - } FreeQuantParam(); } @@ -118,6 +118,14 @@ int Convolution3x3Int8CPUKernel::InitTmpBuffer() { int ic8 = UP_DIV(conv_param_->input_channel_, C8NUM); MS_ASSERT(ctx_->allocator != nullptr); + size_t c8_input_size = + conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * ic8 * C8NUM * sizeof(int16_t); + input_data_ = reinterpret_cast(ctx_->allocator->Malloc(c8_input_size)); + if (input_data_ == nullptr) { + MS_LOG(ERROR) << "malloc input_data_ failed."; + return RET_ERROR; + } + size_t tile_buffer_size = thread_count_ * TILE_NUM * C16NUM * ic8 * C8NUM * sizeof(int16_t); tile_buffer_ = reinterpret_cast(ctx_->allocator->Malloc(tile_buffer_size)); if (tile_buffer_ == nullptr) { @@ -179,27 +187,11 @@ int Convolution3x3Int8CPUKernel::ReSize() { return ret; } - if (input_data_ != nullptr) { - free(input_data_); - input_data_ = nullptr; - } - ret = ConvolutionBaseCPUKernel::Init(); if (ret != RET_OK) { MS_LOG(ERROR) << "ConvolutionBase init failed."; return RET_ERROR; } - - int ic8 = UP_DIV(conv_param_->input_channel_, C8NUM); - size_t c8_input_size = - conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * ic8 * C8NUM * sizeof(int16_t); - input_data_ = reinterpret_cast(malloc(c8_input_size)); - if (input_data_ == nullptr) { - MS_LOG(ERROR) << "malloc input_data_ failed."; - return RET_ERROR; - } - memset(input_data_, 0, c8_input_size); - return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc index 0440309b52..3b4bf822e3 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc @@ -132,6 +132,26 @@ int ConvolutionInt8CPUKernel::InitWeightBias() { int ConvolutionInt8CPUKernel::InitTmpBuffer() { MS_ASSERT(ctx_->allocator != nullptr); + int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM); + int output_count = conv_param_->output_h_ * conv_param_->output_w_; + int output_tile_count = UP_DIV(output_count, tile_num_); + int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_; + int plane_c4 = UP_DIV(kernel_plane, C4NUM); + int unit_size = plane_c4 * C4NUM * ic4 * C4NUM; + int packed_input_size = output_tile_count * tile_num_ * unit_size; + packed_input_ = reinterpret_cast(ctx_->allocator->Malloc(conv_param_->input_batch_ * packed_input_size)); + if (packed_input_ == nullptr) { + MS_LOG(ERROR) << "malloc packed_input_ failed."; + return RET_ERROR; + } + + size_t nhwc4_input_size = ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_; + nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size); + if (nhwc4_input_ == nullptr) { + MS_LOG(ERROR) << "malloc nhwc4 input failed."; + return RET_ERROR; + } + size_t tmp_dst_size = thread_count_ * tile_num_ * conv_param_->output_channel_ * sizeof(int32_t); tmp_dst_ = reinterpret_cast(ctx_->allocator->Malloc(tmp_dst_size)); if (tmp_dst_ == nullptr) { @@ -219,6 +239,14 @@ int ConvolutionInt8CPUKernel::InitWeightBiasOpt() { int ConvolutionInt8CPUKernel::InitTmpBufferOpt() { MS_ASSERT(ctx_->allocator != nullptr); + int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM); + size_t nhwc4_input_size = ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_; + nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size); + if (nhwc4_input_ == nullptr) { + MS_LOG(ERROR) << "malloc nhwc4 input failed."; + return RET_ERROR; + } + size_t tmp_dst_size = thread_count_ * tile_num_ * conv_param_->output_channel_ * sizeof(int32_t); tmp_dst_ = reinterpret_cast(ctx_->allocator->Malloc(tmp_dst_size)); if (tmp_dst_ == nullptr) { @@ -238,12 +266,6 @@ int ConvolutionInt8CPUKernel::InitTmpBufferOpt() { void ConvolutionInt8CPUKernel::ConfigInputOutput() { auto output_tensor = out_tensors_.at(kOutputIndex); output_tensor->SetFormat(schema::Format::Format_NHWC); - auto input_tensor = in_tensors_.at(kInputIndex); - auto ret = CheckLayout(input_tensor); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Check layout failed."; - return; - } } int ConvolutionInt8CPUKernel::Init() { @@ -283,43 +305,11 @@ int ConvolutionInt8CPUKernel::ReSize() { return ret; } - if (nhwc4_input_ != nullptr) { - free(nhwc4_input_); - nhwc4_input_ = nullptr; - } - if (packed_input_ != nullptr) { - free(packed_input_); - packed_input_ = nullptr; - } - ret = ConvolutionBaseCPUKernel::Init(); if (ret != RET_OK) { MS_LOG(ERROR) << "ConvolutionBase init failed."; return RET_ERROR; } - - int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM); - size_t nhwc4_input_size = ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_; - nhwc4_input_ = malloc(nhwc4_input_size); - if (nhwc4_input_ == nullptr) { - MS_LOG(ERROR) << "malloc nhwc4 input failed."; - return RET_ERROR; - } - memset(nhwc4_input_, 0, nhwc4_input_size); - - int output_count = conv_param_->output_h_ * conv_param_->output_w_; - int output_tile_count = UP_DIV(output_count, tile_num_); - int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_; - int plane_c4 = UP_DIV(kernel_plane, C4NUM); - int unit_size = plane_c4 * C4NUM * ic4 * C4NUM; - int packed_input_size = output_tile_count * tile_num_ * unit_size; - packed_input_ = reinterpret_cast(malloc(conv_param_->input_batch_ * packed_input_size)); - if (packed_input_ == nullptr) { - MS_LOG(ERROR) << "malloc packed_input_ failed."; - return RET_ERROR; - } - memset(packed_input_, 0, conv_param_->input_batch_ * packed_input_size); - return RET_OK; } @@ -353,25 +343,17 @@ int ConvolutionInt8CPUKernel::Run() { MS_LOG(ERROR) << "Prepare failed."; return RET_ERROR; } - if (support_optimize_) { - ret = InitTmpBufferOpt(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Init tmp buffer failed."; - return RET_ERROR; - } - } else { - // init tmp input, output - ret = InitTmpBuffer(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Init tmp buffer failed."; - return RET_ERROR; - } + // init tmp input, output + ret = InitTmpBuffer(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Init tmp buffer failed."; + return RET_ERROR; } auto input_tensor = in_tensors_.at(kInputIndex); auto ori_input_data = input_tensor->MutableData(); - convert_func_(ori_input_data, nhwc4_input_, conv_param_->input_batch_, conv_param_->input_h_ * conv_param_->input_w_, - conv_param_->input_channel_); + PackNHWCToNHWC4Int8(ori_input_data, nhwc4_input_, conv_param_->input_batch_, + conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ConvolutionInt8Impl, this, thread_count_); if (error_code != RET_OK) { diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h index 8970943929..147f5c5854 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h @@ -36,10 +36,6 @@ class ConvolutionInt8CPUKernel : public ConvolutionBaseCPUKernel { free(packed_weight_); packed_weight_ = nullptr; } - if (packed_input_ != nullptr) { - free(packed_input_); - packed_input_ = nullptr; - } if (input_sum_ != nullptr) { free(input_sum_); input_sum_ = nullptr; @@ -59,6 +55,14 @@ class ConvolutionInt8CPUKernel : public ConvolutionBaseCPUKernel { private: void FreeTmpBuffer() { + if (nhwc4_input_ != nullptr) { + ctx_->allocator->Free(nhwc4_input_); + nhwc4_input_ = nullptr; + } + if (packed_input_ != nullptr) { + ctx_->allocator->Free(packed_input_); + packed_input_ = nullptr; + } if (tmp_dst_ != nullptr) { ctx_->allocator->Free(tmp_dst_); tmp_dst_ = nullptr;