From 88a44a0ef0faacb1e57a7cf1d97c99e2abcc8df6 Mon Sep 17 00:00:00 2001 From: yangruoqi713 Date: Mon, 3 Aug 2020 16:33:57 +0800 Subject: [PATCH] optimize arm cpu op: conv_depthwise, deconv_depthwise --- .../arm/fp16/convolution_depthwise_fp16.cc | 1 - .../kernel/arm/fp32/convolution_depthwise.cc | 106 ++++++++++------- .../kernel/arm/fp32/convolution_depthwise.h | 7 +- .../arm/fp32/deconvolution_depthwise.cc | 109 +++++++++++------- .../kernel/arm/fp32/deconvolution_depthwise.h | 14 ++- .../arm/int8/convolution_depthwise_int8.cc | 72 ++++++++---- .../arm/int8/convolution_depthwise_int8.h | 1 + .../arm/int8/deconvolution_depthwise_int8.cc | 92 ++++++++++----- .../arm/int8/deconvolution_depthwise_int8.h | 1 + .../arm/opclib/fp16/conv_depthwise_fp16.cc | 7 +- .../kernel/arm/opclib/fp32/conv_depthwise.cc | 10 +- .../arm/opclib/int8/conv_depthwise_int8.cc | 4 +- 12 files changed, 276 insertions(+), 148 deletions(-) diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc index 6b00c60b59..dc8a70e3d8 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc @@ -46,7 +46,6 @@ int ConvolutionDepthwiseFp16CPUKernel::InitBuffer() { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; } - memset(packed_output_, 0, pack_output_size * sizeof(float16_t)); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc index 75856bfba3..077cbd812d 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc @@ -27,27 +27,7 @@ using mindspore::lite::RET_OK; using mindspore::schema::PrimitiveType_DepthwiseConv2D; namespace mindspore::kernel { -int ConvolutionDepthwiseCPUKernel::Init() { - // conv base init - ConvolutionBaseCPUKernel::Init(); - - // init sliding window param - sliding_ = new SlidingWindowParam; - InitSlidingParam(sliding_, conv_param_, C4NUM); - - // pack input function: convert_func_ - auto input_tensor = inputs_[kInputIndex]; - auto data_type = input_tensor->data_type(); - auto input_format = input_tensor->GetFormat(); - schema::Format execute_format = schema::Format_NHWC4; - if (input_format != execute_format) { - convert_func_ = LayoutTransform(data_type, input_format, execute_format); - if (convert_func_ == nullptr) { - MS_LOG(ERROR) << "layout convert func is nullptr."; - return RET_ERROR; - } - } - +int ConvolutionDepthwiseCPUKernel::InitWeightBias() { // init weight: o, h, w, i; o == group, i == 1 auto weight_tensor = inputs_[kWeightIndex]; auto origin_weight = reinterpret_cast(weight_tensor->Data()); @@ -55,42 +35,93 @@ int ConvolutionDepthwiseCPUKernel::Init() { int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_; packed_weight_ = reinterpret_cast(malloc(pack_weight_size * sizeof(float))); + if (packed_weight_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } memset(packed_weight_, 0, pack_weight_size * sizeof(float)); PackNCHWToNC4HW4Fp32(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_, conv_param_->output_channel_); // init bias bias_data_ = reinterpret_cast(malloc(C4NUM * OC4 * sizeof(float))); + if (bias_data_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } memset(bias_data_, 0, C4NUM * OC4 * sizeof(float)); if (inputs_.size() == kInputSize2) { auto ori_bias = reinterpret_cast(inputs_.at(kBiasIndex)->Data()); memcpy(bias_data_, ori_bias, conv_param_->output_channel_ * sizeof(float)); - } else { - MS_ASSERT(inputs_.size() == kInputSize1); } // init threadNum; conv_param_->thread_num_ = MSMIN(thread_count_, OC4); - ReSize(); return RET_OK; } -int ConvolutionDepthwiseCPUKernel::ReSize() { - // malloc pack input buffer - if (convert_func_ != nullptr) { +int ConvolutionDepthwiseCPUKernel::InitBuffer() { + // malloc pack input and output buffer + if (conv_param_->input_channel_ % C4NUM != 0) { + need_align_ = true; int IC4 = UP_DIV(conv_param_->input_channel_, C4NUM); int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * IC4; packed_input_ = reinterpret_cast(malloc(pack_input_size * sizeof(float))); + if (packed_input_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } memset(packed_input_, 0, pack_input_size * sizeof(float)); - } - // malloc tmp output buffer - if (conv_param_->output_channel_ % C4NUM != 0) { - need_align_ = true; int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM); int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * OC4; packed_output_ = reinterpret_cast(malloc(pack_output_size * sizeof(float))); - memset(packed_output_, 0, pack_output_size * sizeof(float)); + if (packed_output_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } + } + return RET_OK; +} + +int ConvolutionDepthwiseCPUKernel::Init() { + // conv base init + ConvolutionBaseCPUKernel::Init(); + + // init sliding window param + sliding_ = new SlidingWindowParam; + InitSlidingParam(sliding_, conv_param_, C4NUM); + + auto ret = InitWeightBias(); + if (ret != 0) { + MS_LOG(ERROR) << "Convolution depthwise fp32 InitWeightBias failed."; + return RET_ERROR; + } + + ret = InitBuffer(); + if (ret != 0) { + MS_LOG(ERROR) << "Convolution depthwise fp32 InitBuffer failed."; + return RET_ERROR; + } + return RET_OK; +} + +int ConvolutionDepthwiseCPUKernel::ReSize() { + if (need_align_) { + free(packed_input_); + free(packed_output_); + } + // conv base init + ConvolutionBaseCPUKernel::Init(); + + // init sliding window param + sliding_ = new SlidingWindowParam; + InitSlidingParam(sliding_, conv_param_, C4NUM); + + auto ret = InitBuffer(); + if (ret != 0) { + MS_LOG(ERROR) << "Convolution depthwise fp32 InitBuffer failed."; + return RET_ERROR; } return RET_OK; } @@ -120,15 +151,14 @@ int ConvolutionDepthwiseCPUKernel::Run() { auto input_addr = reinterpret_cast(input_tensor->Data()); // pack input: to nhwc4 - if (convert_func_ != nullptr) { - convert_func_(input_addr, packed_input_, conv_param_->input_batch_, conv_param_->input_h_ * conv_param_->input_w_, - conv_param_->input_channel_); + if (need_align_) { + PackNHWCToNHWC4Fp32(input_addr, packed_input_, conv_param_->input_batch_, + conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); } else { packed_input_ = input_addr; } - output_addr = reinterpret_cast(outputs_.at(kOutputIndex)->Data()); - memset(output_addr, 0, outputs_.at(kOutputIndex)->ElementsNum() * sizeof(float)); + auto output_addr = reinterpret_cast(outputs_.at(kOutputIndex)->Data()); if (!need_align_) { packed_output_ = output_addr; } @@ -146,7 +176,6 @@ int ConvolutionDepthwiseCPUKernel::Run() { return RET_OK; } - kernel::LiteKernel *CpuConvDwFp32KernelCreator(const std::vector &inputs, const std::vector &outputs, OpParameter *opParameter, const Context *ctx, @@ -170,4 +199,3 @@ kernel::LiteKernel *CpuConvDwFp32KernelCreator(const std::vectordata_type(); - auto input_format = input_tensor->GetFormat(); - schema::Format execute_format = schema::Format_NHWC4; - if (input_format != execute_format) { - convert_func_ = LayoutTransform(data_type, input_format, execute_format); - if (convert_func_ == nullptr) { - MS_LOG(ERROR) << "layout convert func is nullptr."; - return RET_ERROR; - } - } - +int DeconvolutionDepthwiseCPUKernel::InitWeightBias() { // init weight: o, h, w, i; o == group, i == 1 auto weight_tensor = inputs_[kWeightIndex]; auto origin_weight = reinterpret_cast(weight_tensor->Data()); @@ -68,55 +51,102 @@ int DeconvolutionDepthwiseCPUKernel::Init() { int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_; packed_weight_ = reinterpret_cast(malloc(pack_weight_size * sizeof(float))); + if (packed_weight_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } memset(packed_weight_, 0, pack_weight_size * sizeof(float)); PackNCHWToNC4HW4Fp32(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_, conv_param_->output_channel_); // init bias bias_data_ = reinterpret_cast(malloc(C4NUM * OC4 * sizeof(float))); + if (bias_data_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } memset(bias_data_, 0, C4NUM * OC4 * sizeof(float)); if (inputs_.size() == kInputSize2) { auto ori_bias = reinterpret_cast(inputs_.at(kBiasIndex)->Data()); memcpy(bias_data_, ori_bias, conv_param_->output_channel_ * sizeof(float)); - } else { - MS_ASSERT(inputs_.size() == kInputSize1); } // init threadNum; conv_param_->thread_num_ = MSMIN(conv_param_->thread_num_, OC4); - ReSize(); return RET_OK; } -int DeconvolutionDepthwiseCPUKernel::ReSize() { - // malloc pack input buffer - if (convert_func_ != nullptr) { +int DeconvolutionDepthwiseCPUKernel::InitBuffer() { + // malloc pack input and output buffer + if (conv_param_->input_channel_ % C4NUM != 0) { + need_align_ = true; int IC4 = UP_DIV(conv_param_->input_channel_, C4NUM); int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * IC4; packed_input_ = reinterpret_cast(malloc(pack_input_size * sizeof(float))); + if (packed_input_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } memset(packed_input_, 0, pack_input_size * sizeof(float)); - } - // malloc tmp output buffer - if (conv_param_->output_channel_ % C4NUM != 0) { - need_pack_ = true; int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM); int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * OC4; packed_output_ = reinterpret_cast(malloc(pack_output_size * sizeof(float))); + if (packed_output_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } memset(packed_output_, 0, pack_output_size * sizeof(float)); } return RET_OK; } -int DeconvolutionDepthwiseCPUKernel::DoExcute(int task_id) { +int DeconvolutionDepthwiseCPUKernel::Init() { + InitSlideParam(); + // conv base init + ConvolutionBaseCPUKernel::Init(); + + auto ret = InitWeightBias(); + if (ret != 0) { + MS_LOG(ERROR) << "Deconvolution depthwise fp32 InitWeightBias failed."; + return RET_ERROR; + } + + ret = InitBuffer(); + if (ret != 0) { + MS_LOG(ERROR) << "Deconvolution depthwise fp32 InitBuffer failed."; + return RET_ERROR; + } + return RET_OK; +} + +int DeconvolutionDepthwiseCPUKernel::ReSize() { + if (need_align_) { + free(packed_input_); + free(packed_output_); + } + InitSlideParam(); + + // conv base init + ConvolutionBaseCPUKernel::Init(); + + auto ret = InitBuffer(); + if (ret != 0) { + MS_LOG(ERROR) << "Deconvolution depthwise fp32 InitBuffer failed."; + return RET_ERROR; + } + return RET_OK; +} + +int DeconvolutionDepthwiseCPUKernel::Execute(int task_id) { DeconvDwC4Fp32(packed_output_, packed_input_, packed_weight_, reinterpret_cast(bias_data_), conv_param_, sliding_, task_id); return RET_OK; } int DeconvDwRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { - auto conv_dw = reinterpret_cast(cdata); - auto ret = conv_dw->DoExcute(task_id); + auto deconv_dw = reinterpret_cast(cdata); + auto ret = deconv_dw->Execute(task_id); if (ret != RET_OK) { MS_LOG(ERROR) << "DeconvolutionDepthwiseRun error task_id[" << task_id << "] error_code[" << ret << "]"; return RET_ERROR; @@ -133,26 +163,26 @@ int DeconvolutionDepthwiseCPUKernel::Run() { auto input_addr = reinterpret_cast(input_tensor->Data()); // pack input: to nhwc4 - if (convert_func_ != nullptr) { - convert_func_(input_addr, packed_input_, conv_param_->input_batch_, conv_param_->input_h_ * conv_param_->input_w_, - conv_param_->input_channel_); + if (need_align_) { + PackNHWCToNHWC4Fp32(input_addr, packed_input_, conv_param_->input_batch_, + conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); } else { packed_input_ = input_addr; } - output_addr = reinterpret_cast(outputs_.at(kOutputIndex)->Data()); - memset(output_addr, 0, outputs_.at(kOutputIndex)->ElementsNum() * sizeof(float)); - if (!need_pack_) { + auto output_addr = reinterpret_cast(outputs_.at(kOutputIndex)->Data()); + if (!need_align_) { + memset(output_addr, 0, outputs_.at(kOutputIndex)->ElementsNum() * sizeof(float)); packed_output_ = output_addr; } auto ret = LiteBackendParallelLaunch(DeconvDwRun, this, conv_param_->thread_num_); if (ret != RET_OK) { - MS_LOG(ERROR) << "ConvDwRun error: error_code[" << ret << "]"; + MS_LOG(ERROR) << "DeconvDwRun error: error_code[" << ret << "]"; return RET_ERROR; } - if (need_pack_) { + if (need_align_) { PackNHWC4ToNHWCFp32(packed_output_, output_addr, conv_param_->output_batch_, conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); } @@ -182,4 +212,3 @@ kernel::LiteKernel *CpuDeconvDwFp32KernelCreator(const std::vectoroutput_channel_, C4NUM); int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_; packed_weight_ = reinterpret_cast(malloc(pack_weight_size * sizeof(int16_t))); + if (packed_weight_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } memset(packed_weight_, 0, pack_weight_size * sizeof(int16_t)); PackDepthwiseInt8Weight(origin_weight, packed_weight_, conv_param_); // init bias, add output zp bias_data_ = reinterpret_cast(malloc(C4NUM * OC4 * sizeof(int32_t))); + if (bias_data_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } memset(bias_data_, 0, C4NUM * OC4 * sizeof(int32_t)); if (inputs_.size() == kInputSize2) { auto ori_bias = reinterpret_cast(inputs_.at(kBiasIndex)->Data()); @@ -48,6 +56,30 @@ int ConvolutionDepthwiseInt8CPUKernel::InitWeightBias() { return RET_OK; } +int ConvolutionDepthwiseInt8CPUKernel::InitBuffer() { + // malloc packed input buffer + int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * + UP_DIV(conv_param_->input_channel_, 4); + packed_input_ = reinterpret_cast(malloc(pack_input_size * sizeof(int16_t))); + memset(packed_input_, 0, pack_input_size * sizeof(int16_t)); + if (packed_input_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } + + if (conv_param_->input_channel_ % C4NUM != 0) { + need_align_ = true; + int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * + UP_DIV(conv_param_->output_channel_, C4NUM); + packed_output_ = reinterpret_cast(malloc(pack_output_size * sizeof(int8_t))); + if (packed_input_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } + } + return RET_OK; +} + int ConvolutionDepthwiseInt8CPUKernel::Init() { // conv base init ConvolutionBaseCPUKernel::Init(); @@ -66,7 +98,7 @@ int ConvolutionDepthwiseInt8CPUKernel::Init() { return ret; } - ret = ReSize(); + ret = InitBuffer(); if (ret != RET_OK) { MS_LOG(ERROR) << "Depthwise int8 ReSize error!"; return ret; @@ -75,26 +107,23 @@ int ConvolutionDepthwiseInt8CPUKernel::Init() { } int ConvolutionDepthwiseInt8CPUKernel::ReSize() { - // malloc packed input buffer - int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * - UP_DIV(conv_param_->input_channel_, 4); - packed_input_ = reinterpret_cast(malloc(pack_input_size * sizeof(int16_t))); - memset(packed_input_, 0, pack_input_size * sizeof(int16_t)); - if (packed_input_ == nullptr) { - MS_LOG(ERROR) << "Malloc buffer failed."; - return RET_ERROR; + free(packed_input_); + if (need_align_) { + free(packed_output_); } + // conv base init + ConvolutionBaseCPUKernel::Init(); - if (conv_param_->input_channel_ % C4NUM != 0) { - need_align_ = true; - int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * - (conv_param_->output_channel_, C4NUM); - packed_output_ = reinterpret_cast(malloc(pack_output_size * sizeof(int8_t))); - if (packed_input_ == nullptr) { - MS_LOG(ERROR) << "Malloc buffer failed."; - return RET_ERROR; - } - memset(packed_output_, 0, pack_output_size * sizeof(int8_t)); + // init sliding window param + InitSlidingParam(sliding, conv_param_, C4NUM); + + // init quant param + ConvolutionBaseCPUKernel::SetQuantParam(); + + auto ret = InitBuffer(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Depthwise int8 ReSize error!"; + return ret; } return RET_OK; } @@ -106,8 +135,8 @@ int ConvolutionDepthwiseInt8CPUKernel::Execute(int task_id) { } int ConvDwInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { - auto conv_dw = reinterpret_cast(cdata); - auto ret = conv_dw->Execute(task_id); + auto conv_dw_int8 = reinterpret_cast(cdata); + auto ret = conv_dw_int8->Execute(task_id); if (ret != RET_OK) { MS_LOG(ERROR) << "ConvolutionDepthwiseInt8Run error task_id[" << task_id << "] error_code[" << ret << "]"; return RET_ERROR; @@ -127,7 +156,6 @@ int ConvolutionDepthwiseInt8CPUKernel::Run() { PackDepthwiseInt8Input(input_addr, packed_input_, conv_param_); auto output_addr = reinterpret_cast(outputs_.at(kOutputIndex)->Data()); - memset(output_addr, 0, outputs_.at(kOutputIndex)->ElementsNum() * sizeof(int8_t)); if (!need_align_) { packed_output_ = output_addr; } diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.h index 2e9ad6fd39..5e5f687006 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.h @@ -42,6 +42,7 @@ class ConvolutionDepthwiseInt8CPUKernel : public ConvolutionBaseCPUKernel { int Run() override; int InitWeightBias(); + int InitBuffer(); int Execute(int task_id); private: diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc index 52a0b1ffde..b2e59c7255 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc @@ -35,11 +35,19 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitWeightBias() { int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM); int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_; packed_weight_ = reinterpret_cast(malloc(pack_weight_size * sizeof(int16_t))); + if (packed_weight_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } memset(packed_weight_, 0, pack_weight_size * sizeof(int16_t)); PackDepthwiseInt8Weight(origin_weight, packed_weight_, conv_param_); // init bias, add output zp bias_data_ = reinterpret_cast(malloc(C4NUM * OC4 * sizeof(int32_t))); + if (bias_data_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } memset(bias_data_, 0, C4NUM * OC4 * sizeof(int32_t)); if (inputs_.size() == kInputSize2) { auto ori_bias = reinterpret_cast(inputs_.at(kBiasIndex)->Data()); @@ -59,7 +67,6 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitSlideParam() { conv_param_->output_channel_ = inputs_.front()->shape().at(kNHWC_C); // init sliding window param - sliding = new SlidingWindowParam; InitSlidingParam(sliding, conv_param_, C4NUM); sliding->in_h_step_ = conv_param_->input_w_ * C4NUM; @@ -70,31 +77,7 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitSlideParam() { return RET_OK; } -int DeconvolutionDepthwiseInt8CPUKernel::Init() { - InitSlideParam(); - - // conv base init - ConvolutionBaseCPUKernel::Init(); - - // init quant param - ConvolutionBaseCPUKernel::SetQuantParam(); - - // init weight and bias - auto ret = InitWeightBias(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Deconv Depthwise int8 InitWeightBias error!"; - return ret; - } - - ret = ReSize(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Deconv Depthwise int8 ReSize error!"; - return ret; - } - return RET_OK; -} - -int DeconvolutionDepthwiseInt8CPUKernel::ReSize() { +int DeconvolutionDepthwiseInt8CPUKernel::InitBuffer() { // malloc packed input buffer int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * UP_DIV(conv_param_->input_channel_, 4); @@ -108,9 +91,9 @@ int DeconvolutionDepthwiseInt8CPUKernel::ReSize() { if (conv_param_->input_channel_ % C4NUM != 0) { need_align_ = true; int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * - (conv_param_->output_channel_, C4NUM); + UP_DIV(conv_param_->output_channel_, C4NUM); packed_output_ = reinterpret_cast(malloc(pack_output_size * sizeof(int8_t))); - if (packed_input_ == nullptr) { + if (packed_output_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; } @@ -120,6 +103,10 @@ int DeconvolutionDepthwiseInt8CPUKernel::ReSize() { // malloc tmp buffer for int32 output output_buffer = reinterpret_cast(malloc(conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * sizeof(int32_t))); + if (output_buffer == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } if (packed_input_ == nullptr) { MS_LOG(ERROR) << "Malloc buffer failed."; return RET_ERROR; @@ -127,6 +114,49 @@ int DeconvolutionDepthwiseInt8CPUKernel::ReSize() { return RET_OK; } +int DeconvolutionDepthwiseInt8CPUKernel::Init() { + sliding = new SlidingWindowParam; + InitSlideParam(); + + // conv base init + ConvolutionBaseCPUKernel::Init(); + + // init quant param + ConvolutionBaseCPUKernel::SetQuantParam(); + + // init weight and bias + auto ret = InitWeightBias(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Deconv Depthwise int8 InitWeightBias error!"; + return ret; + } + + ret = InitBuffer(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Deconv Depthwise int8 InitBuffer error!"; + return ret; + } + return RET_OK; +} + +int DeconvolutionDepthwiseInt8CPUKernel::ReSize() { + free(packed_input_); + if (need_align_) { + free(packed_output_); + } + InitSlideParam(); + + // conv base init + ConvolutionBaseCPUKernel::Init(); + + auto ret = InitBuffer(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Deconv Depthwise int8 InitBuffer error!"; + return ret; + } + return RET_OK; +} + int DeconvolutionDepthwiseInt8CPUKernel::Execute(int task_id) { DeconvDwInt8(packed_output_, output_buffer, packed_input_, packed_weight_, reinterpret_cast(bias_data_), conv_param_, sliding, task_id); @@ -134,8 +164,8 @@ int DeconvolutionDepthwiseInt8CPUKernel::Execute(int task_id) { } int DeconvDwInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { - auto deconv_dw = reinterpret_cast(cdata); - auto ret = deconv_dw->Execute(task_id); + auto deconv_dw_int8 = reinterpret_cast(cdata); + auto ret = deconv_dw_int8->Execute(task_id); if (ret != RET_OK) { MS_LOG(ERROR) << "DeconvolutionDepthwiseInt8Run error task_id[" << task_id << "] error_code[" << ret << "]"; return RET_ERROR; @@ -155,8 +185,8 @@ int DeconvolutionDepthwiseInt8CPUKernel::Run() { PackDepthwiseInt8Input(input_addr, packed_input_, conv_param_); auto output_addr = reinterpret_cast(outputs_.at(kOutputIndex)->Data()); - memset(output_addr, 0, outputs_.at(kOutputIndex)->ElementsNum() * sizeof(int8_t)); if (!need_align_) { + memset(output_addr, 0, outputs_.at(kOutputIndex)->ElementsNum() * sizeof(int8_t)); packed_output_ = output_addr; } diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.h index a394839bca..74f658b2d2 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.h @@ -43,6 +43,7 @@ class DeconvolutionDepthwiseInt8CPUKernel : public ConvolutionBaseCPUKernel { int InitSlideParam(); int InitWeightBias(); + int InitBuffer(); int Execute(int task_id); private: diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.cc index 9117d4f821..122f1fe29d 100644 --- a/mindspore/lite/src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.cc @@ -21,6 +21,9 @@ void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, int height, int width, int in_kh_step, int in_kw_step, int kernel_w, bool is_relu, bool is_relu6) { + for (int c = 0; c < C8NUM; c++) { + dst[c] = 0; + } const float16_t *src_kh = src; const float16_t *weight_kh = weight; for (int kh = 0; kh < height; kh++) { @@ -87,6 +90,9 @@ void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t * for (int ow = 0; ow < width; ow++) { const float16_t *src_kh = src_w; const float16_t *weight_kh = weight; + for (int c = 0; c < C8NUM; c++) { + dst_w[c] = 0; + } for (int kh = 0; kh < kernel_h; kh++) { const float16_t *src_kw = src_kh; const float16_t *weight_kw = weight_kh; @@ -297,4 +303,3 @@ void DeconvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const f // output nchwc8 } /*deconv depthwise fp16 end*/ - diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv_depthwise.cc b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv_depthwise.cc index 90d7240537..8e706605ee 100644 --- a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv_depthwise.cc +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv_depthwise.cc @@ -63,6 +63,9 @@ void DepthwiseBorderPixel(float *dst, const float *src, const float *weight, con int in_kh_step, int in_kw_step, int kernel_w, bool is_relu, bool is_relu6) { const float *src_kh = src; const float *weight_kh = weight; + for (int c = 0; c < C4NUM; c++) { + dst[c] = 0; + } for (int kh = 0; kh < height; kh++) { const float *src_kw = src_kh; const float *weight_kw = weight_kh; @@ -132,6 +135,9 @@ void DepthwiseCenter(float *dst, const float *src, const float *weight, const fl for (int ow = 0; ow < width; ow++) { const float *src_kh = src_w; const float *weight_kh = weight; + for (int c = 0; c < C4NUM; c++) { + dst_w[c] = 0; + } for (int kh = 0; kh < kernel_h; kh++) { const float *src_kw = src_kh; const float *weight_kw = weight_kh; @@ -202,7 +208,7 @@ void ConvDwC4Fp32(float *output_data, const float *input_data, const float *weig src += sliding->in_step_; dst += sliding->out_step_; } // batch loop - // output nc4hwc4 + // output nhwc4 } /*conv depthwise fp32 end*/ @@ -350,6 +356,6 @@ void DeconvDwC4Fp32(float *output_data, const float *input_data, const float *we src += sliding->in_step_; dst += sliding->out_step_; } // batch loop - // output nc4hwc4 + // output nhwc4 } /*deconv depthwise fp32 end*/ diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/int8/conv_depthwise_int8.cc b/mindspore/lite/src/runtime/kernel/arm/opclib/int8/conv_depthwise_int8.cc index 02bba0ae38..b44024d913 100644 --- a/mindspore/lite/src/runtime/kernel/arm/opclib/int8/conv_depthwise_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/int8/conv_depthwise_int8.cc @@ -171,7 +171,7 @@ void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *w src += sliding->in_step_; dst += sliding->out_step_; } // batch loop - // output nc4hwc4 + // output nhwc4 } /*conv depthwise int8 end*/ @@ -317,6 +317,6 @@ void DeconvDwInt8(int8_t *output_data, int32_t *output_buffer, const int16_t *in src += sliding->in_step_; dst += sliding->out_step_; } // batch loop - // output nc4hwc4 + // output nhwc4 } /*deconv depthwise int8 end*/