| @@ -46,7 +46,6 @@ int ConvolutionDepthwiseFp16CPUKernel::InitBuffer() { | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | MS_LOG(ERROR) << "Malloc buffer failed."; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| memset(packed_output_, 0, pack_output_size * sizeof(float16_t)); | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -27,27 +27,7 @@ using mindspore::lite::RET_OK; | |||||
| using mindspore::schema::PrimitiveType_DepthwiseConv2D; | using mindspore::schema::PrimitiveType_DepthwiseConv2D; | ||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| int ConvolutionDepthwiseCPUKernel::Init() { | |||||
| // conv base init | |||||
| ConvolutionBaseCPUKernel::Init(); | |||||
| // init sliding window param | |||||
| sliding_ = new SlidingWindowParam; | |||||
| InitSlidingParam(sliding_, conv_param_, C4NUM); | |||||
| // pack input function: convert_func_ | |||||
| auto input_tensor = inputs_[kInputIndex]; | |||||
| auto data_type = input_tensor->data_type(); | |||||
| auto input_format = input_tensor->GetFormat(); | |||||
| schema::Format execute_format = schema::Format_NHWC4; | |||||
| if (input_format != execute_format) { | |||||
| convert_func_ = LayoutTransform(data_type, input_format, execute_format); | |||||
| if (convert_func_ == nullptr) { | |||||
| MS_LOG(ERROR) << "layout convert func is nullptr."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| } | |||||
| int ConvolutionDepthwiseCPUKernel::InitWeightBias() { | |||||
| // init weight: o, h, w, i; o == group, i == 1 | // init weight: o, h, w, i; o == group, i == 1 | ||||
| auto weight_tensor = inputs_[kWeightIndex]; | auto weight_tensor = inputs_[kWeightIndex]; | ||||
| auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data()); | auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data()); | ||||
| @@ -55,42 +35,93 @@ int ConvolutionDepthwiseCPUKernel::Init() { | |||||
| int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_; | int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_; | ||||
| packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float))); | packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float))); | ||||
| if (packed_weight_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| memset(packed_weight_, 0, pack_weight_size * sizeof(float)); | memset(packed_weight_, 0, pack_weight_size * sizeof(float)); | ||||
| PackNCHWToNC4HW4Fp32(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_, | PackNCHWToNC4HW4Fp32(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_, | ||||
| conv_param_->output_channel_); | conv_param_->output_channel_); | ||||
| // init bias | // init bias | ||||
| bias_data_ = reinterpret_cast<float *>(malloc(C4NUM * OC4 * sizeof(float))); | bias_data_ = reinterpret_cast<float *>(malloc(C4NUM * OC4 * sizeof(float))); | ||||
| if (bias_data_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| memset(bias_data_, 0, C4NUM * OC4 * sizeof(float)); | memset(bias_data_, 0, C4NUM * OC4 * sizeof(float)); | ||||
| if (inputs_.size() == kInputSize2) { | if (inputs_.size() == kInputSize2) { | ||||
| auto ori_bias = reinterpret_cast<float *>(inputs_.at(kBiasIndex)->Data()); | auto ori_bias = reinterpret_cast<float *>(inputs_.at(kBiasIndex)->Data()); | ||||
| memcpy(bias_data_, ori_bias, conv_param_->output_channel_ * sizeof(float)); | memcpy(bias_data_, ori_bias, conv_param_->output_channel_ * sizeof(float)); | ||||
| } else { | |||||
| MS_ASSERT(inputs_.size() == kInputSize1); | |||||
| } | } | ||||
| // init threadNum; | // init threadNum; | ||||
| conv_param_->thread_num_ = MSMIN(thread_count_, OC4); | conv_param_->thread_num_ = MSMIN(thread_count_, OC4); | ||||
| ReSize(); | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| int ConvolutionDepthwiseCPUKernel::ReSize() { | |||||
| // malloc pack input buffer | |||||
| if (convert_func_ != nullptr) { | |||||
| int ConvolutionDepthwiseCPUKernel::InitBuffer() { | |||||
| // malloc pack input and output buffer | |||||
| if (conv_param_->input_channel_ % C4NUM != 0) { | |||||
| need_align_ = true; | |||||
| int IC4 = UP_DIV(conv_param_->input_channel_, C4NUM); | int IC4 = UP_DIV(conv_param_->input_channel_, C4NUM); | ||||
| int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * IC4; | int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * IC4; | ||||
| packed_input_ = reinterpret_cast<float *>(malloc(pack_input_size * sizeof(float))); | packed_input_ = reinterpret_cast<float *>(malloc(pack_input_size * sizeof(float))); | ||||
| if (packed_input_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| memset(packed_input_, 0, pack_input_size * sizeof(float)); | memset(packed_input_, 0, pack_input_size * sizeof(float)); | ||||
| } | |||||
| // malloc tmp output buffer | |||||
| if (conv_param_->output_channel_ % C4NUM != 0) { | |||||
| need_align_ = true; | |||||
| int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM); | int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM); | ||||
| int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * OC4; | int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * OC4; | ||||
| packed_output_ = reinterpret_cast<float *>(malloc(pack_output_size * sizeof(float))); | packed_output_ = reinterpret_cast<float *>(malloc(pack_output_size * sizeof(float))); | ||||
| memset(packed_output_, 0, pack_output_size * sizeof(float)); | |||||
| if (packed_output_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int ConvolutionDepthwiseCPUKernel::Init() { | |||||
| // conv base init | |||||
| ConvolutionBaseCPUKernel::Init(); | |||||
| // init sliding window param | |||||
| sliding_ = new SlidingWindowParam; | |||||
| InitSlidingParam(sliding_, conv_param_, C4NUM); | |||||
| auto ret = InitWeightBias(); | |||||
| if (ret != 0) { | |||||
| MS_LOG(ERROR) << "Convolution depthwise fp32 InitWeightBias failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| ret = InitBuffer(); | |||||
| if (ret != 0) { | |||||
| MS_LOG(ERROR) << "Convolution depthwise fp32 InitBuffer failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int ConvolutionDepthwiseCPUKernel::ReSize() { | |||||
| if (need_align_) { | |||||
| free(packed_input_); | |||||
| free(packed_output_); | |||||
| } | |||||
| // conv base init | |||||
| ConvolutionBaseCPUKernel::Init(); | |||||
| // init sliding window param | |||||
| sliding_ = new SlidingWindowParam; | |||||
| InitSlidingParam(sliding_, conv_param_, C4NUM); | |||||
| auto ret = InitBuffer(); | |||||
| if (ret != 0) { | |||||
| MS_LOG(ERROR) << "Convolution depthwise fp32 InitBuffer failed."; | |||||
| return RET_ERROR; | |||||
| } | } | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -120,15 +151,14 @@ int ConvolutionDepthwiseCPUKernel::Run() { | |||||
| auto input_addr = reinterpret_cast<float *>(input_tensor->Data()); | auto input_addr = reinterpret_cast<float *>(input_tensor->Data()); | ||||
| // pack input: to nhwc4 | // pack input: to nhwc4 | ||||
| if (convert_func_ != nullptr) { | |||||
| convert_func_(input_addr, packed_input_, conv_param_->input_batch_, conv_param_->input_h_ * conv_param_->input_w_, | |||||
| conv_param_->input_channel_); | |||||
| if (need_align_) { | |||||
| PackNHWCToNHWC4Fp32(input_addr, packed_input_, conv_param_->input_batch_, | |||||
| conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); | |||||
| } else { | } else { | ||||
| packed_input_ = input_addr; | packed_input_ = input_addr; | ||||
| } | } | ||||
| output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data()); | |||||
| memset(output_addr, 0, outputs_.at(kOutputIndex)->ElementsNum() * sizeof(float)); | |||||
| auto output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data()); | |||||
| if (!need_align_) { | if (!need_align_) { | ||||
| packed_output_ = output_addr; | packed_output_ = output_addr; | ||||
| } | } | ||||
| @@ -146,7 +176,6 @@ int ConvolutionDepthwiseCPUKernel::Run() { | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| kernel::LiteKernel *CpuConvDwFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs, | kernel::LiteKernel *CpuConvDwFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs, | ||||
| const std::vector<lite::tensor::Tensor *> &outputs, | const std::vector<lite::tensor::Tensor *> &outputs, | ||||
| OpParameter *opParameter, const Context *ctx, | OpParameter *opParameter, const Context *ctx, | ||||
| @@ -170,4 +199,3 @@ kernel::LiteKernel *CpuConvDwFp32KernelCreator(const std::vector<lite::tensor::T | |||||
| REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_DepthwiseConv2D, CpuConvDwFp32KernelCreator) | REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_DepthwiseConv2D, CpuConvDwFp32KernelCreator) | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -31,10 +31,8 @@ class ConvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel { | |||||
| ~ConvolutionDepthwiseCPUKernel() override { | ~ConvolutionDepthwiseCPUKernel() override { | ||||
| delete sliding_; | delete sliding_; | ||||
| free(packed_weight_); | free(packed_weight_); | ||||
| if (convert_func_ != nullptr) { | |||||
| free(packed_input_); | |||||
| } | |||||
| if (need_align_) { | if (need_align_) { | ||||
| free(packed_input_); | |||||
| free(packed_output_); | free(packed_output_); | ||||
| } | } | ||||
| }; | }; | ||||
| @@ -43,6 +41,8 @@ class ConvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel { | |||||
| int ReSize() override; | int ReSize() override; | ||||
| int Run() override; | int Run() override; | ||||
| int InitBuffer(); | |||||
| int InitWeightBias(); | |||||
| int Execute(int task_id); | int Execute(int task_id); | ||||
| private: | private: | ||||
| @@ -50,7 +50,6 @@ class ConvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel { | |||||
| float *packed_weight_; | float *packed_weight_; | ||||
| float *packed_input_; | float *packed_input_; | ||||
| float *packed_output_; | float *packed_output_; | ||||
| float *output_addr; | |||||
| bool need_align_ = false; | bool need_align_ = false; | ||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -43,24 +43,7 @@ int DeconvolutionDepthwiseCPUKernel::InitSlideParam() { | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| int DeconvolutionDepthwiseCPUKernel::Init() { | |||||
| InitSlideParam(); | |||||
| // conv base init | |||||
| ConvolutionBaseCPUKernel::Init(); | |||||
| // pack input function: convert_func_ | |||||
| auto input_tensor = inputs_[kInputIndex]; | |||||
| auto data_type = input_tensor->data_type(); | |||||
| auto input_format = input_tensor->GetFormat(); | |||||
| schema::Format execute_format = schema::Format_NHWC4; | |||||
| if (input_format != execute_format) { | |||||
| convert_func_ = LayoutTransform(data_type, input_format, execute_format); | |||||
| if (convert_func_ == nullptr) { | |||||
| MS_LOG(ERROR) << "layout convert func is nullptr."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| } | |||||
| int DeconvolutionDepthwiseCPUKernel::InitWeightBias() { | |||||
| // init weight: o, h, w, i; o == group, i == 1 | // init weight: o, h, w, i; o == group, i == 1 | ||||
| auto weight_tensor = inputs_[kWeightIndex]; | auto weight_tensor = inputs_[kWeightIndex]; | ||||
| auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data()); | auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data()); | ||||
| @@ -68,55 +51,102 @@ int DeconvolutionDepthwiseCPUKernel::Init() { | |||||
| int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_; | int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_; | ||||
| packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float))); | packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float))); | ||||
| if (packed_weight_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| memset(packed_weight_, 0, pack_weight_size * sizeof(float)); | memset(packed_weight_, 0, pack_weight_size * sizeof(float)); | ||||
| PackNCHWToNC4HW4Fp32(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_, | PackNCHWToNC4HW4Fp32(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_, | ||||
| conv_param_->output_channel_); | conv_param_->output_channel_); | ||||
| // init bias | // init bias | ||||
| bias_data_ = reinterpret_cast<float *>(malloc(C4NUM * OC4 * sizeof(float))); | bias_data_ = reinterpret_cast<float *>(malloc(C4NUM * OC4 * sizeof(float))); | ||||
| if (bias_data_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| memset(bias_data_, 0, C4NUM * OC4 * sizeof(float)); | memset(bias_data_, 0, C4NUM * OC4 * sizeof(float)); | ||||
| if (inputs_.size() == kInputSize2) { | if (inputs_.size() == kInputSize2) { | ||||
| auto ori_bias = reinterpret_cast<float *>(inputs_.at(kBiasIndex)->Data()); | auto ori_bias = reinterpret_cast<float *>(inputs_.at(kBiasIndex)->Data()); | ||||
| memcpy(bias_data_, ori_bias, conv_param_->output_channel_ * sizeof(float)); | memcpy(bias_data_, ori_bias, conv_param_->output_channel_ * sizeof(float)); | ||||
| } else { | |||||
| MS_ASSERT(inputs_.size() == kInputSize1); | |||||
| } | } | ||||
| // init threadNum; | // init threadNum; | ||||
| conv_param_->thread_num_ = MSMIN(conv_param_->thread_num_, OC4); | conv_param_->thread_num_ = MSMIN(conv_param_->thread_num_, OC4); | ||||
| ReSize(); | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| int DeconvolutionDepthwiseCPUKernel::ReSize() { | |||||
| // malloc pack input buffer | |||||
| if (convert_func_ != nullptr) { | |||||
| int DeconvolutionDepthwiseCPUKernel::InitBuffer() { | |||||
| // malloc pack input and output buffer | |||||
| if (conv_param_->input_channel_ % C4NUM != 0) { | |||||
| need_align_ = true; | |||||
| int IC4 = UP_DIV(conv_param_->input_channel_, C4NUM); | int IC4 = UP_DIV(conv_param_->input_channel_, C4NUM); | ||||
| int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * IC4; | int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * IC4; | ||||
| packed_input_ = reinterpret_cast<float *>(malloc(pack_input_size * sizeof(float))); | packed_input_ = reinterpret_cast<float *>(malloc(pack_input_size * sizeof(float))); | ||||
| if (packed_input_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| memset(packed_input_, 0, pack_input_size * sizeof(float)); | memset(packed_input_, 0, pack_input_size * sizeof(float)); | ||||
| } | |||||
| // malloc tmp output buffer | |||||
| if (conv_param_->output_channel_ % C4NUM != 0) { | |||||
| need_pack_ = true; | |||||
| int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM); | int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM); | ||||
| int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * OC4; | int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * OC4; | ||||
| packed_output_ = reinterpret_cast<float *>(malloc(pack_output_size * sizeof(float))); | packed_output_ = reinterpret_cast<float *>(malloc(pack_output_size * sizeof(float))); | ||||
| if (packed_output_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| memset(packed_output_, 0, pack_output_size * sizeof(float)); | memset(packed_output_, 0, pack_output_size * sizeof(float)); | ||||
| } | } | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| int DeconvolutionDepthwiseCPUKernel::DoExcute(int task_id) { | |||||
| int DeconvolutionDepthwiseCPUKernel::Init() { | |||||
| InitSlideParam(); | |||||
| // conv base init | |||||
| ConvolutionBaseCPUKernel::Init(); | |||||
| auto ret = InitWeightBias(); | |||||
| if (ret != 0) { | |||||
| MS_LOG(ERROR) << "Deconvolution depthwise fp32 InitWeightBias failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| ret = InitBuffer(); | |||||
| if (ret != 0) { | |||||
| MS_LOG(ERROR) << "Deconvolution depthwise fp32 InitBuffer failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int DeconvolutionDepthwiseCPUKernel::ReSize() { | |||||
| if (need_align_) { | |||||
| free(packed_input_); | |||||
| free(packed_output_); | |||||
| } | |||||
| InitSlideParam(); | |||||
| // conv base init | |||||
| ConvolutionBaseCPUKernel::Init(); | |||||
| auto ret = InitBuffer(); | |||||
| if (ret != 0) { | |||||
| MS_LOG(ERROR) << "Deconvolution depthwise fp32 InitBuffer failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int DeconvolutionDepthwiseCPUKernel::Execute(int task_id) { | |||||
| DeconvDwC4Fp32(packed_output_, packed_input_, packed_weight_, reinterpret_cast<float *>(bias_data_), conv_param_, | DeconvDwC4Fp32(packed_output_, packed_input_, packed_weight_, reinterpret_cast<float *>(bias_data_), conv_param_, | ||||
| sliding_, task_id); | sliding_, task_id); | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| int DeconvDwRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { | int DeconvDwRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { | ||||
| auto conv_dw = reinterpret_cast<DeconvolutionDepthwiseCPUKernel *>(cdata); | |||||
| auto ret = conv_dw->DoExcute(task_id); | |||||
| auto deconv_dw = reinterpret_cast<DeconvolutionDepthwiseCPUKernel *>(cdata); | |||||
| auto ret = deconv_dw->Execute(task_id); | |||||
| if (ret != RET_OK) { | if (ret != RET_OK) { | ||||
| MS_LOG(ERROR) << "DeconvolutionDepthwiseRun error task_id[" << task_id << "] error_code[" << ret << "]"; | MS_LOG(ERROR) << "DeconvolutionDepthwiseRun error task_id[" << task_id << "] error_code[" << ret << "]"; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| @@ -133,26 +163,26 @@ int DeconvolutionDepthwiseCPUKernel::Run() { | |||||
| auto input_addr = reinterpret_cast<float *>(input_tensor->Data()); | auto input_addr = reinterpret_cast<float *>(input_tensor->Data()); | ||||
| // pack input: to nhwc4 | // pack input: to nhwc4 | ||||
| if (convert_func_ != nullptr) { | |||||
| convert_func_(input_addr, packed_input_, conv_param_->input_batch_, conv_param_->input_h_ * conv_param_->input_w_, | |||||
| conv_param_->input_channel_); | |||||
| if (need_align_) { | |||||
| PackNHWCToNHWC4Fp32(input_addr, packed_input_, conv_param_->input_batch_, | |||||
| conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); | |||||
| } else { | } else { | ||||
| packed_input_ = input_addr; | packed_input_ = input_addr; | ||||
| } | } | ||||
| output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data()); | |||||
| memset(output_addr, 0, outputs_.at(kOutputIndex)->ElementsNum() * sizeof(float)); | |||||
| if (!need_pack_) { | |||||
| auto output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data()); | |||||
| if (!need_align_) { | |||||
| memset(output_addr, 0, outputs_.at(kOutputIndex)->ElementsNum() * sizeof(float)); | |||||
| packed_output_ = output_addr; | packed_output_ = output_addr; | ||||
| } | } | ||||
| auto ret = LiteBackendParallelLaunch(DeconvDwRun, this, conv_param_->thread_num_); | auto ret = LiteBackendParallelLaunch(DeconvDwRun, this, conv_param_->thread_num_); | ||||
| if (ret != RET_OK) { | if (ret != RET_OK) { | ||||
| MS_LOG(ERROR) << "ConvDwRun error: error_code[" << ret << "]"; | |||||
| MS_LOG(ERROR) << "DeconvDwRun error: error_code[" << ret << "]"; | |||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| if (need_pack_) { | |||||
| if (need_align_) { | |||||
| PackNHWC4ToNHWCFp32(packed_output_, output_addr, conv_param_->output_batch_, | PackNHWC4ToNHWCFp32(packed_output_, output_addr, conv_param_->output_batch_, | ||||
| conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); | conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); | ||||
| } | } | ||||
| @@ -182,4 +212,3 @@ kernel::LiteKernel *CpuDeconvDwFp32KernelCreator(const std::vector<lite::tensor: | |||||
| REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_DeDepthwiseConv2D, CpuDeconvDwFp32KernelCreator) | REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_DeDepthwiseConv2D, CpuDeconvDwFp32KernelCreator) | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -31,8 +31,10 @@ class DeconvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel { | |||||
| ~DeconvolutionDepthwiseCPUKernel() override { | ~DeconvolutionDepthwiseCPUKernel() override { | ||||
| delete sliding_; | delete sliding_; | ||||
| free(packed_weight_); | free(packed_weight_); | ||||
| free(packed_input_); | |||||
| free(packed_output_); | |||||
| if (need_align_) { | |||||
| free(packed_input_); | |||||
| free(packed_output_); | |||||
| } | |||||
| }; | }; | ||||
| int Init() override; | int Init() override; | ||||
| @@ -40,17 +42,17 @@ class DeconvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel { | |||||
| int ReSize() override; | int ReSize() override; | ||||
| int Run() override; | int Run() override; | ||||
| int DoExcute(int task_id); | |||||
| int InitBuffer(); | |||||
| int InitWeightBias(); | |||||
| int Execute(int task_id); | |||||
| private: | private: | ||||
| SlidingWindowParam *sliding_; | SlidingWindowParam *sliding_; | ||||
| float *packed_weight_; | float *packed_weight_; | ||||
| float *packed_input_; | float *packed_input_; | ||||
| float *packed_output_; | float *packed_output_; | ||||
| float *output_addr; | |||||
| bool need_pack_ = false; | |||||
| bool need_align_ = false; | |||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_DECONVOLUTION_DEPTHWISE_H_ | #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_DECONVOLUTION_DEPTHWISE_H_ | ||||
| @@ -35,11 +35,19 @@ int ConvolutionDepthwiseInt8CPUKernel::InitWeightBias() { | |||||
| int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM); | int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM); | ||||
| int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_; | int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_; | ||||
| packed_weight_ = reinterpret_cast<int16_t *>(malloc(pack_weight_size * sizeof(int16_t))); | packed_weight_ = reinterpret_cast<int16_t *>(malloc(pack_weight_size * sizeof(int16_t))); | ||||
| if (packed_weight_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| memset(packed_weight_, 0, pack_weight_size * sizeof(int16_t)); | memset(packed_weight_, 0, pack_weight_size * sizeof(int16_t)); | ||||
| PackDepthwiseInt8Weight(origin_weight, packed_weight_, conv_param_); | PackDepthwiseInt8Weight(origin_weight, packed_weight_, conv_param_); | ||||
| // init bias, add output zp | // init bias, add output zp | ||||
| bias_data_ = reinterpret_cast<int32_t *>(malloc(C4NUM * OC4 * sizeof(int32_t))); | bias_data_ = reinterpret_cast<int32_t *>(malloc(C4NUM * OC4 * sizeof(int32_t))); | ||||
| if (bias_data_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| memset(bias_data_, 0, C4NUM * OC4 * sizeof(int32_t)); | memset(bias_data_, 0, C4NUM * OC4 * sizeof(int32_t)); | ||||
| if (inputs_.size() == kInputSize2) { | if (inputs_.size() == kInputSize2) { | ||||
| auto ori_bias = reinterpret_cast<int32_t *>(inputs_.at(kBiasIndex)->Data()); | auto ori_bias = reinterpret_cast<int32_t *>(inputs_.at(kBiasIndex)->Data()); | ||||
| @@ -48,6 +56,30 @@ int ConvolutionDepthwiseInt8CPUKernel::InitWeightBias() { | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| int ConvolutionDepthwiseInt8CPUKernel::InitBuffer() { | |||||
| // malloc packed input buffer | |||||
| int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * | |||||
| UP_DIV(conv_param_->input_channel_, 4); | |||||
| packed_input_ = reinterpret_cast<int16_t *>(malloc(pack_input_size * sizeof(int16_t))); | |||||
| memset(packed_input_, 0, pack_input_size * sizeof(int16_t)); | |||||
| if (packed_input_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| if (conv_param_->input_channel_ % C4NUM != 0) { | |||||
| need_align_ = true; | |||||
| int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * | |||||
| UP_DIV(conv_param_->output_channel_, C4NUM); | |||||
| packed_output_ = reinterpret_cast<int8_t *>(malloc(pack_output_size * sizeof(int8_t))); | |||||
| if (packed_input_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int ConvolutionDepthwiseInt8CPUKernel::Init() { | int ConvolutionDepthwiseInt8CPUKernel::Init() { | ||||
| // conv base init | // conv base init | ||||
| ConvolutionBaseCPUKernel::Init(); | ConvolutionBaseCPUKernel::Init(); | ||||
| @@ -66,7 +98,7 @@ int ConvolutionDepthwiseInt8CPUKernel::Init() { | |||||
| return ret; | return ret; | ||||
| } | } | ||||
| ret = ReSize(); | |||||
| ret = InitBuffer(); | |||||
| if (ret != RET_OK) { | if (ret != RET_OK) { | ||||
| MS_LOG(ERROR) << "Depthwise int8 ReSize error!"; | MS_LOG(ERROR) << "Depthwise int8 ReSize error!"; | ||||
| return ret; | return ret; | ||||
| @@ -75,26 +107,23 @@ int ConvolutionDepthwiseInt8CPUKernel::Init() { | |||||
| } | } | ||||
| int ConvolutionDepthwiseInt8CPUKernel::ReSize() { | int ConvolutionDepthwiseInt8CPUKernel::ReSize() { | ||||
| // malloc packed input buffer | |||||
| int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * | |||||
| UP_DIV(conv_param_->input_channel_, 4); | |||||
| packed_input_ = reinterpret_cast<int16_t *>(malloc(pack_input_size * sizeof(int16_t))); | |||||
| memset(packed_input_, 0, pack_input_size * sizeof(int16_t)); | |||||
| if (packed_input_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||||
| return RET_ERROR; | |||||
| free(packed_input_); | |||||
| if (need_align_) { | |||||
| free(packed_output_); | |||||
| } | } | ||||
| // conv base init | |||||
| ConvolutionBaseCPUKernel::Init(); | |||||
| if (conv_param_->input_channel_ % C4NUM != 0) { | |||||
| need_align_ = true; | |||||
| int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * | |||||
| (conv_param_->output_channel_, C4NUM); | |||||
| packed_output_ = reinterpret_cast<int8_t *>(malloc(pack_output_size * sizeof(int8_t))); | |||||
| if (packed_input_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| memset(packed_output_, 0, pack_output_size * sizeof(int8_t)); | |||||
| // init sliding window param | |||||
| InitSlidingParam(sliding, conv_param_, C4NUM); | |||||
| // init quant param | |||||
| ConvolutionBaseCPUKernel::SetQuantParam(); | |||||
| auto ret = InitBuffer(); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "Depthwise int8 ReSize error!"; | |||||
| return ret; | |||||
| } | } | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -106,8 +135,8 @@ int ConvolutionDepthwiseInt8CPUKernel::Execute(int task_id) { | |||||
| } | } | ||||
| int ConvDwInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { | int ConvDwInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { | ||||
| auto conv_dw = reinterpret_cast<ConvolutionDepthwiseInt8CPUKernel *>(cdata); | |||||
| auto ret = conv_dw->Execute(task_id); | |||||
| auto conv_dw_int8 = reinterpret_cast<ConvolutionDepthwiseInt8CPUKernel *>(cdata); | |||||
| auto ret = conv_dw_int8->Execute(task_id); | |||||
| if (ret != RET_OK) { | if (ret != RET_OK) { | ||||
| MS_LOG(ERROR) << "ConvolutionDepthwiseInt8Run error task_id[" << task_id << "] error_code[" << ret << "]"; | MS_LOG(ERROR) << "ConvolutionDepthwiseInt8Run error task_id[" << task_id << "] error_code[" << ret << "]"; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| @@ -127,7 +156,6 @@ int ConvolutionDepthwiseInt8CPUKernel::Run() { | |||||
| PackDepthwiseInt8Input(input_addr, packed_input_, conv_param_); | PackDepthwiseInt8Input(input_addr, packed_input_, conv_param_); | ||||
| auto output_addr = reinterpret_cast<int8_t *>(outputs_.at(kOutputIndex)->Data()); | auto output_addr = reinterpret_cast<int8_t *>(outputs_.at(kOutputIndex)->Data()); | ||||
| memset(output_addr, 0, outputs_.at(kOutputIndex)->ElementsNum() * sizeof(int8_t)); | |||||
| if (!need_align_) { | if (!need_align_) { | ||||
| packed_output_ = output_addr; | packed_output_ = output_addr; | ||||
| } | } | ||||
| @@ -42,6 +42,7 @@ class ConvolutionDepthwiseInt8CPUKernel : public ConvolutionBaseCPUKernel { | |||||
| int Run() override; | int Run() override; | ||||
| int InitWeightBias(); | int InitWeightBias(); | ||||
| int InitBuffer(); | |||||
| int Execute(int task_id); | int Execute(int task_id); | ||||
| private: | private: | ||||
| @@ -35,11 +35,19 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitWeightBias() { | |||||
| int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM); | int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM); | ||||
| int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_; | int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_; | ||||
| packed_weight_ = reinterpret_cast<int16_t *>(malloc(pack_weight_size * sizeof(int16_t))); | packed_weight_ = reinterpret_cast<int16_t *>(malloc(pack_weight_size * sizeof(int16_t))); | ||||
| if (packed_weight_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| memset(packed_weight_, 0, pack_weight_size * sizeof(int16_t)); | memset(packed_weight_, 0, pack_weight_size * sizeof(int16_t)); | ||||
| PackDepthwiseInt8Weight(origin_weight, packed_weight_, conv_param_); | PackDepthwiseInt8Weight(origin_weight, packed_weight_, conv_param_); | ||||
| // init bias, add output zp | // init bias, add output zp | ||||
| bias_data_ = reinterpret_cast<int32_t *>(malloc(C4NUM * OC4 * sizeof(int32_t))); | bias_data_ = reinterpret_cast<int32_t *>(malloc(C4NUM * OC4 * sizeof(int32_t))); | ||||
| if (bias_data_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| memset(bias_data_, 0, C4NUM * OC4 * sizeof(int32_t)); | memset(bias_data_, 0, C4NUM * OC4 * sizeof(int32_t)); | ||||
| if (inputs_.size() == kInputSize2) { | if (inputs_.size() == kInputSize2) { | ||||
| auto ori_bias = reinterpret_cast<int32_t *>(inputs_.at(kBiasIndex)->Data()); | auto ori_bias = reinterpret_cast<int32_t *>(inputs_.at(kBiasIndex)->Data()); | ||||
| @@ -59,7 +67,6 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitSlideParam() { | |||||
| conv_param_->output_channel_ = inputs_.front()->shape().at(kNHWC_C); | conv_param_->output_channel_ = inputs_.front()->shape().at(kNHWC_C); | ||||
| // init sliding window param | // init sliding window param | ||||
| sliding = new SlidingWindowParam; | |||||
| InitSlidingParam(sliding, conv_param_, C4NUM); | InitSlidingParam(sliding, conv_param_, C4NUM); | ||||
| sliding->in_h_step_ = conv_param_->input_w_ * C4NUM; | sliding->in_h_step_ = conv_param_->input_w_ * C4NUM; | ||||
| @@ -70,31 +77,7 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitSlideParam() { | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
// One-time kernel setup. Runs, in order: sliding-window parameter setup,
// base-class init, quantization parameter setup, weight/bias packing, and
// then ReSize() to prepare the shape-dependent state.
// Returns RET_OK on success, otherwise the status returned by the failing
// InitWeightBias() or ReSize() call.
// NOTE(review): these single-column rows appear to be the pre-change version
// of Init() in this diff view; confirm before relying on them.
| int DeconvolutionDepthwiseInt8CPUKernel::Init() { | |||||
| InitSlideParam(); | |||||
| // conv base init | |||||
| ConvolutionBaseCPUKernel::Init(); | |||||
| // init quant param | |||||
| ConvolutionBaseCPUKernel::SetQuantParam(); | |||||
| // init weight and bias | |||||
| auto ret = InitWeightBias(); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "Deconv Depthwise int8 InitWeightBias error!"; | |||||
| return ret; | |||||
| } | |||||
// ReSize() is reused here to do the first-time shape-dependent setup.
| ret = ReSize(); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "Deconv Depthwise int8 ReSize error!"; | |||||
| return ret; | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int DeconvolutionDepthwiseInt8CPUKernel::ReSize() { | |||||
| int DeconvolutionDepthwiseInt8CPUKernel::InitBuffer() { | |||||
| // malloc packed input buffer | // malloc packed input buffer | ||||
| int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * | int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * | ||||
| UP_DIV(conv_param_->input_channel_, 4); | UP_DIV(conv_param_->input_channel_, 4); | ||||
| @@ -108,9 +91,9 @@ int DeconvolutionDepthwiseInt8CPUKernel::ReSize() { | |||||
| if (conv_param_->input_channel_ % C4NUM != 0) { | if (conv_param_->input_channel_ % C4NUM != 0) { | ||||
| need_align_ = true; | need_align_ = true; | ||||
| int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * | int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * | ||||
| (conv_param_->output_channel_, C4NUM); | |||||
| UP_DIV(conv_param_->output_channel_, C4NUM); | |||||
| packed_output_ = reinterpret_cast<int8_t *>(malloc(pack_output_size * sizeof(int8_t))); | packed_output_ = reinterpret_cast<int8_t *>(malloc(pack_output_size * sizeof(int8_t))); | ||||
| if (packed_input_ == nullptr) { | |||||
| if (packed_output_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | MS_LOG(ERROR) << "Malloc buffer failed."; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| @@ -120,6 +103,10 @@ int DeconvolutionDepthwiseInt8CPUKernel::ReSize() { | |||||
| // malloc tmp buffer for int32 output | // malloc tmp buffer for int32 output | ||||
| output_buffer = | output_buffer = | ||||
| reinterpret_cast<int32_t *>(malloc(conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * sizeof(int32_t))); | reinterpret_cast<int32_t *>(malloc(conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * sizeof(int32_t))); | ||||
| if (output_buffer == nullptr) { | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| if (packed_input_ == nullptr) { | if (packed_input_ == nullptr) { | ||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | MS_LOG(ERROR) << "Malloc buffer failed."; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| @@ -127,6 +114,49 @@ int DeconvolutionDepthwiseInt8CPUKernel::ReSize() { | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
// One-time kernel setup. Allocates the SlidingWindowParam object, fills it via
// InitSlideParam(), runs the base-class init and quant-param setup, packs the
// weights/bias, and finally allocates the working buffers with InitBuffer().
// Returns RET_OK on success, otherwise the status returned by the failing
// InitWeightBias() or InitBuffer() call.
// NOTE(review): any status produced by InitSlideParam(),
// ConvolutionBaseCPUKernel::Init() and SetQuantParam() is discarded here.
| int DeconvolutionDepthwiseInt8CPUKernel::Init() { | |||||
// Allocated once here so that ReSize() can call InitSlideParam() again
// without allocating a new object.
// NOTE(review): the matching delete is not visible in this view — confirm.
| sliding = new SlidingWindowParam; | |||||
| InitSlideParam(); | |||||
| // conv base init | |||||
| ConvolutionBaseCPUKernel::Init(); | |||||
| // init quant param | |||||
| ConvolutionBaseCPUKernel::SetQuantParam(); | |||||
| // init weight and bias | |||||
| auto ret = InitWeightBias(); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "Deconv Depthwise int8 InitWeightBias error!"; | |||||
| return ret; | |||||
| } | |||||
// Allocate the packed input/output and int32 scratch buffers.
| ret = InitBuffer(); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "Deconv Depthwise int8 InitBuffer error!"; | |||||
| return ret; | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
// Rebuilds the shape-dependent state: releases the previously allocated packed
// buffers, recomputes the sliding-window parameters, re-runs the base-class
// init, and reallocates the buffers through InitBuffer().
// Returns RET_OK on success, otherwise the status returned by InitBuffer().
// NOTE(review): InitBuffer() also mallocs output_buffer, but it is not freed
// here, so each ReSize() looks like it leaks the previous scratch buffer —
// confirm and add the matching free.
// NOTE(review): packed_input_ / packed_output_ are not nulled after free();
// if InitBuffer() fails early they may be left dangling — confirm.
| int DeconvolutionDepthwiseInt8CPUKernel::ReSize() { | |||||
| free(packed_input_); | |||||
// packed_output_ is freed only when need_align_ is set; otherwise Run()
// points it at the output tensor's data, which this kernel did not malloc.
| if (need_align_) { | |||||
| free(packed_output_); | |||||
| } | |||||
| InitSlideParam(); | |||||
| // conv base init | |||||
| ConvolutionBaseCPUKernel::Init(); | |||||
// Reallocate the buffers.
| auto ret = InitBuffer(); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "Deconv Depthwise int8 InitBuffer error!"; | |||||
| return ret; | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int DeconvolutionDepthwiseInt8CPUKernel::Execute(int task_id) { | int DeconvolutionDepthwiseInt8CPUKernel::Execute(int task_id) { | ||||
| DeconvDwInt8(packed_output_, output_buffer, packed_input_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_), | DeconvDwInt8(packed_output_, output_buffer, packed_input_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_), | ||||
| conv_param_, sliding, task_id); | conv_param_, sliding, task_id); | ||||
| @@ -134,8 +164,8 @@ int DeconvolutionDepthwiseInt8CPUKernel::Execute(int task_id) { | |||||
| } | } | ||||
| int DeconvDwInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { | int DeconvDwInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { | ||||
| auto deconv_dw = reinterpret_cast<DeconvolutionDepthwiseInt8CPUKernel *>(cdata); | |||||
| auto ret = deconv_dw->Execute(task_id); | |||||
| auto deconv_dw_int8 = reinterpret_cast<DeconvolutionDepthwiseInt8CPUKernel *>(cdata); | |||||
| auto ret = deconv_dw_int8->Execute(task_id); | |||||
| if (ret != RET_OK) { | if (ret != RET_OK) { | ||||
| MS_LOG(ERROR) << "DeconvolutionDepthwiseInt8Run error task_id[" << task_id << "] error_code[" << ret << "]"; | MS_LOG(ERROR) << "DeconvolutionDepthwiseInt8Run error task_id[" << task_id << "] error_code[" << ret << "]"; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| @@ -155,8 +185,8 @@ int DeconvolutionDepthwiseInt8CPUKernel::Run() { | |||||
| PackDepthwiseInt8Input(input_addr, packed_input_, conv_param_); | PackDepthwiseInt8Input(input_addr, packed_input_, conv_param_); | ||||
| auto output_addr = reinterpret_cast<int8_t *>(outputs_.at(kOutputIndex)->Data()); | auto output_addr = reinterpret_cast<int8_t *>(outputs_.at(kOutputIndex)->Data()); | ||||
| memset(output_addr, 0, outputs_.at(kOutputIndex)->ElementsNum() * sizeof(int8_t)); | |||||
| if (!need_align_) { | if (!need_align_) { | ||||
| memset(output_addr, 0, outputs_.at(kOutputIndex)->ElementsNum() * sizeof(int8_t)); | |||||
| packed_output_ = output_addr; | packed_output_ = output_addr; | ||||
| } | } | ||||
| @@ -43,6 +43,7 @@ class DeconvolutionDepthwiseInt8CPUKernel : public ConvolutionBaseCPUKernel { | |||||
| int InitSlideParam(); | int InitSlideParam(); | ||||
| int InitWeightBias(); | int InitWeightBias(); | ||||
| int InitBuffer(); | |||||
| int Execute(int task_id); | int Execute(int task_id); | ||||
| private: | private: | ||||
| @@ -21,6 +21,9 @@ | |||||
| void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, | void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, | ||||
| int height, int width, int in_kh_step, int in_kw_step, int kernel_w, bool is_relu, | int height, int width, int in_kh_step, int in_kw_step, int kernel_w, bool is_relu, | ||||
| bool is_relu6) { | bool is_relu6) { | ||||
| for (int c = 0; c < C8NUM; c++) { | |||||
| dst[c] = 0; | |||||
| } | |||||
| const float16_t *src_kh = src; | const float16_t *src_kh = src; | ||||
| const float16_t *weight_kh = weight; | const float16_t *weight_kh = weight; | ||||
| for (int kh = 0; kh < height; kh++) { | for (int kh = 0; kh < height; kh++) { | ||||
| @@ -87,6 +90,9 @@ void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t * | |||||
| for (int ow = 0; ow < width; ow++) { | for (int ow = 0; ow < width; ow++) { | ||||
| const float16_t *src_kh = src_w; | const float16_t *src_kh = src_w; | ||||
| const float16_t *weight_kh = weight; | const float16_t *weight_kh = weight; | ||||
| for (int c = 0; c < C8NUM; c++) { | |||||
| dst_w[c] = 0; | |||||
| } | |||||
| for (int kh = 0; kh < kernel_h; kh++) { | for (int kh = 0; kh < kernel_h; kh++) { | ||||
| const float16_t *src_kw = src_kh; | const float16_t *src_kw = src_kh; | ||||
| const float16_t *weight_kw = weight_kh; | const float16_t *weight_kw = weight_kh; | ||||
| @@ -297,4 +303,3 @@ void DeconvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const f | |||||
| // output nchwc8 | // output nchwc8 | ||||
| } | } | ||||
| /*deconv depthwise fp16 end*/ | /*deconv depthwise fp16 end*/ | ||||
| @@ -63,6 +63,9 @@ void DepthwiseBorderPixel(float *dst, const float *src, const float *weight, con | |||||
| int in_kh_step, int in_kw_step, int kernel_w, bool is_relu, bool is_relu6) { | int in_kh_step, int in_kw_step, int kernel_w, bool is_relu, bool is_relu6) { | ||||
| const float *src_kh = src; | const float *src_kh = src; | ||||
| const float *weight_kh = weight; | const float *weight_kh = weight; | ||||
| for (int c = 0; c < C4NUM; c++) { | |||||
| dst[c] = 0; | |||||
| } | |||||
| for (int kh = 0; kh < height; kh++) { | for (int kh = 0; kh < height; kh++) { | ||||
| const float *src_kw = src_kh; | const float *src_kw = src_kh; | ||||
| const float *weight_kw = weight_kh; | const float *weight_kw = weight_kh; | ||||
| @@ -132,6 +135,9 @@ void DepthwiseCenter(float *dst, const float *src, const float *weight, const fl | |||||
| for (int ow = 0; ow < width; ow++) { | for (int ow = 0; ow < width; ow++) { | ||||
| const float *src_kh = src_w; | const float *src_kh = src_w; | ||||
| const float *weight_kh = weight; | const float *weight_kh = weight; | ||||
| for (int c = 0; c < C4NUM; c++) { | |||||
| dst_w[c] = 0; | |||||
| } | |||||
| for (int kh = 0; kh < kernel_h; kh++) { | for (int kh = 0; kh < kernel_h; kh++) { | ||||
| const float *src_kw = src_kh; | const float *src_kw = src_kh; | ||||
| const float *weight_kw = weight_kh; | const float *weight_kw = weight_kh; | ||||
| @@ -202,7 +208,7 @@ void ConvDwC4Fp32(float *output_data, const float *input_data, const float *weig | |||||
| src += sliding->in_step_; | src += sliding->in_step_; | ||||
| dst += sliding->out_step_; | dst += sliding->out_step_; | ||||
| } // batch loop | } // batch loop | ||||
| // output nc4hwc4 | |||||
| // output nhwc4 | |||||
| } | } | ||||
| /*conv depthwise fp32 end*/ | /*conv depthwise fp32 end*/ | ||||
| @@ -350,6 +356,6 @@ void DeconvDwC4Fp32(float *output_data, const float *input_data, const float *we | |||||
| src += sliding->in_step_; | src += sliding->in_step_; | ||||
| dst += sliding->out_step_; | dst += sliding->out_step_; | ||||
| } // batch loop | } // batch loop | ||||
| // output nc4hwc4 | |||||
| // output nhwc4 | |||||
| } | } | ||||
| /*deconv depthwise fp32 end*/ | /*deconv depthwise fp32 end*/ | ||||
| @@ -171,7 +171,7 @@ void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *w | |||||
| src += sliding->in_step_; | src += sliding->in_step_; | ||||
| dst += sliding->out_step_; | dst += sliding->out_step_; | ||||
| } // batch loop | } // batch loop | ||||
| // output nc4hwc4 | |||||
| // output nhwc4 | |||||
| } | } | ||||
| /*conv depthwise int8 end*/ | /*conv depthwise int8 end*/ | ||||
| @@ -317,6 +317,6 @@ void DeconvDwInt8(int8_t *output_data, int32_t *output_buffer, const int16_t *in | |||||
| src += sliding->in_step_; | src += sliding->in_step_; | ||||
| dst += sliding->out_step_; | dst += sliding->out_step_; | ||||
| } // batch loop | } // batch loop | ||||
| // output nc4hwc4 | |||||
| // output nhwc4 | |||||
| } | } | ||||
| /*deconv depthwise int8 end*/ | /*deconv depthwise int8 end*/ | ||||