From: @lzkcode Reviewed-by: @zhang_xue_tong, @zhanghaibo5 Signed-off-by: @zhang_xue_tong — tags/v1.1.0
| @@ -587,10 +587,16 @@ bool CheckConvDwUseIndirectBuffer(const ConvParameter *conv_param) { | |||
| void ConvDwInitIndirection(float **indirect_buffer, float *src, float *zero_ptr, const ConvParameter *conv_param, | |||
| int step_h, int step_w) { | |||
| int ic_4 = UP_DIV(conv_param->input_channel_, C4NUM) * C4NUM; | |||
| #ifdef ENABLE_AVX | |||
| int div = C8NUM; | |||
| #else | |||
| int div = C4NUM; | |||
| #endif | |||
| int ic_div = UP_DIV(conv_param->input_channel_, div) * div; | |||
| for (int b = 0; b < conv_param->output_batch_; b++) { | |||
| float **indirect = indirect_buffer + b * conv_param->output_h_ * step_h; | |||
| float *input = src + b * conv_param->input_h_ * conv_param->input_w_ * ic_4; | |||
| float *input = src + b * conv_param->input_h_ * conv_param->input_w_ * ic_div; | |||
| for (int oh = 0; oh < conv_param->output_h_; oh++) { | |||
| for (int kh = 0; kh < conv_param->kernel_h_; kh++) { | |||
| int ih = oh * conv_param->stride_h_ + kh * conv_param->dilation_h_ - conv_param->pad_u_; | |||
| @@ -600,7 +606,7 @@ void ConvDwInitIndirection(float **indirect_buffer, float *src, float *zero_ptr, | |||
| int iw = ow * conv_param->stride_w_ + kw * conv_param->dilation_w_ - conv_param->pad_l_; | |||
| int index = oh * step_h + ow * step_w * conv_param->kernel_h_ + kw * conv_param->kernel_h_ + kh; | |||
| if (iw < conv_param->input_w_ && iw >= 0) { | |||
| indirect[index] = input + (ih * conv_param->input_w_ + iw) * ic_4; | |||
| indirect[index] = input + (ih * conv_param->input_w_ + iw) * ic_div; | |||
| } else { | |||
| indirect[index] = zero_ptr; | |||
| } | |||
| @@ -619,7 +625,7 @@ void ConvDwInitIndirection(float **indirect_buffer, float *src, float *zero_ptr, | |||
| } | |||
| } | |||
| #ifndef ENABLE_ARM64 | |||
| #if !defined(ENABLE_ARM64) && !defined(ENABLE_AVX) | |||
| void ConvDwFp32IndirectRow(float *output, float **input, const float *weights, const float *bias, int channels, | |||
| int output_width, int input_stride, bool relu, bool relu6, int kernel) { | |||
| do { | |||
| @@ -674,6 +680,15 @@ void ConvDwFp32IndirectRow(float *output, float **input, const float *weights, c | |||
| } | |||
| #endif | |||
#ifdef ENABLE_AVX
// Compute one output row of a depthwise convolution through the AVX
// indirect-buffer kernel. Only the 3x3 case (kernel == 9 taps) has an AVX
// implementation; any other kernel size is a no-op here (dispatch upstream
// is expected to route those elsewhere).
void ConvDwFp32IndirectRow(float *output, float **input, const float *weights, const float *bias, int channels,
                           int output_width, int input_stride, bool relu, bool relu6, int kernel) {
  if (kernel != 9) {
    return;
  }
  // The assembly kernel takes the input stride in bytes of pointer slots.
  ConvDwFp32Avx3x3(output, input, weights, bias, channels, output_width, input_stride * sizeof(float *), relu, relu6);
}
#endif
| void ConvDwIndirection(float *output_data, float **indirect_buffer, const float *weight_data, const float *bias_data, | |||
| float *zero_ptr, const ConvParameter *conv_param, int task_id) { | |||
| int step_w = conv_param->dilation_w_ == 1 ? conv_param->stride_w_ : conv_param->kernel_w_; | |||
| @@ -66,6 +66,11 @@ void ConvDwFp32Indirect5x5(float *output, float **input, const float *weights, c | |||
| int output_width, size_t input_stride, size_t relu, size_t relu6); | |||
| #endif | |||
| #ifdef ENABLE_AVX | |||
| void ConvDwFp32Avx3x3(float *output, float **input, const float *weights, const float *bias, int channels, | |||
| int output_width, size_t input_stride, size_t relu, size_t relu6); | |||
| #endif | |||
| void ConvDwFp32IndirectRow(float *output, float **input, const float *weights, const float *bias, int channels, | |||
| int output_width, int input_stride, bool relu, bool relu6, int kernel); | |||
| @@ -500,6 +500,30 @@ void PackNHWCToNHWC4Fp32(const void *src, void *dst, int batch, int plane, int c | |||
| } | |||
| } | |||
| void PackNHWCToNHWC8Fp32(const void *src, void *dst, int batch, int plane, int channel) { | |||
| int c8 = UP_DIV(channel, C8NUM); | |||
| int c8_channel = c8 * C8NUM; | |||
| int nhwc8_batch_unit_offset = c8 * C8NUM * plane; | |||
| int ic_remainder_ = channel % C8NUM; | |||
| if (ic_remainder_ != 0) { | |||
| int nhwc8_batch_offset = 0; | |||
| for (int b = 0; b < batch; b++) { | |||
| int batch_offset = b * channel * plane; | |||
| for (int i = 0; i < plane; i++) { | |||
| float *dst_per_plane = (float *)dst + nhwc8_batch_offset + i * c8_channel; | |||
| memcpy(dst_per_plane, (float *)src + batch_offset + i * channel, channel * sizeof(float)); | |||
| for (int j = channel; j < c8_channel; ++j) { | |||
| dst_per_plane[j] = 0; | |||
| } | |||
| } | |||
| nhwc8_batch_offset += nhwc8_batch_unit_offset; | |||
| } | |||
| } else { | |||
| size_t ori_input_size = batch * plane * channel * sizeof(float); | |||
| memcpy((float *)dst, (float *)src, ori_input_size); | |||
| } | |||
| } | |||
| void PackNHWC4ToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel) { | |||
| int c4 = UP_DIV(channel, C4NUM); | |||
| int ic_remainder_ = channel % C4NUM; | |||
| @@ -600,6 +624,23 @@ void PackDepthwiseIndirectWeightC4Fp32(const void *src, void *dst, int height, i | |||
| } | |||
| } | |||
| void PackDepthwiseIndirectWeightC8Fp32(const void *src, void *dst, int height, int width, int channel) { | |||
| int c8 = UP_DIV(channel, C8NUM); | |||
| for (int c = 0; c < c8; c++) { | |||
| int dst_off_c = c * C8NUM * height * width; | |||
| for (int i = 0; i < C8NUM; i++) { | |||
| int src_off_c = (c * C8NUM + i) * height * width; | |||
| for (int kh = 0; kh < height; kh++) { | |||
| int src_off_kh = src_off_c + kh * width; | |||
| for (int kw = 0; kw < width; kw++) { | |||
| int dst_off = dst_off_c + kw * height * C8NUM + kh * C8NUM + i; | |||
| ((float *)dst)[dst_off] = ((float *)src)[src_off_kh + kw]; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| void PackNHWCToNHWC4Int8(const void *src, void *dst, int batch, int plane, int channel) { | |||
| int c4 = UP_DIV(channel, C4NUM); | |||
| int c4_channel = c4 * C4NUM; | |||
| @@ -64,6 +64,8 @@ void PackNCHWToNC4HW4Fp32(const void *src, void *dst, int batch, int plane, int | |||
| void PackNHWCToNHWC4Fp32(const void *src, void *dst, int batch, int plane, int channel); | |||
| void PackNHWCToNHWC8Fp32(const void *src, void *dst, int batch, int plane, int channel); | |||
| void PackNHWCToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel); | |||
| void PackNHWCToNCHWInt8(const void *src, void *dst, int batch, int plane, int channel); | |||
| @@ -80,6 +82,8 @@ void PackNHWCToC8HWN8Fp32(const void *src, void *dst, int batch, int plane, int | |||
| void PackDepthwiseIndirectWeightC4Fp32(const void *src, void *dst, int height, int width, int channel); | |||
| void PackDepthwiseIndirectWeightC8Fp32(const void *src, void *dst, int height, int width, int channel); | |||
| void PackNHWCToNHWC4Int8(const void *src, void *dst, int batch, int plane, int channel); | |||
| void PackNHWC4ToNHWCInt8(const void *src, void *dst, int batch, int plane, int channel); | |||
| @@ -147,7 +147,7 @@ kernel::LiteKernel *CpuConvDwFp32KernelCreator(const std::vector<lite::Tensor *> | |||
| conv_param->input_channel_ = inputs[kInputIndex]->Channel(); | |||
| conv_param->output_h_ = outputs[kOutputIndex]->Height(); | |||
| conv_param->output_w_ = outputs[kOutputIndex]->Width(); | |||
| #ifdef ENABLE_ARM64 | |||
| #if defined(ENABLE_ARM64) || defined(ENABLE_AVX) | |||
| if (CheckConvDwUseIndirectBuffer(conv_param)) { | |||
| kernel = | |||
| new (std::nothrow) kernel::ConvolutionDepthwiseIndirectCPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| @@ -47,37 +47,47 @@ int ConvolutionDepthwiseIndirectCPUKernel::InitWeightBias() { | |||
| // init weight: o, h, w, i; o == group, i == 1 | |||
| auto weight_tensor = in_tensors_[kWeightIndex]; | |||
| auto origin_weight = reinterpret_cast<float *>(weight_tensor->MutableData()); | |||
| int C4 = UP_DIV(weight_tensor->Batch(), C4NUM); | |||
| int pack_weight_size = C4NUM * C4 * weight_tensor->Height() * weight_tensor->Width(); | |||
| #ifdef ENABLE_AVX | |||
| int div_flag = C8NUM; | |||
| #else | |||
| int div_flag = C4NUM; | |||
| #endif | |||
| int batch_flag = UP_DIV(weight_tensor->Batch(), div_flag); | |||
| int pack_weight_size = div_flag * batch_flag * weight_tensor->Height() * weight_tensor->Width(); | |||
| packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float))); | |||
| if (packed_weight_ == nullptr) { | |||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||
| return RET_ERROR; | |||
| } | |||
| #ifdef ENABLE_AVX | |||
| PackDepthwiseIndirectWeightC8Fp32(origin_weight, packed_weight_, weight_tensor->Height(), weight_tensor->Width(), | |||
| weight_tensor->Batch()); | |||
| #else | |||
| PackDepthwiseIndirectWeightC4Fp32(origin_weight, packed_weight_, weight_tensor->Height(), weight_tensor->Width(), | |||
| weight_tensor->Batch()); | |||
| #endif | |||
| auto bias_tensor = in_tensors_[kBiasIndex]; | |||
| bias_data_ = reinterpret_cast<float *>(malloc(C4NUM * C4 * sizeof(float))); | |||
| bias_data_ = reinterpret_cast<float *>(malloc(batch_flag * div_flag * sizeof(float))); | |||
| if (bias_data_ == nullptr) { | |||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||
| return RET_ERROR; | |||
| } | |||
| memset(bias_data_, 0, C4NUM * C4 * sizeof(float)); | |||
| memset(bias_data_, 0, batch_flag * div_flag * sizeof(float)); | |||
| if (in_tensors_.size() == kInputSize2) { | |||
| auto ori_bias = reinterpret_cast<float *>(bias_tensor->MutableData()); | |||
| memcpy(bias_data_, ori_bias, bias_tensor->ElementsNum() * sizeof(float)); | |||
| } | |||
| // malloc zero ptr | |||
| zero_ptr_ = reinterpret_cast<float *>(malloc(C4NUM * C4 * sizeof(float))); | |||
| zero_ptr_ = reinterpret_cast<float *>(malloc(batch_flag * div_flag * sizeof(float))); | |||
| if (zero_ptr_ == nullptr) { | |||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||
| return RET_ERROR; | |||
| } | |||
| memset(zero_ptr_, 0, C4NUM * C4 * sizeof(float)); | |||
| memset(zero_ptr_, 0, batch_flag * div_flag * sizeof(float)); | |||
| return RET_OK; | |||
| } | |||
| @@ -139,8 +149,13 @@ int ConvDwIndirectRun(void *cdata, int task_id) { | |||
| } | |||
| int ConvolutionDepthwiseIndirectCPUKernel::MallocPackedInput() { | |||
| int IC4 = UP_DIV(conv_param_->input_channel_, C4NUM); | |||
| int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * IC4; | |||
| #ifdef ENABLE_AVX | |||
| int div_flag = C8NUM; | |||
| #else | |||
| int div_flag = C4NUM; | |||
| #endif | |||
| int IC_DIV = UP_DIV(conv_param_->input_channel_, div_flag); | |||
| int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * div_flag * IC_DIV; | |||
| packed_input_ = reinterpret_cast<float *>(context_->allocator->Malloc(pack_input_size * sizeof(float))); | |||
| if (packed_input_ == nullptr) { | |||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||
| @@ -152,14 +167,24 @@ int ConvolutionDepthwiseIndirectCPUKernel::MallocPackedInput() { | |||
| int ConvolutionDepthwiseIndirectCPUKernel::Run() { | |||
| auto input_tensor = in_tensors_.at(kInputIndex); | |||
| auto input_ptr = reinterpret_cast<float *>(input_tensor->data_c()); | |||
| if (conv_param_->input_channel_ % C4NUM != 0) { | |||
| #ifdef ENABLE_AVX | |||
| int div_flag = C8NUM; | |||
| #else | |||
| int div_flag = C4NUM; | |||
| #endif | |||
| if (conv_param_->input_channel_ % div_flag != 0) { | |||
| auto ret = MallocPackedInput(); | |||
| if (ret != 0) { | |||
| MS_LOG(ERROR) << "Convolution depthwise fp32 indirect buffer MallocPackedInput failed."; | |||
| return RET_ERROR; | |||
| } | |||
| #ifdef ENABLE_AVX | |||
| PackNHWCToNHWC8Fp32(input_ptr, packed_input_, conv_param_->input_batch_, | |||
| conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); | |||
| #else | |||
| PackNHWCToNHWC4Fp32(input_ptr, packed_input_, conv_param_->input_batch_, | |||
| conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); | |||
| #endif | |||
| } else { | |||
| packed_input_ = input_ptr; | |||
| } | |||
| @@ -174,7 +199,7 @@ int ConvolutionDepthwiseIndirectCPUKernel::Run() { | |||
| MS_LOG(ERROR) << "ConvDwIndirectRun error: error_code[" << ret << "]"; | |||
| return RET_ERROR; | |||
| } | |||
| if (conv_param_->input_channel_ % C4NUM != 0) { | |||
| if (conv_param_->input_channel_ % div_flag != 0) { | |||
| context_->allocator->Free(packed_input_); | |||
| } | |||
| return RET_OK; | |||