From c6d7525367a4e66063bf82cce050d04ec60cb3ae Mon Sep 17 00:00:00 2001 From: nihuini Date: Thu, 30 Jul 2020 18:49:24 +0800 Subject: [PATCH] convolutiondepthwise arm fp16sa pack8 --- src/layer/arm/convolution_arm.cpp | 4 +- src/layer/arm/convolutiondepthwise_arm.cpp | 116 ++++++++++++++++++--- src/layer/arm/convolutiondepthwise_arm.h | 1 - 3 files changed, 101 insertions(+), 20 deletions(-) diff --git a/src/layer/arm/convolution_arm.cpp b/src/layer/arm/convolution_arm.cpp index 2485a0e73..ea97a821b 100644 --- a/src/layer/arm/convolution_arm.cpp +++ b/src/layer/arm/convolution_arm.cpp @@ -1323,7 +1323,7 @@ int Convolution_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const sum = bias_data[p]; } - const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * channels * p; + const __fp16* kptr = weight_data_fp16.channel(p); // channels for (int q = 0; q < channels; q++) @@ -1881,7 +1881,7 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const sum = bias_data[p]; } - const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * channels * p; + const __fp16* kptr = weight_data_fp16.channel(p); // channels for (int q = 0; q < channels; q++) diff --git a/src/layer/arm/convolutiondepthwise_arm.cpp b/src/layer/arm/convolutiondepthwise_arm.cpp index e12b0eda9..ef3d5267d 100644 --- a/src/layer/arm/convolutiondepthwise_arm.cpp +++ b/src/layer/arm/convolutiondepthwise_arm.cpp @@ -114,12 +114,27 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt) #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC if (opt.use_fp16_storage) { + if (opt.use_packing_layout) + { + elempack = opt.use_fp16_arithmetic && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1; + } + + if (elempack == 8) + { + Mat weight_data_r2 = weight_data.reshape(maxk, group); + Mat weight_data_r2_packed; + convert_packing(weight_data_r2, weight_data_r2_packed, 8); + + ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_fp16, opt); + } + if (elempack == 4) { Mat weight_data_r2 = weight_data.reshape(maxk, group); - convert_packing(weight_data_r2, weight_data_pack4, 4); + Mat weight_data_r2_packed; + convert_packing(weight_data_r2, weight_data_r2_packed, 4); - ncnn::cast_float32_to_float16(weight_data_pack4, weight_data_pack4_fp16, opt); + ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_fp16, opt); } if (elempack == 1) @@ -623,7 +638,7 @@ int ConvolutionDepthWise_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blo for (int g = 0; g < channels; g++) { __fp16* outptr = top_blob.channel(g); - const __fp16* kptr = (const __fp16*)weight_data_pack4_fp16 + maxk * g * 4; + const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * 4; const Mat m = bottom_blob_bordered.channel(g); for (int i = 0; i < outh; i++) @@ -655,8 +670,6 @@ int ConvolutionDepthWise_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blo } } } - - return 0; } if (elempack == 1) @@ -794,7 +807,11 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl int outw = (w - kernel_extent_w) / stride_w + 1; int outh = (h - kernel_extent_h) / stride_h + 1; - int out_elempack = (support_packing && opt.use_packing_layout && num_output % 4 == 0) ? 4 : 1; + int out_elempack = 1; + if (opt.use_packing_layout) + { + out_elempack = opt.use_fp16_arithmetic && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; + } size_t out_elemsize = elemsize / elempack * out_elempack; top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); @@ -804,6 +821,68 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl // depth-wise if (channels * elempack == group && group == num_output) { + if (elempack == 8) + { + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < channels; g++) + { + __fp16* outptr = top_blob.channel(g); + const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * 8; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float16x8_t _sum = vdupq_n_f16((__fp16)0.f); + + if (bias_term) + { + _sum = vld1q_f16(((const __fp16*)bias_data_fp16) + g * 8); + } + + const __fp16* sptr = m.row(i * stride_h) + j * stride_w * 8; + + for (int k = 0; k < maxk; k++) + { + float16x8_t _val = vld1q_f16(sptr + space_ofs[k] * 8); + float16x8_t _w = vld1q_f16(kptr + k * 8); + _sum = vfmaq_f16(_sum, _val, _w); + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + vst1q_f16(outptr + j * 8, _sum); + } + + outptr += outw * 8; + } + } + } + } + if (elempack == 4) { { @@ -832,7 +911,7 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl for (int g = 0; g < channels; g++) { __fp16* outptr = top_blob.channel(g); - const __fp16* kptr = (const __fp16*)weight_data_pack4_fp16 + maxk * g * 4; + const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * 4; const Mat m = bottom_blob_bordered.channel(g); for (int i = 0; i < outh; i++) @@ -864,8 +943,6 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl } } } - - return 0; } if (elempack == 1) @@ -960,22 +1037,27 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl const int channels_g = channels * elempack / group; const int num_output_g = num_output / group; - int g_elempack = (support_packing && opt.use_packing_layout && channels_g % 4 == 0) ? 4 : 1; - int out_g_elempack = (support_packing && opt.use_packing_layout && num_output_g % 4 == 0) ? 4 : 1; + int g_elempack = 1; + int out_g_elempack = 1; + if (opt.use_packing_layout) + { + g_elempack = opt.use_fp16_arithmetic && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1; + out_g_elempack = opt.use_fp16_arithmetic && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1; + } // unpacking Mat bottom_blob_bordered_unpacked = bottom_blob_bordered; - if (elempack == 4 && g_elempack == 1) + if (elempack > g_elempack) { Option opt_p = opt; opt_p.blob_allocator = opt.workspace_allocator; - convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, 1, opt_p); + convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p); } Mat top_blob_unpacked = top_blob; - if (out_g_elempack == 1 && out_elempack == 4) + if (out_g_elempack < out_elempack) { - top_blob_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator); + top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator); if (top_blob_unpacked.empty()) return -100; } @@ -995,9 +1077,9 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl } // packing - if (out_g_elempack == 1 && out_elempack == 4) + if (out_g_elempack < out_elempack) { - convert_packing(top_blob_unpacked, top_blob, 4, opt); + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); } else { diff --git a/src/layer/arm/convolutiondepthwise_arm.h b/src/layer/arm/convolutiondepthwise_arm.h index c421c446a..af8fda730 100644 --- a/src/layer/arm/convolutiondepthwise_arm.h +++ b/src/layer/arm/convolutiondepthwise_arm.h @@ -48,7 +48,6 @@ public: // fp16 Mat weight_data_fp16; - Mat weight_data_pack4_fp16; Mat bias_data_fp16; // bf16