From c6d7525367a4e66063bf82cce050d04ec60cb3ae Mon Sep 17 00:00:00 2001
From: nihuini <nihuini@tencent.com>
Date: Thu, 30 Jul 2020 18:49:24 +0800
Subject: [PATCH] convolutiondepthwise arm fp16sa pack8

---
 src/layer/arm/convolution_arm.cpp          |   4 +-
 src/layer/arm/convolutiondepthwise_arm.cpp | 116 ++++++++++++++++++---
 src/layer/arm/convolutiondepthwise_arm.h   |   1 -
 3 files changed, 101 insertions(+), 20 deletions(-)

diff --git a/src/layer/arm/convolution_arm.cpp b/src/layer/arm/convolution_arm.cpp
index 2485a0e73..ea97a821b 100644
--- a/src/layer/arm/convolution_arm.cpp
+++ b/src/layer/arm/convolution_arm.cpp
@@ -1323,7 +1323,7 @@ int Convolution_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const
                             sum = bias_data[p];
                         }
 
-                        const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * channels * p;
+                        const __fp16* kptr = weight_data_fp16.channel(p);
 
                         // channels
                         for (int q = 0; q < channels; q++)
@@ -1881,7 +1881,7 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const
                             sum = bias_data[p];
                         }
 
-                        const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * channels * p;
+                        const __fp16* kptr = weight_data_fp16.channel(p);
 
                         // channels
                         for (int q = 0; q < channels; q++)
diff --git a/src/layer/arm/convolutiondepthwise_arm.cpp b/src/layer/arm/convolutiondepthwise_arm.cpp
index e12b0eda9..ef3d5267d 100644
--- a/src/layer/arm/convolutiondepthwise_arm.cpp
+++ b/src/layer/arm/convolutiondepthwise_arm.cpp
@@ -114,12 +114,27 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt)
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         if (opt.use_fp16_storage)
         {
+            if (opt.use_packing_layout)
+            {
+                elempack = opt.use_fp16_arithmetic && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1;
+            }
+
+            if (elempack == 8)
+            {
+                Mat weight_data_r2 = weight_data.reshape(maxk, group);
+                Mat weight_data_r2_packed;
+                convert_packing(weight_data_r2, weight_data_r2_packed, 8);
+
+                ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_fp16, opt);
+            }
+
             if (elempack == 4)
             {
                 Mat weight_data_r2 = weight_data.reshape(maxk, group);
-                convert_packing(weight_data_r2, weight_data_pack4, 4);
+                Mat weight_data_r2_packed;
+                convert_packing(weight_data_r2, weight_data_r2_packed, 4);
 
-                ncnn::cast_float32_to_float16(weight_data_pack4, weight_data_pack4_fp16, opt);
+                ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_fp16, opt);
             }
 
             if (elempack == 1)
@@ -623,7 +638,7 @@ int ConvolutionDepthWise_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blo
                 for (int g = 0; g < channels; g++)
                 {
                     __fp16* outptr = top_blob.channel(g);
-                    const __fp16* kptr = (const __fp16*)weight_data_pack4_fp16 + maxk * g * 4;
+                    const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * 4;
                     const Mat m = bottom_blob_bordered.channel(g);
 
                     for (int i = 0; i < outh; i++)
@@ -655,8 +670,6 @@ int ConvolutionDepthWise_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blo
                     }
                 }
             }
-
-            return 0;
         }
 
         if (elempack == 1)
@@ -794,7 +807,11 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl
 
     int outw = (w - kernel_extent_w) / stride_w + 1;
     int outh = (h - kernel_extent_h) / stride_h + 1;
-    int out_elempack = (support_packing && opt.use_packing_layout && num_output % 4 == 0) ? 4 : 1;
+    int out_elempack = 1;
+    if (opt.use_packing_layout)
+    {
+        out_elempack = opt.use_fp16_arithmetic && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
+    }
     size_t out_elemsize = elemsize / elempack * out_elempack;
 
     top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
@@ -804,6 +821,68 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl
     // depth-wise
     if (channels * elempack == group && group == num_output)
     {
+        if (elempack == 8)
+        {
+            {
+                const int maxk = kernel_w * kernel_h;
+
+                // kernel offsets
+                std::vector<int> _space_ofs(maxk);
+                int* space_ofs = &_space_ofs[0];
+                {
+                    int p1 = 0;
+                    int p2 = 0;
+                    int gap = w * dilation_h - kernel_w * dilation_w;
+                    for (int i = 0; i < kernel_h; i++)
+                    {
+                        for (int j = 0; j < kernel_w; j++)
+                        {
+                            space_ofs[p1] = p2;
+                            p1++;
+                            p2 += dilation_w;
+                        }
+                        p2 += gap;
+                    }
+                }
+
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int g = 0; g < channels; g++)
+                {
+                    __fp16* outptr = top_blob.channel(g);
+                    const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * 8;
+                    const Mat m = bottom_blob_bordered.channel(g);
+
+                    for (int i = 0; i < outh; i++)
+                    {
+                        for (int j = 0; j < outw; j++)
+                        {
+                            float16x8_t _sum = vdupq_n_f16((__fp16)0.f);
+
+                            if (bias_term)
+                            {
+                                _sum = vld1q_f16(((const __fp16*)bias_data_fp16) + g * 8);
+                            }
+
+                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * 8;
+
+                            for (int k = 0; k < maxk; k++)
+                            {
+                                float16x8_t _val = vld1q_f16(sptr + space_ofs[k] * 8);
+                                float16x8_t _w = vld1q_f16(kptr + k * 8);
+                                _sum = vfmaq_f16(_sum, _val, _w);
+                            }
+
+                            _sum = activation_ps(_sum, activation_type, activation_params);
+
+                            vst1q_f16(outptr + j * 8, _sum);
+                        }
+
+                        outptr += outw * 8;
+                    }
+                }
+            }
+        }
+
         if (elempack == 4)
         {
             {
@@ -832,7 +911,7 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl
                 for (int g = 0; g < channels; g++)
                 {
                     __fp16* outptr = top_blob.channel(g);
-                    const __fp16* kptr = (const __fp16*)weight_data_pack4_fp16 + maxk * g * 4;
+                    const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * 4;
                     const Mat m = bottom_blob_bordered.channel(g);
 
                     for (int i = 0; i < outh; i++)
@@ -864,8 +943,6 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl
                     }
                 }
             }
-
-            return 0;
         }
 
         if (elempack == 1)
@@ -960,22 +1037,27 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl
     const int channels_g = channels * elempack / group;
     const int num_output_g = num_output / group;
 
-    int g_elempack = (support_packing && opt.use_packing_layout && channels_g % 4 == 0) ? 4 : 1;
-    int out_g_elempack = (support_packing && opt.use_packing_layout && num_output_g % 4 == 0) ? 4 : 1;
+    int g_elempack = 1;
+    int out_g_elempack = 1;
+    if (opt.use_packing_layout)
+    {
+        g_elempack = opt.use_fp16_arithmetic && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
+        out_g_elempack = opt.use_fp16_arithmetic && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;
+    }
 
     // unpacking
     Mat bottom_blob_bordered_unpacked = bottom_blob_bordered;
-    if (elempack == 4 && g_elempack == 1)
+    if (elempack > g_elempack)
     {
         Option opt_p = opt;
         opt_p.blob_allocator = opt.workspace_allocator;
-        convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, 1, opt_p);
+        convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p);
     }
 
     Mat top_blob_unpacked = top_blob;
-    if (out_g_elempack == 1 && out_elempack == 4)
+    if (out_g_elempack < out_elempack)
     {
-        top_blob_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator);
+        top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator);
         if (top_blob_unpacked.empty())
             return -100;
     }
@@ -995,9 +1077,9 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl
     }
 
     // packing
-    if (out_g_elempack == 1 && out_elempack == 4)
+    if (out_g_elempack < out_elempack)
     {
-        convert_packing(top_blob_unpacked, top_blob, 4, opt);
+        convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
     }
     else
     {
diff --git a/src/layer/arm/convolutiondepthwise_arm.h b/src/layer/arm/convolutiondepthwise_arm.h
index c421c446a..af8fda730 100644
--- a/src/layer/arm/convolutiondepthwise_arm.h
+++ b/src/layer/arm/convolutiondepthwise_arm.h
@@ -48,7 +48,6 @@ public:
 
     // fp16
     Mat weight_data_fp16;
-    Mat weight_data_pack4_fp16;
     Mat bias_data_fp16;
 
     // bf16