convolutiondepthwise arm fp16sa pack8

6 years ago · c6d7525367
--- a/src/layer/arm/convolution_arm.cpp
+++ b/src/layer/arm/convolution_arm.cpp
@@ -1323,7 +1323,7 @@ int Convolution_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const
                            sum = bias_data[p];
                        }

                        const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * channels * p;
                        const __fp16* kptr = weight_data_fp16.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
@@ -1881,7 +1881,7 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const
                            sum = bias_data[p];
                        }

                        const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * channels * p;
                        const __fp16* kptr = weight_data_fp16.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
--- a/src/layer/arm/convolutiondepthwise_arm.cpp
+++ b/src/layer/arm/convolutiondepthwise_arm.cpp
@@ -114,12 +114,27 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt)
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        if (opt.use_fp16_storage)
        {
            if (opt.use_packing_layout)
            {
                elempack = opt.use_fp16_arithmetic && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1;
            }

            if (elempack == 8)
            {
                Mat weight_data_r2 = weight_data.reshape(maxk, group);
                Mat weight_data_r2_packed;
                convert_packing(weight_data_r2, weight_data_r2_packed, 8);

                ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_fp16, opt);
            }

            if (elempack == 4)
            {
                Mat weight_data_r2 = weight_data.reshape(maxk, group);
                convert_packing(weight_data_r2, weight_data_pack4, 4);
                Mat weight_data_r2_packed;
                convert_packing(weight_data_r2, weight_data_r2_packed, 4);

                ncnn::cast_float32_to_float16(weight_data_pack4, weight_data_pack4_fp16, opt);
                ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_fp16, opt);
            }

            if (elempack == 1)
@@ -623,7 +638,7 @@ int ConvolutionDepthWise_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blo
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_pack4_fp16 + maxk * g * 4;
                    const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * 4;
                    const Mat m = bottom_blob_bordered.channel(g);

                    for (int i = 0; i < outh; i++)
@@ -655,8 +670,6 @@ int ConvolutionDepthWise_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blo
                    }
                }
            }

            return 0;
        }

        if (elempack == 1)
@@ -794,7 +807,11 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;
    int out_elempack = (support_packing && opt.use_packing_layout && num_output % 4 == 0) ? 4 : 1;
    int out_elempack = 1;
    if (opt.use_packing_layout)
    {
        out_elempack = opt.use_fp16_arithmetic && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
    }
    size_t out_elemsize = elemsize / elempack * out_elempack;

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
@@ -804,6 +821,68 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl
    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
        if (elempack == 8)
        {
            {
                const int maxk = kernel_w * kernel_h;

                // kernel offsets
                std::vector<int> _space_ofs(maxk);
                int* space_ofs = &_space_ofs[0];
                {
                    int p1 = 0;
                    int p2 = 0;
                    int gap = w * dilation_h - kernel_w * dilation_w;
                    for (int i = 0; i < kernel_h; i++)
                    {
                        for (int j = 0; j < kernel_w; j++)
                        {
                            space_ofs[p1] = p2;
                            p1++;
                            p2 += dilation_w;
                        }
                        p2 += gap;
                    }
                }

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * 8;
                    const Mat m = bottom_blob_bordered.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float16x8_t _sum = vdupq_n_f16((__fp16)0.f);

                            if (bias_term)
                            {
                                _sum = vld1q_f16(((const __fp16*)bias_data_fp16) + g * 8);
                            }

                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * 8;

                            for (int k = 0; k < maxk; k++)
                            {
                                float16x8_t _val = vld1q_f16(sptr + space_ofs[k] * 8);
                                float16x8_t _w = vld1q_f16(kptr + k * 8);
                                _sum = vfmaq_f16(_sum, _val, _w);
                            }

                            _sum = activation_ps(_sum, activation_type, activation_params);

                            vst1q_f16(outptr + j * 8, _sum);
                        }

                        outptr += outw * 8;
                    }
                }
            }
        }

        if (elempack == 4)
        {
            {
@@ -832,7 +911,7 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_pack4_fp16 + maxk * g * 4;
                    const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * 4;
                    const Mat m = bottom_blob_bordered.channel(g);

                    for (int i = 0; i < outh; i++)
@@ -864,8 +943,6 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl
                    }
                }
            }

            return 0;
        }

        if (elempack == 1)
@@ -960,22 +1037,27 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl
    const int channels_g = channels * elempack / group;
    const int num_output_g = num_output / group;

    int g_elempack = (support_packing && opt.use_packing_layout && channels_g % 4 == 0) ? 4 : 1;
    int out_g_elempack = (support_packing && opt.use_packing_layout && num_output_g % 4 == 0) ? 4 : 1;
    int g_elempack = 1;
    int out_g_elempack = 1;
    if (opt.use_packing_layout)
    {
        g_elempack = opt.use_fp16_arithmetic && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
        out_g_elempack = opt.use_fp16_arithmetic && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;
    }

    // unpacking
    Mat bottom_blob_bordered_unpacked = bottom_blob_bordered;
    if (elempack == 4 && g_elempack == 1)
    if (elempack > g_elempack)
    {
        Option opt_p = opt;
        opt_p.blob_allocator = opt.workspace_allocator;
        convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, 1, opt_p);
        convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p);
    }

    Mat top_blob_unpacked = top_blob;
    if (out_g_elempack == 1 && out_elempack == 4)
    if (out_g_elempack < out_elempack)
    {
        top_blob_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator);
        top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator);
        if (top_blob_unpacked.empty())
            return -100;
    }
@@ -995,9 +1077,9 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl
    }

    // packing
    if (out_g_elempack == 1 && out_elempack == 4)
    if (out_g_elempack < out_elempack)
    {
        convert_packing(top_blob_unpacked, top_blob, 4, opt);
        convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
    }
    else
    {
--- a/src/layer/arm/convolutiondepthwise_arm.h
+++ b/src/layer/arm/convolutiondepthwise_arm.h
@@ -48,7 +48,6 @@ public:

    // fp16
    Mat weight_data_fp16;
    Mat weight_data_pack4_fp16;
    Mat bias_data_fp16;

    // bf16