SGEMM-based 1x1 convolution (WIP): about 20%~75% faster on aarch64; not enabled on armv7 because the armv7 compiler generates poor code for it

8 years ago · 0fdb8da60e
--- a/src/layer/arm/convolution_1x1.h
+++ b/src/layer/arm/convolution_1x1.h
--- a/src/layer/arm/convolution_arm.cpp
+++ b/src/layer/arm/convolution_arm.cpp
@@ -32,6 +32,7 @@ int Convolution_arm::load_param(const ParamDict& pd)
        return ret;

    use_winograd3x3 = false;
    use_sgemm1x1 = false;

    if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
    {
@@ -41,6 +42,17 @@ int Convolution_arm::load_param(const ParamDict& pd)
            use_winograd3x3 = true;
    }

 #if __aarch64__
    // Decide whether this layer may use the sgemm-based 1x1 convolution path.
    // This check is only compiled for aarch64 builds.
    // TODO: enable on armv7 too; excluded for now because the armv7 compiler
    //       reportedly generates poor code for this kernel.
    // TODO: refine the eligibility condition below.
    if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
    {
        // Number of weights per output channel; with a 1x1 kernel this is
        // presumably the input channel count.
        int num_input = weight_data_size / num_output;
        // Only opt in when both counts are at least 16.
        if (num_input >= 16 && num_output >= 16)
            use_sgemm1x1 = true;
    }
 #endif // __aarch64__

    return 0;
 }

@@ -57,6 +69,12 @@ int Convolution_arm::load_model(const ModelBin& mb)
        conv3x3s1_winograd64_transform_kernel_neon5(weight_data, weight_3x3_winograd64_data, num_input, num_output);
    }

    // When the sgemm 1x1 path is enabled, run the sgemm kernel transform on
    // weight_data; the result presumably lands in weight_1x1_sgemm_data.
    if (use_sgemm1x1)
    {
        // Number of weights per output channel.
        int num_input = weight_data_size / num_output;
        conv1x1s1_sgemm_transform_kernel_neon(weight_data, weight_1x1_sgemm_data, num_input, num_output);
    }

    return 0;
 }

@@ -297,6 +315,11 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
 //         conv3x3s1_winograd64_neon4(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data);
        conv3x3s1_winograd64_neon5(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data);
    }
    else if (use_sgemm1x1 && w <= 120 && h <= 120)
    {
        // sgemm 1x1 path: taken only when the layer opted in (use_sgemm1x1)
        // and both w and h are at most 120; it uses weight_1x1_sgemm_data
        // rather than weight_data.
        // TODO: refine this size condition.
        conv1x1s1_sgemm_neon(bottom_blob_bordered, top_blob, weight_1x1_sgemm_data, bias_data);
    }
    else
        conv(bottom_blob_bordered, top_blob, weight_data, bias_data);

--- a/src/layer/arm/convolution_arm.h
+++ b/src/layer/arm/convolution_arm.h
@@ -33,7 +33,9 @@ public:

 public:
    // Path-selection flag for the winograd 3x3 implementation; assigned in load_param.
    bool use_winograd3x3;
    // Path-selection flag for the sgemm 1x1 implementation; assigned in load_param.
    bool use_sgemm1x1;
    // Weight blob passed to conv3x3s1_winograd64_neon5 in forward.
    Mat weight_3x3_winograd64_data;
    // Weight blob passed to conv1x1s1_sgemm_neon in forward.
    Mat weight_1x1_sgemm_data;
 };

 } // namespace ncnn