Browse Source

sgemm 1x1 convolution (WIP): about 20%~75% faster on aarch64; not enabled on armv7, where the compiler handles it poorly (qaq)

tags/20180704
nihuini 8 years ago
parent
commit
0fdb8da60e
3 changed files with 1262 additions and 1 deletions
  1. +1237
    -1
      src/layer/arm/convolution_1x1.h
  2. +23
    -0
      src/layer/arm/convolution_arm.cpp
  3. +2
    -0
      src/layer/arm/convolution_arm.h

+ 1237
- 1
src/layer/arm/convolution_1x1.h
File diff suppressed because it is too large
View File


+ 23
- 0
src/layer/arm/convolution_arm.cpp View File

@@ -32,6 +32,7 @@ int Convolution_arm::load_param(const ParamDict& pd)
return ret;

use_winograd3x3 = false;
use_sgemm1x1 = false;

if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
@@ -41,6 +42,17 @@ int Convolution_arm::load_param(const ParamDict& pd)
use_winograd3x3 = true;
}

#if __aarch64__
// TODO: enabled only on aarch64 for now; the armv7 compiler handles this poorly
// TODO: work out a more appropriate condition for enabling the 1x1 sgemm path
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
int num_input = weight_data_size / num_output;
if (num_input >= 16 && num_output >= 16)
use_sgemm1x1 = true;
}
#endif // __aarch64__

return 0;
}

@@ -57,6 +69,12 @@ int Convolution_arm::load_model(const ModelBin& mb)
conv3x3s1_winograd64_transform_kernel_neon5(weight_data, weight_3x3_winograd64_data, num_input, num_output);
}

if (use_sgemm1x1)
{
int num_input = weight_data_size / num_output;
conv1x1s1_sgemm_transform_kernel_neon(weight_data, weight_1x1_sgemm_data, num_input, num_output);
}

return 0;
}

@@ -297,6 +315,11 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
// conv3x3s1_winograd64_neon4(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data);
conv3x3s1_winograd64_neon5(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data);
}
else if (use_sgemm1x1 && w <= 120 && h <= 120)
{
// TODO: work out a more appropriate condition (currently limited to w <= 120 && h <= 120)
conv1x1s1_sgemm_neon(bottom_blob_bordered, top_blob, weight_1x1_sgemm_data, bias_data);
}
else
conv(bottom_blob_bordered, top_blob, weight_data, bias_data);



+ 2
- 0
src/layer/arm/convolution_arm.h View File

@@ -33,7 +33,9 @@ public:

public:
bool use_winograd3x3;
bool use_sgemm1x1;
Mat weight_3x3_winograd64_data;
Mat weight_1x1_sgemm_data;
};

} // namespace ncnn


Loading…
Cancel
Save