Browse Source

conv1x1 pack4to1 bf16s neon kernel

tags/20200413
nihuini 6 years ago
parent
commit
c6ebd13afb
3 changed files with 2163 additions and 7 deletions
  1. +2123
    -0
      src/layer/arm/convolution_1x1_pack4to1_bf16s.h
  2. +6
    -6
      src/layer/arm/convolution_3x3_pack1to4_bf16s.h
  3. +34
    -1
      src/layer/arm/convolution_arm.cpp

+ 2123
- 0
src/layer/arm/convolution_1x1_pack4to1_bf16s.h
File diff suppressed because it is too large
View File


+ 6
- 6
src/layer/arm/convolution_3x3_pack1to4_bf16s.h View File

@@ -2102,7 +2102,7 @@ static void conv3x3s2_pack1to4_bf16s_neon(const Mat& bottom_blob, Mat& top_blob,
"fmla v12.4s, %17.4s, v1.s[0] \n"
"fmla v13.4s, %17.4s, v1.s[2] \n"

"ld1 {v4.h}[7], [%1] \n"
"ld1 {v4.h}[0], [%1] \n"

"fmla v6.4s, %9.4s, v0.s[1] \n"
"fmla v7.4s, %9.4s, v0.s[3] \n"
@@ -2142,7 +2142,7 @@ static void conv3x3s2_pack1to4_bf16s_neon(const Mat& bottom_blob, Mat& top_blob,
"fmla v12.4s, %20.4s, v3.s[0] \n"
"fmla v13.4s, %20.4s, v3.s[2] \n"

"ld1 {v5.h}[7], [%2] \n"
"ld1 {v5.h}[0], [%2] \n"

"fmla v6.4s, %12.4s, v2.s[1] \n"
"fmla v7.4s, %12.4s, v2.s[3] \n"
@@ -2183,7 +2183,7 @@ static void conv3x3s2_pack1to4_bf16s_neon(const Mat& bottom_blob, Mat& top_blob,
"fmla v12.4s, %23.4s, v1.s[0] \n"
"fmla v13.4s, %23.4s, v1.s[2] \n"

"ld1 {v4.h}[7], [%3] \n"
"ld1 {v4.h}[0], [%3] \n"

"fmla v6.4s, %15.4s, v0.s[1] \n"
"fmla v7.4s, %15.4s, v0.s[3] \n"
@@ -2462,7 +2462,7 @@ static void conv3x3s2_pack1to4_bf16s_neon(const Mat& bottom_blob, Mat& top_blob,
"fmla v12.4s, %21.4s, v1.s[0] \n"
"fmla v13.4s, %21.4s, v1.s[2] \n"

"ld1 {v4.h}[7], [%3] \n"
"ld1 {v4.h}[0], [%3] \n"

"fmla v6.4s, %13.4s, v0.s[1] \n"
"fmla v7.4s, %13.4s, v0.s[3] \n"
@@ -2503,7 +2503,7 @@ static void conv3x3s2_pack1to4_bf16s_neon(const Mat& bottom_blob, Mat& top_blob,
"fmla v12.4s, %24.4s, v3.s[0] \n"
"fmla v13.4s, %24.4s, v3.s[2] \n"

"ld1 {v5.h}[7], [%4] \n"
"ld1 {v5.h}[0], [%4] \n"

"fmla v6.4s, %16.4s, v2.s[1] \n"
"fmla v7.4s, %16.4s, v2.s[3] \n"
@@ -2544,7 +2544,7 @@ static void conv3x3s2_pack1to4_bf16s_neon(const Mat& bottom_blob, Mat& top_blob,
"fmla v12.4s, %27.4s, v1.s[0] \n"
"fmla v13.4s, %27.4s, v1.s[2] \n"

"ld1 {v4.h}[7], [%5] \n"
"ld1 {v4.h}[0], [%5] \n"

"fmla v6.4s, %19.4s, v0.s[1] \n"
"fmla v7.4s, %19.4s, v0.s[3] \n"


+ 34
- 1
src/layer/arm/convolution_arm.cpp View File

@@ -46,8 +46,8 @@ namespace ncnn {
#include "convolution_5x5_pack4.h"
#include "convolution_7x7_pack1to4.h"


#include "convolution_1x1_pack4_bf16s.h"
#include "convolution_1x1_pack4to1_bf16s.h"
#include "convolution_3x3_pack4_bf16s.h"
#include "convolution_3x3_pack1to4_bf16s.h"
#endif // __ARM_NEON
@@ -1233,6 +1233,39 @@ int Convolution_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const

if (elempack == 4 && out_elempack == 1)
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv1x1s1_sgemm_pack4to1_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack4to1_bf16, bias_data, opt);

if (activation)
{
activation->forward_inplace(top_blob, opt);
}
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
conv1x1s2_pack4to1_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack4to1_bf16, bias_data, opt);

if (activation)
{
activation->forward_inplace(top_blob, opt);
}
}
else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
fprintf(stderr, "implement me!\n");

// // TODO more proper condition
// conv3x3s1_winograd64_pack4to1_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack4to1, bias_data, opt);
//
// // conv3x3s1_pack4to1_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack4to1, bias_data, opt);

if (activation)
{
activation->forward_inplace(top_blob, opt);
}
}
else
{
// num_output
#pragma omp parallel for num_threads(opt.num_threads)


Loading…
Cancel
Save