|
|
|
@@ -37,6 +37,8 @@ namespace ncnn { |
|
|
|
#include "convolution_1x1_int8.h" |
|
|
|
#include "convolution_3x3_int8.h" |
|
|
|
|
|
|
|
#include "convolution_1x1_bf16s.h" |
|
|
|
|
|
|
|
#if __ARM_NEON |
|
|
|
#include "convolution_1x1_pack4.h" |
|
|
|
#include "convolution_1x1_pack4to1.h" |
|
|
|
@@ -1192,7 +1194,14 @@ int Convolution_arm::create_pipeline_bf16s(const Option& opt) |
|
|
|
// pack1 |
|
|
|
// Prepare the bf16 weights for the pack1 path, i.e. when neither the input
// nor the output is packed (elempack == 1 && out_elempack == 1).
// The conversion is done exactly once, inside the branch that applies:
// an unconditional cast before the branch would either be overwritten by the
// kernel transform or simply repeated by the fallback branch.
if (elempack == 1 && out_elempack == 1)
{
    if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
    {
        // 1x1 kernel with unit stride and dilation: hand the fp32 weights to
        // the sgemm kernel transform, which receives weight_data_bf16 as its
        // second argument.
        // NOTE(review): presumably the transform (re)creates and fully fills
        // weight_data_bf16 in the layout conv1x1s1_sgemm_bf16s_neon expects,
        // so no prior cast is required - confirm against its definition.
        conv1x1s1_sgemm_transform_kernel_bf16s_neon(weight_data, weight_data_bf16, num_input, num_output);
    }
    else
    {
        // Every other kernel shape: plain fp32 -> bf16 cast of the weights.
        ncnn::cast_float32_to_bfloat16(weight_data, weight_data_bf16, opt);
    }
}
|
|
|
|
|
|
|
return 0; |
|
|
|
@@ -1542,6 +1551,16 @@ int Convolution_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const |
|
|
|
|
|
|
|
if (elempack == 1 && out_elempack == 1) |
|
|
|
{ |
|
|
|
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) |
|
|
|
{ |
|
|
|
conv1x1s1_sgemm_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_bf16, bias_data, opt); |
|
|
|
|
|
|
|
if (activation) |
|
|
|
{ |
|
|
|
activation->forward_inplace(top_blob, opt); |
|
|
|
} |
|
|
|
} |
|
|
|
else |
|
|
|
{ |
|
|
|
// num_output |
|
|
|
#pragma omp parallel for num_threads(opt.num_threads) |
|
|
|
|