|
|
|
@@ -114,12 +114,27 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt) |
|
|
|
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC |
|
|
|
if (opt.use_fp16_storage) |
|
|
|
{ |
|
|
|
if (opt.use_packing_layout) |
|
|
|
{ |
|
|
|
elempack = opt.use_fp16_arithmetic && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1; |
|
|
|
} |
|
|
|
|
|
|
|
if (elempack == 8) |
|
|
|
{ |
|
|
|
Mat weight_data_r2 = weight_data.reshape(maxk, group); |
|
|
|
Mat weight_data_r2_packed; |
|
|
|
convert_packing(weight_data_r2, weight_data_r2_packed, 8); |
|
|
|
|
|
|
|
ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_fp16, opt); |
|
|
|
} |
|
|
|
|
|
|
|
if (elempack == 4) |
|
|
|
{ |
|
|
|
Mat weight_data_r2 = weight_data.reshape(maxk, group); |
|
|
|
convert_packing(weight_data_r2, weight_data_pack4, 4); |
|
|
|
Mat weight_data_r2_packed; |
|
|
|
convert_packing(weight_data_r2, weight_data_r2_packed, 4); |
|
|
|
|
|
|
|
ncnn::cast_float32_to_float16(weight_data_pack4, weight_data_pack4_fp16, opt); |
|
|
|
ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_fp16, opt); |
|
|
|
} |
|
|
|
|
|
|
|
if (elempack == 1) |
|
|
|
@@ -623,7 +638,7 @@ int ConvolutionDepthWise_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blo |
|
|
|
for (int g = 0; g < channels; g++) |
|
|
|
{ |
|
|
|
__fp16* outptr = top_blob.channel(g); |
|
|
|
const __fp16* kptr = (const __fp16*)weight_data_pack4_fp16 + maxk * g * 4; |
|
|
|
const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * 4; |
|
|
|
const Mat m = bottom_blob_bordered.channel(g); |
|
|
|
|
|
|
|
for (int i = 0; i < outh; i++) |
|
|
|
@@ -655,8 +670,6 @@ int ConvolutionDepthWise_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blo |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
return 0; |
|
|
|
} |
|
|
|
|
|
|
|
if (elempack == 1) |
|
|
|
@@ -794,7 +807,11 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl |
|
|
|
|
|
|
|
int outw = (w - kernel_extent_w) / stride_w + 1; |
|
|
|
int outh = (h - kernel_extent_h) / stride_h + 1; |
|
|
|
int out_elempack = (support_packing && opt.use_packing_layout && num_output % 4 == 0) ? 4 : 1; |
|
|
|
int out_elempack = 1; |
|
|
|
if (opt.use_packing_layout) |
|
|
|
{ |
|
|
|
out_elempack = opt.use_fp16_arithmetic && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; |
|
|
|
} |
|
|
|
size_t out_elemsize = elemsize / elempack * out_elempack; |
|
|
|
|
|
|
|
top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); |
|
|
|
@@ -804,6 +821,68 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl |
|
|
|
// depth-wise |
|
|
|
if (channels * elempack == group && group == num_output) |
|
|
|
{ |
|
|
|
if (elempack == 8) |
|
|
|
{ |
|
|
|
{ |
|
|
|
const int maxk = kernel_w * kernel_h; |
|
|
|
|
|
|
|
// kernel offsets |
|
|
|
std::vector<int> _space_ofs(maxk); |
|
|
|
int* space_ofs = &_space_ofs[0]; |
|
|
|
{ |
|
|
|
int p1 = 0; |
|
|
|
int p2 = 0; |
|
|
|
int gap = w * dilation_h - kernel_w * dilation_w; |
|
|
|
for (int i = 0; i < kernel_h; i++) |
|
|
|
{ |
|
|
|
for (int j = 0; j < kernel_w; j++) |
|
|
|
{ |
|
|
|
space_ofs[p1] = p2; |
|
|
|
p1++; |
|
|
|
p2 += dilation_w; |
|
|
|
} |
|
|
|
p2 += gap; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
#pragma omp parallel for num_threads(opt.num_threads) |
|
|
|
for (int g = 0; g < channels; g++) |
|
|
|
{ |
|
|
|
__fp16* outptr = top_blob.channel(g); |
|
|
|
const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * 8; |
|
|
|
const Mat m = bottom_blob_bordered.channel(g); |
|
|
|
|
|
|
|
for (int i = 0; i < outh; i++) |
|
|
|
{ |
|
|
|
for (int j = 0; j < outw; j++) |
|
|
|
{ |
|
|
|
float16x8_t _sum = vdupq_n_f16((__fp16)0.f); |
|
|
|
|
|
|
|
if (bias_term) |
|
|
|
{ |
|
|
|
_sum = vld1q_f16(((const __fp16*)bias_data_fp16) + g * 8); |
|
|
|
} |
|
|
|
|
|
|
|
const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * 8; |
|
|
|
|
|
|
|
for (int k = 0; k < maxk; k++) |
|
|
|
{ |
|
|
|
float16x8_t _val = vld1q_f16(sptr + space_ofs[k] * 8); |
|
|
|
float16x8_t _w = vld1q_f16(kptr + k * 8); |
|
|
|
_sum = vfmaq_f16(_sum, _val, _w); |
|
|
|
} |
|
|
|
|
|
|
|
_sum = activation_ps(_sum, activation_type, activation_params); |
|
|
|
|
|
|
|
vst1q_f16(outptr + j * 8, _sum); |
|
|
|
} |
|
|
|
|
|
|
|
outptr += outw * 8; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
if (elempack == 4) |
|
|
|
{ |
|
|
|
{ |
|
|
|
@@ -832,7 +911,7 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl |
|
|
|
for (int g = 0; g < channels; g++) |
|
|
|
{ |
|
|
|
__fp16* outptr = top_blob.channel(g); |
|
|
|
const __fp16* kptr = (const __fp16*)weight_data_pack4_fp16 + maxk * g * 4; |
|
|
|
const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * 4; |
|
|
|
const Mat m = bottom_blob_bordered.channel(g); |
|
|
|
|
|
|
|
for (int i = 0; i < outh; i++) |
|
|
|
@@ -864,8 +943,6 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
return 0; |
|
|
|
} |
|
|
|
|
|
|
|
if (elempack == 1) |
|
|
|
@@ -960,22 +1037,27 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl |
|
|
|
const int channels_g = channels * elempack / group; |
|
|
|
const int num_output_g = num_output / group; |
|
|
|
|
|
|
|
int g_elempack = (support_packing && opt.use_packing_layout && channels_g % 4 == 0) ? 4 : 1; |
|
|
|
int out_g_elempack = (support_packing && opt.use_packing_layout && num_output_g % 4 == 0) ? 4 : 1; |
|
|
|
int g_elempack = 1; |
|
|
|
int out_g_elempack = 1; |
|
|
|
if (opt.use_packing_layout) |
|
|
|
{ |
|
|
|
g_elempack = opt.use_fp16_arithmetic && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1; |
|
|
|
out_g_elempack = opt.use_fp16_arithmetic && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1; |
|
|
|
} |
|
|
|
|
|
|
|
// unpacking |
|
|
|
Mat bottom_blob_bordered_unpacked = bottom_blob_bordered; |
|
|
|
if (elempack == 4 && g_elempack == 1) |
|
|
|
if (elempack > g_elempack) |
|
|
|
{ |
|
|
|
Option opt_p = opt; |
|
|
|
opt_p.blob_allocator = opt.workspace_allocator; |
|
|
|
convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, 1, opt_p); |
|
|
|
convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p); |
|
|
|
} |
|
|
|
|
|
|
|
Mat top_blob_unpacked = top_blob; |
|
|
|
if (out_g_elempack == 1 && out_elempack == 4) |
|
|
|
if (out_g_elempack < out_elempack) |
|
|
|
{ |
|
|
|
top_blob_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator); |
|
|
|
top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator); |
|
|
|
if (top_blob_unpacked.empty()) |
|
|
|
return -100; |
|
|
|
} |
|
|
|
@@ -995,9 +1077,9 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl |
|
|
|
} |
|
|
|
|
|
|
|
// packing |
|
|
|
if (out_g_elempack == 1 && out_elempack == 4) |
|
|
|
if (out_g_elempack < out_elempack) |
|
|
|
{ |
|
|
|
convert_packing(top_blob_unpacked, top_blob, 4, opt); |
|
|
|
convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); |
|
|
|
} |
|
|
|
else |
|
|
|
{ |
|
|
|
|