Browse Source

convolutiondepthwise arm fp16sa pack8

tags/20200916
nihuini 5 years ago
parent
commit
c6d7525367
3 changed files with 101 additions and 20 deletions
  1. +2
    -2
      src/layer/arm/convolution_arm.cpp
  2. +99
    -17
      src/layer/arm/convolutiondepthwise_arm.cpp
  3. +0
    -1
      src/layer/arm/convolutiondepthwise_arm.h

+ 2
- 2
src/layer/arm/convolution_arm.cpp View File

@@ -1323,7 +1323,7 @@ int Convolution_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const
sum = bias_data[p];
}

const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * channels * p;
const __fp16* kptr = weight_data_fp16.channel(p);

// channels
for (int q = 0; q < channels; q++)
@@ -1881,7 +1881,7 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const
sum = bias_data[p];
}

const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * channels * p;
const __fp16* kptr = weight_data_fp16.channel(p);

// channels
for (int q = 0; q < channels; q++)


+ 99
- 17
src/layer/arm/convolutiondepthwise_arm.cpp View File

@@ -114,12 +114,27 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt)
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
if (opt.use_fp16_storage)
{
if (opt.use_packing_layout)
{
elempack = opt.use_fp16_arithmetic && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1;
}

if (elempack == 8)
{
Mat weight_data_r2 = weight_data.reshape(maxk, group);
Mat weight_data_r2_packed;
convert_packing(weight_data_r2, weight_data_r2_packed, 8);

ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_fp16, opt);
}

if (elempack == 4)
{
Mat weight_data_r2 = weight_data.reshape(maxk, group);
convert_packing(weight_data_r2, weight_data_pack4, 4);
Mat weight_data_r2_packed;
convert_packing(weight_data_r2, weight_data_r2_packed, 4);

ncnn::cast_float32_to_float16(weight_data_pack4, weight_data_pack4_fp16, opt);
ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_fp16, opt);
}

if (elempack == 1)
@@ -623,7 +638,7 @@ int ConvolutionDepthWise_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blo
for (int g = 0; g < channels; g++)
{
__fp16* outptr = top_blob.channel(g);
const __fp16* kptr = (const __fp16*)weight_data_pack4_fp16 + maxk * g * 4;
const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * 4;
const Mat m = bottom_blob_bordered.channel(g);

for (int i = 0; i < outh; i++)
@@ -655,8 +670,6 @@ int ConvolutionDepthWise_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blo
}
}
}

return 0;
}

if (elempack == 1)
@@ -794,7 +807,11 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl

int outw = (w - kernel_extent_w) / stride_w + 1;
int outh = (h - kernel_extent_h) / stride_h + 1;
int out_elempack = (support_packing && opt.use_packing_layout && num_output % 4 == 0) ? 4 : 1;
int out_elempack = 1;
if (opt.use_packing_layout)
{
out_elempack = opt.use_fp16_arithmetic && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
}
size_t out_elemsize = elemsize / elempack * out_elempack;

top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
@@ -804,6 +821,68 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl
// depth-wise
if (channels * elempack == group && group == num_output)
{
if (elempack == 8)
{
{
const int maxk = kernel_w * kernel_h;

// kernel offsets
std::vector<int> _space_ofs(maxk);
int* space_ofs = &_space_ofs[0];
{
int p1 = 0;
int p2 = 0;
int gap = w * dilation_h - kernel_w * dilation_w;
for (int i = 0; i < kernel_h; i++)
{
for (int j = 0; j < kernel_w; j++)
{
space_ofs[p1] = p2;
p1++;
p2 += dilation_w;
}
p2 += gap;
}
}

#pragma omp parallel for num_threads(opt.num_threads)
for (int g = 0; g < channels; g++)
{
__fp16* outptr = top_blob.channel(g);
const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * 8;
const Mat m = bottom_blob_bordered.channel(g);

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)
{
float16x8_t _sum = vdupq_n_f16((__fp16)0.f);

if (bias_term)
{
_sum = vld1q_f16(((const __fp16*)bias_data_fp16) + g * 8);
}

const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * 8;

for (int k = 0; k < maxk; k++)
{
float16x8_t _val = vld1q_f16(sptr + space_ofs[k] * 8);
float16x8_t _w = vld1q_f16(kptr + k * 8);
_sum = vfmaq_f16(_sum, _val, _w);
}

_sum = activation_ps(_sum, activation_type, activation_params);

vst1q_f16(outptr + j * 8, _sum);
}

outptr += outw * 8;
}
}
}
}

if (elempack == 4)
{
{
@@ -832,7 +911,7 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl
for (int g = 0; g < channels; g++)
{
__fp16* outptr = top_blob.channel(g);
const __fp16* kptr = (const __fp16*)weight_data_pack4_fp16 + maxk * g * 4;
const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * 4;
const Mat m = bottom_blob_bordered.channel(g);

for (int i = 0; i < outh; i++)
@@ -864,8 +943,6 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl
}
}
}

return 0;
}

if (elempack == 1)
@@ -960,22 +1037,27 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl
const int channels_g = channels * elempack / group;
const int num_output_g = num_output / group;

int g_elempack = (support_packing && opt.use_packing_layout && channels_g % 4 == 0) ? 4 : 1;
int out_g_elempack = (support_packing && opt.use_packing_layout && num_output_g % 4 == 0) ? 4 : 1;
int g_elempack = 1;
int out_g_elempack = 1;
if (opt.use_packing_layout)
{
g_elempack = opt.use_fp16_arithmetic && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
out_g_elempack = opt.use_fp16_arithmetic && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;
}

// unpacking
Mat bottom_blob_bordered_unpacked = bottom_blob_bordered;
if (elempack == 4 && g_elempack == 1)
if (elempack > g_elempack)
{
Option opt_p = opt;
opt_p.blob_allocator = opt.workspace_allocator;
convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, 1, opt_p);
convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p);
}

Mat top_blob_unpacked = top_blob;
if (out_g_elempack == 1 && out_elempack == 4)
if (out_g_elempack < out_elempack)
{
top_blob_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator);
top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator);
if (top_blob_unpacked.empty())
return -100;
}
@@ -995,9 +1077,9 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl
}

// packing
if (out_g_elempack == 1 && out_elempack == 4)
if (out_g_elempack < out_elempack)
{
convert_packing(top_blob_unpacked, top_blob, 4, opt);
convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
}
else
{


+ 0
- 1
src/layer/arm/convolutiondepthwise_arm.h View File

@@ -48,7 +48,6 @@ public:

// fp16
Mat weight_data_fp16;
Mat weight_data_pack4_fp16;
Mat bias_data_fp16;

// bf16


Loading…
Cancel
Save