Browse Source

pooling pack4 arm neon

tags/20190908
nihuini 6 years ago
parent
commit
640a0372d5
1 changed files with 276 additions and 5 deletions
  1. +276
    -5
      src/layer/arm/pooling_arm.cpp

+ 276
- 5
src/layer/arm/pooling_arm.cpp View File

@@ -31,6 +31,282 @@ int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op
// max value in NxN window
// avg value in NxN window

int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;

if (opt.use_packing_layout)
{

// fprintf(stderr, "Pooling input %d x %d pad = %d %d %d %d ksize=%d %d stride=%d %d\n", w, h, pad_left, pad_right, pad_top, pad_bottom, kernel_w, kernel_h, stride_w, stride_h);

if (elempack == 4)
{
if (global_pooling)
{
top_blob.create(channels, elemsize, elempack, opt.blob_allocator);
if (top_blob.empty())
return -100;

int size = w * h;

if (pooling_type == PoolMethod_MAX)
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);

float32x4_t _max = vld1q_f32(ptr);
for (int i=0; i<size; i++)
{
float32x4_t _val = vld1q_f32(ptr);
_max = vmaxq_f32(_max, _val);
ptr += 4;
}

float* outptr = top_blob;
vst1q_f32(outptr + q * 4, _max);
}
}
else if (pooling_type == PoolMethod_AVE)
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);

float32x4_t _sum = vdupq_n_f32(0.f);
for (int i=0; i<size; i++)
{
float32x4_t _val = vld1q_f32(ptr);
_sum = vaddq_f32(_sum, _val);
ptr += 4;
}

float32x4_t _inv_size = vdupq_n_f32(1.f / size);
float32x4_t _avg = vmulq_f32(_sum, _inv_size);

float* outptr = top_blob;
vst1q_f32(outptr + q * 4, _avg);
}
}

return 0;
}

Mat bottom_blob_bordered = bottom_blob;

float pad_value = 0.f;
if (pooling_type == PoolMethod_MAX)
{
pad_value = -FLT_MAX;
}
else if (pooling_type == PoolMethod_AVE)
{
pad_value = 0.f;
}

int wtailpad = 0;
int htailpad = 0;

if (pad_mode == 0) // full padding
{
int wtail = (w + pad_left + pad_right - kernel_w) % stride_w;
int htail = (h + pad_top + pad_bottom - kernel_h) % stride_h;

if (wtail != 0)
wtailpad = stride_w - wtail;
if (htail != 0)
htailpad = stride_h - htail;

copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom + htailpad, pad_left, pad_right + wtailpad, BORDER_CONSTANT, pad_value, opt.workspace_allocator, opt.num_threads);
if (bottom_blob_bordered.empty())
return -100;

w = bottom_blob_bordered.w;
h = bottom_blob_bordered.h;
}
else if (pad_mode == 1) // valid padding
{
copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom, pad_left, pad_right, BORDER_CONSTANT, pad_value, opt.workspace_allocator, opt.num_threads);
if (bottom_blob_bordered.empty())
return -100;

w = bottom_blob_bordered.w;
h = bottom_blob_bordered.h;
}
else if (pad_mode == 2) // tensorflow padding=SAME
{
int wpad = kernel_w + (w - 1) / stride_w * stride_w - w;
int hpad = kernel_h + (h - 1) / stride_h * stride_h - h;
if (wpad > 0 || hpad > 0)
{
copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, pad_value, opt.workspace_allocator, opt.num_threads);
if (bottom_blob_bordered.empty())
return -100;
}

w = bottom_blob_bordered.w;
h = bottom_blob_bordered.h;
}

int outw = (w - kernel_w) / stride_w + 1;
int outh = (h - kernel_h) / stride_h + 1;

top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
if (top_blob.empty())
return -100;

const int maxk = kernel_w * kernel_h;

// kernel offsets
std::vector<int> _space_ofs(maxk);
int* space_ofs = &_space_ofs[0];
{
int p1 = 0;
int p2 = 0;
int gap = w - kernel_w;
for (int i = 0; i < kernel_h; i++)
{
for (int j = 0; j < kernel_w; j++)
{
space_ofs[p1] = p2;
p1++;
p2++;
}
p2 += gap;
}
}

if (pooling_type == PoolMethod_MAX)
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const Mat m = bottom_blob_bordered.channel(q);
float* outptr = top_blob.channel(q);

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)
{
const float* sptr = m.row(i*stride_h) + j*stride_w * 4;

float32x4_t _max = vld1q_f32(sptr);

for (int k = 0; k < maxk; k++)
{
float32x4_t _val = vld1q_f32( sptr + space_ofs[k] * 4 );
_max = vmaxq_f32(_max, _val);
}

vst1q_f32(outptr + j * 4, _max);
}

outptr += outw * 4;
}
}
}
else if (pooling_type == PoolMethod_AVE)
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const Mat m = bottom_blob_bordered.channel(q);
float* outptr = top_blob.channel(q);

float32x4_t _inv_maxk = vdupq_n_f32(1.f / maxk);

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)
{
const float* sptr = m.row(i*stride_h) + j*stride_w * 4;

float32x4_t _sum = vdupq_n_f32(0.f);

for (int k = 0; k < maxk; k++)
{
float32x4_t _val = vld1q_f32( sptr + space_ofs[k] * 4 );
_sum = vaddq_f32(_sum, _val);
}

float32x4_t _avg = vmulq_f32(_sum, _inv_maxk);
vst1q_f32(outptr + j * 4, _avg);
}

outptr += outw * 4;
}

// fix pad
if (pad_top != 0)
{
const float scale = (float)kernel_h / (kernel_h - pad_top);
float32x4_t _scale = vdupq_n_f32(scale);

outptr = top_blob.channel(q).row(0);
for (int i = 0; i < outw; i++)
{
float32x4_t _v = vld1q_f32(outptr);
_v = vmulq_f32(_v, _scale);
vst1q_f32(outptr, _v);
outptr += 4;
}
}
if (pad_bottom + htailpad != 0)
{
const float scale = (float)kernel_h / (kernel_h - pad_bottom - htailpad);
float32x4_t _scale = vdupq_n_f32(scale);

outptr = top_blob.channel(q).row(outh - 1);
for (int i = 0; i < outw; i++)
{
float32x4_t _v = vld1q_f32(outptr);
_v = vmulq_f32(_v, _scale);
vst1q_f32(outptr, _v);
outptr += 4;
}
}
if (pad_left != 0)
{
const float scale = (float)kernel_w / (kernel_w - pad_left);
float32x4_t _scale = vdupq_n_f32(scale);

outptr = top_blob.channel(q);
for (int i = 0; i < outh; i++)
{
float32x4_t _v = vld1q_f32(outptr);
_v = vmulq_f32(_v, _scale);
vst1q_f32(outptr, _v);
outptr += outw * 4;
}
}
if (pad_right + wtailpad != 0)
{
const float scale = (float)kernel_w / (kernel_w - pad_right - wtailpad);
float32x4_t _scale = vdupq_n_f32(scale);

outptr = top_blob.channel(q);
outptr += (outw - 1) * 4;
for (int i = 0; i < outh; i++)
{
float32x4_t _v = vld1q_f32(outptr);
_v = vmulq_f32(_v, _scale);
vst1q_f32(outptr, _v);
outptr += outw * 4;
}
}
}
}

return 0;
}

} // opt.use_packing_layout

if (kernel_w != kernel_h || stride_w != stride_h)
{
return Pooling::forward(bottom_blob, top_blob, opt);
@@ -49,11 +325,6 @@ int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op
return Pooling::forward(bottom_blob, top_blob, opt);
}

int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;

Mat bottom_blob_bordered = bottom_blob;

float pad_value = 0.f;


Loading…
Cancel
Save