| @@ -31,6 +31,282 @@ int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op | |||
| // max value in NxN window | |||
| // avg value in NxN window | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int elempack = bottom_blob.elempack; | |||
| if (opt.use_packing_layout) | |||
| { | |||
| // fprintf(stderr, "Pooling input %d x %d pad = %d %d %d %d ksize=%d %d stride=%d %d\n", w, h, pad_left, pad_right, pad_top, pad_bottom, kernel_w, kernel_h, stride_w, stride_h); | |||
| if (elempack == 4) | |||
| { | |||
| if (global_pooling) | |||
| { | |||
| top_blob.create(channels, elemsize, elempack, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| int size = w * h; | |||
| if (pooling_type == PoolMethod_MAX) | |||
| { | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob.channel(q); | |||
| float32x4_t _max = vld1q_f32(ptr); | |||
| for (int i=0; i<size; i++) | |||
| { | |||
| float32x4_t _val = vld1q_f32(ptr); | |||
| _max = vmaxq_f32(_max, _val); | |||
| ptr += 4; | |||
| } | |||
| float* outptr = top_blob; | |||
| vst1q_f32(outptr + q * 4, _max); | |||
| } | |||
| } | |||
| else if (pooling_type == PoolMethod_AVE) | |||
| { | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob.channel(q); | |||
| float32x4_t _sum = vdupq_n_f32(0.f); | |||
| for (int i=0; i<size; i++) | |||
| { | |||
| float32x4_t _val = vld1q_f32(ptr); | |||
| _sum = vaddq_f32(_sum, _val); | |||
| ptr += 4; | |||
| } | |||
| float32x4_t _inv_size = vdupq_n_f32(1.f / size); | |||
| float32x4_t _avg = vmulq_f32(_sum, _inv_size); | |||
| float* outptr = top_blob; | |||
| vst1q_f32(outptr + q * 4, _avg); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| Mat bottom_blob_bordered = bottom_blob; | |||
| float pad_value = 0.f; | |||
| if (pooling_type == PoolMethod_MAX) | |||
| { | |||
| pad_value = -FLT_MAX; | |||
| } | |||
| else if (pooling_type == PoolMethod_AVE) | |||
| { | |||
| pad_value = 0.f; | |||
| } | |||
| int wtailpad = 0; | |||
| int htailpad = 0; | |||
| if (pad_mode == 0) // full padding | |||
| { | |||
| int wtail = (w + pad_left + pad_right - kernel_w) % stride_w; | |||
| int htail = (h + pad_top + pad_bottom - kernel_h) % stride_h; | |||
| if (wtail != 0) | |||
| wtailpad = stride_w - wtail; | |||
| if (htail != 0) | |||
| htailpad = stride_h - htail; | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom + htailpad, pad_left, pad_right + wtailpad, BORDER_CONSTANT, pad_value, opt.workspace_allocator, opt.num_threads); | |||
| if (bottom_blob_bordered.empty()) | |||
| return -100; | |||
| w = bottom_blob_bordered.w; | |||
| h = bottom_blob_bordered.h; | |||
| } | |||
| else if (pad_mode == 1) // valid padding | |||
| { | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom, pad_left, pad_right, BORDER_CONSTANT, pad_value, opt.workspace_allocator, opt.num_threads); | |||
| if (bottom_blob_bordered.empty()) | |||
| return -100; | |||
| w = bottom_blob_bordered.w; | |||
| h = bottom_blob_bordered.h; | |||
| } | |||
| else if (pad_mode == 2) // tensorflow padding=SAME | |||
| { | |||
| int wpad = kernel_w + (w - 1) / stride_w * stride_w - w; | |||
| int hpad = kernel_h + (h - 1) / stride_h * stride_h - h; | |||
| if (wpad > 0 || hpad > 0) | |||
| { | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, pad_value, opt.workspace_allocator, opt.num_threads); | |||
| if (bottom_blob_bordered.empty()) | |||
| return -100; | |||
| } | |||
| w = bottom_blob_bordered.w; | |||
| h = bottom_blob_bordered.h; | |||
| } | |||
| int outw = (w - kernel_w) / stride_w + 1; | |||
| int outh = (h - kernel_h) / stride_h + 1; | |||
| top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| const int maxk = kernel_w * kernel_h; | |||
| // kernel offsets | |||
| std::vector<int> _space_ofs(maxk); | |||
| int* space_ofs = &_space_ofs[0]; | |||
| { | |||
| int p1 = 0; | |||
| int p2 = 0; | |||
| int gap = w - kernel_w; | |||
| for (int i = 0; i < kernel_h; i++) | |||
| { | |||
| for (int j = 0; j < kernel_w; j++) | |||
| { | |||
| space_ofs[p1] = p2; | |||
| p1++; | |||
| p2++; | |||
| } | |||
| p2 += gap; | |||
| } | |||
| } | |||
| if (pooling_type == PoolMethod_MAX) | |||
| { | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const Mat m = bottom_blob_bordered.channel(q); | |||
| float* outptr = top_blob.channel(q); | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| for (int j = 0; j < outw; j++) | |||
| { | |||
| const float* sptr = m.row(i*stride_h) + j*stride_w * 4; | |||
| float32x4_t _max = vld1q_f32(sptr); | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| float32x4_t _val = vld1q_f32( sptr + space_ofs[k] * 4 ); | |||
| _max = vmaxq_f32(_max, _val); | |||
| } | |||
| vst1q_f32(outptr + j * 4, _max); | |||
| } | |||
| outptr += outw * 4; | |||
| } | |||
| } | |||
| } | |||
| else if (pooling_type == PoolMethod_AVE) | |||
| { | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const Mat m = bottom_blob_bordered.channel(q); | |||
| float* outptr = top_blob.channel(q); | |||
| float32x4_t _inv_maxk = vdupq_n_f32(1.f / maxk); | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| for (int j = 0; j < outw; j++) | |||
| { | |||
| const float* sptr = m.row(i*stride_h) + j*stride_w * 4; | |||
| float32x4_t _sum = vdupq_n_f32(0.f); | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| float32x4_t _val = vld1q_f32( sptr + space_ofs[k] * 4 ); | |||
| _sum = vaddq_f32(_sum, _val); | |||
| } | |||
| float32x4_t _avg = vmulq_f32(_sum, _inv_maxk); | |||
| vst1q_f32(outptr + j * 4, _avg); | |||
| } | |||
| outptr += outw * 4; | |||
| } | |||
| // fix pad | |||
| if (pad_top != 0) | |||
| { | |||
| const float scale = (float)kernel_h / (kernel_h - pad_top); | |||
| float32x4_t _scale = vdupq_n_f32(scale); | |||
| outptr = top_blob.channel(q).row(0); | |||
| for (int i = 0; i < outw; i++) | |||
| { | |||
| float32x4_t _v = vld1q_f32(outptr); | |||
| _v = vmulq_f32(_v, _scale); | |||
| vst1q_f32(outptr, _v); | |||
| outptr += 4; | |||
| } | |||
| } | |||
| if (pad_bottom + htailpad != 0) | |||
| { | |||
| const float scale = (float)kernel_h / (kernel_h - pad_bottom - htailpad); | |||
| float32x4_t _scale = vdupq_n_f32(scale); | |||
| outptr = top_blob.channel(q).row(outh - 1); | |||
| for (int i = 0; i < outw; i++) | |||
| { | |||
| float32x4_t _v = vld1q_f32(outptr); | |||
| _v = vmulq_f32(_v, _scale); | |||
| vst1q_f32(outptr, _v); | |||
| outptr += 4; | |||
| } | |||
| } | |||
| if (pad_left != 0) | |||
| { | |||
| const float scale = (float)kernel_w / (kernel_w - pad_left); | |||
| float32x4_t _scale = vdupq_n_f32(scale); | |||
| outptr = top_blob.channel(q); | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| float32x4_t _v = vld1q_f32(outptr); | |||
| _v = vmulq_f32(_v, _scale); | |||
| vst1q_f32(outptr, _v); | |||
| outptr += outw * 4; | |||
| } | |||
| } | |||
| if (pad_right + wtailpad != 0) | |||
| { | |||
| const float scale = (float)kernel_w / (kernel_w - pad_right - wtailpad); | |||
| float32x4_t _scale = vdupq_n_f32(scale); | |||
| outptr = top_blob.channel(q); | |||
| outptr += (outw - 1) * 4; | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| float32x4_t _v = vld1q_f32(outptr); | |||
| _v = vmulq_f32(_v, _scale); | |||
| vst1q_f32(outptr, _v); | |||
| outptr += outw * 4; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| } // opt.use_packing_layout | |||
| if (kernel_w != kernel_h || stride_w != stride_h) | |||
| { | |||
| return Pooling::forward(bottom_blob, top_blob, opt); | |||
| @@ -49,11 +325,6 @@ int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op | |||
| return Pooling::forward(bottom_blob, top_blob, opt); | |||
| } | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| Mat bottom_blob_bordered = bottom_blob; | |||
| float pad_value = 0.f; | |||