| @@ -15,219 +15,247 @@ | |||
| #include "interp_arm.h" | |||
| #include <math.h> | |||
| #if __ARM_NEON | |||
| #include <arm_neon.h> | |||
| #endif // __ARM_NEON | |||
| namespace ncnn { | |||
| #include "interp_bilinear.h" | |||
| #include "interp_bicubic.h" | |||
| #include "interp_bilinear_bf16s.h" | |||
| #include "interp_bicubic_bf16s.h" | |||
| #if __ARM_NEON | |||
| #include "interp_bilinear_pack4.h" | |||
| #include "interp_bicubic_pack4.h" | |||
| #include "interp_bilinear_pack4_bf16s.h" | |||
| #include "interp_bicubic_pack4_bf16s.h" | |||
| #endif | |||
| DEFINE_LAYER_CREATOR(Interp_arm) | |||
| static void linear_coeffs(int w, int outw, int* xofs, float* alpha) | |||
| Interp_arm::Interp_arm() | |||
| { | |||
| double scale = (double)w / outw; | |||
| #if __ARM_NEON | |||
| support_packing = true; | |||
| #endif // __ARM_NEON | |||
| for (int dx = 0; dx < outw; dx++) | |||
| { | |||
| float fx = (float)((dx + 0.5) * scale - 0.5); | |||
| int sx = floor(fx); | |||
| fx -= sx; | |||
| support_bf16_storage = true; | |||
| } | |||
| if (sx < 0) | |||
| { | |||
| sx = 0; | |||
| fx = 0.f; | |||
| } | |||
| if (sx >= w - 1) | |||
| { | |||
| sx = w - 2; | |||
| fx = 1.f; | |||
| } | |||
| int Interp_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| if (opt.use_bf16_storage) | |||
| return forward_bf16s(bottom_blob, top_blob, opt); | |||
| xofs[dx] = sx; | |||
| int h = bottom_blob.h; | |||
| int w = bottom_blob.w; | |||
| int channels = bottom_blob.c; | |||
| int dims = bottom_blob.dims; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int elempack = bottom_blob.elempack; | |||
| alpha[dx*2 ] = 1.f - fx; | |||
| alpha[dx*2 + 1] = fx; | |||
| if (dims == 1) | |||
| { | |||
| return Interp::forward(bottom_blob, top_blob, opt); | |||
| } | |||
| } | |||
| static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) | |||
| { | |||
| int w = dst.w; | |||
| int h = dst.h; | |||
| // loop body | |||
| Mat rowsbuf0(w); | |||
| Mat rowsbuf1(w); | |||
| float* rows0 = rowsbuf0; | |||
| float* rows1 = rowsbuf1; | |||
| int outh = output_height; | |||
| int outw = output_width; | |||
| int prev_sy1 = -2; | |||
| if (outh == 0 || outw == 0) | |||
| { | |||
| outh = h * height_scale; | |||
| outw = w * width_scale; | |||
| } | |||
| for (int dy = 0; dy < h; dy++ ) | |||
| if (outh == h && outw == w) | |||
| { | |||
| int sy = yofs[dy]; | |||
| top_blob = bottom_blob; | |||
| return 0; | |||
| } | |||
| top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| if (sy == prev_sy1) | |||
| { | |||
| // reuse all rows | |||
| } | |||
| else if (sy == prev_sy1 + 1) | |||
| { | |||
| // hresize one row | |||
| float* rows0_old = rows0; | |||
| rows0 = rows1; | |||
| rows1 = rows0_old; | |||
| const float* S1 = src.row(sy+1); | |||
| const float* alphap = alpha; | |||
| float* rows1p = rows1; | |||
| int dx = 0; | |||
| #if __ARM_NEON | |||
| for ( ; dx+1 < w; dx += 2 ) | |||
| if (elempack == 4) | |||
| { | |||
| if (resize_type == 1)// nearest | |||
| { | |||
| const float hs = output_height ? h / (float)output_height : 1.f / height_scale; | |||
| const float ws = output_width ? w / (float)output_width : 1.f / width_scale; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| int sx = xofs[dx]; | |||
| int sxn = xofs[dx+1]; | |||
| const float* S1p = S1 + sx; | |||
| const float* S1np = S1 + sxn; | |||
| const Mat src = bottom_blob.channel(q); | |||
| Mat dst = top_blob.channel(q); | |||
| float32x4_t _a = vld1q_f32(alphap); | |||
| float32x2_t _S1 = vld1_f32(S1p); | |||
| float32x2_t _S1n = vld1_f32(S1np); | |||
| for (int y = 0; y < outh; y++) | |||
| { | |||
| int in_y = std::min((int) (y * hs), (h - 1)); | |||
| float32x4_t _S1S1n = vcombine_f32(_S1, _S1n); | |||
| float32x4_t _ms1 = vmulq_f32(_S1S1n, _a); | |||
| float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1)); | |||
| const float* ptr = src.row(in_y); | |||
| float* outptr = dst.row(y); | |||
| for (int x = 0; x < outw; x++) | |||
| { | |||
| int in_x = std::min((int) (x * ws), (w - 1)); | |||
| vst1_f32(rows1p + dx, _rows1); | |||
| float32x4_t _p = vld1q_f32(ptr + in_x * 4); | |||
| vst1q_f32(outptr, _p); | |||
| alphap += 4; | |||
| outptr += 4; | |||
| } | |||
| } | |||
| } | |||
| #endif // __ARM_NEON | |||
| for ( ; dx < w; dx++ ) | |||
| { | |||
| int sx = xofs[dx]; | |||
| const float* S1p = S1 + sx; | |||
| } | |||
| if (resize_type == 2)// bilinear | |||
| { | |||
| int* buf = new int[outw + outh + outw*2 + outh*2]; | |||
| int* xofs = buf;//new int[outw]; | |||
| int* yofs = buf + outw;//new int[outh]; | |||
| float a0 = alphap[0]; | |||
| float a1 = alphap[1]; | |||
| rows1p[dx] = S1p[0]*a0 + S1p[1]*a1; | |||
| float* alpha = (float*)(buf + outw + outh);//new float[outw * 2]; | |||
| float* beta = (float*)(buf + outw + outh + outw*2);//new float[outh * 2]; | |||
| alphap += 2; | |||
| linear_coeffs(w, outw, xofs, alpha); | |||
| linear_coeffs(h, outh, yofs, beta); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const Mat src = bottom_blob.channel(q); | |||
| Mat dst = top_blob.channel(q); | |||
| resize_bilinear_image_pack4(src, dst, alpha, xofs, beta, yofs); | |||
| } | |||
| delete[] buf; | |||
| } | |||
| else | |||
| if (resize_type == 3)// bicubic | |||
| { | |||
| // hresize two rows | |||
| const float* S0 = src.row(sy); | |||
| const float* S1 = src.row(sy+1); | |||
| const float* alphap = alpha; | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| int dx = 0; | |||
| #if __ARM_NEON | |||
| for ( ; dx+1 < w; dx += 2 ) | |||
| int* buf = new int[outw + outh + outw*4 + outh*4]; | |||
| int* xofs = buf;//new int[outw]; | |||
| int* yofs = buf + outw;//new int[outh]; | |||
| float* alpha = (float*)(buf + outw + outh);//new float[outw * 4]; | |||
| float* beta = (float*)(buf + outw + outh + outw*4);//new float[outh * 4]; | |||
| cubic_coeffs(w, outw, xofs, alpha); | |||
| cubic_coeffs(h, outh, yofs, beta); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| int sx = xofs[dx]; | |||
| int sxn = xofs[dx+1]; | |||
| const float* S0p = S0 + sx; | |||
| const float* S1p = S1 + sx; | |||
| const float* S0np = S0 + sxn; | |||
| const float* S1np = S1 + sxn; | |||
| float32x4_t _a = vld1q_f32(alphap); | |||
| float32x2_t _S0 = vld1_f32(S0p); | |||
| float32x2_t _S1 = vld1_f32(S1p); | |||
| float32x2_t _S0n = vld1_f32(S0np); | |||
| float32x2_t _S1n = vld1_f32(S1np); | |||
| float32x4_t _S0S0n = vcombine_f32(_S0, _S0n); | |||
| float32x4_t _S1S1n = vcombine_f32(_S1, _S1n); | |||
| float32x4_t _ms0 = vmulq_f32(_S0S0n, _a); | |||
| float32x4_t _ms1 = vmulq_f32(_S1S1n, _a); | |||
| float32x2_t _rows0 = vpadd_f32(vget_low_f32(_ms0), vget_high_f32(_ms0)); | |||
| float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1)); | |||
| vst1_f32(rows0p + dx, _rows0); | |||
| vst1_f32(rows1p + dx, _rows1); | |||
| alphap += 4; | |||
| const Mat src = bottom_blob.channel(q); | |||
| Mat dst = top_blob.channel(q); | |||
| resize_bicubic_image_pack4(src, dst, alpha, xofs, beta, yofs); | |||
| } | |||
| delete[] buf; | |||
| } | |||
| return 0; | |||
| } | |||
| #endif // __ARM_NEON | |||
| for ( ; dx < w; dx++ ) | |||
| { | |||
| int sx = xofs[dx]; | |||
| const float* S0p = S0 + sx; | |||
| const float* S1p = S1 + sx; | |||
| float a0 = alphap[0]; | |||
| float a1 = alphap[1]; | |||
| rows0p[dx] = S0p[0]*a0 + S0p[1]*a1; | |||
| rows1p[dx] = S1p[0]*a0 + S1p[1]*a1; | |||
| if (resize_type == 1)// nearest | |||
| { | |||
| const float hs = output_height ? h / (float)output_height : 1.f / height_scale; | |||
| const float ws = output_width ? w / (float)output_width : 1.f / width_scale; | |||
| alphap += 2; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const Mat src = bottom_blob.channel(q); | |||
| Mat dst = top_blob.channel(q); | |||
| for (int y = 0; y < outh; y++) | |||
| { | |||
| int in_y = std::min((int) (y * hs), (h - 1)); | |||
| const float* ptr = src.row(in_y); | |||
| float* outptr = dst.row(y); | |||
| for (int x = 0; x < outw; x++) | |||
| { | |||
| int in_x = std::min((int) (x * ws), (w - 1)); | |||
| *outptr++ = ptr[in_x]; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| prev_sy1 = sy; | |||
| if (resize_type == 2)// bilinear | |||
| { | |||
| int* buf = new int[outw + outh + outw*2 + outh*2]; | |||
| // vresize | |||
| float b0 = beta[0]; | |||
| float b1 = beta[1]; | |||
| int* xofs = buf;//new int[outw]; | |||
| int* yofs = buf + outw;//new int[outh]; | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| float* Dp = dst.row(dy); | |||
| float* alpha = (float*)(buf + outw + outh);//new float[outw * 2]; | |||
| float* beta = (float*)(buf + outw + outh + outw*2);//new float[outh * 2]; | |||
| #if __ARM_NEON | |||
| int nn = w >> 3; | |||
| #else | |||
| int nn = 0; | |||
| #endif | |||
| int remain = w - (nn << 3); | |||
| linear_coeffs(w, outw, xofs, alpha); | |||
| linear_coeffs(h, outh, yofs, beta); | |||
| #if __ARM_NEON | |||
| float32x4_t _b0 = vdupq_n_f32(b0); | |||
| float32x4_t _b1 = vdupq_n_f32(b1); | |||
| for (; nn>0; nn--) | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| float32x4_t _rows0 = vld1q_f32(rows0p); | |||
| float32x4_t _rows1 = vld1q_f32(rows1p); | |||
| const Mat src = bottom_blob.channel(q); | |||
| Mat dst = top_blob.channel(q); | |||
| resize_bilinear_image(src, dst, alpha, xofs, beta, yofs); | |||
| } | |||
| float32x4_t _D = vmulq_f32(_rows0, _b0); | |||
| _D = vmlaq_f32(_D, _rows1, _b1); | |||
| delete[] buf; | |||
| } | |||
| vst1q_f32(Dp, _D); | |||
| if (resize_type == 3)// bicubic | |||
| { | |||
| int* buf = new int[outw + outh + outw*4 + outh*4]; | |||
| float32x4_t _rows0n = vld1q_f32(rows0p+4); | |||
| float32x4_t _rows1n = vld1q_f32(rows1p+4); | |||
| int* xofs = buf;//new int[outw]; | |||
| int* yofs = buf + outw;//new int[outh]; | |||
| float32x4_t _Dn = vmulq_f32(_rows0n, _b0); | |||
| _Dn = vmlaq_f32(_Dn, _rows1n, _b1); | |||
| float* alpha = (float*)(buf + outw + outh);//new float[outw * 4]; | |||
| float* beta = (float*)(buf + outw + outh + outw*4);//new float[outh * 4]; | |||
| vst1q_f32(Dp+4, _Dn); | |||
| cubic_coeffs(w, outw, xofs, alpha); | |||
| cubic_coeffs(h, outh, yofs, beta); | |||
| Dp += 8; | |||
| rows0p += 8; | |||
| rows1p += 8; | |||
| } | |||
| #endif // __ARM_NEON | |||
| for ( ; remain; --remain ) | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| // D[x] = rows0[x]*b0 + rows1[x]*b1; | |||
| *Dp++ = *rows0p++ * b0 + *rows1p++ * b1; | |||
| const Mat src = bottom_blob.channel(q); | |||
| Mat dst = top_blob.channel(q); | |||
| resize_bicubic_image(src, dst, alpha, xofs, beta, yofs); | |||
| } | |||
| beta += 2; | |||
| delete[] buf; | |||
| } | |||
| return 0; | |||
| } | |||
| int Interp_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| int Interp_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| int h = bottom_blob.h; | |||
| int w = bottom_blob.w; | |||
| int channels = bottom_blob.c; | |||
| int dims = bottom_blob.dims; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int elempack = bottom_blob.elempack; | |||
| if (resize_type != 2 || dims == 1) | |||
| if (dims == 1) | |||
| { | |||
| return Interp::forward(bottom_blob, top_blob, opt); | |||
| } | |||
| @@ -247,31 +275,172 @@ int Interp_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt | |||
| return 0; | |||
| } | |||
| top_blob.create(outw, outh, channels, elemsize, opt.blob_allocator); | |||
| top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| int* buf = new int[outw + outh + outw*2 + outh*2]; | |||
| #if __ARM_NEON | |||
| if (elempack == 4) | |||
| { | |||
| if (resize_type == 1)// nearest | |||
| { | |||
| const float hs = output_height ? h / (float)output_height : 1.f / height_scale; | |||
| const float ws = output_width ? w / (float)output_width : 1.f / width_scale; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const Mat src = bottom_blob.channel(q); | |||
| Mat dst = top_blob.channel(q); | |||
| for (int y = 0; y < outh; y++) | |||
| { | |||
| int in_y = std::min((int) (y * hs), (h - 1)); | |||
| const unsigned short* ptr = src.row<const unsigned short>(in_y); | |||
| unsigned short* outptr = dst.row<unsigned short>(y); | |||
| for (int x = 0; x < outw; x++) | |||
| { | |||
| int in_x = std::min((int) (x * ws), (w - 1)); | |||
| uint16x4_t _p = vld1_u16(ptr + in_x * 4); | |||
| vst1_u16(outptr, _p); | |||
| outptr += 4; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| if (resize_type == 2)// bilinear | |||
| { | |||
| int* buf = new int[outw + outh + outw*2 + outh*2]; | |||
| int* xofs = buf;//new int[outw]; | |||
| int* yofs = buf + outw;//new int[outh]; | |||
| float* alpha = (float*)(buf + outw + outh);//new float[outw * 2]; | |||
| float* beta = (float*)(buf + outw + outh + outw*2);//new float[outh * 2]; | |||
| linear_coeffs(w, outw, xofs, alpha); | |||
| linear_coeffs(h, outh, yofs, beta); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const Mat src = bottom_blob.channel(q); | |||
| Mat dst = top_blob.channel(q); | |||
| resize_bilinear_image_pack4_bf16s(src, dst, alpha, xofs, beta, yofs); | |||
| } | |||
| delete[] buf; | |||
| } | |||
| if (resize_type == 3)// bicubic | |||
| { | |||
| int* buf = new int[outw + outh + outw*4 + outh*4]; | |||
| int* xofs = buf;//new int[outw]; | |||
| int* yofs = buf + outw;//new int[outh]; | |||
| float* alpha = (float*)(buf + outw + outh);//new float[outw * 4]; | |||
| float* beta = (float*)(buf + outw + outh + outw*4);//new float[outh * 4]; | |||
| int* xofs = buf;//new int[outw]; | |||
| int* yofs = buf + outw;//new int[outh]; | |||
| cubic_coeffs(w, outw, xofs, alpha); | |||
| cubic_coeffs(h, outh, yofs, beta); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const Mat src = bottom_blob.channel(q); | |||
| Mat dst = top_blob.channel(q); | |||
| float* alpha = (float*)(buf + outw + outh);//new float[outw * 2]; | |||
| float* beta = (float*)(buf + outw + outh + outw*2);//new float[outh * 2]; | |||
| resize_bicubic_image_pack4_bf16s(src, dst, alpha, xofs, beta, yofs); | |||
| } | |||
| linear_coeffs(w, outw, xofs, alpha); | |||
| linear_coeffs(h, outh, yofs, beta); | |||
| delete[] buf; | |||
| } | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| return 0; | |||
| } | |||
| #endif // __ARM_NEON | |||
| if (resize_type == 1)// nearest | |||
| { | |||
| const Mat src = bottom_blob.channel(q); | |||
| Mat dst = top_blob.channel(q); | |||
| const float hs = output_height ? h / (float)output_height : 1.f / height_scale; | |||
| const float ws = output_width ? w / (float)output_width : 1.f / width_scale; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const Mat src = bottom_blob.channel(q); | |||
| Mat dst = top_blob.channel(q); | |||
| resize_bilinear_image(src, dst, alpha, xofs, beta, yofs); | |||
| for (int y = 0; y < outh; y++) | |||
| { | |||
| int in_y = std::min((int) (y * hs), (h - 1)); | |||
| const unsigned short* ptr = src.row<const unsigned short>(in_y); | |||
| unsigned short* outptr = dst.row<unsigned short>(y); | |||
| for (int x = 0; x < outw; x++) | |||
| { | |||
| int in_x = std::min((int) (x * ws), (w - 1)); | |||
| *outptr++ = ptr[in_x]; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| delete[] buf; | |||
| if (resize_type == 2)// bilinear | |||
| { | |||
| int* buf = new int[outw + outh + outw*2 + outh*2]; | |||
| int* xofs = buf;//new int[outw]; | |||
| int* yofs = buf + outw;//new int[outh]; | |||
| float* alpha = (float*)(buf + outw + outh);//new float[outw * 2]; | |||
| float* beta = (float*)(buf + outw + outh + outw*2);//new float[outh * 2]; | |||
| linear_coeffs(w, outw, xofs, alpha); | |||
| linear_coeffs(h, outh, yofs, beta); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const Mat src = bottom_blob.channel(q); | |||
| Mat dst = top_blob.channel(q); | |||
| resize_bilinear_image_bf16s(src, dst, alpha, xofs, beta, yofs); | |||
| } | |||
| delete[] buf; | |||
| } | |||
| if (resize_type == 3)// bicubic | |||
| { | |||
| int* buf = new int[outw + outh + outw*4 + outh*4]; | |||
| int* xofs = buf;//new int[outw]; | |||
| int* yofs = buf + outw;//new int[outh]; | |||
| float* alpha = (float*)(buf + outw + outh);//new float[outw * 4]; | |||
| float* beta = (float*)(buf + outw + outh + outw*4);//new float[outh * 4]; | |||
| cubic_coeffs(w, outw, xofs, alpha); | |||
| cubic_coeffs(h, outh, yofs, beta); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const Mat src = bottom_blob.channel(q); | |||
| Mat dst = top_blob.channel(q); | |||
| resize_bicubic_image_bf16s(src, dst, alpha, xofs, beta, yofs); | |||
| } | |||
| delete[] buf; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -22,7 +22,12 @@ namespace ncnn { | |||
| class Interp_arm : virtual public Interp | |||
| { | |||
| public: | |||
| Interp_arm(); | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| protected: | |||
| int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,252 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static inline void interpolate_cubic(float fx, float* coeffs) | |||
| { | |||
| const float A = -0.75f; | |||
| float fx0 = fx + 1; | |||
| float fx1 = fx; | |||
| float fx2 = 1 - fx; | |||
| // float fx3 = 2 - fx; | |||
| coeffs[0] = A * fx0*fx0*fx0 - 5*A * fx0*fx0 + 8*A * fx0 - 4*A; | |||
| coeffs[1] = (A+2) * fx1*fx1*fx1 - (A+3) * fx1*fx1 + 1; | |||
| coeffs[2] = (A+2) * fx2*fx2*fx2 - (A+3) * fx2*fx2 + 1; | |||
| coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; | |||
| } | |||
| static void cubic_coeffs(int w, int outw, int* xofs, float* alpha) | |||
| { | |||
| double scale = (double)w / outw; | |||
| for (int dx = 0; dx < outw; dx++) | |||
| { | |||
| float fx = (float)((dx + 0.5) * scale - 0.5); | |||
| int sx = static_cast<int>(floor(fx)); | |||
| fx -= sx; | |||
| interpolate_cubic(fx, alpha + dx*4); | |||
| if (sx <= -1) | |||
| { | |||
| sx = 1; | |||
| alpha[dx*4 +0] = 1.f - alpha[dx*4 +3]; | |||
| alpha[dx*4 +1] = alpha[dx*4 +3]; | |||
| alpha[dx*4 +2] = 0.f; | |||
| alpha[dx*4 +3] = 0.f; | |||
| } | |||
| if (sx == 0) | |||
| { | |||
| sx = 1; | |||
| alpha[dx*4 +0] = alpha[dx*4 +0] + alpha[dx*4 +1]; | |||
| alpha[dx*4 +1] = alpha[dx*4 +2]; | |||
| alpha[dx*4 +2] = alpha[dx*4 +3]; | |||
| alpha[dx*4 +3] = 0.f; | |||
| } | |||
| if (sx == w - 2) | |||
| { | |||
| sx = w - 3; | |||
| alpha[dx*4 +3] = alpha[dx*4 +2] + alpha[dx*4 +3]; | |||
| alpha[dx*4 +2] = alpha[dx*4 +1]; | |||
| alpha[dx*4 +1] = alpha[dx*4 +0]; | |||
| alpha[dx*4 +0] = 0.f; | |||
| } | |||
| if (sx >= w - 1) | |||
| { | |||
| sx = w - 3; | |||
| alpha[dx*4 +3] = 1.f - alpha[dx*4 +0]; | |||
| alpha[dx*4 +2] = alpha[dx*4 +0]; | |||
| alpha[dx*4 +1] = 0.f; | |||
| alpha[dx*4 +0] = 0.f; | |||
| } | |||
| xofs[dx] = sx; | |||
| } | |||
| } | |||
| static void resize_bicubic_image(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) | |||
| { | |||
| int w = dst.w; | |||
| int h = dst.h; | |||
| // loop body | |||
| Mat rowsbuf0(w); | |||
| Mat rowsbuf1(w); | |||
| Mat rowsbuf2(w); | |||
| Mat rowsbuf3(w); | |||
| float* rows0 = rowsbuf0; | |||
| float* rows1 = rowsbuf1; | |||
| float* rows2 = rowsbuf2; | |||
| float* rows3 = rowsbuf3; | |||
| int prev_sy1 = -3; | |||
| for (int dy = 0; dy < h; dy++ ) | |||
| { | |||
| int sy = yofs[dy]; | |||
| if (sy == prev_sy1) | |||
| { | |||
| // reuse all rows | |||
| } | |||
| else if (sy == prev_sy1 + 1) | |||
| { | |||
| // hresize one row | |||
| float* rows0_old = rows0; | |||
| rows0 = rows1; | |||
| rows1 = rows2; | |||
| rows2 = rows3; | |||
| rows3 = rows0_old; | |||
| const float* S3 = src.row(sy+2); | |||
| const float* alphap = alpha; | |||
| float* rows3p = rows3; | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx]; | |||
| const float* S3p = S3 + sx; | |||
| float a0 = alphap[0]; | |||
| float a1 = alphap[1]; | |||
| float a2 = alphap[2]; | |||
| float a3 = alphap[3]; | |||
| rows3p[dx] = S3p[-1]*a0 + S3p[0]*a1 + S3p[1]*a2 + S3p[2]*a3; | |||
| alphap += 4; | |||
| } | |||
| } | |||
| else if (sy == prev_sy1 + 2) | |||
| { | |||
| // hresize two rows | |||
| float* rows0_old = rows0; | |||
| float* rows1_old = rows1; | |||
| rows0 = rows2; | |||
| rows1 = rows3; | |||
| rows2 = rows0_old; | |||
| rows3 = rows1_old; | |||
| const float* S2 = src.row(sy+1); | |||
| const float* S3 = src.row(sy+2); | |||
| const float* alphap = alpha; | |||
| float* rows2p = rows2; | |||
| float* rows3p = rows3; | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx]; | |||
| const float* S2p = S2 + sx; | |||
| const float* S3p = S3 + sx; | |||
| float a0 = alphap[0]; | |||
| float a1 = alphap[1]; | |||
| float a2 = alphap[2]; | |||
| float a3 = alphap[3]; | |||
| rows2p[dx] = S2p[-1]*a0 + S2p[0]*a1 + S2p[1]*a2 + S2p[2]*a3; | |||
| rows3p[dx] = S3p[-1]*a0 + S3p[0]*a1 + S3p[1]*a2 + S3p[2]*a3; | |||
| alphap += 4; | |||
| } | |||
| } | |||
| else if (sy == prev_sy1 + 3) | |||
| { | |||
| // hresize three rows | |||
| float* rows0_old = rows0; | |||
| float* rows1_old = rows1; | |||
| float* rows2_old = rows2; | |||
| rows0 = rows3; | |||
| rows1 = rows0_old; | |||
| rows2 = rows1_old; | |||
| rows3 = rows2_old; | |||
| const float* S1 = src.row(sy); | |||
| const float* S2 = src.row(sy+1); | |||
| const float* S3 = src.row(sy+2); | |||
| const float* alphap = alpha; | |||
| float* rows1p = rows1; | |||
| float* rows2p = rows2; | |||
| float* rows3p = rows3; | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx]; | |||
| const float* S1p = S1 + sx; | |||
| const float* S2p = S2 + sx; | |||
| const float* S3p = S3 + sx; | |||
| float a0 = alphap[0]; | |||
| float a1 = alphap[1]; | |||
| float a2 = alphap[2]; | |||
| float a3 = alphap[3]; | |||
| rows1p[dx] = S1p[-1]*a0 + S1p[0]*a1 + S1p[1]*a2 + S1p[2]*a3; | |||
| rows2p[dx] = S2p[-1]*a0 + S2p[0]*a1 + S2p[1]*a2 + S2p[2]*a3; | |||
| rows3p[dx] = S3p[-1]*a0 + S3p[0]*a1 + S3p[1]*a2 + S3p[2]*a3; | |||
| alphap += 4; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| // hresize four rows | |||
| const float* S0 = src.row(sy-1); | |||
| const float* S1 = src.row(sy); | |||
| const float* S2 = src.row(sy+1); | |||
| const float* S3 = src.row(sy+2); | |||
| const float* alphap = alpha; | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| float* rows2p = rows2; | |||
| float* rows3p = rows3; | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx]; | |||
| const float* S0p = S0 + sx; | |||
| const float* S1p = S1 + sx; | |||
| const float* S2p = S2 + sx; | |||
| const float* S3p = S3 + sx; | |||
| float a0 = alphap[0]; | |||
| float a1 = alphap[1]; | |||
| float a2 = alphap[2]; | |||
| float a3 = alphap[3]; | |||
| rows0p[dx] = S0p[-1]*a0 + S0p[0]*a1 + S0p[1]*a2 + S0p[2]*a3; | |||
| rows1p[dx] = S1p[-1]*a0 + S1p[0]*a1 + S1p[1]*a2 + S1p[2]*a3; | |||
| rows2p[dx] = S2p[-1]*a0 + S2p[0]*a1 + S2p[1]*a2 + S2p[2]*a3; | |||
| rows3p[dx] = S3p[-1]*a0 + S3p[0]*a1 + S3p[1]*a2 + S3p[2]*a3; | |||
| alphap += 4; | |||
| } | |||
| } | |||
| prev_sy1 = sy; | |||
| // vresize | |||
| float b0 = beta[0]; | |||
| float b1 = beta[1]; | |||
| float b2 = beta[2]; | |||
| float b3 = beta[3]; | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| float* rows2p = rows2; | |||
| float* rows3p = rows3; | |||
| float* Dp = dst.row(dy); | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| // D[x] = rows0[x]*b0 + rows1[x]*b1 + rows2[x]*b2 + rows3[x]*b3; | |||
| *Dp++ = *rows0p++ * b0 + *rows1p++ * b1 + *rows2p++ * b2 + *rows3p++ * b3; | |||
| } | |||
| beta += 4; | |||
| } | |||
| } | |||
| @@ -0,0 +1,188 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void resize_bicubic_image_bf16s(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) | |||
| { | |||
| int w = dst.w; | |||
| int h = dst.h; | |||
| // loop body | |||
| Mat rowsbuf0(w); | |||
| Mat rowsbuf1(w); | |||
| Mat rowsbuf2(w); | |||
| Mat rowsbuf3(w); | |||
| float* rows0 = rowsbuf0; | |||
| float* rows1 = rowsbuf1; | |||
| float* rows2 = rowsbuf2; | |||
| float* rows3 = rowsbuf3; | |||
| int prev_sy1 = -3; | |||
| for (int dy = 0; dy < h; dy++ ) | |||
| { | |||
| int sy = yofs[dy]; | |||
| if (sy == prev_sy1) | |||
| { | |||
| // reuse all rows | |||
| } | |||
| else if (sy == prev_sy1 + 1) | |||
| { | |||
| // hresize one row | |||
| float* rows0_old = rows0; | |||
| rows0 = rows1; | |||
| rows1 = rows2; | |||
| rows2 = rows3; | |||
| rows3 = rows0_old; | |||
| const unsigned short* S3 = src.row<const unsigned short>(sy+2); | |||
| const float* alphap = alpha; | |||
| float* rows3p = rows3; | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx]; | |||
| const unsigned short* S3p = S3 + sx; | |||
| float a0 = alphap[0]; | |||
| float a1 = alphap[1]; | |||
| float a2 = alphap[2]; | |||
| float a3 = alphap[3]; | |||
| rows3p[dx] = bfloat16_to_float32(S3p[-1])*a0 + bfloat16_to_float32(S3p[0])*a1 + bfloat16_to_float32(S3p[1])*a2 + bfloat16_to_float32(S3p[2])*a3; | |||
| alphap += 4; | |||
| } | |||
| } | |||
| else if (sy == prev_sy1 + 2) | |||
| { | |||
| // hresize two rows | |||
| float* rows0_old = rows0; | |||
| float* rows1_old = rows1; | |||
| rows0 = rows2; | |||
| rows1 = rows3; | |||
| rows2 = rows0_old; | |||
| rows3 = rows1_old; | |||
| const unsigned short* S2 = src.row<const unsigned short>(sy+1); | |||
| const unsigned short* S3 = src.row<const unsigned short>(sy+2); | |||
| const float* alphap = alpha; | |||
| float* rows2p = rows2; | |||
| float* rows3p = rows3; | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx]; | |||
| const unsigned short* S2p = S2 + sx; | |||
| const unsigned short* S3p = S3 + sx; | |||
| float a0 = alphap[0]; | |||
| float a1 = alphap[1]; | |||
| float a2 = alphap[2]; | |||
| float a3 = alphap[3]; | |||
| rows2p[dx] = bfloat16_to_float32(S2p[-1])*a0 + bfloat16_to_float32(S2p[0])*a1 + bfloat16_to_float32(S2p[1])*a2 + bfloat16_to_float32(S2p[2])*a3; | |||
| rows3p[dx] = bfloat16_to_float32(S3p[-1])*a0 + bfloat16_to_float32(S3p[0])*a1 + bfloat16_to_float32(S3p[1])*a2 + bfloat16_to_float32(S3p[2])*a3; | |||
| alphap += 4; | |||
| } | |||
| } | |||
| else if (sy == prev_sy1 + 3) | |||
| { | |||
| // hresize three rows | |||
| float* rows0_old = rows0; | |||
| float* rows1_old = rows1; | |||
| float* rows2_old = rows2; | |||
| rows0 = rows3; | |||
| rows1 = rows0_old; | |||
| rows2 = rows1_old; | |||
| rows3 = rows2_old; | |||
| const unsigned short* S1 = src.row<const unsigned short>(sy); | |||
| const unsigned short* S2 = src.row<const unsigned short>(sy+1); | |||
| const unsigned short* S3 = src.row<const unsigned short>(sy+2); | |||
| const float* alphap = alpha; | |||
| float* rows1p = rows1; | |||
| float* rows2p = rows2; | |||
| float* rows3p = rows3; | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx]; | |||
| const unsigned short* S1p = S1 + sx; | |||
| const unsigned short* S2p = S2 + sx; | |||
| const unsigned short* S3p = S3 + sx; | |||
| float a0 = alphap[0]; | |||
| float a1 = alphap[1]; | |||
| float a2 = alphap[2]; | |||
| float a3 = alphap[3]; | |||
| rows1p[dx] = bfloat16_to_float32(S1p[-1])*a0 + bfloat16_to_float32(S1p[0])*a1 + bfloat16_to_float32(S1p[1])*a2 + bfloat16_to_float32(S1p[2])*a3; | |||
| rows2p[dx] = bfloat16_to_float32(S2p[-1])*a0 + bfloat16_to_float32(S2p[0])*a1 + bfloat16_to_float32(S2p[1])*a2 + bfloat16_to_float32(S2p[2])*a3; | |||
| rows3p[dx] = bfloat16_to_float32(S3p[-1])*a0 + bfloat16_to_float32(S3p[0])*a1 + bfloat16_to_float32(S3p[1])*a2 + bfloat16_to_float32(S3p[2])*a3; | |||
| alphap += 4; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| // hresize four rows | |||
| const unsigned short* S0 = src.row<const unsigned short>(sy-1); | |||
| const unsigned short* S1 = src.row<const unsigned short>(sy); | |||
| const unsigned short* S2 = src.row<const unsigned short>(sy+1); | |||
| const unsigned short* S3 = src.row<const unsigned short>(sy+2); | |||
| const float* alphap = alpha; | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| float* rows2p = rows2; | |||
| float* rows3p = rows3; | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx]; | |||
| const unsigned short* S0p = S0 + sx; | |||
| const unsigned short* S1p = S1 + sx; | |||
| const unsigned short* S2p = S2 + sx; | |||
| const unsigned short* S3p = S3 + sx; | |||
| float a0 = alphap[0]; | |||
| float a1 = alphap[1]; | |||
| float a2 = alphap[2]; | |||
| float a3 = alphap[3]; | |||
| rows0p[dx] = bfloat16_to_float32(S0p[-1])*a0 + bfloat16_to_float32(S0p[0])*a1 + bfloat16_to_float32(S0p[1])*a2 + bfloat16_to_float32(S0p[2])*a3; | |||
| rows1p[dx] = bfloat16_to_float32(S1p[-1])*a0 + bfloat16_to_float32(S1p[0])*a1 + bfloat16_to_float32(S1p[1])*a2 + bfloat16_to_float32(S1p[2])*a3; | |||
| rows2p[dx] = bfloat16_to_float32(S2p[-1])*a0 + bfloat16_to_float32(S2p[0])*a1 + bfloat16_to_float32(S2p[1])*a2 + bfloat16_to_float32(S2p[2])*a3; | |||
| rows3p[dx] = bfloat16_to_float32(S3p[-1])*a0 + bfloat16_to_float32(S3p[0])*a1 + bfloat16_to_float32(S3p[1])*a2 + bfloat16_to_float32(S3p[2])*a3; | |||
| alphap += 4; | |||
| } | |||
| } | |||
| prev_sy1 = sy; | |||
| // vresize | |||
| float b0 = beta[0]; | |||
| float b1 = beta[1]; | |||
| float b2 = beta[2]; | |||
| float b3 = beta[3]; | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| float* rows2p = rows2; | |||
| float* rows3p = rows3; | |||
| unsigned short* Dp = dst.row<unsigned short>(dy); | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| // D[x] = rows0[x]*b0 + rows1[x]*b1 + rows2[x]*b2 + rows3[x]*b3; | |||
| *Dp++ = float32_to_bfloat16(*rows0p++ * b0 + *rows1p++ * b1 + *rows2p++ * b2 + *rows3p++ * b3); | |||
| } | |||
| beta += 4; | |||
| } | |||
| } | |||
| @@ -0,0 +1,272 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void resize_bicubic_image_pack4(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) | |||
| { | |||
| int w = dst.w; | |||
| int h = dst.h; | |||
| // loop body | |||
| Mat rowsbuf0(w, (size_t)4 * 4u, 4); | |||
| Mat rowsbuf1(w, (size_t)4 * 4u, 4); | |||
| Mat rowsbuf2(w, (size_t)4 * 4u, 4); | |||
| Mat rowsbuf3(w, (size_t)4 * 4u, 4); | |||
| float* rows0 = rowsbuf0; | |||
| float* rows1 = rowsbuf1; | |||
| float* rows2 = rowsbuf2; | |||
| float* rows3 = rowsbuf3; | |||
| int prev_sy1 = -3; | |||
| for (int dy = 0; dy < h; dy++ ) | |||
| { | |||
| int sy = yofs[dy]; | |||
| if (sy == prev_sy1) | |||
| { | |||
| // reuse all rows | |||
| } | |||
| else if (sy == prev_sy1 + 1) | |||
| { | |||
| // hresize one row | |||
| float* rows0_old = rows0; | |||
| rows0 = rows1; | |||
| rows1 = rows2; | |||
| rows2 = rows3; | |||
| rows3 = rows0_old; | |||
| const float* S3 = src.row(sy+2); | |||
| const float* alphap = alpha; | |||
| float* rows3p = rows3; | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx] * 4; | |||
| const float* S3p = S3 + sx; | |||
| float32x4_t _a0123 = vld1q_f32(alphap); | |||
| float32x4_t _S30 = vld1q_f32(S3p - 4); | |||
| float32x4_t _S31 = vld1q_f32(S3p + 0); | |||
| float32x4_t _S32 = vld1q_f32(S3p + 4); | |||
| float32x4_t _S33 = vld1q_f32(S3p + 8); | |||
| float32x4_t _rows3 = vmulq_lane_f32(_S30, vget_low_f32(_a0123), 0); | |||
| _rows3 = vmlaq_lane_f32(_rows3, _S31, vget_low_f32(_a0123), 1); | |||
| _rows3 = vmlaq_lane_f32(_rows3, _S32, vget_high_f32(_a0123), 0); | |||
| _rows3 = vmlaq_lane_f32(_rows3, _S33, vget_high_f32(_a0123), 1); | |||
| vst1q_f32(rows3p + dx * 4, _rows3); | |||
| alphap += 4; | |||
| } | |||
| } | |||
| else if (sy == prev_sy1 + 2) | |||
| { | |||
| // hresize two rows | |||
| float* rows0_old = rows0; | |||
| float* rows1_old = rows1; | |||
| rows0 = rows2; | |||
| rows1 = rows3; | |||
| rows2 = rows0_old; | |||
| rows3 = rows1_old; | |||
| const float* S2 = src.row(sy+1); | |||
| const float* S3 = src.row(sy+2); | |||
| const float* alphap = alpha; | |||
| float* rows2p = rows2; | |||
| float* rows3p = rows3; | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx] * 4; | |||
| const float* S2p = S2 + sx; | |||
| const float* S3p = S3 + sx; | |||
| float32x4_t _a0123 = vld1q_f32(alphap); | |||
| float32x4_t _S20 = vld1q_f32(S2p - 4); | |||
| float32x4_t _S21 = vld1q_f32(S2p + 0); | |||
| float32x4_t _S22 = vld1q_f32(S2p + 4); | |||
| float32x4_t _S23 = vld1q_f32(S2p + 8); | |||
| float32x4_t _S30 = vld1q_f32(S3p - 4); | |||
| float32x4_t _S31 = vld1q_f32(S3p + 0); | |||
| float32x4_t _S32 = vld1q_f32(S3p + 4); | |||
| float32x4_t _S33 = vld1q_f32(S3p + 8); | |||
| float32x4_t _rows2 = vmulq_lane_f32(_S20, vget_low_f32(_a0123), 0); | |||
| float32x4_t _rows3 = vmulq_lane_f32(_S30, vget_low_f32(_a0123), 0); | |||
| _rows2 = vmlaq_lane_f32(_rows2, _S21, vget_low_f32(_a0123), 1); | |||
| _rows3 = vmlaq_lane_f32(_rows3, _S31, vget_low_f32(_a0123), 1); | |||
| _rows2 = vmlaq_lane_f32(_rows2, _S22, vget_high_f32(_a0123), 0); | |||
| _rows3 = vmlaq_lane_f32(_rows3, _S32, vget_high_f32(_a0123), 0); | |||
| _rows2 = vmlaq_lane_f32(_rows2, _S23, vget_high_f32(_a0123), 1); | |||
| _rows3 = vmlaq_lane_f32(_rows3, _S33, vget_high_f32(_a0123), 1); | |||
| vst1q_f32(rows2p + dx * 4, _rows2); | |||
| vst1q_f32(rows3p + dx * 4, _rows3); | |||
| alphap += 4; | |||
| } | |||
| } | |||
| else if (sy == prev_sy1 + 3) | |||
| { | |||
| // hresize three rows | |||
| float* rows0_old = rows0; | |||
| float* rows1_old = rows1; | |||
| float* rows2_old = rows2; | |||
| rows0 = rows3; | |||
| rows1 = rows0_old; | |||
| rows2 = rows1_old; | |||
| rows3 = rows2_old; | |||
| const float* S1 = src.row(sy); | |||
| const float* S2 = src.row(sy+1); | |||
| const float* S3 = src.row(sy+2); | |||
| const float* alphap = alpha; | |||
| float* rows1p = rows1; | |||
| float* rows2p = rows2; | |||
| float* rows3p = rows3; | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx] * 4; | |||
| const float* S1p = S1 + sx; | |||
| const float* S2p = S2 + sx; | |||
| const float* S3p = S3 + sx; | |||
| float32x4_t _a0123 = vld1q_f32(alphap); | |||
| float32x4_t _S10 = vld1q_f32(S1p - 4); | |||
| float32x4_t _S11 = vld1q_f32(S1p + 0); | |||
| float32x4_t _S12 = vld1q_f32(S1p + 4); | |||
| float32x4_t _S13 = vld1q_f32(S1p + 8); | |||
| float32x4_t _S20 = vld1q_f32(S2p - 4); | |||
| float32x4_t _S21 = vld1q_f32(S2p + 0); | |||
| float32x4_t _S22 = vld1q_f32(S2p + 4); | |||
| float32x4_t _S23 = vld1q_f32(S2p + 8); | |||
| float32x4_t _S30 = vld1q_f32(S3p - 4); | |||
| float32x4_t _S31 = vld1q_f32(S3p + 0); | |||
| float32x4_t _S32 = vld1q_f32(S3p + 4); | |||
| float32x4_t _S33 = vld1q_f32(S3p + 8); | |||
| float32x4_t _rows1 = vmulq_lane_f32(_S10, vget_low_f32(_a0123), 0); | |||
| float32x4_t _rows2 = vmulq_lane_f32(_S20, vget_low_f32(_a0123), 0); | |||
| float32x4_t _rows3 = vmulq_lane_f32(_S30, vget_low_f32(_a0123), 0); | |||
| _rows1 = vmlaq_lane_f32(_rows1, _S11, vget_low_f32(_a0123), 1); | |||
| _rows2 = vmlaq_lane_f32(_rows2, _S21, vget_low_f32(_a0123), 1); | |||
| _rows3 = vmlaq_lane_f32(_rows3, _S31, vget_low_f32(_a0123), 1); | |||
| _rows1 = vmlaq_lane_f32(_rows1, _S12, vget_high_f32(_a0123), 0); | |||
| _rows2 = vmlaq_lane_f32(_rows2, _S22, vget_high_f32(_a0123), 0); | |||
| _rows3 = vmlaq_lane_f32(_rows3, _S32, vget_high_f32(_a0123), 0); | |||
| _rows1 = vmlaq_lane_f32(_rows1, _S13, vget_high_f32(_a0123), 1); | |||
| _rows2 = vmlaq_lane_f32(_rows2, _S23, vget_high_f32(_a0123), 1); | |||
| _rows3 = vmlaq_lane_f32(_rows3, _S33, vget_high_f32(_a0123), 1); | |||
| vst1q_f32(rows1p + dx * 4, _rows1); | |||
| vst1q_f32(rows2p + dx * 4, _rows2); | |||
| vst1q_f32(rows3p + dx * 4, _rows3); | |||
| alphap += 4; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| // hresize four rows | |||
| const float* S0 = src.row(sy-1); | |||
| const float* S1 = src.row(sy); | |||
| const float* S2 = src.row(sy+1); | |||
| const float* S3 = src.row(sy+2); | |||
| const float* alphap = alpha; | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| float* rows2p = rows2; | |||
| float* rows3p = rows3; | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx] * 4; | |||
| const float* S0p = S0 + sx; | |||
| const float* S1p = S1 + sx; | |||
| const float* S2p = S2 + sx; | |||
| const float* S3p = S3 + sx; | |||
| float32x4_t _a0123 = vld1q_f32(alphap); | |||
| // TODO check the generated assembly on armv7 | |||
| float32x4_t _S00 = vld1q_f32(S0p - 4); | |||
| float32x4_t _S01 = vld1q_f32(S0p + 0); | |||
| float32x4_t _S02 = vld1q_f32(S0p + 4); | |||
| float32x4_t _S03 = vld1q_f32(S0p + 8); | |||
| float32x4_t _S10 = vld1q_f32(S1p - 4); | |||
| float32x4_t _S11 = vld1q_f32(S1p + 0); | |||
| float32x4_t _S12 = vld1q_f32(S1p + 4); | |||
| float32x4_t _S13 = vld1q_f32(S1p + 8); | |||
| float32x4_t _S20 = vld1q_f32(S2p - 4); | |||
| float32x4_t _S21 = vld1q_f32(S2p + 0); | |||
| float32x4_t _S22 = vld1q_f32(S2p + 4); | |||
| float32x4_t _S23 = vld1q_f32(S2p + 8); | |||
| float32x4_t _S30 = vld1q_f32(S3p - 4); | |||
| float32x4_t _S31 = vld1q_f32(S3p + 0); | |||
| float32x4_t _S32 = vld1q_f32(S3p + 4); | |||
| float32x4_t _S33 = vld1q_f32(S3p + 8); | |||
| float32x4_t _rows0 = vmulq_lane_f32(_S00, vget_low_f32(_a0123), 0); | |||
| float32x4_t _rows1 = vmulq_lane_f32(_S10, vget_low_f32(_a0123), 0); | |||
| float32x4_t _rows2 = vmulq_lane_f32(_S20, vget_low_f32(_a0123), 0); | |||
| float32x4_t _rows3 = vmulq_lane_f32(_S30, vget_low_f32(_a0123), 0); | |||
| _rows0 = vmlaq_lane_f32(_rows0, _S01, vget_low_f32(_a0123), 1); | |||
| _rows1 = vmlaq_lane_f32(_rows1, _S11, vget_low_f32(_a0123), 1); | |||
| _rows2 = vmlaq_lane_f32(_rows2, _S21, vget_low_f32(_a0123), 1); | |||
| _rows3 = vmlaq_lane_f32(_rows3, _S31, vget_low_f32(_a0123), 1); | |||
| _rows0 = vmlaq_lane_f32(_rows0, _S02, vget_high_f32(_a0123), 0); | |||
| _rows1 = vmlaq_lane_f32(_rows1, _S12, vget_high_f32(_a0123), 0); | |||
| _rows2 = vmlaq_lane_f32(_rows2, _S22, vget_high_f32(_a0123), 0); | |||
| _rows3 = vmlaq_lane_f32(_rows3, _S32, vget_high_f32(_a0123), 0); | |||
| _rows0 = vmlaq_lane_f32(_rows0, _S03, vget_high_f32(_a0123), 1); | |||
| _rows1 = vmlaq_lane_f32(_rows1, _S13, vget_high_f32(_a0123), 1); | |||
| _rows2 = vmlaq_lane_f32(_rows2, _S23, vget_high_f32(_a0123), 1); | |||
| _rows3 = vmlaq_lane_f32(_rows3, _S33, vget_high_f32(_a0123), 1); | |||
| vst1q_f32(rows0p + dx * 4, _rows0); | |||
| vst1q_f32(rows1p + dx * 4, _rows1); | |||
| vst1q_f32(rows2p + dx * 4, _rows2); | |||
| vst1q_f32(rows3p + dx * 4, _rows3); | |||
| alphap += 4; | |||
| } | |||
| } | |||
| prev_sy1 = sy; | |||
| // vresize | |||
| float32x4_t _b0123 = vld1q_f32(beta); | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| float* rows2p = rows2; | |||
| float* rows3p = rows3; | |||
| float* Dp = dst.row(dy); | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| float32x4_t _rows0 = vld1q_f32(rows0p); | |||
| float32x4_t _rows1 = vld1q_f32(rows1p); | |||
| float32x4_t _rows2 = vld1q_f32(rows2p); | |||
| float32x4_t _rows3 = vld1q_f32(rows3p); | |||
| float32x4_t _D = vmulq_lane_f32(_rows0, vget_low_f32(_b0123), 0); | |||
| _D = vmlaq_lane_f32(_D, _rows1, vget_low_f32(_b0123), 1); | |||
| _D = vmlaq_lane_f32(_D, _rows2, vget_high_f32(_b0123), 0); | |||
| _D = vmlaq_lane_f32(_D, _rows3, vget_high_f32(_b0123), 1); | |||
| vst1q_f32(Dp, _D); | |||
| Dp += 4; | |||
| rows0p += 4; | |||
| rows1p += 4; | |||
| rows2p += 4; | |||
| rows3p += 4; | |||
| } | |||
| beta += 4; | |||
| } | |||
| } | |||
| @@ -0,0 +1,272 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void resize_bicubic_image_pack4_bf16s(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) | |||
| { | |||
| int w = dst.w; | |||
| int h = dst.h; | |||
| // loop body | |||
| Mat rowsbuf0(w, (size_t)4 * 4u, 4); | |||
| Mat rowsbuf1(w, (size_t)4 * 4u, 4); | |||
| Mat rowsbuf2(w, (size_t)4 * 4u, 4); | |||
| Mat rowsbuf3(w, (size_t)4 * 4u, 4); | |||
| float* rows0 = rowsbuf0; | |||
| float* rows1 = rowsbuf1; | |||
| float* rows2 = rowsbuf2; | |||
| float* rows3 = rowsbuf3; | |||
| int prev_sy1 = -3; | |||
| for (int dy = 0; dy < h; dy++ ) | |||
| { | |||
| int sy = yofs[dy]; | |||
| if (sy == prev_sy1) | |||
| { | |||
| // reuse all rows | |||
| } | |||
| else if (sy == prev_sy1 + 1) | |||
| { | |||
| // hresize one row | |||
| float* rows0_old = rows0; | |||
| rows0 = rows1; | |||
| rows1 = rows2; | |||
| rows2 = rows3; | |||
| rows3 = rows0_old; | |||
| const unsigned short* S3 = src.row<const unsigned short>(sy+2); | |||
| const float* alphap = alpha; | |||
| float* rows3p = rows3; | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx] * 4; | |||
| const unsigned short* S3p = S3 + sx; | |||
| float32x4_t _a0123 = vld1q_f32(alphap); | |||
| float32x4_t _S30 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p - 4), 16)); | |||
| float32x4_t _S31 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p + 0), 16)); | |||
| float32x4_t _S32 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p + 4), 16)); | |||
| float32x4_t _S33 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p + 8), 16)); | |||
| float32x4_t _rows3 = vmulq_lane_f32(_S30, vget_low_f32(_a0123), 0); | |||
| _rows3 = vmlaq_lane_f32(_rows3, _S31, vget_low_f32(_a0123), 1); | |||
| _rows3 = vmlaq_lane_f32(_rows3, _S32, vget_high_f32(_a0123), 0); | |||
| _rows3 = vmlaq_lane_f32(_rows3, _S33, vget_high_f32(_a0123), 1); | |||
| vst1q_f32(rows3p + dx * 4, _rows3); | |||
| alphap += 4; | |||
| } | |||
| } | |||
| else if (sy == prev_sy1 + 2) | |||
| { | |||
| // hresize two rows | |||
| float* rows0_old = rows0; | |||
| float* rows1_old = rows1; | |||
| rows0 = rows2; | |||
| rows1 = rows3; | |||
| rows2 = rows0_old; | |||
| rows3 = rows1_old; | |||
| const unsigned short* S2 = src.row<const unsigned short>(sy+1); | |||
| const unsigned short* S3 = src.row<const unsigned short>(sy+2); | |||
| const float* alphap = alpha; | |||
| float* rows2p = rows2; | |||
| float* rows3p = rows3; | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx] * 4; | |||
| const unsigned short* S2p = S2 + sx; | |||
| const unsigned short* S3p = S3 + sx; | |||
| float32x4_t _a0123 = vld1q_f32(alphap); | |||
| float32x4_t _S20 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S2p - 4), 16)); | |||
| float32x4_t _S21 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S2p + 0), 16)); | |||
| float32x4_t _S22 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S2p + 4), 16)); | |||
| float32x4_t _S23 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S2p + 8), 16)); | |||
| float32x4_t _S30 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p - 4), 16)); | |||
| float32x4_t _S31 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p + 0), 16)); | |||
| float32x4_t _S32 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p + 4), 16)); | |||
| float32x4_t _S33 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p + 8), 16)); | |||
| float32x4_t _rows2 = vmulq_lane_f32(_S20, vget_low_f32(_a0123), 0); | |||
| float32x4_t _rows3 = vmulq_lane_f32(_S30, vget_low_f32(_a0123), 0); | |||
| _rows2 = vmlaq_lane_f32(_rows2, _S21, vget_low_f32(_a0123), 1); | |||
| _rows3 = vmlaq_lane_f32(_rows3, _S31, vget_low_f32(_a0123), 1); | |||
| _rows2 = vmlaq_lane_f32(_rows2, _S22, vget_high_f32(_a0123), 0); | |||
| _rows3 = vmlaq_lane_f32(_rows3, _S32, vget_high_f32(_a0123), 0); | |||
| _rows2 = vmlaq_lane_f32(_rows2, _S23, vget_high_f32(_a0123), 1); | |||
| _rows3 = vmlaq_lane_f32(_rows3, _S33, vget_high_f32(_a0123), 1); | |||
| vst1q_f32(rows2p + dx * 4, _rows2); | |||
| vst1q_f32(rows3p + dx * 4, _rows3); | |||
| alphap += 4; | |||
| } | |||
| } | |||
| else if (sy == prev_sy1 + 3) | |||
| { | |||
| // hresize three rows | |||
| float* rows0_old = rows0; | |||
| float* rows1_old = rows1; | |||
| float* rows2_old = rows2; | |||
| rows0 = rows3; | |||
| rows1 = rows0_old; | |||
| rows2 = rows1_old; | |||
| rows3 = rows2_old; | |||
| const unsigned short* S1 = src.row<const unsigned short>(sy); | |||
| const unsigned short* S2 = src.row<const unsigned short>(sy+1); | |||
| const unsigned short* S3 = src.row<const unsigned short>(sy+2); | |||
| const float* alphap = alpha; | |||
| float* rows1p = rows1; | |||
| float* rows2p = rows2; | |||
| float* rows3p = rows3; | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx] * 4; | |||
| const unsigned short* S1p = S1 + sx; | |||
| const unsigned short* S2p = S2 + sx; | |||
| const unsigned short* S3p = S3 + sx; | |||
| float32x4_t _a0123 = vld1q_f32(alphap); | |||
| float32x4_t _S10 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S1p - 4), 16)); | |||
| float32x4_t _S11 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S1p + 0), 16)); | |||
| float32x4_t _S12 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S1p + 4), 16)); | |||
| float32x4_t _S13 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S1p + 8), 16)); | |||
| float32x4_t _S20 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S2p - 4), 16)); | |||
| float32x4_t _S21 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S2p + 0), 16)); | |||
| float32x4_t _S22 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S2p + 4), 16)); | |||
| float32x4_t _S23 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S2p + 8), 16)); | |||
| float32x4_t _S30 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p - 4), 16)); | |||
| float32x4_t _S31 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p + 0), 16)); | |||
| float32x4_t _S32 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p + 4), 16)); | |||
| float32x4_t _S33 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p + 8), 16)); | |||
| float32x4_t _rows1 = vmulq_lane_f32(_S10, vget_low_f32(_a0123), 0); | |||
| float32x4_t _rows2 = vmulq_lane_f32(_S20, vget_low_f32(_a0123), 0); | |||
| float32x4_t _rows3 = vmulq_lane_f32(_S30, vget_low_f32(_a0123), 0); | |||
| _rows1 = vmlaq_lane_f32(_rows1, _S11, vget_low_f32(_a0123), 1); | |||
| _rows2 = vmlaq_lane_f32(_rows2, _S21, vget_low_f32(_a0123), 1); | |||
| _rows3 = vmlaq_lane_f32(_rows3, _S31, vget_low_f32(_a0123), 1); | |||
| _rows1 = vmlaq_lane_f32(_rows1, _S12, vget_high_f32(_a0123), 0); | |||
| _rows2 = vmlaq_lane_f32(_rows2, _S22, vget_high_f32(_a0123), 0); | |||
| _rows3 = vmlaq_lane_f32(_rows3, _S32, vget_high_f32(_a0123), 0); | |||
| _rows1 = vmlaq_lane_f32(_rows1, _S13, vget_high_f32(_a0123), 1); | |||
| _rows2 = vmlaq_lane_f32(_rows2, _S23, vget_high_f32(_a0123), 1); | |||
| _rows3 = vmlaq_lane_f32(_rows3, _S33, vget_high_f32(_a0123), 1); | |||
| vst1q_f32(rows1p + dx * 4, _rows1); | |||
| vst1q_f32(rows2p + dx * 4, _rows2); | |||
| vst1q_f32(rows3p + dx * 4, _rows3); | |||
| alphap += 4; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| // hresize four rows | |||
| const unsigned short* S0 = src.row<const unsigned short>(sy-1); | |||
| const unsigned short* S1 = src.row<const unsigned short>(sy); | |||
| const unsigned short* S2 = src.row<const unsigned short>(sy+1); | |||
| const unsigned short* S3 = src.row<const unsigned short>(sy+2); | |||
| const float* alphap = alpha; | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| float* rows2p = rows2; | |||
| float* rows3p = rows3; | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| int sx = xofs[dx] * 4; | |||
| const unsigned short* S0p = S0 + sx; | |||
| const unsigned short* S1p = S1 + sx; | |||
| const unsigned short* S2p = S2 + sx; | |||
| const unsigned short* S3p = S3 + sx; | |||
| float32x4_t _a0123 = vld1q_f32(alphap); | |||
| // TODO check the generated assembly on armv7 | |||
| float32x4_t _S00 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S0p - 4), 16)); | |||
| float32x4_t _S01 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S0p + 0), 16)); | |||
| float32x4_t _S02 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S0p + 4), 16)); | |||
| float32x4_t _S03 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S0p + 8), 16)); | |||
| float32x4_t _S10 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S1p - 4), 16)); | |||
| float32x4_t _S11 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S1p + 0), 16)); | |||
| float32x4_t _S12 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S1p + 4), 16)); | |||
| float32x4_t _S13 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S1p + 8), 16)); | |||
| float32x4_t _S20 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S2p - 4), 16)); | |||
| float32x4_t _S21 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S2p + 0), 16)); | |||
| float32x4_t _S22 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S2p + 4), 16)); | |||
| float32x4_t _S23 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S2p + 8), 16)); | |||
| float32x4_t _S30 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p - 4), 16)); | |||
| float32x4_t _S31 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p + 0), 16)); | |||
| float32x4_t _S32 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p + 4), 16)); | |||
| float32x4_t _S33 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p + 8), 16)); | |||
| float32x4_t _rows0 = vmulq_lane_f32(_S00, vget_low_f32(_a0123), 0); | |||
| float32x4_t _rows1 = vmulq_lane_f32(_S10, vget_low_f32(_a0123), 0); | |||
| float32x4_t _rows2 = vmulq_lane_f32(_S20, vget_low_f32(_a0123), 0); | |||
| float32x4_t _rows3 = vmulq_lane_f32(_S30, vget_low_f32(_a0123), 0); | |||
| _rows0 = vmlaq_lane_f32(_rows0, _S01, vget_low_f32(_a0123), 1); | |||
| _rows1 = vmlaq_lane_f32(_rows1, _S11, vget_low_f32(_a0123), 1); | |||
| _rows2 = vmlaq_lane_f32(_rows2, _S21, vget_low_f32(_a0123), 1); | |||
| _rows3 = vmlaq_lane_f32(_rows3, _S31, vget_low_f32(_a0123), 1); | |||
| _rows0 = vmlaq_lane_f32(_rows0, _S02, vget_high_f32(_a0123), 0); | |||
| _rows1 = vmlaq_lane_f32(_rows1, _S12, vget_high_f32(_a0123), 0); | |||
| _rows2 = vmlaq_lane_f32(_rows2, _S22, vget_high_f32(_a0123), 0); | |||
| _rows3 = vmlaq_lane_f32(_rows3, _S32, vget_high_f32(_a0123), 0); | |||
| _rows0 = vmlaq_lane_f32(_rows0, _S03, vget_high_f32(_a0123), 1); | |||
| _rows1 = vmlaq_lane_f32(_rows1, _S13, vget_high_f32(_a0123), 1); | |||
| _rows2 = vmlaq_lane_f32(_rows2, _S23, vget_high_f32(_a0123), 1); | |||
| _rows3 = vmlaq_lane_f32(_rows3, _S33, vget_high_f32(_a0123), 1); | |||
| vst1q_f32(rows0p + dx * 4, _rows0); | |||
| vst1q_f32(rows1p + dx * 4, _rows1); | |||
| vst1q_f32(rows2p + dx * 4, _rows2); | |||
| vst1q_f32(rows3p + dx * 4, _rows3); | |||
| alphap += 4; | |||
| } | |||
| } | |||
| prev_sy1 = sy; | |||
| // vresize | |||
| float32x4_t _b0123 = vld1q_f32(beta); | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| float* rows2p = rows2; | |||
| float* rows3p = rows3; | |||
| unsigned short* Dp = dst.row<unsigned short>(dy); | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| float32x4_t _rows0 = vld1q_f32(rows0p); | |||
| float32x4_t _rows1 = vld1q_f32(rows1p); | |||
| float32x4_t _rows2 = vld1q_f32(rows2p); | |||
| float32x4_t _rows3 = vld1q_f32(rows3p); | |||
| float32x4_t _D = vmulq_lane_f32(_rows0, vget_low_f32(_b0123), 0); | |||
| _D = vmlaq_lane_f32(_D, _rows1, vget_low_f32(_b0123), 1); | |||
| _D = vmlaq_lane_f32(_D, _rows2, vget_high_f32(_b0123), 0); | |||
| _D = vmlaq_lane_f32(_D, _rows3, vget_high_f32(_b0123), 1); | |||
| vst1_u16(Dp, vshrn_n_u32(vreinterpretq_u32_f32(_D), 16)); | |||
| Dp += 4; | |||
| rows0p += 4; | |||
| rows1p += 4; | |||
| rows2p += 4; | |||
| rows3p += 4; | |||
| } | |||
| beta += 4; | |||
| } | |||
| } | |||
| @@ -0,0 +1,213 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void linear_coeffs(int w, int outw, int* xofs, float* alpha) | |||
| { | |||
| double scale = (double)w / outw; | |||
| for (int dx = 0; dx < outw; dx++) | |||
| { | |||
| float fx = (float)((dx + 0.5) * scale - 0.5); | |||
| int sx = floor(fx); | |||
| fx -= sx; | |||
| if (sx < 0) | |||
| { | |||
| sx = 0; | |||
| fx = 0.f; | |||
| } | |||
| if (sx >= w - 1) | |||
| { | |||
| sx = w - 2; | |||
| fx = 1.f; | |||
| } | |||
| xofs[dx] = sx; | |||
| alpha[dx*2 ] = 1.f - fx; | |||
| alpha[dx*2 + 1] = fx; | |||
| } | |||
| } | |||
| static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) | |||
| { | |||
| int w = dst.w; | |||
| int h = dst.h; | |||
| // loop body | |||
| Mat rowsbuf0(w); | |||
| Mat rowsbuf1(w); | |||
| float* rows0 = rowsbuf0; | |||
| float* rows1 = rowsbuf1; | |||
| int prev_sy1 = -2; | |||
| for (int dy = 0; dy < h; dy++ ) | |||
| { | |||
| int sy = yofs[dy]; | |||
| if (sy == prev_sy1) | |||
| { | |||
| // reuse all rows | |||
| } | |||
| else if (sy == prev_sy1 + 1) | |||
| { | |||
| // hresize one row | |||
| float* rows0_old = rows0; | |||
| rows0 = rows1; | |||
| rows1 = rows0_old; | |||
| const float* S1 = src.row(sy+1); | |||
| const float* alphap = alpha; | |||
| float* rows1p = rows1; | |||
| int dx = 0; | |||
| #if __ARM_NEON | |||
| for ( ; dx+1 < w; dx += 2 ) | |||
| { | |||
| int sx = xofs[dx]; | |||
| int sxn = xofs[dx+1]; | |||
| const float* S1p = S1 + sx; | |||
| const float* S1np = S1 + sxn; | |||
| float32x4_t _a = vld1q_f32(alphap); | |||
| float32x2_t _S1 = vld1_f32(S1p); | |||
| float32x2_t _S1n = vld1_f32(S1np); | |||
| float32x4_t _S1S1n = vcombine_f32(_S1, _S1n); | |||
| float32x4_t _ms1 = vmulq_f32(_S1S1n, _a); | |||
| float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1)); | |||
| vst1_f32(rows1p + dx, _rows1); | |||
| alphap += 4; | |||
| } | |||
| #endif // __ARM_NEON | |||
| for ( ; dx < w; dx++ ) | |||
| { | |||
| int sx = xofs[dx]; | |||
| const float* S1p = S1 + sx; | |||
| float a0 = alphap[0]; | |||
| float a1 = alphap[1]; | |||
| rows1p[dx] = S1p[0]*a0 + S1p[1]*a1; | |||
| alphap += 2; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| // hresize two rows | |||
| const float* S0 = src.row(sy); | |||
| const float* S1 = src.row(sy+1); | |||
| const float* alphap = alpha; | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| int dx = 0; | |||
| #if __ARM_NEON | |||
| for ( ; dx+1 < w; dx += 2 ) | |||
| { | |||
| int sx = xofs[dx]; | |||
| int sxn = xofs[dx+1]; | |||
| const float* S0p = S0 + sx; | |||
| const float* S1p = S1 + sx; | |||
| const float* S0np = S0 + sxn; | |||
| const float* S1np = S1 + sxn; | |||
| float32x4_t _a = vld1q_f32(alphap); | |||
| float32x2_t _S0 = vld1_f32(S0p); | |||
| float32x2_t _S1 = vld1_f32(S1p); | |||
| float32x2_t _S0n = vld1_f32(S0np); | |||
| float32x2_t _S1n = vld1_f32(S1np); | |||
| float32x4_t _S0S0n = vcombine_f32(_S0, _S0n); | |||
| float32x4_t _S1S1n = vcombine_f32(_S1, _S1n); | |||
| float32x4_t _ms0 = vmulq_f32(_S0S0n, _a); | |||
| float32x4_t _ms1 = vmulq_f32(_S1S1n, _a); | |||
| float32x2_t _rows0 = vpadd_f32(vget_low_f32(_ms0), vget_high_f32(_ms0)); | |||
| float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1)); | |||
| vst1_f32(rows0p + dx, _rows0); | |||
| vst1_f32(rows1p + dx, _rows1); | |||
| alphap += 4; | |||
| } | |||
| #endif // __ARM_NEON | |||
| for ( ; dx < w; dx++ ) | |||
| { | |||
| int sx = xofs[dx]; | |||
| const float* S0p = S0 + sx; | |||
| const float* S1p = S1 + sx; | |||
| float a0 = alphap[0]; | |||
| float a1 = alphap[1]; | |||
| rows0p[dx] = S0p[0]*a0 + S0p[1]*a1; | |||
| rows1p[dx] = S1p[0]*a0 + S1p[1]*a1; | |||
| alphap += 2; | |||
| } | |||
| } | |||
| prev_sy1 = sy; | |||
| // vresize | |||
| float b0 = beta[0]; | |||
| float b1 = beta[1]; | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| float* Dp = dst.row(dy); | |||
| #if __ARM_NEON | |||
| int nn = w >> 3; | |||
| #else | |||
| int nn = 0; | |||
| #endif | |||
| int remain = w - (nn << 3); | |||
| #if __ARM_NEON | |||
| float32x4_t _b0 = vdupq_n_f32(b0); | |||
| float32x4_t _b1 = vdupq_n_f32(b1); | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _rows0 = vld1q_f32(rows0p); | |||
| float32x4_t _rows1 = vld1q_f32(rows1p); | |||
| float32x4_t _D = vmulq_f32(_rows0, _b0); | |||
| _D = vmlaq_f32(_D, _rows1, _b1); | |||
| vst1q_f32(Dp, _D); | |||
| float32x4_t _rows0n = vld1q_f32(rows0p+4); | |||
| float32x4_t _rows1n = vld1q_f32(rows1p+4); | |||
| float32x4_t _Dn = vmulq_f32(_rows0n, _b0); | |||
| _Dn = vmlaq_f32(_Dn, _rows1n, _b1); | |||
| vst1q_f32(Dp+4, _Dn); | |||
| Dp += 8; | |||
| rows0p += 8; | |||
| rows1p += 8; | |||
| } | |||
| #endif // __ARM_NEON | |||
| for ( ; remain; --remain ) | |||
| { | |||
| // D[x] = rows0[x]*b0 + rows1[x]*b1; | |||
| *Dp++ = *rows0p++ * b0 + *rows1p++ * b1; | |||
| } | |||
| beta += 2; | |||
| } | |||
| } | |||
| @@ -0,0 +1,135 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void resize_bilinear_image_bf16s(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) | |||
| { | |||
| int w = dst.w; | |||
| int h = dst.h; | |||
| // loop body | |||
| Mat rowsbuf0(w); | |||
| Mat rowsbuf1(w); | |||
| float* rows0 = rowsbuf0; | |||
| float* rows1 = rowsbuf1; | |||
| int prev_sy1 = -2; | |||
| for (int dy = 0; dy < h; dy++ ) | |||
| { | |||
| int sy = yofs[dy]; | |||
| if (sy == prev_sy1) | |||
| { | |||
| // reuse all rows | |||
| } | |||
| else if (sy == prev_sy1 + 1) | |||
| { | |||
| // hresize one row | |||
| float* rows0_old = rows0; | |||
| rows0 = rows1; | |||
| rows1 = rows0_old; | |||
| const unsigned short* S1 = src.row<const unsigned short>(sy+1); | |||
| const float* alphap = alpha; | |||
| float* rows1p = rows1; | |||
| int dx = 0; | |||
| for ( ; dx < w; dx++ ) | |||
| { | |||
| int sx = xofs[dx]; | |||
| const unsigned short* S1p = S1 + sx; | |||
| float a0 = alphap[0]; | |||
| float a1 = alphap[1]; | |||
| rows1p[dx] = bfloat16_to_float32(S1p[0])*a0 + bfloat16_to_float32(S1p[1])*a1; | |||
| alphap += 2; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| // hresize two rows | |||
| const unsigned short* S0 = src.row<const unsigned short>(sy); | |||
| const unsigned short* S1 = src.row<const unsigned short>(sy+1); | |||
| const float* alphap = alpha; | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| int dx = 0; | |||
| for ( ; dx < w; dx++ ) | |||
| { | |||
| int sx = xofs[dx]; | |||
| const unsigned short* S0p = S0 + sx; | |||
| const unsigned short* S1p = S1 + sx; | |||
| float a0 = alphap[0]; | |||
| float a1 = alphap[1]; | |||
| rows0p[dx] = bfloat16_to_float32(S0p[0])*a0 + bfloat16_to_float32(S0p[1])*a1; | |||
| rows1p[dx] = bfloat16_to_float32(S1p[0])*a0 + bfloat16_to_float32(S1p[1])*a1; | |||
| alphap += 2; | |||
| } | |||
| } | |||
| prev_sy1 = sy; | |||
| // vresize | |||
| float b0 = beta[0]; | |||
| float b1 = beta[1]; | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| unsigned short* Dp = dst.row<unsigned short>(dy); | |||
| #if __ARM_NEON | |||
| int nn = w >> 3; | |||
| #else | |||
| int nn = 0; | |||
| #endif | |||
| int remain = w - (nn << 3); | |||
| #if __ARM_NEON | |||
| float32x4_t _b0 = vdupq_n_f32(b0); | |||
| float32x4_t _b1 = vdupq_n_f32(b1); | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _rows0 = vld1q_f32(rows0p); | |||
| float32x4_t _rows1 = vld1q_f32(rows1p); | |||
| float32x4_t _D = vmulq_f32(_rows0, _b0); | |||
| _D = vmlaq_f32(_D, _rows1, _b1); | |||
| vst1_u16(Dp, vshrn_n_u32(vreinterpretq_u32_f32(_D), 16)); | |||
| float32x4_t _rows0n = vld1q_f32(rows0p+4); | |||
| float32x4_t _rows1n = vld1q_f32(rows1p+4); | |||
| float32x4_t _Dn = vmulq_f32(_rows0n, _b0); | |||
| _Dn = vmlaq_f32(_Dn, _rows1n, _b1); | |||
| vst1_u16(Dp+4, vshrn_n_u32(vreinterpretq_u32_f32(_Dn), 16)); | |||
| Dp += 8; | |||
| rows0p += 8; | |||
| rows1p += 8; | |||
| } | |||
| #endif // __ARM_NEON | |||
| for ( ; remain; --remain ) | |||
| { | |||
| // D[x] = rows0[x]*b0 + rows1[x]*b1; | |||
| *Dp++ = float32_to_bfloat16(*rows0p++ * b0 + *rows1p++ * b1); | |||
| } | |||
| beta += 2; | |||
| } | |||
| } | |||
| @@ -0,0 +1,120 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void resize_bilinear_image_pack4(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) | |||
| { | |||
| int w = dst.w; | |||
| int h = dst.h; | |||
| // loop body | |||
| Mat rowsbuf0(w, (size_t)4 * 4u, 4); | |||
| Mat rowsbuf1(w, (size_t)4 * 4u, 4); | |||
| float* rows0 = rowsbuf0; | |||
| float* rows1 = rowsbuf1; | |||
| int prev_sy1 = -2; | |||
| for (int dy = 0; dy < h; dy++ ) | |||
| { | |||
| int sy = yofs[dy]; | |||
| if (sy == prev_sy1) | |||
| { | |||
| // reuse all rows | |||
| } | |||
| else if (sy == prev_sy1 + 1) | |||
| { | |||
| // hresize one row | |||
| float* rows0_old = rows0; | |||
| rows0 = rows1; | |||
| rows1 = rows0_old; | |||
| const float* S1 = src.row(sy+1); | |||
| const float* alphap = alpha; | |||
| float* rows1p = rows1; | |||
| int dx = 0; | |||
| for ( ; dx < w; dx++ ) | |||
| { | |||
| int sx = xofs[dx] * 4; | |||
| const float* S1p = S1 + sx; | |||
| float32x2_t _a01 = vld1_f32(alphap); | |||
| float32x4_t _S10 = vld1q_f32(S1p); | |||
| float32x4_t _S11 = vld1q_f32(S1p + 4); | |||
| float32x4_t _rows1 = vmulq_lane_f32(_S10, _a01, 0); | |||
| _rows1 = vmlaq_lane_f32(_rows1, _S11, _a01, 1); | |||
| vst1q_f32(rows1p + dx * 4, _rows1); | |||
| alphap += 2; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| // hresize two rows | |||
| const float* S0 = src.row(sy); | |||
| const float* S1 = src.row(sy+1); | |||
| const float* alphap = alpha; | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| int dx = 0; | |||
| for ( ; dx < w; dx++ ) | |||
| { | |||
| int sx = xofs[dx] * 4; | |||
| const float* S0p = S0 + sx; | |||
| const float* S1p = S1 + sx; | |||
| float32x2_t _a01 = vld1_f32(alphap); | |||
| float32x4_t _S00 = vld1q_f32(S0p); | |||
| float32x4_t _S01 = vld1q_f32(S0p + 4); | |||
| float32x4_t _S10 = vld1q_f32(S1p); | |||
| float32x4_t _S11 = vld1q_f32(S1p + 4); | |||
| float32x4_t _rows0 = vmulq_lane_f32(_S00, _a01, 0); | |||
| float32x4_t _rows1 = vmulq_lane_f32(_S10, _a01, 0); | |||
| _rows0 = vmlaq_lane_f32(_rows0, _S01, _a01, 1); | |||
| _rows1 = vmlaq_lane_f32(_rows1, _S11, _a01, 1); | |||
| vst1q_f32(rows0p + dx * 4, _rows0); | |||
| vst1q_f32(rows1p + dx * 4, _rows1); | |||
| alphap += 2; | |||
| } | |||
| } | |||
| prev_sy1 = sy; | |||
| // vresize | |||
| float32x2_t _b01 = vld1_f32(beta); | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| float* Dp = dst.row(dy); | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| float32x4_t _rows0 = vld1q_f32(rows0p); | |||
| float32x4_t _rows1 = vld1q_f32(rows1p); | |||
| float32x4_t _D = vmulq_lane_f32(_rows0, _b01, 0); | |||
| _D = vmlaq_lane_f32(_D, _rows1, _b01, 1); | |||
| vst1q_f32(Dp, _D); | |||
| Dp += 4; | |||
| rows0p += 4; | |||
| rows1p += 4; | |||
| } | |||
| beta += 2; | |||
| } | |||
| } | |||
| @@ -0,0 +1,120 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void resize_bilinear_image_pack4_bf16s(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) | |||
| { | |||
| int w = dst.w; | |||
| int h = dst.h; | |||
| // loop body | |||
| Mat rowsbuf0(w, (size_t)4 * 4u, 4); | |||
| Mat rowsbuf1(w, (size_t)4 * 4u, 4); | |||
| float* rows0 = rowsbuf0; | |||
| float* rows1 = rowsbuf1; | |||
| int prev_sy1 = -2; | |||
| for (int dy = 0; dy < h; dy++ ) | |||
| { | |||
| int sy = yofs[dy]; | |||
| if (sy == prev_sy1) | |||
| { | |||
| // reuse all rows | |||
| } | |||
| else if (sy == prev_sy1 + 1) | |||
| { | |||
| // hresize one row | |||
| float* rows0_old = rows0; | |||
| rows0 = rows1; | |||
| rows1 = rows0_old; | |||
| const unsigned short* S1 = src.row<const unsigned short>(sy+1); | |||
| const float* alphap = alpha; | |||
| float* rows1p = rows1; | |||
| int dx = 0; | |||
| for ( ; dx < w; dx++ ) | |||
| { | |||
| int sx = xofs[dx] * 4; | |||
| const unsigned short* S1p = S1 + sx; | |||
| float32x2_t _a01 = vld1_f32(alphap); | |||
| float32x4_t _S10 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S1p), 16)); | |||
| float32x4_t _S11 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S1p + 4), 16)); | |||
| float32x4_t _rows1 = vmulq_lane_f32(_S10, _a01, 0); | |||
| _rows1 = vmlaq_lane_f32(_rows1, _S11, _a01, 1); | |||
| vst1q_f32(rows1p + dx * 4, _rows1); | |||
| alphap += 2; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| // hresize two rows | |||
| const unsigned short* S0 = src.row<const unsigned short>(sy); | |||
| const unsigned short* S1 = src.row<const unsigned short>(sy+1); | |||
| const float* alphap = alpha; | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| int dx = 0; | |||
| for ( ; dx < w; dx++ ) | |||
| { | |||
| int sx = xofs[dx] * 4; | |||
| const unsigned short* S0p = S0 + sx; | |||
| const unsigned short* S1p = S1 + sx; | |||
| float32x2_t _a01 = vld1_f32(alphap); | |||
| float32x4_t _S00 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S0p), 16)); | |||
| float32x4_t _S01 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S0p + 4), 16)); | |||
| float32x4_t _S10 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S1p), 16)); | |||
| float32x4_t _S11 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S1p + 4), 16)); | |||
| float32x4_t _rows0 = vmulq_lane_f32(_S00, _a01, 0); | |||
| float32x4_t _rows1 = vmulq_lane_f32(_S10, _a01, 0); | |||
| _rows0 = vmlaq_lane_f32(_rows0, _S01, _a01, 1); | |||
| _rows1 = vmlaq_lane_f32(_rows1, _S11, _a01, 1); | |||
| vst1q_f32(rows0p + dx * 4, _rows0); | |||
| vst1q_f32(rows1p + dx * 4, _rows1); | |||
| alphap += 2; | |||
| } | |||
| } | |||
| prev_sy1 = sy; | |||
| // vresize | |||
| float32x2_t _b01 = vld1_f32(beta); | |||
| float* rows0p = rows0; | |||
| float* rows1p = rows1; | |||
| unsigned short* Dp = dst.row<unsigned short>(dy); | |||
| for (int dx = 0; dx < w; dx++) | |||
| { | |||
| float32x4_t _rows0 = vld1q_f32(rows0p); | |||
| float32x4_t _rows1 = vld1q_f32(rows1p); | |||
| float32x4_t _D = vmulq_lane_f32(_rows0, _b01, 0); | |||
| _D = vmlaq_lane_f32(_D, _rows1, _b01, 1); | |||
| vst1_u16(Dp, vshrn_n_u32(vreinterpretq_u32_f32(_D), 16)); | |||
| Dp += 4; | |||
| rows0p += 4; | |||
| rows1p += 4; | |||
| } | |||
| beta += 2; | |||
| } | |||
| } | |||
| @@ -33,6 +33,12 @@ int Interp::load_param(const ParamDict& pd) | |||
| output_height = pd.get(3, 0); | |||
| output_width = pd.get(4, 0); | |||
| if (resize_type < 1 || resize_type > 3) | |||
| { | |||
| fprintf(stderr, "unsupported resize type %d\n", resize_type); | |||
| return -1; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -391,37 +397,38 @@ static void resize_bicubic_image(const Mat& src, Mat& dst, float* alpha, int* xo | |||
| int Interp::forward(const Mat &bottom_blob, Mat &top_blob, const Option& opt) const | |||
| { | |||
| int h = bottom_blob.h; | |||
| int w = bottom_blob.w; | |||
| int c = bottom_blob.c; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int oh = output_height; | |||
| int ow = output_width; | |||
| int outh = output_height; | |||
| int outw = output_width; | |||
| if (bottom_blob.dims == 1) | |||
| { | |||
| h = 1; | |||
| w = 1; | |||
| c = bottom_blob.w; | |||
| channels = bottom_blob.w; | |||
| } | |||
| if (oh == 0 || ow == 0) | |||
| if (outh == 0 || outw == 0) | |||
| { | |||
| oh = static_cast<int>(h * height_scale); | |||
| ow = static_cast<int>(w * width_scale); | |||
| outh = static_cast<int>(h * height_scale); | |||
| outw = static_cast<int>(w * width_scale); | |||
| } | |||
| if (oh == h && ow == w) | |||
| if (outh == h && outw == w) | |||
| { | |||
| top_blob = bottom_blob; | |||
| return 0; | |||
| } | |||
| top_blob.create(ow, oh, c, elemsize, opt.blob_allocator); | |||
| top_blob.create(outw, outh, channels, elemsize, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| if (bottom_blob.dims == 1) | |||
| { | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < c; ++q) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| Mat top_blob_c = top_blob.channel(q); | |||
| const float *ptr = ((const float*)bottom_blob.data + q); | |||
| @@ -436,38 +443,37 @@ int Interp::forward(const Mat &bottom_blob, Mat &top_blob, const Option& opt) co | |||
| const float ws = output_width ? w / (float)output_width : 1.f / width_scale; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < c; q++) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob.channel(q); | |||
| float* outptr = top_blob.channel(q); | |||
| for (int y = 0; y < oh; y++) | |||
| for (int y = 0; y < outh; y++) | |||
| { | |||
| int in_y = std::min((int) (y * hs), (h - 1)); | |||
| for (int x = 0; x < ow; x++) | |||
| for (int x = 0; x < outw; x++) | |||
| { | |||
| int in_x = std::min((int) (x * ws), (w - 1)); | |||
| *outptr++ = ptr[in_y * w + in_x]; | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| else if (resize_type == 2)// bilinear | |||
| if (resize_type == 2)// bilinear | |||
| { | |||
| int* buf = new int[ow + oh + ow*2 + oh*2]; | |||
| int* buf = new int[outw + outh + outw*2 + outh*2]; | |||
| int* xofs = buf;//new int[ow]; | |||
| int* yofs = buf + ow;//new int[oh]; | |||
| int* xofs = buf;//new int[outw]; | |||
| int* yofs = buf + outw;//new int[outh]; | |||
| float* alpha = (float*)(buf + ow + oh);//new float[ow * 2]; | |||
| float* beta = (float*)(buf + ow + oh + ow*2);//new float[oh * 2]; | |||
| float* alpha = (float*)(buf + outw + outh);//new float[outw * 2]; | |||
| float* beta = (float*)(buf + outw + outh + outw*2);//new float[outh * 2]; | |||
| linear_coeffs(w, ow, xofs, alpha); | |||
| linear_coeffs(h, oh, yofs, beta); | |||
| linear_coeffs(w, outw, xofs, alpha); | |||
| linear_coeffs(h, outh, yofs, beta); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < c; ++q) | |||
| for (int q = 0; q < channels; ++q) | |||
| { | |||
| const Mat src = bottom_blob.channel(q); | |||
| Mat dst = top_blob.channel(q); | |||
| @@ -476,24 +482,23 @@ int Interp::forward(const Mat &bottom_blob, Mat &top_blob, const Option& opt) co | |||
| } | |||
| delete[] buf; | |||
| return 0; | |||
| } | |||
| else if (resize_type == 3)// bicubic | |||
| if (resize_type == 3)// bicubic | |||
| { | |||
| int* buf = new int[ow + oh + ow*4 + oh*4]; | |||
| int* buf = new int[outw + outh + outw*4 + outh*4]; | |||
| int* xofs = buf;//new int[ow]; | |||
| int* yofs = buf + ow;//new int[oh]; | |||
| int* xofs = buf;//new int[outw]; | |||
| int* yofs = buf + outw;//new int[outh]; | |||
| float* alpha = (float*)(buf + ow + oh);//new float[ow * 4]; | |||
| float* beta = (float*)(buf + ow + oh + ow*4);//new float[oh * 4]; | |||
| float* alpha = (float*)(buf + outw + outh);//new float[outw * 4]; | |||
| float* beta = (float*)(buf + outw + outh + outw*4);//new float[outh * 4]; | |||
| cubic_coeffs(w, ow, xofs, alpha); | |||
| cubic_coeffs(h, oh, yofs, beta); | |||
| cubic_coeffs(w, outw, xofs, alpha); | |||
| cubic_coeffs(h, outh, yofs, beta); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < c; ++q) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const Mat src = bottom_blob.channel(q); | |||
| Mat dst = top_blob.channel(q); | |||
| @@ -502,14 +507,9 @@ int Interp::forward(const Mat &bottom_blob, Mat &top_blob, const Option& opt) co | |||
| } | |||
| delete[] buf; | |||
| return 0; | |||
| } | |||
| else | |||
| { | |||
| fprintf(stderr, "unsupported resize type %d %d %d\n", resize_type, oh, ow); | |||
| return -233; | |||
| } | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||