|
- // Tencent is pleased to support the open source community by making ncnn available.
- //
- // Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
- //
- // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
- // in compliance with the License. You may obtain a copy of the License at
- //
- // https://opensource.org/licenses/BSD-3-Clause
- //
- // Unless required by applicable law or agreed to in writing, software distributed
- // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
- // CONDITIONS OF ANY KIND, either express or implied. See the License for the
- // specific language governing permissions and limitations under the License.
-
- #include "mat.h"
- #include <limits.h>
- #include <math.h>
- #include <algorithm>
- #if __ARM_NEON
- #include <arm_neon.h>
- #endif // __ARM_NEON
- #include "platform.h"
-
- namespace ncnn {
-
- #if NCNN_PIXEL
- void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
- {
- // Convenience overload for tightly packed buffers: with one byte per pixel the row stride equals the row width.
- const int src_row_bytes = srcw;
- const int dst_row_bytes = w;
- resize_bilinear_c1(src, srcw, srch, src_row_bytes, dst, w, h, dst_row_bytes);
- }
-
- void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
- {
- // Convenience overload for tightly packed buffers: two bytes per pixel, so a row spans width * 2 bytes.
- const int src_row_bytes = srcw * 2;
- const int dst_row_bytes = w * 2;
- resize_bilinear_c2(src, srcw, srch, src_row_bytes, dst, w, h, dst_row_bytes);
- }
-
- void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
- {
- // Convenience overload for tightly packed buffers: three bytes per pixel, so a row spans width * 3 bytes.
- const int src_row_bytes = srcw * 3;
- const int dst_row_bytes = w * 3;
- resize_bilinear_c3(src, srcw, srch, src_row_bytes, dst, w, h, dst_row_bytes);
- }
-
- void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
- {
- // Convenience overload for tightly packed buffers: four bytes per pixel, so a row spans width * 4 bytes.
- const int src_row_bytes = srcw * 4;
- const int dst_row_bytes = w * 4;
- resize_bilinear_c4(src, srcw, srch, src_row_bytes, dst, w, h, dst_row_bytes);
- }
-
- void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) // Bilinear resize of a 1-byte-per-pixel image (srcw x srch -> w x h) using fixed-point weights; srcstride / stride are the byte steps between consecutive source / destination rows.
- { // NOTE(review): when srcw == 1 or srch == 1 the border clamp below yields index -1, so the byte / row just before src is read (with weight 0) - confirm callers never pass such sizes.
- const int INTER_RESIZE_COEF_BITS=11; // fractional bits of the fixed-point interpolation weights
- const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS; // fixed-point value representing a weight of 1.0 (2048)
- // const int ONE=INTER_RESIZE_COEF_SCALE;
-
- double scale_x = (double)srcw / w; // source columns per destination column
- double scale_y = (double)srch / h; // source rows per destination row
-
- int* buf = new int[w + h + w + h]; // one allocation shared by the four tables below; the short tables rely on two shorts fitting in one int
-
- int* xofs = buf;//new int[w]; left source column for every destination x
- int* yofs = buf + w;//new int[h]; upper source row for every destination y
-
- short* ialpha = (short*)(buf + w + h);//new short[w * 2]; (left, right) weight pair per destination x
- short* ibeta = (short*)(buf + w + h + w);//new short[h * 2]; (upper, lower) weight pair per destination y
-
- float fx;
- float fy;
- int sx;
- int sy;
-
- #define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX);
-
- for (int dx = 0; dx < w; dx++) // build the horizontal offset / weight tables
- {
- fx = (float)((dx + 0.5) * scale_x - 0.5); // source x coordinate of the centre of destination pixel dx
- sx = static_cast<int>(floor(fx)); // left neighbour
- fx -= sx; // fractional distance from the left neighbour
-
- if (sx < 0) // left border: take column 0 with full weight
- {
- sx = 0;
- fx = 0.f;
- }
- if (sx >= srcw - 1) // right border: use the last two columns with full weight on the right one
- {
- sx = srcw - 2;
- fx = 1.f;
- }
-
- xofs[dx] = sx;
-
- float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE; // weight of the left sample
- float a1 = fx * INTER_RESIZE_COEF_SCALE; // weight of the right sample
-
- ialpha[dx*2 ] = SATURATE_CAST_SHORT(a0); // macro rounds half away from zero and clamps to the short range
- ialpha[dx*2 + 1] = SATURATE_CAST_SHORT(a1);
- }
-
- for (int dy = 0; dy < h; dy++) // build the vertical offset / weight tables the same way
- {
- fy = (float)((dy + 0.5) * scale_y - 0.5); // source y coordinate of the centre of destination row dy
- sy = static_cast<int>(floor(fy)); // upper neighbour
- fy -= sy; // fractional distance from the upper neighbour
-
- if (sy < 0) // top border: take row 0 with full weight
- {
- sy = 0;
- fy = 0.f;
- }
- if (sy >= srch - 1) // bottom border: use the last two rows with full weight on the lower one
- {
- sy = srch - 2;
- fy = 1.f;
- }
-
- yofs[dy] = sy;
-
- float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE; // weight of the upper row
- float b1 = fy * INTER_RESIZE_COEF_SCALE; // weight of the lower row
-
- ibeta[dy*2 ] = SATURATE_CAST_SHORT(b0);
- ibeta[dy*2 + 1] = SATURATE_CAST_SHORT(b1);
- }
-
- #undef SATURATE_CAST_SHORT
-
- // loop body: for every destination row, horizontally interpolate the two source rows it needs into rows0 / rows1, then blend them vertically
- Mat rowsbuf0(w, (size_t)2u); // NOTE(review): presumably w elements of 2 bytes each (it is used as short storage below) - confirm against the Mat constructor
- Mat rowsbuf1(w, (size_t)2u);
- short* rows0 = (short*)rowsbuf0.data; // horizontally interpolated source row sy
- short* rows1 = (short*)rowsbuf1.data; // horizontally interpolated source row sy + 1
-
- int prev_sy1 = -2; // source row used by the previous destination row; -2 sends the first iteration to the two-row branch for any sy >= 0
-
- for (int dy = 0; dy < h; dy++ )
- {
- int sy = yofs[dy];
-
- if (sy == prev_sy1)
- {
- // reuse all rows: same source row pair as the previous destination row
- }
- else if (sy == prev_sy1 + 1)
- {
- // hresize one row: the old rows1 (source row sy) becomes rows0, only source row sy + 1 is interpolated
- short* rows0_old = rows0;
- rows0 = rows1;
- rows1 = rows0_old;
- const unsigned char *S1 = src + srcstride * (sy+1);
-
- const short* ialphap = ialpha;
- short* rows1p = rows1;
- for ( int dx = 0; dx < w; dx++ )
- {
- int sx = xofs[dx];
- short a0 = ialphap[0];
- short a1 = ialphap[1];
-
- const unsigned char* S1p = S1 + sx;
- rows1p[dx] = (S1p[0]*a0 + S1p[1]*a1) >> 4; // 11 fractional bits reduced to 7 so the value is kept in a short
-
- ialphap += 2;
- }
- }
- else
- {
- // hresize two rows: nothing can be reused, interpolate source rows sy and sy + 1
- const unsigned char *S0 = src + srcstride * (sy);
- const unsigned char *S1 = src + srcstride * (sy+1);
-
- const short* ialphap = ialpha;
- short* rows0p = rows0;
- short* rows1p = rows1;
- for ( int dx = 0; dx < w; dx++ )
- {
- int sx = xofs[dx];
- short a0 = ialphap[0];
- short a1 = ialphap[1];
-
- const unsigned char* S0p = S0 + sx;
- const unsigned char* S1p = S1 + sx;
- rows0p[dx] = (S0p[0]*a0 + S0p[1]*a1) >> 4; // 11 fractional bits reduced to 7 so the value is kept in a short
- rows1p[dx] = (S1p[0]*a0 + S1p[1]*a1) >> 4;
-
- ialphap += 2;
- }
- }
-
- prev_sy1 = sy;
-
- // vresize: blend rows0 and rows1 with this row's vertical weights into the destination row
- short b0 = ibeta[0]; // ibeta is advanced by 2 at the end of every iteration, so [0] / [1] always belong to row dy
- short b1 = ibeta[1];
-
- short* rows0p = rows0;
- short* rows1p = rows1;
- unsigned char* Dp = dst + stride * (dy);
-
- #if __ARM_NEON
- int nn = w >> 3; // number of 8-element groups handled by the NEON code
- #else
- int nn = 0;
- #endif
- int remain = w - (nn << 3); // elements left for the scalar loop
-
- #if __ARM_NEON
- #if __aarch64__
- int16x4_t _b0 = vdup_n_s16(b0);
- int16x4_t _b1 = vdup_n_s16(b1);
- int32x4_t _v2 = vdupq_n_s32(2); // rounding constant added before the final >> 2
- for (; nn>0; nn--)
- {
- int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
- int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
- int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p+4);
- int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p+4);
-
- int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0); // widening 16 x 16 -> 32 bit products
- int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
- int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
- int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
-
- int32x4_t _acc = _v2;
- _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16); // acc += product >> 16
- _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
-
- int32x4_t _acc_1 = _v2;
- _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
- _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
-
- int16x4_t _acc16 = vshrn_n_s32(_acc, 2); // final >> 2 while narrowing to 16 bits
- int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
-
- uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1)); // saturate to unsigned 8 bit
-
- vst1_u8(Dp, _D);
-
- Dp += 8;
- rows0p += 8;
- rows1p += 8;
- }
- #else
- if (nn > 0) // 32-bit ARM NEON loop (vmull, vsra #16, vshrn #2, vqmovun); the loads for the following iteration are issued inside the loop, so both row pointers end 16 bytes too far and are rewound by the two trailing sub instructions. NOTE(review): this reads 16 bytes past the last processed group and uses :128 alignment hints - confirm the row buffers guarantee both.
- {
- asm volatile(
- "vdup.s16 d16, %8 \n"
- "mov r4, #2 \n"
- "vdup.s16 d17, %9 \n"
- "vdup.s32 q12, r4 \n"
- "pld [%0, #128] \n"
- "vld1.s16 {d2-d3}, [%0 :128]!\n"
- "pld [%1, #128] \n"
- "vld1.s16 {d6-d7}, [%1 :128]!\n"
- "0: \n"
- "vmull.s16 q0, d2, d16 \n"
- "vmull.s16 q1, d3, d16 \n"
- "vorr.s32 q10, q12, q12 \n"
- "vorr.s32 q11, q12, q12 \n"
- "vmull.s16 q2, d6, d17 \n"
- "vmull.s16 q3, d7, d17 \n"
- "vsra.s32 q10, q0, #16 \n"
- "vsra.s32 q11, q1, #16 \n"
- "pld [%0, #128] \n"
- "vld1.s16 {d2-d3}, [%0 :128]!\n"
- "vsra.s32 q10, q2, #16 \n"
- "vsra.s32 q11, q3, #16 \n"
- "pld [%1, #128] \n"
- "vld1.s16 {d6-d7}, [%1 :128]!\n"
- "vshrn.s32 d20, q10, #2 \n"
- "vshrn.s32 d21, q11, #2 \n"
- "vqmovun.s16 d20, q10 \n"
- "vst1.8 {d20}, [%2]! \n"
- "subs %3, #1 \n"
- "bne 0b \n"
- "sub %0, #16 \n"
- "sub %1, #16 \n"
- : "=r"(rows0p), // %0
- "=r"(rows1p), // %1
- "=r"(Dp), // %2
- "=r"(nn) // %3
- : "0"(rows0p),
- "1"(rows1p),
- "2"(Dp),
- "3"(nn),
- "r"(b0), // %8
- "r"(b1) // %9
- : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12"
- );
- }
- #endif // __aarch64__
- #endif // __ARM_NEON
- for ( ; remain; --remain ) // scalar path for the elements not covered by the NEON code
- {
- // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
- *Dp++ = (unsigned char)(( (short)((b0 * (short)(*rows0p++)) >> 16) + (short)((b1 * (short)(*rows1p++)) >> 16) + 2)>>2); // each product is shifted by 16 on its own, then the sum is rounded (+2) and shifted by 2: 18 bits in total = 7 (rows) + 11 (weights)
- }
-
- ibeta += 2; // advance to the weight pair of the next destination row
- }
-
- delete[] buf;
- }
-
- void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) // Bilinear resize of a 2-bytes-per-pixel interleaved image (srcw x srch -> w x h pixels) using fixed-point weights; srcstride / stride are the byte steps between consecutive source / destination rows.
- { // NOTE(review): when srcw == 1 or srch == 1 the border clamp below yields index -1, so the pixel / row just before src is read (with weight 0) - confirm callers never pass such sizes.
- const int INTER_RESIZE_COEF_BITS=11; // fractional bits of the fixed-point interpolation weights
- const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS; // fixed-point value representing a weight of 1.0 (2048)
- // const int ONE=INTER_RESIZE_COEF_SCALE;
-
- double scale_x = (double)srcw / w; // source columns per destination column
- double scale_y = (double)srch / h; // source rows per destination row
-
- int* buf = new int[w + h + w + h]; // one allocation shared by the four tables below; the short tables rely on two shorts fitting in one int
-
- int* xofs = buf;//new int[w]; byte offset of the left source pixel for every destination x
- int* yofs = buf + w;//new int[h]; upper source row for every destination y
-
- short* ialpha = (short*)(buf + w + h);//new short[w * 2]; (left, right) weight pair per destination x
- short* ibeta = (short*)(buf + w + h + w);//new short[h * 2]; (upper, lower) weight pair per destination y
-
- float fx;
- float fy;
- int sx;
- int sy;
-
- #define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX);
-
- for (int dx = 0; dx < w; dx++) // build the horizontal offset / weight tables
- {
- fx = (float)((dx + 0.5) * scale_x - 0.5); // source x coordinate of the centre of destination pixel dx
- sx = static_cast<int>(floor(fx)); // left neighbour
- fx -= sx; // fractional distance from the left neighbour
-
- if (sx < 0) // left border: take column 0 with full weight
- {
- sx = 0;
- fx = 0.f;
- }
- if (sx >= srcw - 1) // right border: use the last two columns with full weight on the right one
- {
- sx = srcw - 2;
- fx = 1.f;
- }
-
- xofs[dx] = sx*2; // two bytes per pixel
-
- float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE; // weight of the left pixel
- float a1 = fx * INTER_RESIZE_COEF_SCALE; // weight of the right pixel
-
- ialpha[dx*2 ] = SATURATE_CAST_SHORT(a0); // macro rounds half away from zero and clamps to the short range
- ialpha[dx*2 + 1] = SATURATE_CAST_SHORT(a1);
- }
-
- for (int dy = 0; dy < h; dy++) // build the vertical offset / weight tables the same way
- {
- fy = (float)((dy + 0.5) * scale_y - 0.5); // source y coordinate of the centre of destination row dy
- sy = static_cast<int>(floor(fy)); // upper neighbour
- fy -= sy; // fractional distance from the upper neighbour
-
- if (sy < 0) // top border: take row 0 with full weight
- {
- sy = 0;
- fy = 0.f;
- }
- if (sy >= srch - 1) // bottom border: use the last two rows with full weight on the lower one
- {
- sy = srch - 2;
- fy = 1.f;
- }
-
- yofs[dy] = sy;
-
- float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE; // weight of the upper row
- float b1 = fy * INTER_RESIZE_COEF_SCALE; // weight of the lower row
-
- ibeta[dy*2 ] = SATURATE_CAST_SHORT(b0);
- ibeta[dy*2 + 1] = SATURATE_CAST_SHORT(b1);
- }
-
- #undef SATURATE_CAST_SHORT
-
- // loop body: for every destination row, horizontally interpolate the two source rows it needs into rows0 / rows1, then blend them vertically
- Mat rowsbuf0(w*2+2, (size_t)2u); // 2 elements more than w*2: the NEON hresize code stores 4 shorts per pixel while advancing by only 2. NOTE(review): presumably (element count, element size in bytes) - confirm against the Mat constructor
- Mat rowsbuf1(w*2+2, (size_t)2u);
- short* rows0 = (short*)rowsbuf0.data; // horizontally interpolated source row sy
- short* rows1 = (short*)rowsbuf1.data; // horizontally interpolated source row sy + 1
-
- int prev_sy1 = -2; // source row used by the previous destination row; -2 sends the first iteration to the two-row branch for any sy >= 0
-
- for (int dy = 0; dy < h; dy++ )
- {
- int sy = yofs[dy];
-
- if (sy == prev_sy1)
- {
- // reuse all rows: same source row pair as the previous destination row
- }
- else if (sy == prev_sy1 + 1)
- {
- // hresize one row: the old rows1 (source row sy) becomes rows0, only source row sy + 1 is interpolated
- short* rows0_old = rows0;
- rows0 = rows1;
- rows1 = rows0_old;
- const unsigned char *S1 = src + srcstride * (sy+1);
-
- const short* ialphap = ialpha;
- short* rows1p = rows1;
- for ( int dx = 0; dx < w; dx++ )
- {
- int sx = xofs[dx];
-
- const unsigned char* S1p = S1 + sx;
- #if __ARM_NEON
- int16x4_t _a0a1XX = vld1_s16(ialphap); // loads 4 shorts, only the first two (a0, a1) are used
- int16x4_t _a0a0a1a1 = vzip_s16(_a0a1XX, _a0a1XX).val[0]; // lanes: a0 a0 a1 a1
- uint8x8_t _S1 = uint8x8_t();
-
- _S1 = vld1_lane_u8(S1p, _S1, 0); // bytes 0-1: left pixel, bytes 2-3: right pixel
- _S1 = vld1_lane_u8(S1p+1, _S1, 1);
- _S1 = vld1_lane_u8(S1p+2, _S1, 2);
- _S1 = vld1_lane_u8(S1p+3, _S1, 3);
-
- int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
- int16x4_t _S1lowhigh = vget_low_s16(_S116);
- int32x4_t _S1ma0a1 = vmull_s16(_S1lowhigh, _a0a0a1a1); // lanes: left*a0 (2 channels), right*a1 (2 channels)
- int32x2_t _rows1low = vadd_s32(vget_low_s32(_S1ma0a1), vget_high_s32(_S1ma0a1)); // per channel: left*a0 + right*a1
- int32x4_t _rows1 = vcombine_s32(_rows1low, vget_high_s32(_S1ma0a1)); // upper two lanes are filler, overwritten by the next pixel's store
- int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4); // >> 4: 11 fractional bits reduced to 7
- vst1_s16(rows1p, _rows1_sr4); // writes 4 shorts although only 2 are meaningful
- #else
- short a0 = ialphap[0];
- short a1 = ialphap[1];
-
- rows1p[0] = (S1p[0]*a0 + S1p[2]*a1) >> 4; // channel 0: left pixel at +0, right pixel at +2
- rows1p[1] = (S1p[1]*a0 + S1p[3]*a1) >> 4; // channel 1
- #endif // __ARM_NEON
-
- ialphap += 2;
- rows1p += 2;
- }
- }
- else
- {
- // hresize two rows: nothing can be reused, interpolate source rows sy and sy + 1
- const unsigned char *S0 = src + srcstride * (sy);
- const unsigned char *S1 = src + srcstride * (sy+1);
-
- const short* ialphap = ialpha;
- short* rows0p = rows0;
- short* rows1p = rows1;
- for ( int dx = 0; dx < w; dx++ )
- {
- int sx = xofs[dx];
- short a0 = ialphap[0];
- short a1 = ialphap[1];
-
- const unsigned char* S0p = S0 + sx;
- const unsigned char* S1p = S1 + sx;
- #if __ARM_NEON
- int16x4_t _a0 = vdup_n_s16(a0);
- int16x4_t _a1 = vdup_n_s16(a1);
- uint8x8_t _S0 = uint8x8_t();
- uint8x8_t _S1 = uint8x8_t();
-
- _S0 = vld1_lane_u8(S0p, _S0, 0); // bytes 0-1: left pixel, bytes 2-3: right pixel (upper row)
- _S0 = vld1_lane_u8(S0p+1, _S0, 1);
- _S0 = vld1_lane_u8(S0p+2, _S0, 2);
- _S0 = vld1_lane_u8(S0p+3, _S0, 3);
-
- _S1 = vld1_lane_u8(S1p, _S1, 0); // same layout for the lower row
- _S1 = vld1_lane_u8(S1p+1, _S1, 1);
- _S1 = vld1_lane_u8(S1p+2, _S1, 2);
- _S1 = vld1_lane_u8(S1p+3, _S1, 3);
-
- int16x8_t _S016 = vreinterpretq_s16_u16(vmovl_u8(_S0));
- int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
- int16x4_t _S0lowhigh = vget_low_s16(_S016);
- int16x4_t _S1lowhigh = vget_low_s16(_S116);
- int32x2x2_t _S0S1low_S0S1high = vtrn_s32(vreinterpret_s32_s16(_S0lowhigh), vreinterpret_s32_s16(_S1lowhigh)); // val[0] = left pixels of both rows, val[1] = right pixels of both rows
- int32x4_t _rows01 = vmull_s16(vreinterpret_s16_s32(_S0S1low_S0S1high.val[0]), _a0);
- _rows01 = vmlal_s16(_rows01, vreinterpret_s16_s32(_S0S1low_S0S1high.val[1]), _a1); // lanes 0-1: upper row result, lanes 2-3: lower row result
- int16x4_t _rows01_sr4 = vshrn_n_s32(_rows01, 4); // >> 4: 11 fractional bits reduced to 7
- int16x4_t _rows1_sr4 = vext_s16(_rows01_sr4, _rows01_sr4, 2); // rotate so the lower row result sits in lanes 0-1
- vst1_s16(rows0p, _rows01_sr4); // writes 4 shorts although only the first 2 are meaningful
- vst1_s16(rows1p, _rows1_sr4);
- #else
- rows0p[0] = (S0p[0]*a0 + S0p[2]*a1) >> 4; // channel 0: left pixel at +0, right pixel at +2
- rows0p[1] = (S0p[1]*a0 + S0p[3]*a1) >> 4; // channel 1
- rows1p[0] = (S1p[0]*a0 + S1p[2]*a1) >> 4;
- rows1p[1] = (S1p[1]*a0 + S1p[3]*a1) >> 4;
- #endif // __ARM_NEON
-
- ialphap += 2;
- rows0p += 2;
- rows1p += 2;
- }
- }
-
- prev_sy1 = sy;
-
- // vresize: blend rows0 and rows1 with this row's vertical weights into the destination row
- short b0 = ibeta[0]; // ibeta is advanced by 2 at the end of every iteration, so [0] / [1] always belong to row dy
- short b1 = ibeta[1];
-
- short* rows0p = rows0;
- short* rows1p = rows1;
- unsigned char* Dp = dst + stride * (dy);
-
- #if __ARM_NEON
- int nn = (w * 2) >> 3; // number of 8-element groups handled by the NEON code (w * 2 elements per row)
- #else
- int nn = 0;
- #endif
- int remain = (w * 2) - (nn << 3); // elements left for the scalar loop
-
- #if __ARM_NEON
- #if __aarch64__
- int16x4_t _b0 = vdup_n_s16(b0);
- int16x4_t _b1 = vdup_n_s16(b1);
- int32x4_t _v2 = vdupq_n_s32(2); // rounding constant added before the final >> 2
- for (; nn>0; nn--)
- {
- int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
- int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
- int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p+4);
- int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p+4);
-
- int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0); // widening 16 x 16 -> 32 bit products
- int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
- int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
- int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
-
- int32x4_t _acc = _v2;
- _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16); // acc += product >> 16
- _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
-
- int32x4_t _acc_1 = _v2;
- _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
- _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
-
- int16x4_t _acc16 = vshrn_n_s32(_acc, 2); // final >> 2 while narrowing to 16 bits
- int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
-
- uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1)); // saturate to unsigned 8 bit
-
- vst1_u8(Dp, _D);
-
- Dp += 8;
- rows0p += 8;
- rows1p += 8;
- }
- #else
- if (nn > 0) // 32-bit ARM NEON loop (vmull, vsra #16, vshrn #2, vqmovun); the loads for the following iteration are issued inside the loop, so both row pointers end 16 bytes too far and are rewound by the two trailing sub instructions. NOTE(review): this reads 16 bytes past the last processed group and uses :128 alignment hints - confirm the row buffers guarantee both.
- {
- asm volatile(
- "vdup.s16 d16, %8 \n"
- "mov r4, #2 \n"
- "vdup.s16 d17, %9 \n"
- "vdup.s32 q12, r4 \n"
- "pld [%0, #128] \n"
- "vld1.s16 {d2-d3}, [%0 :128]!\n"
- "pld [%1, #128] \n"
- "vld1.s16 {d6-d7}, [%1 :128]!\n"
- "0: \n"
- "vmull.s16 q0, d2, d16 \n"
- "vmull.s16 q1, d3, d16 \n"
- "vorr.s32 q10, q12, q12 \n"
- "vorr.s32 q11, q12, q12 \n"
- "vmull.s16 q2, d6, d17 \n"
- "vmull.s16 q3, d7, d17 \n"
- "vsra.s32 q10, q0, #16 \n"
- "vsra.s32 q11, q1, #16 \n"
- "pld [%0, #128] \n"
- "vld1.s16 {d2-d3}, [%0 :128]!\n"
- "vsra.s32 q10, q2, #16 \n"
- "vsra.s32 q11, q3, #16 \n"
- "pld [%1, #128] \n"
- "vld1.s16 {d6-d7}, [%1 :128]!\n"
- "vshrn.s32 d20, q10, #2 \n"
- "vshrn.s32 d21, q11, #2 \n"
- "vqmovun.s16 d20, q10 \n"
- "vst1.8 {d20}, [%2]! \n"
- "subs %3, #1 \n"
- "bne 0b \n"
- "sub %0, #16 \n"
- "sub %1, #16 \n"
- : "=r"(rows0p), // %0
- "=r"(rows1p), // %1
- "=r"(Dp), // %2
- "=r"(nn) // %3
- : "0"(rows0p),
- "1"(rows1p),
- "2"(Dp),
- "3"(nn),
- "r"(b0), // %8
- "r"(b1) // %9
- : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12"
- );
- }
- #endif // __aarch64__
- #endif // __ARM_NEON
- for ( ; remain; --remain ) // scalar path for the elements not covered by the NEON code
- {
- // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
- *Dp++ = (unsigned char)(( (short)((b0 * (short)(*rows0p++)) >> 16) + (short)((b1 * (short)(*rows1p++)) >> 16) + 2)>>2); // each product is shifted by 16 on its own, then the sum is rounded (+2) and shifted by 2: 18 bits in total = 7 (rows) + 11 (weights)
- }
-
- ibeta += 2; // advance to the weight pair of the next destination row
- }
-
- delete[] buf;
- }
-
- void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) // Bilinear resize of a 3-bytes-per-pixel interleaved image (srcw x srch -> w x h pixels) using fixed-point weights; srcstride / stride are the byte steps between consecutive source / destination rows.
- { // NOTE(review): when srcw == 1 or srch == 1 the border clamp below yields index -1, so the pixel / row just before src is read (with weight 0) - confirm callers never pass such sizes.
- const int INTER_RESIZE_COEF_BITS=11; // fractional bits of the fixed-point interpolation weights
- const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS; // fixed-point value representing a weight of 1.0 (2048)
- // const int ONE=INTER_RESIZE_COEF_SCALE;
-
- double scale_x = (double)srcw / w; // source columns per destination column
- double scale_y = (double)srch / h; // source rows per destination row
-
- int* buf = new int[w + h + w + h]; // one allocation shared by the four tables below; the short tables rely on two shorts fitting in one int
-
- int* xofs = buf;//new int[w]; byte offset of the left source pixel for every destination x
- int* yofs = buf + w;//new int[h]; upper source row for every destination y
-
- short* ialpha = (short*)(buf + w + h);//new short[w * 2]; (left, right) weight pair per destination x
- short* ibeta = (short*)(buf + w + h + w);//new short[h * 2]; (upper, lower) weight pair per destination y
-
- float fx;
- float fy;
- int sx;
- int sy;
-
- #define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX);
-
- for (int dx = 0; dx < w; dx++) // build the horizontal offset / weight tables
- {
- fx = (float)((dx + 0.5) * scale_x - 0.5); // source x coordinate of the centre of destination pixel dx
- sx = static_cast<int>(floor(fx)); // left neighbour
- fx -= sx; // fractional distance from the left neighbour
-
- if (sx < 0) // left border: take column 0 with full weight
- {
- sx = 0;
- fx = 0.f;
- }
- if (sx >= srcw - 1) // right border: use the last two columns with full weight on the right one
- {
- sx = srcw - 2;
- fx = 1.f;
- }
-
- xofs[dx] = sx*3; // three bytes per pixel
-
- float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE; // weight of the left pixel
- float a1 = fx * INTER_RESIZE_COEF_SCALE; // weight of the right pixel
-
- ialpha[dx*2 ] = SATURATE_CAST_SHORT(a0); // macro rounds half away from zero and clamps to the short range
- ialpha[dx*2 + 1] = SATURATE_CAST_SHORT(a1);
- }
-
- for (int dy = 0; dy < h; dy++) // build the vertical offset / weight tables the same way
- {
- fy = (float)((dy + 0.5) * scale_y - 0.5); // source y coordinate of the centre of destination row dy
- sy = static_cast<int>(floor(fy)); // upper neighbour
- fy -= sy; // fractional distance from the upper neighbour
-
- if (sy < 0) // top border: take row 0 with full weight
- {
- sy = 0;
- fy = 0.f;
- }
- if (sy >= srch - 1) // bottom border: use the last two rows with full weight on the lower one
- {
- sy = srch - 2;
- fy = 1.f;
- }
-
- yofs[dy] = sy;
-
- float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE; // weight of the upper row
- float b1 = fy * INTER_RESIZE_COEF_SCALE; // weight of the lower row
-
- ibeta[dy*2 ] = SATURATE_CAST_SHORT(b0);
- ibeta[dy*2 + 1] = SATURATE_CAST_SHORT(b1);
- }
-
- #undef SATURATE_CAST_SHORT
-
- // loop body: for every destination row, horizontally interpolate the two source rows it needs into rows0 / rows1, then blend them vertically
- Mat rowsbuf0(w*3+1, (size_t)2u); // 1 element more than w*3: the NEON hresize code stores 4 shorts per pixel while advancing by only 3. NOTE(review): presumably (element count, element size in bytes) - confirm against the Mat constructor
- Mat rowsbuf1(w*3+1, (size_t)2u);
- short* rows0 = (short*)rowsbuf0.data; // horizontally interpolated source row sy
- short* rows1 = (short*)rowsbuf1.data; // horizontally interpolated source row sy + 1
-
- int prev_sy1 = -2; // source row used by the previous destination row; -2 sends the first iteration to the two-row branch for any sy >= 0
-
- for (int dy = 0; dy < h; dy++ )
- {
- int sy = yofs[dy];
-
- if (sy == prev_sy1)
- {
- // reuse all rows: same source row pair as the previous destination row
- }
- else if (sy == prev_sy1 + 1)
- {
- // hresize one row: the old rows1 (source row sy) becomes rows0, only source row sy + 1 is interpolated
- short* rows0_old = rows0;
- rows0 = rows1;
- rows1 = rows0_old;
- const unsigned char *S1 = src + srcstride * (sy+1);
-
- const short* ialphap = ialpha;
- short* rows1p = rows1;
- for ( int dx = 0; dx < w; dx++ )
- {
- int sx = xofs[dx];
- short a0 = ialphap[0];
- short a1 = ialphap[1];
-
- const unsigned char* S1p = S1 + sx;
- #if __ARM_NEON
- int16x4_t _a0 = vdup_n_s16(a0);
- int16x4_t _a1 = vdup_n_s16(a1);
- uint8x8_t _S1 = uint8x8_t();
-
- _S1 = vld1_lane_u8(S1p, _S1, 0); // bytes 0-2: left pixel, bytes 3-5: right pixel; loaded lane by lane so exactly 6 bytes are read
- _S1 = vld1_lane_u8(S1p+1, _S1, 1);
- _S1 = vld1_lane_u8(S1p+2, _S1, 2);
- _S1 = vld1_lane_u8(S1p+3, _S1, 3);
- _S1 = vld1_lane_u8(S1p+4, _S1, 4);
- _S1 = vld1_lane_u8(S1p+5, _S1, 5);
-
- int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
- int16x4_t _S1low = vget_low_s16(_S116); // elements 0..3: left pixel in lanes 0-2
- int16x4_t _S1high = vext_s16(_S1low, vget_high_s16(_S116), 3); // elements 3..6: right pixel in lanes 0-2
- int32x4_t _rows1 = vmull_s16(_S1low, _a0);
- _rows1 = vmlal_s16(_rows1, _S1high, _a1); // left*a0 + right*a1 per channel; lane 3 is filler
- int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4); // >> 4: 11 fractional bits reduced to 7
- vst1_s16(rows1p, _rows1_sr4); // writes 4 shorts although only 3 are meaningful
- #else
- rows1p[0] = (S1p[0]*a0 + S1p[3]*a1) >> 4; // channel 0: left pixel at +0, right pixel at +3
- rows1p[1] = (S1p[1]*a0 + S1p[4]*a1) >> 4; // channel 1
- rows1p[2] = (S1p[2]*a0 + S1p[5]*a1) >> 4; // channel 2
- #endif // __ARM_NEON
-
- ialphap += 2;
- rows1p += 3;
- }
- }
- else
- {
- // hresize two rows: nothing can be reused, interpolate source rows sy and sy + 1
- const unsigned char *S0 = src + srcstride * (sy);
- const unsigned char *S1 = src + srcstride * (sy+1);
-
- const short* ialphap = ialpha;
- short* rows0p = rows0;
- short* rows1p = rows1;
- for ( int dx = 0; dx < w; dx++ )
- {
- int sx = xofs[dx];
- short a0 = ialphap[0];
- short a1 = ialphap[1];
-
- const unsigned char* S0p = S0 + sx;
- const unsigned char* S1p = S1 + sx;
- #if __ARM_NEON
- int16x4_t _a0 = vdup_n_s16(a0);
- int16x4_t _a1 = vdup_n_s16(a1);
- uint8x8_t _S0 = uint8x8_t();
- uint8x8_t _S1 = uint8x8_t();
-
- _S0 = vld1_lane_u8(S0p, _S0, 0); // bytes 0-2: left pixel, bytes 3-5: right pixel (upper row)
- _S0 = vld1_lane_u8(S0p+1, _S0, 1);
- _S0 = vld1_lane_u8(S0p+2, _S0, 2);
- _S0 = vld1_lane_u8(S0p+3, _S0, 3);
- _S0 = vld1_lane_u8(S0p+4, _S0, 4);
- _S0 = vld1_lane_u8(S0p+5, _S0, 5);
-
- _S1 = vld1_lane_u8(S1p, _S1, 0); // same layout for the lower row
- _S1 = vld1_lane_u8(S1p+1, _S1, 1);
- _S1 = vld1_lane_u8(S1p+2, _S1, 2);
- _S1 = vld1_lane_u8(S1p+3, _S1, 3);
- _S1 = vld1_lane_u8(S1p+4, _S1, 4);
- _S1 = vld1_lane_u8(S1p+5, _S1, 5);
-
- int16x8_t _S016 = vreinterpretq_s16_u16(vmovl_u8(_S0));
- int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
- int16x4_t _S0low = vget_low_s16(_S016); // elements 0..3: left pixel in lanes 0-2
- int16x4_t _S1low = vget_low_s16(_S116);
- int16x4_t _S0high = vext_s16(_S0low, vget_high_s16(_S016), 3); // elements 3..6: right pixel in lanes 0-2
- int16x4_t _S1high = vext_s16(_S1low, vget_high_s16(_S116), 3);
- int32x4_t _rows0 = vmull_s16(_S0low, _a0);
- int32x4_t _rows1 = vmull_s16(_S1low, _a0);
- _rows0 = vmlal_s16(_rows0, _S0high, _a1); // left*a0 + right*a1 per channel; lane 3 is filler
- _rows1 = vmlal_s16(_rows1, _S1high, _a1);
- int16x4_t _rows0_sr4 = vshrn_n_s32(_rows0, 4); // >> 4: 11 fractional bits reduced to 7
- int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
- vst1_s16(rows0p, _rows0_sr4); // writes 4 shorts although only 3 are meaningful
- vst1_s16(rows1p, _rows1_sr4);
- #else
- rows0p[0] = (S0p[0]*a0 + S0p[3]*a1) >> 4; // channel 0: left pixel at +0, right pixel at +3
- rows0p[1] = (S0p[1]*a0 + S0p[4]*a1) >> 4; // channel 1
- rows0p[2] = (S0p[2]*a0 + S0p[5]*a1) >> 4; // channel 2
- rows1p[0] = (S1p[0]*a0 + S1p[3]*a1) >> 4;
- rows1p[1] = (S1p[1]*a0 + S1p[4]*a1) >> 4;
- rows1p[2] = (S1p[2]*a0 + S1p[5]*a1) >> 4;
- #endif // __ARM_NEON
-
- ialphap += 2;
- rows0p += 3;
- rows1p += 3;
- }
- }
-
- prev_sy1 = sy;
-
- // vresize: blend rows0 and rows1 with this row's vertical weights into the destination row
- short b0 = ibeta[0]; // ibeta is advanced by 2 at the end of every iteration, so [0] / [1] always belong to row dy
- short b1 = ibeta[1];
-
- short* rows0p = rows0;
- short* rows1p = rows1;
- unsigned char* Dp = dst + stride * (dy);
-
- #if __ARM_NEON
- int nn = (w * 3) >> 3; // number of 8-element groups handled by the NEON code (w * 3 elements per row)
- #else
- int nn = 0;
- #endif
- int remain = (w * 3) - (nn << 3); // elements left for the scalar loop
-
- #if __ARM_NEON
- #if __aarch64__
- int16x4_t _b0 = vdup_n_s16(b0);
- int16x4_t _b1 = vdup_n_s16(b1);
- int32x4_t _v2 = vdupq_n_s32(2); // rounding constant added before the final >> 2
- for (; nn>0; nn--)
- {
- int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
- int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
- int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p+4);
- int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p+4);
-
- int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0); // widening 16 x 16 -> 32 bit products
- int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
- int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
- int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
-
- int32x4_t _acc = _v2;
- _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16); // acc += product >> 16
- _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
-
- int32x4_t _acc_1 = _v2;
- _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
- _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
-
- int16x4_t _acc16 = vshrn_n_s32(_acc, 2); // final >> 2 while narrowing to 16 bits
- int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
-
- uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1)); // saturate to unsigned 8 bit
-
- vst1_u8(Dp, _D);
-
- Dp += 8;
- rows0p += 8;
- rows1p += 8;
- }
- #else
- if (nn > 0) // 32-bit ARM NEON loop (vmull, vsra #16, vshrn #2, vqmovun); the loads for the following iteration are issued inside the loop, so both row pointers end 16 bytes too far and are rewound by the two trailing sub instructions. NOTE(review): this reads 16 bytes past the last processed group and uses :128 alignment hints - confirm the row buffers guarantee both.
- {
- asm volatile(
- "vdup.s16 d16, %8 \n"
- "mov r4, #2 \n"
- "vdup.s16 d17, %9 \n"
- "vdup.s32 q12, r4 \n"
- "pld [%0, #128] \n"
- "vld1.s16 {d2-d3}, [%0 :128]!\n"
- "pld [%1, #128] \n"
- "vld1.s16 {d6-d7}, [%1 :128]!\n"
- "0: \n"
- "vmull.s16 q0, d2, d16 \n"
- "vmull.s16 q1, d3, d16 \n"
- "vorr.s32 q10, q12, q12 \n"
- "vorr.s32 q11, q12, q12 \n"
- "vmull.s16 q2, d6, d17 \n"
- "vmull.s16 q3, d7, d17 \n"
- "vsra.s32 q10, q0, #16 \n"
- "vsra.s32 q11, q1, #16 \n"
- "pld [%0, #128] \n"
- "vld1.s16 {d2-d3}, [%0 :128]!\n"
- "vsra.s32 q10, q2, #16 \n"
- "vsra.s32 q11, q3, #16 \n"
- "pld [%1, #128] \n"
- "vld1.s16 {d6-d7}, [%1 :128]!\n"
- "vshrn.s32 d20, q10, #2 \n"
- "vshrn.s32 d21, q11, #2 \n"
- "vqmovun.s16 d20, q10 \n"
- "vst1.8 {d20}, [%2]! \n"
- "subs %3, #1 \n"
- "bne 0b \n"
- "sub %0, #16 \n"
- "sub %1, #16 \n"
- : "=r"(rows0p), // %0
- "=r"(rows1p), // %1
- "=r"(Dp), // %2
- "=r"(nn) // %3
- : "0"(rows0p),
- "1"(rows1p),
- "2"(Dp),
- "3"(nn),
- "r"(b0), // %8
- "r"(b1) // %9
- : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12"
- );
- }
- #endif // __aarch64__
- #endif // __ARM_NEON
- for ( ; remain; --remain ) // scalar path for the elements not covered by the NEON code
- {
- // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
- *Dp++ = (unsigned char)(( (short)((b0 * (short)(*rows0p++)) >> 16) + (short)((b1 * (short)(*rows1p++)) >> 16) + 2)>>2); // each product is shifted by 16 on its own, then the sum is rounded (+2) and shifted by 2: 18 bits in total = 7 (rows) + 11 (weights)
- }
-
- ibeta += 2; // advance to the weight pair of the next destination row
- }
-
- delete[] buf;
- }
-
- // Bilinear resize of a 4-bytes-per-pixel interleaved image using fixed-point weights.
- //
- // src / srcw / srch  : source pixels, width and height in pixels
- // srcstride          : byte step between consecutive source rows
- // dst / w / h        : destination pixels, width and height in pixels
- // stride             : byte step between consecutive destination rows
- //
- // The work is split into three phases: (1) build per-column and per-row offset / weight
- // tables, (2) horizontally interpolate the two source rows a destination row needs into
- // rows0 / rows1 (reusing the previous result when possible), (3) blend rows0 / rows1
- // vertically into the destination row.
- //
- // NOTE(review): when srcw == 1 or srch == 1 the border clamp yields index -1, so the
- // pixel / row just before src is read (with weight 0) - confirm callers never pass such sizes.
- void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
- {
- const int INTER_RESIZE_COEF_BITS=11; // fractional bits of the fixed-point interpolation weights
- const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS; // fixed-point value representing a weight of 1.0 (2048)
- // const int ONE=INTER_RESIZE_COEF_SCALE;
-
- double scale_x = (double)srcw / w; // source columns per destination column
- double scale_y = (double)srch / h; // source rows per destination row
-
- // one allocation shared by the four tables below; the short tables rely on two shorts fitting in one int
- int* buf = new int[w + h + w + h];
-
- int* xofs = buf;//new int[w]; byte offset of the left source pixel for every destination x
- int* yofs = buf + w;//new int[h]; upper source row for every destination y
-
- short* ialpha = (short*)(buf + w + h);//new short[w * 2]; (left, right) weight pair per destination x
- short* ibeta = (short*)(buf + w + h + w);//new short[h * 2]; (upper, lower) weight pair per destination y
-
- float fx;
- float fy;
- int sx;
- int sy;
-
- // rounds half away from zero and clamps to the short range (the expansion carries its own trailing semicolon)
- #define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX);
-
- // horizontal offset / weight tables
- for (int dx = 0; dx < w; dx++)
- {
- fx = (float)((dx + 0.5) * scale_x - 0.5); // source x coordinate of the centre of destination pixel dx
- sx = static_cast<int>(floor(fx)); // left neighbour
- fx -= sx; // fractional distance from the left neighbour
-
- if (sx < 0) // left border: take column 0 with full weight
- {
- sx = 0;
- fx = 0.f;
- }
- if (sx >= srcw - 1) // right border: use the last two columns with full weight on the right one
- {
- sx = srcw - 2;
- fx = 1.f;
- }
-
- xofs[dx] = sx*4; // four bytes per pixel
-
- float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE; // weight of the left pixel
- float a1 = fx * INTER_RESIZE_COEF_SCALE; // weight of the right pixel
-
- ialpha[dx*2 ] = SATURATE_CAST_SHORT(a0);
- ialpha[dx*2 + 1] = SATURATE_CAST_SHORT(a1);
- }
-
- // vertical offset / weight tables, built the same way
- for (int dy = 0; dy < h; dy++)
- {
- fy = (float)((dy + 0.5) * scale_y - 0.5); // source y coordinate of the centre of destination row dy
- sy = static_cast<int>(floor(fy)); // upper neighbour
- fy -= sy; // fractional distance from the upper neighbour
-
- if (sy < 0) // top border: take row 0 with full weight
- {
- sy = 0;
- fy = 0.f;
- }
- if (sy >= srch - 1) // bottom border: use the last two rows with full weight on the lower one
- {
- sy = srch - 2;
- fy = 1.f;
- }
-
- yofs[dy] = sy;
-
- float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE; // weight of the upper row
- float b1 = fy * INTER_RESIZE_COEF_SCALE; // weight of the lower row
-
- ibeta[dy*2 ] = SATURATE_CAST_SHORT(b0);
- ibeta[dy*2 + 1] = SATURATE_CAST_SHORT(b1);
- }
-
- #undef SATURATE_CAST_SHORT
-
- // loop body
- // NOTE(review): presumably (element count, element size in bytes) - confirm against the Mat constructor
- Mat rowsbuf0(w*4, (size_t)2u);
- Mat rowsbuf1(w*4, (size_t)2u);
- short* rows0 = (short*)rowsbuf0.data; // horizontally interpolated source row sy
- short* rows1 = (short*)rowsbuf1.data; // horizontally interpolated source row sy + 1
-
- int prev_sy1 = -2; // source row used by the previous destination row; -2 sends the first iteration to the two-row branch for any sy >= 0
-
- for (int dy = 0; dy < h; dy++ )
- {
- int sy = yofs[dy];
-
- if (sy == prev_sy1)
- {
- // reuse all rows: same source row pair as the previous destination row
- }
- else if (sy == prev_sy1 + 1)
- {
- // hresize one row
- // After the swap rows0 holds source row prev_sy1 + 1, which is the row this
- // iteration needs as "upper" only when sy == prev_sy1 + 1. The step is counted in
- // rows, not bytes, so it is 1 regardless of the 4 bytes per pixel; testing for + 4
- // blended a stale row whenever the vertical step happened to be exactly 4.
- short* rows0_old = rows0;
- rows0 = rows1;
- rows1 = rows0_old;
- const unsigned char *S1 = src + srcstride * (sy+1);
-
- const short* ialphap = ialpha;
- short* rows1p = rows1;
- for ( int dx = 0; dx < w; dx++ )
- {
- int sx = xofs[dx];
- short a0 = ialphap[0];
- short a1 = ialphap[1];
-
- const unsigned char* S1p = S1 + sx;
- #if __ARM_NEON
- int16x4_t _a0 = vdup_n_s16(a0);
- int16x4_t _a1 = vdup_n_s16(a1);
- uint8x8_t _S1 = vld1_u8(S1p); // 8 bytes: left pixel followed by right pixel
- int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
- int16x4_t _S1low = vget_low_s16(_S116); // left pixel
- int16x4_t _S1high = vget_high_s16(_S116); // right pixel
- int32x4_t _rows1 = vmull_s16(_S1low, _a0);
- _rows1 = vmlal_s16(_rows1, _S1high, _a1); // left*a0 + right*a1 per channel
- int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4); // >> 4: 11 fractional bits reduced to 7
- vst1_s16(rows1p, _rows1_sr4);
- #else
- rows1p[0] = (S1p[0]*a0 + S1p[4]*a1) >> 4; // channel 0: left pixel at +0, right pixel at +4
- rows1p[1] = (S1p[1]*a0 + S1p[5]*a1) >> 4; // channel 1
- rows1p[2] = (S1p[2]*a0 + S1p[6]*a1) >> 4; // channel 2
- rows1p[3] = (S1p[3]*a0 + S1p[7]*a1) >> 4; // channel 3
- #endif // __ARM_NEON
-
- ialphap += 2;
- rows1p += 4;
- }
- }
- else
- {
- // hresize two rows: nothing can be reused, interpolate source rows sy and sy + 1
- const unsigned char *S0 = src + srcstride * (sy);
- const unsigned char *S1 = src + srcstride * (sy+1);
-
- const short* ialphap = ialpha;
- short* rows0p = rows0;
- short* rows1p = rows1;
- for ( int dx = 0; dx < w; dx++ )
- {
- int sx = xofs[dx];
- short a0 = ialphap[0];
- short a1 = ialphap[1];
-
- const unsigned char* S0p = S0 + sx;
- const unsigned char* S1p = S1 + sx;
- #if __ARM_NEON
- int16x4_t _a0 = vdup_n_s16(a0);
- int16x4_t _a1 = vdup_n_s16(a1);
- uint8x8_t _S0 = vld1_u8(S0p); // 8 bytes: left pixel followed by right pixel (upper row)
- uint8x8_t _S1 = vld1_u8(S1p); // same for the lower row
- int16x8_t _S016 = vreinterpretq_s16_u16(vmovl_u8(_S0));
- int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
- int16x4_t _S0low = vget_low_s16(_S016); // left pixels
- int16x4_t _S1low = vget_low_s16(_S116);
- int16x4_t _S0high = vget_high_s16(_S016); // right pixels
- int16x4_t _S1high = vget_high_s16(_S116);
- int32x4_t _rows0 = vmull_s16(_S0low, _a0);
- int32x4_t _rows1 = vmull_s16(_S1low, _a0);
- _rows0 = vmlal_s16(_rows0, _S0high, _a1); // left*a0 + right*a1 per channel
- _rows1 = vmlal_s16(_rows1, _S1high, _a1);
- int16x4_t _rows0_sr4 = vshrn_n_s32(_rows0, 4); // >> 4: 11 fractional bits reduced to 7
- int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
- vst1_s16(rows0p, _rows0_sr4);
- vst1_s16(rows1p, _rows1_sr4);
- #else
- rows0p[0] = (S0p[0]*a0 + S0p[4]*a1) >> 4; // channel 0: left pixel at +0, right pixel at +4
- rows0p[1] = (S0p[1]*a0 + S0p[5]*a1) >> 4; // channel 1
- rows0p[2] = (S0p[2]*a0 + S0p[6]*a1) >> 4; // channel 2
- rows0p[3] = (S0p[3]*a0 + S0p[7]*a1) >> 4; // channel 3
- rows1p[0] = (S1p[0]*a0 + S1p[4]*a1) >> 4;
- rows1p[1] = (S1p[1]*a0 + S1p[5]*a1) >> 4;
- rows1p[2] = (S1p[2]*a0 + S1p[6]*a1) >> 4;
- rows1p[3] = (S1p[3]*a0 + S1p[7]*a1) >> 4;
- #endif // __ARM_NEON
-
- ialphap += 2;
- rows0p += 4;
- rows1p += 4;
- }
- }
-
- prev_sy1 = sy;
-
- // vresize: blend rows0 and rows1 with this row's vertical weights into the destination row
- short b0 = ibeta[0]; // ibeta is advanced by 2 at the end of every iteration, so [0] / [1] always belong to row dy
- short b1 = ibeta[1];
-
- short* rows0p = rows0;
- short* rows1p = rows1;
- unsigned char* Dp = dst + stride * (dy);
-
- #if __ARM_NEON
- int nn = (w * 4) >> 3; // number of 8-element groups handled by the NEON code (w * 4 elements per row)
- #else
- int nn = 0;
- #endif
- int remain = (w * 4) - (nn << 3); // elements left for the scalar loop
-
- #if __ARM_NEON
- #if __aarch64__
- int16x4_t _b0 = vdup_n_s16(b0);
- int16x4_t _b1 = vdup_n_s16(b1);
- int32x4_t _v2 = vdupq_n_s32(2); // rounding constant added before the final >> 2
- for (; nn>0; nn--)
- {
- int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
- int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
- int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p+4);
- int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p+4);
-
- int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0); // widening 16 x 16 -> 32 bit products
- int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
- int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
- int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
-
- int32x4_t _acc = _v2;
- _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16); // acc += product >> 16
- _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
-
- int32x4_t _acc_1 = _v2;
- _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
- _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
-
- int16x4_t _acc16 = vshrn_n_s32(_acc, 2); // final >> 2 while narrowing to 16 bits
- int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
-
- uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1)); // saturate to unsigned 8 bit
-
- vst1_u8(Dp, _D);
-
- Dp += 8;
- rows0p += 8;
- rows1p += 8;
- }
- #else
- // 32-bit ARM NEON loop (vmull, vsra #16, vshrn #2, vqmovun). The loads for the following
- // iteration are issued inside the loop, so both row pointers end 16 bytes too far and are
- // rewound by the two trailing sub instructions.
- // NOTE(review): this reads 16 bytes past the last processed group and uses :128 alignment
- // hints - confirm the row buffers guarantee both.
- if (nn > 0)
- {
- asm volatile(
- "vdup.s16 d16, %8 \n"
- "mov r4, #2 \n"
- "vdup.s16 d17, %9 \n"
- "vdup.s32 q12, r4 \n"
- "pld [%0, #128] \n"
- "vld1.s16 {d2-d3}, [%0 :128]!\n"
- "pld [%1, #128] \n"
- "vld1.s16 {d6-d7}, [%1 :128]!\n"
- "0: \n"
- "vmull.s16 q0, d2, d16 \n"
- "vmull.s16 q1, d3, d16 \n"
- "vorr.s32 q10, q12, q12 \n"
- "vorr.s32 q11, q12, q12 \n"
- "vmull.s16 q2, d6, d17 \n"
- "vmull.s16 q3, d7, d17 \n"
- "vsra.s32 q10, q0, #16 \n"
- "vsra.s32 q11, q1, #16 \n"
- "pld [%0, #128] \n"
- "vld1.s16 {d2-d3}, [%0 :128]!\n"
- "vsra.s32 q10, q2, #16 \n"
- "vsra.s32 q11, q3, #16 \n"
- "pld [%1, #128] \n"
- "vld1.s16 {d6-d7}, [%1 :128]!\n"
- "vshrn.s32 d20, q10, #2 \n"
- "vshrn.s32 d21, q11, #2 \n"
- "vqmovun.s16 d20, q10 \n"
- "vst1.8 {d20}, [%2]! \n"
- "subs %3, #1 \n"
- "bne 0b \n"
- "sub %0, #16 \n"
- "sub %1, #16 \n"
- : "=r"(rows0p), // %0
- "=r"(rows1p), // %1
- "=r"(Dp), // %2
- "=r"(nn) // %3
- : "0"(rows0p),
- "1"(rows1p),
- "2"(Dp),
- "3"(nn),
- "r"(b0), // %8
- "r"(b1) // %9
- : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12"
- );
- }
- #endif // __aarch64__
- #endif // __ARM_NEON
- // scalar path for the elements not covered by the NEON code
- for ( ; remain; --remain )
- {
- // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
- // each product is shifted by 16 on its own, then the sum is rounded (+2) and shifted by 2:
- // 18 bits in total = 7 (rows) + 11 (weights)
- *Dp++ = (unsigned char)(( (short)((b0 * (short)(*rows0p++)) >> 16) + (short)((b1 * (short)(*rows1p++)) >> 16) + 2)>>2);
- }
-
- ibeta += 2; // advance to the weight pair of the next destination row
- }
-
- delete[] buf;
- }
-
- void resize_bilinear_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
- {
- // Expected by this routine (not checked at run time):
- // assert srcw % 2 == 0
- // assert srch % 2 == 0
- // assert w % 2 == 0
- // assert h % 2 == 0
-
- // First plane: srcw x srch single-byte samples at the start of the buffer.
- resize_bilinear_c1(src, srcw, srch, dst, w, h);
-
- // Second plane: starts right after the first one and is resized as a half-width,
- // half-height image of two-byte pixels.
- const int src_half_w = srcw / 2;
- const int src_half_h = srch / 2;
- const int dst_half_w = w / 2;
- const int dst_half_h = h / 2;
- resize_bilinear_c2(src + srcw * srch, src_half_w, src_half_h, dst + w * h, dst_half_w, dst_half_h);
- }
- #endif // NCNN_PIXEL
-
- } // namespace ncnn
|