From da7989bda32a6b9cc2c35652b295712dc8f010ca Mon Sep 17 00:00:00 2001 From: nihuini Date: Mon, 14 Sep 2020 19:29:02 +0800 Subject: [PATCH] yuv420sp2rgb_nv12 --- src/mat.h | 6 +- src/mat_pixel.cpp | 195 +++++++++++++++++++++++++++++++++++++++ tests/test_mat_pixel.cpp | 44 ++++++++- 3 files changed, 242 insertions(+), 3 deletions(-) diff --git a/src/mat.h b/src/mat.h index 8bea6ee8f..36176b14c 100644 --- a/src/mat.h +++ b/src/mat.h @@ -529,6 +529,8 @@ union vk_constant_type #if NCNN_PIXEL // convert yuv420sp(nv21) to rgb, the fast approximate version void yuv420sp2rgb(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb); +// convert yuv420sp(nv12) to rgb, the fast approximate version +void yuv420sp2rgb_nv12(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb); // convert yuv420sp(nv21) to rgb with half resize, the faster approximate version void yuv420sp2rgb_half(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb); // image pixel bilinear resize @@ -541,7 +543,7 @@ void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, int srcstr void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride); void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride); void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride); -// image pixel bilinear resize, convenient wrapper for yuv420sp(nv21) +// image pixel bilinear resize, convenient wrapper for yuv420sp(nv21/nv12) void resize_bilinear_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h); #endif // NCNN_PIXEL #if NCNN_PIXEL_ROTATE @@ -566,7 +568,7 @@ void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, int srcstride void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type); void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type); void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type); -// image pixel kanna rotate, convenient wrapper for yuv420sp(nv21) +// image pixel kanna rotate, convenient wrapper for yuv420sp(nv21/nv12) void kanna_rotate_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type); #endif // NCNN_PIXEL_ROTATE diff --git a/src/mat_pixel.cpp b/src/mat_pixel.cpp index aeac0d530..55ba7c780 100644 --- a/src/mat_pixel.cpp +++ b/src/mat_pixel.cpp @@ -2153,6 +2153,201 @@ void yuv420sp2rgb(const unsigned char* yuv420sp, int w, int h, unsigned char* rg } } +void yuv420sp2rgb_nv12(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb) +{ + const unsigned char* yptr = yuv420sp; + const unsigned char* uvptr = yuv420sp + w * h; + +#if __ARM_NEON + uint8x8_t _v128 = vdup_n_u8(128); + int8x8_t _v90 = vdup_n_s8(90); + int8x8_t _v46 = vdup_n_s8(46); + int8x8_t _v22 = vdup_n_s8(22); + int8x8_t _v113 = vdup_n_s8(113); +#endif // __ARM_NEON + + for (int y = 0; y < h; y += 2) + { + const unsigned char* yptr0 = yptr; + const unsigned char* yptr1 = yptr + w; + unsigned char* rgb0 = rgb; + unsigned char* rgb1 = rgb + w * 3; + +#if __ARM_NEON + int nn = w >> 3; + int remain = w - (nn << 3); +#else + int remain = w; +#endif // __ARM_NEON + +#if __ARM_NEON +#if __aarch64__ + for (; nn > 0; nn--) + { + int16x8_t _yy0 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr0), 6)); + int16x8_t _yy1 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr1), 6)); + + int8x8_t _uuvv = vreinterpret_s8_u8(vsub_u8(vld1_u8(uvptr), _v128)); + int8x8x2_t _uuuuvvvv = vtrn_s8(_uuvv, _uuvv); + int8x8_t _uu = _uuuuvvvv.val[0]; + int8x8_t _vv = _uuuuvvvv.val[1]; + + int16x8_t _r0 = vmlal_s8(_yy0, _vv, _v90); + int16x8_t _g0 = vmlsl_s8(_yy0, _vv, _v46); + _g0 = vmlsl_s8(_g0, _uu, _v22); + int16x8_t _b0 = vmlal_s8(_yy0, _uu, _v113); + + int16x8_t _r1 = vmlal_s8(_yy1, _vv, _v90); + int16x8_t _g1 = vmlsl_s8(_yy1, _vv, _v46); + _g1 = vmlsl_s8(_g1, _uu, _v22); + int16x8_t _b1 = vmlal_s8(_yy1, _uu, _v113); + + uint8x8x3_t _rgb0; + _rgb0.val[0] = vqshrun_n_s16(_r0, 6); + _rgb0.val[1] = vqshrun_n_s16(_g0, 6); + _rgb0.val[2] = vqshrun_n_s16(_b0, 6); + + uint8x8x3_t _rgb1; + _rgb1.val[0] = vqshrun_n_s16(_r1, 6); + _rgb1.val[1] = vqshrun_n_s16(_g1, 6); + _rgb1.val[2] = vqshrun_n_s16(_b1, 6); + + vst3_u8(rgb0, _rgb0); + vst3_u8(rgb1, _rgb1); + + yptr0 += 8; + yptr1 += 8; + uvptr += 8; + rgb0 += 24; + rgb1 += 24; + } +#else + if (nn > 0) + { + asm volatile( + "pld [%3, #128] \n" + "vld1.u8 {d2}, [%3]! \n" + "vsub.s8 d2, d2, %12 \n" + "0: \n" + "pld [%1, #128] \n" + "vld1.u8 {d0}, [%1]! \n" + "pld [%2, #128] \n" + "vld1.u8 {d1}, [%2]! \n" + "vshll.u8 q2, d0, #6 \n" + "vorr d3, d2, d2 \n" + "vshll.u8 q3, d1, #6 \n" + "vorr q9, q2, q2 \n" + "vtrn.s8 d2, d3 \n" + "vorr q11, q3, q3 \n" + "vmlsl.s8 q9, d3, %14 \n" + "vorr q8, q2, q2 \n" + "vmlsl.s8 q11, d3, %14 \n" + "vorr q10, q3, q3 \n" + "vmlal.s8 q8, d3, %13 \n" + "vmlal.s8 q2, d2, %16 \n" + "vmlal.s8 q10, d3, %13 \n" + "vmlsl.s8 q9, d2, %15 \n" + "vmlal.s8 q3, d2, %16 \n" + "vmlsl.s8 q11, d2, %15 \n" + "vqshrun.s16 d24, q8, #6 \n" + "vqshrun.s16 d26, q2, #6 \n" + "vqshrun.s16 d4, q10, #6 \n" + "vqshrun.s16 d25, q9, #6 \n" + "vqshrun.s16 d6, q3, #6 \n" + "vqshrun.s16 d5, q11, #6 \n" + "pld [%3, #128] \n" + "vld1.u8 {d2}, [%3]! \n" + "subs %0, #1 \n" + "vst3.u8 {d24-d26}, [%4]! \n" + "vsub.s8 d2, d2, %12 \n" + "vst3.u8 {d4-d6}, [%5]! \n" + "bne 0b \n" + "sub %3, #8 \n" + : "=r"(nn), // %0 + "=r"(yptr0), // %1 + "=r"(yptr1), // %2 + "=r"(uvptr), // %3 + "=r"(rgb0), // %4 + "=r"(rgb1) // %5 + : "0"(nn), + "1"(yptr0), + "2"(yptr1), + "3"(uvptr), + "4"(rgb0), + "5"(rgb1), + "w"(_v128), // %12 + "w"(_v90), // %13 + "w"(_v46), // %14 + "w"(_v22), // %15 + "w"(_v113) // %16 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "d26"); + } +#endif // __aarch64__ +#endif // __ARM_NEON + +#define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255); + for (; remain > 0; remain -= 2) + { + // R = 1.164 * yy + 1.596 * vv + // G = 1.164 * yy - 0.813 * vv - 0.391 * uu + // B = 1.164 * yy + 2.018 * uu + + // R = Y + (1.370705 * (V-128)) + // G = Y - (0.698001 * (V-128)) - (0.337633 * (U-128)) + // B = Y + (1.732446 * (U-128)) + + // R = ((Y << 6) + 87.72512 * (V-128)) >> 6 + // G = ((Y << 6) - 44.672064 * (V-128) - 21.608512 * (U-128)) >> 6 + // B = ((Y << 6) + 110.876544 * (U-128)) >> 6 + + // R = ((Y << 6) + 90 * (V-128)) >> 6 + // G = ((Y << 6) - 46 * (V-128) - 22 * (U-128)) >> 6 + // B = ((Y << 6) + 113 * (U-128)) >> 6 + + // R = (yy + 90 * vv) >> 6 + // G = (yy - 46 * vv - 22 * uu) >> 6 + // B = (yy + 113 * uu) >> 6 + + int u = uvptr[0] - 128; + int v = uvptr[1] - 128; + + int ruv = 90 * v; + int guv = -46 * v + -22 * u; + int buv = 113 * u; + + int y00 = yptr0[0] << 6; + rgb0[0] = SATURATE_CAST_UCHAR((y00 + ruv) >> 6); + rgb0[1] = SATURATE_CAST_UCHAR((y00 + guv) >> 6); + rgb0[2] = SATURATE_CAST_UCHAR((y00 + buv) >> 6); + + int y01 = yptr0[1] << 6; + rgb0[3] = SATURATE_CAST_UCHAR((y01 + ruv) >> 6); + rgb0[4] = SATURATE_CAST_UCHAR((y01 + guv) >> 6); + rgb0[5] = SATURATE_CAST_UCHAR((y01 + buv) >> 6); + + int y10 = yptr1[0] << 6; + rgb1[0] = SATURATE_CAST_UCHAR((y10 + ruv) >> 6); + rgb1[1] = SATURATE_CAST_UCHAR((y10 + guv) >> 6); + rgb1[2] = SATURATE_CAST_UCHAR((y10 + buv) >> 6); + + int y11 = yptr1[1] << 6; + rgb1[3] = SATURATE_CAST_UCHAR((y11 + ruv) >> 6); + rgb1[4] = SATURATE_CAST_UCHAR((y11 + guv) >> 6); + rgb1[5] = SATURATE_CAST_UCHAR((y11 + buv) >> 6); + + yptr0 += 2; + yptr1 += 2; + uvptr += 2; + rgb0 += 6; + rgb1 += 6; + } +#undef SATURATE_CAST_UCHAR + + yptr += 2 * w; + rgb += 2 * 3 * w; + } +} + void yuv420sp2rgb_half(const unsigned char* yuv, int w, int h, unsigned char* rgb) { const unsigned char* puv = yuv + w * h; diff --git a/tests/test_mat_pixel.cpp b/tests/test_mat_pixel.cpp index 64246b71d..26f0019b7 100644 --- a/tests/test_mat_pixel.cpp +++ b/tests/test_mat_pixel.cpp @@ -323,6 +323,38 @@ static int test_mat_pixel_roi_bgra(int w, int h, int roix, int roiy, int roiw, i return 0; } +static int test_mat_pixel_yuv420sp2rgb(int w, int h) +{ + ncnn::Mat nv21 = RandomMat(w, h / 2 * 3, 1); + + ncnn::Mat nv12 = nv21.clone(); + + // swap VU to UV + unsigned char* p = (unsigned char*)nv12 + w * h; + for (int i = 0; i < w * h / 4; i++) + { + unsigned char v = p[0]; + unsigned char u = p[1]; + p[0] = u; + p[1] = v; + p += 2; + } + + ncnn::Mat rgb(w, h, 3u, 3); + yuv420sp2rgb(nv21, w, h, rgb); + + ncnn::Mat rgb2(w, h, 3u, 3); + yuv420sp2rgb_nv12(nv12, w, h, rgb2); + + if (memcmp(rgb, rgb2, w * h * 3) != 0) + { + fprintf(stderr, "test_mat_pixel_yuv420sp2rgb failed w=%d h=%d\n", w, h); + return -1; + } + + return 0; +} + static int test_mat_pixel_0() { return 0 @@ -383,6 +415,15 @@ static int test_mat_pixel_5() || test_mat_pixel_roi_bgra(15, 15, 7, 3, 1, 1); } +static int test_mat_pixel_6() +{ + return 0 + || test_mat_pixel_yuv420sp2rgb(16, 16) + || test_mat_pixel_yuv420sp2rgb(12, 12) + || test_mat_pixel_yuv420sp2rgb(2, 2) + || test_mat_pixel_yuv420sp2rgb(6, 6); +} + int main() { SRAND(7767517); @@ -393,5 +434,6 @@ int main() || test_mat_pixel_2() || test_mat_pixel_3() || test_mat_pixel_4() - || test_mat_pixel_5(); + || test_mat_pixel_5() + || test_mat_pixel_6(); }