From da7989bda32a6b9cc2c35652b295712dc8f010ca Mon Sep 17 00:00:00 2001
From: nihuini <nihuini@tencent.com>
Date: Mon, 14 Sep 2020 19:29:02 +0800
Subject: [PATCH] yuv420sp2rgb_nv12

---
 src/mat.h                |   6 +-
 src/mat_pixel.cpp        | 195 +++++++++++++++++++++++++++++++++++++++
 tests/test_mat_pixel.cpp |  44 ++++++++-
 3 files changed, 242 insertions(+), 3 deletions(-)

diff --git a/src/mat.h b/src/mat.h
index 8bea6ee8f..36176b14c 100644
--- a/src/mat.h
+++ b/src/mat.h
@@ -529,6 +529,8 @@ union vk_constant_type
 #if NCNN_PIXEL
 // convert yuv420sp(nv21) to rgb, the fast approximate version
 void yuv420sp2rgb(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
+// convert yuv420sp(nv12, chroma interleaved as U,V) to packed rgb, the fast approximate version
+void yuv420sp2rgb_nv12(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
 // convert yuv420sp(nv21) to rgb with half resize, the faster approximate version
 void yuv420sp2rgb_half(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
 // image pixel bilinear resize
@@ -541,7 +543,7 @@ void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, int srcstr
 void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
 void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
 void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
-// image pixel bilinear resize, convenient wrapper for yuv420sp(nv21)
+// image pixel bilinear resize, convenient wrapper for yuv420sp(nv21/nv12)
 void resize_bilinear_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
 #endif // NCNN_PIXEL
 #if NCNN_PIXEL_ROTATE
@@ -566,7 +568,7 @@ void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, int srcstride
 void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
 void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
 void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
-// image pixel kanna rotate, convenient wrapper for yuv420sp(nv21)
+// image pixel kanna rotate, convenient wrapper for yuv420sp(nv21/nv12)
 void kanna_rotate_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
 #endif // NCNN_PIXEL_ROTATE
 
diff --git a/src/mat_pixel.cpp b/src/mat_pixel.cpp
index aeac0d530..55ba7c780 100644
--- a/src/mat_pixel.cpp
+++ b/src/mat_pixel.cpp
@@ -2153,6 +2153,201 @@ void yuv420sp2rgb(const unsigned char* yuv420sp, int w, int h, unsigned char* rg
     }
 }
 
+void yuv420sp2rgb_nv12(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb) // nv12 (w*h luma bytes, then interleaved U,V) -> packed 3-byte rgb
+{
+    const unsigned char* yptr = yuv420sp; // luma plane starts at the beginning of the buffer
+    const unsigned char* uvptr = yuv420sp + w * h; // chroma follows the w*h luma bytes; never reset per row, it only advances
+
+#if __ARM_NEON
+    uint8x8_t _v128 = vdup_n_u8(128); // bias removed from U and V
+    int8x8_t _v90 = vdup_n_s8(90); // V weight for R (coefficients are scaled by 64, see derivation below)
+    int8x8_t _v46 = vdup_n_s8(46); // V weight for G
+    int8x8_t _v22 = vdup_n_s8(22); // U weight for G
+    int8x8_t _v113 = vdup_n_s8(113); // U weight for B
+#endif // __ARM_NEON
+
+    for (int y = 0; y < h; y += 2) // two image rows per iteration, both using the same chroma row
+    {
+        const unsigned char* yptr0 = yptr; // luma of the first row of the pair
+        const unsigned char* yptr1 = yptr + w; // luma of the second row of the pair
+        unsigned char* rgb0 = rgb; // output of the first row
+        unsigned char* rgb1 = rgb + w * 3; // output of the second row, 3 bytes per pixel
+
+#if __ARM_NEON
+        int nn = w >> 3; // number of 8-pixel groups taken by the vector path
+        int remain = w - (nn << 3); // pixels left over for the scalar loop
+#else
+        int remain = w; // no vector path: the scalar loop converts the whole row
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+        for (; nn > 0; nn--) // 8 pixels of each of the two rows per iteration
+        {
+            int16x8_t _yy0 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr0), 6)); // 8 luma bytes widened to 16 bit, << 6
+            int16x8_t _yy1 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr1), 6)); // same for the second row
+
+            int8x8_t _uuvv = vreinterpret_s8_u8(vsub_u8(vld1_u8(uvptr), _v128)); // 4 U,V pairs with the 128 bias removed
+            int8x8x2_t _uuuuvvvv = vtrn_s8(_uuvv, _uuvv); // transposing the vector with itself duplicates the even lanes (U) and the odd lanes (V)
+            int8x8_t _uu = _uuuuvvvv.val[0]; // each U repeated twice, one per pixel
+            int8x8_t _vv = _uuuuvvvv.val[1]; // each V repeated twice, one per pixel
+
+            int16x8_t _r0 = vmlal_s8(_yy0, _vv, _v90); // yy + 90 * vv
+            int16x8_t _g0 = vmlsl_s8(_yy0, _vv, _v46); // yy - 46 * vv
+            _g0 = vmlsl_s8(_g0, _uu, _v22); // ... - 22 * uu
+            int16x8_t _b0 = vmlal_s8(_yy0, _uu, _v113); // yy + 113 * uu
+
+            int16x8_t _r1 = vmlal_s8(_yy1, _vv, _v90); // second row reuses the same chroma terms
+            int16x8_t _g1 = vmlsl_s8(_yy1, _vv, _v46);
+            _g1 = vmlsl_s8(_g1, _uu, _v22);
+            int16x8_t _b1 = vmlal_s8(_yy1, _uu, _v113);
+
+            uint8x8x3_t _rgb0;
+            _rgb0.val[0] = vqshrun_n_s16(_r0, 6); // >> 6, then saturating narrow to unsigned 8 bit
+            _rgb0.val[1] = vqshrun_n_s16(_g0, 6);
+            _rgb0.val[2] = vqshrun_n_s16(_b0, 6);
+
+            uint8x8x3_t _rgb1;
+            _rgb1.val[0] = vqshrun_n_s16(_r1, 6);
+            _rgb1.val[1] = vqshrun_n_s16(_g1, 6);
+            _rgb1.val[2] = vqshrun_n_s16(_b1, 6);
+
+            vst3_u8(rgb0, _rgb0); // interleaving store: r g b r g b ...
+            vst3_u8(rgb1, _rgb1);
+
+            yptr0 += 8;
+            yptr1 += 8;
+            uvptr += 8; // 8 chroma bytes = 4 U,V pairs = 8 pixels
+            rgb0 += 24;
+            rgb1 += 24;
+        }
+#else
+        if (nn > 0) // the asm loop tests its counter after the body, so it must not be entered with nn == 0
+        {
+            asm volatile(
+                "pld        [%3, #128]          \n"
+                "vld1.u8    {d2}, [%3]!         \n" // d2 = first 8 chroma bytes, loaded ahead of the loop
+                "vsub.s8    d2, d2, %12         \n" // remove the 128 bias
+                "0:                             \n"
+                "pld        [%1, #128]          \n"
+                "vld1.u8    {d0}, [%1]!         \n" // d0 = 8 luma bytes of the first row
+                "pld        [%2, #128]          \n"
+                "vld1.u8    {d1}, [%2]!         \n" // d1 = 8 luma bytes of the second row
+                "vshll.u8   q2, d0, #6          \n" // q2 = first row luma << 6
+                "vorr       d3, d2, d2          \n" // d3 = copy of the chroma bytes
+                "vshll.u8   q3, d1, #6          \n" // q3 = second row luma << 6
+                "vorr       q9, q2, q2          \n" // q9 = G accumulator, first row
+                "vtrn.s8    d2, d3              \n" // d2 = U duplicated per pixel, d3 = V duplicated per pixel
+                "vorr       q11, q3, q3         \n" // q11 = G accumulator, second row
+                "vmlsl.s8   q9, d3, %14         \n" // G -= 46 * vv
+                "vorr       q8, q2, q2          \n" // q8 = R accumulator, first row
+                "vmlsl.s8   q11, d3, %14        \n"
+                "vorr       q10, q3, q3         \n" // q10 = R accumulator, second row
+                "vmlal.s8   q8, d3, %13         \n" // R += 90 * vv
+                "vmlal.s8   q2, d2, %16         \n" // B += 113 * uu, accumulated in place on the luma register
+                "vmlal.s8   q10, d3, %13        \n"
+                "vmlsl.s8   q9, d2, %15         \n" // G -= 22 * uu
+                "vmlal.s8   q3, d2, %16         \n"
+                "vmlsl.s8   q11, d2, %15        \n"
+                "vqshrun.s16 d24, q8, #6        \n" // first row R, >> 6 with unsigned saturation
+                "vqshrun.s16 d26, q2, #6        \n" // first row B
+                "vqshrun.s16 d4, q10, #6        \n" // second row R
+                "vqshrun.s16 d25, q9, #6        \n" // first row G
+                "vqshrun.s16 d6, q3, #6         \n" // second row B
+                "vqshrun.s16 d5, q11, #6        \n" // second row G
+                "pld        [%3, #128]          \n"
+                "vld1.u8    {d2}, [%3]!         \n" // chroma for the NEXT group is loaded before the loop test; NOTE(review): on the last group this reads 8 bytes past what is consumed - confirm the buffer tolerates it
+                "subs       %0, #1              \n"
+                "vst3.u8    {d24-d26}, [%4]!    \n" // interleaved r,g,b store, first row
+                "vsub.s8    d2, d2, %12         \n"
+                "vst3.u8    {d4-d6}, [%5]!      \n" // interleaved r,g,b store, second row
+                "bne        0b                  \n"
+                "sub        %3, #8              \n" // undo the pointer advance of the final look-ahead chroma load
+                : "=r"(nn),    // %0
+                "=r"(yptr0), // %1
+                "=r"(yptr1), // %2
+                "=r"(uvptr), // %3
+                "=r"(rgb0),  // %4
+                "=r"(rgb1)   // %5
+                : "0"(nn),
+                "1"(yptr0),
+                "2"(yptr1),
+                "3"(uvptr),
+                "4"(rgb0),
+                "5"(rgb1),
+                "w"(_v128), // %12
+                "w"(_v90),  // %13
+                "w"(_v46),  // %14
+                "w"(_v22),  // %15
+                "w"(_v113)  // %16
+                : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "d26");
+        }
+#endif // __aarch64__
+#endif // __ARM_NEON
+
+#define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255); // clamp to 0..255; NOTE(review): the body ends in ';' so every use below expands to an extra empty statement
+        for (; remain > 0; remain -= 2) // scalar path: one U,V pair colours a 2x2 pixel block per iteration
+        {
+            // R = 1.164 * yy + 1.596 * vv
+            // G = 1.164 * yy - 0.813 * vv - 0.391 * uu
+            // B = 1.164 * yy              + 2.018 * uu
+
+            // R = Y + (1.370705 * (V-128))
+            // G = Y - (0.698001 * (V-128)) - (0.337633 * (U-128))
+            // B = Y + (1.732446 * (U-128))
+
+            // R = ((Y << 6) + 87.72512 * (V-128)) >> 6
+            // G = ((Y << 6) - 44.672064 * (V-128) - 21.608512 * (U-128)) >> 6
+            // B = ((Y << 6) + 110.876544 * (U-128)) >> 6
+
+            // R = ((Y << 6) + 90 * (V-128)) >> 6
+            // G = ((Y << 6) - 46 * (V-128) - 22 * (U-128)) >> 6
+            // B = ((Y << 6) + 113 * (U-128)) >> 6
+
+            // R = (yy + 90 * vv) >> 6
+            // G = (yy - 46 * vv - 22 * uu) >> 6
+            // B = (yy + 113 * uu) >> 6
+
+            int u = uvptr[0] - 128; // nv12 byte order: U first ...
+            int v = uvptr[1] - 128; // ... then V
+
+            int ruv = 90 * v; // chroma contributions are computed once and reused for all four pixels
+            int guv = -46 * v + -22 * u;
+            int buv = 113 * u;
+
+            int y00 = yptr0[0] << 6; // first row, left pixel
+            rgb0[0] = SATURATE_CAST_UCHAR((y00 + ruv) >> 6);
+            rgb0[1] = SATURATE_CAST_UCHAR((y00 + guv) >> 6);
+            rgb0[2] = SATURATE_CAST_UCHAR((y00 + buv) >> 6);
+
+            int y01 = yptr0[1] << 6; // first row, right pixel
+            rgb0[3] = SATURATE_CAST_UCHAR((y01 + ruv) >> 6);
+            rgb0[4] = SATURATE_CAST_UCHAR((y01 + guv) >> 6);
+            rgb0[5] = SATURATE_CAST_UCHAR((y01 + buv) >> 6);
+
+            int y10 = yptr1[0] << 6; // second row, left pixel
+            rgb1[0] = SATURATE_CAST_UCHAR((y10 + ruv) >> 6);
+            rgb1[1] = SATURATE_CAST_UCHAR((y10 + guv) >> 6);
+            rgb1[2] = SATURATE_CAST_UCHAR((y10 + buv) >> 6);
+
+            int y11 = yptr1[1] << 6; // second row, right pixel
+            rgb1[3] = SATURATE_CAST_UCHAR((y11 + ruv) >> 6);
+            rgb1[4] = SATURATE_CAST_UCHAR((y11 + guv) >> 6);
+            rgb1[5] = SATURATE_CAST_UCHAR((y11 + buv) >> 6);
+
+            yptr0 += 2;
+            yptr1 += 2;
+            uvptr += 2;
+            rgb0 += 6;
+            rgb1 += 6;
+        }
+#undef SATURATE_CAST_UCHAR
+
+        yptr += 2 * w; // skip the two luma rows just converted
+        rgb += 2 * 3 * w; // skip the two output rows just written
+    }
+}
+
 void yuv420sp2rgb_half(const unsigned char* yuv, int w, int h, unsigned char* rgb)
 {
     const unsigned char* puv = yuv + w * h;
diff --git a/tests/test_mat_pixel.cpp b/tests/test_mat_pixel.cpp
index 64246b71d..26f0019b7 100644
--- a/tests/test_mat_pixel.cpp
+++ b/tests/test_mat_pixel.cpp
@@ -323,6 +323,38 @@ static int test_mat_pixel_roi_bgra(int w, int h, int roix, int roiy, int roiw, i
     return 0;
 }
 
+static int test_mat_pixel_yuv420sp2rgb(int w, int h) // compares yuv420sp2rgb_nv12 with yuv420sp2rgb on the same image with chroma bytes swapped; 0 on match, -1 on mismatch
+{
+    ncnn::Mat nv21 = RandomMat(w, h / 2 * 3, 1); // used as a w x h yuv420sp buffer (h luma rows + h/2 chroma rows); RandomMat is defined outside this patch, presumably random-filled - TODO confirm
+
+    ncnn::Mat nv12 = nv21.clone(); // presumably an independent copy, so the swap below leaves nv21 untouched - TODO confirm
+
+    // swap VU to UV
+    unsigned char* p = (unsigned char*)nv12 + w * h; // first chroma byte, right after the w*h luma bytes
+    for (int i = 0; i < w * h / 4; i++) // one chroma pair per 2x2 pixel block
+    {
+        unsigned char v = p[0];
+        unsigned char u = p[1];
+        p[0] = u;
+        p[1] = v;
+        p += 2;
+    }
+
+    ncnn::Mat rgb(w, h, 3u, 3); // output buffer; the memcmp below relies on it holding w*h*3 bytes
+    yuv420sp2rgb(nv21, w, h, rgb); // reference result from the existing nv21 converter
+
+    ncnn::Mat rgb2(w, h, 3u, 3);
+    yuv420sp2rgb_nv12(nv12, w, h, rgb2); // result under test
+
+    if (memcmp(rgb, rgb2, w * h * 3) != 0) // outputs must be byte-for-byte identical
+    {
+        fprintf(stderr, "test_mat_pixel_yuv420sp2rgb failed w=%d h=%d\n", w, h);
+        return -1;
+    }
+
+    return 0;
+}
+
 static int test_mat_pixel_0()
 {
     return 0
@@ -383,6 +415,15 @@ static int test_mat_pixel_5()
            || test_mat_pixel_roi_bgra(15, 15, 7, 3, 1, 1);
 }
 
+static int test_mat_pixel_6() // runs the nv12-vs-nv21 comparison for several even sizes; 0 if all pass, 1 otherwise
+{
+    return 0
+           || test_mat_pixel_yuv420sp2rgb(16, 16) // width is exactly two groups of 8 pixels
+           || test_mat_pixel_yuv420sp2rgb(12, 12) // one group of 8 plus 4 leftover pixels
+           || test_mat_pixel_yuv420sp2rgb(2, 2) // fewer than 8 pixels per row
+           || test_mat_pixel_yuv420sp2rgb(6, 6); // || short-circuits, so cases after the first failure are not run
+}
+
 int main()
 {
     SRAND(7767517);
@@ -393,5 +434,6 @@ int main()
            || test_mat_pixel_2()
            || test_mat_pixel_3()
            || test_mat_pixel_4()
-           || test_mat_pixel_5();
+           || test_mat_pixel_5()
+           || test_mat_pixel_6();
 }