GitOrigin-RevId: d50415b236
tags/v1.7.0
@@ -748,11 +748,16 @@ void cvt_BT601_yuv_transform(const Mat8u& src, Mat8u& dst) {
}  // namespace

template <bool rgb = true>
void cvt_rgb2gray_32f_neon(const Mat32f& src, Mat32f& dst) {
    static const float coef[] = {0.299f, 0.587f, 0.114f};
    // load coef into neon types
    const float32x4_t v_cr(vdupq_n_f32(coef[0])), v_cg(vdupq_n_f32(coef[1])),
            v_cb(vdupq_n_f32(coef[2]));
    float coef_c0 = rgb ? coef[0] : coef[2];
    float coef_c1 = coef[1];
    float coef_c2 = rgb ? coef[2] : coef[0];
    const float32x4_t v_cr(vdupq_n_f32(coef_c0)), v_cg(vdupq_n_f32(coef_c1)),
            v_cb(vdupq_n_f32(coef_c2));
#define EXPAND(offset)                    \
    v_src = vld3q_f32(psrc + offset * 3); \
@@ -796,7 +801,7 @@ void cvt_rgb2gray_32f_neon(const Mat32f& src, Mat32f& dst) {
    }
    // loop over left pixels
    for (; psrc < pend; psrc += 3, pdst += 1) {
        *pdst = psrc[0] * coef[0] + psrc[1] * coef[1] + psrc[2] * coef[2];
        *pdst = psrc[0] * coef_c0 + psrc[1] * coef_c1 + psrc[2] * coef_c2;
    }
}
#undef EXPAND
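For reference, the per-pixel math this templated NEON path computes is the BT.601 luma formula, with the R and B weights swapped when the input is BGR. A minimal scalar sketch of the same idea (hypothetical helper, not part of this patch):

// Hypothetical scalar reference for the templated conversion above:
// Y = 0.299*R + 0.587*G + 0.114*B, channel 0/2 weights swapped for BGR input.
template <bool rgb = true>
void cvt_to_gray_32f_scalar(const float* src, float* dst, size_t pixels) {
    const float c0 = rgb ? 0.299f : 0.114f;  // weight of channel 0 (R or B)
    const float c1 = 0.587f;                 // weight of G
    const float c2 = rgb ? 0.114f : 0.299f;  // weight of channel 2 (B or R)
    for (size_t i = 0; i < pixels; ++i, src += 3, dst += 1) {
        dst[0] = src[0] * c0 + src[1] * c1 + src[2] * c2;
    }
}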
@@ -1187,7 +1192,7 @@ void cvt_rgb2gray<float>(const Mat32f& src, Mat32f& dst) {
    megdnn_assert(src.rows() == dst.rows());
    megdnn_assert(src.cols() == dst.cols());
    return cvt_rgb2gray_32f_neon(src, dst);
    return cvt_rgb2gray_32f_neon<true>(src, dst);
}

// gray2rgb
@@ -1381,6 +1386,16 @@ void cvt_bgr2gray<uchar>(const Mat8u& src, Mat8u& dst) {
    }
}

template <>
void cvt_bgr2gray<float>(const Mat32f& src, Mat32f& dst) {
    megdnn_assert(src.channels() == 3);
    megdnn_assert(dst.channels() == 1);
    megdnn_assert(src.rows() == dst.rows());
    megdnn_assert(src.cols() == dst.cols());
    return cvt_rgb2gray_32f_neon<false>(src, dst);
}

template <>
void cvt_bgr2rgb<uchar>(const Mat8u& src, Mat8u& dst) {
    return cvt_rgb2bgr<uchar>(src, dst);
@@ -45,7 +45,6 @@
    _cb(cvt_rgba2bgr, float)      \
    _cb(cvt_rgba2gray, float)     \
    _cb(cvt_rgb2bgr, float)       \
    _cb(cvt_bgr2gray, float)      \
    _cb(cvt_bgr2rgb, float)       \
    _cb(cvt_yuv2gray_nv21, float) \
    _cb(cvt_yuv2rgb_nv21, float)  \
@@ -145,6 +145,73 @@ __global__ void cvt_rgb2gray_32f_kernel(const float* src, float* dst,
    }
}

__global__ void cvt_bgr2gray_8u_kernel(const uchar* src, uchar* dst,
                                       const size_t rows, const size_t cols,
                                       const size_t src_step,
                                       const size_t dst_step) {
    size_t t = blockIdx.x * blockDim.x + threadIdx.x;
    if (t < (rows * cols) / U8_PROCESS_PER_THREADS_X) {
        size_t offset = t * U8_PROCESS_PER_THREADS_X;
        src += 3 * offset;
        dst += 1 * offset;
        uchar temp_des[4];
        uchar temp_src[12];
        *((uint3*)temp_src) = *((uint3*)src);
        temp_des[0] = (temp_src[0] * 1868 + temp_src[1] * 9617 +
                       temp_src[2] * 4899 + (1 << 13)) >>
                      14;
        temp_des[1] = (temp_src[3] * 1868 + temp_src[4] * 9617 +
                       temp_src[5] * 4899 + (1 << 13)) >>
                      14;
        temp_des[2] = (temp_src[6] * 1868 + temp_src[7] * 9617 +
                       temp_src[8] * 4899 + (1 << 13)) >>
                      14;
        temp_des[3] = (temp_src[9] * 1868 + temp_src[10] * 9617 +
                       temp_src[11] * 4899 + (1 << 13)) >>
                      14;
        *((uint32_t*)dst) = *((uint32_t*)temp_des);
    } else if (t == (rows * cols) / U8_PROCESS_PER_THREADS_X) {
        size_t rest = (rows * cols) % U8_PROCESS_PER_THREADS_X;
        if (rest != 0) {
            size_t offset = t * U8_PROCESS_PER_THREADS_X;
            src += 3 * offset;
            dst += 1 * offset;
            for (int i = 0; i < rest; i++, src += 3, dst += 1)
                dst[0] = (src[0] * 1868 + src[1] * 9617 + src[2] * 4899 +
                          (1 << 13)) >>
                         14;
        }
    }
}
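The integer weights in the 8u kernel are the BT.601 coefficients scaled by 2^14 (4899 ≈ 0.299·16384 for R, 9617 ≈ 0.587·16384 for G, 1868 ≈ 0.114·16384 for B), and adding 1 << 13 before the right shift rounds to nearest. A hypothetical host-side equivalent of one pixel (not part of the patch):

// Hypothetical scalar equivalent of one pixel of cvt_bgr2gray_8u_kernel:
// fixed-point BT.601 weights scaled by 2^14, rounded by adding 2^13 before >>14.
inline unsigned char bgr_to_gray_u8(unsigned char b, unsigned char g,
                                    unsigned char r) {
    return static_cast<unsigned char>(
            (b * 1868 + g * 9617 + r * 4899 + (1 << 13)) >> 14);
}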
__global__ void cvt_bgr2gray_32f_kernel(const float* src, float* dst,
                                        const size_t rows, const size_t cols,
                                        const size_t src_step,
                                        const size_t dst_step) {
    size_t t = blockIdx.x * blockDim.x + threadIdx.x;
    if (t < rows * cols) {
        size_t offset = t;
        src += offset * 3;
        dst += offset * 1;
        float temp_src[3], temp_dst;
        *((float3*)temp_src) = *((float3*)src);
        temp_dst = temp_src[0] * 0.114f + temp_src[1] * 0.587f +
                   temp_src[2] * 0.299f;
        dst[0] = temp_dst;
    }
}

__global__ void cvt_gray2rgb_8u_kernel(const uchar* src, uchar* dst,
                                       const size_t rows, const size_t cols,
                                       const size_t src_step,
@@ -683,6 +750,9 @@ void cvt_color_8u_proxy(const uchar* src, uchar* dst, const size_t src_rows,
        case CvtColor::Mode::RGB2GRAY:
            CALL_CVT_OPR_8U_KERNEL(rgb2gray)
            break;
        case CvtColor::Mode::BGR2GRAY:
            CALL_CVT_OPR_8U_KERNEL(bgr2gray)
            break;
        case CvtColor::Mode::RGB2YUV:
            CALL_CVT_OPR_8U_KERNEL(rgb2yuv)
            break;
@@ -739,6 +809,9 @@ void cvt_color_32f_proxy(const float* src, float* dst, const size_t src_rows,
        case CvtColor::Mode::RGB2GRAY:
            CALL_CVT_OPR_32F_KERNEL(rgb2gray)
            break;
        case CvtColor::Mode::BGR2GRAY:
            CALL_CVT_OPR_32F_KERNEL(bgr2gray)
            break;
        case CvtColor::Mode::RGB2YUV:
            CALL_CVT_OPR_32F_KERNEL(rgb2yuv)
            break;
@@ -684,6 +684,26 @@ void cvt_bgr2gray<uchar>(const Mat8u& src, Mat8u& dst) {
    }
}

template <>
void cvt_bgr2gray<float>(const Mat32f& src, Mat32f& dst) {
    megdnn_assert(src.channels() == 3);
    megdnn_assert(dst.channels() == 1);
    megdnn_assert(src.rows() == dst.rows());
    megdnn_assert(src.cols() == dst.cols());
    const float coef_r = 0.299f, coef_g = 0.587f, coef_b = 0.114f;
    for (size_t r = 0; r < src.rows(); ++r) {
        for (size_t c = 0; c < src.cols(); ++c) {
            float B = src.at(r, c, 0);
            float G = src.at(r, c, 1);
            float R = src.at(r, c, 2);
            float& Y = dst.at(r, c, 0);
            Y = R * coef_r + G * coef_g + B * coef_b;
        }
    }
}

template <>
void cvt_bgr2rgb<uchar>(const Mat8u& src, Mat8u& dst) {
    return cvt_rgb2bgr<uchar>(src, dst);
@@ -1311,6 +1311,41 @@ void cvt_rgb2gray_32f_SSE_4_2(const Mat32f& src, Mat32f& dst) {
    }
}

MEGDNN_ATTRIBUTE_TARGET("sse4.2")
void cvt_bgr2gray_32f_SSE_4_2(const Mat32f& src, Mat32f& dst) {
    const float coef_r = 0.299f, coef_g = 0.587f, coef_b = 0.114f;
    __m128 v_coef_r = _mm_set1_ps(coef_r);
    __m128 v_coef_g = _mm_set1_ps(coef_g);
    __m128 v_coef_b = _mm_set1_ps(coef_b);
    for (size_t r = 0; r < src.rows(); ++r) {
        const float* psrc = src.ptr(r);
        float* pdst = dst.ptr(r);
        const float* const pend = psrc + src.cols() * 3;
        __m128 v_r, v_g, v_b, ans;
        for (; psrc <= pend - 4 * 3; psrc += 4 * 3, pdst += 4) {
            v_b = _mm_set_ps(psrc[9], psrc[6], psrc[3], psrc[0]);
            v_b = _mm_mul_ps(v_b, v_coef_b);
            v_g = _mm_set_ps(psrc[10], psrc[7], psrc[4], psrc[1]);
            v_g = _mm_mul_ps(v_g, v_coef_g);
            v_r = _mm_set_ps(psrc[11], psrc[8], psrc[5], psrc[2]);
            v_r = _mm_mul_ps(v_r, v_coef_r);
            ans = _mm_add_ps(v_r, _mm_add_ps(v_g, v_b));
            _mm_storeu_ps(pdst, ans);
        }
        for (; psrc < pend; psrc += 3, pdst += 1) {
            pdst[0] = psrc[1] * coef_g + psrc[0] * coef_b + psrc[2] * coef_r;
        }
    }
}
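Note that _mm_set_ps(a3, a2, a1, a0) fills lanes from high to low, so the gathers above place pixel 0's channels in the lowest lane and _mm_storeu_ps writes the four gray values in pixel order. A small stand-alone check of that lane convention (hypothetical, not part of the patch):

// Hypothetical stand-alone check of the _mm_set_ps lane order relied on above:
// the last argument lands in the lowest lane, which _mm_storeu_ps writes first.
#include <xmmintrin.h>
#include <cassert>
void check_set_ps_lane_order() {
    float out[4];
    __m128 v = _mm_set_ps(3.f, 2.f, 1.f, 0.f);
    _mm_storeu_ps(out, v);
    assert(out[0] == 0.f && out[1] == 1.f && out[2] == 2.f && out[3] == 3.f);
}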
MEGDNN_ATTRIBUTE_TARGET("sse4.2")
void cvt_rgba2rgb_8u_SSE_4_2(const Mat8u& src, Mat8u& dst) {
    __m128i dst_data0, dst_data1, dst_data2;
@@ -1705,6 +1740,16 @@ void cvt_bgr2gray<uchar>(const Mat8u& src, Mat8u& dst) {
    }
}

template <>
void cvt_bgr2gray<float>(const Mat32f& src, Mat32f& dst) {
    megdnn_assert(src.channels() == 3);
    megdnn_assert(dst.channels() == 1);
    megdnn_assert(src.rows() == dst.rows());
    megdnn_assert(src.cols() == dst.cols());
    return cvt_bgr2gray_32f_SSE_4_2(src, dst);
}

template <>
void cvt_bgr2rgb<uchar>(const Mat8u& src, Mat8u& dst) {
    return cvt_rgb2bgr<uchar>(src, dst);
@@ -133,6 +133,9 @@ inline std::vector<TestArg> get_cuda_args() {
    for (size_t i = 2; i <= 10; ++i) {
        for (size_t j = 2; j <= 10; ++j) {
            cur_param.mode = Mode::RGB2GRAY;
            args.emplace_back(cur_param, TensorShape{1, i, j, 3},
                              dtype::Uint8());
            cur_param.mode = Mode::BGR2GRAY;
            args.emplace_back(cur_param, TensorShape{1, i, j, 3},
                              dtype::Uint8());
            cur_param.mode = Mode::RGB2YUV;
@@ -146,6 +149,9 @@ inline std::vector<TestArg> get_cuda_args() {
                              dtype::Uint8());
            // float32 test
            cur_param.mode = Mode::RGB2GRAY;
            args.emplace_back(cur_param, TensorShape{1, i, j, 3},
                              dtype::Float32());
            cur_param.mode = Mode::BGR2GRAY;
            args.emplace_back(cur_param, TensorShape{1, i, j, 3},
                              dtype::Float32());
            cur_param.mode = Mode::RGB2YUV;
@@ -1057,12 +1057,19 @@ def test_cvt_color():
    def rgb2gray(rgb):
        return np.dot(rgb[..., :3], [0.299, 0.587, 0.114])

    def bgr2gray(bgr):
        return np.dot(bgr[..., :3], [0.114, 0.587, 0.299])

    inp = np.random.randn(3, 3, 3, 3).astype(np.float32)
    out = np.expand_dims(rgb2gray(inp), 3).astype(np.float32)
    x = tensor(inp)
    y = F.vision.cvt_color(x, mode="RGB2GRAY")
    np.testing.assert_allclose(y.numpy(), out, atol=1e-5)

    out1 = np.expand_dims(bgr2gray(inp), 3).astype(np.float32)
    y1 = F.vision.cvt_color(x, mode="BGR2GRAY")
    np.testing.assert_allclose(y1.numpy(), out1, atol=1e-5)


@pytest.mark.parametrize("val", [2, [2,], [2, 3]])
def test_ones(val):