- /**
- * By downloading, copying, installing or using the software you agree to this license.
- * If you do not agree to this license, do not download, install,
- * copy or use the software.
- *
- *
- * License Agreement
- * For Open Source Computer Vision Library
- * (3-clause BSD License)
- *
- * Copyright (C) 2000-2020, Intel Corporation, all rights reserved.
- * Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
- * Copyright (C) 2009-2016, NVIDIA Corporation, all rights reserved.
- * Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
- * Copyright (C) 2015-2016, OpenCV Foundation, all rights reserved.
- * Copyright (C) 2015-2016, Itseez Inc., all rights reserved.
- * Copyright (C) 2019-2020, Xperience AI, all rights reserved.
- * Third party copyrights are property of their respective owners.
- *
- * Redistribution and use in source and binary forms, with or without modification,
- * are permitted provided that the following conditions are met:
- *
- * * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * * Neither the names of the copyright holders nor the names of the contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * This software is provided by the copyright holders and contributors "as is" and
- * any express or implied warranties, including, but not limited to, the implied
- * warranties of merchantability and fitness for a particular purpose are disclaimed.
- * In no event shall copyright holders or contributors be liable for any direct,
- * indirect, incidental, special, exemplary, or consequential damages
- * (including, but not limited to, procurement of substitute goods or services;
- * loss of use, data, or profits; or business interruption) however caused
- * and on any theory of liability, whether in contract, strict liability,
- * or tort (including negligence or otherwise) arising in any way out of
- * the use of this software, even if advised of the possibility of such damage.
- *
- * ---------------------------------------------------------------------------
- * \file dnn/src/cuda/resize/resize_cv.cu
- *
- * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
- *
- * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *
- * This file has been modified by Megvii ("Megvii Modifications").
- * All Megvii Modifications are Copyright (C) 2014-2021 Megvii Inc. All rights reserved.
- *
- * ---------------------------------------------------------------------------
- */
- #include "src/cuda/cv/kernel_common.cuh"
- #include "src/common/resize.cuh"
- #include "src/cuda/resize/resize_cv.cuh"
- #include "src/cuda/utils.cuh"
-
- using namespace megdnn;
- using namespace cuda;
- using namespace megcv;
- using megdnn::resize::interpolate_cubic;
-
- namespace {
-
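- // SCALE is the number of fixed-point fraction bits used on the u8 paths and
- // ONE is 1.0 in that representation. at(A, r, c, ch) indexes a row-major,
- // channel-interleaved image A; it expands A##_step, so at(src, ...) uses
- // src_step as the row stride.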
- #define SCALE 11
- #define at(A, r, c, ch) A[(r)*A##_step + (c)*CH + (ch)]
- #define ONE (1 << SCALE)
-
- #define ELEMENTS_PER_THREADS 8
- #define THREADS_X 32
- #define THREADS_Y 16
-
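- // Coefficient precompute kernels: one thread per destination coordinate. For
- // each of the `size` coordinates the filter taps are stored plane-wise
- // (dst[tid + j * size] for tap j), followed by an int array of source start
- // indices at dst + size * ntaps (ntaps is 8 for lanczos4, 4 for cubic). The
- // *_u8 variants quantize the taps to Q11 fixed point (coef * ONE) so the
- // interpolation itself can run in integer arithmetic.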
- __global__ void precompute_lanczos4_coef_f32(float* dst, float scale,
- size_t size) {
- int tid = blockIdx.x * blockDim.x + threadIdx.x;
- if (tid >= size)
- return;
-
- float fr = (tid + 0.5) * scale - 0.5;
- int* sr = (int*)(dst + size * 8);
- sr[tid] = (int)(floorf(fr));
-
- fr -= sr[tid];
- float coef[8];
- interpolate_lanczos4_coefs(fr, coef);
- #pragma unroll
- for (int j = 0, index = 0; j < 8; j++, index += size) {
- dst[tid + index] = coef[j];
- }
- }
-
- __global__ void precompute_lanczos4_coef_u8(short* dst, float scale,
- size_t size) {
- int tid = blockIdx.x * blockDim.x + threadIdx.x;
- if (tid >= size)
- return;
-
- float fr = (tid + 0.5) * scale - 0.5;
- int* sr = (int*)(dst + size * 8);
- sr[tid] = (int)(floorf(fr));
-
- fr -= sr[tid];
- float coef[8];
- interpolate_lanczos4_coefs(fr, coef);
- #pragma unroll
- for (int j = 0, index = 0; j < 8; j++, index += size) {
- dst[tid + index] = (short)(coef[j] * ONE);
- }
- }
-
- __global__ void precompute_cubic_coef_f32(float* dst, float scale,
- size_t size) {
- int tid = blockIdx.x * blockDim.x + threadIdx.x;
- if (tid >= size)
- return;
-
- float fr = (tid + 0.5) * scale - 0.5;
- int* sr = (int*)(dst + size * 4);
- sr[tid] = (int)(floorf(fr));
-
- fr -= sr[tid];
- float coef[4];
- interpolate_cubic(fr, coef);
- #pragma unroll
- for (int j = 0, index = 0; j < 4; j++, index += size) {
- dst[tid + index] = coef[j];
- }
- }
-
- __global__ void precompute_cubic_coef_u8(short* dst, float scale, size_t size) {
- int tid = blockIdx.x * blockDim.x + threadIdx.x;
- if (tid >= size)
- return;
-
- float fr = (tid + 0.5) * scale - 0.5;
- int* sr = (int*)(dst + size * 4);
- sr[tid] = (int)(floorf(fr));
-
- fr -= sr[tid];
- float coef[4];
- interpolate_cubic(fr, coef);
- #pragma unroll
- for (int j = 0, index = 0; j < 4; j++, index += size) {
- dst[tid + index] = (short)(coef[j] * ONE);
- }
- }
-
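- // Nearest-neighbor kernel in the multi-row layout: each thread covers one
- // destination column and up to ELEMENTS_PER_THREADS rows spaced blockDim.y
- // apart; source coordinates are the truncated products dst_coord * scale.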
- template <typename T, size_t CH>
- __global__ void resize_nearest_vector_kernel(
- const T* src, T* dst, const size_t dst_rows, const size_t dst_cols,
- const size_t src_step, const size_t dst_step, const float row_scale,
- const float col_scale) {
- size_t dc = blockIdx.x * blockDim.x + threadIdx.x;
- size_t dr = blockIdx.y * blockDim.y * ELEMENTS_PER_THREADS + threadIdx.y;
-
- if (dr < dst_rows && dc < dst_cols) {
- int dst_address = dr * dst_step + dc * CH;
- size_t sc = dc * col_scale;
- src += sc * CH;
-
- for (int i = 0; i < ELEMENTS_PER_THREADS; i++) {
- if (dr >= dst_rows)
- return;
-
- size_t sr = dr * row_scale;
- int src_address = sr * src_step;
- for (size_t ch = 0; ch < CH; ch++)
- dst[dst_address + ch] = src[src_address + ch];
-
- dr += blockDim.y;
- dst_address += blockDim.y * dst_step;
- }
- }
- }
-
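- // Plain nearest-neighbor kernel: one output pixel per thread, with src marked
- // const __restrict__ so the compiler can route reads through the read-only
- // data cache.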
- template <typename T, size_t CH>
- __global__ void resize_nearest_kernel(
- const T* __restrict__ src, T* dst, const size_t dst_rows,
- const size_t dst_cols, const size_t src_step, const size_t dst_step,
- const float row_scale, const float col_scale) {
- size_t dc = blockIdx.x * blockDim.x + threadIdx.x;
- size_t dr = blockIdx.y * blockDim.y + threadIdx.y;
- if (dr < dst_rows && dc < dst_cols) {
- size_t sr = dr * row_scale;
- size_t sc = dc * col_scale;
- src += sr * src_step + sc * CH;
- dst += dr * dst_step + dc * CH;
- #pragma unroll
- for (size_t ch = 0; ch < CH; ++ch)
- dst[ch] = src[ch];
- }
- }
-
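- // Host-side launcher for nearest mode: f32 3-channel downscales take the
- // one-pixel-per-thread kernel, everything else the multi-row vector kernel
- // (preferring L1 cache for the u8 3-channel case).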
- template <typename T, size_t CH>
- void resize_nearest_proxy(const T* src, T* dst, const size_t src_rows,
- const size_t src_cols, const size_t dst_rows,
- const size_t dst_cols, const size_t src_step,
- const size_t dst_step, void* workspace,
- cudaStream_t stream) {
- MEGDNN_MARK_USED_VAR(workspace);
- float row_scale = (float)src_rows / dst_rows;
- float col_scale = (float)src_cols / dst_cols;
-
- if (CH == 3 && sizeof(T) == 4 &&
- (dst_cols * dst_rows <= src_cols * src_rows)) {
- dim3 THREADS(32, 8, 1);
- dim3 BLOCKS(DIVUP(dst_cols, THREADS.x), DIVUP(dst_rows, THREADS.y));
-
- cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
-
- resize_nearest_kernel<T, CH><<<BLOCKS, THREADS, 0, stream>>>(
- src, dst, dst_rows, dst_cols, src_step, dst_step, row_scale,
- col_scale);
-
- } else {
- dim3 THREADS(32, 8, 1);
- dim3 BLOCKS(DIVUP(dst_cols, THREADS.x),
- DIVUP(dst_rows, THREADS.y * ELEMENTS_PER_THREADS));
-
- if (CH == 3 && sizeof(T) == 1)
- cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
-
- resize_nearest_vector_kernel<T, CH><<<BLOCKS, THREADS, 0, stream>>>(
- src, dst, dst_rows, dst_cols, src_step, dst_step, row_scale,
- col_scale);
- }
- }
-
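- // Bilinear kernel, one output pixel per thread. Source coordinates follow the
- // half-pixel convention (d + 0.5) * inv_scale - 0.5; border taps are handled
- // by clamping the base index and forcing the fraction to 0 or 1.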
- template <typename T, size_t CH>
- __global__ void resize_linear_restrict_kernel(
- const T* __restrict__ src, T* dst, const size_t src_rows,
- const size_t src_cols, const size_t dst_rows, const size_t dst_cols,
- const size_t src_step, const size_t dst_step, const float row_scale,
- const float col_scale, const float inverse_row_scale,
- const float inverse_col_scale) {
- size_t dc = blockIdx.x * blockDim.x + threadIdx.x;
- size_t dr = blockIdx.y * blockDim.y + threadIdx.y;
-
- if (dr < dst_rows && dc < dst_cols) {
- float fc = (dc + 0.5f) * inverse_col_scale - 0.5f;
- float fr = (dr + 0.5f) * inverse_row_scale - 0.5f;
- int sc = __float2int_rd(fc);
- int sr = __float2int_rd(fr);
-
- fc -= sc;
- fr -= sr;
-
- if (sc < 0) {
- sc = 0;
- fc = 0;
- }
- if (sr < 0) {
- sr = 0;
- fr = 0;
- }
-
- if (sc + 1 >= src_cols) {
- sc = src_cols - 2;
- fc = 1;
- }
-
- if (sr + 1 >= src_rows) {
- sr = src_rows - 2;
- fr = 1;
- }
-
- int src_address = sr * src_step + sc * CH;
-
- // interpolation is done in float for all types here; the result is cast back to T on store
- float dst_data[CH] = {0};
- #pragma unroll
- for (int ch = 0; ch < CH; ch++) {
- float pcrsc00 = src[src_address + ch];
- float pcrsc01 = src[src_address + CH + ch];
- float pcrsc10 = src[src_address + src_step + ch];
- float pcrsc11 = src[src_address + src_step + CH + ch];
- dst_data[ch] = fr * (pcrsc11 * fc + pcrsc10 * (1 - fc)) +
- (1 - fr) * (pcrsc01 * fc + pcrsc00 * (1 - fc));
- }
- int dst_address = dr * dst_step + dc * CH;
- #pragma unroll
- for (int ch = 0; ch < CH; ch++)
- dst[dst_address++] = (T)(dst_data[ch]);
- }
- }
-
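- // Bilinear kernel in the multi-row layout: the column index and fraction are
- // computed once per thread and reused for up to ELEMENTS_PER_THREADS rows.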
- template <typename T, size_t CH>
- __global__ void resize_linear_vector_kernel(
- const T* src, T* dst, const size_t src_rows, const size_t src_cols,
- const size_t dst_rows, const size_t dst_cols, const size_t src_step,
- const size_t dst_step, const float row_scale, const float col_scale,
- const float inverse_row_scale, const float inverse_col_scale) {
- size_t dc = blockIdx.x * blockDim.x + threadIdx.x;
- size_t dr = blockIdx.y * blockDim.y * ELEMENTS_PER_THREADS + threadIdx.y;
-
- if (dr < dst_rows && dc < dst_cols) {
- float fc = (dc + 0.5f) * inverse_col_scale - 0.5f;
- int sc = __float2int_rd(fc);
- fc -= sc;
- if (sc < 0) {
- sc = 0;
- fc = 0;
- }
-
- if (sc + 1 >= src_cols) {
- sc = src_cols - 2;
- fc = 1;
- }
- for (int i = 0; i < ELEMENTS_PER_THREADS; i++) {
- if (dr >= dst_rows)
- return;
-
- float fr = (dr + 0.5f) * inverse_row_scale - 0.5f;
- int sr = __float2int_rd(fr);
- fr -= sr;
-
- if (sr < 0) {
- sr = 0;
- fr = 0;
- }
- if (sr + 1 >= src_rows) {
- sr = src_rows - 2;
- fr = 1;
- }
- int src_address = sr * src_step + sc * CH;
- float dst_data[CH] = {0};
- #pragma unroll
- for (int ch = 0; ch < CH; ch++) {
- float pcrsc00 = src[src_address + ch];
- float pcrsc01 = src[src_address + CH + ch];
- float pcrsc10 = src[src_address + src_step + ch];
- float pcrsc11 = src[src_address + src_step + CH + ch];
- dst_data[ch] = fr * (pcrsc11 * fc + pcrsc10 * (1 - fc)) +
- (1 - fr) * (pcrsc01 * fc + pcrsc00 * (1 - fc));
- }
-
- int dst_address = dr * dst_step + dc * CH;
- #pragma unroll
- for (int ch = 0; ch < CH; ch++)
- dst[dst_address++] = (T)(dst_data[ch]);
-
- dr += blockDim.y;
- }
- }
- }
-
- template <typename T, size_t CH>
- void resize_area_proxy(const T*, T*, size_t, size_t, size_t, size_t, size_t,
- size_t, void*, cudaStream_t);
-
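- // Launcher for linear mode. An exact 2x2 decimation samples at fraction 0.5
- // in both axes, which makes bilinear identical to 2x2 box averaging, so that
- // case is routed to the cheaper area path; otherwise 3-channel downscales take
- // the one-pixel-per-thread restrict kernel and everything else the multi-row
- // vector kernel.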
- template <typename T, size_t CH>
- void resize_linear_proxy(const T* src, T* dst, const size_t src_rows,
- const size_t src_cols, const size_t dst_rows,
- const size_t dst_cols, const size_t src_step,
- const size_t dst_step, void* workspace,
- cudaStream_t stream) {
- if (src_rows == dst_rows * 2 && src_cols == dst_cols * 2) {
- resize_area_proxy<T, CH>(src, dst, src_rows, src_cols, dst_rows,
- dst_cols, src_step, dst_step, workspace,
- stream);
- return;
- }
-
- dim3 THREADS(32, 8, 1);
-
- float row_scale = (float)dst_rows / src_rows;
- float col_scale = (float)dst_cols / src_cols;
-
- if (CH == 3 && (dst_rows < src_rows && dst_cols < src_cols)) {
- dim3 BLOCKS(DIVUP(dst_cols, THREADS.x), DIVUP(dst_rows, THREADS.y));
-
- cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
-
- resize_linear_restrict_kernel<T, CH><<<BLOCKS, THREADS, 0, stream>>>(
- src, dst, src_rows, src_cols, dst_rows, dst_cols, src_step,
- dst_step, row_scale, col_scale, 1 / row_scale, 1 / col_scale);
-
- } else {
- dim3 BLOCKS(DIVUP(dst_cols, THREADS.x),
- DIVUP(dst_rows, THREADS.y * ELEMENTS_PER_THREADS));
-
- cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
-
- resize_linear_vector_kernel<T, CH><<<BLOCKS, THREADS, 0, stream>>>(
- src, dst, src_rows, src_cols, dst_rows, dst_cols, src_step,
- dst_step, row_scale, col_scale, 1 / row_scale, 1 / col_scale);
- }
- }
-
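- // Bicubic kernels in the multi-row layout: a 4x4 tap footprint around the
- // source point, with out-of-range taps clamped to the border. The 8u variant
- // quantizes the taps to Q11 fixed point and rescales with >> (SCALE + SCALE),
- // since a row coefficient and a column coefficient are multiplied together.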
- template <size_t CH>
- __global__ void resize_cubic_32f_kernel_vector(
- const float* __restrict__ src, float* dst, const size_t src_rows,
- const size_t src_cols, const size_t dst_rows, const size_t dst_cols,
- const size_t src_step, const size_t dst_step, const float row_scale,
- const float col_scale) {
- size_t dc = blockIdx.x * blockDim.x + threadIdx.x;
- size_t dr = blockIdx.y * blockDim.y * ELEMENTS_PER_THREADS + threadIdx.y;
- if (dr < dst_rows && dc < dst_cols) {
- float fc = ((float)dc + 0.5) * col_scale - 0.5;
- int sc = floor(fc);
- fc -= sc;
- float coef_col[4];
- interpolate_cubic(fc, coef_col);
-
- for (int i = 0; i < ELEMENTS_PER_THREADS; i++) {
- if (dr >= dst_rows)
- return;
- float fr = ((float)dr + 0.5) * row_scale - 0.5;
- int sr = floor(fr);
- fr -= sr;
- float coef_row[4];
- interpolate_cubic(fr, coef_row);
- float dst_data[CH] = {0};
- #pragma unroll
- for (int offset_r = 0; offset_r < 4; ++offset_r) {
- int tr_step =
- saturate(sr + offset_r - 1, 0, (int)src_rows - 1) *
- src_step;
- #pragma unroll
- for (int offset_c = 0; offset_c < 4; ++offset_c) {
- int tc_step =
- saturate(sc + offset_c - 1, 0, (int)src_cols - 1) *
- CH;
- int src_address = tr_step + tc_step;
- #pragma unroll
- for (size_t ch = 0; ch < CH; ++ch) {
- dst_data[ch] += coef_row[offset_r] *
- coef_col[offset_c] * src[src_address++];
- }
- }
- }
- int dst_address = dr * dst_step + dc * CH;
- #pragma unroll
- for (int i = 0; i < CH; i++)
- dst[dst_address++] = dst_data[i];
- dr += blockDim.y;
- }
- }
- }
-
- template <size_t CH>
- __global__ void resize_cubic_8u_kernel_vector(
- const uchar* __restrict__ src, uchar* dst, const size_t src_rows,
- const size_t src_cols, const size_t dst_rows, const size_t dst_cols,
- const size_t src_step, const size_t dst_step, const float row_scale,
- const float col_scale) {
- size_t dc = blockIdx.x * blockDim.x + threadIdx.x;
- size_t dr = blockIdx.y * blockDim.y * ELEMENTS_PER_THREADS + threadIdx.y;
- if (dr < dst_rows && dc < dst_cols) {
- float fc = ((float)dc + 0.5) * col_scale - 0.5;
- int sc = __float2int_rd(fc);
- fc -= sc;
- short icoef_col[4] = {0};
-
- float coef_col[4];
- interpolate_cubic(fc, coef_col);
- #pragma unroll
- for (int i = 0; i < 4; i++) {
- icoef_col[i] = (short)(coef_col[i] * ONE);
- }
-
- for (int i = 0; i < ELEMENTS_PER_THREADS; i++) {
- if (dr >= dst_rows)
- return;
- float fr = ((float)dr + 0.5) * row_scale - 0.5;
- int sr = __float2int_rd(fr);
- fr -= sr;
- short icoef_row[4];
- float coef_row[4];
- interpolate_cubic(fr, coef_row);
- #pragma unroll
- for (int i = 0; i < 4; i++) {
- icoef_row[i] = (short)(coef_row[i] * ONE);
- }
-
- int dst_data[CH] = {0};
- #pragma unroll
- for (int offset_r = 0; offset_r < 4; ++offset_r) {
- int tr_step =
- saturate(sr + offset_r - 1, 0, (int)src_rows - 1) *
- src_step;
- #pragma unroll
- for (int offset_c = 0; offset_c < 4; ++offset_c) {
- int tc_step =
- saturate(sc + offset_c - 1, 0, (int)src_cols - 1) *
- CH;
- int src_address = tr_step + tc_step;
- #pragma unroll
- for (size_t ch = 0; ch < CH; ++ch) {
- dst_data[ch] += icoef_row[offset_r] *
- icoef_col[offset_c] *
- src[src_address++];
- }
- }
- }
- int dst_address = dr * dst_step + dc * CH;
- #pragma unroll
- for (int i = 0; i < CH; i++)
- dst[dst_address++] =
- saturate(dst_data[i] >> (SCALE + SCALE), 0, 255);
- dr += blockDim.y;
- }
- }
- }
-
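- // cacheToGlobal variants: instead of recomputing coefficients per thread,
- // they load the per-row/per-column taps and source start indices that the
- // precompute_* kernels wrote into the workspace.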
- template <size_t CH>
- __global__ void resize_cubic_32f_kernel_cacheToGlobal(
- const float* src, float* dst, const size_t src_rows,
- const size_t src_cols, const size_t dst_rows, const size_t dst_cols,
- const size_t src_step, const size_t dst_step, const float* gl_coef_row,
- const float* gl_coef_col, const int* gl_sr, const int* gl_sc) {
- size_t dc = blockIdx.x * blockDim.x + threadIdx.x;
- size_t dr = blockIdx.y * blockDim.y * ELEMENTS_PER_THREADS + threadIdx.y;
-
- if (dr < dst_rows && dc < dst_cols) {
- int sc = gl_sc[dc];
- float coef_col[4];
- #pragma unroll
- for (int i = 0, index = dc; i < 4; i++, index += dst_cols)
- coef_col[i] = gl_coef_col[index];
-
- for (int i = 0; i < ELEMENTS_PER_THREADS; i++) {
- if (dr >= dst_rows)
- return;
- int sr = gl_sr[dr];
- float coef_row[4];
- #pragma unroll
- for (int i = 0, index = dr; i < 4; i++, index += dst_rows)
- coef_row[i] = gl_coef_row[index];
-
- float dst_data[CH] = {0};
- #pragma unroll
- for (int offset_r = 0; offset_r < 4; ++offset_r) {
- int tr_step =
- saturate(sr + offset_r - 1, 0, (int)src_rows - 1) *
- src_step;
- #pragma unroll
- for (int offset_c = 0; offset_c < 4; ++offset_c) {
- int tc_step =
- saturate(sc + offset_c - 1, 0, (int)src_cols - 1) *
- CH;
- int src_address = tr_step + tc_step;
- #pragma unroll
- for (size_t ch = 0; ch < CH; ++ch) {
- dst_data[ch] += coef_row[offset_r] *
- coef_col[offset_c] * src[src_address++];
- }
- }
- }
- int dst_address = dr * dst_step + dc * CH;
- #pragma unroll
- for (int i = 0; i < CH; i++)
- dst[dst_address++] = dst_data[i];
-
- dr += blockDim.y;
- }
- }
- }
-
- template <size_t CH>
- __global__ void resize_cubic_8u_kernel_cacheToGlobal(
- const uchar* src, uchar* dst, const size_t src_rows,
- const size_t src_cols, const size_t dst_rows, const size_t dst_cols,
- const size_t src_step, const size_t dst_step, const short* gl_icoef_row,
- const short* gl_icoef_col, const int* gl_sr, const int* gl_sc) {
- size_t dc = blockIdx.x * blockDim.x + threadIdx.x;
- size_t dr = blockIdx.y * blockDim.y * ELEMENTS_PER_THREADS + threadIdx.y;
-
- if (dr < dst_rows && dc < dst_cols) {
- int sc = gl_sc[dc];
- short icoef_col[4];
- #pragma unroll
- for (int i = 0, index = dc; i < 4; i++, index += dst_cols)
- icoef_col[i] = gl_icoef_col[index];
-
- for (int i = 0; i < ELEMENTS_PER_THREADS; i++) {
- if (dr >= dst_rows)
- return;
- int sr = gl_sr[dr];
- short icoef_row[4];
- #pragma unroll
- for (int i = 0, index = dr; i < 4; i++, index += dst_rows)
- icoef_row[i] = gl_icoef_row[index];
-
- int dst_data[CH] = {0};
- #pragma unroll
- for (int offset_r = 0; offset_r < 4; ++offset_r) {
- int tr_step =
- saturate(sr + offset_r - 1, 0, (int)src_rows - 1) *
- src_step;
- #pragma unroll
- for (int offset_c = 0; offset_c < 4; ++offset_c) {
- int tc_step =
- saturate(sc + offset_c - 1, 0, (int)src_cols - 1) *
- CH;
- int src_address = tr_step + tc_step;
- #pragma unroll
- for (size_t ch = 0; ch < CH; ++ch) {
- dst_data[ch] += icoef_row[offset_r] *
- icoef_col[offset_c] *
- src[src_address++];
- }
- }
- }
- int dst_address = dr * dst_step + dc * CH;
- #pragma unroll
- for (int i = 0; i < CH; i++)
- dst[dst_address++] =
- saturate(dst_data[i] >> (SCALE + SCALE), 0, 255);
-
- dr += blockDim.y;
- }
- }
- }
-
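- // Launcher for cubic mode: small outputs take the self-contained vector
- // kernels; larger ones precompute coefficients into the workspace (laid out
- // as row taps, row indices, column taps, column indices) and use the
- // cacheToGlobal kernels.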
- template <typename T, size_t CH>
- void resize_cubic_proxy(const T* src, T* dst, const size_t src_rows,
- const size_t src_cols, const size_t dst_rows,
- const size_t dst_cols, const size_t src_step,
- const size_t dst_step, void* workspace,
- cudaStream_t stream) {
- dim3 THREADS(32, 8, 1);
- float row_scale = (float)src_rows / dst_rows;
- float col_scale = (float)src_cols / dst_cols;
-
- size_t dst_area_size = dst_rows * dst_cols;
- size_t src_area_size = src_rows * src_cols;
-
- bool enlarge = dst_area_size > src_area_size;
- bool shrink = dst_area_size <= src_area_size;
- bool U8 = sizeof(T) == sizeof(uchar);
- bool F32_1 = sizeof(T) == sizeof(float) && CH == 1;
- bool F32_3 = sizeof(T) == sizeof(float) && CH == 3;
-
- bool use_vector = (enlarge && (dst_area_size <= 500 * 500)) ||
- (shrink && (F32_3 || (U8 && dst_area_size <= 500 * 500) ||
- (F32_1 && dst_area_size <= 1000 * 1000)));
-
- if (use_vector) {
- dim3 BLOCKS(DIVUP(dst_cols, THREADS.x),
- DIVUP(dst_rows, THREADS.y * ELEMENTS_PER_THREADS));
-
- if (sizeof(T) == sizeof(float)) {
- resize_cubic_32f_kernel_vector<CH><<<BLOCKS, THREADS, 0, stream>>>(
- (const float*)src, (float*)dst, src_rows, src_cols,
- dst_rows, dst_cols, src_step, dst_step, row_scale,
- col_scale);
- } else {
- resize_cubic_8u_kernel_vector<CH><<<BLOCKS, THREADS, 0, stream>>>(
- (const uchar*)src, (uchar*)dst, src_rows, src_cols,
- dst_rows, dst_cols, src_step, dst_step, row_scale,
- col_scale);
- }
-
- } else {
- dim3 BLOCKS(DIVUP(dst_cols, THREADS.x),
- DIVUP(dst_rows, THREADS.y * ELEMENTS_PER_THREADS));
-
- cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
-
- if (sizeof(T) == sizeof(float)) {
- float* dev_coef_row = static_cast<float*>(workspace);
- int* dev_sr = reinterpret_cast<int*>(dev_coef_row + dst_rows * 4);
- float* dev_coef_col = reinterpret_cast<float*>(dev_sr + dst_rows);
- int* dev_sc = reinterpret_cast<int*>(dev_coef_col + dst_cols * 4);
-
- precompute_cubic_coef_f32<<<DIVUP(dst_rows, 128), 128, 0, stream>>>(
- dev_coef_row, row_scale, dst_rows);
- precompute_cubic_coef_f32<<<DIVUP(dst_cols, 128), 128, 0, stream>>>(
- dev_coef_col, col_scale, dst_cols);
-
- resize_cubic_32f_kernel_cacheToGlobal<CH>
- <<<BLOCKS, THREADS, 0, stream>>>(
- (const float*)src, (float*)dst, src_rows, src_cols,
- dst_rows, dst_cols, src_step, dst_step,
- dev_coef_row, dev_coef_col, dev_sr, dev_sc);
-
- } else {
- short* dev_coef_row = static_cast<short*>(workspace);
- int* dev_sr = reinterpret_cast<int*>(dev_coef_row + dst_rows * 4);
- short* dev_coef_col = reinterpret_cast<short*>(dev_sr + dst_rows);
- int* dev_sc = reinterpret_cast<int*>(dev_coef_col + dst_cols * 4);
-
- precompute_cubic_coef_u8<<<DIVUP(dst_rows, 128), 128, 0, stream>>>(
- dev_coef_row, row_scale, dst_rows);
- precompute_cubic_coef_u8<<<DIVUP(dst_cols, 128), 128, 0, stream>>>(
- dev_coef_col, col_scale, dst_cols);
-
- resize_cubic_8u_kernel_cacheToGlobal<CH>
- <<<BLOCKS, THREADS, 0, stream>>>(
- (const uchar*)src, (uchar*)dst, src_rows, src_cols,
- dst_rows, dst_cols, src_step, dst_step,
- dev_coef_row, dev_coef_col, dev_sr, dev_sc);
- }
- }
- }
-
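- // Lanczos4 kernels: an 8x8 tap footprint (offsets -3..+4 around the source
- // point), again with taps clamped to the image border.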
- template <size_t CH>
- __global__ void resize_lanczos4_32f_kernel_vector(
- const float* __restrict__ src, float* dst, const size_t src_rows,
- const size_t src_cols, const size_t dst_rows, const size_t dst_cols,
- const size_t src_step, const size_t dst_step, const float row_scale,
- const float col_scale) {
- size_t dc = blockIdx.x * blockDim.x + threadIdx.x;
- size_t dr = blockIdx.y * blockDim.y * ELEMENTS_PER_THREADS + threadIdx.y;
- if (dr < dst_rows && dc < dst_cols) {
- float fc = ((float)dc + 0.5) * col_scale - 0.5;
- int sc = floor(fc);
- fc -= sc;
- float coef_col[8];
- interpolate_lanczos4_coefs(fc, coef_col);
-
- for (int i = 0; i < ELEMENTS_PER_THREADS; i++) {
- if (dr >= dst_rows)
- return;
- float fr = ((float)dr + 0.5) * row_scale - 0.5;
- int sr = floor(fr);
- fr -= sr;
- float coef_row[8];
- interpolate_lanczos4_coefs(fr, coef_row);
- float dst_data[CH] = {0};
- #pragma unroll
- for (int offset_r = 0; offset_r < 8; ++offset_r) {
- int tr_step =
- saturate(sr + offset_r - 3, 0, (int)src_rows - 1) *
- src_step;
- #pragma unroll
- for (int offset_c = 0; offset_c < 8; ++offset_c) {
- int tc_step =
- saturate(sc + offset_c - 3, 0, (int)src_cols - 1) *
- CH;
- int src_address = tr_step + tc_step;
- #pragma unroll
- for (size_t ch = 0; ch < CH; ++ch) {
- dst_data[ch] += coef_row[offset_r] *
- coef_col[offset_c] * src[src_address++];
- }
- }
- }
- int dst_address = dr * dst_step + dc * CH;
- #pragma unroll
- for (int i = 0; i < CH; i++)
- dst[dst_address++] = dst_data[i];
- dr += blockDim.y;
- }
- }
- }
-
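- // The 8u variant computes the Lanczos taps inline in fixed point: the cs
- // table holds (cos, sin) values at multiples of pi/4, so every tap can be
- // evaluated from a single sin/cos pair via the angle-addition identities. A
- // fraction near zero degenerates to a unit tap at offset 0 (index 3).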
- template <size_t CH>
- __global__ void resize_lanczos4_8u_kernel_vector(
- const uchar* __restrict__ src, uchar* dst, const size_t src_rows,
- const size_t src_cols, const size_t dst_rows, const size_t dst_cols,
- const size_t src_step, const size_t dst_step, const float row_scale,
- const float col_scale) {
- size_t dc = blockIdx.x * blockDim.x + threadIdx.x;
- size_t dr = blockIdx.y * blockDim.y * ELEMENTS_PER_THREADS + threadIdx.y;
- if (dr < dst_rows && dc < dst_cols) {
- float fc = ((float)dc + 0.5) * col_scale - 0.5;
- int sc = floor(fc);
- fc -= sc;
- short icoef_col[8] = {0};
- const float s45 = 0.70710678118654752440084436210485;
- const float cs[][2] = {{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45},
- {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}};
- const float MEGCV_PI = 3.1415926536;
-
- {
- if (fc < FLT_EPSILON)
- icoef_col[3] = ONE;
- else {
- float coef_col[8];
- float sum = 0;
- float y0 = -(fc + 3) * MEGCV_PI * 0.25, s0 = sin(y0),
- c0 = cos(y0);
- #pragma unroll
- for (int i = 0; i < 8; i++) {
- float y = -(fc + 3 - i) * MEGCV_PI * 0.25;
- coef_col[i] =
- (float)((cs[i][0] * s0 + cs[i][1] * c0) / (y * y));
- sum += coef_col[i];
- }
-
- sum = 1.f / sum;
- #pragma unroll
- for (int i = 0; i < 8; i++) {
- coef_col[i] *= sum;
- icoef_col[i] = (short)(coef_col[i] * ONE);
- }
- }
- }
-
- for (int i = 0; i < ELEMENTS_PER_THREADS; i++) {
- if (dr >= dst_rows)
- return;
- float fr = ((float)dr + 0.5) * row_scale - 0.5;
- int sr = floor(fr);
- fr -= sr;
- short icoef_row[8] = {0};
- {
- if (fr < FLT_EPSILON)
- icoef_row[3] = ONE;
- else {
- float coef_row[8];
- float sum = 0;
- float y0 = -(fr + 3) * MEGCV_PI * 0.25, s0 = sin(y0),
- c0 = cos(y0);
- #pragma unroll
- for (int i = 0; i < 8; i++) {
- float y = -(fr + 3 - i) * MEGCV_PI * 0.25;
- coef_row[i] = (float)((cs[i][0] * s0 + cs[i][1] * c0) /
- (y * y));
- sum += coef_row[i];
- }
-
- sum = 1.f / sum;
- #pragma unroll
- for (int i = 0; i < 8; i++) {
- coef_row[i] *= sum;
- icoef_row[i] = (short)(coef_row[i] * ONE);
- }
- }
- }
-
- int dst_data[CH] = {0};
- #pragma unroll
- for (int offset_r = 0; offset_r < 8; ++offset_r) {
- int tr_step =
- saturate(sr + offset_r - 3, 0, (int)src_rows - 1) *
- src_step;
- #pragma unroll
- for (int offset_c = 0; offset_c < 8; ++offset_c) {
- int tc_step =
- saturate(sc + offset_c - 3, 0, (int)src_cols - 1) *
- CH;
- int src_address = tr_step + tc_step;
- #pragma unroll
- for (size_t ch = 0; ch < CH; ++ch) {
- dst_data[ch] += icoef_row[offset_r] *
- icoef_col[offset_c] *
- src[src_address++];
- }
- }
- }
-
- int dst_address = dr * dst_step + dc * CH;
- for (int ch = 0; ch < CH; ch++)
- dst[dst_address++] =
- saturate(dst_data[ch] >> (SCALE + SCALE), 0, 255);
- dr += blockDim.y;
- }
- }
- }
-
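- // Same cacheToGlobal scheme as the cubic kernels, with 8 taps per axis.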
- template <size_t CH>
- __global__ void resize_lanczos4_32f_kernel_cacheToGlobal(
- const float* src, float* dst, const size_t src_rows,
- const size_t src_cols, const size_t dst_rows, const size_t dst_cols,
- const size_t src_step, const size_t dst_step, const float* gl_coef_row,
- const float* gl_coef_col, const int* gl_sr, const int* gl_sc) {
- size_t dc = blockIdx.x * blockDim.x + threadIdx.x;
- size_t dr = blockIdx.y * blockDim.y * ELEMENTS_PER_THREADS + threadIdx.y;
-
- if (dr < dst_rows && dc < dst_cols) {
- int sc = gl_sc[dc];
- float coef_col[8];
- #pragma unroll
- for (int i = 0, index = dc; i < 8; i++, index += dst_cols)
- coef_col[i] = gl_coef_col[index];
-
- for (int i = 0; i < ELEMENTS_PER_THREADS; i++) {
- if (dr >= dst_rows)
- return;
- int sr = gl_sr[dr];
- float coef_row[8];
- #pragma unroll
- for (int i = 0, index = dr; i < 8; i++, index += dst_rows)
- coef_row[i] = gl_coef_row[index];
-
- float dst_data[CH] = {0};
- #pragma unroll
- for (int offset_r = 0; offset_r < 8; ++offset_r) {
- int tr_step =
- saturate(sr + offset_r - 3, 0, (int)src_rows - 1) *
- src_step;
- #pragma unroll
- for (int offset_c = 0; offset_c < 8; ++offset_c) {
- int tc_step =
- saturate(sc + offset_c - 3, 0, (int)src_cols - 1) *
- CH;
- int src_address = tr_step + tc_step;
- #pragma unroll
- for (size_t ch = 0; ch < CH; ++ch) {
- dst_data[ch] += coef_row[offset_r] *
- coef_col[offset_c] * src[src_address++];
- }
- }
- }
- int dst_address = dr * dst_step + dc * CH;
- #pragma unroll
- for (int i = 0; i < CH; i++)
- dst[dst_address++] = dst_data[i];
-
- dr += blockDim.y;
- }
- }
- }
-
- template <size_t CH>
- __global__ void resize_lanczos4_8u_kernel_cacheToGlobal(
- const uchar* src, uchar* dst, const size_t src_rows,
- const size_t src_cols, const size_t dst_rows, const size_t dst_cols,
- const size_t src_step, const size_t dst_step, const short* gl_icoef_row,
- const short* gl_icoef_col, const int* gl_sr, const int* gl_sc) {
- size_t dc = blockIdx.x * blockDim.x + threadIdx.x;
- size_t dr = blockIdx.y * blockDim.y * ELEMENTS_PER_THREADS + threadIdx.y;
-
- if (dr < dst_rows && dc < dst_cols) {
- int sc = gl_sc[dc];
- short icoef_col[8];
- #pragma unroll
- for (int i = 0, index = dc; i < 8; i++, index += dst_cols)
- icoef_col[i] = gl_icoef_col[index];
-
- for (int i = 0; i < ELEMENTS_PER_THREADS; i++) {
- if (dr >= dst_rows)
- return;
- int sr = gl_sr[dr];
- short icoef_row[8];
- #pragma unroll
- for (int i = 0, index = dr; i < 8; i++, index += dst_rows)
- icoef_row[i] = gl_icoef_row[index];
-
- int dst_data[CH] = {0};
- #pragma unroll
- for (int offset_r = 0; offset_r < 8; ++offset_r) {
- int tr_step =
- saturate(sr + offset_r - 3, 0, (int)src_rows - 1) *
- src_step;
- #pragma unroll
- for (int offset_c = 0; offset_c < 8; ++offset_c) {
- int tc_step =
- saturate(sc + offset_c - 3, 0, (int)src_cols - 1) *
- CH;
- int src_address = tr_step + tc_step;
- #pragma unroll
- for (size_t ch = 0; ch < CH; ++ch) {
- dst_data[ch] += icoef_row[offset_r] *
- icoef_col[offset_c] *
- src[src_address++];
- }
- }
- }
- int dst_address = dr * dst_step + dc * CH;
- #pragma unroll
- for (int i = 0; i < CH; i++)
- dst[dst_address++] =
- saturate(dst_data[i] >> (SCALE + SCALE), 0, 255);
-
- dr += blockDim.y;
- }
- }
- }
-
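- // Launcher for lanczos4 mode, mirroring the cubic launcher but with 8 taps
- // per axis in the workspace layout.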
- template <typename T, size_t CH>
- void resize_lanczos4_proxy(const T* src, T* dst, const size_t src_rows,
- const size_t src_cols, const size_t dst_rows,
- const size_t dst_cols, const size_t src_step,
- const size_t dst_step, void* workspace,
- cudaStream_t stream) {
- dim3 THREADS(16, 16, 1);
-
- float row_scale = (float)src_rows / dst_rows;
- float col_scale = (float)src_cols / dst_cols;
-
- size_t dst_area_size = dst_rows * dst_cols;
- size_t src_area_size = src_rows * src_cols;
-
- bool enlarge = dst_area_size > src_area_size;
- bool shrink = dst_area_size <= src_area_size;
- bool U8 = sizeof(T) == sizeof(uchar);
- bool F32_1 = sizeof(T) == sizeof(float) && CH == 1;
- bool F32_3 = sizeof(T) == sizeof(float) && CH == 3;
-
- bool use_vector = (enlarge && (dst_area_size <= 500 * 500)) ||
- (shrink && (F32_3 || (U8 && dst_area_size <= 500 * 500) ||
- (F32_1 && dst_area_size <= 1000 * 1000)));
-
- if (use_vector) {
- dim3 BLOCKS(DIVUP(dst_cols, THREADS.x),
- DIVUP(dst_rows, THREADS.y * ELEMENTS_PER_THREADS));
-
- if (sizeof(T) == sizeof(float)) {
- resize_lanczos4_32f_kernel_vector<CH>
- <<<BLOCKS, THREADS, 0, stream>>>(
- (const float*)src, (float*)dst, src_rows, src_cols,
- dst_rows, dst_cols, src_step, dst_step, row_scale,
- col_scale);
- } else {
- resize_lanczos4_8u_kernel_vector<CH>
- <<<BLOCKS, THREADS, 0, stream>>>(
- (const uchar*)src, (uchar*)dst, src_rows, src_cols,
- dst_rows, dst_cols, src_step, dst_step, row_scale,
- col_scale);
- }
-
- } else {
- dim3 BLOCKS(DIVUP(dst_cols, THREADS.x),
- DIVUP(dst_rows, THREADS.y * ELEMENTS_PER_THREADS));
-
- cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
-
- if (sizeof(T) == sizeof(float)) {
- float* dev_coef_row = static_cast<float*>(workspace);
- int* dev_sr = reinterpret_cast<int*>(dev_coef_row + dst_rows * 8);
- float* dev_coef_col = reinterpret_cast<float*>(dev_sr + dst_rows);
- int* dev_sc = reinterpret_cast<int*>(dev_coef_col + dst_cols * 8);
-
- precompute_lanczos4_coef_f32<<<DIVUP(dst_rows, 128), 128, 0,
- stream>>>(dev_coef_row, row_scale,
- dst_rows);
- precompute_lanczos4_coef_f32<<<DIVUP(dst_cols, 128), 128, 0,
- stream>>>(dev_coef_col, col_scale,
- dst_cols);
- resize_lanczos4_32f_kernel_cacheToGlobal<CH>
- <<<BLOCKS, THREADS, 0, stream>>>(
- (const float*)src, (float*)dst, src_rows, src_cols,
- dst_rows, dst_cols, src_step, dst_step,
- dev_coef_row, dev_coef_col, dev_sr, dev_sc);
-
- } else {
- short* dev_coef_row = static_cast<short*>(workspace);
- int* dev_sr = reinterpret_cast<int*>(dev_coef_row + dst_rows * 8);
- short* dev_coef_col = reinterpret_cast<short*>(dev_sr + dst_rows);
- int* dev_sc = reinterpret_cast<int*>(dev_coef_col + dst_cols * 8);
-
- precompute_lanczos4_coef_u8<<<DIVUP(dst_rows, 128), 128, 0,
- stream>>>(dev_coef_row, row_scale,
- dst_rows);
- precompute_lanczos4_coef_u8<<<DIVUP(dst_cols, 128), 128, 0,
- stream>>>(dev_coef_col, col_scale,
- dst_cols);
-
- resize_lanczos4_8u_kernel_cacheToGlobal<CH>
- <<<BLOCKS, THREADS, 0, stream>>>(
- (const uchar*)src, (uchar*)dst, src_rows, src_cols,
- dst_rows, dst_cols, src_step, dst_step,
- dev_coef_row, dev_coef_col, dev_sr, dev_sc);
- }
- }
- }
-
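- // Area kernels, version 1: each output pixel integrates the source over its
- // exact footprint [dr, dr+1) x [dc, dc+1) in destination units. Interior
- // rows/columns get the full weight 1/scale; the two edge rows/columns are
- // weighted by the fraction of the footprint they cover. _row_scale and
- // _col_scale are the reciprocals 1/row_scale and 1/col_scale.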
- template <size_t CH>
- __global__ void resize_area_version1_shrink_32f_kernel(
- const float* src, float* dst, const size_t src_rows,
- const size_t src_cols, const size_t dst_rows, const size_t dst_cols,
- const size_t src_step, const size_t dst_step, const float row_scale,
- const float col_scale, const float _row_scale, const float _col_scale) {
- size_t dc = blockIdx.x * blockDim.x + threadIdx.x;
- size_t dr = blockIdx.y * blockDim.y + threadIdx.y;
- if (dr < dst_rows && dc < dst_cols) {
- float fsr1 = (float)dr * row_scale;
- float fsr2 = (float)(dr + 1) * row_scale;
- int sr1 = floor(fsr1);
- int sr2 = ceil(fsr2);
-
- float fsc1 = (float)dc * col_scale;
- float fsc2 = (float)(dc + 1) * col_scale;
- int sc1 = floor(fsc1);
- int sc2 = ceil(fsc2);
-
- float dst_data[CH] = {0};
-
- {
- float coefr = (float)(sr1 + 1 - fsr1) * _row_scale;
- {
- float coefc = (float)(sc1 + 1 - fsc1) * _col_scale;
- for (size_t ch = 0; ch < CH; ++ch) {
- dst_data[ch] += coefr * coefc * at(src, sr1, sc1, ch);
- }
- }
- for (int sc = sc1 + 1; sc < sc2 - 1; ++sc) {
- float coefc = _col_scale;
- for (size_t ch = 0; ch < CH; ++ch) {
- dst_data[ch] += coefr * coefc * at(src, sr1, sc, ch);
- }
- }
- {
- float coefc = (float)(fsc2 - (sc2 - 1)) * _col_scale;
- for (size_t ch = 0; ch < CH; ++ch) {
- dst_data[ch] += coefr * coefc * at(src, sr1, sc2 - 1, ch);
- }
- }
- }
-
- for (int sr = sr1 + 1; sr < sr2 - 1; ++sr) {
- float coefr = 1.0f * _row_scale;
- {
- float coefc = (float)(sc1 + 1 - fsc1) * _col_scale;
- for (size_t ch = 0; ch < CH; ++ch) {
- dst_data[ch] += coefr * coefc * at(src, sr, sc1, ch);
- }
- }
- for (int sc = sc1 + 1; sc < sc2 - 1; ++sc) {
- float coefc = _col_scale;
- for (size_t ch = 0; ch < CH; ++ch) {
- dst_data[ch] += coefr * coefc * at(src, sr, sc, ch);
- }
- }
- {
- float coefc = (float)(fsc2 - (sc2 - 1)) * _col_scale;
- for (size_t ch = 0; ch < CH; ++ch) {
- dst_data[ch] += coefr * coefc * at(src, sr, sc2 - 1, ch);
- }
- }
- }
-
- {
- float coefr = (float)(fsr2 - (sr2 - 1)) * _row_scale;
- {
- float coefc = (float)(sc1 + 1 - fsc1) * _col_scale;
- for (size_t ch = 0; ch < CH; ++ch) {
- dst_data[ch] += coefr * coefc * at(src, sr2 - 1, sc1, ch);
- }
- }
- for (int sc = sc1 + 1; sc < sc2 - 1; ++sc) {
- float coefc = _col_scale;
- for (size_t ch = 0; ch < CH; ++ch) {
- dst_data[ch] += coefr * coefc * at(src, sr2 - 1, sc, ch);
- }
- }
- {
- float coefc = (float)(fsc2 - (sc2 - 1)) * _col_scale;
- for (size_t ch = 0; ch < CH; ++ch) {
- dst_data[ch] +=
- coefr * coefc * at(src, sr2 - 1, sc2 - 1, ch);
- }
- }
- }
-
- for (size_t ch = 0; ch < CH; ++ch)
- at(dst, dr, dc, ch) = dst_data[ch];
- }
- }
-
- template <size_t CH>
- __global__ void resize_area_version1_shrink_8u_kernel(
- const uchar* src, uchar* dst, const size_t src_rows,
- const size_t src_cols, const size_t dst_rows, const size_t dst_cols,
- const size_t src_step, const size_t dst_step, const float row_scale,
- const float col_scale, const float _row_scale, const float _col_scale) {
- size_t dc = blockIdx.x * blockDim.x + threadIdx.x;
- size_t dr = blockIdx.y * blockDim.y + threadIdx.y;
- if (dr < dst_rows && dc < dst_cols) {
- float fsr1 = (float)dr * row_scale;
- float fsr2 = (float)(dr + 1) * row_scale;
- int sr1 = floor(fsr1);
- int sr2 = ceil(fsr2);
-
- float fsc1 = (float)dc * col_scale;
- float fsc2 = (float)(dc + 1) * col_scale;
- int sc1 = floor(fsc1);
- int sc2 = ceil(fsc2);
- float dst_data[CH] = {0};
-
- {
- float coefr = (float)(sr1 + 1 - fsr1) * _row_scale;
- {
- float coefc = (float)(sc1 + 1 - fsc1) * _col_scale;
- for (size_t ch = 0; ch < CH; ++ch) {
- dst_data[ch] += coefr * coefc * at(src, sr1, sc1, ch);
- }
- }
- for (int sc = sc1 + 1; sc < sc2 - 1; ++sc) {
- float coefc = _col_scale;
- for (size_t ch = 0; ch < CH; ++ch) {
- dst_data[ch] += coefr * coefc * at(src, sr1, sc, ch);
- }
- }
- {
- float coefc = (float)(fsc2 - (sc2 - 1)) * _col_scale;
- for (size_t ch = 0; ch < CH; ++ch) {
- dst_data[ch] += coefr * coefc * at(src, sr1, sc2 - 1, ch);
- }
- }
- }
- for (int sr = sr1 + 1; sr < sr2 - 1; ++sr) {
- float coefr = 1.0f * _row_scale;
- {
- float coefc = (float)(sc1 + 1 - fsc1) * _col_scale;
- for (size_t ch = 0; ch < CH; ++ch) {
- dst_data[ch] += coefr * coefc * at(src, sr, sc1, ch);
- }
- }
- for (int sc = sc1 + 1; sc < sc2 - 1; ++sc) {
- float coefc = _col_scale;
- for (size_t ch = 0; ch < CH; ++ch) {
- dst_data[ch] += coefr * coefc * at(src, sr, sc, ch);
- }
- }
- {
- float coefc = (float)(fsc2 - (sc2 - 1)) * _col_scale;
- for (size_t ch = 0; ch < CH; ++ch) {
- dst_data[ch] += coefr * coefc * at(src, sr, sc2 - 1, ch);
- }
- }
- }
-
- {
- float coefr = (float)(fsr2 - (sr2 - 1)) * _row_scale;
- {
- float coefc = (float)(sc1 + 1 - fsc1) * _col_scale;
- for (size_t ch = 0; ch < CH; ++ch) {
- dst_data[ch] += coefr * coefc * at(src, sr2 - 1, sc1, ch);
- }
- }
- for (int sc = sc1 + 1; sc < sc2 - 1; ++sc) {
- float coefc = _col_scale;
- for (size_t ch = 0; ch < CH; ++ch) {
- dst_data[ch] += coefr * coefc * at(src, sr2 - 1, sc, ch);
- }
- }
- {
- float coefc = (float)(fsc2 - (sc2 - 1)) * _col_scale;
- for (size_t ch = 0; ch < CH; ++ch) {
- dst_data[ch] +=
- coefr * coefc * at(src, sr2 - 1, sc2 - 1, ch);
- }
- }
- }
-
- for (size_t ch = 0; ch < CH; ++ch)
- at(dst, dr, dc, ch) = saturate((int)dst_data[ch], 0, 255);
- }
- }
-
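- // Area kernels, version 2 (wide footprints): threads in a block cooperatively
- // stream the block's source column window, reduce each source column over the
- // row footprint, and scatter the result into per-output shared-memory
- // accumulators with atomicAdd; a source column that straddles an output
- // boundary is split between the two outputs by its covered fractions.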
- template <size_t CH>
- __global__ void resize_area_version2_shrink_32f_kernel(
- const float* src, float* dst, const size_t src_rows,
- const size_t src_cols, const size_t dst_rows, const size_t dst_cols,
- const size_t src_step, const size_t dst_step, const float row_scale,
- const float col_scale, const float _row_scale, const float _col_scale) {
- size_t dc0 = blockIdx.x * blockDim.x;
- size_t dr = blockIdx.y * blockDim.y + threadIdx.y;
- if (dr < dst_rows && dc0 < dst_cols) {
- __shared__ float lc_dst_data[THREADS_Y][THREADS_X * CH];
-
- size_t dc = dc0 + threadIdx.x;
-
- float fsr1 = (float)dr * row_scale;
- float fsr2 = (float)(dr + 1) * row_scale;
- int sr1 = floor(fsr1);
- int sr2 = ceil(fsr2);
-
- float fsc1 = (float)dc0 * col_scale;
- float fsc2 = (float)(dc0 + blockDim.x) * col_scale;
- int sc1 = floor(fsc1);
- int sc2 = ceil(fsc2);
-
- for (size_t ch = 0; ch < CH; ch++)
- lc_dst_data[threadIdx.y][threadIdx.x * CH + ch] = 0;
-
- __syncthreads();
-
- size_t min_col_edge = min((int)src_cols, sc2) * CH;
- for (int sc_address = sc1 * CH + threadIdx.x; sc_address < min_col_edge;
- sc_address += blockDim.x) {
- float sum = 0;
- {
- float coefr = (float)(sr1 + 1 - fsr1) * _row_scale;
- sum += coefr * src[sr1 * src_step + sc_address];
- }
- float coefr = _row_scale;
- for (int sr = sr1 + 1; sr < sr2 - 1; ++sr) {
- sum += coefr * src[sr * src_step + sc_address];
- }
- {
- float coefr = (float)(fsr2 - (sr2 - 1)) * _row_scale;
- sum += coefr * src[(sr2 - 1) * src_step + sc_address];
- }
-
- size_t multi = floor(((sc_address / CH) + 1) * _col_scale);
- float x = ((sc_address / CH) + 1) - multi * col_scale;
- if (x >= 1) {
- atomicAdd(&(lc_dst_data[threadIdx.y]
- [(multi - dc0) * CH + sc_address % CH]),
- sum * _col_scale);
- } else {
- if (multi < dc0 + blockDim.x)
- atomicAdd(&(lc_dst_data[threadIdx.y][(multi - dc0) * CH +
- sc_address % CH]),
- sum * (x * _col_scale));
- if (multi - 1 >= dc0)
- atomicAdd(
- &(lc_dst_data[threadIdx.y][(multi - 1 - dc0) * CH +
- sc_address % CH]),
- sum * ((1 - x) * _col_scale));
- }
- }
-
- __syncthreads();
-
- if (dc < dst_cols) {
- for (size_t ch = 0; ch < CH; ++ch)
- at(dst, dr, dc, ch) =
- lc_dst_data[threadIdx.y][(threadIdx.x) * CH + ch];
- }
- }
- }
-
- template <size_t CH>
- __global__ void resize_area_version2_shrink_8u_kernel(
- const uchar* src, uchar* dst, const size_t src_rows,
- const size_t src_cols, const size_t dst_rows, const size_t dst_cols,
- const size_t src_step, const size_t dst_step, const float row_scale,
- const float col_scale, const float _row_scale, const float _col_scale) {
- size_t dc0 = blockIdx.x * blockDim.x;
- size_t dr = blockIdx.y * blockDim.y + threadIdx.y;
- if (dr < dst_rows && dc0 < dst_cols) {
- __shared__ float lc_dst_data[THREADS_Y][THREADS_X * CH];
-
- size_t dc = dc0 + threadIdx.x;
-
- float fsr1 = (float)dr * row_scale;
- float fsr2 = (float)(dr + 1) * row_scale;
- int sr1 = floor(fsr1);
- int sr2 = ceil(fsr2);
-
- float fsc1 = (float)dc0 * col_scale;
- float fsc2 = (float)(dc0 + blockDim.x) * col_scale;
- int sc1 = floor(fsc1);
- int sc2 = ceil(fsc2);
-
- for (size_t ch = 0; ch < CH; ch++)
- lc_dst_data[threadIdx.y][threadIdx.x * CH + ch] = 0;
-
- __syncthreads();
-
- size_t min_col_edge = min((int)src_cols, sc2) * CH;
- for (int sc_address = sc1 * CH + threadIdx.x; sc_address < min_col_edge;
- sc_address += blockDim.x) {
- float sum = 0;
- {
- float coefr = (float)(sr1 + 1 - fsr1) * _row_scale;
- sum += coefr * src[sr1 * src_step + sc_address];
- }
- float coefr = _row_scale;
- for (int sr = sr1 + 1; sr < sr2 - 1; ++sr) {
- sum += coefr * src[sr * src_step + sc_address];
- }
- {
- float coefr = (float)(fsr2 - (sr2 - 1)) * _row_scale;
- sum += coefr * src[(sr2 - 1) * src_step + sc_address];
- }
-
- size_t multi = floor(((sc_address / CH) + 1) * _col_scale);
- float x = ((sc_address / CH) + 1) - multi * col_scale;
- if (x >= 1) {
- atomicAdd(&(lc_dst_data[threadIdx.y]
- [(multi - dc0) * CH + sc_address % CH]),
- sum * _col_scale);
- } else {
- if (multi < dc0 + blockDim.x)
- atomicAdd(&(lc_dst_data[threadIdx.y][(multi - dc0) * CH +
- sc_address % CH]),
- sum * (x * _col_scale));
- if (multi - 1 >= dc0)
- atomicAdd(
- &(lc_dst_data[threadIdx.y][(multi - 1 - dc0) * CH +
- sc_address % CH]),
- sum * ((1 - x) * _col_scale));
- }
- }
-
- __syncthreads();
-
- if (dc < dst_cols) {
- for (size_t ch = 0; ch < CH; ++ch)
- at(dst, dr, dc, ch) = saturate(
- (int)lc_dst_data[threadIdx.y][(threadIdx.x) * CH + ch],
- 0, 255);
- }
- }
- }
-
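- // Fast-path area kernels for integer scale factors: each output pixel is the
- // plain mean of a cell_rows x cell_cols source block, so no fractional edge
- // weights are needed. In version 1 each thread averages its whole cell.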
- template <size_t CH>
- __global__ void resize_area_version1_shrink_fast_32f_kernel(
- const float* __restrict__ src, float* dst, const size_t src_rows,
- const size_t src_cols, const size_t dst_rows, const size_t dst_cols,
- const size_t src_step, const size_t dst_step, const size_t cell_rows,
- const size_t cell_cols, const float _cell_rows,
- const float _cell_cols) {
- size_t dc = blockIdx.x * blockDim.x + threadIdx.x;
- size_t dr = blockIdx.y * blockDim.y + threadIdx.y;
- if (dr < dst_rows && dc < dst_cols) {
- int sr0 = dr * cell_rows;
- int sc0 = dc * cell_cols;
- float dst_data[CH] = {0};
- for (int sr = sr0; sr < cell_rows + sr0; ++sr) {
- for (int sc = sc0; sc < cell_cols + sc0; ++sc) {
- for (size_t ch = 0; ch < CH; ++ch) {
- dst_data[ch] += at(src, sr, sc, ch);
- }
- }
- }
-
- for (size_t ch = 0; ch < CH; ++ch)
- at(dst, dr, dc, ch) = dst_data[ch] * _cell_rows * _cell_cols;
- }
- }
-
- template <size_t CH>
- __global__ void resize_area_version1_shrink_fast_8u_kernel(
- const uchar* __restrict__ src, uchar* dst, const size_t src_rows,
- const size_t src_cols, const size_t dst_rows, const size_t dst_cols,
- const size_t src_step, const size_t dst_step, const size_t cell_rows,
- const size_t cell_cols, const float _cell_rows,
- const float _cell_cols) {
- size_t dc = blockIdx.x * blockDim.x + threadIdx.x;
- size_t dr = blockIdx.y * blockDim.y + threadIdx.y;
- if (dr < dst_rows && dc < dst_cols) {
- int sr0 = dr * cell_rows;
- int sc0 = dc * cell_cols;
- int dst_data[CH] = {0};
- for (int sr = sr0; sr < cell_rows + sr0; ++sr) {
- for (int sc = sc0; sc < cell_cols + sc0; ++sc) {
- for (size_t ch = 0; ch < CH; ++ch) {
- dst_data[ch] += at(src, sr, sc, ch);
- }
- }
- }
-
- for (size_t ch = 0; ch < CH; ++ch) {
- at(dst, dr, dc, ch) =
- (uchar)(dst_data[ch] * _cell_rows * _cell_cols);
- }
- }
- }
-
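- // Version 2 of the fast path uses the cooperative shared-memory scheme: each
- // thread row-sums individual source columns and atomically accumulates them
- // into the slot of the output pixel that owns them.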
- template <size_t CH>
- __global__ void resize_area_version2_shrink_fast_32f_kernel(
- const float* __restrict__ src, float* dst, const size_t src_rows,
- const size_t src_cols, const size_t dst_rows, const size_t dst_cols,
- const size_t src_step, const size_t dst_step, const size_t cell_rows,
- const size_t cell_cols, const float _cell_rows,
- const float _cell_cols) {
- size_t dc0 = blockIdx.x * blockDim.x;
- size_t dr = blockIdx.y * blockDim.y + threadIdx.y;
- if (dr < dst_rows && dc0 < dst_cols) {
- __shared__ float lc_dst_data[THREADS_Y][THREADS_X * CH];
- int sc0 = dc0 * cell_cols * CH;
- int sr0 = dr * cell_rows;
-
- for (size_t ch = 0; ch < CH; ch++)
- lc_dst_data[threadIdx.y][threadIdx.x * CH + ch] = 0;
-
- __syncthreads();
-
- size_t block_cell_width = cell_cols * CH * blockDim.x;
- for (int i = threadIdx.x, sc = sc0 + threadIdx.x;
- i < block_cell_width && sc < src_cols * CH;
- i += blockDim.x, sc += blockDim.x) {
- float sum = 0;
- for (int j = 0, sr = sr0 * src_step; j < cell_rows;
- j++, sr += src_step)
- sum += src[sr + sc];
- atomicAdd(&(lc_dst_data[threadIdx.y]
- [(i / (cell_cols * CH)) * CH + i % CH]),
- sum);
- }
-
- __syncthreads();
-
- size_t dc = dc0 + threadIdx.x;
- if (dc < dst_cols) {
- for (size_t ch = 0; ch < CH; ++ch)
- at(dst, dr, dc, ch) =
- lc_dst_data[threadIdx.y][threadIdx.x * CH + ch] *
- _cell_rows * _cell_cols;
- }
- }
- }
-
- template <size_t CH>
- __global__ void resize_area_version2_shrink_fast_8u_kernel(
- const uchar* __restrict__ src, uchar* dst, const size_t src_rows,
- const size_t src_cols, const size_t dst_rows, const size_t dst_cols,
- const size_t src_step, const size_t dst_step, const size_t cell_rows,
- const size_t cell_cols, const float _cell_rows,
- const float _cell_cols) {
- size_t dc0 = blockIdx.x * blockDim.x;
- size_t dr = blockIdx.y * blockDim.y + threadIdx.y;
- if (dr < dst_rows && dc0 < dst_cols) {
- __shared__ int lc_dst_data[THREADS_Y][THREADS_X * CH];
- int sc0 = dc0 * cell_cols * CH;
- int sr0 = dr * cell_rows;
-
- for (size_t ch = 0; ch < CH; ch++)
- lc_dst_data[threadIdx.y][threadIdx.x * CH + ch] = 0;
-
- __syncthreads();
-
- size_t block_cell_width = cell_cols * CH * blockDim.x;
- for (int i = threadIdx.x, sc = sc0 + threadIdx.x;
- i < block_cell_width && sc < src_cols * CH;
- i += blockDim.x, sc += blockDim.x) {
- int sum = 0;
- for (int j = 0, sr = sr0 * src_step; j < cell_rows;
- j++, sr += src_step)
- sum += src[sr + sc];
- atomicAdd(&(lc_dst_data[threadIdx.y]
- [(i / (cell_cols * CH)) * CH + i % CH]),
- sum);
- }
-
- __syncthreads();
-
- size_t dc = dc0 + threadIdx.x;
- if (dc < dst_cols) {
- for (size_t ch = 0; ch < CH; ++ch)
- at(dst, dr, dc, ch) = (uchar)(
- lc_dst_data[threadIdx.y][threadIdx.x * CH + ch] *
- _cell_rows * _cell_cols);
- }
- }
- }
-
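- // Launcher for area mode. True shrinks pick between version 1 (small column
- // footprint, prefer L1) and version 2 (wide footprint, prefer shared memory),
- // using the fast kernels when both scale factors are integers; enlargement
- // falls back to the linear path.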
- template <typename T, size_t CH>
- void resize_area_proxy(const T* src, T* dst, const size_t src_rows,
- const size_t src_cols, const size_t dst_rows,
- const size_t dst_cols, const size_t src_step,
- const size_t dst_step, void* workspace,
- cudaStream_t stream) {
- dim3 THREADS(THREADS_X, THREADS_Y, 1);
-
- float row_scale = (float)src_rows / dst_rows;
- float col_scale = (float)src_cols / dst_cols;
-
- if (src_rows > dst_rows && src_cols > dst_cols) {
- if (src_rows % dst_rows == 0 && src_cols % dst_cols == 0) {
- dim3 BLOCKS(DIVUP(dst_cols, THREADS.x), DIVUP(dst_rows, THREADS.y));
-
- if (sizeof(T) == sizeof(float)) {
- if ((CH == 1 && (sizeof(T) * CH * col_scale <= 24)) ||
- (CH == 3 && (sizeof(T) * CH * col_scale <= 36))) {
- cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
- resize_area_version1_shrink_fast_32f_kernel<CH>
- <<<BLOCKS, THREADS, 0, stream>>>(
- (const float*)src, (float*)dst, src_rows,
- src_cols, dst_rows, dst_cols, src_step,
- dst_step, (size_t)row_scale,
- (size_t)col_scale, (float)1 / row_scale,
- (float)1 / col_scale);
- } else {
- cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
- resize_area_version2_shrink_fast_32f_kernel<CH>
- <<<BLOCKS, THREADS, 0, stream>>>(
- (const float*)src, (float*)dst, src_rows,
- src_cols, dst_rows, dst_cols, src_step,
- dst_step, (size_t)row_scale,
- (size_t)col_scale, (float)1 / row_scale,
- (float)1 / col_scale);
- }
-
- } else {
- if (sizeof(T) * CH * col_scale <= 24) {
- cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
- resize_area_version1_shrink_fast_8u_kernel<CH>
- <<<BLOCKS, THREADS, 0, stream>>>(
- (const uchar*)src, (uchar*)dst, src_rows,
- src_cols, dst_rows, dst_cols, src_step,
- dst_step, (size_t)row_scale,
- (size_t)col_scale, (float)1 / row_scale,
- (float)1 / col_scale);
- } else {
- cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
- resize_area_version2_shrink_fast_8u_kernel<CH>
- <<<BLOCKS, THREADS, 0, stream>>>(
- (const uchar*)src, (uchar*)dst, src_rows,
- src_cols, dst_rows, dst_cols, src_step,
- dst_step, (size_t)row_scale,
- (size_t)col_scale, (float)1 / row_scale,
- (float)1 / col_scale);
- }
- }
-
- } else {
- size_t access_step = (int)(sizeof(T) * CH * col_scale);
- if (access_step <= 24) {
- dim3 BLOCKS(DIVUP(dst_cols, THREADS.x),
- DIVUP(dst_rows, THREADS.y));
-
- cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
-
- if (sizeof(T) == sizeof(float)) {
- resize_area_version1_shrink_32f_kernel<CH>
- <<<BLOCKS, THREADS, 0, stream>>>(
- (const float*)src, (float*)dst, src_rows,
- src_cols, dst_rows, dst_cols, src_step,
- dst_step, row_scale, col_scale,
- (float)1 / row_scale, (float)1 / col_scale);
- } else {
- resize_area_version1_shrink_8u_kernel<CH>
- <<<BLOCKS, THREADS, 0, stream>>>(
- (const uchar*)src, (uchar*)dst, src_rows,
- src_cols, dst_rows, dst_cols, src_step,
- dst_step, row_scale, col_scale,
- (float)1 / row_scale, (float)1 / col_scale);
- }
-
- } else if (access_step > 24) {
- dim3 BLOCKS(DIVUP(dst_cols, THREADS.x),
- DIVUP(dst_rows, THREADS.y));
-
- cudaDeviceSetCacheConfig(cudaFuncCachePreferNone);
-
- if (sizeof(T) == sizeof(float)) {
- resize_area_version2_shrink_32f_kernel<CH>
- <<<BLOCKS, THREADS, 0, stream>>>(
- (const float*)src, (float*)dst, src_rows,
- src_cols, dst_rows, dst_cols, src_step,
- dst_step, row_scale, col_scale,
- (float)1 / row_scale, (float)1 / col_scale);
- } else {
- resize_area_version2_shrink_8u_kernel<CH>
- <<<BLOCKS, THREADS, 0, stream>>>(
- (const uchar*)src, (uchar*)dst, src_rows,
- src_cols, dst_rows, dst_cols, src_step,
- dst_step, row_scale, col_scale,
- (float)1 / row_scale, (float)1 / col_scale);
- }
- }
- }
- } else {
- resize_linear_proxy<T, CH>(src, dst, src_rows, src_cols, dst_rows,
- dst_cols, src_step, dst_step, workspace,
- stream);
- }
- }
-
- } // anonymous namespace
-
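- // Public entry point: dispatches on interpolation mode and channel count
- // (1 or 3) to the per-mode launchers above.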
- template <typename T>
- void megdnn::cuda::resize::resize_cv(
- const T* src, T* dst, const size_t src_rows, const size_t src_cols,
- const size_t dst_rows, const size_t dst_cols, const size_t src_step,
- const size_t dst_step, size_t ch, InterpolationMode imode,
- void* workspace, cudaStream_t stream) {
- megdnn_assert(ch == 1 || ch == 3);
- #define cb(_mode, _MODE) \
- case INTER_##_MODE: { \
- if (ch == 1) { \
- resize_##_mode##_proxy<T, 1>(src, dst, src_rows, src_cols, \
- dst_rows, dst_cols, src_step, \
- dst_step, workspace, stream); \
- } else { \
- resize_##_mode##_proxy<T, 3>(src, dst, src_rows, src_cols, \
- dst_rows, dst_cols, src_step, \
- dst_step, workspace, stream); \
- } \
- break; \
- }
-
- switch (imode) {
- cb(nearest, NEAREST);
- cb(linear, LINEAR);
- cb(cubic, CUBIC);
- cb(lanczos4, LANCZOS4);
- cb(area, AREA);
- default:
- megdnn_throw("unsupported interpolation mode");
- break;
- }
- #undef cb
- }
-
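- // A minimal host-side usage sketch (illustrative only: the sizes, buffer
- // setup, workspace sizing, and the unqualified INTER_CUBIC spelling are
- // assumptions, not something this file defines):
- //
- // float *d_src, *d_dst; // 480x640x3 source, 240x320x3 destination
- // void* d_ws; // workspace for the precomputed coefficients
- // cudaStream_t stream;
- // // ... cudaMalloc the buffers, upload the source, create the stream ...
- // megdnn::cuda::resize::resize_cv<float>(
- // d_src, d_dst, 480, 640, 240, 320, /*src_step=*/640 * 3,
- // /*dst_step=*/320 * 3, /*ch=*/3, INTER_CUBIC, d_ws, stream);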
- #define INST(_type) \
- template void megdnn::cuda::resize::resize_cv<_type>( \
- const _type* src, _type* dst, const size_t src_rows, \
- const size_t src_cols, const size_t dst_rows, \
- const size_t dst_cols, const size_t src_step, \
- const size_t dst_step, size_t ch, InterpolationMode imode, \
- void* workspace, cudaStream_t stream);
-
- INST(float);
- INST(uchar);
-
- #undef INST
-
- // vim: syntax=cpp.doxygen