wangwei
/
MegEngine

 
			
			   
				 
					
						
						
							
							/**
 * \file dnn/src/cuda/remap/backward_data.cu
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include <cuda_runtime.h>
#include "src/common/rounding_converter.cuh"
#include "src/cuda/cv/kernel_common.cuh"
#include "src/cuda/remap/common.h"
#include "src/cuda/utils.cuh"

using namespace megdnn;
using namespace cuda;
using namespace remap;
using namespace rounding;

namespace {

/*!
 * \brief Flat element offset of position (channel, height, width) inside one
 *        image whose extents are c x h x w.
 *
 * Only declared here; each tensor format supplies its own specialization.
 */
template <const uint32_t format>
__device__ inline int get_offset(int height, int width, int channel, int h,
                                 int w, int c);

//! NCHW layout: each channel is a plane of h * w elements and each row holds
//! w elements. The channel count \p c does not enter the computation.
template <>
__device__ inline int get_offset<param_enumv::Remap::Format::NCHW>(
        int height, int width, int channel, int h, int w, int c) {
    const int plane_offset = channel * h * w;
    const int row_offset = height * w;
    return plane_offset + row_offset + width;
}

/*!
 * \brief Resolves a source coordinate to a flat offset for the given border
 *        mode.
 *
 * Both coordinates are first passed through megcv::border_interpolate<bmode>
 * against their extent (h for the row, w for the column); the remapped pair
 * is then converted to an offset by get_offset<format>.
 */
template <typename ctype, const uint32_t format, ::BorderMode bmode>
struct GetSrcData {
    __device__ static inline int get_index(int height, int width, int channel,
                                           int h, int w, int c) {
        const int src_row = megcv::border_interpolate<bmode>(height, h);
        const int src_col = megcv::border_interpolate<bmode>(width, w);
        return get_offset<format>(src_row, src_col, channel, h, w, c);
    }
};

/*!
 * \brief BORDER_CONSTANT variant: coordinates are not remapped.
 *
 * Returns the flat offset when (height, width) lies inside the h x w image
 * and -1 otherwise, so the caller can skip that position.
 */
template <typename ctype, const uint32_t format>
struct GetSrcData<ctype, format, ::BorderMode::BORDER_CONSTANT> {
    __device__ static inline int get_index(int height, int width, int channel,
                                           int h, int w, int c) {
        const bool row_outside = height < 0 || height >= h;
        const bool col_outside = width < 0 || width >= w;
        if (row_outside || col_outside) {
            return -1;
        }
        return get_offset<format>(height, width, channel, h, w, c);
    }
};

/*!
 * \brief Backward-data kernel for bilinear remap: every output gradient is
 *        scattered onto the four source pixels surrounding its sampling
 *        position, weighted by the bilinear coefficients.
 *
 * Thread layout: x indexes the output column, y the output row, and
 * blockIdx.z selects the image inside the batch slice handed to this launch.
 * Threads outside the OW x OH output do nothing.
 *
 * \param grad   gradient w.r.t. the source image(s); only ever accumulated
 *               into via atomic_add, so it must be zeroed before the launch
 * \param map_xy per output pixel, a (column, row) pair of float source
 *               coordinates
 * \param diff   gradient w.r.t. the output image(s)
 * \param C      number of channels
 * \param IH     source height
 * \param IW     source width
 * \param OH     output height
 * \param OW     output width
 */
template <typename ctype, const uint32_t format, ::BorderMode bmode>
__global__ void kern_general(ctype* __restrict grad, const float* map_xy,
                             const ctype* diff, int C, int IH, int IW, int OH,
                             int OW) {
    int ow = blockIdx.x * blockDim.x + threadIdx.x;
    int oh = blockIdx.y * blockDim.y + threadIdx.y;
    // Widen the batch index before multiplying: blockIdx.z * C * IH * IW would
    // otherwise be evaluated in 32-bit unsigned arithmetic and wrap around
    // for large batch/image products.
    const size_t batch = blockIdx.z;
    grad += batch * C * IH * IW;
    diff += batch * C * OH * OW;
    map_xy += batch * 2 * OH * OW;
    RoundingConverter<ctype> round_converter;

    if (ow < OW && oh < OH) {
        float index_col = map_xy[oh * OW * 2 + ow * 2 + 0];
        float index_row = map_xy[oh * OW * 2 + ow * 2 + 1];
        int col = static_cast<int>(floorf(index_col));
        int row = static_cast<int>(floorf(index_row));
        float v = index_col - col;  // fractional offset along the width
        float u = index_row - row;  // fractional offset along the height
        const float one = 1.f;
        // The bilinear weights depend only on the sampling position, so they
        // are computed once instead of once per channel.
        const float w00 = (one - u) * (one - v);
        const float w01 = (one - u) * v;
        const float w10 = u * (one - v);
        const float w11 = u * v;
        for (int c = 0; c < C; ++c) {
            float hidden = static_cast<float>(
                    diff[get_offset<format>(oh, ow, c, OH, OW, C)]);

            // An index of -1 marks a neighbour that must be skipped.
            int a00 = GetSrcData<ctype, format, bmode>::get_index(
                    row + 0, col + 0, c, IH, IW, C);
            if (a00 != -1) {
                atomic_add(grad + a00, round_converter(w00 * hidden));
            }

            int a01 = GetSrcData<ctype, format, bmode>::get_index(
                    row + 0, col + 1, c, IH, IW, C);
            if (a01 != -1) {
                atomic_add(grad + a01, round_converter(w01 * hidden));
            }

            int a10 = GetSrcData<ctype, format, bmode>::get_index(
                    row + 1, col + 0, c, IH, IW, C);
            if (a10 != -1) {
                atomic_add(grad + a10, round_converter(w10 * hidden));
            }

            // Uses the `format` template parameter like the other three
            // neighbours (was hard-coded to NCHW).
            int a11 = GetSrcData<ctype, format, bmode>::get_index(
                    row + 1, col + 1, c, IH, IW, C);
            if (a11 != -1) {
                atomic_add(grad + a11, round_converter(w11 * hidden));
            }
        }
    }
}

/*!
 * \brief Zeroes \p grad and launches kern_general over the whole batch on
 *        \p stream.
 *
 * The batch index is mapped to the grid's z dimension, which CUDA limits to
 * 65535 blocks, so the batch is processed in slices of at most that many
 * images. Each slice's part of \p grad is cleared first because the kernel
 * only accumulates into it.
 *
 * \param grad   gradient w.r.t. the source, N x C x IH x IW elements
 * \param map_xy sampling coordinates, N x OH x OW x 2 floats
 * \param diff   gradient w.r.t. the output, N x C x OH x OW elements
 * \param N      batch size; nothing is done when it is not positive
 * \param C      number of channels
 * \param IH     source height
 * \param IW     source width
 * \param OH     output height
 * \param OW     output width
 * \param stream stream on which the memset and the kernel are enqueued
 */
template <typename ctype, const uint32_t format, ::BorderMode bmode>
void dispatch_backwarddata(ctype* grad, const float* map_xy, const ctype* diff,
                           int N, int C, int IH, int IW, int OH, int OW,
                           cudaStream_t stream) {
    const int BX = 32, BY = 16;
    const int max_batch_size = 65535;
    const dim3 threads(BX, BY);

    // Per-image element counts, widened once so that all pointer and byte
    // arithmetic below happens in size_t.
    const size_t grad_image_size = static_cast<size_t>(C) * IH * IW;
    const size_t diff_image_size = static_cast<size_t>(C) * OH * OW;
    const size_t map_image_size = static_cast<size_t>(2) * OH * OW;

    // A grid with a zero-sized dimension is an invalid launch configuration;
    // with an empty output there is nothing to scatter, but grad must still
    // be cleared.
    const bool has_output = OH > 0 && OW > 0;

    while (N > 0) {
        const int curr_batch_size = N < max_batch_size ? N : max_batch_size;

        cuda_check(cudaMemsetAsync(
                grad, 0, sizeof(ctype) * curr_batch_size * grad_image_size,
                stream));
        if (has_output) {
            dim3 blocks((OW + BX - 1) / BX, (OH + BY - 1) / BY,
                        curr_batch_size);
            kern_general<ctype, format, bmode><<<blocks, threads, 0, stream>>>(
                    grad, map_xy, diff, C, IH, IW, OH, OW);
        }

        N -= curr_batch_size;
        grad += curr_batch_size * grad_image_size;
        diff += curr_batch_size * diff_image_size;
        map_xy += curr_batch_size * map_image_size;
    }
}

}  // anonymous namespace

namespace megdnn {
namespace cuda {
namespace remap {

/*!
 * \brief Externally visible entry point of the remap backward-data pass.
 *
 * Forwards all arguments unchanged to dispatch_backwarddata and then calls
 * after_kernel_launch().
 */
template <typename ctype, const uint32_t format, ::BorderMode bmode>
void backwarddata_proxy(ctype* grad, const float* map_xy, const ctype* diff,
                        int N, int C, int IH, int IW, int OH, int OW,
                        cudaStream_t stream) {
    // Batching, clearing of grad and the kernel launches all live in the
    // dispatcher.
    dispatch_backwarddata<ctype, format, bmode>(
            grad, map_xy, diff, N, C, IH, IW, OH, OW, stream);
    // NOTE(review): after_kernel_launch() is declared outside this file;
    // presumably it reports launch errors - confirm.
    after_kernel_launch();
}

// Explicitly instantiates backwarddata_proxy for one combination of element
// type, tensor format and border mode. `format` and `bmode` are bare
// enumerator names; the macro prefixes them with param_enumv::Remap::Format::
// and ::BorderMode:: respectively.
#define INST(ctype, format, bmode)                                            \
    template void backwarddata_proxy<                                         \
            ctype, param_enumv::Remap::Format::format, ::BorderMode::bmode>(  \
            ctype*, const float*, const ctype*, int, int, int, int, int, int, \
            cudaStream_t);

// Instantiates the NCHW variant of every border mode listed below for the
// given element type.
#define FOR_FORMAT_BMODE(ctype)           \
    INST(ctype, NCHW, BORDER_CONSTANT)    \
    INST(ctype, NCHW, BORDER_REPLICATE)   \
    INST(ctype, NCHW, BORDER_REFLECT)     \
    INST(ctype, NCHW, BORDER_REFLECT_101) \
    INST(ctype, NCHW, BORDER_WRAP)

// Element types provided by this translation unit: float and dt_bfloat16.
// NOTE(review): MEGDNN_INC_FLOAT16 is defined elsewhere; presumably it emits
// its argument only when half-precision support is compiled in - confirm.
FOR_FORMAT_BMODE(float)
MEGDNN_INC_FLOAT16(FOR_FORMAT_BMODE(dt_bfloat16))

// Keep the helper macros local to this file.
#undef FOR_FORMAT_BMODE
#undef INST

}  // namespace remap
}  // namespace cuda
}  // namespace megdnn

// vim: syntax=cpp.doxygen