GitOrigin-RevId: f36495a46a
tags/v0.4.0
| @@ -14,43 +14,7 @@ | |||||
| #include "src/cuda/utils.h" | #include "src/cuda/utils.h" | ||||
| #include "src/cuda/handle.h" | #include "src/cuda/handle.h" | ||||
| namespace megdnn { | |||||
| namespace cuda { | |||||
| namespace local { | |||||
| void check_input(size_t N, | |||||
| size_t IC, size_t IH, size_t IW, | |||||
| size_t OC, size_t OH, size_t OW, | |||||
| size_t FH, size_t FW, | |||||
| size_t INs, size_t ONs, | |||||
| size_t PH, size_t PW, | |||||
| size_t SH, size_t SW, | |||||
| bool is_xcorr) | |||||
| { | |||||
| megdnn_ignore(N); | |||||
| megdnn_ignore(IC); | |||||
| megdnn_ignore(IH); | |||||
| megdnn_ignore(IW); | |||||
| megdnn_ignore(OC); | |||||
| megdnn_ignore(OH); | |||||
| megdnn_ignore(OW); | |||||
| megdnn_ignore(FH); | |||||
| megdnn_ignore(FW); | |||||
| megdnn_ignore(INs); | |||||
| megdnn_ignore(ONs); | |||||
| megdnn_ignore(PH); | |||||
| megdnn_ignore(PW); | |||||
| megdnn_ignore(SH); | |||||
| megdnn_ignore(SW); | |||||
| megdnn_ignore(is_xcorr); | |||||
| // shared memory constraint | |||||
| megdnn_assert(IH*IW <= 768, "spatial size should not be larger than 768."); | |||||
| // megdnn_assert(4 * 4 * 4 * IH * IW <= 49152); | |||||
| } | |||||
| } // namespace local | |||||
| } // namespace cuda | |||||
| } // namespace megdnn | |||||
| #include "src/common/utils.cuh" | |||||
| namespace megdnn { | namespace megdnn { | ||||
| namespace cuda { | namespace cuda { | ||||
| @@ -94,13 +58,9 @@ void LocalForwardImpl::exec(_megdnn_tensor_in src, | |||||
| param().stride_h, param().stride_w, | param().stride_h, param().stride_w, | ||||
| cublas, stream, | cublas, stream, | ||||
| one, zero); | one, zero); | ||||
| } else { | |||||
| local::check_input(N, IC, IH, IW, OC, OH, OW, FH, FW, | |||||
| IC*IH*IW, OC*OH*OW, | |||||
| param().pad_h, param().pad_w, | |||||
| param().stride_h, param().stride_w, | |||||
| is_xcorr); | |||||
| local::forward_proxy_weiming(src.ptr<dt_float32>(), | |||||
| } else if (local::forward_proxy_default_share_mem_in_bytes(IH, IW) <= | |||||
| handle->device_prop().sharedMemPerBlock) { | |||||
| local::forward_proxy_default(src.ptr<dt_float32>(), | |||||
| filter.ptr<dt_float32>(), | filter.ptr<dt_float32>(), | ||||
| dst.ptr<dt_float32>(), | dst.ptr<dt_float32>(), | ||||
| N, | N, | ||||
| @@ -112,6 +72,11 @@ void LocalForwardImpl::exec(_megdnn_tensor_in src, | |||||
| param().stride_h, param().stride_w, | param().stride_h, param().stride_w, | ||||
| is_xcorr, | is_xcorr, | ||||
| stream); | stream); | ||||
| } else { | |||||
| megdnn_throw(ssprintf( | |||||
| "No usable kernel for local conv, src: %s filter: %s \n", | |||||
| src.layout.to_string().c_str(), | |||||
| filter.layout.to_string().c_str())); | |||||
| } | } | ||||
| } | } | ||||
| @@ -18,6 +18,12 @@ namespace megdnn { | |||||
| namespace cuda { | namespace cuda { | ||||
| namespace local { | namespace local { | ||||
| constexpr size_t Ns = 4, ICs = 4; | |||||
| size_t forward_proxy_default_share_mem_in_bytes(size_t IH, size_t IW) { | |||||
| return Ns * ICs * sizeof(float) * IH * IW; | |||||
| } | |||||
| // blockIdx.y is OC*OH*OW/1024 | // blockIdx.y is OC*OH*OW/1024 | ||||
| // blockIdx.x is N/4 | // blockIdx.x is N/4 | ||||
| // threadIdx.x is [0, 1024) | // threadIdx.x is [0, 1024) | ||||
| @@ -96,7 +102,7 @@ __global__ void forward_kernel(const float * __restrict__ src, | |||||
| } | } | ||||
| } | } | ||||
| void forward_proxy_weiming(const float *src, const float *filter, float *dst, | |||||
| void forward_proxy_default(const float *src, const float *filter, float *dst, | |||||
| size_t N, | size_t N, | ||||
| size_t IC, size_t IH, size_t IW, | size_t IC, size_t IH, size_t IW, | ||||
| size_t OC, size_t OH, size_t OW, | size_t OC, size_t OH, size_t OW, | ||||
| @@ -108,7 +114,6 @@ void forward_proxy_weiming(const float *src, const float *filter, float *dst, | |||||
| cudaStream_t stream) | cudaStream_t stream) | ||||
| { | { | ||||
| size_t threads = 256; | size_t threads = 256; | ||||
| const size_t Ns = 4, ICs = 4; | |||||
| dim3 blocks = dim3(DIVUP(N, Ns), DIVUP(OC*OH*OW, threads)); | dim3 blocks = dim3(DIVUP(N, Ns), DIVUP(OC*OH*OW, threads)); | ||||
| if (is_xcorr) { | if (is_xcorr) { | ||||
| forward_kernel<Ns, ICs, true><<<blocks, threads, | forward_kernel<Ns, ICs, true><<<blocks, threads, | ||||
| @@ -17,17 +17,10 @@ namespace megdnn { | |||||
| namespace cuda { | namespace cuda { | ||||
| namespace local { | namespace local { | ||||
| void check_input(size_t N, | |||||
| size_t IC, size_t IH, size_t IW, | |||||
| size_t OC, size_t OH, size_t OW, | |||||
| size_t FH, size_t FW, | |||||
| size_t INs, size_t ONs, | |||||
| size_t PH, size_t PW, | |||||
| size_t SH, size_t SW, | |||||
| bool is_xcorr); | |||||
| size_t forward_proxy_default_share_mem_in_bytes(size_t IH, size_t IW); | |||||
| void forward_proxy_weiming(const float *src, const float *filter, float *dst, | |||||
| size_t N, | |||||
| void forward_proxy_default(const float *src, const float *filter, float *dst, | |||||
| size_t N, | |||||
| size_t IC, size_t IH, size_t IW, | size_t IC, size_t IH, size_t IW, | ||||
| size_t OC, size_t OH, size_t OW, | size_t OC, size_t OH, size_t OW, | ||||
| size_t FH, size_t FW, | size_t FH, size_t FW, | ||||
| @@ -39,7 +32,7 @@ void forward_proxy_weiming(const float *src, const float *filter, float *dst, | |||||
| /// forward | /// forward | ||||
| bool can_forward_proxy_convnet(size_t N, | |||||
| bool can_forward_proxy_convnet(size_t N, | |||||
| size_t IC, size_t IH, size_t IW, | size_t IC, size_t IH, size_t IW, | ||||
| size_t OC, size_t OH, size_t OW, | size_t OC, size_t OH, size_t OW, | ||||
| size_t FH, size_t FW, | size_t FH, size_t FW, | ||||
| @@ -70,7 +63,7 @@ size_t get_workspace_in_floats_forward_proxy_convnet(size_t N, | |||||
| /// bwd data | /// bwd data | ||||
| bool can_backward_data_proxy_convnet(size_t N, | |||||
| bool can_backward_data_proxy_convnet(size_t N, | |||||
| size_t IC, size_t IH, size_t IW, | size_t IC, size_t IH, size_t IW, | ||||
| size_t OC, size_t OH, size_t OW, | size_t OC, size_t OH, size_t OW, | ||||
| size_t FH, size_t FW, | size_t FH, size_t FW, | ||||
| @@ -78,7 +71,7 @@ bool can_backward_data_proxy_convnet(size_t N, | |||||
| size_t PH, size_t PW, | size_t PH, size_t PW, | ||||
| size_t SH, size_t SW); | size_t SH, size_t SW); | ||||
| void backward_data_proxy_convnet(const float *filter, | |||||
| void backward_data_proxy_convnet(const float *filter, | |||||
| const float *diff, | const float *diff, | ||||
| float *grad, | float *grad, | ||||
| float *workspace, | float *workspace, | ||||
| @@ -103,7 +96,7 @@ size_t get_workspace_in_floats_backward_data_proxy_convnet(size_t N, | |||||
| /// bwd filter | /// bwd filter | ||||
| bool can_backward_filter_proxy_convnet(size_t N, | |||||
| bool can_backward_filter_proxy_convnet(size_t N, | |||||
| size_t IC, size_t IH, size_t IW, | size_t IC, size_t IH, size_t IW, | ||||
| size_t OC, size_t OH, size_t OW, | size_t OC, size_t OH, size_t OW, | ||||
| size_t FH, size_t FW, | size_t FH, size_t FW, | ||||