| @@ -14,6 +14,7 @@ | |||||
| #include "src/cuda/utils.h" | #include "src/cuda/utils.h" | ||||
| #include "src/cuda/cudnn_wrapper.h" | #include "src/cuda/cudnn_wrapper.h" | ||||
| #include "src/cuda/convolution/helper.h" | #include "src/cuda/convolution/helper.h" | ||||
| #include "src/cuda/conv_bias/helper.h" | |||||
| using namespace megdnn; | using namespace megdnn; | ||||
| using namespace cuda; | using namespace cuda; | ||||
| @@ -31,27 +32,16 @@ bool ConvolutionBackwardDataImpl::AlgoCUDNN::is_available( | |||||
| CUDNNBwdDataDescs D; | CUDNNBwdDataDescs D; | ||||
| if (!is_cudnn_supported(args.as_fwd_args())) | |||||
| TensorLayout bias_layout, z_layout; | |||||
| conv_bias::CanonizedFilterMeta meta; | |||||
| meta.copy_from(args.filter_meta); | |||||
| conv_bias::BiasForwardSizeArgs bias_args{args.handle, | |||||
| args.grad_layout, args.filter_layout, &bias_layout, | |||||
| &z_layout, meta, args.diff_layout, param::ConvBias::NonlineMode::IDENTITY, | |||||
| }; | |||||
| if (!conv_bias::is_cudnn_supported(bias_args)) | |||||
| return false; | return false; | ||||
| #if CUDNN_VERSION >= 7500 | |||||
| // As in cuda10.0 and cudnn7.5, algo CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 with | |||||
| // TensorCore operations produces incorrect result. So we disable | |||||
| // this algo. Please remove the following code, when | |||||
| // nvidia has fixed this issue. | |||||
| // incorrect case: | |||||
| // inp={2x8x18x18}, kern={8x8x2x2}, pad_h=pad_w=2, stride_h=stride_w=2, | |||||
| // dtype=float16 | |||||
| if (args.filter_meta.dtype == dtype::Float16()) { | |||||
| const char* algo_1 = "CUDNN_CONVOLUTION_BWD_DATA_ALGO_1"; | |||||
| auto cmp_len = strlen(algo_1); | |||||
| if (is_compute_capability_required(7, 0) && | |||||
| strncmp(name(), algo_1, cmp_len) == 0) { | |||||
| return false; | |||||
| } | |||||
| } | |||||
| #endif | |||||
| auto& cudnn = args.handle->cudnn(); | auto& cudnn = args.handle->cudnn(); | ||||
| args.init_desc(D); | args.init_desc(D); | ||||
| size_t workspace_size; | size_t workspace_size; | ||||
| @@ -14,6 +14,7 @@ | |||||
| #include "src/cuda/utils.h" | #include "src/cuda/utils.h" | ||||
| #include "src/cuda/cudnn_wrapper.h" | #include "src/cuda/cudnn_wrapper.h" | ||||
| #include "src/cuda/convolution/helper.h" | #include "src/cuda/convolution/helper.h" | ||||
| #include "src/cuda/conv_bias/helper.h" | |||||
| using namespace megdnn; | using namespace megdnn; | ||||
| using namespace cuda; | using namespace cuda; | ||||
| @@ -31,7 +32,14 @@ bool ConvolutionBackwardFilterImpl::AlgoCUDNN::is_available( | |||||
| auto& cudnn = args.handle->cudnn(); | auto& cudnn = args.handle->cudnn(); | ||||
| CUDNNBwdFilterDescs D; | CUDNNBwdFilterDescs D; | ||||
| if (!is_cudnn_supported(args.as_fwd_args())) | |||||
| TensorLayout bias_layout, z_layout; | |||||
| conv_bias::CanonizedFilterMeta meta; | |||||
| meta.copy_from(args.grad_filter_meta); | |||||
| conv_bias::BiasForwardSizeArgs bias_args{args.handle, | |||||
| args.src_layout, args.grad_layout, &bias_layout, | |||||
| &z_layout, meta, args.diff_layout, param::ConvBias::NonlineMode::IDENTITY, | |||||
| }; | |||||
| if (!conv_bias::is_cudnn_supported(bias_args)) | |||||
| return false; | return false; | ||||
| args.init_desc(D); | args.init_desc(D); | ||||
| @@ -33,7 +33,8 @@ bool convolution::is_cudnn_supported(const ForwardSizeArgs &args) { | |||||
| args.dst_layout->dtype.enumv() != DTypeEnum::QuantizedS8) { | args.dst_layout->dtype.enumv() != DTypeEnum::QuantizedS8) { | ||||
| return false; | return false; | ||||
| } | } | ||||
| } else if (args.filter_meta.format != param::Convolution::Format::NCHW) { | |||||
| } else if (args.filter_meta.format != param::Convolution::Format::NCHW && | |||||
| args.filter_meta.format != param::Convolution::Format::NHWC) { | |||||
| return false; | return false; | ||||
| } | } | ||||
| auto& fm = args.filter_meta; | auto& fm = args.filter_meta; | ||||
| @@ -284,6 +284,16 @@ std::vector<TestArg> convolution::get_args_cudnn_5_1_failures() { | |||||
| return args; | return args; | ||||
| } | } | ||||
| std::vector<TestArg> convolution::get_args_cudnn_5_1_backward() { | |||||
| std::vector<TestArg> args; | |||||
| args.emplace_back( | |||||
| param::Convolution{param::Convolution::Mode::CROSS_CORRELATION, 2, | |||||
| 2, 2, 2}, | |||||
| TensorShape{2, 8, 18, 18}, TensorShape{8, 8, 2, 2}); | |||||
| return args; | |||||
| } | |||||
| std::vector<TestArg> convolution::get_args_x86_winograd_algorithm() { | std::vector<TestArg> convolution::get_args_x86_winograd_algorithm() { | ||||
| std::vector<TestArg> args; | std::vector<TestArg> args; | ||||
| for (size_t ic_size : {8, 16}) { | for (size_t ic_size : {8, 16}) { | ||||
| @@ -40,6 +40,7 @@ std::vector<TestArg> get_args_x86_direct_case_2(); | |||||
| std::vector<TestArg> get_args_fallback_templated_impl(); | std::vector<TestArg> get_args_fallback_templated_impl(); | ||||
| std::vector<TestArg> get_args_fallback_non_templated_impl(); | std::vector<TestArg> get_args_fallback_non_templated_impl(); | ||||
| std::vector<TestArg> get_args_cudnn_5_1_failures(); | std::vector<TestArg> get_args_cudnn_5_1_failures(); | ||||
| std::vector<TestArg> get_args_cudnn_5_1_backward(); | |||||
| std::vector<TestArg> get_args_x86_winograd_algorithm(); | std::vector<TestArg> get_args_x86_winograd_algorithm(); | ||||
| std::vector<TestArg> get_args_BRAIN_481(); | std::vector<TestArg> get_args_BRAIN_481(); | ||||
| std::vector<TestArg> get_args(); | std::vector<TestArg> get_args(); | ||||
| @@ -238,6 +238,87 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) { | |||||
| } | } | ||||
| } | } | ||||
| TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FP16_CUDNN7_5) { | |||||
| // algo CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 with | |||||
| // TensorCore operations produces incorrect result. | |||||
| // Maybe nvidia has fixed this issue | |||||
| // There is a test using incorrect case: | |||||
| // inp={2x8x18x18}, kern={8x8x2x2}, pad_h=pad_w=2, stride_h=stride_w=2, | |||||
| // dtype=float16 | |||||
| using namespace convolution; | |||||
| std::vector<TestArg> args = get_args_cudnn_5_1_backward(); | |||||
| Checker<ConvolutionBackwardData> checker(handle_cuda()); | |||||
| NormalRNG default_rng; | |||||
| for (auto&& arg : args) { | |||||
| float scale = | |||||
| 128.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]); | |||||
| scale = std::max(scale, 1.f); | |||||
| UniformFloatRNG rng(scale, 2 * scale); | |||||
| arg.param.format = param::Convolution::Format::NHWC; | |||||
| arg.src = cvt_src_or_dst_nchw2nhwc(arg.src); | |||||
| arg.filter = cvt_filter_nchw2nhwc(arg.filter); | |||||
| auto src = TensorLayout(arg.src, dtype::Float32()); | |||||
| auto filter = TensorLayout(arg.filter, dtype::Float32()); | |||||
| TensorLayout dst; | |||||
| { | |||||
| auto opr = handle_cuda()->create_operator<Convolution>(); | |||||
| opr->param() = arg.param; | |||||
| opr->deduce_layout(src, filter, dst); | |||||
| } | |||||
| src.dtype = dst.dtype = filter.dtype = dtype::Float16(); | |||||
| arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32; | |||||
| checker.set_rng(0, &rng) | |||||
| .set_rng(1, &rng) | |||||
| .set_epsilon(1e-2) | |||||
| .set_param(arg.param) | |||||
| .exec(TensorLayoutArray{filter, dst, src}); | |||||
| src.dtype = dst.dtype = filter.dtype = dtype::Float32(); | |||||
| arg.param.compute_mode = param::Convolution::ComputeMode::DEFAULT; | |||||
| checker.set_rng(0, &rng) | |||||
| .set_rng(1, &rng) | |||||
| .set_epsilon(1e-2) | |||||
| .set_param(arg.param) | |||||
| .exec(TensorLayoutArray{filter, dst, src}); | |||||
| } | |||||
| } | |||||
| TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_NHWC) { | |||||
| using namespace convolution; | |||||
| std::vector<TestArg> args = get_args_cuda_conv_bwd_data(); | |||||
| Checker<ConvolutionBackwardData> checker(handle_cuda()); | |||||
| NormalRNG default_rng; | |||||
| for (auto&& arg : args) { | |||||
| float scale = | |||||
| 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]); | |||||
| UniformFloatRNG rng(scale, 2 * scale); | |||||
| arg.param.format = param::Convolution::Format::NHWC; | |||||
| arg.src = cvt_src_or_dst_nchw2nhwc(arg.src); | |||||
| arg.filter = cvt_filter_nchw2nhwc(arg.filter); | |||||
| auto src = TensorLayout(arg.src, dtype::Float32()); | |||||
| auto filter = TensorLayout(arg.filter, dtype::Float32()); | |||||
| TensorLayout dst; | |||||
| { | |||||
| auto opr = handle_cuda()->create_operator<Convolution>(); | |||||
| opr->param() = arg.param; | |||||
| opr->deduce_layout(src, filter, dst); | |||||
| } | |||||
| src.dtype = dst.dtype = filter.dtype = dtype::Float16(); | |||||
| arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32; | |||||
| checker.set_rng(0, &rng) | |||||
| .set_rng(1, &rng) | |||||
| .set_epsilon(1e-2) | |||||
| .set_param(arg.param) | |||||
| .exec(TensorLayoutArray{filter, dst, src}); | |||||
| src.dtype = dst.dtype = filter.dtype = dtype::Float32(); | |||||
| arg.param.compute_mode = param::Convolution::ComputeMode::DEFAULT; | |||||
| checker.set_rng(0, &rng) | |||||
| .set_rng(1, &rng) | |||||
| .set_epsilon(1e-2) | |||||
| .set_param(arg.param) | |||||
| .exec(TensorLayoutArray{filter, dst, src}); | |||||
| } | |||||
| } | |||||
| TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_CUDNN) { | TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_CUDNN) { | ||||
| if (cuda::is_compute_capability_required(7, 0)) | if (cuda::is_compute_capability_required(7, 0)) | ||||
| return; | return; | ||||