| @@ -14,6 +14,7 @@ | |||
| #include "src/cuda/utils.h" | |||
| #include "src/cuda/cudnn_wrapper.h" | |||
| #include "src/cuda/convolution/helper.h" | |||
| #include "src/cuda/conv_bias/helper.h" | |||
| using namespace megdnn; | |||
| using namespace cuda; | |||
| @@ -31,27 +32,16 @@ bool ConvolutionBackwardDataImpl::AlgoCUDNN::is_available( | |||
| CUDNNBwdDataDescs D; | |||
| if (!is_cudnn_supported(args.as_fwd_args())) | |||
| TensorLayout bias_layout, z_layout; | |||
| conv_bias::CanonizedFilterMeta meta; | |||
| meta.copy_from(args.filter_meta); | |||
| conv_bias::BiasForwardSizeArgs bias_args{args.handle, | |||
| args.grad_layout, args.filter_layout, &bias_layout, | |||
| &z_layout, meta, args.diff_layout, param::ConvBias::NonlineMode::IDENTITY, | |||
| }; | |||
| if (!conv_bias::is_cudnn_supported(bias_args)) | |||
| return false; | |||
| #if CUDNN_VERSION >= 7500 | |||
| // As in cuda10.0 and cudnn7.5, algo CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 with | |||
| // TensorCore operations produces incorrect result. So we disable | |||
| // this algo. Please remove the following code, when | |||
| // nvidia has fixed this issue. | |||
| // incorrect case: | |||
| // inp={2x8x18x18}, kern={8x8x2x2}, pad_h=pad_w=2, stride_h=stride_w=2, | |||
| // dtype=float16 | |||
| if (args.filter_meta.dtype == dtype::Float16()) { | |||
| const char* algo_1 = "CUDNN_CONVOLUTION_BWD_DATA_ALGO_1"; | |||
| auto cmp_len = strlen(algo_1); | |||
| if (is_compute_capability_required(7, 0) && | |||
| strncmp(name(), algo_1, cmp_len) == 0) { | |||
| return false; | |||
| } | |||
| } | |||
| #endif | |||
| auto& cudnn = args.handle->cudnn(); | |||
| args.init_desc(D); | |||
| size_t workspace_size; | |||
| @@ -14,6 +14,7 @@ | |||
| #include "src/cuda/utils.h" | |||
| #include "src/cuda/cudnn_wrapper.h" | |||
| #include "src/cuda/convolution/helper.h" | |||
| #include "src/cuda/conv_bias/helper.h" | |||
| using namespace megdnn; | |||
| using namespace cuda; | |||
| @@ -31,7 +32,14 @@ bool ConvolutionBackwardFilterImpl::AlgoCUDNN::is_available( | |||
| auto& cudnn = args.handle->cudnn(); | |||
| CUDNNBwdFilterDescs D; | |||
| if (!is_cudnn_supported(args.as_fwd_args())) | |||
| TensorLayout bias_layout, z_layout; | |||
| conv_bias::CanonizedFilterMeta meta; | |||
| meta.copy_from(args.grad_filter_meta); | |||
| conv_bias::BiasForwardSizeArgs bias_args{args.handle, | |||
| args.src_layout, args.grad_layout, &bias_layout, | |||
| &z_layout, meta, args.diff_layout, param::ConvBias::NonlineMode::IDENTITY, | |||
| }; | |||
| if (!conv_bias::is_cudnn_supported(bias_args)) | |||
| return false; | |||
| args.init_desc(D); | |||
| @@ -33,7 +33,8 @@ bool convolution::is_cudnn_supported(const ForwardSizeArgs &args) { | |||
| args.dst_layout->dtype.enumv() != DTypeEnum::QuantizedS8) { | |||
| return false; | |||
| } | |||
| } else if (args.filter_meta.format != param::Convolution::Format::NCHW) { | |||
| } else if (args.filter_meta.format != param::Convolution::Format::NCHW && | |||
| args.filter_meta.format != param::Convolution::Format::NHWC) { | |||
| return false; | |||
| } | |||
| auto& fm = args.filter_meta; | |||
| @@ -284,6 +284,16 @@ std::vector<TestArg> convolution::get_args_cudnn_5_1_failures() { | |||
| return args; | |||
| } | |||
| std::vector<TestArg> convolution::get_args_cudnn_5_1_backward() { | |||
| std::vector<TestArg> args; | |||
| args.emplace_back( | |||
| param::Convolution{param::Convolution::Mode::CROSS_CORRELATION, 2, | |||
| 2, 2, 2}, | |||
| TensorShape{2, 8, 18, 18}, TensorShape{8, 8, 2, 2}); | |||
| return args; | |||
| } | |||
| std::vector<TestArg> convolution::get_args_x86_winograd_algorithm() { | |||
| std::vector<TestArg> args; | |||
| for (size_t ic_size : {8, 16}) { | |||
| @@ -40,6 +40,7 @@ std::vector<TestArg> get_args_x86_direct_case_2(); | |||
| std::vector<TestArg> get_args_fallback_templated_impl(); | |||
| std::vector<TestArg> get_args_fallback_non_templated_impl(); | |||
| std::vector<TestArg> get_args_cudnn_5_1_failures(); | |||
| std::vector<TestArg> get_args_cudnn_5_1_backward(); | |||
| std::vector<TestArg> get_args_x86_winograd_algorithm(); | |||
| std::vector<TestArg> get_args_BRAIN_481(); | |||
| std::vector<TestArg> get_args(); | |||
| @@ -238,6 +238,87 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) { | |||
| } | |||
| } | |||
| TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FP16_CUDNN7_5) { | |||
| // algo CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 with | |||
| // TensorCore operations produces incorrect result. | |||
| // Maybe nvidia has fixed this issue | |||
| // There is a test using incorrect case: | |||
| // inp={2x8x18x18}, kern={8x8x2x2}, pad_h=pad_w=2, stride_h=stride_w=2, | |||
| // dtype=float16 | |||
| using namespace convolution; | |||
| std::vector<TestArg> args = get_args_cudnn_5_1_backward(); | |||
| Checker<ConvolutionBackwardData> checker(handle_cuda()); | |||
| NormalRNG default_rng; | |||
| for (auto&& arg : args) { | |||
| float scale = | |||
| 128.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]); | |||
| scale = std::max(scale, 1.f); | |||
| UniformFloatRNG rng(scale, 2 * scale); | |||
| arg.param.format = param::Convolution::Format::NHWC; | |||
| arg.src = cvt_src_or_dst_nchw2nhwc(arg.src); | |||
| arg.filter = cvt_filter_nchw2nhwc(arg.filter); | |||
| auto src = TensorLayout(arg.src, dtype::Float32()); | |||
| auto filter = TensorLayout(arg.filter, dtype::Float32()); | |||
| TensorLayout dst; | |||
| { | |||
| auto opr = handle_cuda()->create_operator<Convolution>(); | |||
| opr->param() = arg.param; | |||
| opr->deduce_layout(src, filter, dst); | |||
| } | |||
| src.dtype = dst.dtype = filter.dtype = dtype::Float16(); | |||
| arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32; | |||
| checker.set_rng(0, &rng) | |||
| .set_rng(1, &rng) | |||
| .set_epsilon(1e-2) | |||
| .set_param(arg.param) | |||
| .exec(TensorLayoutArray{filter, dst, src}); | |||
| src.dtype = dst.dtype = filter.dtype = dtype::Float32(); | |||
| arg.param.compute_mode = param::Convolution::ComputeMode::DEFAULT; | |||
| checker.set_rng(0, &rng) | |||
| .set_rng(1, &rng) | |||
| .set_epsilon(1e-2) | |||
| .set_param(arg.param) | |||
| .exec(TensorLayoutArray{filter, dst, src}); | |||
| } | |||
| } | |||
| TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_NHWC) { | |||
| using namespace convolution; | |||
| std::vector<TestArg> args = get_args_cuda_conv_bwd_data(); | |||
| Checker<ConvolutionBackwardData> checker(handle_cuda()); | |||
| NormalRNG default_rng; | |||
| for (auto&& arg : args) { | |||
| float scale = | |||
| 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]); | |||
| UniformFloatRNG rng(scale, 2 * scale); | |||
| arg.param.format = param::Convolution::Format::NHWC; | |||
| arg.src = cvt_src_or_dst_nchw2nhwc(arg.src); | |||
| arg.filter = cvt_filter_nchw2nhwc(arg.filter); | |||
| auto src = TensorLayout(arg.src, dtype::Float32()); | |||
| auto filter = TensorLayout(arg.filter, dtype::Float32()); | |||
| TensorLayout dst; | |||
| { | |||
| auto opr = handle_cuda()->create_operator<Convolution>(); | |||
| opr->param() = arg.param; | |||
| opr->deduce_layout(src, filter, dst); | |||
| } | |||
| src.dtype = dst.dtype = filter.dtype = dtype::Float16(); | |||
| arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32; | |||
| checker.set_rng(0, &rng) | |||
| .set_rng(1, &rng) | |||
| .set_epsilon(1e-2) | |||
| .set_param(arg.param) | |||
| .exec(TensorLayoutArray{filter, dst, src}); | |||
| src.dtype = dst.dtype = filter.dtype = dtype::Float32(); | |||
| arg.param.compute_mode = param::Convolution::ComputeMode::DEFAULT; | |||
| checker.set_rng(0, &rng) | |||
| .set_rng(1, &rng) | |||
| .set_epsilon(1e-2) | |||
| .set_param(arg.param) | |||
| .exec(TensorLayoutArray{filter, dst, src}); | |||
| } | |||
| } | |||
| TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_CUDNN) { | |||
| if (cuda::is_compute_capability_required(7, 0)) | |||
| return; | |||