| @@ -18,4 +18,39 @@ | |||
| #include "megdnn/oprs/utils.h" | |||
| #include "megdnn/oprs/linalg.h" | |||
| template <typename Opr> | |||
| struct OprArityTrait; | |||
| template <typename Opr, int _arity_in, int _arity_out> | |||
| struct OprArityTraitTmpl { | |||
| static constexpr int arity_in = _arity_in; | |||
| static constexpr int arity_out = _arity_out; | |||
| static constexpr int arity = arity_in + arity_out; | |||
| }; | |||
| #define INST_ARITY(_Opr, _in, _out) \ | |||
| template <> \ | |||
| struct OprArityTrait<_Opr> : public OprArityTraitTmpl<_Opr, _in, _out> {}; | |||
| INST_ARITY(megdnn::ConvolutionBackwardData, 2, 1); | |||
| INST_ARITY(megdnn::ConvolutionBackwardFilter, 2, 1); | |||
| INST_ARITY(megdnn::Convolution3DForward, 2, 1); | |||
| INST_ARITY(megdnn::Convolution3DBackwardData, 2, 1); | |||
| INST_ARITY(megdnn::Convolution3DBackwardFilter, 2, 1); | |||
| INST_ARITY(megdnn::LocalShareForward, 2, 1); | |||
| INST_ARITY(megdnn::LocalShareBackwardData, 2, 1); | |||
| INST_ARITY(megdnn::LocalShareBackwardFilter, 2, 1); | |||
| INST_ARITY(megdnn::Convolution, 2, 1); | |||
| INST_ARITY(megdnn::DeformableConvForward, 4, 1); | |||
| INST_ARITY(megdnn::DeformableConvBackwardFilter, 4, 1); | |||
| INST_ARITY(megdnn::BatchConvBiasForward, 4, 1); | |||
| INST_ARITY(megdnn::ConvBias, 4, 1); | |||
| INST_ARITY(megdnn::DeformableConvBackwardData, 5, 3); | |||
| INST_ARITY(megdnn::MatrixMul, 2, 1); | |||
| INST_ARITY(megdnn::BatchedMatrixMul, 2, 1); | |||
| #undef INST_ARITY | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -47,6 +47,9 @@ namespace megdnn { | |||
| return algo_pack().all_algos_map().at(desc); \ | |||
| } | |||
| #define MEGDNN_FOREACH_ALGO_ATTRIBUTE_INHERITABLE(cb) \ | |||
| cb(AlgoAttribute::ACCURACY_DEPEND_ON_BATCH) | |||
| /** | |||
| * \brief construct algo from AlgorithmDesc | |||
| */ | |||
| @@ -323,6 +323,34 @@ void handle_bias_and_nonlinear(Handle* handle, param::ConvBias args, | |||
| } | |||
| } | |||
| bool check_bias_share_in_channel(const TensorLayout& bias, | |||
| const param::ConvBias::Format format) { | |||
| bool share_in_channel = false; | |||
| if (format == param::ConvBias::Format::NCHW || | |||
| format == param::ConvBias::Format::NCHW4_NCHW) { | |||
| share_in_channel = (bias.ndim == 4 && bias[0] == 1 && bias[2] == 1 && | |||
| bias[3] == 1); | |||
| } else if (format == param::ConvBias::Format::NHWC) { | |||
| share_in_channel = (bias.ndim == 4 && bias[0] == 1 && bias[1] == 1 && | |||
| bias[2] == 1); | |||
| } else if (format == param::ConvBias::Format::NCHW4 || | |||
| format == param::ConvBias::Format::NCHW8 || | |||
| format == param::ConvBias::Format::NCHW32 || | |||
| format == param::ConvBias::Format::NCHW4_NCHW32 || | |||
| format == param::ConvBias::Format::NCHW32_NCHW4) { | |||
| share_in_channel = (bias.ndim == 5 && bias[0] == 1 && bias[2] == 1 && | |||
| bias[3] == 1); | |||
| } else if (format == param::ConvBias::Format::NHWCD4) { | |||
| share_in_channel = (bias.ndim == 5 && bias[0] == 1 && bias[1] == 1 && | |||
| bias[3] == 1); | |||
| } else { | |||
| megdnn_assert(format == param::ConvBias::Format::CHWN4); | |||
| share_in_channel = (bias.ndim == 5 && bias[1] == 1 && bias[2] == 1 && | |||
| bias[3] == 1); | |||
| } | |||
| return share_in_channel; | |||
| } | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -21,6 +21,9 @@ void handle_bias_and_nonlinear(Handle* handle, param::ConvBias args, | |||
| const TensorND* conv_dst_tensor, | |||
| const TensorND* dst_tensor, | |||
| const TensorND* bias_tensor); | |||
| bool check_bias_share_in_channel(const TensorLayout& bias, | |||
| const param::ConvBias::Format format); | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -9,7 +9,7 @@ | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| */ | |||
| #include "src/common/utils.h" | |||
| #include "src/common/conv_bias.h" | |||
| #include "src/cuda/batch_conv_bias/algo.h" | |||
| #include "src/cuda/batch_conv_bias/batch_conv_bias.cuh" | |||
| #include "src/cuda/batch_conv_bias/opr_impl.h" | |||
| @@ -106,7 +106,7 @@ bool BatchConvBiasForwardImpl::AlgoInt8NCHW4DotProdGemm::is_available( | |||
| using Mode = Param::Mode; | |||
| bool available = true; | |||
| auto&& param = args.opr->param(); | |||
| if (!conv_bias::check_bias_share_in_channel(args.bias_layout, param.format)) | |||
| if (!check_bias_share_in_channel(args.bias_layout, param.format)) | |||
| return false; | |||
| if (param.format != Format::NCHW4) | |||
| return false; | |||
| @@ -10,7 +10,7 @@ | |||
| */ | |||
| #include "megdnn/oprs/general.h" | |||
| #include "src/common/utils.h" | |||
| #include "src/common/conv_bias.h" | |||
| #include "src/cuda/batch_conv_bias/algo.h" | |||
| #include "src/cuda/batch_conv_bias/batch_conv_bias.cuh" | |||
| #include "src/cuda/batch_conv_bias/opr_impl.h" | |||
| @@ -86,7 +86,7 @@ bool BatchConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemmPrecomp:: | |||
| using Mode = Param::Mode; | |||
| bool available = true; | |||
| auto&& param = args.opr->param(); | |||
| if (!conv_bias::check_bias_share_in_channel(args.bias_layout, param.format)) | |||
| if (!check_bias_share_in_channel(args.bias_layout, param.format)) | |||
| return false; | |||
| if (param.format != Format::NCHW4) | |||
| return false; | |||
| @@ -115,7 +115,8 @@ public: | |||
| size_t get_workspace_in_bytes(const SizeArgs& /*args*/) const override; | |||
| void exec(const ExecArgs& args) const final; | |||
| AlgoAttribute attribute() const override { | |||
| return AlgoAttribute::REPRODUCIBLE; | |||
| return AlgoAttribute::REPRODUCIBLE | | |||
| AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
| } | |||
| const char* name() const override { return "CUBLAS"; } | |||
| MEGDNN_DECL_ALGO_TYPE(CUDA_CUBLAS) | |||
| @@ -128,7 +129,8 @@ public: | |||
| size_t get_workspace_in_bytes(const SizeArgs& /*args*/) const override; | |||
| void exec(const ExecArgs& args) const final; | |||
| AlgoAttribute attribute() const override { | |||
| return AlgoAttribute::REPRODUCIBLE; | |||
| return AlgoAttribute::REPRODUCIBLE | | |||
| AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
| } | |||
| const char* name() const override { return "CUBLAS_LT"; } | |||
| MEGDNN_DECL_ALGO_TYPE(CUDA_CUBLASLT) | |||
| @@ -173,6 +173,9 @@ public: | |||
| if (m_attr.is_reproducible) { | |||
| ret |= AlgoAttribute::REPRODUCIBLE; | |||
| } | |||
| if (m_attr.accuracy_depend_on_batch) { | |||
| ret |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
| } | |||
| return ret; | |||
| } | |||
| @@ -280,6 +283,9 @@ public: | |||
| if (m_attr.is_reproducible) { | |||
| ret |= AlgoAttribute::REPRODUCIBLE; | |||
| } | |||
| if (m_attr.accuracy_depend_on_batch) { | |||
| ret |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
| } | |||
| return ret; | |||
| } | |||
| @@ -352,7 +358,8 @@ public: | |||
| const OperatorBase* opr) const override; | |||
| MEGDNN_DECL_ALGO_TYPE(CUDA_MATMUL) | |||
| AlgoAttribute attribute() const override { | |||
| return AlgoAttribute::REPRODUCIBLE; | |||
| return AlgoAttribute::REPRODUCIBLE | | |||
| AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
| } | |||
| private: | |||
| @@ -406,7 +413,8 @@ public: | |||
| const OperatorBase* opr) const override; | |||
| AlgoAttribute attribute() const override { | |||
| return AlgoAttribute::REPRODUCIBLE; | |||
| return AlgoAttribute::REPRODUCIBLE | | |||
| AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
| } | |||
| MEGDNN_DECL_ALGO_TYPE(CUDA_BATCHED_MATMUL) | |||
| @@ -428,7 +436,14 @@ public: | |||
| const char* name() const override { return m_name.c_str(); } | |||
| AlgoAttribute attribute() const override { | |||
| auto ret = static_cast<AlgoAttribute>(0); | |||
| auto ret = AlgoAttribute::DEFAULT; | |||
| #define cb(attr) \ | |||
| if (m_impl->contain_attribute_all(attr)) { \ | |||
| ret |= attr; \ | |||
| } | |||
| MEGDNN_FOREACH_ALGO_ATTRIBUTE_INHERITABLE(cb) | |||
| #undef cb | |||
| if (m_impl->contain_attribute_all(AlgoAttribute::REPRODUCIBLE)) { | |||
| ret |= AlgoAttribute::REPRODUCIBLE; | |||
| } | |||
| @@ -16,6 +16,7 @@ | |||
| #include "src/cuda/conv_bias/helper.h" | |||
| #include "src/cuda/cudnn_wrapper.h" | |||
| #include "src/cuda/utils.h" | |||
| #include "src/common/conv_bias.h" | |||
| using namespace megdnn; | |||
| using namespace cuda; | |||
| @@ -29,7 +30,7 @@ bool ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::is_available( | |||
| } | |||
| if (args.bias_layout->ndim == 0 || | |||
| !conv_bias::check_bias_share_in_channel(*(args.bias_layout), | |||
| !check_bias_share_in_channel(*(args.bias_layout), | |||
| args.opr->param().format)) { | |||
| return false; | |||
| } | |||
| @@ -168,34 +168,6 @@ bool is_cudnn_supported(const BiasForwardSizeArgs& args) { | |||
| return supported; | |||
| } | |||
| bool check_bias_share_in_channel(const TensorLayout& bias, | |||
| const param::ConvBias::Format format) { | |||
| bool share_in_channel = false; | |||
| if (format == param::ConvBias::Format::NCHW || | |||
| format == param::ConvBias::Format::NCHW4_NCHW) { | |||
| share_in_channel = (bias.ndim == 4 && bias[0] == 1 && bias[2] == 1 && | |||
| bias[3] == 1); | |||
| } else if (format == param::ConvBias::Format::NHWC) { | |||
| share_in_channel = (bias.ndim == 4 && bias[0] == 1 && bias[1] == 1 && | |||
| bias[2] == 1); | |||
| } else if (format == param::ConvBias::Format::NCHW4 || | |||
| format == param::ConvBias::Format::NCHW8 || | |||
| format == param::ConvBias::Format::NCHW32 || | |||
| format == param::ConvBias::Format::NCHW4_NCHW32 || | |||
| format == param::ConvBias::Format::NCHW32_NCHW4) { | |||
| share_in_channel = (bias.ndim == 5 && bias[0] == 1 && bias[2] == 1 && | |||
| bias[3] == 1); | |||
| } else if (format == param::ConvBias::Format::NHWCD4) { | |||
| share_in_channel = (bias.ndim == 5 && bias[0] == 1 && bias[1] == 1 && | |||
| bias[3] == 1); | |||
| } else { | |||
| megdnn_assert(format == param::ConvBias::Format::CHWN4); | |||
| share_in_channel = (bias.ndim == 5 && bias[1] == 1 && bias[2] == 1 && | |||
| bias[3] == 1); | |||
| } | |||
| return share_in_channel; | |||
| } | |||
| SmallVector<size_t> matmul_get_workspace_bundle( | |||
| const BiasForwardSizeArgs& args) { | |||
| auto dtype = args.src_layout->dtype; | |||
| @@ -126,9 +126,6 @@ namespace conv_bias { | |||
| } | |||
| }; | |||
| bool check_bias_share_in_channel(const TensorLayout& bias, | |||
| const param::ConvBias::Format format); | |||
| } // namespace conv_bias | |||
| } // namespace cuda | |||
| } // namespace megdnn | |||
| @@ -15,6 +15,7 @@ | |||
| #include "src/cuda/convolution_helper/layout.cuh" | |||
| #include "src/cuda/convolution_helper/parameter.cuh" | |||
| #include "src/cuda/utils.h" | |||
| #include "src/common/conv_bias.h" | |||
| using namespace megdnn; | |||
| using namespace cuda; | |||
| @@ -83,7 +84,7 @@ bool ConvBiasForwardImpl::AlgoInt8CHWN4DotProdImplicitGemm::is_available( | |||
| bool available = true; | |||
| auto&& param = args.opr->param(); | |||
| auto&& fm = args.filter_meta; | |||
| if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout), | |||
| if (!check_bias_share_in_channel(*(args.bias_layout), | |||
| param.format)) | |||
| return false; | |||
| if (param.format != Format::CHWN4) | |||
| @@ -15,6 +15,7 @@ | |||
| #include "src/cuda/convolution_helper/layout.cuh" | |||
| #include "src/cuda/convolution_helper/parameter.cuh" | |||
| #include "src/cuda/utils.h" | |||
| #include "src/common/conv_bias.h" | |||
| using namespace megdnn; | |||
| using namespace cuda; | |||
| @@ -71,7 +72,7 @@ bool ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemm::is_available( | |||
| bool available = true; | |||
| auto&& param = args.opr->param(); | |||
| auto&& fm = args.filter_meta; | |||
| if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout), | |||
| if (!check_bias_share_in_channel(*(args.bias_layout), | |||
| param.format)) | |||
| return false; | |||
| if (param.format != Format::CHWN4) | |||
| @@ -15,6 +15,7 @@ | |||
| #include "src/cuda/convolution_helper/layout.cuh" | |||
| #include "src/cuda/convolution_helper/parameter.cuh" | |||
| #include "src/cuda/utils.h" | |||
| #include "src/common/conv_bias.h" | |||
| using namespace megdnn; | |||
| using namespace cuda; | |||
| @@ -118,7 +119,7 @@ bool ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemmReorderFilter:: | |||
| bool available = true; | |||
| auto&& param = args.opr->param(); | |||
| auto&& fm = args.filter_meta; | |||
| if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout), | |||
| if (!check_bias_share_in_channel(*(args.bias_layout), | |||
| param.format)) | |||
| return false; | |||
| if (param.format != Format::CHWN4) | |||
| @@ -15,6 +15,7 @@ | |||
| #include "src/cuda/convolution_helper/layout.cuh" | |||
| #include "src/cuda/convolution_helper/parameter.cuh" | |||
| #include "src/cuda/utils.h" | |||
| #include "src/common/conv_bias.h" | |||
| using namespace megdnn; | |||
| using namespace cuda; | |||
| @@ -118,7 +119,7 @@ bool ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth:: | |||
| bool available = true; | |||
| auto&& param = args.opr->param(); | |||
| auto&& fm = args.filter_meta; | |||
| if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout), | |||
| if (!check_bias_share_in_channel(*(args.bias_layout), | |||
| param.format)) | |||
| return false; | |||
| if (param.format != Format::CHWN4) | |||
| @@ -14,6 +14,7 @@ | |||
| #include "src/cuda/conv_bias/cutlass_convolution_wrapper.cuh" | |||
| #include "src/cuda/convolution_helper/parameter.cuh" | |||
| #include "src/cuda/utils.h" | |||
| #include "src/common/conv_bias.h" | |||
| using namespace megdnn; | |||
| using namespace cuda; | |||
| @@ -32,7 +33,7 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::is_available( | |||
| bool available = true; | |||
| auto&& param = args.opr->param(); | |||
| auto&& fm = args.filter_meta; | |||
| if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout), | |||
| if (!check_bias_share_in_channel(*(args.bias_layout), | |||
| param.format)) | |||
| return false; | |||
| if (param.format != Format::NCHW32 && param.format != Format::NCHW32_NCHW4) | |||
| @@ -13,6 +13,7 @@ | |||
| #include "src/cuda/utils.h" | |||
| #include "src/cuda/convolution_helper/parameter.cuh" | |||
| #include "src/cuda/conv_bias/cutlass_convolution_wrapper.cuh" | |||
| #include "src/common/conv_bias.h" | |||
| using namespace megdnn; | |||
| using namespace cuda; | |||
| @@ -29,7 +30,7 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::is_available( | |||
| bool available = true; | |||
| auto&& param = args.opr->param(); | |||
| auto&& fm = args.filter_meta; | |||
| if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout), | |||
| if (!check_bias_share_in_channel(*(args.bias_layout), | |||
| param.format)) | |||
| return false; | |||
| if (param.format == Format::NCHW4_NCHW32) { | |||
| @@ -12,6 +12,7 @@ | |||
| #include "./algo.h" | |||
| #include "src/cuda/utils.h" | |||
| #include "src/cuda/convolution_helper/bias_visitor.cuh" | |||
| #include "src/common/conv_bias.h" | |||
| using namespace megdnn; | |||
| using namespace cuda; | |||
| @@ -29,7 +30,7 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW4IMMAImplicitGemm::is_available( | |||
| bool available = true; | |||
| auto&& param = args.opr->param(); | |||
| auto&& fm = args.filter_meta; | |||
| if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout), | |||
| if (!check_bias_share_in_channel(*(args.bias_layout), | |||
| param.format)) | |||
| return false; | |||
| if (param.format != Format::NCHW4) | |||
| @@ -127,6 +127,9 @@ public: | |||
| if (m_attr.is_reproducible) { | |||
| ret |= AlgoAttribute::REPRODUCIBLE; | |||
| } | |||
| if (m_attr.accuracy_depend_on_batch) { | |||
| ret |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
| } | |||
| return ret; | |||
| } | |||
| cudnnConvolutionBwdDataAlgo_t cudnn_enum() const { return m_cudnn_enum; } | |||
| @@ -158,7 +161,8 @@ public: | |||
| const char* name() const override { return "MATMUL"; } | |||
| MEGDNN_DECL_ALGO_TYPE(CUDA_MATMUL) | |||
| AlgoAttribute attribute() const override { | |||
| return AlgoAttribute::REPRODUCIBLE; | |||
| return AlgoAttribute::REPRODUCIBLE | | |||
| AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
| } | |||
| }; | |||
| @@ -123,6 +123,9 @@ public: | |||
| if (m_attr.is_reproducible) { | |||
| ret |= AlgoAttribute::REPRODUCIBLE; | |||
| } | |||
| if (m_attr.accuracy_depend_on_batch) { | |||
| ret |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
| } | |||
| return ret; | |||
| } | |||
| @@ -155,7 +158,8 @@ public: | |||
| const char* name() const override { return "MATMUL"; } | |||
| MEGDNN_DECL_ALGO_TYPE(CUDA_MATMUL) | |||
| AlgoAttribute attribute() const override { | |||
| return AlgoAttribute::REPRODUCIBLE; | |||
| return AlgoAttribute::REPRODUCIBLE | | |||
| AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
| } | |||
| }; | |||
| @@ -119,6 +119,9 @@ public: | |||
| if (m_attr.is_reproducible) { | |||
| ret |= AlgoAttribute::REPRODUCIBLE; | |||
| } | |||
| if (m_attr.accuracy_depend_on_batch) { | |||
| ret |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
| } | |||
| return ret; | |||
| } | |||
| @@ -112,6 +112,9 @@ public: | |||
| if (m_attr.is_reproducible) { | |||
| ret |= AlgoAttribute::REPRODUCIBLE; | |||
| } | |||
| if (m_attr.accuracy_depend_on_batch) { | |||
| ret |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
| } | |||
| return ret; | |||
| } | |||
| @@ -106,7 +106,8 @@ public: | |||
| const char* name() const override { return "1x1x1"; } | |||
| AlgoAttribute attribute() const override { | |||
| return AlgoAttribute::REPRODUCIBLE; | |||
| return AlgoAttribute::REPRODUCIBLE | | |||
| AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
| } | |||
| MEGDNN_DECL_ALGO_TYPE(CUDA_1X1X1) | |||
| }; | |||
| @@ -126,10 +127,17 @@ public: | |||
| const char* name() const override { return m_name.c_str(); } | |||
| AlgoAttribute attribute() const override { | |||
| auto ret = static_cast<AlgoAttribute>(0); | |||
| auto ret = AlgoAttribute::DEFAULT; | |||
| if (m_impl->contain_attribute_all(AlgoAttribute::REPRODUCIBLE)) { | |||
| ret |= AlgoAttribute::REPRODUCIBLE; | |||
| } | |||
| #define cb(attr) \ | |||
| if (m_impl->contain_attribute_all(attr)) { \ | |||
| ret |= attr; \ | |||
| } | |||
| MEGDNN_FOREACH_ALGO_ATTRIBUTE_INHERITABLE(cb) | |||
| #undef cb | |||
| return ret; | |||
| } | |||
| static void modify_size_args(SizeArgs& args, TensorLayout& src_pg, | |||
| @@ -157,6 +165,9 @@ public: | |||
| if (m_attr.is_reproducible) { | |||
| ret |= AlgoAttribute::REPRODUCIBLE; | |||
| } | |||
| if (m_attr.accuracy_depend_on_batch) { | |||
| ret |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
| } | |||
| return ret; | |||
| } | |||
| @@ -470,9 +470,9 @@ void Conv3DDesc::set(const param::Convolution3D& param, const size_t nr_group) { | |||
| #define V(v) V1(v) | |||
| #define DEF_NAME(NAME) \ | |||
| #NAME "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL) | |||
| #define DEF_ALGO(NAME, PROD) \ | |||
| { \ | |||
| NAME, { DEF_NAME(NAME), PROD } \ | |||
| #define DEF_ALGO(NAME, PROD1, PROD2) \ | |||
| { \ | |||
| NAME, { DEF_NAME(NAME), PROD1, PROD2 } \ | |||
| } | |||
| #if !(CUDNN_MAJOR >= 6 || CUDNN_MINOR >= 1) | |||
| @@ -483,19 +483,18 @@ const std::unordered_map<cudnnConvolutionBwdDataAlgo_t, CudnnAlgoPack::Attr> | |||
| CudnnAlgoPack::conv_bwd_data_algos() { | |||
| static const std::unordered_map<cudnnConvolutionBwdDataAlgo_t, | |||
| CudnnAlgoPack::Attr> | |||
| algos = { | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_0, false), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, true), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT, true), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING, true), | |||
| algos = | |||
| { DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_0, false, false), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, true, false), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT, true, true), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING, true, true), | |||
| #if CUDNN_MAJOR >= 5 | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD, true), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD, true, false), | |||
| #if CUDNN_MAJOR >= 6 || CUDNN_MINOR >= 1 | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED, | |||
| true), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED, true, false), | |||
| #endif | |||
| #endif | |||
| }; | |||
| }; | |||
| return algos; | |||
| } | |||
| @@ -505,15 +504,16 @@ CudnnAlgoPack::conv_bwd_flt_algos() { | |||
| static const std::unordered_map<cudnnConvolutionBwdFilterAlgo_t, | |||
| CudnnAlgoPack::Attr> | |||
| algos = { | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0, false), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, true), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT, true), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3, false), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0, false, false), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, true, false), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT, true, true), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3, false, false), | |||
| #if CUDNN_MAJOR >= 6 || (CUDNN_MAJOR >= 5 && CUDNN_MINOR >= 1) | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED, | |||
| true), | |||
| true, false), | |||
| #if CUDNN_MAJOR >= 6 | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING, true), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING, true, | |||
| true), | |||
| #endif | |||
| #endif | |||
| @@ -522,28 +522,30 @@ CudnnAlgoPack::conv_bwd_flt_algos() { | |||
| return algos; | |||
| } | |||
| const std::unordered_map<cudnnConvolutionFwdAlgo_t, CudnnAlgoPack::Attr> | |||
| CudnnAlgoPack::conv_fwd_algos() { | |||
| static const std::unordered_map<cudnnConvolutionFwdAlgo_t, | |||
| CudnnAlgoPack::Attr> | |||
| algos = { | |||
| DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, true), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, | |||
| true), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_GEMM, true), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_DIRECT, true), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_FFT, true), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING, true), | |||
| algos = | |||
| { DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, true, false), | |||
| #if CUDNN_VERSION == 8004 | |||
| DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, true, true), | |||
| #else | |||
| DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, true, false), | |||
| #endif | |||
| DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_GEMM, true, false), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_DIRECT, true, false), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_FFT, true, true), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING, true, true), | |||
| #if CUDNN_MAJOR >= 5 | |||
| DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD, true), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD, true, false), | |||
| #if CUDNN_MAJOR >= 6 || CUDNN_MINOR >= 1 | |||
| DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED, true), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED, true, false), | |||
| #endif | |||
| #endif | |||
| }; | |||
| }; | |||
| return algos; | |||
| } | |||
| @@ -553,9 +555,10 @@ CudnnAlgoPack::conv3d_bwd_data_algos() { | |||
| static const std::unordered_map<cudnnConvolutionBwdDataAlgo_t, | |||
| CudnnAlgoPack::Attr> | |||
| algos = { | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_0, false), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, true), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING, true), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_0, false, false), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, true, false), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING, true, | |||
| true), | |||
| }; | |||
| return algos; | |||
| @@ -568,9 +571,9 @@ CudnnAlgoPack::conv3d_bwd_flt_algos() { | |||
| static const std::unordered_map<cudnnConvolutionBwdFilterAlgo_t, | |||
| CudnnAlgoPack::Attr> | |||
| algos = { | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0, false), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, true), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3, false), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0, false, false), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, true, false), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3, false, false), | |||
| }; | |||
| return algos; | |||
| @@ -581,10 +584,15 @@ CudnnAlgoPack::conv3d_fwd_algos() { | |||
| static const std::unordered_map<cudnnConvolutionFwdAlgo_t, | |||
| CudnnAlgoPack::Attr> | |||
| algos = { | |||
| DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, true), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, | |||
| true), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING, true), | |||
| DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, true, false), | |||
| #if CUDNN_VERSION == 8004 | |||
| DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, true, | |||
| true), | |||
| #else | |||
| DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, true, | |||
| false), | |||
| #endif | |||
| DEF_ALGO(CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING, true, true), | |||
| }; | |||
| return algos; | |||
| @@ -112,6 +112,7 @@ public: | |||
| struct Attr { | |||
| std::string name; | |||
| bool is_reproducible; | |||
| bool accuracy_depend_on_batch; | |||
| }; | |||
| static const std::unordered_map<cudnnConvolutionBwdDataAlgo_t, Attr> | |||
| @@ -115,7 +115,8 @@ public: | |||
| MEGDNN_DECL_ALGO_TYPE(CUDA_CUBLAS) | |||
| AlgoAttribute attribute() const override { | |||
| return AlgoAttribute::REPRODUCIBLE | | |||
| AlgoAttribute::USABLE_DEPEND_ON_SHAPE; | |||
| AlgoAttribute::USABLE_DEPEND_ON_SHAPE | | |||
| AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
| } | |||
| }; | |||
| @@ -142,7 +143,8 @@ public: | |||
| void exec(const ExecArgs& args) const override; | |||
| MEGDNN_DECL_ALGO_TYPE(CUDA_CUBLASLT) | |||
| AlgoAttribute attribute() const override { | |||
| return AlgoAttribute::REPRODUCIBLE; | |||
| return AlgoAttribute::REPRODUCIBLE | | |||
| AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
| } | |||
| }; | |||
| #endif | |||
| @@ -25,7 +25,8 @@ public: | |||
| size_t get_workspace(const KernSizeParam&) const override { return 0; } | |||
| kern_t get_kern(const KernSizeParam&) const override; | |||
| AlgoAttribute attribute() const override { | |||
| return AlgoAttribute::REPRODUCIBLE; | |||
| return AlgoAttribute::REPRODUCIBLE | | |||
| AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
| } | |||
| PackMode packmode() const override { return PackMode::NO_PACK; } | |||
| MEGDNN_OVERRIDE_MATMUL_DESC(8, 16, 1, 4, AlgoDataType::FLOAT32, DEFAULT) | |||
| @@ -36,7 +37,8 @@ public: | |||
| class MatrixMulImpl::AlgoF32MKLPackA : public AlgoBase { | |||
| public: | |||
| AlgoAttribute attribute() const override { | |||
| return AlgoAttribute::REPRODUCIBLE; | |||
| return AlgoAttribute::REPRODUCIBLE | | |||
| AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; | |||
| } | |||
| const char* name() const override { return "X86_F32_MKL_PACKA"; } | |||
| bool usable(const KernSizeParam&) const override; | |||
| @@ -0,0 +1,109 @@ | |||
| /** | |||
| * \file dnn/test/common/accuracy_shake_checker.cpp | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #include "test/common/accuracy_shake_checker.h" | |||
| using namespace megdnn; | |||
| using namespace test; | |||
| namespace { | |||
| template <typename ctype> | |||
| ::testing::AssertionResult assert_tensor_binary_eq( | |||
| const char* expr0, const char* expr1, const char* /*expr2*/, | |||
| const TensorND& v0, const TensorND& v1, const std::string& algo_name) { | |||
| ctype* it0_orig = v0.ptr<ctype>(); | |||
| ctype* it1 = v1.ptr<ctype>(); | |||
| ctype* it0 = it0_orig; | |||
| auto nr_elem = v1.layout.total_nr_elems(); | |||
| auto nr_elem_single_batch = v0.layout.total_nr_elems(); | |||
| for (size_t i = 0; i < nr_elem; ++i) { | |||
| if (i % nr_elem_single_batch == 0) { | |||
| it0 = it0_orig; | |||
| } | |||
| ctype iv0 = *it0, iv1 = *it1; | |||
| if (!good_float(iv0) || !good_float(iv1) || | |||
| memcmp(it0, it1, sizeof(ctype))) { | |||
| Index index(v1.layout, i); | |||
| return ::testing::AssertionFailure() | |||
| << "Unequal value\n" | |||
| << "Value of: " << expr1 << "\n" | |||
| << " Actual: " << (iv1 + 0) << "\n" | |||
| << "Expected: " << expr0 << "\n" | |||
| << "Which is: " << (iv0 + 0) << "\n" | |||
| << "At index: " << index.to_string() << "/" | |||
| << v1.layout.TensorShape::to_string() << "\n" | |||
| << " DType: " << v1.layout.dtype.name() << "\n" | |||
| << "algo: " << algo_name; | |||
| } | |||
| ++it0; | |||
| ++it1; | |||
| } | |||
| return ::testing::AssertionSuccess(); | |||
| } | |||
| } // namespace | |||
| ::testing::AssertionResult test::__assert_tensor_binary_eq( | |||
| const char* expr0, const char* expr1, const char* expr2, | |||
| const TensorND& v0, const TensorND& v1, | |||
| const Algorithm::Info::Desc& algo) { | |||
| bool shape_match = v0.layout[0] == 1; | |||
| for (size_t i = 1; i < v0.layout.ndim; ++i) { | |||
| shape_match &= v0.layout[i] == v1.layout[i]; | |||
| } | |||
| if (!shape_match) { | |||
| return ::testing::AssertionFailure() | |||
| << "Shape mismatch\n" | |||
| << "Value of: " << expr1 << "\n" | |||
| << " Actual: " << v1.layout.TensorShape::to_string() << "\n" | |||
| << "Expected: " << expr0 << "\n" | |||
| << "Which is: " << v0.layout.TensorShape::to_string() << "\n" | |||
| << "algo: " << algo.name << "\n"; | |||
| } | |||
| if (!v0.layout.is_physical_contiguous() || | |||
| !v1.layout.is_physical_contiguous()) { | |||
| return ::testing::AssertionFailure() | |||
| << "layout should be physical contiguous\n" | |||
| << "Value of: " << expr1 << "\n" | |||
| << " Actual: " << v1.layout.is_physical_contiguous() << "\n" | |||
| << "Expected: " << expr0 << "\n" | |||
| << "Which is: " << v0.layout.is_physical_contiguous() << "\n" | |||
| << "algo: " << algo.name << "\n"; | |||
| } | |||
| auto dtype = v0.layout.dtype; | |||
| if (dtype != v1.layout.dtype) { | |||
| return ::testing::AssertionFailure() | |||
| << "Data type should match\n" | |||
| << "Value of: " << expr1 << "\n" | |||
| << " Actual: " << v1.layout.dtype.name() << "\n" | |||
| << "Expected: " << expr0 << "\n" | |||
| << "Which is: " << v0.layout.dtype.name() << "\n" | |||
| << "algo: " << algo.name << "\n"; | |||
| } | |||
| switch (dtype.enumv()) { | |||
| #define cb(_dt) \ | |||
| case DTypeTrait<_dt>::enumv: \ | |||
| return assert_tensor_binary_eq<DTypeTrait<_dt>::ctype>( \ | |||
| expr0, expr1, expr2, v0, v1, algo.name); | |||
| MEGDNN_FOREACH_COMPUTING_DTYPE(cb) | |||
| MEGDNN_FOREACH_QUANTIZED_DTYPE(cb) | |||
| #undef cb | |||
| default : megdnn_trap(); | |||
| } | |||
| } | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -0,0 +1,396 @@ | |||
| /** | |||
| * \file dnn/test/common/accuracy_shake_checker.h | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| */ | |||
| #pragma once | |||
#include <regex>
#include <utility>
#include <vector>

#include "megdnn/oprs.h"
#include "src/common/conv_bias.h"
#include "src/common/utils.h"
#include "test/common/checker.h"
#include "test/common/index.h"
| namespace megdnn { | |||
| namespace test { | |||
| namespace { | |||
//! Trait describing where the batch dimension of an operator lives and
//! which of its input/output tensors actually carry a batch dimension.
//! The primary template assumes no tensor carries a batch (empty lists).
template <class Opr>
struct BatchTrait {
    //! index of batch in tensor, 3 for CHWN4 e.g.
    static size_t index_of_batch(const typename Opr::Param&) { return 0; }
    //! indices contain batch in inputs and outputs, src(0) dst(2) for conv e.g.
    static std::vector<size_t> indices_contain_batch;
    //! indices of tensors (e.g. bias) that may broadcast along batch; they are
    //! only resized when not channel-share-only
    static std::vector<size_t> indices_contain_batch_broadcast;
};
//! out-of-class definitions for the primary template: empty by default
template <class Opr>
std::vector<size_t> BatchTrait<Opr>::indices_contain_batch = {};
template <class Opr>
std::vector<size_t> BatchTrait<Opr>::indices_contain_batch_broadcast = {};
//! batch always lives at dim 0 for operators using this helper
#define DEFAULT_INDEX_OF_BATCH(opr) \
    static size_t index_of_batch(const opr::Param&) { return 0; }
//! conv-like operators keep batch at dim 0, except CHWN4 where it is dim 3
#define CONV_INDEX_OF_BATCH(opr) \
    static size_t index_of_batch(const opr::Param& p) { \
        if (p.format == opr::Param::Format::CHWN4) { \
            return 3; \
        } \
        return 0; \
    }
//! Specialize BatchTrait for \p opr: INDEX_OF_BATCH supplies the
//! index_of_batch() body, \p idxs lists tensors carrying a real batch dim and
//! \p idxs_brdcst lists tensors that may broadcast along batch (bias).
#define OPR_WITHOUT_INPUT_BROADCAST(INDEX_OF_BATCH, opr, idxs, idxs_brdcst) \
    template <> \
    struct BatchTrait<opr> { \
        INDEX_OF_BATCH(opr) \
        static std::vector<size_t> indices_contain_batch; \
        static std::vector<size_t> indices_contain_batch_broadcast; \
    }; \
    std::vector<size_t> BatchTrait<opr>::indices_contain_batch = idxs; \
    std::vector<size_t> BatchTrait<opr>::indices_contain_batch_broadcast = \
            idxs_brdcst;
OPR_WITHOUT_INPUT_BROADCAST(DEFAULT_INDEX_OF_BATCH,
                            megdnn::Convolution3DForward,
                            (std::initializer_list<size_t>{0, 2}), {})
OPR_WITHOUT_INPUT_BROADCAST(DEFAULT_INDEX_OF_BATCH,
                            megdnn::Convolution3DBackwardData,
                            (std::initializer_list<size_t>{1, 2}), {})
OPR_WITHOUT_INPUT_BROADCAST(DEFAULT_INDEX_OF_BATCH,
                            megdnn::Convolution3DBackwardFilter,
                            (std::initializer_list<size_t>{0, 1}), {})
OPR_WITHOUT_INPUT_BROADCAST(DEFAULT_INDEX_OF_BATCH, megdnn::BatchedMatrixMul,
                            (std::initializer_list<size_t>{0, 1, 2}), {})
OPR_WITHOUT_INPUT_BROADCAST(CONV_INDEX_OF_BATCH, megdnn::ConvolutionForward,
                            (std::initializer_list<size_t>{0, 2}), {})
OPR_WITHOUT_INPUT_BROADCAST(CONV_INDEX_OF_BATCH,
                            megdnn::ConvolutionBackwardData,
                            (std::initializer_list<size_t>{1, 2}), {})
OPR_WITHOUT_INPUT_BROADCAST(CONV_INDEX_OF_BATCH,
                            megdnn::ConvolutionBackwardFilter,
                            (std::initializer_list<size_t>{0, 1}), {})
OPR_WITHOUT_INPUT_BROADCAST(CONV_INDEX_OF_BATCH, megdnn::LocalShareForward,
                            (std::initializer_list<size_t>{0, 2}), {})
OPR_WITHOUT_INPUT_BROADCAST(CONV_INDEX_OF_BATCH, megdnn::LocalShareBackwardData,
                            (std::initializer_list<size_t>{1, 2}), {})
OPR_WITHOUT_INPUT_BROADCAST(CONV_INDEX_OF_BATCH,
                            megdnn::LocalShareBackwardFilter,
                            (std::initializer_list<size_t>{0, 1}), {})
OPR_WITHOUT_INPUT_BROADCAST(CONV_INDEX_OF_BATCH, megdnn::DeformableConvForward,
                            (std::initializer_list<size_t>{0, 2, 3, 4}), {})
OPR_WITHOUT_INPUT_BROADCAST(
        CONV_INDEX_OF_BATCH, megdnn::DeformableConvBackwardData,
        (std::initializer_list<size_t>{0, 2, 3, 4, 5, 6, 7}), {})
OPR_WITHOUT_INPUT_BROADCAST(CONV_INDEX_OF_BATCH,
                            megdnn::DeformableConvBackwardFilter,
                            (std::initializer_list<size_t>{0, 1, 2, 3}), {})
OPR_WITHOUT_INPUT_BROADCAST(CONV_INDEX_OF_BATCH, megdnn::BatchConvBiasForward,
                            (std::initializer_list<size_t>{0, 1, 2, 3, 4}), {})
//! ConvBias: bias (index 2) may be per-channel-broadcast, hence idxs_brdcst
OPR_WITHOUT_INPUT_BROADCAST(CONV_INDEX_OF_BATCH, megdnn::ConvBiasForward,
                            (std::initializer_list<size_t>{0, 3, 4}), {2})
#undef OPR_WITHOUT_INPUT_BROADCAST
#undef DEFAULT_INDEX_OF_BATCH
#undef CONV_INDEX_OF_BATCH
| template <class Opr> | |||
| struct LayoutsModifier { | |||
| static void on(TensorLayoutArray& layouts, const typename Opr::Param& p, | |||
| size_t new_batch_size) { | |||
| size_t batch_index = BatchTrait<Opr>::index_of_batch(p); | |||
| for (size_t index : BatchTrait<Opr>::indices_contain_batch) { | |||
| layouts.at(index)[batch_index] = new_batch_size; | |||
| } | |||
| for (size_t index : BatchTrait<Opr>::indices_contain_batch_broadcast) { | |||
| if (!check_bias_share_in_channel(layouts.at(index), p.format)) { | |||
| layouts.at(index)[batch_index] = new_batch_size; | |||
| } | |||
| } | |||
| } | |||
| }; | |||
//! LayoutsModifier for operators without a bias/broadcast tensor: only the
//! plain batch-carrying layouts need to be rewritten.
#define OPR_NO_BIAS(opr) \
    template <> \
    struct LayoutsModifier<opr> { \
        static void on(TensorLayoutArray& layouts, \
                       const typename opr::Param& p, size_t new_batch_size) { \
            size_t batch_index = BatchTrait<opr>::index_of_batch(p); \
            for (size_t index : BatchTrait<opr>::indices_contain_batch) { \
                layouts.at(index)[batch_index] = new_batch_size; \
            } \
        } \
    };

OPR_NO_BIAS(megdnn::Convolution3D)
OPR_NO_BIAS(megdnn::BatchedMatrixMul)
#undef OPR_NO_BIAS
| template <> | |||
| struct LayoutsModifier<megdnn::MatrixMul> { | |||
| public: | |||
| static void on(TensorLayoutArray& layouts, | |||
| const megdnn::MatrixMul::Param& p, | |||
| size_t new_batch_size) { | |||
| assert(!p.transposeA && !p.transposeB); | |||
| MEGDNN_MARK_USED_VAR(p); | |||
| layouts.at(0)[0] = new_batch_size; | |||
| layouts.at(2)[0] = new_batch_size; | |||
| } | |||
| }; | |||
| template <class Opr, typename OprAlgoProxy = OprAlgoProxy<Opr>> | |||
| class AlgoGenerator { | |||
| public: | |||
| AlgoGenerator(ExecutionPolicyAlgoName name) | |||
| : m_policy_name{name} {} | |||
| std::vector<Algorithm::Info::Desc> operator()( | |||
| Opr* opr, const CheckerHelper::TensorValueArray& arr) { | |||
| TensorLayoutArray layouts; | |||
| for (auto&& val : arr) { | |||
| layouts.push_back(val.layout); | |||
| } | |||
| std::vector<Algorithm::Info::Desc> ret; | |||
| megdnn_assert(layouts.size() == OprTrait<Opr>::arity); | |||
| for (auto algo_info : | |||
| AlgoProxy<Opr, OprTrait<Opr>::arity>::get_all_algorithms_info( | |||
| opr, layouts)) { | |||
| if (!(algo_info.attribute & | |||
| AlgoAttribute::ACCURACY_DEPEND_ON_BATCH) && | |||
| std::regex_match( | |||
| algo_info.desc.name, | |||
| std::regex("(.*)(" + m_policy_name.name + ")(.*)"))) { | |||
| ret.push_back(algo_info.desc); | |||
| } else { | |||
| continue; | |||
| } | |||
| } | |||
| return ret; | |||
| } | |||
| private: | |||
| ExecutionPolicyAlgoName m_policy_name; | |||
| }; | |||
| } // namespace | |||
| ::testing::AssertionResult __assert_tensor_binary_eq( | |||
| const char* expr0, const char* expr1, const char* expr2, | |||
| const TensorND& v0, const TensorND& v1, | |||
| const Algorithm::Info::Desc& algo); | |||
/*!
 * \brief checker that verifies an operator's output is invariant to batch
 * size ("accuracy shake"): the same input row must produce the same output
 * whether it is executed alone (batch 1) or inside a larger batch.
 */
template <typename Opr, typename Proxy = OprProxy<Opr>>
class AccuracyShakeChecker : public CheckerHelper {
public:
    static constexpr int arity_in = OprArityTrait<Opr>::arity_in;
    using Param = typename Opr::Param;
    //! callback selecting which algorithms to run before exec; returns the
    //! list of algorithm descs that will each be checked
    using BeforeExecCallback = std::function<std::vector<Algorithm::Info::Desc>(
            Opr*, const TensorValueArray&)>;
    AccuracyShakeChecker(Handle* handle, bool check_dispatch = false)
            : CheckerHelper(handle, check_dispatch),
              // default: check every algorithm not depending on batch
              m_before_exec_callback{AlgoGenerator<Opr>("")},
              m_param(Param()) {}
    //! build layouts from shapes using per-index dtype/format overrides;
    //! missing dtype defaults to Float32, missing format to default
    TensorLayoutArray make_layouts(const TensorShapeArray& shapes) {
        TensorLayoutArray layouts(shapes.size());
        for (size_t i = 0; i < shapes.size(); ++i) {
            DType dt = (m_dtype.find(i) != m_dtype.end() ? m_dtype[i]
                                                         : dtype::Float32());
            TensorFormat fmt =
                    (m_fmt.find(i) != m_fmt.end() ? m_fmt[i] : TensorFormat{});
            layouts[i] = TensorLayout(shapes[i], dt, fmt);
        }
        return layouts;
    }
    /*!
     * \brief execute opr on current param/dtype/rng config
     * \param shapes input/output shapes, which would be passed as
     *      arguments to Opr::deduce_layout
     *
     * Checker would construct TensorLayout vectors from shapes and dtypes,
     * and call exec(TensorLayoutArray &).
     */
    AccuracyShakeChecker& exec(const TensorShapeArray& shapes) {
        exec(make_layouts(shapes));
        return *this;
    }
    void exec(TensorLayoutArray layouts);
    AccuracyShakeChecker& set_param(Param p) {
        m_param = p;
        opr()->param() = p;
        return *this;
    }
    AccuracyShakeChecker& set_dtype(size_t idx, DType dtype) {
        m_dtype[idx] = dtype;
        return *this;
    }
    AccuracyShakeChecker& set_rng(size_t idx, RNG* rng) {
        m_rng[idx] = rng;
        return *this;
    }
    //! set a callback to be invoked before executing the operator
    AccuracyShakeChecker& set_before_exec_callback(
            const BeforeExecCallback& cb) {
        m_before_exec_callback = cb;
        return *this;
    }
    //! clear the callback; exec will then run the default execution policy
    AccuracyShakeChecker& reset_before_exec_callback() {
        m_before_exec_callback = nullptr;
        return *this;
    }
    //! get the opr impl so setting other than param() can be modified
    Opr* opr() {
        if (!m_opr_cur) {
            m_opr_cur = m_handle_cur->create_operator<Opr>();
        }
        return m_opr_cur.get();
    }

private:
    BeforeExecCallback m_before_exec_callback;
    Param m_param;
    Proxy m_proxy;
    //! lazily created operator on the handle under test
    std::unique_ptr<Opr> m_opr_cur;
    //! host-side copies of the full-batch and single-batch input tensors
    std::shared_ptr<TensorValueArray> m_tensors_cur_host,
            m_tensors_single_batch_host;
    //! fill inputs: generate single-batch values, replicate into full batch
    void init_host_values();
    //! compare single-batch vs full-batch outputs, ignoring the batch dim
    void check_tensors_ignore_batch(
            const TensorValueArray& tensors_single_batch,
            const TensorValueArray& tensors, const Algorithm::Info::Desc& desc);
};
//! Run the shake check: execute each candidate algorithm on both the
//! full-batch layouts and batch-1 layouts, then compare outputs.
template <typename Opr, typename Proxy>
void AccuracyShakeChecker<Opr, Proxy>::exec(TensorLayoutArray layouts) {
    auto opr_cur = this->opr();
    opr_cur->param() = m_param;
    m_proxy.deduce_layout(opr_cur, layouts);
    TensorLayoutArray layouts_single_batch = layouts;
    for (size_t i=0; i<layouts_single_batch.size(); ++i) {
        // batch replication below assumes dense storage
        ASSERT_TRUE(layouts[i].is_physical_contiguous())
                << "layouts should be physical contiguous "
                << layouts[i].to_string();
    }
    // init_host_values() replicates along dim 0; other batch positions
    // (e.g. CHWN4) are not supported by this checker
    ASSERT_TRUE(0 == BatchTrait<Opr>::index_of_batch(opr_cur->param()))
            << "index of batch should be 0 ";
    LayoutsModifier<Opr>::on(layouts_single_batch, opr_cur->param(), 1);

    // allocate input
    auto tensors_single_batch_storage =
            alloc_tensors(m_handle_cur, layouts_single_batch, 0);
    m_tensors_single_batch_host =
            alloc_tensors(m_handle_naive.get(), layouts_single_batch, 0);

    auto tensors_cur_storage = alloc_tensors(m_handle_cur, layouts, 0);
    m_tensors_cur_host =
            alloc_tensors(m_handle_naive.get(), layouts, 0);

    auto &&tensors_single_batch = *tensors_single_batch_storage;
    auto &&tensors_single_batch_host = *m_tensors_single_batch_host;
    auto &&tensors_cur = *tensors_cur_storage;
    auto &&tensors_cur_host = *m_tensors_cur_host;

    // allocate output (host-side buffers used for the comparison)
    auto tensors_single_batch_storage_out =
            alloc_tensors(m_handle_naive.get(), layouts_single_batch, 0);
    auto tensors_cur_storage_out =
            alloc_tensors(m_handle_naive.get(), layouts, 0);
    auto &&tensors_single_batch_out = *tensors_single_batch_storage_out;
    auto &&tensors_cur_out = *tensors_cur_storage_out;

    // generate single-batch inputs and replicate them into the full batch
    init_host_values();

    copy_tensors_to_device(tensors_cur, tensors_cur_host);
    copy_tensors_to_device(tensors_single_batch, tensors_single_batch_host);

    std::vector<Algorithm::Info::Desc> algo_desc;
    if (m_before_exec_callback) {
        algo_desc = m_before_exec_callback(opr_cur, tensors_cur);
    } else {
        // no callback: run once with the default (empty) algorithm desc
        algo_desc.push_back({});
    }
    for (size_t i = 0; i < algo_desc.size(); ++i) {
        opr_cur->execution_policy().algo = algo_desc[i];
        m_proxy.exec(opr_cur, tensors_cur);
        m_proxy.exec(opr_cur, tensors_single_batch);

        copy_tensors_from_device(tensors_cur_out, tensors_cur);
        copy_tensors_from_device(tensors_single_batch_out,
                                 tensors_single_batch);

        check_tensors_ignore_batch(tensors_single_batch_out, tensors_cur_out,
                                   algo_desc[i]);
    }
}
//! Fill host input tensors: random values are generated for the batch-1
//! tensors and then replicated along the batch dim into the full-batch
//! tensors, so every batch row holds identical data.
template <typename Opr, typename Proxy>
void AccuracyShakeChecker<Opr, Proxy>::init_host_values() {
    // exec() asserted index_of_batch == 0, so batch is always dim 0 here
    size_t index_of_batch = 0;
    auto &&tensors_single_batch = *m_tensors_single_batch_host;
    auto &&tensors_cur = *m_tensors_cur_host;
    for (size_t i = 0; i < arity_in; ++i) {
        auto &&tensor_single_batch = tensors_single_batch[i];
        auto &&tensor_cur = tensors_cur[i];
        auto rng = m_rng[i];
        if (!rng)
            rng = m_default_rng.get();
        rng->gen(tensor_single_batch);

        dt_byte* raw_storage_cur = static_cast<dt_byte*>(tensor_cur.raw_ptr) +
                                   tensor_cur.layout.span().low_byte;
        dt_byte* raw_storage_single_batch =
                static_cast<dt_byte*>(tensor_single_batch.raw_ptr) +
                tensor_single_batch.layout.span().low_byte;
        const size_t step = tensor_single_batch.layout.span().dist_byte();
        if (tensor_cur.layout.eq_shape(tensor_single_batch.layout)) {
            // tensor carries no batch dim: a single plain copy suffices
            memcpy(raw_storage_cur, raw_storage_single_batch, step);
        } else {
            ASSERT_TRUE(1 == tensor_single_batch.layout[index_of_batch])
                    << "bad batch size "
                    << tensor_single_batch.layout[index_of_batch];
            // replicate the batch-1 slab once per batch row; valid because
            // layouts are physically contiguous with batch outermost
            for (size_t b=0; b<tensor_cur.layout[index_of_batch]; ++b) {
                memcpy(raw_storage_cur, raw_storage_single_batch, step);
                raw_storage_cur += step;
            }
        }
    }
}
//! Compare each full-batch output against the single-batch output.
//! Tensors with equal shapes are skipped: they received identical inputs and
//! carry no batch dim to compare (the interesting case is batch-N vs batch-1).
template <typename Opr, typename Proxy>
void AccuracyShakeChecker<Opr, Proxy>::check_tensors_ignore_batch(
        const TensorValueArray& tensors_single_batch,
        const TensorValueArray& tensors, const Algorithm::Info::Desc& algo) {
    for (size_t i = 0; i < tensors_single_batch.size(); ++i) {
        if (tensors_single_batch[i].layout.ndim == 0 ||
            tensors_single_batch[i].layout.eq_shape(tensors[i].layout))
            continue;
        // __assert_tensor_binary_eq compares per-batch; algo is reported on
        // failure so the offending algorithm can be identified
        ASSERT_PRED_FORMAT3(::megdnn::test::__assert_tensor_binary_eq,
                            tensors_single_batch[i], tensors[i], algo);
    }
}
| } // namespace test | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -19,50 +19,6 @@ using namespace megdnn; | |||
| using namespace test; | |||
| namespace { | |||
//! "good" means the value is representable for comparison purposes: only
//! floating point can be NaN/Inf, all integer-like types are always good
bool good_float(float val) {
    return std::isfinite(val);
}

bool good_float(int) {
    return true;
}

bool good_float(dt_qint8) {
    return true;
}

bool good_float(dt_qint16) {
    return true;
}

bool good_float(dt_quint8) {
    return true;
}

bool good_float(dt_qint32) {
    return true;
}

// A hack for the (x+0) promote to int trick on dt_quint8.
int operator +(dt_quint8 lhs, int rhs) {
    megdnn_assert(rhs == 0, "unexpected rhs");
    return lhs.as_uint8();
}

int operator +(dt_qint32 lhs, int rhs) {
    megdnn_assert(rhs == 0, "unexpected rhs");
    return lhs.as_int32();
}

int operator +(dt_qint8 lhs, int rhs) {
    megdnn_assert(rhs == 0, "unexpected rhs");
    return int8_t(lhs);
}

int operator +(dt_qint16 lhs, int rhs) {
    megdnn_assert(rhs == 0, "unexpected rhs");
    return lhs.as_int16();
}
| template<typename ctype, class Iter> | |||
| ::testing::AssertionResult assert_tensor_eq_with_iter( | |||
| @@ -86,6 +86,7 @@ protected: | |||
| size_t m_offset = 0; | |||
| CheckerHelper(Handle* handle, bool check_dispatch = true); | |||
| ~CheckerHelper() noexcept; | |||
| using OprExec = std::function<void(const TensorValueArray&)>; | |||
| @@ -100,14 +101,15 @@ protected: | |||
| void enable_contig_naive() { m_enable_contig_naive = true; } | |||
| private: | |||
| std::shared_ptr<TensorValueArray> m_tensors_naive; | |||
| void init_naive_values(); | |||
| void copy_tensors_to_device(const TensorValueArray& dest, | |||
| const TensorValueArray& src); | |||
| void copy_tensors_from_device(const TensorValueArray& dest, | |||
| const TensorValueArray& src); | |||
| private: | |||
| std::shared_ptr<TensorValueArray> m_tensors_naive; | |||
| void init_naive_values(); | |||
| void check_tensors(const TensorValueArray& expected, | |||
| const TensorValueArray& computed); | |||
| }; | |||
| @@ -311,6 +311,51 @@ public: | |||
| size_t get_cpu_count(); | |||
//! header-shared helpers: "good" means representable for comparison; only
//! floating point can be NaN/Inf, integer-like types are always good.
//! static inline keeps one copy per TU without ODR issues.
static inline bool good_float(float val) {
    return std::isfinite(val);
}

static inline bool good_float(int) {
    return true;
}

static inline bool good_float(dt_qint8) {
    return true;
}

static inline bool good_float(dt_qint16) {
    return true;
}

static inline bool good_float(dt_quint8) {
    return true;
}

static inline bool good_float(dt_qint32) {
    return true;
}

// A hack for the (x+0) promote to int trick on dt_quint8.
static inline int operator+(dt_quint8 lhs, int rhs) {
    megdnn_assert(rhs == 0, "unexpected rhs");
    return lhs.as_uint8();
}

static inline int operator+(dt_qint32 lhs, int rhs) {
    megdnn_assert(rhs == 0, "unexpected rhs");
    return lhs.as_int32();
}

static inline int operator+(dt_qint8 lhs, int rhs) {
    megdnn_assert(rhs == 0, "unexpected rhs");
    return int8_t(lhs);
}

static inline int operator+(dt_qint16 lhs, int rhs) {
    megdnn_assert(rhs == 0, "unexpected rhs");
    return lhs.as_int16();
}
| } // namespace test | |||
| static inline bool operator==(const TensorLayout& a, const TensorLayout& b) { | |||
| @@ -0,0 +1,247 @@ | |||
| /** | |||
| * \file dnn/test/cuda/accuracy_shake.cpp | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #include "megdnn/dtype.h" | |||
| #include "megdnn/oprs.h" | |||
| #include "megdnn/opr_param_defs.h" | |||
| #include "test/cuda/fixture.h" | |||
| #include "test/cuda/utils.h" | |||
| #include "test/common/rng.h" | |||
| #include "test/common/accuracy_shake_checker.h" | |||
| namespace megdnn { | |||
| namespace test { | |||
//! float32 conv / conv-bias: outputs must not change with batch size
TEST_F(CUDA, SHAKE_CONV_BIAS_FORWARD) {
    require_compute_capability(6, 1);
    AccuracyShakeChecker<ConvBiasForward> checker(handle_cuda());
    NormalRNG default_rng;
    checker.set_dtype(0, dtype::Float32())
            .set_dtype(1, dtype::Float32())
            .set_dtype(2, dtype::Float32())
            .set_rng(0, &default_rng)
            .set_rng(1, &default_rng);
    // convolution
    checker.exec({{64, 16, 32, 32}, {64, 16, 3, 3}, {}, {}, {}});
    // convbias without z
    checker.exec({{64, 16, 32, 32}, {64, 16, 3, 3}, {1, 64, 1, 1}, {}, {}});
    // convbias with z
    checker.exec({{64, 16, 32, 32},
                  {64, 16, 3, 3},
                  {1, 64, 1, 1},
                  {64, 64, 30, 30},
                  {}});
    ConvBias::Param param;
    // group
    param.sparse = ConvBias::Param::Sparse::GROUP;
    checker.set_param(param);
    checker.exec({{64, 16, 32, 32}, {2, 32, 8, 3, 3}, {}, {}, {}});
    checker.exec({{64, 16, 32, 32}, {2, 32, 8, 3, 3}, {1, 64, 1, 1}, {}, {}});
    checker.exec({{64, 16, 32, 32},
                  {2, 32, 8, 3, 3},
                  {1, 64, 1, 1},
                  {64, 64, 30, 30},
                  {}});
}
//! quantized-s8 NCHW conv / conv-bias: batch-size invariance check
TEST_F(CUDA, SHAKE_CONV_BIAS_FORWARD_QS8_NCHW) {
    require_compute_capability(6, 1);
    AccuracyShakeChecker<ConvBiasForward> checker(handle_cuda());
    UniformIntRNG int_rng{-128, 127};

    checker.set_dtype(0, dtype::QuantizedS8(2.5f))
            .set_dtype(1, dtype::QuantizedS8(2.5f))
            .set_dtype(2, dtype::QuantizedS32(6.25f))
            .set_dtype(3, dtype::QuantizedS8(0.25f))
            .set_dtype(4, dtype::QuantizedS8(0.25f))
            .set_rng(0, &int_rng)
            .set_rng(1, &int_rng)
            .set_rng(2, &int_rng)
            .set_rng(3, &int_rng);
    // convolution
    checker.exec({{64, 16, 32, 32}, {64, 16, 3, 3}, {}, {}, {}});
    // convbias without z
    checker.exec({{64, 16, 32, 32}, {64, 16, 3, 3}, {1, 64, 1, 1}, {}, {}});
    // convbias with z
    checker.exec({{64, 16, 32, 32},
                  {64, 16, 3, 3},
                  {1, 64, 1, 1},
                  {64, 64, 30, 30},
                  {}});
    // group
    ConvBias::Param param;
    param.sparse = ConvBias::Param::Sparse::GROUP;
    checker.set_param(param);
    checker.exec({{64, 16, 32, 32}, {2, 32, 8, 3, 3}, {}, {}, {}});
    checker.exec({{64, 16, 32, 32}, {2, 32, 8, 3, 3}, {1, 64, 1, 1}, {}, {}});
    checker.exec({{64, 16, 32, 32},
                  {2, 32, 8, 3, 3},
                  {1, 64, 1, 1},
                  {64, 64, 30, 30},
                  {}});
}
//! quantized-s8 NHWC conv-bias (dense and group): batch-size invariance check
TEST_F(CUDA, SHAKE_CONV_BIAS_FORWARD_QS8_NHWC) {
    require_compute_capability(6, 1);
    UniformIntRNG int_rng{-50, 50};
    AccuracyShakeChecker<ConvBiasForward> checker(handle_cuda());
    ConvBias::Param param;
    param.format = ConvBias::Param::Format::NHWC;
    checker.set_dtype(0, dtype::QuantizedS8(2.5f))
            .set_dtype(1, dtype::QuantizedS8(2.5f))
            .set_dtype(2, dtype::QuantizedS32(6.25f))
            .set_dtype(4, dtype::QuantizedS8(60.25f))
            .set_rng(0, &int_rng)
            .set_rng(1, &int_rng)
            .set_rng(2, &int_rng)
            .set_param(param);
    checker.exec({{20, 32, 32, 4}, {24, 1, 1, 4}, {1, 1, 1, 24}, {}, {}});
    param.sparse = ConvBias::Param::Sparse::GROUP;
    checker.set_param(param).exec(
            {{20, 32, 32, 16}, {4, 4, 1, 1, 4}, {1, 1, 1, 16}, {}, {}});
}
//! quantized-s8 conv-bias in NCHW4/NCHW32 and mixed-format variants:
//! batch-size invariance check
TEST_F(CUDA, SHAKE_CONV_BIAS_FORWARD_QS8_NCHWX) {
    using Format = ConvBias::Param::Format;
    require_compute_capability(6, 1);
    AccuracyShakeChecker<ConvBiasForward> checker(handle_cuda());
    UniformIntRNG int_rng{-5, 5};
    UniformFloatRNG float_rng{-50, 50};
    checker.set_dtype(0, dtype::QuantizedS8(1.2f))
            .set_dtype(1, dtype::QuantizedS8(1.3f))
            .set_dtype(2, dtype::QuantizedS32(1.2 * 1.3f))
            .set_dtype(3, dtype::QuantizedS8(1.3f))
            .set_dtype(4, dtype::QuantizedS8(1.3f))
            .set_rng(0, &int_rng)
            .set_rng(1, &int_rng)
            .set_rng(2, &int_rng)
            .set_rng(3, &int_rng);

    // run one conv-bias (no z) case in the given packed format
    auto run = [&](const TensorShapeArray& shapes, const Format& format) {
        ConvBias::Param param;
        param.format = format;
        checker.set_param(param).exec(
                {shapes[0], shapes[1], shapes[2], {}, {}});
    };

    run({{20, 2, 24, 24, 4}, {24, 2, 3, 3, 4}, {1, 6, 1, 1, 4}}, Format::NCHW4);
    run({{20, 1, 24, 24, 32}, {64, 1, 3, 3, 32}, {1, 2, 1, 1, 32}},
        Format::NCHW32);
    run({{16, 4, 23, 40, 4},
         {32, 4, 3, 3, 4},
         {1, 1, 1, 1, 32}}, Format::NCHW4_NCHW32);

    // mixed NCHW4 input -> NCHW float output
    checker.set_dtype(0, dtype::QuantizedS8(1.9980618f))
            .set_dtype(1, dtype::QuantizedS8(1.9980927f))
            .set_dtype(2, dtype::Float32())
            .set_dtype(3, dtype::Float32())
            .set_dtype(4, dtype::Float32())
            .set_rng(0, &int_rng)
            .set_rng(1, &int_rng)
            .set_rng(2, &float_rng)
            .set_rng(3, &float_rng);
    run({{16, 4, 92, 160, 4}, {20, 4, 3, 3, 4}, {1, 20, 1, 1}},
        Format::NCHW4_NCHW);
}
| TEST_F(CUDA, SHAKE_MATRIX_MUL_FORWARD) { | |||
| AccuracyShakeChecker<MatrixMul> checker(handle_cuda()); | |||
| checker.set_dtype(0, dtype::Float32()) | |||
| .set_dtype(1, dtype::Float32()) | |||
| .set_dtype(2, dtype::Float32()) | |||
| .exec({{50, 100}, {100, 60}, {}}); | |||
| } | |||
| TEST_F(CUDA, SHAKE_BATCH_CONV_BIAS_QS8) { | |||
| require_compute_capability(6, 1); | |||
| AccuracyShakeChecker<BatchConvBiasForward> checker(handle_cuda()); | |||
| UniformIntRNG const_rng{1, 1}; | |||
| UniformIntRNG rng{-5, 5}; | |||
| UniformIntRNG bias_rng{-50, 50}; | |||
| checker.set_rng(0, &rng) | |||
| .set_rng(1, &rng) | |||
| .set_rng(2, &rng) | |||
| .set_rng(3, &rng) | |||
| .set_dtype(0, dtype::QuantizedS8{1.2f}) | |||
| .set_dtype(1, dtype::QuantizedS8{1.3f}) | |||
| .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f}) | |||
| .set_dtype(3, dtype::QuantizedS8{1.1f}) | |||
| .set_dtype(4, dtype::QuantizedS8{1.1f}); | |||
| param::BatchConvBias param; | |||
| param.pad_h = 2, param.pad_w = 1; | |||
| param.stride_h = 1, param.stride_w = 2; | |||
| param.format = param::BatchConvBias::Format::NCHW4; | |||
| checker.set_param(param).exec({{32, 4, 24, 24, 4}, | |||
| {32, 32, 4, 1, 1, 4}, | |||
| {1, 8, 1, 1, 4}, | |||
| {}, | |||
| {}}); | |||
| } | |||
//! batched matmul (qs8 and float32): batch-size invariance check
TEST_F(CUDA, SHAKE_BATCHED_MATRIX_MUL) {
    AccuracyShakeChecker<BatchedMatrixMul> checker(handle_cuda());
    UniformIntRNG int_rng{-127, 127};
    NormalRNG default_rng;
    // quantized-s8 inputs; output dtype deduced from inputs (set_dtype(2, {}))
    checker.set_dtype(0, dtype::QuantizedS8(1.2f))
            .set_dtype(1, dtype::QuantizedS8(1.3f))
            .set_dtype(2, {})
            .set_rng(0, &int_rng)
            .set_rng(1, &int_rng);

    checker.exec({{20, 424, 368}, {20, 368, 256}, {20, 424, 256}});

    // same shapes in float32
    checker.set_dtype(0, dtype::Float32())
            .set_dtype(1, dtype::Float32())
            .set_dtype(2, dtype::Float32())
            .set_rng(0, &default_rng)
            .set_rng(1, &default_rng);

    checker.exec({{20, 424, 368}, {20, 368, 256}, {20, 424, 256}});
}
| TEST_F(CUDA, SHAKE_CONVOLUTION3D_FORWARD) { | |||
| AccuracyShakeChecker<Convolution3DForward> checker(handle_cuda()); | |||
| NormalRNG default_rng; | |||
| float scale = 1.0f / sqrt(5); | |||
| UniformFloatRNG rng(scale, 2 * scale); | |||
| param::Convolution3D param; | |||
| param.mode = param::Convolution3D::Mode::CROSS_CORRELATION; | |||
| param.stride_d = param.stride_h = param.stride_w = 2; | |||
| param.pad_d = param.pad_h = param.pad_w = 0; | |||
| param.dilate_d = param.dilate_h = param.dilate_w = 1; | |||
| checker.set_dtype(0, dtype::Float32()) | |||
| .set_dtype(1, dtype::Float32()) | |||
| .set_rng(0, &default_rng) | |||
| .set_rng(1, &default_rng) | |||
| .set_param(param) | |||
| .exec({{20, 5, 12, 12, 16}, {5, 5, 3, 3, 3}, {}}); | |||
| } | |||
| TEST_F(CUDA, SHAKE_LOCAL_SHARE) { | |||
| AccuracyShakeChecker<LocalShare> checker(handle_cuda()); | |||
| using Param = LocalShare::Param; | |||
| Param param; | |||
| param.spatial_groups_h = param.spatial_groups_w = 3; | |||
| checker.set_param(param); | |||
| checker.exec({{20, 16, 32, 32}, {3, 3, 16, 3, 3, 64}, {}}); | |||
| } | |||
| } // namespace test | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -20,6 +20,7 @@ | |||
| #include "test/common/rng.h" | |||
| #include "test/cuda/benchmark.h" | |||
| #include "src/cuda/utils.h" | |||
| #include "test/common/accuracy_shake_checker.h" | |||
| #define V1(x) #x | |||
| #define V(x) V1(x) | |||
| @@ -0,0 +1,104 @@ | |||
| /** | |||
| * \file dnn/test/x86/accuracy_shake.cpp | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #include "test/x86/fixture.h" | |||
| #include "megdnn/opr_param_defs.h" | |||
| #include "megdnn/oprs.h" | |||
| #include "test/common/accuracy_shake_checker.h" | |||
| #include "test/common/convolution.h" | |||
| #include "test/common/rng.h" | |||
| #include "test/common/tensor.h" | |||
| #include "test/common/workspace_wrapper.h" | |||
| namespace megdnn { | |||
| namespace test { | |||
//! float32 conv / conv-bias on x86 algorithms: batch-size invariance check
TEST_F(X86, SHAKE_CONV_BIAS_FORWARD) {
    AccuracyShakeChecker<ConvBiasForward> checker(handle());
    NormalRNG default_rng;
    checker.set_dtype(0, dtype::Float32())
            .set_dtype(1, dtype::Float32())
            .set_dtype(2, dtype::Float32())
            .set_rng(0, &default_rng)
            .set_rng(1, &default_rng);
    // restrict checked algorithms to the native x86 implementations
    checker.set_before_exec_callback(AlgoGenerator<ConvBiasForward>("X86"));
    // convolution
    checker.exec({{6, 16, 32, 32}, {64, 16, 3, 3}, {}, {}, {}});
    // convbias without z
    checker.exec({{6, 16, 32, 32}, {64, 16, 3, 3}, {1, 64, 1, 1}, {}, {}});
    // convbias with z
    checker.exec({{6, 16, 32, 32},
                  {64, 16, 3, 3},
                  {1, 64, 1, 1},
                  {6, 64, 30, 30},
                  {}});

    // group
    ConvBias::Param param;
    param.sparse = ConvBias::Param::Sparse::GROUP;
    checker.set_param(param);
    checker.exec({{6, 16, 32, 32}, {2, 32, 8, 3, 3}, {}, {}, {}});
    checker.exec({{6, 16, 32, 32}, {2, 32, 8, 3, 3}, {1, 64, 1, 1}, {}, {}});
    checker.exec({{6, 16, 32, 32},
                  {2, 32, 8, 3, 3},
                  {1, 64, 1, 1},
                  {6, 64, 30, 30},
                  {}});
}
//! quantized-s8 conv / conv-bias on x86 algorithms: batch-size invariance
TEST_F(X86, SHAKE_CONV_BIAS_FORWARD_INT8) {
    AccuracyShakeChecker<ConvBiasForward> checker(handle());
    UniformIntRNG rng{-50, 50};
    // output dtype deduced from inputs (set_dtype(4, {}))
    checker.set_dtype(0, dtype::QuantizedS8(2.5f))
            .set_dtype(1, dtype::QuantizedS8(2.5f))
            .set_dtype(2, dtype::QuantizedS32(6.25f))
            .set_dtype(3, dtype::QuantizedS32(6.25f))
            .set_dtype(4, {})
            .set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &rng);
    // restrict checked algorithms to the native x86 implementations
    checker.set_before_exec_callback(AlgoGenerator<ConvBiasForward>("X86"));
    // convolution
    checker.exec({{6, 16, 32, 32}, {64, 16, 3, 3}, {}, {}, {}});
    // convbias without z
    checker.exec({{6, 16, 32, 32}, {64, 16, 3, 3}, {1, 64, 1, 1}, {}, {}});
    // convbias with z
    checker.exec({{6, 16, 32, 32},
                  {64, 16, 3, 3},
                  {1, 64, 1, 1},
                  {6, 64, 30, 30},
                  {}});
    // group
    ConvBias::Param param;
    param.sparse = ConvBias::Param::Sparse::GROUP;
    checker.set_param(param);
    checker.exec({{6, 16, 32, 32}, {2, 32, 8, 3, 3}, {}, {}, {}});
    checker.exec({{6, 16, 32, 32}, {2, 32, 8, 3, 3}, {1, 64, 1, 1}, {}, {}});
    checker.exec({{6, 16, 32, 32},
                  {2, 32, 8, 3, 3},
                  {1, 64, 1, 1},
                  {6, 64, 30, 30},
                  {}});
}
| TEST_F(X86, SHAKE_MATRIX_MUL_FORWARD) { | |||
| AccuracyShakeChecker<MatrixMul> checker(handle()); | |||
| checker.set_dtype(0, dtype::Float32()) | |||
| .set_dtype(1, dtype::Float32()) | |||
| .set_dtype(2, dtype::Float32()) | |||
| .exec({{20, 100}, {100, 60}, {}}); | |||
| } | |||
| } // namespace test | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -15,6 +15,7 @@ | |||
| #include "megdnn/oprs.h" | |||
| #include "test/common/benchmarker.h" | |||
| #include "test/common/checker.h" | |||
| #include "test/common/accuracy_shake_checker.h" | |||
| #include "test/common/convolution.h" | |||
| #include "test/common/rng.h" | |||
| #include "test/common/tensor.h" | |||
| @@ -18,9 +18,7 @@ | |||
| #include "megbrain/comp_node.h" | |||
| #include "megdnn/basic_types.h" | |||
| #include "megdnn/oprs/base.h" | |||
| #include "megdnn/oprs/linalg.h" | |||
| #include "megdnn/oprs/nn.h" | |||
| #include "megdnn/oprs.h" | |||
| namespace mgb { | |||
| namespace opr { | |||
| @@ -45,39 +43,6 @@ namespace opr { | |||
| cb(BatchedMatrixMul) | |||
| // clang-format on | |||
//! Compile-time arity (number of input/output tensors) of a megdnn operator.
//! Only specializations generated below are valid; the primary template is
//! deliberately left undefined.
template <typename Opr>
struct OprArityTrait;

//! helper carrying the in/out/total arity constants for a specialization
template <typename Opr, int _arity_in, int _arity_out>
struct OprArityTraitTmpl {
    static constexpr int arity_in = _arity_in;
    static constexpr int arity_out = _arity_out;
    static constexpr int arity = arity_in + arity_out;
};

#define INST_ARITY(_Opr, _in, _out) \
    template <> \
    struct OprArityTrait<_Opr> : public OprArityTraitTmpl<_Opr, _in, _out> {};

INST_ARITY(megdnn::ConvolutionBackwardData, 2, 1);
INST_ARITY(megdnn::ConvolutionBackwardFilter, 2, 1);
INST_ARITY(megdnn::Convolution3DForward, 2, 1);
INST_ARITY(megdnn::Convolution3DBackwardData, 2, 1);
INST_ARITY(megdnn::Convolution3DBackwardFilter, 2, 1);
INST_ARITY(megdnn::LocalShareForward, 2, 1);
INST_ARITY(megdnn::LocalShareBackwardData, 2, 1);
INST_ARITY(megdnn::LocalShareBackwardFilter, 2, 1);
INST_ARITY(megdnn::Convolution, 2, 1);
INST_ARITY(megdnn::DeformableConvForward, 4, 1);
INST_ARITY(megdnn::DeformableConvBackwardFilter, 4, 1);
INST_ARITY(megdnn::BatchConvBiasForward, 4, 1);
INST_ARITY(megdnn::ConvBias, 4, 1);
INST_ARITY(megdnn::DeformableConvBackwardData, 5, 3);
INST_ARITY(megdnn::MatrixMul, 2, 1);
INST_ARITY(megdnn::BatchedMatrixMul, 2, 1);

#undef INST_ARITY
| template <typename Opr> | |||
| constexpr bool opr_supports_preprocess() { | |||
| return std::is_same<Opr, megdnn::ConvolutionForward>::value || | |||