@@ -561,7 +561,8 @@ void ConvolutionBase<Parameter>::check_or_deduce_dtype_fwd(
             src.enumv() == DTypeEnum::QuantizedS8 ||
             src.enumv() == DTypeEnum::Quantized8Asymm ||
             src.enumv() == DTypeEnum::QuantizedS4 ||
-            src.enumv() == DTypeEnum::Quantized4Asymm) {
+            src.enumv() == DTypeEnum::Quantized4Asymm ||
+            src.enumv() == DTypeEnum::QuantizedS1) {
         supported_dst_dtype.push_back(dtype::QuantizedS32(mul_scale(src, filter)));
         bool cond_dst = dst.valid() && (dst.enumv() == src.enumv() ||
                                         ((dst.enumv() == DTypeEnum::QuantizedS4 ||
@@ -25,7 +25,8 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() {
     non_cudnn_algos.push_back(&matmul);
     non_cudnn_algos.push_back(&matmul8x8x32);
     non_cudnn_algos.push_back(&batched_matmul);
+    non_cudnn_algos.push_back(&int1_simple);
     fill_cudnn_algos();
     for (auto&& algo : cudnn_conv_bias_activations) {
         all_algos.push_back(&algo);
@@ -45,6 +45,7 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() {
     conv_algos.push_back(&matmul8x8x32);
     conv_algos.push_back(&batched_matmul);
     conv_algos.push_back(&group);
+    conv_algos.push_back(&int1_simple);
     for (auto&& algo : conv_algos) {
         all_algos.push_back(algo);
@@ -87,6 +87,7 @@ public:
         CUDA_FALLBACK_NCHW_INT4,
         CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32,
         CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16,
+        CUDA_SIMPLE_INT1,
     };
     using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;
@@ -1089,6 +1090,24 @@ private:
     WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
 };
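+
+// Naive QuantizedS1 conv-bias: converts the 1-bit operands to float32 and
+// delegates to a float32 ConvBias sub-operator (see simple_int1.cpp).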
+class ConvBiasForwardImpl::AlgoSimpleInt1 final : public AlgoBase {
+public:
+    bool is_available(const SizeArgs& args) const override;
+    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
+    void exec(const ExecArgs& args) const override;
+
+    std::vector<SearchItem> get_subopr_list(
+            const TensorLayoutArray& layouts, const OperatorBase* opr) const override;
+
+    const char* name() const override { return "CONVBIAS_SIMPLE_INT1"; }
+    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
+    MEGDNN_DECL_ALGO_TYPE(CUDA_SIMPLE_INT1)
+
+private:
+    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
+};
+
 class ConvBiasForwardImpl::AlgoPack : NonCopyableObj {
 private:
     AlgoBase::Mapper m_all_algos_map;
@@ -1132,6 +1151,7 @@ public:
     std::vector<AlgoFloat16NCHWHMMAImplicitBatchedGemm> f16_implicit_bmm;
     AlgoGroupConvGeneral group;
     AlgoBFloat16 bfloat16;
+    AlgoSimpleInt1 int1_simple;
     AlgoBase* cudnn_conv_bias_act_from_enum(cudnnConvolutionFwdAlgo_t algo);
@@ -30,6 +30,8 @@ bool ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::is_available(
             return false;
         }
     }
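+    // QuantizedS1 is not supported on the cuDNN path; reject it here so the
+    // 1-bit case falls through to AlgoSimpleInt1.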
+    if (args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS1)
+        return false;
     if ((args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS4 ||
          args.src_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) &&
         args.filter_layout->dtype.enumv() == DTypeEnum::QuantizedS4)
@@ -134,6 +134,9 @@ void ConvBiasDesc::set_conv(
 namespace conv_bias {
 bool is_cudnn_supported(const BiasForwardSizeArgs& args) {
+    if (args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS1)
+        return false;
     if ((args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS4 ||
          args.src_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) &&
         args.filter_layout->dtype.enumv() == DTypeEnum::QuantizedS4)
@@ -221,6 +221,11 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic(
         return &sm_algo_pack.fallback_nchw_qs8;
     }
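+    // AlgoSimpleInt1 is the only CUDA algorithm that accepts QuantizedS1, so
+    // pick it directly whenever it reports itself available.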
+    if (sm_algo_pack.int1_simple.is_available_attribute(
+                args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
+        return &sm_algo_pack.int1_simple;
+    }
     if (args.src_layout->dtype.enumv() != DTypeTrait<dtype::BFloat16>::enumv) {
         return megdnn::get_algo_match_attribute<ConvBiasForwardImpl>(
                 sm_algo_pack.non_cudnn_algos, args, workspace_limit_in_bytes,
@@ -72,6 +72,7 @@ public:
     class AlgoInt4Int4NHWCIMMAImplicitGemm;
     class AlgoUInt4Int4NHWCIMMAImplicitGemm;
     class AlgoBFloat16;
+    class AlgoSimpleInt1;
     // The following algorithms are suitable for channel wise convolution
     class AlgoFloat32NCHWFMAImplicitBatchedGemm;
     class AlgoFloat16NCHWHMMAImplicitBatchedGemm;
@@ -0,0 +1,145 @@
+/**
+ * \file dnn/src/cuda/conv_bias/simple_int1.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ */
| #include "src/common/algo_base.h" | |||
| #include "src/cuda/conv_bias/algo.h" | |||
| #include "src/cuda/handle.h" | |||
| #include "src/cuda/utils.cuh" | |||
| #include "src/cuda/utils.h" | |||
| using namespace megdnn; | |||
| using namespace cuda; | |||
| using namespace conv_bias; | |||
+namespace {
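+
+// Build the layouts and param for the float32 ConvBias sub-operator: every
+// QuantizedS1 (src/filter) and QuantizedS32 (bias/z/dst) layout is rewritten
+// to Float32, and the compute mode is reset to DEFAULT.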
+std::pair<TensorLayoutArray, ConvBiasForwardImpl::Param> sub_opr_config(
+        const TensorLayoutArray& layouts, const ConvBiasForwardImpl* opr) {
+    // all five conv-bias layouts (src, filter, bias, z, dst) are indexed below
+    megdnn_assert(layouts.size() >= 5);
+    std::pair<TensorLayoutArray, ConvBiasForwardImpl::Param> ret;
+    ret.first = layouts;
+    auto change_dtype = [](TensorLayout& layout) {
+        if (layout.dtype.enumv() == DTypeEnum::QuantizedS1 ||
+            layout.dtype.enumv() == DTypeEnum::QuantizedS32) {
+            layout.dtype = dtype::Float32();
+        }
+    };
+    change_dtype(ret.first[0]);
+    change_dtype(ret.first[1]);
+    change_dtype(ret.first[2]);
+    change_dtype(ret.first[3]);
+    change_dtype(ret.first[4]);
+
+    ret.second = opr->param();
+    ret.second.compute_mode = ConvBiasForwardImpl::Param::ComputeMode::DEFAULT;
+    return ret;
+}
+
+std::pair<TensorLayoutArray, std::unique_ptr<ConvBiasForward>> prepare_sub_opr(
+        const ConvBiasForwardImpl::AlgoBase::SizeArgs& args) {
+    auto convbias_opr = args.handle->create_operator<ConvBias>();
+    auto&& config = sub_opr_config(
+            {*args.src_layout, *args.filter_layout, *args.bias_layout, *args.z_layout,
+             *args.dst_layout},
+            args.opr);
+    convbias_opr->param() = config.second;
+    return {config.first, std::move(convbias_opr)};
+}
+
+}  // namespace
+
+std::vector<Algorithm::SearchItem> ConvBiasForwardImpl::AlgoSimpleInt1::get_subopr_list(
+        const TensorLayoutArray& layouts, const OperatorBase* opr) const {
+    auto&& config =
+            sub_opr_config(layouts, static_cast<const ConvBiasForwardImpl*>(opr));
+
+    std::string param_str;
+    Algorithm::serialize_write_pod(config.second, param_str);
+    return {{Algorithm::OprType::CONVBIAS_FORWARD, param_str, config.first}};
+}
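+
+// The algorithm is usable only when both src and filter are QuantizedS1 and a
+// float32 ConvBias algorithm exists for the converted layouts.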
+bool ConvBiasForwardImpl::AlgoSimpleInt1::is_available(const SizeArgs& args) const {
+    if (args.src_layout->dtype.valid() && args.filter_layout->dtype.valid() &&
+        args.bias_layout->dtype.valid() && args.z_layout->dtype.valid() &&
+        args.dst_layout->dtype.valid()) {
+        auto config = prepare_sub_opr(args);
+        return args.src_layout->dtype.enumv() == args.filter_layout->dtype.enumv() &&
+               args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS1 &&
+               get_algorithm(
+                       static_cast<ConvBiasForwardImpl*>(config.second.get()),
+                       config.first[0], config.first[1], config.first[2],
+                       config.first[3], config.first[4]);
+    } else {
+        return false;
+    }
+}
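+
+// Workspace layout: one conversion buffer per tensor whose dtype differs from
+// the float32 sub-operator layout (src, filter, bias, z, dst, in that order),
+// followed by the sub-operator's own workspace.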
+WorkspaceBundle ConvBiasForwardImpl::AlgoSimpleInt1::get_workspace_bundle(
+        void* ptr, const SizeArgs& args) const {
+    auto config = prepare_sub_opr(args);
+    SmallVector<size_t> sizes;
+    auto get_workspace = [&sizes](const TensorLayout& src, const TensorLayout& dst) {
+        if (src.dtype != dst.dtype) {
+            sizes.push_back(dst.span().dist_byte());
+        }
+    };
+    get_workspace(*args.src_layout, config.first[0]);
+    get_workspace(*args.filter_layout, config.first[1]);
+    get_workspace(*args.bias_layout, config.first[2]);
+    get_workspace(*args.z_layout, config.first[3]);
+    get_workspace(*args.dst_layout, config.first[4]);
+    sizes.push_back(config.second->get_workspace_in_bytes(
+            config.first[0], config.first[1], config.first[2], config.first[3],
+            config.first[4], nullptr));
+    return {ptr, std::move(sizes)};
+}
+
+size_t ConvBiasForwardImpl::AlgoSimpleInt1::get_workspace_in_bytes(
+        const SizeArgs& args) const {
+    return get_workspace_bundle(nullptr, args).total_size_in_bytes();
+}
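+
+// exec: widen src/filter (QuantizedS1 -> Float32, bundle slots 0-1), then bias,
+// z and dst (QuantizedS32 -> Float32, slots 2-4) via a second converter re-based
+// onto the remaining bundle space, run the float32 sub-operator, and finally
+// narrow the result back into the caller's dst tensor.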
+void ConvBiasForwardImpl::AlgoSimpleInt1::exec(const ExecArgs& args) const {
+    TensorND fsrc_tensor = *args.src_tensor;
+    TensorND ffilter_tensor = *args.filter_tensor;
+    TensorND fbias_tensor = *args.bias_tensor;
+    TensorND fz_tensor = *args.z_tensor;
+    TensorND fdst_tensor = *args.dst_tensor;
+    auto config = prepare_sub_opr(args);
+    auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args);
+    CompTypeCvter<dtype::QuantizedS1, dtype::Float32> cvter(args.handle, &bundle);
+    {
+        cvter.src_to_comp_type(*args.src_tensor, fsrc_tensor)
+                .src_to_comp_type(*args.filter_tensor, ffilter_tensor);
+    }
+    WorkspaceBundle dst_bundle = {
+            bundle.get(2),
+            {bundle.get_size(2), bundle.get_size(3), bundle.get_size(4),
+             bundle.get_size(5)}};
+    CompTypeCvter<dtype::QuantizedS32, dtype::Float32> dst_cvter(
+            args.handle, &dst_bundle);
+    {
+        dst_cvter.src_to_comp_type(*args.bias_tensor, fbias_tensor)
+                .src_to_comp_type(*args.z_tensor, fz_tensor)
+                .src_to_comp_type(*args.dst_tensor, fdst_tensor);
+    }
+    config.second->exec(
+            fsrc_tensor, ffilter_tensor, fbias_tensor, fz_tensor, fdst_tensor, nullptr,
+            dst_cvter.workspace());
+    { dst_cvter.comp_to_dst_type(fdst_tensor, *args.dst_tensor); }
+}
+
+// vim: syntax=cpp.doxygen
@@ -44,6 +44,10 @@ std::pair<TensorLayoutArray, ConvBiasForward::Param> sub_opr_config(
                 src.dtype.param<dtype::Quantized4Asymm>().scale *
                 filter.dtype.param<dtype::Quantized4Asymm>().scale);
+    } else if (src.dtype.enumv() == DTypeEnum::QuantizedS1) {
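+        // As in the other quantized branches, the int32 bias scale is the
+        // product of the src and filter scales.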
+        bias_type = dtype::QuantizedS32(
+                src.dtype.param<dtype::QuantizedS1>().scale *
+                filter.dtype.param<dtype::QuantizedS1>().scale);
     } else {
         megdnn_assert(src.dtype.category() == DTypeCategory::FLOAT);
         bias_type = src.dtype;
@@ -278,6 +278,9 @@ void ConvBiasForwardImpl::exec(
     DISPATCH_RAW(
             Quantized4Asymm, QuantizedS4, QuantizedS32, QuantizedS32, DEFAULT,
             (convolution::forward_bias<dt_quint4, dt_qint4, dt_qint32, dt_qint32>))
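+    // Route 1-bit conv-bias to the generic forward_bias kernel, accumulating
+    // in 32-bit with float32 compute mode.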
+    DISPATCH_RAW(
+            QuantizedS1, QuantizedS1, QuantizedS32, QuantizedS32, FLOAT32,
+            (convolution::forward_bias<dt_qint1, dt_qint1, dt_qint32, dt_qint32>))
 #if !MEGDNN_DISABLE_FLOAT16
     DISPATCH(Float16, Float16)
     DISPATCH_RAW(
@@ -84,6 +84,15 @@ inline void StrategyFwd::on(
     d += cast(s) * cast(f);
 }
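+
+// 1-bit operands: widen each dt_qint1 value to 32 bits before the
+// multiply-accumulate, mirroring the other quantized specializations.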
+template <>
+inline void StrategyFwd::on(
+        dt_qint1& s, dt_qint1& f, dt_qint32& d, DType, DType, DType) {
+    auto cast = [](const dt_qint1& val) {
+        return dt_qint32(static_cast<int32_t>(val.as_int8()));
+    };
+    d += cast(s) * cast(f);
+}
+
 struct StrategyBwdData {
     template <typename st, typename ft, typename dt>
     static void on(st& s, ft& f, dt& d, DType, DType, DType) {
@@ -133,6 +133,32 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_BF16) {
     }
 }
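+
+// End-to-end check of the int1 path: pins CONVBIAS_SIMPLE_INT1 (with a MATMUL
+// sub-operator) and feeds constant QuantizedS1 inputs (UniformIntRNG{1, 1}).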
+TEST_F(CUDA, CONV_BIAS_FORWARD_QS1) {
+    require_compute_capability(6, 1);
+    UniformIntRNG int_rng{1, 1};
+    Checker<ConvBiasForward> checker(handle_cuda());
+    checker.set_before_exec_callback(AlgoChecker<ConvBiasForward>(
+            ExecutionPolicyAlgoName{"CONVBIAS_SIMPLE_INT1", {{"MATMUL", {}}}}));
+
+    ConvBias::Param param;
+    param.format = ConvBias::Param::Format::NCHW;
+    param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
+
+    {
+        auto src_shape = TensorShape{20, 2, 224, 224};
+        auto filter_shape = TensorShape{20, 2, 3, 3};
+        checker.set_dtype(0, dtype::QuantizedS1(1.0f))
+                .set_dtype(1, dtype::QuantizedS1(1.0f))
+                .set_dtype(2, dtype::QuantizedS32(1.0f))
+                .set_dtype(3, dtype::QuantizedS32(1.0f))
+                .set_dtype(4, dtype::QuantizedS32(1.0f))
+                .set_rng(0, &int_rng)
+                .set_rng(1, &int_rng)
+                .set_param(param)
+                .execs({src_shape, filter_shape, {}, {}, {}});
+    }
+}
+
 TEST_F(CUDA, CONV_BIAS_FORWARD_QS8) {
     require_compute_capability(6, 1);