@@ -561,7 +561,8 @@ void ConvolutionBase<Parameter>::check_or_deduce_dtype_fwd(
             src.enumv() == DTypeEnum::QuantizedS8 ||
             src.enumv() == DTypeEnum::Quantized8Asymm ||
             src.enumv() == DTypeEnum::QuantizedS4 ||
-            src.enumv() == DTypeEnum::Quantized4Asymm) {
+            src.enumv() == DTypeEnum::Quantized4Asymm ||
+            src.enumv() == DTypeEnum::QuantizedS1) {
         supported_dst_dtype.push_back(dtype::QuantizedS32(mul_scale(src, filter)));
         bool cond_dst = dst.valid() && (dst.enumv() == src.enumv() ||
                                         ((dst.enumv() == DTypeEnum::QuantizedS4 ||
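For fully quantized inputs the deduced destination dtype is QuantizedS32 with its scale taken as the product of the source and filter scales (`mul_scale`); the hunk above merely admits QuantizedS1 into that rule. A minimal standalone sketch of why the product is the right accumulator scale (illustrative values only, not the MegDNN helper itself):

#include <cassert>
#include <cstdint>

// A quantized integer q with scale s represents the real value s * q, so
// (s_src * q_s) * (s_flt * q_f) == (s_src * s_flt) * (q_s * q_f): an int32
// accumulator of q_s * q_f products carries the scale s_src * s_flt.
int main() {
    float s_src = 0.5f, s_flt = 0.25f;  // powers of two, so exact in float
    int32_t q_s = 1, q_f = -1;
    assert((s_src * q_s) * (s_flt * q_f) == (s_src * s_flt) * (q_s * q_f));
}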
@@ -25,7 +25,7 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() {
     non_cudnn_algos.push_back(&matmul);
     non_cudnn_algos.push_back(&matmul8x8x32);
     non_cudnn_algos.push_back(&batched_matmul);
+    non_cudnn_algos.push_back(&int1_simple);
     fill_cudnn_algos();
     for (auto&& algo : cudnn_conv_bias_activations) {
         all_algos.push_back(&algo);
@@ -45,6 +45,7 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() {
     conv_algos.push_back(&matmul8x8x32);
     conv_algos.push_back(&batched_matmul);
     conv_algos.push_back(&group);
+    conv_algos.push_back(&int1_simple);
     for (auto&& algo : conv_algos) {
         all_algos.push_back(algo);
@@ -87,6 +87,7 @@ public:
         CUDA_FALLBACK_NCHW_INT4,
         CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32,
         CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16,
+        CUDA_SIMPLE_INT1,
     };
     using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;
@@ -1089,6 +1090,24 @@ private:
     WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
 };

+class ConvBiasForwardImpl::AlgoSimpleInt1 final : public AlgoBase {
+public:
+    bool is_available(const SizeArgs& args) const override;
+    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
+    void exec(const ExecArgs& args) const override;
+
+    std::vector<SearchItem> get_subopr_list(
+            const TensorLayoutArray& layouts, const OperatorBase* opr) const override;
+
+    const char* name() const override { return "CONVBIAS_SIMPLE_INT1"; }
+    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
+    MEGDNN_DECL_ALGO_TYPE(CUDA_SIMPLE_INT1)
+
+private:
+    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
+};
+
 class ConvBiasForwardImpl::AlgoPack : NonCopyableObj {
 private:
     AlgoBase::Mapper m_all_algos_map;
@@ -1132,6 +1151,7 @@ public:
     std::vector<AlgoFloat16NCHWHMMAImplicitBatchedGemm> f16_implicit_bmm;
     AlgoGroupConvGeneral group;
     AlgoBFloat16 bfloat16;
+    AlgoSimpleInt1 int1_simple;
     AlgoBase* cudnn_conv_bias_act_from_enum(cudnnConvolutionFwdAlgo_t algo);
@@ -30,6 +30,8 @@ bool ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::is_available(
             return false;
         }
     }
+    if (args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS1)
+        return false;
     if ((args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS4 ||
          args.src_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) &&
         args.filter_layout->dtype.enumv() == DTypeEnum::QuantizedS4)
@@ -134,6 +134,9 @@ void ConvBiasDesc::set_conv(
 namespace conv_bias {
 bool is_cudnn_supported(const BiasForwardSizeArgs& args) {
+    if (args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS1)
+        return false;
     if ((args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS4 ||
          args.src_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) &&
         args.filter_layout->dtype.enumv() == DTypeEnum::QuantizedS4)
@@ -221,6 +221,11 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic(
         return &sm_algo_pack.fallback_nchw_qs8;
     }
+    if (sm_algo_pack.int1_simple.is_available_attribute(
+                args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
+        return &sm_algo_pack.int1_simple;
+    }
     if (args.src_layout->dtype.enumv() != DTypeTrait<dtype::BFloat16>::enumv) {
         return megdnn::get_algo_match_attribute<ConvBiasForwardImpl>(
                 sm_algo_pack.non_cudnn_algos, args, workspace_limit_in_bytes,
@@ -72,6 +72,7 @@ public:
     class AlgoInt4Int4NHWCIMMAImplicitGemm;
     class AlgoUInt4Int4NHWCIMMAImplicitGemm;
     class AlgoBFloat16;
+    class AlgoSimpleInt1;
     // The following algorithms are suitable for channel wise convolution
     class AlgoFloat32NCHWFMAImplicitBatchedGemm;
     class AlgoFloat16NCHWHMMAImplicitBatchedGemm;
@@ -0,0 +1,145 @@
+/**
+ * \file dnn/src/cuda/conv_bias/simple_int1.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied.
+ */
+
+#include "src/common/algo_base.h"
+#include "src/cuda/conv_bias/algo.h"
+#include "src/cuda/handle.h"
+#include "src/cuda/utils.cuh"
+#include "src/cuda/utils.h"
+
+using namespace megdnn;
+using namespace cuda;
+using namespace conv_bias;
+
+namespace {
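+// Map QuantizedS1 and QuantizedS32 layouts to Float32 so the computation can
+// be delegated to a plain Float32 ConvBias sub-operator (the same fallback
+// pattern AlgoBFloat16 uses).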
+std::pair<TensorLayoutArray, ConvBiasForwardImpl::Param> sub_opr_config(
+        const TensorLayoutArray& layouts, const ConvBiasForwardImpl* opr) {
+    megdnn_assert(layouts.size() >= 5);
+    std::pair<TensorLayoutArray, ConvBiasForwardImpl::Param> ret;
+    ret.first = layouts;
+    auto change_dtype = [](TensorLayout& layout) {
+        if (layout.dtype.enumv() == DTypeEnum::QuantizedS1 ||
+            layout.dtype.enumv() == DTypeEnum::QuantizedS32) {
+            layout.dtype = dtype::Float32();
+        }
+    };
+    change_dtype(ret.first[0]);
+    change_dtype(ret.first[1]);
+    change_dtype(ret.first[2]);
+    change_dtype(ret.first[3]);
+    change_dtype(ret.first[4]);
+
+    ret.second = opr->param();
+    ret.second.compute_mode = ConvBiasForwardImpl::Param::ComputeMode::DEFAULT;
+    return ret;
+}
+
+std::pair<TensorLayoutArray, std::unique_ptr<ConvBiasForward>> prepare_sub_opr(
+        const ConvBiasForwardImpl::AlgoBase::SizeArgs& args) {
+    auto convbias_opr = args.handle->create_operator<ConvBias>();
+    auto&& config = sub_opr_config(
+            {*args.src_layout, *args.filter_layout, *args.bias_layout, *args.z_layout,
+             *args.dst_layout},
+            args.opr);
+    convbias_opr->param() = config.second;
+    return {config.first, std::move(convbias_opr)};
+}
+}  // namespace
+
+std::vector<Algorithm::SearchItem> ConvBiasForwardImpl::AlgoSimpleInt1::get_subopr_list(
+        const TensorLayoutArray& layouts, const OperatorBase* opr) const {
+    auto&& config =
+            sub_opr_config(layouts, static_cast<const ConvBiasForwardImpl*>(opr));
+    std::string param_str;
+    Algorithm::serialize_write_pod(config.second, param_str);
+    return {{Algorithm::OprType::CONVBIAS_FORWARD, param_str, config.first}};
+}
+
+bool ConvBiasForwardImpl::AlgoSimpleInt1::is_available(const SizeArgs& args) const {
+    if (args.src_layout->dtype.valid() && args.filter_layout->dtype.valid() &&
+        args.bias_layout->dtype.valid() && args.z_layout->dtype.valid() &&
+        args.dst_layout->dtype.valid()) {
+        auto config = prepare_sub_opr(args);
+        return args.src_layout->dtype.enumv() == args.filter_layout->dtype.enumv() &&
+               args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS1 &&
+               get_algorithm(
+                       static_cast<ConvBiasForwardImpl*>(config.second.get()),
+                       config.first[0], config.first[1], config.first[2],
+                       config.first[3], config.first[4]);
+    } else {
+        return false;
+    }
+}
+
+WorkspaceBundle ConvBiasForwardImpl::AlgoSimpleInt1::get_workspace_bundle(
+        void* ptr, const SizeArgs& args) const {
+    auto config = prepare_sub_opr(args);
+    SmallVector<size_t> sizes;
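+    // One temporary Float32 buffer per tensor whose dtype the sub-operator
+    // config changes, sized from the converted layout.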
+    auto get_workspace = [&sizes](const TensorLayout& src, const TensorLayout& dst) {
+        if (src.dtype != dst.dtype) {
+            sizes.push_back(dst.span().dist_byte());
+        }
+    };
+    get_workspace(*args.src_layout, config.first[0]);
+    get_workspace(*args.filter_layout, config.first[1]);
+    get_workspace(*args.bias_layout, config.first[2]);
+    get_workspace(*args.z_layout, config.first[3]);
+    get_workspace(*args.dst_layout, config.first[4]);
+    sizes.push_back(config.second->get_workspace_in_bytes(
+            config.first[0], config.first[1], config.first[2], config.first[3],
+            config.first[4], nullptr));
+    return {ptr, std::move(sizes)};
+}
+
+size_t ConvBiasForwardImpl::AlgoSimpleInt1::get_workspace_in_bytes(
+        const SizeArgs& args) const {
+    return get_workspace_bundle(nullptr, args).total_size_in_bytes();
+}
+
+void ConvBiasForwardImpl::AlgoSimpleInt1::exec(const ExecArgs& args) const {
+    TensorND fsrc_tensor = *args.src_tensor;
+    TensorND ffilter_tensor = *args.filter_tensor;
+    TensorND fbias_tensor = *args.bias_tensor;
+    TensorND fz_tensor = *args.z_tensor;
+    TensorND fdst_tensor = *args.dst_tensor;
+    auto config = prepare_sub_opr(args);
+    auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args);
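+    // Bundle slots 0-1 hold Float32 copies of src/filter, slots 2-4 hold
+    // bias/z/dst, and slot 5 is the Float32 sub-operator's own workspace.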
+    CompTypeCvter<dtype::QuantizedS1, dtype::Float32> cvter(args.handle, &bundle);
+    {
+        cvter.src_to_comp_type(*args.src_tensor, fsrc_tensor)
+                .src_to_comp_type(*args.filter_tensor, ffilter_tensor);
+    }
+    WorkspaceBundle dst_bundle = {
+            bundle.get(2),
+            {bundle.get_size(2), bundle.get_size(3), bundle.get_size(4),
+             bundle.get_size(5)}};
+    CompTypeCvter<dtype::QuantizedS32, dtype::Float32> dst_cvter(
+            args.handle, &dst_bundle);
+    {
+        dst_cvter.src_to_comp_type(*args.bias_tensor, fbias_tensor)
+                .src_to_comp_type(*args.z_tensor, fz_tensor)
+                .src_to_comp_type(*args.dst_tensor, fdst_tensor);
+    }
+    config.second->exec(
+            fsrc_tensor, ffilter_tensor, fbias_tensor, fz_tensor, fdst_tensor, nullptr,
+            dst_cvter.workspace());
+    { dst_cvter.comp_to_dst_type(fdst_tensor, *args.dst_tensor); }
+}
+
+// vim: syntax=cpp.doxygen
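This fallback trades memory for simplicity: every quantized tensor is staged in Float32. Back-of-envelope staging cost under the shapes used in the QS1 test below (a sketch assuming bias and z stay empty, so only src, filter, and dst get buffers):

#include <cstddef>
#include <cstdio>

// N=20, IC=2, H=W=224, OC=20, FH=FW=3, stride 1, no padding -> 222x222 output.
int main() {
    const size_t f32 = sizeof(float);
    size_t src = size_t(20) * 2 * 224 * 224 * f32;   // ~8.0 MB staging buffer
    size_t flt = size_t(20) * 2 * 3 * 3 * f32;       // 1440 B staging buffer
    size_t dst = size_t(20) * 20 * 222 * 222 * f32;  // ~78.9 MB staging buffer
    std::printf("total staging = %zu bytes\n", src + flt + dst);
    // The Float32 sub-operator's own workspace is added on top of these.
}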
@@ -44,6 +44,10 @@ std::pair<TensorLayoutArray, ConvBiasForward::Param> sub_opr_config(
                 src.dtype.param<dtype::Quantized4Asymm>().scale *
                 filter.dtype.param<dtype::Quantized4Asymm>().scale);
+    } else if (src.dtype.enumv() == DTypeEnum::QuantizedS1) {
+        bias_type = dtype::QuantizedS32(
+                src.dtype.param<dtype::QuantizedS1>().scale *
+                filter.dtype.param<dtype::QuantizedS1>().scale);
     } else {
         megdnn_assert(src.dtype.category() == DTypeCategory::FLOAT);
         bias_type = src.dtype;
@@ -278,6 +278,9 @@ void ConvBiasForwardImpl::exec(
         DISPATCH_RAW(
                 Quantized4Asymm, QuantizedS4, QuantizedS32, QuantizedS32, DEFAULT,
                 (convolution::forward_bias<dt_quint4, dt_qint4, dt_qint32, dt_qint32>))
+        DISPATCH_RAW(
+                QuantizedS1, QuantizedS1, QuantizedS32, QuantizedS32, FLOAT32,
+                (convolution::forward_bias<dt_qint1, dt_qint1, dt_qint32, dt_qint32>))
 #if !MEGDNN_DISABLE_FLOAT16
         DISPATCH(Float16, Float16)
         DISPATCH_RAW(
@@ -84,6 +84,15 @@ inline void StrategyFwd::on(
     d += cast(s) * cast(f);
 }

+template <>
+inline void StrategyFwd::on(
+        dt_qint1& s, dt_qint1& f, dt_qint32& d, DType, DType, DType) {
+    auto cast = [](const dt_qint1& val) {
+        return dt_qint32(static_cast<int32_t>(val.as_int8()));
+    };
+    d += cast(s) * cast(f);
+}
+
 struct StrategyBwdData {
     template <typename st, typename ft, typename dt>
     static void on(st& s, ft& f, dt& d, DType, DType, DType) {
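The specialization above widens each 1-bit operand to int32 before the multiply-accumulate. A self-contained sketch of the equivalent scalar arithmetic (plain int8 stand-ins for the dt_qint1 storage that as_int8() exposes):

#include <cstdint>

// Same widening multiply-accumulate as StrategyFwd::on for dt_qint1 operands.
int32_t dot_int1(const int8_t* s, const int8_t* f, int n) {
    int32_t d = 0;
    for (int i = 0; i < n; ++i)
        d += int32_t(s[i]) * int32_t(f[i]);
    return d;
}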
@@ -133,6 +133,32 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_BF16) {
     }
 }

+TEST_F(CUDA, CONV_BIAS_FORWARD_QS1) {
+    require_compute_capability(6, 1);
+    UniformIntRNG int_rng{1, 1};
+    Checker<ConvBiasForward> checker(handle_cuda());
+    checker.set_before_exec_callback(AlgoChecker<ConvBiasForward>(
+            ExecutionPolicyAlgoName{"CONVBIAS_SIMPLE_INT1", {{"MATMUL", {}}}}));
+    ConvBias::Param param;
+    param.format = ConvBias::Param::Format::NCHW;
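+    // FLOAT32 compute mode lines up with the QuantizedS1 DISPATCH_RAW entry
+    // added to the naive reference implementation.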
+    param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
+    {
+        auto src_shape = TensorShape{20, 2, 224, 224};
+        auto filter_shape = TensorShape{20, 2, 3, 3};
+        checker.set_dtype(0, dtype::QuantizedS1(1.0f))
+                .set_dtype(1, dtype::QuantizedS1(1.0f))
+                .set_dtype(2, dtype::QuantizedS32(1.0f))
+                .set_dtype(3, dtype::QuantizedS32(1.0f))
+                .set_dtype(4, dtype::QuantizedS32(1.0f))
+                .set_rng(0, &int_rng)
+                .set_rng(1, &int_rng)
+                .set_param(param)
+                .execs({src_shape, filter_shape, {}, {}, {}});
+    }
+}
 TEST_F(CUDA, CONV_BIAS_FORWARD_QS8) {
     require_compute_capability(6, 1);