| @@ -48,6 +48,8 @@ INST_ARITY(megdnn::ConvBias, 4, 1); | |||||
| INST_ARITY(megdnn::DeformableConvBackwardData, 5, 3); | INST_ARITY(megdnn::DeformableConvBackwardData, 5, 3); | ||||
| INST_ARITY(megdnn::MatrixMul, 2, 1); | INST_ARITY(megdnn::MatrixMul, 2, 1); | ||||
| INST_ARITY(megdnn::BatchedMatrixMul, 2, 1); | INST_ARITY(megdnn::BatchedMatrixMul, 2, 1); | ||||
| INST_ARITY(megdnn::PoolingForward, 1, 1); | |||||
| INST_ARITY(megdnn::PoolingBackward, 3, 1); | |||||
| #undef INST_ARITY | #undef INST_ARITY | ||||
| @@ -259,6 +259,8 @@ public: | |||||
| DEFORMABLE_CONV_BACKWARD_FILTER, | DEFORMABLE_CONV_BACKWARD_FILTER, | ||||
| CONVBIAS_FORWARD, | CONVBIAS_FORWARD, | ||||
| BATCH_CONV_FORWARD, | BATCH_CONV_FORWARD, | ||||
| POOLING_FORWARD, | |||||
| POOLING_BACKWARD, | |||||
| }; | }; | ||||
| struct SearchItem { | struct SearchItem { | ||||
| @@ -334,6 +336,63 @@ private: | |||||
| ExecutionPolicy m_execution_policy; | ExecutionPolicy m_execution_policy; | ||||
| }; | }; | ||||
| //! specialize for nargs == 2 | |||||
| template <class Opr> | |||||
| class MultiAlgoOpr<Opr, 2> : public MultiAlgoOpr<Opr, -1> { | |||||
| public: | |||||
| using Algorithm = detail::Algorithm; | |||||
| using AlgorithmInfo = detail::Algorithm::Info; | |||||
| using AlgoAttribute = detail::Algorithm::Attribute; | |||||
| //! get all possible algorithm descriptions for the specified layouts | |||||
| std::vector<AlgorithmInfo> get_all_algorithms_info(const TensorLayout& p0, | |||||
| const TensorLayout& p1) { | |||||
| std::vector<AlgorithmInfo> ret; | |||||
| for (auto&& algo : get_all_algorithms(p0, p1)) { | |||||
| ret.emplace_back(algo->info()); | |||||
| } | |||||
| return ret; | |||||
| } | |||||
| /** | |||||
| * \brief Returns the best algorithm information which indicate the | |||||
| * algorithm by heuristic. | |||||
| * | |||||
| * The selected algorithm should not use workspace more than | |||||
| * \p workspace_limit_in_bytes. | |||||
| */ | |||||
| AlgorithmInfo get_algorithm_info_heuristic( | |||||
| const TensorLayout& p0, const TensorLayout& p1, | |||||
| size_t workspace_limit_in_bytes = | |||||
| std::numeric_limits<size_t>::max(), | |||||
| const AlgoAttribute& positive_attr = AlgoAttribute::DEFAULT, | |||||
| const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT) { | |||||
| return get_algorithm_heuristic(p0, p1, workspace_limit_in_bytes, | |||||
| positive_attr, negative_attr) | |||||
| ->info(); | |||||
| } | |||||
| protected: | |||||
| ~MultiAlgoOpr() = default; | |||||
| //! get all possible algorithms for the specified layouts | |||||
| virtual std::vector<Algorithm*> get_all_algorithms( | |||||
| const TensorLayout& p0, const TensorLayout& p1) = 0; | |||||
| /** | |||||
| * \brief Returns the best algorithm by heuristic. | |||||
| * | |||||
| * The selected algorithm should not use workspace more than | |||||
| * \p workspace_limit_in_bytes. | |||||
| */ | |||||
| virtual Algorithm* get_algorithm_heuristic( | |||||
| const TensorLayout& p0, const TensorLayout& p1, | |||||
| size_t workspace_limit_in_bytes = | |||||
| std::numeric_limits<size_t>::max(), | |||||
| const AlgoAttribute& positive_attr = AlgoAttribute::DEFAULT, | |||||
| const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT) = 0; | |||||
| }; | |||||
| //! specialize for nargs == 3 | //! specialize for nargs == 3 | ||||
| template <class Opr> | template <class Opr> | ||||
| class MultiAlgoOpr<Opr, 3> : public MultiAlgoOpr<Opr, -1> { | class MultiAlgoOpr<Opr, 3> : public MultiAlgoOpr<Opr, -1> { | ||||
| @@ -713,7 +713,8 @@ protected: | |||||
| void check_layout_fwd(const TensorLayout& src, const TensorLayout& dst); | void check_layout_fwd(const TensorLayout& src, const TensorLayout& dst); | ||||
| }; | }; | ||||
| class PoolingForward : public PoolingBase { | |||||
| class PoolingForward : public PoolingBase, | |||||
| public detail::MultiAlgoOpr<PoolingForward, 2> { | |||||
| DEF_OPR_IMPL(PoolingForward, PoolingBase, 1, 1); | DEF_OPR_IMPL(PoolingForward, PoolingBase, 1, 1); | ||||
| public: | public: | ||||
| @@ -734,7 +735,8 @@ protected: | |||||
| using Pooling = PoolingForward; | using Pooling = PoolingForward; | ||||
| class PoolingBackward : public PoolingBase { | |||||
| class PoolingBackward : public PoolingBase, | |||||
| public detail::MultiAlgoOpr<PoolingBackward, 4> { | |||||
| DEF_OPR_IMPL(PoolingBackward, PoolingBase, 3, 1); | DEF_OPR_IMPL(PoolingBackward, PoolingBase, 3, 1); | ||||
| public: | public: | ||||
| @@ -69,7 +69,7 @@ std::vector<typename Opr::Algorithm*> get_all_algorithms( | |||||
| ret.push_back(i); | ret.push_back(i); | ||||
| } | } | ||||
| } | } | ||||
| megdnn_assert(!ret.empty(), "no conv algorithm for %s", | |||||
| megdnn_assert(!ret.empty(), "no algorithm for %s", | |||||
| args.to_string().c_str()); | args.to_string().c_str()); | ||||
| return ret; | return ret; | ||||
| } | } | ||||
| @@ -294,32 +294,6 @@ void ConvDesc::set(DType data_type, const param::Convolution& param, | |||||
| #endif | #endif | ||||
| } | } | ||||
| PoolingDesc::PoolingDesc() { | |||||
| cudnn_check(cudnnCreatePoolingDescriptor(&desc)); | |||||
| } | |||||
| PoolingDesc::~PoolingDesc() { | |||||
| cudnn_check(cudnnDestroyPoolingDescriptor(desc)); | |||||
| } | |||||
| void PoolingDesc::set(const param::Pooling& param) { | |||||
| cudnnPoolingMode_t mode; | |||||
| switch (param.mode) { | |||||
| case param::Pooling::Mode::MAX: | |||||
| mode = CUDNN_POOLING_MAX; | |||||
| break; | |||||
| case param::Pooling::Mode::AVERAGE: | |||||
| mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; | |||||
| break; | |||||
| case param::Pooling::Mode::AVERAGE_COUNT_EXCLUDE_PADDING: | |||||
| mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; | |||||
| break; | |||||
| } | |||||
| cudnn_check(cudnnSetPooling2dDescriptor( | |||||
| desc, mode, CUDNN_NOT_PROPAGATE_NAN, param.window_h, param.window_w, | |||||
| param.pad_h, param.pad_w, param.stride_h, param.stride_w)); | |||||
| } | |||||
| LRNDesc::LRNDesc() { | LRNDesc::LRNDesc() { | ||||
| cudnn_check(cudnnCreateLRNDescriptor(&desc)); | cudnn_check(cudnnCreateLRNDescriptor(&desc)); | ||||
| } | } | ||||
| @@ -54,14 +54,6 @@ class ConvDesc { | |||||
| cudnnConvolutionDescriptor_t desc; | cudnnConvolutionDescriptor_t desc; | ||||
| }; | }; | ||||
| class PoolingDesc { | |||||
| public: | |||||
| PoolingDesc(); | |||||
| void set(const param::Pooling ¶m); | |||||
| ~PoolingDesc(); | |||||
| cudnnPoolingDescriptor_t desc; | |||||
| }; | |||||
| class LRNDesc { | class LRNDesc { | ||||
| public: | public: | ||||
| LRNDesc(); | LRNDesc(); | ||||
| @@ -0,0 +1,621 @@ | |||||
| /** | |||||
| * \file dnn/src/cuda/pooling/algos.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| #include "./algo.h" | |||||
| #include "./pooling2d_qint.cuh" | |||||
| #include "src/cuda/utils.h" | |||||
| using namespace megdnn; | |||||
| using namespace cuda; | |||||
| namespace { | |||||
| #define V1(v) #v | |||||
| #define V(v) V1(v) | |||||
| #define DEF_NAME(NAME) \ | |||||
| #NAME "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL) | |||||
| } // namespace | |||||
| PoolingForwardImpl::AlgoPack::AlgoPack() { | |||||
| all_algos.push_back(&algo_chwn4); | |||||
| all_algos.push_back(&algo_nchw4); | |||||
| all_algos.push_back(&algo_nchw32); | |||||
| all_algos.push_back(&algo_nhwc); | |||||
| all_algos.push_back(&algo_nchw64); | |||||
| all_algos.push_back(&algo_cudnn); | |||||
| #if CUDNN_VERSION >= 6000 | |||||
| all_algos.push_back(&algo_cudnn_max_deterministic); | |||||
| #endif | |||||
| for (auto&& algo : all_algos) { | |||||
| m_all_algos_map.emplace(algo->info().desc, algo); | |||||
| } | |||||
| } | |||||
| PoolingForwardImpl::AlgoPack PoolingForwardImpl::sm_algo_pack; | |||||
| MEGDNN_DEF_GET_ALGO_FROM_DESC(PoolingForwardImpl) | |||||
| PoolingForwardImpl::AlgoBase::SizeArgs::SizeArgs(PoolingForwardImpl* o, | |||||
| const TensorLayout& src, | |||||
| const TensorLayout& dst) | |||||
| : handle{concrete_handle(o->handle())}, | |||||
| opr{o}, | |||||
| layout_src{&src}, | |||||
| layout_dst{&dst} {} | |||||
| PoolingForwardImpl::AlgoBase::ExecArgs::ExecArgs(PoolingForwardImpl* opr, | |||||
| _megdnn_tensor_in src, | |||||
| _megdnn_tensor_out dst, | |||||
| _megdnn_workspace workspace) | |||||
| : SizeArgs(opr, src.layout, dst.layout), | |||||
| src_tensor{&src}, | |||||
| dst_tensor{&dst}, | |||||
| workspace{workspace} {} | |||||
| std::string PoolingForwardImpl::AlgoBase::SizeArgs::to_string() const { | |||||
| return ssprintf("src=%s, dst=%s", layout_src->to_string().c_str(), | |||||
| layout_dst->to_string().c_str()); | |||||
| } | |||||
| WorkspaceBundle PoolingForwardImpl::AlgoBase::get_workspace_bundle( | |||||
| void* ptr, const SizeArgs& args) const { | |||||
| SmallVector<size_t> sizes; | |||||
| TensorLayout fsrc = *args.layout_src; | |||||
| TensorLayout fdst = *args.layout_dst; | |||||
| auto get_workspace = [&sizes](TensorLayout& layout) { | |||||
| if (layout.dtype == dtype::BFloat16()) { | |||||
| layout.dtype = dtype::Float32(); | |||||
| sizes.push_back(layout.span().dist_byte()); | |||||
| } | |||||
| }; | |||||
| get_workspace(fsrc); | |||||
| get_workspace(fdst); | |||||
| return {ptr, std::move(sizes)}; | |||||
| } | |||||
| size_t PoolingForwardImpl::AlgoBase::get_workspace_in_bytes( | |||||
| const SizeArgs& args) const { | |||||
| return get_workspace_bundle(nullptr, args).total_size_in_bytes(); | |||||
| } | |||||
| bool PoolingForwardImpl::AlgoCUDNN::is_available(const SizeArgs& args) const { | |||||
| using Format = param::Pooling::Format; | |||||
| return (((args.opr->param().format == Format::NCHW || | |||||
| args.opr->param().format == Format::NHWC) && | |||||
| (args.layout_src->dtype.enumv() == DTypeEnum::Float16 || | |||||
| args.layout_src->dtype.enumv() == DTypeEnum::BFloat16 || | |||||
| args.layout_src->dtype.enumv() == DTypeEnum::Float32 || | |||||
| args.layout_src->dtype.enumv() == DTypeEnum::Int8 || | |||||
| args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS32 || | |||||
| args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS8 || | |||||
| args.layout_src->dtype.enumv() == DTypeEnum::Quantized8Asymm)) || | |||||
| ((args.opr->param().format == Format::NCHW4 || | |||||
| args.opr->param().format == Format::NCHW32) && | |||||
| (args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS8 || | |||||
| args.layout_src->dtype.enumv() == DTypeEnum::Quantized8Asymm))); | |||||
| } | |||||
| void PoolingForwardImpl::AlgoCUDNN::init_mode(const ExecArgs& args, | |||||
| cudnnPoolingMode_t& mode) const { | |||||
| switch (args.opr->param().mode) { | |||||
| case param::Pooling::Mode::MAX: | |||||
| mode = CUDNN_POOLING_MAX; | |||||
| break; | |||||
| case param::Pooling::Mode::AVERAGE: | |||||
| mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; | |||||
| break; | |||||
| case param::Pooling::Mode::AVERAGE_COUNT_EXCLUDE_PADDING: | |||||
| mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; | |||||
| break; | |||||
| default: | |||||
| megdnn_throw(ssprintf("Unspport pooling mode : {%d}", | |||||
| static_cast<int>(args.opr->param().mode))); | |||||
| } | |||||
| } | |||||
| void PoolingForwardImpl::AlgoCUDNN::exec(const ExecArgs& args) const { | |||||
| TensorND src = *args.src_tensor; | |||||
| TensorND dst = *args.dst_tensor; | |||||
| auto wsb = get_workspace_bundle(args.workspace.raw_ptr, args); | |||||
| auto ctypecvt = CompTypeCvter<dtype::BFloat16, dtype::Float32>( | |||||
| concrete_handle(args.handle), &wsb); | |||||
| if (args.layout_src->dtype.enumv() == DTypeTrait<dtype::BFloat16>::enumv) { | |||||
| ctypecvt.src_to_comp_type(*args.src_tensor, src) | |||||
| .src_to_comp_type(*args.dst_tensor, dst); | |||||
| } | |||||
| { | |||||
| dt_float32 alpha = 1.0f, beta = 0.0f; | |||||
| TensorDesc src_desc, dst_desc; | |||||
| src_desc.set(src.layout, args.opr->param().format); | |||||
| dst_desc.set(dst.layout, args.opr->param().format); | |||||
| cudnnPoolingMode_t mode; | |||||
| init_mode(args, mode); | |||||
| cudnnPoolingDescriptor_t cudnn_desc; | |||||
| cudnn_check(cudnnCreatePoolingDescriptor(&cudnn_desc)); | |||||
| cudnn_check(cudnnSetPooling2dDescriptor( | |||||
| cudnn_desc, mode, CUDNN_NOT_PROPAGATE_NAN, | |||||
| args.opr->param().window_h, args.opr->param().window_w, | |||||
| args.opr->param().pad_h, args.opr->param().pad_w, | |||||
| args.opr->param().stride_h, args.opr->param().stride_w)); | |||||
| cudnn_check(cudnnPoolingForward(args.handle->cudnn_handle(), cudnn_desc, | |||||
| &alpha, src_desc.desc, src.raw_ptr, | |||||
| &beta, dst_desc.desc, dst.raw_ptr)); | |||||
| cudnn_check(cudnnDestroyPoolingDescriptor(cudnn_desc)); | |||||
| } | |||||
| if (args.layout_src->dtype.enumv() == DTypeTrait<dtype::BFloat16>::enumv) { | |||||
| ctypecvt.comp_to_dst_type(dst, *args.dst_tensor); | |||||
| } | |||||
| } | |||||
| #if CUDNN_VERSION >= 6000 | |||||
| bool PoolingForwardImpl::AlgoCUDNNMAXDETERMINISTIC::is_available( | |||||
| const SizeArgs& args) const { | |||||
| using Format = param::Pooling::Format; | |||||
| return (args.opr->param().mode == param::Pooling::Mode::MAX && | |||||
| (((args.opr->param().format == Format::NCHW || | |||||
| args.opr->param().format == Format::NHWC) && | |||||
| (args.layout_src->dtype.enumv() == DTypeEnum::Float16 || | |||||
| args.layout_src->dtype.enumv() == DTypeEnum::BFloat16 || | |||||
| args.layout_src->dtype.enumv() == DTypeEnum::Float32 || | |||||
| args.layout_src->dtype.enumv() == DTypeEnum::Int8 || | |||||
| args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS32 || | |||||
| args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS8 || | |||||
| args.layout_src->dtype.enumv() == DTypeEnum::Quantized8Asymm)) || | |||||
| ((args.opr->param().format == Format::NCHW4 || | |||||
| args.opr->param().format == Format::NCHW32) && | |||||
| (args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS8 || | |||||
| args.layout_src->dtype.enumv() == DTypeEnum::Quantized8Asymm)))); | |||||
| } | |||||
| void PoolingForwardImpl::AlgoCUDNNMAXDETERMINISTIC::init_mode( | |||||
| const ExecArgs& args, cudnnPoolingMode_t& mode) const { | |||||
| switch (args.opr->param().mode) { | |||||
| case param::Pooling::Mode::MAX: | |||||
| mode = CUDNN_POOLING_MAX_DETERMINISTIC; | |||||
| break; | |||||
| default: | |||||
| megdnn_throw(ssprintf("Unspport pooling mode : {%d}", | |||||
| static_cast<int>(args.opr->param().mode))); | |||||
| } | |||||
| } | |||||
| void PoolingForwardImpl::AlgoCUDNNMAXDETERMINISTIC::exec( | |||||
| const ExecArgs& args) const { | |||||
| TensorND src = *args.src_tensor; | |||||
| TensorND dst = *args.dst_tensor; | |||||
| auto wsb = get_workspace_bundle(args.workspace.raw_ptr, args); | |||||
| auto ctypecvt = CompTypeCvter<dtype::BFloat16, dtype::Float32>( | |||||
| concrete_handle(args.handle), &wsb); | |||||
| if (args.layout_src->dtype.enumv() == DTypeTrait<dtype::BFloat16>::enumv) { | |||||
| ctypecvt.src_to_comp_type(*args.src_tensor, src) | |||||
| .src_to_comp_type(*args.dst_tensor, dst); | |||||
| } | |||||
| { | |||||
| dt_float32 alpha = 1.0f, beta = 0.0f; | |||||
| TensorDesc src_desc, dst_desc; | |||||
| src_desc.set(src.layout, args.opr->param().format); | |||||
| dst_desc.set(dst.layout, args.opr->param().format); | |||||
| cudnnPoolingMode_t mode; | |||||
| init_mode(args, mode); | |||||
| cudnnPoolingDescriptor_t cudnn_desc; | |||||
| cudnn_check(cudnnCreatePoolingDescriptor(&cudnn_desc)); | |||||
| cudnn_check(cudnnSetPooling2dDescriptor( | |||||
| cudnn_desc, mode, CUDNN_NOT_PROPAGATE_NAN, | |||||
| args.opr->param().window_h, args.opr->param().window_w, | |||||
| args.opr->param().pad_h, args.opr->param().pad_w, | |||||
| args.opr->param().stride_h, args.opr->param().stride_w)); | |||||
| cudnn_check(cudnnPoolingForward(args.handle->cudnn_handle(), cudnn_desc, | |||||
| &alpha, src_desc.desc, src.raw_ptr, | |||||
| &beta, dst_desc.desc, dst.raw_ptr)); | |||||
| cudnn_check(cudnnDestroyPoolingDescriptor(cudnn_desc)); | |||||
| } | |||||
| if (args.layout_src->dtype.enumv() == DTypeTrait<dtype::BFloat16>::enumv) { | |||||
| ctypecvt.comp_to_dst_type(dst, *args.dst_tensor); | |||||
| } | |||||
| } | |||||
| #endif | |||||
| bool PoolingForwardImpl::AlgoCHWN4::is_available(const SizeArgs& args) const { | |||||
| using Format = param::Pooling::Format; | |||||
| return (args.opr->param().format == Format::CHWN4 && | |||||
| (args.layout_src->dtype.enumv() == DTypeEnum::Quantized8Asymm || | |||||
| args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS8)); | |||||
| } | |||||
| void PoolingForwardImpl::AlgoCHWN4::exec(const ExecArgs& args) const { | |||||
| pooling2d::Param kern_param; | |||||
| size_t c = (*args.layout_src)[0], hi = (*args.layout_src)[1], | |||||
| wi = (*args.layout_src)[2], n = (*args.layout_src)[3], | |||||
| ho = (*args.layout_dst)[1], wo = (*args.layout_dst)[2]; | |||||
| c = c * 4; | |||||
| size_t ph = args.opr->param().pad_h, pw = args.opr->param().pad_w; | |||||
| size_t window_h = args.opr->param().window_h, | |||||
| window_w = args.opr->param().window_w; | |||||
| size_t sh = args.opr->param().stride_h, sw = args.opr->param().stride_w; | |||||
| kern_param.n = n, kern_param.c = c, kern_param.hi = hi, kern_param.wi = wi, | |||||
| kern_param.ho = ho, kern_param.wo = wo, kern_param.ph = ph, | |||||
| kern_param.pw = pw, kern_param.window_h = window_h, | |||||
| kern_param.window_w = window_w, kern_param.sh = sh, kern_param.sw = sw; | |||||
| auto&& stream = cuda_stream(args.handle); | |||||
| pooling2d::do_pooling2d_int8_cdiv4hwn4( | |||||
| args.src_tensor->compatible_ptr<int8_t>(), | |||||
| args.dst_tensor->compatible_ptr<int8_t>(), kern_param, stream, | |||||
| static_cast<uint32_t>(args.opr->param().mode)); | |||||
| } | |||||
| bool PoolingForwardImpl::AlgoNCHW4::is_available(const SizeArgs& args) const { | |||||
| using Format = param::Pooling::Format; | |||||
| return args.opr->param().format == Format::NCHW4 && | |||||
| (args.layout_src->dtype.enumv() == DTypeEnum::Quantized8Asymm || | |||||
| args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS8); | |||||
| } | |||||
| void PoolingForwardImpl::AlgoNCHW4::exec(const ExecArgs& args) const { | |||||
| pooling2d::Param kern_param; | |||||
| size_t n = (*args.layout_src)[0], hi = (*args.layout_src)[2], | |||||
| wi = (*args.layout_src)[3], c = (*args.layout_src)[1], | |||||
| ho = (*args.layout_dst)[2], wo = (*args.layout_dst)[3]; | |||||
| c = c * 4; | |||||
| size_t ph = args.opr->param().pad_h, pw = args.opr->param().pad_w; | |||||
| size_t window_h = args.opr->param().window_h, | |||||
| window_w = args.opr->param().window_w; | |||||
| size_t sh = args.opr->param().stride_h, sw = args.opr->param().stride_w; | |||||
| kern_param.n = n, kern_param.c = c, kern_param.hi = hi, kern_param.wi = wi, | |||||
| kern_param.ho = ho, kern_param.wo = wo, kern_param.ph = ph, | |||||
| kern_param.pw = pw, kern_param.window_h = window_h, | |||||
| kern_param.window_w = window_w, kern_param.sh = sh, kern_param.sw = sw; | |||||
| auto&& stream = cuda_stream(args.handle); | |||||
| pooling2d::do_pooling2d_int8_ncdiv4hw4( | |||||
| args.src_tensor->compatible_ptr<int8_t>(), | |||||
| args.dst_tensor->compatible_ptr<int8_t>(), kern_param, stream, | |||||
| static_cast<uint32_t>(args.opr->param().mode)); | |||||
| } | |||||
| bool PoolingForwardImpl::AlgoNCHW32::is_available(const SizeArgs& args) const { | |||||
| using Format = param::Pooling::Format; | |||||
| return (args.opr->param().format == Format::NCHW32 && | |||||
| (args.layout_src->dtype.enumv() == DTypeEnum::Quantized8Asymm || | |||||
| args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS8)); | |||||
| } | |||||
| void PoolingForwardImpl::AlgoNCHW32::exec(const ExecArgs& args) const { | |||||
| pooling2d::Param kern_param; | |||||
| size_t n = (*args.layout_src)[0], hi = (*args.layout_src)[2], | |||||
| wi = (*args.layout_src)[3], c = (*args.layout_src)[1], | |||||
| ho = (*args.layout_dst)[2], wo = (*args.layout_dst)[3]; | |||||
| c = c * 32; | |||||
| size_t ph = args.opr->param().pad_h, pw = args.opr->param().pad_w; | |||||
| size_t window_h = args.opr->param().window_h, | |||||
| window_w = args.opr->param().window_w; | |||||
| size_t sh = args.opr->param().stride_h, sw = args.opr->param().stride_w; | |||||
| kern_param.n = n, kern_param.c = c, kern_param.hi = hi, kern_param.wi = wi, | |||||
| kern_param.ho = ho, kern_param.wo = wo, kern_param.ph = ph, | |||||
| kern_param.pw = pw, kern_param.window_h = window_h, | |||||
| kern_param.window_w = window_w, kern_param.sh = sh, kern_param.sw = sw; | |||||
| auto&& stream = cuda_stream(args.handle); | |||||
| pooling2d::do_pooling2d_int8_ncdiv32hw32( | |||||
| args.src_tensor->compatible_ptr<int8_t>(), | |||||
| args.dst_tensor->compatible_ptr<int8_t>(), kern_param, stream, | |||||
| static_cast<uint32_t>(args.opr->param().mode)); | |||||
| } | |||||
| bool PoolingForwardImpl::AlgoNHWC::is_available(const SizeArgs& args) const { | |||||
| using Format = param::Pooling::Format; | |||||
| return (args.opr->param().format == Format::NHWC && | |||||
| (args.layout_src->dtype.enumv() == DTypeEnum::Quantized4Asymm || | |||||
| args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS4)); | |||||
| } | |||||
| void PoolingForwardImpl::AlgoNHWC::exec(const ExecArgs& args) const { | |||||
| TensorND src = *args.src_tensor; | |||||
| TensorND dst = *args.dst_tensor; | |||||
| { | |||||
| megdnn_assert(src.layout.dtype.enumv() == dst.layout.dtype.enumv(), | |||||
| "src and dst dtype must equal"); | |||||
| pooling2d::Param kern_param; | |||||
| size_t n = src.layout[0], hi = src.layout[1], wi = src.layout[2], | |||||
| c = src.layout[3], ho = dst.layout[1], wo = dst.layout[2]; | |||||
| size_t ph = args.opr->param().pad_h, pw = args.opr->param().pad_w; | |||||
| size_t window_h = args.opr->param().window_h, | |||||
| window_w = args.opr->param().window_w; | |||||
| size_t sh = args.opr->param().stride_h, sw = args.opr->param().stride_w; | |||||
| kern_param.n = n, kern_param.c = c, kern_param.hi = hi, | |||||
| kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo, | |||||
| kern_param.ph = ph, kern_param.pw = pw, kern_param.window_h = window_h, | |||||
| kern_param.window_w = window_w, kern_param.sh = sh, kern_param.sw = sw; | |||||
| bool uint_case = false; | |||||
| int zero_point = 0; | |||||
| if (src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) { | |||||
| uint_case = true; | |||||
| zero_point = | |||||
| src.layout.dtype.param<dtype::Quantized4Asymm>().zero_point; | |||||
| } | |||||
| auto&& stream = cuda_stream(args.handle); | |||||
| pooling2d::do_pooling2d_int4_nhwc( | |||||
| (int8_t*)src.raw_ptr, (int8_t*)dst.raw_ptr, kern_param, stream, | |||||
| static_cast<uint32_t>(args.opr->param().mode), uint_case, | |||||
| zero_point); | |||||
| } | |||||
| } | |||||
| inline void PoolingForwardImpl::AlgoNCHW64::deduce_reformat_layout( | |||||
| std::unique_ptr<RelayoutFormat>& relayout, | |||||
| const TensorLayout& src_layout, TensorLayout& dst_layout, | |||||
| RelayoutFormat::Param::Mode mode, const int oc = 0, | |||||
| const int group = 1) const { | |||||
| if (src_layout.ndim > 0) { | |||||
| RelayoutFormat::Param trans_param; | |||||
| trans_param.mode = mode; | |||||
| trans_param.oc = oc; | |||||
| trans_param.group = group; | |||||
| relayout->param() = trans_param; | |||||
| relayout->deduce_layout(src_layout, dst_layout); | |||||
| } else { | |||||
| dst_layout = src_layout; | |||||
| } | |||||
| } | |||||
| void PoolingForwardImpl::AlgoNCHW64::get_inner_layout( | |||||
| const TensorLayout& src, const TensorLayout& dst, | |||||
| TensorLayout& inner_src, TensorLayout& inner_dst, Handle* handle, | |||||
| PoolingForwardImpl::Param::Format format) const { | |||||
| auto relayout_opr = handle->create_operator<RelayoutFormat>(); | |||||
| deduce_reformat_layout(relayout_opr, src, inner_src, | |||||
| RelayoutFormat::Param::Mode::NCHW_NCHW64, 0, 1); | |||||
| deduce_reformat_layout(relayout_opr, dst, inner_dst, | |||||
| RelayoutFormat::Param::Mode::NCHW_NCHW64, 0, 1); | |||||
| } | |||||
| WorkspaceBundle PoolingForwardImpl::AlgoNCHW64::get_workspace_bundle( | |||||
| void* ptr, const SizeArgs& args) const { | |||||
| using Format = param::Pooling::Format; | |||||
| SmallVector<size_t> sizes; | |||||
| TensorLayout fsrc = *args.layout_src; | |||||
| TensorLayout fdst = *args.layout_dst; | |||||
| if (args.opr->param().format == Format::NCHW) { | |||||
| get_inner_layout(*args.layout_src, *args.layout_dst, fsrc, fdst, | |||||
| args.handle, args.opr->param().format); | |||||
| sizes.push_back(fsrc.span().dist_byte()); | |||||
| sizes.push_back(fdst.span().dist_byte()); | |||||
| } | |||||
| return {ptr, std::move(sizes)}; | |||||
| } | |||||
| bool PoolingForwardImpl::AlgoNCHW64::is_available(const SizeArgs& args) const { | |||||
| using Format = param::Pooling::Format; | |||||
| return ((args.opr->param().format == Format::NCHW || | |||||
| args.opr->param().format == Format::NCHW64) && | |||||
| (args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS4 || | |||||
| args.layout_src->dtype.enumv() == DTypeEnum::Quantized4Asymm) && | |||||
| (args.layout_dst->dtype.enumv() == DTypeEnum::QuantizedS4 || | |||||
| args.layout_dst->dtype.enumv() == DTypeEnum::Quantized4Asymm)); | |||||
| } | |||||
| void PoolingForwardImpl::AlgoNCHW64::exec(const ExecArgs& args) const { | |||||
| using Format = param::Pooling::Format; | |||||
| TensorND src = *args.src_tensor; | |||||
| TensorND dst = *args.dst_tensor; | |||||
| if (args.opr->param().format == Format::NCHW) { | |||||
| auto wsb = get_workspace_bundle(args.workspace.raw_ptr, args); | |||||
| auto handle_ptr = args.handle; | |||||
| get_inner_layout(*args.layout_src, *args.layout_dst, src.layout, | |||||
| dst.layout, handle_ptr, args.opr->param().format); | |||||
| src.raw_ptr = wsb.get(0); | |||||
| dst.raw_ptr = wsb.get(1); | |||||
| auto relayout_opr = handle_ptr->create_operator<RelayoutFormat>(); | |||||
| RelayoutFormat::Param trans_param; | |||||
| trans_param.mode = RelayoutFormat::Param::Mode::NCHW_NCHW64; | |||||
| relayout_opr->param() = trans_param; | |||||
| relayout_opr->exec(*args.src_tensor, src, {}); | |||||
| } | |||||
| { | |||||
| pooling2d::Param kern_param; | |||||
| size_t n = src.layout[0], hi = src.layout[2], wi = src.layout[3], | |||||
| c = src.layout[1], ho = dst.layout[2], wo = dst.layout[3]; | |||||
| c = c * 64; | |||||
| size_t ph = args.opr->param().pad_h, pw = args.opr->param().pad_w; | |||||
| size_t window_h = args.opr->param().window_h, | |||||
| window_w = args.opr->param().window_w; | |||||
| size_t sh = args.opr->param().stride_h, sw = args.opr->param().stride_w; | |||||
| kern_param.n = n, kern_param.c = c, kern_param.hi = hi, | |||||
| kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo, | |||||
| kern_param.ph = ph, kern_param.pw = pw, kern_param.window_h = window_h, | |||||
| kern_param.window_w = window_w, kern_param.sh = sh, kern_param.sw = sw; | |||||
| bool uint_case = false; | |||||
| int zero_point = 0; | |||||
| if (src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) { | |||||
| uint_case = true; | |||||
| zero_point = | |||||
| src.layout.dtype.param<dtype::Quantized4Asymm>().zero_point; | |||||
| } | |||||
| auto&& stream = cuda_stream(args.handle); | |||||
| pooling2d::do_pooling2d_int4_ncdiv64hw64( | |||||
| (int8_t*)src.raw_ptr, (int8_t*)dst.raw_ptr, kern_param, stream, | |||||
| static_cast<uint32_t>(args.opr->param().mode), uint_case, | |||||
| zero_point); | |||||
| } | |||||
| if (args.layout_dst->ndim == 4) { | |||||
| auto relayout_opr = args.handle->create_operator<RelayoutFormat>(); | |||||
| RelayoutFormat::Param trans_param; | |||||
| trans_param.mode = RelayoutFormat::Param::Mode::NCHW64_NCHW; | |||||
| relayout_opr->param() = trans_param; | |||||
| relayout_opr->exec(dst, *args.dst_tensor, {}); | |||||
| } | |||||
| } | |||||
| PoolingBackwardImpl::AlgoPack::AlgoPack() { | |||||
| algo_cudnn.push_back({DEF_NAME(cudnnUnreproducible), false}); | |||||
| algo_cudnn.push_back({DEF_NAME(cudnnReproducible), true}); | |||||
| for (auto&& i : algo_cudnn) { | |||||
| all_algos.push_back(&i); | |||||
| } | |||||
| for (auto&& algo : all_algos) { | |||||
| m_all_algos_map.emplace(algo->info().desc, algo); | |||||
| } | |||||
| } | |||||
| PoolingBackwardImpl::AlgoPack PoolingBackwardImpl::sm_algo_pack; | |||||
| MEGDNN_DEF_GET_ALGO_FROM_DESC(PoolingBackwardImpl) | |||||
| PoolingBackwardImpl::AlgoBase::SizeArgs::SizeArgs(PoolingBackwardImpl* o, | |||||
| const TensorLayout& src, | |||||
| const TensorLayout& dst, | |||||
| const TensorLayout& diff, | |||||
| const TensorLayout& grad) | |||||
| : handle{concrete_handle(o->handle())}, | |||||
| opr{o}, | |||||
| layout_src{&src}, | |||||
| layout_dst{&dst}, | |||||
| layout_diff{&diff}, | |||||
| layout_grad{&grad} {} | |||||
| PoolingBackwardImpl::AlgoBase::ExecArgs::ExecArgs(PoolingBackwardImpl* opr, | |||||
| _megdnn_tensor_in src, | |||||
| _megdnn_tensor_in dst, | |||||
| _megdnn_tensor_in diff, | |||||
| _megdnn_tensor_out grad, | |||||
| _megdnn_workspace workspace) | |||||
| : SizeArgs(opr, src.layout, dst.layout, diff.layout, grad.layout), | |||||
| src_tensor{&src}, | |||||
| dst_tensor{&dst}, | |||||
| diff_tensor{&diff}, | |||||
| grad_tensor{&grad}, | |||||
| workspace{workspace} {} | |||||
| std::string PoolingBackwardImpl::AlgoBase::SizeArgs::to_string() const { | |||||
| return ssprintf( | |||||
| "src=%s, dst=%s, diff=%s, grad=%s", layout_src->to_string().c_str(), | |||||
| layout_dst->to_string().c_str(), layout_diff->to_string().c_str(), | |||||
| layout_grad->to_string().c_str()); | |||||
| } | |||||
| bool PoolingBackwardImpl::AlgoCUDNN::is_available(const SizeArgs& args) const { | |||||
| using Format = param::Pooling::Format; | |||||
| #if CUDNN_VERSION < 6000 | |||||
| return ((args.opr->param().format == Format::NCHW || | |||||
| args.opr->param().format == Format::NHWC || | |||||
| args.opr->param().format == Format::NCHW4 || | |||||
| args.opr->param().format == Format::NCHW32) && | |||||
| (m_is_reproducible ^ | |||||
| (args.opr->param().mode == param::Pooling::Mode::MAX))); | |||||
| #else | |||||
| return ((args.opr->param().format == Format::NCHW || | |||||
| args.opr->param().format == Format::NHWC || | |||||
| args.opr->param().format == Format::NCHW4 || | |||||
| args.opr->param().format == Format::NCHW32) && | |||||
| (m_is_reproducible || | |||||
| args.opr->param().mode == param::Pooling::Mode::MAX)); | |||||
| #endif | |||||
| } | |||||
| WorkspaceBundle PoolingBackwardImpl::AlgoBase::get_workspace_bundle( | |||||
| void* ptr, const SizeArgs& args) const { | |||||
| SmallVector<size_t> sizes; | |||||
| TensorLayout fsrc = *args.layout_src; | |||||
| TensorLayout fdst = *args.layout_dst; | |||||
| TensorLayout fdiff = *args.layout_diff; | |||||
| TensorLayout fgrad = *args.layout_grad; | |||||
| auto get_workspace = [&sizes](TensorLayout& layout) { | |||||
| if (layout.dtype == dtype::BFloat16()) { | |||||
| layout.dtype = dtype::Float32(); | |||||
| sizes.push_back(layout.span().dist_byte()); | |||||
| } | |||||
| }; | |||||
| get_workspace(fsrc); | |||||
| get_workspace(fdst); | |||||
| get_workspace(fdiff); | |||||
| get_workspace(fgrad); | |||||
| return {ptr, std::move(sizes)}; | |||||
| } | |||||
| size_t PoolingBackwardImpl::AlgoBase::get_workspace_in_bytes( | |||||
| const SizeArgs& args) const { | |||||
| return get_workspace_bundle(nullptr, args).total_size_in_bytes(); | |||||
| } | |||||
| void PoolingBackwardImpl::AlgoCUDNN::init_mode(const ExecArgs& args, | |||||
| cudnnPoolingMode_t& mode) const { | |||||
| if (m_is_reproducible) { | |||||
| switch (args.opr->param().mode) { | |||||
| #if CUDNN_VERSION >= 6000 | |||||
| case param::Pooling::Mode::MAX: | |||||
| mode = CUDNN_POOLING_MAX_DETERMINISTIC; | |||||
| break; | |||||
| #endif | |||||
| case param::Pooling::Mode::AVERAGE: | |||||
| mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; | |||||
| break; | |||||
| case param::Pooling::Mode::AVERAGE_COUNT_EXCLUDE_PADDING: | |||||
| mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; | |||||
| break; | |||||
| default: | |||||
| megdnn_throw( | |||||
| ssprintf("Unspport pooling mode : {%d}", | |||||
| static_cast<int>(args.opr->param().mode))); | |||||
| } | |||||
| } else if (args.opr->param().mode == param::Pooling::Mode::MAX) { | |||||
| mode = CUDNN_POOLING_MAX; | |||||
| } else { | |||||
| megdnn_throw("init_mode failed\n"); | |||||
| } | |||||
| } | |||||
//! Run pooling backward through cudnnPoolingBackward, with an optional
//! BFloat16 -> Float32 round trip staged in the workspace bundle.
void PoolingBackwardImpl::AlgoCUDNN::exec(const ExecArgs& args) const {
    // Local copies: these may be redirected to fp32 staging buffers below.
    TensorND src = *args.src_tensor;
    TensorND dst = *args.dst_tensor;
    TensorND diff = *args.diff_tensor;
    TensorND grad = *args.grad_tensor;
    auto wsb = get_workspace_bundle(args.workspace.raw_ptr, args);
    auto ctypecvt = CompTypeCvter<dtype::BFloat16, dtype::Float32>(
            concrete_handle(args.handle), &wsb);
    if (args.layout_src->dtype.enumv() == DTypeTrait<dtype::BFloat16>::enumv) {
        // Convert all four tensors into the fp32 compute type; src/dst/diff/
        // grad now point into the workspace buffers allocated for them.
        ctypecvt.src_to_comp_type(*args.src_tensor, src)
                .src_to_comp_type(*args.dst_tensor, dst)
                .src_to_comp_type(*args.diff_tensor, diff)
                .src_to_comp_type(*args.grad_tensor, grad);
    }
    {
        // grad = 1.0 * pooling_backward(...) + 0.0 * grad
        dt_float32 alpha = 1.0f, beta = 0.0f;
        TensorDesc src_desc, dst_desc, diff_desc, grad_desc;
        src_desc.set(src.layout, args.opr->param().format);
        dst_desc.set(dst.layout, args.opr->param().format);
        diff_desc.set(diff.layout, args.opr->param().format);
        grad_desc.set(grad.layout, args.opr->param().format);
        cudnnPoolingMode_t mode;
        init_mode(args, mode);
        // The pooling descriptor is created per call and destroyed before
        // returning, so no descriptor state leaks between invocations.
        cudnnPoolingDescriptor_t cudnn_desc;
        cudnn_check(cudnnCreatePoolingDescriptor(&cudnn_desc));
        cudnn_check(cudnnSetPooling2dDescriptor(
                cudnn_desc, mode, CUDNN_NOT_PROPAGATE_NAN,
                args.opr->param().window_h, args.opr->param().window_w,
                args.opr->param().pad_h, args.opr->param().pad_w,
                args.opr->param().stride_h, args.opr->param().stride_w));
        // cuDNN argument order: y (forward output), dy, x (forward input), dx.
        cudnn_check(cudnnPoolingBackward(
                args.handle->cudnn_handle(), cudnn_desc, &alpha, dst_desc.desc,
                dst.raw_ptr, diff_desc.desc, diff.raw_ptr, src_desc.desc,
                src.raw_ptr, &beta, grad_desc.desc, grad.raw_ptr));
        cudnn_check(cudnnDestroyPoolingDescriptor(cudnn_desc));
    }
    if (args.layout_src->dtype.enumv() == DTypeTrait<dtype::BFloat16>::enumv) {
        // Only grad is an output: convert it back to the caller's BFloat16.
        ctypecvt.comp_to_dst_type(grad, *args.grad_tensor);
    }
}
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,269 @@ | |||||
| /** | |||||
| * \file dnn/src/cuda/pooling/algo.h | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
| * implied. | |||||
| */ | |||||
| #pragma once | |||||
| #include <unordered_map> | |||||
| #include "src/common/algo_base.h" | |||||
| #include "src/common/metahelper.h" | |||||
| #include "src/cuda/cudnn_wrapper.h" | |||||
| #include "src/cuda/pooling/opr_impl.h" | |||||
| namespace megdnn { | |||||
| namespace cuda { | |||||
| namespace { | |||||
| #define V1(v) #v | |||||
| #define V(v) V1(v) | |||||
| #define DEF_NAME(NAME) \ | |||||
| #NAME "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL) | |||||
| } // namespace | |||||
//! Base class of all CUDA forward-pooling algorithms; each concrete algorithm
//! implements the availability check, workspace query and execution.
class PoolingForwardImpl::AlgoBase : public Algorithm {
public:
    //! tag consumed by MEGDNN_DECL_ALGO_TYPE to identify each algorithm
    enum class AlgoType : uint32_t {
        CUDA_CUDNN,
#if CUDNN_VERSION >= 6000
        CUDA_CUDNN_MAXDETERMINISTIC,
#endif
        CUDA_CHWN4,
        CUDA_NCHW4,
        CUDA_NCHW32,
        CUDA_NHWC,
        CUDA_NCHW64
    };
    using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;

    AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::CUDA; }
    //! operator + layouts: enough to decide availability and workspace size
    struct SizeArgs {
        HandleImpl* handle;
        PoolingForwardImpl* opr;
        const TensorLayout *layout_src, *layout_dst;

        std::string to_string() const;
        SizeArgs(PoolingForwardImpl* opr, const TensorLayout& src,
                 const TensorLayout& dst);
    };
    //! SizeArgs plus the concrete tensors and workspace used at exec time
    struct ExecArgs : public SizeArgs {
        const TensorND *src_tensor, *dst_tensor;
        Workspace workspace;

        ExecArgs(PoolingForwardImpl* opr, _megdnn_tensor_in src,
                 _megdnn_tensor_out dst, _megdnn_workspace workspace);
    };
    virtual bool is_available(const SizeArgs& args) const = 0;
    size_t get_workspace_in_bytes(const SizeArgs& args) const;
    virtual void exec(const ExecArgs& args) const = 0;

    //! true iff the algo is available AND carries every positive_attr AND
    //! none of negative_attr
    bool is_available_attribute(
            const SizeArgs& args,
            const AlgoAttribute& positive_attr = AlgoAttribute::REPRODUCIBLE,
            const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT) {
        return contain_attribute_all(positive_attr) &&
               !contain_attribute_any(negative_attr) && is_available(args);
    }

protected:
    // non-virtual: instances are never deleted through AlgoBase*
    ~AlgoBase() = default;
    virtual WorkspaceBundle get_workspace_bundle(void* ptr,
                                                 const SizeArgs& args) const;
};
//! cuDNN-backed forward pooling; always reported as REPRODUCIBLE.
class PoolingForwardImpl::AlgoCUDNN final : public AlgoBase {
    std::string m_algo_name;

public:
    AlgoCUDNN(std::string name) : m_algo_name(name) {}
    bool is_available(const SizeArgs& args) const override;
    //! translate the megdnn pooling mode into a cudnnPoolingMode_t
    void init_mode(const ExecArgs& args, cudnnPoolingMode_t& mode) const;
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return m_algo_name.c_str(); }
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
    MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN)
    std::string param() const override { return m_algo_name; }
};
#if CUDNN_VERSION >= 6000
//! cuDNN forward pooling using the deterministic MAX kernel, which only
//! exists since cuDNN 6 (hence the version guard).
class PoolingForwardImpl::AlgoCUDNNMAXDETERMINISTIC final : public AlgoBase {
    std::string m_algo_name;

public:
    AlgoCUDNNMAXDETERMINISTIC(std::string name) : m_algo_name(name) {}
    bool is_available(const SizeArgs& args) const override;
    //! translate the megdnn pooling mode into a cudnnPoolingMode_t
    void init_mode(const ExecArgs& args, cudnnPoolingMode_t& mode) const;
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return m_algo_name.c_str(); }
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
    MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN_MAXDETERMINISTIC)
    std::string param() const override { return m_algo_name; }
};
#endif
// Declares one forward-pooling algorithm class per tensor layout. The macro
// deliberately does NOT emit the closing "};" so that an instantiation may
// append extra members before closing the class itself (see NCHW64 below).
#define ALGO_LAYOUT_POOLING_IMPL(_layout)                                 \
    class PoolingForwardImpl::Algo##_layout final : public AlgoBase {     \
        std::string m_algo_name;                                          \
                                                                          \
    public:                                                               \
        Algo##_layout(                                                    \
                std::string name = std::string("CUDA_").append(#_layout)) \
                : m_algo_name(name) {}                                    \
        bool is_available(const SizeArgs& args) const override;           \
        void exec(const ExecArgs& args) const override;                   \
        const char* name() const override { return m_algo_name.c_str(); } \
        AlgoAttribute attribute() const override {                        \
            return AlgoAttribute::REPRODUCIBLE;                           \
        }                                                                 \
        MEGDNN_DECL_ALGO_TYPE(CUDA_##_layout)

ALGO_LAYOUT_POOLING_IMPL(CHWN4)};
ALGO_LAYOUT_POOLING_IMPL(NCHW4)};
ALGO_LAYOUT_POOLING_IMPL(NCHW32)};
ALGO_LAYOUT_POOLING_IMPL(NHWC)};
// The NCHW64 variant carries extra members: a workspace override plus
// helpers that deduce relayouted (NCHW <-> NCHW64) tensor layouts.
ALGO_LAYOUT_POOLING_IMPL(NCHW64) //{
protected:
    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args)
            const override;

private:
    inline void deduce_reformat_layout(
            std::unique_ptr<RelayoutFormat> & relayout,
            const TensorLayout& src_layout, TensorLayout& dst_layout,
            RelayoutFormat::Param::Mode mode, const int oc, const int group)
            const;
    void get_inner_layout(const TensorLayout& src, const TensorLayout& dst,
                          TensorLayout& inner_src, TensorLayout& inner_dst,
                          Handle* handle,
                          PoolingForwardImpl::Param::Format format) const;
};
#undef ALGO_LAYOUT_POOLING_IMPL
//! Container owning one instance of every forward-pooling algorithm.
class PoolingForwardImpl::AlgoPack : NonCopyableObj {
private:
    AlgoBase::Mapper m_all_algos_map;

public:
    AlgoPack();

    AlgoCUDNN algo_cudnn{DEF_NAME(cudnnForward)};
#if CUDNN_VERSION >= 6000
    AlgoCUDNNMAXDETERMINISTIC algo_cudnn_max_deterministic{
            DEF_NAME(cudnnForwardMaxDeterministic)};
#endif
    AlgoCHWN4 algo_chwn4;
    AlgoNCHW4 algo_nchw4;
    AlgoNCHW32 algo_nchw32;
    AlgoNHWC algo_nhwc;
    AlgoNCHW64 algo_nchw64;

    // pointers into the members above; presumably filled by AlgoPack() —
    // constructor body not visible here
    std::vector<AlgoBase*> all_algos;
    const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; }
};
//! Base class of all CUDA backward-pooling algorithms.
class PoolingBackwardImpl::AlgoBase : public Algorithm {
public:
    enum class AlgoType : uint32_t { CUDA_CUDNN };
    using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;

    AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::CUDA; }
    //! operator + the four layouts: enough for availability/workspace queries
    struct SizeArgs {
        HandleImpl* handle;
        PoolingBackwardImpl* opr;
        const TensorLayout *layout_src, *layout_dst, *layout_diff, *layout_grad;

        std::string to_string() const;
        SizeArgs(PoolingBackwardImpl* opr, const TensorLayout& src,
                 const TensorLayout& dst, const TensorLayout& diff,
                 const TensorLayout& grad);
    };
    //! SizeArgs plus the concrete tensors and workspace used at exec time
    struct ExecArgs : public SizeArgs {
        const TensorND *src_tensor, *dst_tensor, *diff_tensor, *grad_tensor;
        Workspace workspace;

        ExecArgs(PoolingBackwardImpl* opr, _megdnn_tensor_in src,
                 _megdnn_tensor_in dst, _megdnn_tensor_in diff,
                 _megdnn_tensor_out grad, _megdnn_workspace workspace);
    };
    virtual bool is_available(const SizeArgs& args) const = 0;
    size_t get_workspace_in_bytes(const SizeArgs& args) const;
    virtual void exec(const ExecArgs& args) const = 0;

    //! true iff the algo is available AND carries every positive_attr AND
    //! none of negative_attr
    bool is_available_attribute(
            const SizeArgs& args,
            const AlgoAttribute& positive_attr = AlgoAttribute::REPRODUCIBLE,
            const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT) {
        return contain_attribute_all(positive_attr) &&
               !contain_attribute_any(negative_attr) && is_available(args);
    }

protected:
    // non-virtual: instances are never deleted through AlgoBase*
    ~AlgoBase() = default;
    virtual WorkspaceBundle get_workspace_bundle(void* ptr,
                                                 const SizeArgs& args) const;
};
//! cuDNN-backed pooling backward. A single class covers both the
//! deterministic and the non-deterministic cuDNN kernels; the variant is
//! selected by m_is_reproducible and reflected in attribute().
class PoolingBackwardImpl::AlgoCUDNN final : public AlgoBase {
    std::string m_algo_name;
    bool m_is_reproducible;

public:
    AlgoCUDNN(std::string name, bool is_reproducible)
            : m_algo_name(name), m_is_reproducible(is_reproducible) {}
    bool is_available(const SizeArgs& args) const override;
    //! translate the megdnn pooling mode into a cudnnPoolingMode_t
    void init_mode(const ExecArgs& args, cudnnPoolingMode_t& mode) const;
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return m_algo_name.c_str(); }
    AlgoAttribute attribute() const override {
        // REPRODUCIBLE only for the deterministic variant
        auto ret = AlgoAttribute::DEFAULT;
        if (m_is_reproducible) {
            ret |= AlgoAttribute::REPRODUCIBLE;
        }
        return ret;
    }
    MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN)
    std::string param() const override { return m_algo_name; }
};
//! Container owning every backward-pooling algorithm instance.
class PoolingBackwardImpl::AlgoPack : NonCopyableObj {
private:
    AlgoBase::Mapper m_all_algos_map;

public:
    AlgoPack();
    // one entry per cuDNN variant; presumably populated by the AlgoPack
    // constructor (its body is not visible here)
    std::vector<AlgoCUDNN> algo_cudnn;
    std::vector<AlgoBase*> all_algos;
    const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; }
};
| } // namespace cuda | |||||
| } // namespace megdnn | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -6,275 +6,97 @@ | |||||
| * | * | ||||
| * Unless required by applicable law or agreed to in writing, | * Unless required by applicable law or agreed to in writing, | ||||
| * software distributed under the License is distributed on an | * software distributed under the License is distributed on an | ||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | */ | ||||
| #include "src/cuda/pooling/opr_impl.h" | #include "src/cuda/pooling/opr_impl.h" | ||||
| #include "src/cuda/relayout_format/opr_impl.h" | |||||
| #include "./algo.h" | |||||
| #include "./pooling2d_qint.cuh" | #include "./pooling2d_qint.cuh" | ||||
| #include "src/common/algo_chooser.h" | |||||
| #include "src/cuda/relayout_format/opr_impl.h" | |||||
| #include "src/cuda/utils.h" | #include "src/cuda/utils.h" | ||||
| namespace megdnn { | namespace megdnn { | ||||
| namespace cuda { | namespace cuda { | ||||
| namespace { | |||||
| inline void deduce_reformat_layout(std::unique_ptr<RelayoutFormat>& relayout, | |||||
| const TensorLayout& src_layout, | |||||
| TensorLayout& dst_layout, | |||||
| RelayoutFormat::Param::Mode mode, | |||||
| const int oc = 0, const int group = 1) { | |||||
| if (src_layout.ndim > 0) { | |||||
| RelayoutFormat::Param trans_param; | |||||
| trans_param.mode = mode; | |||||
| trans_param.oc = oc; | |||||
| trans_param.group = group; | |||||
| relayout->param() = trans_param; | |||||
| relayout->deduce_layout(src_layout, dst_layout); | |||||
| } else { | |||||
| dst_layout = src_layout; | |||||
| } | |||||
| size_t PoolingForwardImpl::get_workspace_in_bytes(const TensorLayout& src, | |||||
| const TensorLayout& dst) { | |||||
| AlgoBase::SizeArgs args(this, src, dst); | |||||
| return get_algorithm(this, src, dst)->get_workspace_in_bytes(args); | |||||
| } | } | ||||
| void get_inner_layout(const TensorLayout& src, const TensorLayout& dst, | |||||
| TensorLayout& inner_src, TensorLayout& inner_dst, | |||||
| Handle* handle, | |||||
| PoolingForwardImpl::Param::Format format) { | |||||
| bool is_nchw = format == PoolingForwardImpl::Param::Format::NCHW; | |||||
| if (is_nchw) { | |||||
| auto relayout_opr = handle->create_operator<RelayoutFormat>(); | |||||
| deduce_reformat_layout(relayout_opr, src, inner_src, | |||||
| RelayoutFormat::Param::Mode::NCHW_NCHW64, 0, 1); | |||||
| deduce_reformat_layout(relayout_opr, dst, inner_dst, | |||||
| RelayoutFormat::Param::Mode::NCHW_NCHW64, 0, 1); | |||||
| } else { | |||||
| megdnn_assert(0, "not support"); | |||||
| } | |||||
| const char* PoolingForwardImpl::get_algorithm_set_name() const { | |||||
| return "CUDA_POOLING_FORWARD"; | |||||
| } | } | ||||
| } // namespace | |||||
| void PoolingForwardImpl::setup_descs(const TensorLayout& src, | |||||
| const TensorLayout& dst) { | |||||
| src_desc.set(src, param().format); | |||||
| dst_desc.set(dst, param().format); | |||||
| pooling_desc.set(this->param()); | |||||
| std::vector<PoolingForwardImpl::Algorithm*> | |||||
| PoolingForwardImpl::get_all_algorithms(const TensorLayout& src, | |||||
| const TensorLayout& dst) { | |||||
| return megdnn::get_all_algorithms<PoolingForwardImpl>({this, src, dst}); | |||||
| } | } | ||||
| WorkspaceBundle PoolingForwardImpl::get_workspace_bundle( | |||||
| void* ptr, const TensorLayout& src, const TensorLayout& dst) const { | |||||
| SmallVector<size_t> sizes; | |||||
| TensorLayout fsrc = src; | |||||
| TensorLayout fdst = dst; | |||||
| bool is_nchw = param().format == Param::Format::NCHW; | |||||
| if ((src.dtype.enumv() == DTypeEnum::QuantizedS4 || | |||||
| src.dtype.enumv() == DTypeEnum::Quantized4Asymm) && | |||||
| (dst.dtype.enumv() == DTypeEnum::QuantizedS4 || | |||||
| dst.dtype.enumv() == DTypeEnum::Quantized4Asymm) && | |||||
| is_nchw) { | |||||
| get_inner_layout(src, dst, fsrc, fdst, handle(), param().format); | |||||
| sizes.push_back(fsrc.span().dist_byte()); | |||||
| sizes.push_back(fdst.span().dist_byte()); | |||||
| } else { | |||||
| auto get_workspace = [&sizes](TensorLayout& layout) { | |||||
| if (layout.dtype == dtype::BFloat16()) { | |||||
| layout.dtype = dtype::Float32(); | |||||
| sizes.push_back(layout.span().dist_byte()); | |||||
| } | |||||
| }; | |||||
| get_workspace(fsrc); | |||||
| get_workspace(fdst); | |||||
| PoolingForwardImpl::Algorithm* PoolingForwardImpl::get_algorithm_heuristic( | |||||
| const TensorLayout& src, const TensorLayout& dst, | |||||
| size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr, | |||||
| const AlgoAttribute& negative_attr) { | |||||
| MEGDNN_MARK_USED_VAR(workspace_limit_in_bytes); | |||||
| AlgoBase::SizeArgs args(this, src, dst); | |||||
| for (auto&& iter : sm_algo_pack.all_algos) { | |||||
| if (iter->is_available_attribute(args, positive_attr, negative_attr)) { | |||||
| return iter; | |||||
| } | |||||
| } | } | ||||
| return {ptr, std::move(sizes)}; | |||||
| megdnn_throw( | |||||
| ssprintf("require algorithm with attribute(%s) and without " | |||||
| "attribute(%s), but can't get suitable algo.\n", | |||||
| Algorithm::attribute_str(positive_attr).c_str(), | |||||
| Algorithm::attribute_str(negative_attr).c_str())); | |||||
| return nullptr; | |||||
| } | } | ||||
| void PoolingForwardImpl::exec(_megdnn_tensor_in ssrc, _megdnn_tensor_out sdst, | void PoolingForwardImpl::exec(_megdnn_tensor_in ssrc, _megdnn_tensor_out sdst, | ||||
| _megdnn_workspace sworkspace) { | _megdnn_workspace sworkspace) { | ||||
| check_exec(ssrc.layout, sdst.layout, sworkspace.size); | check_exec(ssrc.layout, sdst.layout, sworkspace.size); | ||||
| TensorND src = ssrc; | |||||
| TensorND dst = sdst; | |||||
| Param::Format inner_format = param().format; | |||||
| auto wsb = | |||||
| get_workspace_bundle(sworkspace.raw_ptr, ssrc.layout, sdst.layout); | |||||
| auto ctypecvt = CompTypeCvter<dtype::BFloat16, dtype::Float32>( | |||||
| concrete_handle(this->handle()), &wsb); | |||||
| bool is_nchw = param().format == Param::Format::NCHW; | |||||
| if (ssrc.layout.dtype.enumv() == DTypeTrait<dtype::BFloat16>::enumv) { | |||||
| ctypecvt.src_to_comp_type(ssrc, src).src_to_comp_type(sdst, dst); | |||||
| } else if ((ssrc.layout.dtype.enumv() == DTypeEnum::QuantizedS4 || | |||||
| ssrc.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) && | |||||
| (sdst.layout.dtype.enumv() == DTypeEnum::QuantizedS4 || | |||||
| sdst.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) && | |||||
| is_nchw) { | |||||
| auto handle_ptr = handle(); | |||||
| get_inner_layout(ssrc.layout, sdst.layout, src.layout, dst.layout, | |||||
| handle_ptr, param().format); | |||||
| src.raw_ptr = wsb.get(0); | |||||
| dst.raw_ptr = wsb.get(1); | |||||
| auto relayout_opr = handle_ptr->create_operator<RelayoutFormat>(); | |||||
| RelayoutFormat::Param trans_param; | |||||
| trans_param.mode = RelayoutFormat::Param::Mode::NCHW_NCHW64; | |||||
| relayout_opr->param() = trans_param; | |||||
| relayout_opr->exec(ssrc, src, {}); | |||||
| inner_format = Param::Format::NCHW64; | |||||
| } | |||||
| { | { | ||||
| using Format = param::Pooling::Format; | |||||
| if (param().format == Format::CHWN4) { | |||||
| pooling2d::Param kern_param; | |||||
| size_t c = src.layout[0], hi = src.layout[1], wi = src.layout[2], | |||||
| n = src.layout[3], ho = dst.layout[1], wo = dst.layout[2]; | |||||
| c = c * 4; | |||||
| size_t ph = param().pad_h, pw = param().pad_w; | |||||
| size_t window_h = param().window_h, window_w = param().window_w; | |||||
| size_t sh = param().stride_h, sw = param().stride_w; | |||||
| kern_param.n = n, kern_param.c = c, kern_param.hi = hi, | |||||
| kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo, | |||||
| kern_param.ph = ph, kern_param.pw = pw, | |||||
| kern_param.window_h = window_h, kern_param.window_w = window_w, | |||||
| kern_param.sh = sh, kern_param.sw = sw; | |||||
| auto&& stream = cuda_stream(handle()); | |||||
| return pooling2d::do_pooling2d_int8_cdiv4hwn4( | |||||
| src.compatible_ptr<int8_t>(), dst.compatible_ptr<int8_t>(), | |||||
| kern_param, stream, static_cast<uint32_t>(param().mode)); | |||||
| } else if (param().format == Format::NCHW4) { | |||||
| pooling2d::Param kern_param; | |||||
| size_t n = src.layout[0], hi = src.layout[2], wi = src.layout[3], | |||||
| c = src.layout[1], ho = dst.layout[2], wo = dst.layout[3]; | |||||
| c = c * 4; | |||||
| size_t ph = param().pad_h, pw = param().pad_w; | |||||
| size_t window_h = param().window_h, window_w = param().window_w; | |||||
| size_t sh = param().stride_h, sw = param().stride_w; | |||||
| kern_param.n = n, kern_param.c = c, kern_param.hi = hi, | |||||
| kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo, | |||||
| kern_param.ph = ph, kern_param.pw = pw, | |||||
| kern_param.window_h = window_h, kern_param.window_w = window_w, | |||||
| kern_param.sh = sh, kern_param.sw = sw; | |||||
| auto&& stream = cuda_stream(handle()); | |||||
| return pooling2d::do_pooling2d_int8_ncdiv4hw4( | |||||
| src.compatible_ptr<int8_t>(), dst.compatible_ptr<int8_t>(), | |||||
| kern_param, stream, static_cast<uint32_t>(param().mode)); | |||||
| } else if (param().format == Format::NCHW32) { | |||||
| pooling2d::Param kern_param; | |||||
| size_t n = src.layout[0], hi = src.layout[2], wi = src.layout[3], | |||||
| c = src.layout[1], ho = dst.layout[2], wo = dst.layout[3]; | |||||
| c = c * 32; | |||||
| size_t ph = param().pad_h, pw = param().pad_w; | |||||
| size_t window_h = param().window_h, window_w = param().window_w; | |||||
| size_t sh = param().stride_h, sw = param().stride_w; | |||||
| kern_param.n = n, kern_param.c = c, kern_param.hi = hi, | |||||
| kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo, | |||||
| kern_param.ph = ph, kern_param.pw = pw, | |||||
| kern_param.window_h = window_h, kern_param.window_w = window_w, | |||||
| kern_param.sh = sh, kern_param.sw = sw; | |||||
| auto&& stream = cuda_stream(handle()); | |||||
| return pooling2d::do_pooling2d_int8_ncdiv32hw32( | |||||
| src.compatible_ptr<int8_t>(), dst.compatible_ptr<int8_t>(), | |||||
| kern_param, stream, static_cast<uint32_t>(param().mode)); | |||||
| } else if (param().format == Format::NCHW64 || | |||||
| inner_format == Format::NCHW64) { | |||||
| pooling2d::Param kern_param; | |||||
| size_t n = src.layout[0], hi = src.layout[2], wi = src.layout[3], | |||||
| c = src.layout[1], ho = dst.layout[2], wo = dst.layout[3]; | |||||
| c = c * 64; | |||||
| size_t ph = param().pad_h, pw = param().pad_w; | |||||
| size_t window_h = param().window_h, window_w = param().window_w; | |||||
| size_t sh = param().stride_h, sw = param().stride_w; | |||||
| kern_param.n = n, kern_param.c = c, kern_param.hi = hi, | |||||
| kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo, | |||||
| kern_param.ph = ph, kern_param.pw = pw, | |||||
| kern_param.window_h = window_h, kern_param.window_w = window_w, | |||||
| kern_param.sh = sh, kern_param.sw = sw; | |||||
| bool uint_case = false; | |||||
| int zero_point = 0; | |||||
| if (src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) { | |||||
| uint_case = true; | |||||
| zero_point = src.layout.dtype.param<dtype::Quantized4Asymm>() | |||||
| .zero_point; | |||||
| } | |||||
| auto&& stream = cuda_stream(handle()); | |||||
| pooling2d::do_pooling2d_int4_ncdiv64hw64( | |||||
| (int8_t*)src.raw_ptr, (int8_t*)dst.raw_ptr, kern_param, | |||||
| stream, static_cast<uint32_t>(param().mode), uint_case, | |||||
| zero_point); | |||||
| if (sdst.layout.ndim == 4) { | |||||
| auto relayout_opr = handle()->create_operator<RelayoutFormat>(); | |||||
| RelayoutFormat::Param trans_param; | |||||
| trans_param.mode = RelayoutFormat::Param::Mode::NCHW64_NCHW; | |||||
| relayout_opr->param() = trans_param; | |||||
| relayout_opr->exec(dst, sdst, {}); | |||||
| } | |||||
| return; | |||||
| } else if (param().format == Format::NHWC && | |||||
| (src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm || | |||||
| src.layout.dtype.enumv() == DTypeEnum::QuantizedS4)) { | |||||
| megdnn_assert(src.layout.dtype.enumv() == dst.layout.dtype.enumv(), | |||||
| "src and dst dtype must equal"); | |||||
| pooling2d::Param kern_param; | |||||
| size_t n = src.layout[0], hi = src.layout[1], wi = src.layout[2], | |||||
| c = src.layout[3], ho = dst.layout[1], wo = dst.layout[2]; | |||||
| size_t ph = param().pad_h, pw = param().pad_w; | |||||
| size_t window_h = param().window_h, window_w = param().window_w; | |||||
| size_t sh = param().stride_h, sw = param().stride_w; | |||||
| kern_param.n = n, kern_param.c = c, kern_param.hi = hi, | |||||
| kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo, | |||||
| kern_param.ph = ph, kern_param.pw = pw, | |||||
| kern_param.window_h = window_h, kern_param.window_w = window_w, | |||||
| kern_param.sh = sh, kern_param.sw = sw; | |||||
| bool uint_case = false; | |||||
| int zero_point = 0; | |||||
| if (src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) { | |||||
| uint_case = true; | |||||
| zero_point = src.layout.dtype.param<dtype::Quantized4Asymm>() | |||||
| .zero_point; | |||||
| } | |||||
| auto&& stream = cuda_stream(handle()); | |||||
| pooling2d::do_pooling2d_int4_nhwc( | |||||
| (int8_t*)src.raw_ptr, (int8_t*)dst.raw_ptr, kern_param, | |||||
| stream, static_cast<uint32_t>(param().mode), uint_case, | |||||
| zero_point); | |||||
| return; | |||||
| } | |||||
| auto handle = cudnn_handle(this->handle()); | |||||
| setup_descs(src.layout, dst.layout); | |||||
| dt_float32 alpha = 1.0f, beta = 0.0f; | |||||
| cudnn_check(cudnnPoolingForward(handle, pooling_desc.desc, &alpha, | |||||
| src_desc.desc, src.raw_ptr, &beta, | |||||
| dst_desc.desc, dst.raw_ptr)); | |||||
| AlgoBase::ExecArgs args(this, ssrc, sdst, sworkspace); | |||||
| auto algo = get_algorithm(this, ssrc.layout, sdst.layout); | |||||
| algo->exec(args); | |||||
| } | } | ||||
| if (ssrc.layout.dtype.enumv() == DTypeTrait<dtype::BFloat16>::enumv) { | |||||
| ctypecvt.comp_to_dst_type(dst, sdst); | |||||
| } | |||||
| } | } | ||||
| void PoolingBackwardImpl::setup_descs(const TensorLayout& src, | |||||
| const TensorLayout& dst, | |||||
| const TensorLayout& diff, | |||||
| const TensorLayout& grad) { | |||||
| src_desc.set(src); | |||||
| dst_desc.set(dst); | |||||
| diff_desc.set(diff); | |||||
| grad_desc.set(grad); | |||||
| pooling_desc.set(this->param()); | |||||
| const char* PoolingBackwardImpl::get_algorithm_set_name() const { | |||||
| return "CUDA_POOLING_BACKWARD"; | |||||
| } | } | ||||
| WorkspaceBundle PoolingBackwardImpl::get_workspace_bundle( | |||||
| void* ptr, const TensorLayout& src, const TensorLayout& dst, | |||||
| const TensorLayout& diff, const TensorLayout& grad) const { | |||||
| SmallVector<size_t> sizes; | |||||
| TensorLayout fsrc = src; | |||||
| TensorLayout fdst = dst; | |||||
| TensorLayout fdiff = diff; | |||||
| TensorLayout fgrad = grad; | |||||
| auto get_workspace = [&sizes](TensorLayout& layout) { | |||||
| if (layout.dtype == dtype::BFloat16()) { | |||||
| layout.dtype = dtype::Float32(); | |||||
| sizes.push_back(layout.span().dist_byte()); | |||||
| std::vector<PoolingBackwardImpl::Algorithm*> | |||||
| PoolingBackwardImpl::get_all_algorithms(const TensorLayout& src, | |||||
| const TensorLayout& dst, | |||||
| const TensorLayout& diff, | |||||
| const TensorLayout& grad) { | |||||
| return megdnn::get_all_algorithms<PoolingBackwardImpl>( | |||||
| {this, src, dst, diff, grad}); | |||||
| } | |||||
| PoolingBackwardImpl::Algorithm* PoolingBackwardImpl::get_algorithm_heuristic( | |||||
| const TensorLayout& src, const TensorLayout& dst, | |||||
| const TensorLayout& diff, const TensorLayout& grad, | |||||
| size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr, | |||||
| const AlgoAttribute& negative_attr) { | |||||
| MEGDNN_MARK_USED_VAR(workspace_limit_in_bytes); | |||||
| AlgoBase::SizeArgs args(this, src, dst, diff, grad); | |||||
| for (auto iter : sm_algo_pack.all_algos) { | |||||
| if (iter->is_available_attribute(args, positive_attr, negative_attr)) { | |||||
| return iter; | |||||
| } | } | ||||
| }; | |||||
| get_workspace(fsrc); | |||||
| get_workspace(fdst); | |||||
| get_workspace(fdiff); | |||||
| get_workspace(fgrad); | |||||
| return {ptr, std::move(sizes)}; | |||||
| } | |||||
| megdnn_throw( | |||||
| ssprintf("require algorithm with attribute(%s) and without " | |||||
| "attribute(%s), but can't get suitable algo.\n", | |||||
| Algorithm::attribute_str(positive_attr).c_str(), | |||||
| Algorithm::attribute_str(negative_attr).c_str())); | |||||
| return nullptr; | |||||
| } | } | ||||
| void PoolingBackwardImpl::exec(_megdnn_tensor_in ssrc, _megdnn_tensor_in sdst, | void PoolingBackwardImpl::exec(_megdnn_tensor_in ssrc, _megdnn_tensor_in sdst, | ||||
| @@ -283,34 +105,23 @@ void PoolingBackwardImpl::exec(_megdnn_tensor_in ssrc, _megdnn_tensor_in sdst, | |||||
| _megdnn_workspace sworkspace) { | _megdnn_workspace sworkspace) { | ||||
| check_exec(ssrc.layout, sdst.layout, sdiff.layout, sgrad.layout, | check_exec(ssrc.layout, sdst.layout, sdiff.layout, sgrad.layout, | ||||
| sworkspace.size); | sworkspace.size); | ||||
| auto handle = cudnn_handle(this->handle()); | |||||
| TensorND src = ssrc; | |||||
| TensorND dst = sdst; | |||||
| TensorND diff = sdiff; | |||||
| TensorND grad = sgrad; | |||||
| auto wsb = get_workspace_bundle(sworkspace.raw_ptr, ssrc.layout, | |||||
| sdst.layout, sdiff.layout, sgrad.layout); | |||||
| auto ctypecvt = CompTypeCvter<dtype::BFloat16, dtype::Float32>( | |||||
| concrete_handle(this->handle()), &wsb); | |||||
| if (ssrc.layout.dtype.enumv() == DTypeTrait<dtype::BFloat16>::enumv) { | |||||
| ctypecvt.src_to_comp_type(ssrc, src) | |||||
| .src_to_comp_type(sdst, dst) | |||||
| .src_to_comp_type(sdiff, diff) | |||||
| .src_to_comp_type(sgrad, grad); | |||||
| } | |||||
| { | { | ||||
| setup_descs(src.layout, dst.layout, diff.layout, grad.layout); | |||||
| float alpha = 1.0f, beta = 0.0f; | |||||
| cudnn_check(cudnnPoolingBackward( | |||||
| handle, pooling_desc.desc, &alpha, dst_desc.desc, dst.raw_ptr, | |||||
| diff_desc.desc, diff.raw_ptr, src_desc.desc, src.raw_ptr, &beta, | |||||
| grad_desc.desc, grad.raw_ptr)); | |||||
| } | |||||
| if (ssrc.layout.dtype.enumv() == DTypeTrait<dtype::BFloat16>::enumv) { | |||||
| ctypecvt.comp_to_dst_type(grad, sgrad); | |||||
| AlgoBase::ExecArgs args(this, ssrc, sdst, sdiff, sgrad, sworkspace); | |||||
| auto algo = get_algorithm(this, ssrc.layout, sdst.layout, sdiff.layout, | |||||
| sgrad.layout); | |||||
| algo->exec(args); | |||||
| } | } | ||||
| } | } | ||||
| size_t PoolingBackwardImpl::get_workspace_in_bytes(const TensorLayout& src, | |||||
| const TensorLayout& dst, | |||||
| const TensorLayout& diff, | |||||
| const TensorLayout& grad) { | |||||
| AlgoBase::SizeArgs args(this, src, dst, diff, grad); | |||||
| return get_algorithm(this, src, dst, diff, grad) | |||||
| ->get_workspace_in_bytes(args); | |||||
| } | |||||
| } // namespace cuda | } // namespace cuda | ||||
| } // namespace megdnn | } // namespace megdnn | ||||
| @@ -23,16 +23,45 @@ public: | |||||
| void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, | void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, | ||||
| _megdnn_workspace workspace) override; | _megdnn_workspace workspace) override; | ||||
| size_t get_workspace_in_bytes(const TensorLayout& src, | size_t get_workspace_in_bytes(const TensorLayout& src, | ||||
| const TensorLayout& dst) override { | |||||
| return get_workspace_bundle(nullptr, src, dst).total_size_in_bytes(); | |||||
| const TensorLayout& dst) override; | |||||
| const char* get_algorithm_set_name() const override; | |||||
| Algorithm* get_algorithm_from_desc(const AlgorithmDesc& desc) override; | |||||
| AlgorithmInfo get_algorithm_info_heuristic( | |||||
| const TensorLayout& src, const TensorLayout& dst, | |||||
| size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr, | |||||
| const AlgoAttribute& negative_attr) { | |||||
| return get_algorithm_heuristic(src, dst, workspace_limit_in_bytes, | |||||
| positive_attr, negative_attr) | |||||
| ->info(); | |||||
| } | } | ||||
| class AlgoBase; | |||||
| class AlgoCUDNN; | |||||
| #if CUDNN_VERSION >= 6000 | |||||
| class AlgoCUDNNMAXDETERMINISTIC; | |||||
| #endif | |||||
| class AlgoCHWN4; | |||||
| class AlgoNCHW4; | |||||
| class AlgoNCHW32; | |||||
| class AlgoNHWC; | |||||
| class AlgoNCHW64; | |||||
| class AlgoPack; | |||||
| static const AlgoPack& algo_pack() { return sm_algo_pack; } | |||||
| protected: | |||||
| std::vector<Algorithm*> get_all_algorithms( | |||||
| const TensorLayout& src, const TensorLayout& dst) override; | |||||
| Algorithm* get_algorithm_heuristic( | |||||
| const TensorLayout& src, const TensorLayout& dst, | |||||
| size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr, | |||||
| const AlgoAttribute& negative_attr) override; | |||||
| private: | private: | ||||
| TensorDesc src_desc, dst_desc; | |||||
| PoolingDesc pooling_desc; | |||||
| void setup_descs(const TensorLayout& src, const TensorLayout& dst); | |||||
| WorkspaceBundle get_workspace_bundle(void* ptr, const TensorLayout& src, | |||||
| const TensorLayout& dst) const; | |||||
| static AlgoPack sm_algo_pack; | |||||
| }; | }; | ||||
| class PoolingBackwardImpl final : public PoolingBackward { | class PoolingBackwardImpl final : public PoolingBackward { | ||||
| @@ -44,23 +73,43 @@ public: | |||||
| size_t get_workspace_in_bytes(const TensorLayout& src, | size_t get_workspace_in_bytes(const TensorLayout& src, | ||||
| const TensorLayout& dst, | const TensorLayout& dst, | ||||
| const TensorLayout& diff, | const TensorLayout& diff, | ||||
| const TensorLayout& grad) override { | |||||
| return get_workspace_bundle(nullptr, src, dst, diff, grad) | |||||
| .total_size_in_bytes(); | |||||
| const TensorLayout& grad) override; | |||||
| const char* get_algorithm_set_name() const override; | |||||
| Algorithm* get_algorithm_from_desc(const AlgorithmDesc& desc) override; | |||||
| AlgorithmInfo get_algorithm_info_heuristic( | |||||
| const TensorLayout& src, const TensorLayout& dst, | |||||
| const TensorLayout& diff, const TensorLayout& grad, | |||||
| size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr, | |||||
| const AlgoAttribute& negative_attr) { | |||||
| return get_algorithm_heuristic(src, dst, diff, grad, | |||||
| workspace_limit_in_bytes, positive_attr, | |||||
| negative_attr) | |||||
| ->info(); | |||||
| } | } | ||||
| class AlgoBase; | |||||
| class AlgoCUDNN; | |||||
| class AlgoPack; | |||||
| static const AlgoPack& algo_pack() { return sm_algo_pack; } | |||||
| protected: | |||||
| std::vector<Algorithm*> get_all_algorithms( | |||||
| const TensorLayout& src, const TensorLayout& dst, | |||||
| const TensorLayout& diff, const TensorLayout& grad) override; | |||||
| Algorithm* get_algorithm_heuristic( | |||||
| const TensorLayout& src, const TensorLayout& dst, | |||||
| const TensorLayout& diff, const TensorLayout& grad, | |||||
| size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr, | |||||
| const AlgoAttribute& negative_attr) override; | |||||
| private: | private: | ||||
| TensorDesc src_desc, dst_desc, diff_desc, grad_desc; | |||||
| PoolingDesc pooling_desc; | |||||
| void setup_descs(const TensorLayout& src, const TensorLayout& dst, | |||||
| const TensorLayout& diff, const TensorLayout& grad); | |||||
| WorkspaceBundle get_workspace_bundle(void* ptr, const TensorLayout& src, | |||||
| const TensorLayout& dst, | |||||
| const TensorLayout& diff, | |||||
| const TensorLayout& grad) const; | |||||
| static AlgoPack sm_algo_pack; | |||||
| }; | }; | ||||
| } // namespace cuda | |||||
| } // namespace megdnn | |||||
| } // namespace cuda | |||||
| } // namespace megdnn | |||||
| // vim: syntax=cpp.doxygen | // vim: syntax=cpp.doxygen | ||||
| @@ -57,6 +57,24 @@ class DefaultBatchConvBiasForwardAlgorithm final | |||||
| const char* name() const override { return "DEFAULT"; } | const char* name() const override { return "DEFAULT"; } | ||||
| }; | }; | ||||
| class DefaultPoolingForwardAlgorithm final | |||||
| : public megdnn::PoolingForward::Algorithm { | |||||
| AlgoAttribute attribute() const override { | |||||
| return AlgoAttribute::REPRODUCIBLE | AlgoAttribute::NAIVE; | |||||
| } | |||||
| uint32_t type() const override { return 0; } | |||||
| const char* name() const override { return "DEFAULT"; } | |||||
| }; | |||||
| class DefaultPoolingBackwardAlgorithm final | |||||
| : public megdnn::PoolingBackward::Algorithm { | |||||
| AlgoAttribute attribute() const override { | |||||
| return AlgoAttribute::REPRODUCIBLE | AlgoAttribute::NAIVE; | |||||
| } | |||||
| uint32_t type() const override { return 0; } | |||||
| const char* name() const override { return "DEFAULT"; } | |||||
| }; | |||||
| } // namespace naive | } // namespace naive | ||||
| } // namespace megdnn | } // namespace megdnn | ||||
| @@ -110,6 +110,9 @@ DefaultLocalShareBackwardFilterAlgorithm | |||||
| DefaultMatrixMulAlgorithm HandleImpl::m_default_matmul_fwd_algo; | DefaultMatrixMulAlgorithm HandleImpl::m_default_matmul_fwd_algo; | ||||
| DefaultBatchedMatrixMulAlgorithm HandleImpl::m_default_batched_matmul_fwd_algo; | DefaultBatchedMatrixMulAlgorithm HandleImpl::m_default_batched_matmul_fwd_algo; | ||||
| DefaultPoolingForwardAlgorithm HandleImpl::m_default_pooling_fwd_algo; | |||||
| DefaultPoolingBackwardAlgorithm HandleImpl::m_default_pooling_bwd_algo; | |||||
| HandleImpl::HandleImpl(megcoreComputingHandle_t computing_handle, | HandleImpl::HandleImpl(megcoreComputingHandle_t computing_handle, | ||||
| HandleType type) | HandleType type) | ||||
| : HandleImplHelper(computing_handle, type), | : HandleImplHelper(computing_handle, type), | ||||
| @@ -51,6 +51,9 @@ class HandleImpl : public HandleImplHelper { | |||||
| static DefaultMatrixMulAlgorithm m_default_matmul_fwd_algo; | static DefaultMatrixMulAlgorithm m_default_matmul_fwd_algo; | ||||
| static DefaultBatchedMatrixMulAlgorithm m_default_batched_matmul_fwd_algo; | static DefaultBatchedMatrixMulAlgorithm m_default_batched_matmul_fwd_algo; | ||||
| static DefaultPoolingForwardAlgorithm m_default_pooling_fwd_algo; | |||||
| static DefaultPoolingBackwardAlgorithm m_default_pooling_bwd_algo; | |||||
| //! move KernFunc to alloc_kern()->func, destruct func, and call dispatch | //! move KernFunc to alloc_kern()->func, destruct func, and call dispatch | ||||
| template <typename T> | template <typename T> | ||||
| void move_kern_func_to_new_kern_and_dispatch(T& func) { | void move_kern_func_to_new_kern_and_dispatch(T& func) { | ||||
| @@ -122,6 +125,14 @@ public: | |||||
| return &m_default_batched_matmul_fwd_algo; | return &m_default_batched_matmul_fwd_algo; | ||||
| } | } | ||||
| PoolingForward::Algorithm* default_pooling_fwd_algo() { | |||||
| return &m_default_pooling_fwd_algo; | |||||
| } | |||||
| PoolingBackward::Algorithm* default_pooling_bwd_algo() { | |||||
| return &m_default_pooling_bwd_algo; | |||||
| } | |||||
| Relayout* relayout_opr() override { | Relayout* relayout_opr() override { | ||||
| return get_helper_opr<Relayout, 2>(this); | return get_helper_opr<Relayout, 2>(this); | ||||
| } | } | ||||
| @@ -582,6 +582,52 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, | |||||
| megdnn_assert_internal(0); | megdnn_assert_internal(0); | ||||
| } | } | ||||
| PoolingForward::Algorithm* PoolingForwardImpl::get_algorithm_from_desc( | |||||
| const AlgorithmDesc& desc) { | |||||
| Algorithm* ret = | |||||
| static_cast<HandleImpl*>(handle())->default_pooling_fwd_algo(); | |||||
| megdnn_assert(desc == ret->info().desc); | |||||
| return ret; | |||||
| } | |||||
| std::vector<Algorithm*> PoolingForwardImpl::get_all_algorithms( | |||||
| const TensorLayout&, const TensorLayout&) { | |||||
| return {static_cast<HandleImpl*>(handle())->default_pooling_fwd_algo()}; | |||||
| } | |||||
| Algorithm* PoolingForwardImpl::get_algorithm_heuristic( | |||||
| const TensorLayout& /*src*/, const TensorLayout& /*dst*/, | |||||
| size_t /*workspace_limit_in_bytes*/, const AlgoAttribute& positive_attr, | |||||
| const AlgoAttribute& negative_attr) { | |||||
| auto algo = static_cast<HandleImpl*>(handle())->default_pooling_fwd_algo(); | |||||
| algo->check_attribute(positive_attr, negative_attr); | |||||
| return algo; | |||||
| } | |||||
| Algorithm* PoolingBackwardImpl::get_algorithm_from_desc( | |||||
| const AlgorithmDesc& desc) { | |||||
| Algorithm* ret = | |||||
| static_cast<HandleImpl*>(handle())->default_pooling_bwd_algo(); | |||||
| megdnn_assert(desc == ret->info().desc); | |||||
| return ret; | |||||
| } | |||||
| std::vector<Algorithm*> PoolingBackwardImpl::get_all_algorithms( | |||||
| const TensorLayout& /*src*/, const TensorLayout& /*dst*/, | |||||
| const TensorLayout& /*diff*/, const TensorLayout& /*grad*/) { | |||||
| return {static_cast<HandleImpl*>(handle())->default_pooling_bwd_algo()}; | |||||
| } | |||||
| Algorithm* PoolingBackwardImpl::get_algorithm_heuristic( | |||||
| const TensorLayout& /*src*/, const TensorLayout& /*dst*/, | |||||
| const TensorLayout& /*diff*/, const TensorLayout& /*grad*/, | |||||
| size_t /*workspace_limit_in_bytes*/, const AlgoAttribute& positive_attr, | |||||
| const AlgoAttribute& negative_attr) { | |||||
| auto algo = static_cast<HandleImpl*>(handle())->default_pooling_bwd_algo(); | |||||
| algo->check_attribute(positive_attr, negative_attr); | |||||
| return algo; | |||||
| } | |||||
| WorkspaceBundle PoolingBackwardImpl::get_workspace_bundle( | WorkspaceBundle PoolingBackwardImpl::get_workspace_bundle( | ||||
| void* ptr, const TensorLayout& src, const TensorLayout& dst, | void* ptr, const TensorLayout& src, const TensorLayout& dst, | ||||
| const TensorLayout& diff, const TensorLayout& grad) const { | const TensorLayout& diff, const TensorLayout& grad) const { | ||||
| @@ -26,6 +26,21 @@ class PoolingForwardImpl: public PoolingForward { | |||||
| private: | private: | ||||
| WorkspaceBundle get_workspace_bundle(void* ptr, const TensorLayout&, | WorkspaceBundle get_workspace_bundle(void* ptr, const TensorLayout&, | ||||
| const TensorLayout&) const; | const TensorLayout&) const; | ||||
| const char* get_algorithm_set_name() const override { | |||||
| return "DEFALUT"; | |||||
| } | |||||
| Algorithm* get_algorithm_from_desc(const AlgorithmDesc&) override; | |||||
| std::vector<Algorithm*> get_all_algorithms( | |||||
| const TensorLayout& src, const TensorLayout& dst) override; | |||||
| Algorithm* get_algorithm_heuristic( | |||||
| const TensorLayout& src, const TensorLayout& dst, | |||||
| size_t workspace_limit_in_bytes, | |||||
| const AlgoAttribute& positive_attr, | |||||
| const AlgoAttribute& negative_attr) override; | |||||
| }; | }; | ||||
| class PoolingBackwardImpl : public PoolingBackward { | class PoolingBackwardImpl : public PoolingBackward { | ||||
| @@ -38,6 +53,20 @@ public: | |||||
| const TensorLayout&, | const TensorLayout&, | ||||
| const TensorLayout&) override; | const TensorLayout&) override; | ||||
| const char* get_algorithm_set_name() const override { return "DEFALUT"; } | |||||
| Algorithm* get_algorithm_from_desc(const AlgorithmDesc&) override; | |||||
| std::vector<Algorithm*> get_all_algorithms( | |||||
| const TensorLayout& src, const TensorLayout& dst, | |||||
| const TensorLayout& diff, const TensorLayout& grad) override; | |||||
| Algorithm* get_algorithm_heuristic( | |||||
| const TensorLayout& src, const TensorLayout& dst, | |||||
| const TensorLayout& diff, const TensorLayout& grad, | |||||
| size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr, | |||||
| const AlgoAttribute& negative_attr) override; | |||||
| private: | private: | ||||
| WorkspaceBundle get_workspace_bundle(void* ptr, const TensorLayout&, | WorkspaceBundle get_workspace_bundle(void* ptr, const TensorLayout&, | ||||
| const TensorLayout&, | const TensorLayout&, | ||||
| @@ -46,6 +46,12 @@ struct AlgoProxy; | |||||
| } \ | } \ | ||||
| } | } | ||||
| #define LAYOUTS layouts[0], layouts[1] | |||||
| #define TENSORS tensors[0], tensors[1] | |||||
| DEF_ALGO_PROXY(2); | |||||
| #undef LAYOUTS | |||||
| #undef TENSORS | |||||
| #define LAYOUTS layouts[0], layouts[1], layouts[2] | #define LAYOUTS layouts[0], layouts[1], layouts[2] | ||||
| #define TENSORS tensors[0], tensors[1], tensors[2] | #define TENSORS tensors[0], tensors[1], tensors[2] | ||||
| DEF_ALGO_PROXY(3); | DEF_ALGO_PROXY(3); | ||||
| @@ -21,6 +21,13 @@ | |||||
| #include <cudnn.h> | #include <cudnn.h> | ||||
| #include "test/cuda/benchmark.h" | #include "test/cuda/benchmark.h" | ||||
| namespace { | |||||
| #define V1(v) #v | |||||
| #define V(v) V1(v) | |||||
| #define DEF_NAME(NAME) \ | |||||
| #NAME "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL) | |||||
| } // namespace | |||||
| namespace megdnn { | namespace megdnn { | ||||
| namespace test { | namespace test { | ||||
| @@ -263,19 +270,24 @@ TEST_F(CUDA, POOLING_FORWARD_NCHW_Q4) { | |||||
| checker.set_param(param).exec({{20, 24, 22, 33}, {}}); | checker.set_param(param).exec({{20, 24, 22, 33}, {}}); | ||||
| } | } | ||||
| TEST_F(CUDA, POOLING_FORWARD_NCHW4) { | |||||
| TEST_F(CUDA, POOLING_FORWARD_NCHW4_NCHW32) { | |||||
| require_compute_capability(7, 5); | require_compute_capability(7, 5); | ||||
| using Param = param::Pooling; | using Param = param::Pooling; | ||||
| Checker<Pooling> checker(handle_cuda()); | Checker<Pooling> checker(handle_cuda()); | ||||
| Param param; | Param param; | ||||
| checker.set_dtype(0, dtype::QuantizedS8(0.1f)); | checker.set_dtype(0, dtype::QuantizedS8(0.1f)); | ||||
| param.format = Param::Format::NCHW4; | |||||
| checker.set_epsilon(1 + 1e-3); | checker.set_epsilon(1 + 1e-3); | ||||
| checker.set_param(param).exec({{20, 3, 50, 50, 4}, {}}); | |||||
| param.mode = Param::Mode::AVERAGE; | |||||
| checker.set_param(param).exec({{20, 3, 50, 50, 4}, {}}); | |||||
| param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING; | |||||
| checker.set_param(param).exec({{20, 3, 50, 50, 4}, {}}); | |||||
| checker.set_before_exec_callback( | |||||
| AlgoChecker<PoolingForward>(DEF_NAME(cudnnForward))); | |||||
| for (auto format : {Param::Format::NCHW4, Param::Format::NCHW32}) { | |||||
| param.format = format; | |||||
| param.mode = Param::Mode::MAX; | |||||
| checker.set_param(param).exec({{4, 3, 28, 28, 32}, {}}); | |||||
| param.mode = Param::Mode::AVERAGE; | |||||
| checker.set_param(param).exec({{4, 3, 28, 28, 64}, {}}); | |||||
| param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING; | |||||
| checker.set_param(param).exec({{4, 3, 28, 28, 32}, {}}); | |||||
| } | |||||
| } | } | ||||
| #if CUDNN_VERSION >= 7500 | #if CUDNN_VERSION >= 7500 | ||||
| @@ -288,6 +300,8 @@ TEST_F(CUDA, POOLING_FORWARD_NCHW32) { | |||||
| auto i8_max = std::numeric_limits<int8_t>().max(); | auto i8_max = std::numeric_limits<int8_t>().max(); | ||||
| UniformIntRNG int_rng{i8_min, i8_max}; | UniformIntRNG int_rng{i8_min, i8_max}; | ||||
| checker.set_dtype(0, dtype::QuantizedS8(0.1f)); | checker.set_dtype(0, dtype::QuantizedS8(0.1f)); | ||||
| checker.set_before_exec_callback( | |||||
| AlgoChecker<PoolingForward>("CUDA_NCHW32")); | |||||
| param.format = Param::Format::NCHW32; | param.format = Param::Format::NCHW32; | ||||
| checker.set_epsilon(1e-3).set_rng(0, &int_rng); | checker.set_epsilon(1e-3).set_rng(0, &int_rng); | ||||
| checker.set_param(param).exec({{64, 8, 28, 28, 32}, {}}); | checker.set_param(param).exec({{64, 8, 28, 28, 32}, {}}); | ||||
| @@ -394,6 +408,7 @@ TEST_F(CUDA, POOLING_FORWARD_INT8_NCHW4) { | |||||
| UniformIntRNG int_rng{i8_min, i8_max}; | UniformIntRNG int_rng{i8_min, i8_max}; | ||||
| checker.set_dtype(0, dtype::QuantizedS8(0.1f)); | checker.set_dtype(0, dtype::QuantizedS8(0.1f)); | ||||
| param.format = Param::Format::NCHW4; | param.format = Param::Format::NCHW4; | ||||
| checker.set_before_exec_callback(AlgoChecker<PoolingForward>("CUDA_NCHW4")); | |||||
| for (auto mode : {Param::Mode::MAX, Param::Mode::AVERAGE, | for (auto mode : {Param::Mode::MAX, Param::Mode::AVERAGE, | ||||
| Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING}) { | Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING}) { | ||||
| param.mode = mode; | param.mode = mode; | ||||
| @@ -413,6 +428,8 @@ TEST_F(CUDA, POOLING_FORWARD_INT8_NCHW32) { | |||||
| auto i8_max = std::numeric_limits<int8_t>().max(); | auto i8_max = std::numeric_limits<int8_t>().max(); | ||||
| UniformIntRNG int_rng{i8_min, i8_max}; | UniformIntRNG int_rng{i8_min, i8_max}; | ||||
| checker.set_dtype(0, dtype::QuantizedS8(0.1f)); | checker.set_dtype(0, dtype::QuantizedS8(0.1f)); | ||||
| checker.set_before_exec_callback( | |||||
| AlgoChecker<PoolingForward>("CUDA_NCHW32")); | |||||
| param.format = Param::Format::NCHW32; | param.format = Param::Format::NCHW32; | ||||
| for (auto mode : {Param::Mode::MAX, Param::Mode::AVERAGE, | for (auto mode : {Param::Mode::MAX, Param::Mode::AVERAGE, | ||||
| Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING}) { | Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING}) { | ||||
| @@ -10,14 +10,52 @@ | |||||
| */ | */ | ||||
| #include "megbrain/opr/dnn/pooling.h" | #include "megbrain/opr/dnn/pooling.h" | ||||
| #include "megbrain/graph/grad_impl.h" | #include "megbrain/graph/grad_impl.h" | ||||
| #include "megbrain/opr/search_policy/algo_chooser.h" | |||||
| #include "../internal/megdnn_opr_wrapper.inl" | #include "../internal/megdnn_opr_wrapper.inl" | ||||
| #include "../search_policy/workspace_need_limit_getter.inl" | |||||
| using namespace mgb; | using namespace mgb; | ||||
| using namespace opr; | using namespace opr; | ||||
| MGB_DYN_TYPE_OBJ_FINAL_IMPL(PoolingForward); | MGB_DYN_TYPE_OBJ_FINAL_IMPL(PoolingForward); | ||||
| MEGDNN_OPR_INIT1(PoolingForward, "pooling") | |||||
| PoolingForward::PoolingForward(VarNode* i0, const Param& param, | |||||
| const ExecutionPolicy& policy, | |||||
| const OperatorNodeConfig& config) | |||||
| : Super(OperatorNodeBaseCtorParam{ | |||||
| i0->owner_graph(), config, "pooling", {i0}}) { | |||||
| init_megdnn_opr(*this, param); | |||||
| add_input({i0}); | |||||
| m_policy = policy; | |||||
| intl::MegDNNOprInitPostCtor<PoolingForward>::apply(*this); | |||||
| } | |||||
| SymbolVar PoolingForward::make(SymbolVar i0, const Param& param, | |||||
| const OperatorNodeConfig& config, | |||||
| const ExecutionPolicy& policy) { | |||||
| intl::MegDNNOprInitInputsModifier<PoolingForward>::apply(param, {&i0}); | |||||
| return i0.insert_single_output_opr<PoolingForward>(i0.node(), param, policy, | |||||
| config); | |||||
| } | |||||
| void PoolingForward::init_output_static_infer_desc() { | |||||
| Super::set_nr_managed_outputs(this->output().size() - 1); | |||||
| Super::Super::init_output_static_infer_desc(); | |||||
| init_output_static_infer_desc_workspace( | |||||
| intl::AutoAddWorkspaceNeedLimitGetter<megdnn::PoolingForward>::val); | |||||
| } | |||||
| size_t PoolingForward::get_workspace_size_bytes( | |||||
| const TensorShapeArray& input_shapes, | |||||
| const TensorShapeArray& output_shapes) const { | |||||
| return AlgoChooser<megdnn::PoolingForward>::setup_algo( | |||||
| {TensorLayout{input_shapes[0], input(0)->dtype(), | |||||
| input(0)->format()}, | |||||
| {output_shapes[0], output(0)->dtype(), output(0)->format()}}, | |||||
| megdnn_opr(), this, false); | |||||
| } | |||||
| #if MGB_ENABLE_GRAD | #if MGB_ENABLE_GRAD | ||||
| MGB_IMPL_OPR_GRAD(PoolingForward) { | MGB_IMPL_OPR_GRAD(PoolingForward) { | ||||
| @@ -29,7 +67,41 @@ MGB_IMPL_OPR_GRAD(PoolingForward) { | |||||
| #endif | #endif | ||||
| MGB_DYN_TYPE_OBJ_FINAL_IMPL(PoolingBackward); | MGB_DYN_TYPE_OBJ_FINAL_IMPL(PoolingBackward); | ||||
| MEGDNN_OPR_INIT3(PoolingBackward, "pooling_bwd", 0, true); | |||||
| PoolingBackward::PoolingBackward(VarNode* i0, VarNode* i1, VarNode* i2, | |||||
| const Param& param, | |||||
| const ExecutionPolicy& policy, | |||||
| const OperatorNodeConfig& config) | |||||
| : Super( | |||||
| OperatorNodeBaseCtorParam{ | |||||
| i0->owner_graph(), config, "pooling_bwd", {i0}}, | |||||
| 0, true) { | |||||
| init_megdnn_opr(*this, param); | |||||
| add_input({i0, i1, i2}); | |||||
| intl::MegDNNOprInitPostCtor<PoolingBackward>::apply(*this); | |||||
| } | |||||
| SymbolVar PoolingBackward::make(SymbolVar i0, SymbolVar i1, SymbolVar i2, | |||||
| const Param& param, | |||||
| const OperatorNodeConfig& config, | |||||
| const ExecutionPolicy& policy) { | |||||
| intl::MegDNNOprInitInputsModifier<PoolingBackward>::apply(param, | |||||
| {&i0, &i1, &i2}); | |||||
| return i0.insert_single_output_opr<PoolingBackward>( | |||||
| i0.node(), i1.node(), i2.node(), param, policy, config); | |||||
| } | |||||
| size_t PoolingBackward::get_workspace_size_bytes( | |||||
| const TensorShapeArray& input_shapes, | |||||
| const TensorShapeArray& output_shapes) const { | |||||
| return AlgoChooser<megdnn::PoolingBackward>::setup_algo( | |||||
| {TensorLayout{input_shapes[0], input(0)->dtype(), | |||||
| input(0)->format()}, | |||||
| {input_shapes[1], input(1)->dtype(), input(1)->format()}, | |||||
| {input_shapes[2], input(2)->dtype(), input(2)->format()}, | |||||
| {output_shapes[0], output(0)->dtype(), output(0)->format()}}, | |||||
| megdnn_opr(), this, false); | |||||
| } | |||||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | ||||
| @@ -175,6 +175,8 @@ cb(DEFORMABLE_CONV_BACKWARD_DATA, DeformableConvBackwardData); | |||||
| cb(DEFORMABLE_CONV_BACKWARD_FILTER, DeformableConvBackwardFilter); | cb(DEFORMABLE_CONV_BACKWARD_FILTER, DeformableConvBackwardFilter); | ||||
| cb(BATCH_CONV_FORWARD, BatchConvBiasForward); | cb(BATCH_CONV_FORWARD, BatchConvBiasForward); | ||||
| cb(CONVBIAS_FORWARD, ConvBiasForward); | cb(CONVBIAS_FORWARD, ConvBiasForward); | ||||
| cb(POOLING_FORWARD, PoolingForward); | |||||
| cb(POOLING_BACKWARD, PoolingBackward); | |||||
| #undef cb | #undef cb | ||||
| @@ -195,7 +197,9 @@ cb(CONVBIAS_FORWARD, ConvBiasForward); | |||||
| cb(DEFORMABLE_CONV_BACKWARD_DATA, stmt) \ | cb(DEFORMABLE_CONV_BACKWARD_DATA, stmt) \ | ||||
| cb(DEFORMABLE_CONV_BACKWARD_FILTER, stmt) \ | cb(DEFORMABLE_CONV_BACKWARD_FILTER, stmt) \ | ||||
| cb(BATCH_CONV_FORWARD, stmt) \ | cb(BATCH_CONV_FORWARD, stmt) \ | ||||
| cb(CONVBIAS_FORWARD, stmt) | |||||
| cb(CONVBIAS_FORWARD, stmt) \ | |||||
| cb(POOLING_FORWARD, stmt) \ | |||||
| cb(POOLING_BACKWARD, stmt) | |||||
| // clang-format on | // clang-format on | ||||
| #define _OPR_TYPE_CASE(_opr_type, _stmt) \ | #define _OPR_TYPE_CASE(_opr_type, _stmt) \ | ||||
| @@ -521,11 +525,14 @@ AlgoChooser<Opr>::AlgoChooserHelper::AlgoChooserHelper( | |||||
| mgb_assert(m_fastrun_layouts.size() == layouts.size()); | mgb_assert(m_fastrun_layouts.size() == layouts.size()); | ||||
| static_assert(std::tuple_size<FixedTensorLayouts>::value == 3 || | |||||
| std::tuple_size<FixedTensorLayouts>::value == 5 || | |||||
| std::tuple_size<FixedTensorLayouts>::value == 8, | |||||
| "Convolution AlgoChooser assumes arity = 3 , 5 or 8 (for " | |||||
| "deformable conv)"); | |||||
| static_assert( | |||||
| std::tuple_size<FixedTensorLayouts>::value == 2 || | |||||
| std::tuple_size<FixedTensorLayouts>::value == 3 || | |||||
| std::tuple_size<FixedTensorLayouts>::value == 4 || | |||||
| std::tuple_size<FixedTensorLayouts>::value == 5 || | |||||
| std::tuple_size<FixedTensorLayouts>::value == 8, | |||||
| "Pooling assumes arity = 2 or 4,Convolution AlgoChooser assumes " | |||||
| "arity = 3 , 5 or 8 (for deformable conv)"); | |||||
| } | } | ||||
| template <typename Opr> | template <typename Opr> | ||||
| @@ -284,7 +284,7 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl( | |||||
| mdn_workspace), | mdn_workspace), | ||||
| std::forward_as_tuple(layouts[0], | std::forward_as_tuple(layouts[0], | ||||
| inp_val[1].as_megdnn()), | inp_val[1].as_megdnn()), | ||||
| array_skip<2>(layouts)); | |||||
| array_skip<arity_in>(layouts)); | |||||
| }); | }); | ||||
| } | } | ||||
| }); | }); | ||||
| @@ -12,34 +12,50 @@ | |||||
| #pragma once | #pragma once | ||||
| #include "megbrain/opr/internal/megdnn_opr_wrapper.h" | #include "megbrain/opr/internal/megdnn_opr_wrapper.h" | ||||
| #include "megbrain/opr/search_policy/algo_chooser_helper.h" | |||||
| #include "megdnn/oprs.h" | #include "megdnn/oprs.h" | ||||
| namespace mgb { | namespace mgb { | ||||
| namespace opr { | namespace opr { | ||||
| MGB_DEFINE_OPR_CLASS(PoolingForward, | MGB_DEFINE_OPR_CLASS(PoolingForward, | ||||
| intl::MegDNNOprWrapperFwd<megdnn::PoolingForward>) // { | |||||
| public: | |||||
| PoolingForward(VarNode *src, const Param ¶m, | |||||
| const OperatorNodeConfig &config); | |||||
| static SymbolVar make(SymbolVar src, const Param ¶m, | |||||
| const OperatorNodeConfig &config = {}); | |||||
| intl::MegDNNOprWrapperFwd<megdnn::PoolingForward>, | |||||
| public mixin::AlgoChooserHelper) //{ | |||||
| public: | |||||
| PoolingForward(VarNode * src, const Param& param, | |||||
| const ExecutionPolicy& policy, | |||||
| const OperatorNodeConfig& config); | |||||
| static SymbolVar make(SymbolVar src, const Param& param, | |||||
| const OperatorNodeConfig& config = {}, | |||||
| const ExecutionPolicy& policy = {}); | |||||
| void init_output_static_infer_desc() override; | |||||
| size_t get_workspace_size_bytes(const TensorShapeArray& input_shapes, | |||||
| const TensorShapeArray& output_shapes) | |||||
| const override; | |||||
| }; | }; | ||||
| using Pooling = PoolingForward; | using Pooling = PoolingForward; | ||||
| MGB_DEFINE_OPR_CLASS(PoolingBackward, | MGB_DEFINE_OPR_CLASS(PoolingBackward, | ||||
| intl::MegDNNOprWrapperBwd<megdnn::PoolingBackward>) // { | |||||
| public: | |||||
| PoolingBackward(VarNode *src, VarNode *dst, VarNode *diff, | |||||
| const Param ¶m, const OperatorNodeConfig &config); | |||||
| static SymbolVar make(SymbolVar src, SymbolVar dst, SymbolVar diff, | |||||
| const Param ¶m, | |||||
| const OperatorNodeConfig &config = {}); | |||||
| intl::MegDNNOprWrapperBwd<megdnn::PoolingBackward>, | |||||
| public mixin::AlgoChooserHelper) //{ | |||||
| public: | |||||
| PoolingBackward(VarNode * src, VarNode * dst, VarNode * diff, | |||||
| const Param& param, const ExecutionPolicy& policy, | |||||
| const OperatorNodeConfig& config); | |||||
| static SymbolVar make(SymbolVar src, SymbolVar dst, SymbolVar diff, | |||||
| const Param& param, | |||||
| const OperatorNodeConfig& config = {}, | |||||
| const ExecutionPolicy& policy = {}); | |||||
| size_t get_workspace_size_bytes(const TensorShapeArray& input_shapes, | |||||
| const TensorShapeArray& output_shapes) | |||||
| const override final; | |||||
| }; | }; | ||||
| } // namespace opr | |||||
| } // namespace mgb | |||||
| } // namespace opr | |||||
| } // namespace mgb | |||||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | ||||
| @@ -18,6 +18,7 @@ | |||||
| #include "megbrain/opr/search_policy/algo_chooser_helper.h" | #include "megbrain/opr/search_policy/algo_chooser_helper.h" | ||||
| #include "megbrain/opr/search_policy/profiler.h" | #include "megbrain/opr/search_policy/profiler.h" | ||||
| #include "megbrain/opr/dnn/convolution.h" | #include "megbrain/opr/dnn/convolution.h" | ||||
| #include "megbrain/opr/dnn/pooling.h" | |||||
| #include "megbrain/opr/blas.h" | #include "megbrain/opr/blas.h" | ||||
| #include "megdnn/oprs/base.h" | #include "megdnn/oprs/base.h" | ||||
| @@ -40,7 +40,9 @@ namespace opr { | |||||
| cb(DeformableConvBackwardData) \ | cb(DeformableConvBackwardData) \ | ||||
| cb(BatchConvBiasForward) \ | cb(BatchConvBiasForward) \ | ||||
| cb(MatrixMul) \ | cb(MatrixMul) \ | ||||
| cb(BatchedMatrixMul) | |||||
| cb(BatchedMatrixMul) \ | |||||
| cb(PoolingForward) \ | |||||
| cb(PoolingBackward) | |||||
| // clang-format on | // clang-format on | ||||
| template <typename Opr> | template <typename Opr> | ||||
| @@ -11,6 +11,9 @@ | |||||
| #include "./legacy_checker.h" | #include "./legacy_checker.h" | ||||
| #include "megbrain/opr/dnn/pooling.h" | #include "megbrain/opr/dnn/pooling.h" | ||||
| #include "megbrain/utils/persistent_cache.h" | |||||
| #include "megbrain/opr/basic_arith.h" | |||||
| #include "megbrain/opr/basic_arith_wrapper.h" | |||||
| using namespace std; | using namespace std; | ||||
| using namespace mgb; | using namespace mgb; | ||||
| @@ -104,6 +107,58 @@ TEST(TestOprDNN, PoolingBackward) | |||||
| } | } | ||||
| } | } | ||||
| TEST(TestOprDNN, PoolingExePolicy) { | |||||
| using Param = opr::Pooling::Param; | |||||
| Param param; | |||||
| using Policy = opr::Pooling::ExecutionPolicy; | |||||
| using S = Policy::Strategy; | |||||
| REQUIRE_GPU(1); | |||||
| auto cn = CompNode::load("gpu0"); | |||||
| cn.activate(); | |||||
| auto orig_impl = PersistentCache::set_impl( | |||||
| std::make_shared<InMemoryPersistentCache>()); | |||||
| HostTensorND host_y, host_y_copy; | |||||
| S strategy = S::HEURISTIC | S::REPRODUCIBLE; | |||||
| auto graph = ComputingGraph::make(); | |||||
| HostTensorGenerator<> gen; | |||||
| TensorShape shape = {1, 20, 24, 24}; | |||||
| auto input = opr::Host2DeviceCopy::make(*graph, gen(shape, cn)); | |||||
| param.mode = Param::Mode::MAX; | |||||
| param.window_h = param.window_w = 2; | |||||
| param.stride_h = param.stride_w = 2; | |||||
| param.pad_h = param.pad_w = 0; | |||||
| param.format = Param::Format::NCHW; | |||||
| Policy policy; | |||||
| policy.strategy = strategy; | |||||
| auto pooling = opr::PoolingForward::make(input, param, {}, policy); | |||||
| auto loss0 = opr::reduce_sum_sqr(pooling, pooling.make_scalar(1)); | |||||
| auto grad = cg::grad(loss0, input, true, false); | |||||
| opr::PoolingBackward* found = nullptr; | |||||
| auto cb = [&found](cg::OperatorNodeBase* opr) { | |||||
| if (opr->same_type<opr::PoolingBackward>()) { | |||||
| found = &opr->cast_final_safe<opr::PoolingBackward>(); | |||||
| } | |||||
| }; | |||||
| cg::DepOprIter{cb}.add(grad.node()->owner_opr()); | |||||
| found->set_execution_policy(strategy); | |||||
| auto func = graph->compile({make_callback_copy(grad, host_y)}); | |||||
| func->execute().wait(); | |||||
| mgb_assert(found->megdnn_opr()->execution_policy().algo.name.find( | |||||
| "cudnnReproducible") != std::string::npos); | |||||
| } | |||||
| } // anonymous namespace | } // anonymous namespace | ||||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | ||||