GitOrigin-RevId: 84791aacf9
tags/v0.5.0
| @@ -210,21 +210,25 @@ public: | |||
| _megdnn_tensor_out dst, | |||
| const PreprocessedFilter* preprocessed_filter, | |||
| _megdnn_workspace workspace) = 0; | |||
| virtual void exec_preprocess(const TensorLayout& src_layout, | |||
| _megdnn_tensor_in filter, | |||
| const TensorLayout& dst_layout, | |||
| PreprocessedFilter* preprocessed_filter, | |||
| _megdnn_workspace workspace) = 0; | |||
| void deduce_dtype(DType src, DType filter, DType& dst); | |||
| void deduce_layout(const TensorLayout& src, const TensorLayout& filter, | |||
| TensorLayout& dst); | |||
| virtual size_t get_workspace_in_bytes( | |||
| const TensorLayout& src, const TensorLayout& filter, | |||
| const TensorLayout& dst, | |||
| PreprocessedFilter* preprocessed_filter) = 0; | |||
| const PreprocessedFilter* preprocessed_filter) = 0; | |||
| virtual SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||
| const TensorLayout& src, const TensorLayout& filter, | |||
| const TensorLayout& dst) = 0; | |||
| virtual size_t get_preprocess_workspace_in_bytes( | |||
| const TensorLayout& src, const TensorLayout& filter, | |||
| const TensorLayout& dst) = 0; | |||
| @@ -337,7 +341,7 @@ public: | |||
| const TensorLayout& src, const TensorLayout& filter, | |||
| const TensorLayout& bias, const TensorLayout& z, | |||
| const TensorLayout& dst, | |||
| PreprocessedFilter* preprocessed_filter) = 0; | |||
| const PreprocessedFilter* preprocessed_filter) = 0; | |||
| virtual size_t get_preprocess_workspace_in_bytes( | |||
| const TensorLayout& src, const TensorLayout& filter, | |||
| const TensorLayout& bias, const TensorLayout& z, | |||
| @@ -76,7 +76,7 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( | |||
| auto ret = check_layout_fwd(src, filter, dst); | |||
| megdnn_assert_contiguous(bias); | |||
| auto required_workspace_in_bytes = | |||
| get_workspace_in_bytes(src, filter, bias, z, dst); | |||
| get_workspace_in_bytes(src, filter, bias, z, dst, nullptr); | |||
| megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); | |||
| if (bias.ndim != 0) { | |||
| //! bias.layout == dst.layout failed, no assert information | |||
| @@ -981,7 +981,8 @@ ConvolutionForward::CanonizedFilterMeta ConvolutionForward::check_exec( | |||
| const TensorLayout& src, const TensorLayout& filter, | |||
| const TensorLayout& dst, size_t workspace_in_bytes) { | |||
| auto ret = check_layout_fwd(src, filter, dst); | |||
| auto required_workspace_in_bytes = get_workspace_in_bytes(src, filter, dst); | |||
| auto required_workspace_in_bytes = | |||
| get_workspace_in_bytes(src, filter, dst, nullptr); | |||
| megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); | |||
| return ret; | |||
| } | |||
| @@ -112,7 +112,7 @@ void ConvBiasForwardImpl::AlgoBFloat16::exec(const ExecArgs& args) const { | |||
| convbias_opr->param().compute_mode = Param::ComputeMode::DEFAULT; | |||
| convbias_opr->execution_policy() = {m_impl}; | |||
| convbias_opr->exec(fsrc_tensor, ffilter_tensor, fbias_tensor, fz_tensor, | |||
| fdst_tensor, cvter.workspace()); | |||
| fdst_tensor, nullptr, cvter.workspace()); | |||
| } | |||
| { cvter.comp_to_dst_type(fdst_tensor, *args.dst_tensor); } | |||
| } | |||
| @@ -25,6 +25,7 @@ namespace cuda { | |||
| void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
| _megdnn_tensor_in bias, _megdnn_tensor_in z, | |||
| _megdnn_tensor_out dst, | |||
| const PreprocessedFilter*, | |||
| _megdnn_workspace workspace) { | |||
| check_exec(src.layout, filter.layout, bias.layout, z.layout, dst.layout, | |||
| workspace.size); | |||
| @@ -208,7 +209,8 @@ size_t ConvBiasForwardImpl::get_workspace_in_bytes(const TensorLayout& src, | |||
| const TensorLayout& filter, | |||
| const TensorLayout& bias, | |||
| const TensorLayout& z, | |||
| const TensorLayout& dst) { | |||
| const TensorLayout& dst, | |||
| const PreprocessedFilter*) { | |||
| AlgoBase::SizeArgs args{this, src, filter, bias, z, dst}; | |||
| return get_algorithm(this, src, filter, bias, z, dst) | |||
| ->get_workspace_in_bytes(args); | |||
| @@ -20,7 +20,9 @@ public: | |||
| using ConvBiasForward::ConvBiasForward; | |||
| void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
| _megdnn_tensor_in bias, _megdnn_tensor_in z, | |||
| _megdnn_tensor_out dst, _megdnn_workspace workspace) override; | |||
| _megdnn_tensor_out dst, | |||
| const PreprocessedFilter* preprocessed_filter, | |||
| _megdnn_workspace workspace) override; | |||
| std::vector<Algorithm*> get_all_algorithms( | |||
| const TensorLayout& src, const TensorLayout& filter, | |||
| const TensorLayout& bias, const TensorLayout& z, | |||
| @@ -34,7 +36,30 @@ public: | |||
| bool reproducible) override; | |||
| size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&, | |||
| const TensorLayout&, const TensorLayout&, | |||
| const TensorLayout&) override; | |||
| const TensorLayout&, | |||
| const PreprocessedFilter*) override; | |||
| size_t get_preprocess_workspace_in_bytes(const TensorLayout&, | |||
| const TensorLayout&, | |||
| const TensorLayout&, | |||
| const TensorLayout&, | |||
| const TensorLayout&) override { | |||
| return 0; | |||
| }; | |||
| SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||
| const TensorLayout&, const TensorLayout&, const TensorLayout&, | |||
| const TensorLayout&, const TensorLayout&) override { | |||
| return {}; | |||
| } | |||
| void exec_preprocess(const TensorLayout& , | |||
| _megdnn_tensor_in , | |||
| const TensorLayout& , | |||
| const TensorLayout& , | |||
| const TensorLayout& , | |||
| PreprocessedFilter* , | |||
| _megdnn_workspace ) override { | |||
| megdnn_throw("cuda conv_bias exec_preprocess has not implemeted yet"); | |||
| } | |||
| const char* get_algorithm_set_name() const override; | |||
| @@ -73,22 +73,32 @@ ConvolutionForwardImpl::get_all_algorithms(const TensorLayout& src, | |||
| size_t ConvolutionForwardImpl::get_workspace_in_bytes( | |||
| const TensorLayout& src, const TensorLayout& filter, | |||
| const TensorLayout& dst) { | |||
| const TensorLayout& dst, | |||
| const PreprocessedFilter* preprocessed_filter) { | |||
| auto extra_data = conv_bias_extra_data(dst); | |||
| return static_cast<ConvBiasForwardImpl*>(extra_data.convbias_opr.get()) | |||
| ->get_workspace_in_bytes(src, filter, extra_data.bias_layout, | |||
| extra_data.z_layout, dst); | |||
| ->get_workspace_in_bytes( | |||
| src, filter, extra_data.bias_layout, extra_data.z_layout, | |||
| dst, | |||
| reinterpret_cast<const ConvolutionBase< | |||
| param::ConvBias>::PreprocessedFilter*>( | |||
| preprocessed_filter)); | |||
| } | |||
| void ConvolutionForwardImpl::exec(_megdnn_tensor_in src, | |||
| _megdnn_tensor_in filter, | |||
| _megdnn_tensor_out dst, | |||
| const PreprocessedFilter* preprocessed_filter, | |||
| _megdnn_workspace workspace) { | |||
| auto extra_data = conv_bias_extra_data(dst.layout); | |||
| TensorND bias(nullptr, extra_data.bias_layout); | |||
| TensorND z(nullptr, extra_data.z_layout); | |||
| return static_cast<ConvBiasForwardImpl*>(extra_data.convbias_opr.get()) | |||
| ->exec(src, filter, bias, z, dst, workspace); | |||
| ->exec(src, filter, bias, z, dst, | |||
| reinterpret_cast<const ConvolutionBase< | |||
| param::ConvBias>::PreprocessedFilter*>( | |||
| preprocessed_filter), | |||
| workspace); | |||
| } | |||
| const char* ConvolutionForwardImpl::get_algorithm_set_name() const { | |||
| @@ -11,6 +11,7 @@ | |||
| #pragma once | |||
| #include "megdnn/oprs/nn.h" | |||
| #include "src/common/utils.h" | |||
| namespace megdnn { | |||
| namespace cuda { | |||
| @@ -18,10 +19,11 @@ namespace cuda { | |||
| class ConvolutionForwardImpl: public ConvolutionForward { | |||
| public: | |||
| using ConvolutionForward::ConvolutionForward; | |||
| void exec(_megdnn_tensor_in src, | |||
| _megdnn_tensor_in filter, | |||
| _megdnn_tensor_out dst, | |||
| _megdnn_workspace workspace) override; | |||
| void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
| _megdnn_tensor_out dst, | |||
| const PreprocessedFilter* preprocessed_filter, | |||
| _megdnn_workspace workspace) override; | |||
| std::vector<Algorithm *> get_all_algorithms(const TensorLayout &src, | |||
| const TensorLayout &filter, | |||
| const TensorLayout &dst) override; | |||
| @@ -30,11 +32,28 @@ class ConvolutionForwardImpl: public ConvolutionForward { | |||
| const TensorLayout& dst, | |||
| size_t workspace_limit_in_bytes, | |||
| bool reproducible) override; | |||
| size_t get_workspace_in_bytes(const TensorLayout& src, | |||
| const TensorLayout& filter, | |||
| const TensorLayout& dst) override; | |||
| size_t get_workspace_in_bytes( | |||
| const TensorLayout& src, const TensorLayout& filter, | |||
| const TensorLayout& dst, | |||
| const PreprocessedFilter* preprocessed_filter) override; | |||
| const char* get_algorithm_set_name() const override; | |||
| SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||
| const TensorLayout&, const TensorLayout&, | |||
| const TensorLayout&) override { | |||
| return {}; | |||
| } | |||
| size_t get_preprocess_workspace_in_bytes( | |||
| const TensorLayout& , const TensorLayout& , | |||
| const TensorLayout& ) override{ | |||
| return 0; | |||
| } | |||
| void exec_preprocess(const TensorLayout&, _megdnn_tensor_in, | |||
| const TensorLayout&, PreprocessedFilter*, | |||
| _megdnn_workspace) override { | |||
| megdnn_throw("cuda exec_preprocess has not implemeted yet"); | |||
| } | |||
| protected: | |||
| struct ConvBiasExtraData{ | |||
| std::unique_ptr<ConvBiasForward> convbias_opr; | |||
| @@ -27,7 +27,7 @@ void MaskConvForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
| _megdnn_workspace workspace) { | |||
| megdnn_assert(dst.layout.dtype.enumv() == DTypeTrait<dtype::Float32>::enumv, | |||
| "Mask conv only support Float32 dtype."); | |||
| m_conv_opr->exec(src, filter, dst, workspace); | |||
| m_conv_opr->exec(src, filter, dst, nullptr, workspace); | |||
| auto stream = cuda_stream(handle()); | |||
| #define cb(DType) \ | |||
| if (mask.layout.dtype == DType()) { \ | |||
| @@ -30,7 +30,7 @@ public: | |||
| const TensorLayout& dst) override { | |||
| MEGDNN_MARK_USED_VAR(mask); | |||
| m_conv_opr->param() = param(); | |||
| return m_conv_opr->get_workspace_in_bytes(src, filter, dst); | |||
| return m_conv_opr->get_workspace_in_bytes(src, filter, dst, nullptr); | |||
| } | |||
| private: | |||
| @@ -95,7 +95,9 @@ bool ConvBiasImpl::is_naive_algo(ConvBiasImpl::Algorithm* algo) { | |||
| } | |||
| void ConvBiasImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
| _megdnn_tensor_in bias, _megdnn_tensor_in z, | |||
| _megdnn_tensor_out dst, _megdnn_workspace workspace) { | |||
| _megdnn_tensor_out dst, | |||
| const PreprocessedFilter* preprocessed_filter, | |||
| _megdnn_workspace workspace) { | |||
| check_exec(src.layout, filter.layout, bias.layout, z.layout, dst.layout, | |||
| workspace.size); | |||
| auto fparam = make_ncb_kern_param(src, filter, bias, dst, workspace); | |||
| @@ -104,20 +106,21 @@ void ConvBiasImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
| ncb_algo_get_workspace(algo, fparam) <= workspace.size) { | |||
| exec_with_ncb_kern(fparam, algo); | |||
| } else { | |||
| naive::ConvBiasForwardImpl::exec(src, filter, bias, z, dst, workspace); | |||
| naive::ConvBiasForwardImpl::exec(src, filter, bias, z, dst, | |||
| preprocessed_filter, workspace); | |||
| } | |||
| } | |||
| size_t ConvBiasImpl::get_workspace_in_bytes(const TensorLayout& src, | |||
| const TensorLayout& filter, | |||
| const TensorLayout& bias, | |||
| const TensorLayout& z, | |||
| const TensorLayout& dst) { | |||
| size_t ConvBiasImpl::get_workspace_in_bytes( | |||
| const TensorLayout& src, const TensorLayout& filter, | |||
| const TensorLayout& bias, const TensorLayout& z, | |||
| const TensorLayout& dst, | |||
| const PreprocessedFilter* preprocessed_filter) { | |||
| auto fparam = make_ncb_kern_size_param(src, filter, bias, dst); | |||
| ConvBiasImpl::Algorithm* algo = get_algorithm(fparam); | |||
| if (is_naive_algo(algo)) { | |||
| return naive::ConvBiasForwardImpl::get_workspace_in_bytes(src, filter, | |||
| bias, z, dst); | |||
| return naive::ConvBiasForwardImpl::get_workspace_in_bytes( | |||
| src, filter, bias, z, dst, preprocessed_filter); | |||
| } else { | |||
| return ncb_algo_get_workspace(algo, fparam); | |||
| } | |||
| @@ -41,14 +41,16 @@ public: | |||
| //! implemented by exec_with_ncb_kern() | |||
| void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
| _megdnn_tensor_in bias, _megdnn_tensor_in z, | |||
| _megdnn_tensor_out dst, _megdnn_workspace workspace) override; | |||
| _megdnn_tensor_out dst, const PreprocessedFilter*, | |||
| _megdnn_workspace workspace) override; | |||
| //! implemented by get_workspace_with_ncb() | |||
| size_t get_workspace_in_bytes(const TensorLayout& src, | |||
| const TensorLayout& filter, | |||
| const TensorLayout& bias, | |||
| const TensorLayout& z, | |||
| const TensorLayout& dst) override; | |||
| const TensorLayout& dst, | |||
| const PreprocessedFilter*) override; | |||
| //! implemented by get_all_algorithms_with_ncb() | |||
| std::vector<Algorithm*> get_all_algorithms( | |||
| @@ -82,6 +82,7 @@ bool ConvolutionImpl::is_naive_algo(ConvolutionImpl::Algorithm* algo) { | |||
| } | |||
| void ConvolutionImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
| _megdnn_tensor_out dst, | |||
| const PreprocessedFilter* preprocessed_filter, | |||
| _megdnn_workspace workspace) { | |||
| auto fparam = make_ncb_kern_param(src, filter, dst, workspace); | |||
| ConvolutionImpl::Algorithm* algo = get_algorithm(fparam, workspace.size); | |||
| @@ -89,18 +90,20 @@ void ConvolutionImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
| ncb_algo_get_workspace(algo, fparam) <= workspace.size) { | |||
| exec_with_ncb_kern(fparam, algo); | |||
| } else { | |||
| naive::ConvolutionForwardImpl::exec(src, filter, dst, workspace); | |||
| naive::ConvolutionForwardImpl::exec(src, filter, dst, | |||
| preprocessed_filter, workspace); | |||
| } | |||
| } | |||
| size_t ConvolutionImpl::get_workspace_in_bytes(const TensorLayout& src, | |||
| const TensorLayout& filter, | |||
| const TensorLayout& dst) { | |||
| size_t ConvolutionImpl::get_workspace_in_bytes( | |||
| const TensorLayout& src, const TensorLayout& filter, | |||
| const TensorLayout& dst, | |||
| const PreprocessedFilter* preprocessed_filter) { | |||
| auto fparam = make_ncb_kern_size_param(src, filter, dst); | |||
| Algorithm* algo = get_algorithm(fparam); | |||
| if (is_naive_algo(algo)) { | |||
| return naive::ConvolutionForwardImpl::get_workspace_in_bytes( | |||
| src, filter, dst); | |||
| src, filter, dst, preprocessed_filter); | |||
| } else { | |||
| return ncb_algo_get_workspace(algo, fparam); | |||
| } | |||
| @@ -36,12 +36,14 @@ public: | |||
| //! implemented by exec_with_ncb_kern() | |||
| void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
| _megdnn_tensor_out dst, _megdnn_workspace workspace) override; | |||
| _megdnn_tensor_out dst, const PreprocessedFilter*, | |||
| _megdnn_workspace workspace) override; | |||
| //! implemented by get_workspace_with_ncb() | |||
| size_t get_workspace_in_bytes(const TensorLayout& src, | |||
| const TensorLayout& filter, | |||
| const TensorLayout& dst) override; | |||
| const TensorLayout& dst, | |||
| const PreprocessedFilter*) override; | |||
| //! implemented by get_all_algorithms_with_ncb() | |||
| std::vector<Algorithm*> get_all_algorithms( | |||
| @@ -54,7 +54,8 @@ size_t ConvBiasForwardImpl::get_workspace_in_bytes(const TensorLayout& src, | |||
| const TensorLayout& flt, | |||
| const TensorLayout& bias, | |||
| const TensorLayout& z, | |||
| const TensorLayout& dst) { | |||
| const TensorLayout& dst, | |||
| const PreprocessedFilter*) { | |||
| size_t float_workspace_size = 0; | |||
| if (z.ndim > 0 && z.dtype.category() != DTypeCategory::FLOAT) { | |||
| @@ -79,6 +80,7 @@ size_t ConvBiasForwardImpl::get_workspace_in_bytes(const TensorLayout& src, | |||
| void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
| _megdnn_tensor_in bias, _megdnn_tensor_in z, | |||
| _megdnn_tensor_out dst, | |||
| const PreprocessedFilter*, | |||
| _megdnn_workspace workspace) { | |||
| MIDOUT_BEGIN(megdnn_naive_conv_bias_fwd) { | |||
| dt_byte *workspace_ptr = workspace.raw_ptr; | |||
| @@ -22,7 +22,9 @@ public: | |||
| using ConvBiasForward::ConvBiasForward; | |||
| void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
| _megdnn_tensor_in bias, _megdnn_tensor_in z, | |||
| _megdnn_tensor_out dst, _megdnn_workspace workspace) override; | |||
| _megdnn_tensor_out dst, | |||
| const PreprocessedFilter* preprocessed_filter, | |||
| _megdnn_workspace workspace) override; | |||
| std::vector<Algorithm*> get_all_algorithms( | |||
| const TensorLayout& src, const TensorLayout& filter, | |||
| @@ -37,11 +39,32 @@ public: | |||
| size_t workspace_limit_in_bytes, | |||
| bool reproducible) override; | |||
| size_t get_workspace_in_bytes(const TensorLayout& src, | |||
| const TensorLayout& filter, | |||
| const TensorLayout& bias, | |||
| const TensorLayout& z, | |||
| const TensorLayout& dst) override; | |||
| size_t get_workspace_in_bytes( | |||
| const TensorLayout& src, const TensorLayout& filter, | |||
| const TensorLayout& bias, const TensorLayout& z, | |||
| const TensorLayout& dst, | |||
| const PreprocessedFilter* preprocessed_filter) override; | |||
| size_t get_preprocess_workspace_in_bytes(const TensorLayout&, | |||
| const TensorLayout&, | |||
| const TensorLayout&, | |||
| const TensorLayout&, | |||
| const TensorLayout&) override { | |||
| return 0; | |||
| } | |||
| SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||
| const TensorLayout&, const TensorLayout&, const TensorLayout&, | |||
| const TensorLayout&, const TensorLayout&) override { | |||
| return {}; | |||
| } | |||
| void exec_preprocess(const TensorLayout&, _megdnn_tensor_in, | |||
| const TensorLayout&, const TensorLayout&, | |||
| const TensorLayout&, PreprocessedFilter*, | |||
| _megdnn_workspace) override{ | |||
| megdnn_throw("conv_bias exec_preprocess is not impl yet"); | |||
| } | |||
| const char* get_algorithm_set_name() const override; | |||
| }; | |||
| @@ -26,15 +26,14 @@ using namespace megdnn; | |||
| using namespace naive; | |||
| void ConvolutionForwardImpl::exec(_megdnn_tensor_in src, | |||
| _megdnn_tensor_in filter, | |||
| _megdnn_tensor_out dst, | |||
| _megdnn_workspace workspace) | |||
| { | |||
| _megdnn_tensor_in filter, | |||
| _megdnn_tensor_out dst, | |||
| const PreprocessedFilter*, | |||
| _megdnn_workspace workspace) { | |||
| MIDOUT_BEGIN(megdnn_naive_conv_fwd) { | |||
| auto filter_meta = check_exec( | |||
| src.layout, filter.layout, dst.layout, workspace.size); | |||
| using ComputeMode = Param::ComputeMode; | |||
| auto filter_meta = check_exec(src.layout, filter.layout, dst.layout, | |||
| workspace.size); | |||
| using ComputeMode = Param::ComputeMode; | |||
| #define DISPATCH_CMODE(in_dt, out_dt, in_ct, out_ct, comp_ct, cmode) \ | |||
| do { \ | |||
| using namespace dtype; \ | |||
| @@ -52,24 +51,28 @@ void ConvolutionForwardImpl::exec(_megdnn_tensor_in src, | |||
| #define cb(dt) \ | |||
| DISPATCH(dt, dt, DTypeTrait<dt>::ctype, DTypeTrait<dt>::ctype, \ | |||
| DTypeTrait<dt>::ctype) | |||
| MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb); | |||
| MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb); | |||
| #undef cb | |||
| DISPATCH(Int8, Int16, dt_int8, dt_int16, dt_int16); | |||
| DISPATCH(Int8, Int32, dt_int8, dt_int32, dt_int32); | |||
| DISPATCH(QuantizedS8, QuantizedS32, dt_int8, dt_int32, dt_int32); | |||
| MEGDNN_INC_FLOAT16(DISPATCH_CMODE(Float16, Float16, dt_float16, dt_float16, | |||
| dt_float32, ComputeMode::FLOAT32)); | |||
| MEGDNN_INC_FLOAT16(DISPATCH_CMODE(BFloat16, BFloat16, dt_bfloat16, | |||
| dt_bfloat16, dt_float32, | |||
| ComputeMode::FLOAT32)); | |||
| DISPATCH(Quantized8Asymm, QuantizedS32, dt_quint8, dt_qint32, dt_qint32); | |||
| DISPATCH(QuantizedS8, QuantizedS8, dt_int8, dt_int8, dt_int32); | |||
| DISPATCH(Int8, Int16, dt_int8, dt_int16, dt_int16); | |||
| DISPATCH(Int8, Int32, dt_int8, dt_int32, dt_int32); | |||
| DISPATCH(QuantizedS8, QuantizedS32, dt_int8, dt_int32, dt_int32); | |||
| MEGDNN_INC_FLOAT16(DISPATCH_CMODE(Float16, Float16, dt_float16, | |||
| dt_float16, dt_float32, | |||
| ComputeMode::FLOAT32)); | |||
| MEGDNN_INC_FLOAT16(DISPATCH_CMODE(BFloat16, BFloat16, dt_bfloat16, | |||
| dt_bfloat16, dt_float32, | |||
| ComputeMode::FLOAT32)); | |||
| DISPATCH(Quantized8Asymm, QuantizedS32, dt_quint8, dt_qint32, | |||
| dt_qint32); | |||
| DISPATCH(QuantizedS8, QuantizedS8, dt_int8, dt_int8, dt_int32); | |||
| #undef DISPATCH | |||
| megdnn_throw(ssprintf("unsupported Conv(%s, %s) -> %s with cmode = %d", | |||
| src.layout.dtype.name(), filter.layout.dtype.name(), | |||
| dst.layout.dtype.name(), | |||
| static_cast<int>(param().compute_mode))); | |||
| } MIDOUT_END(); | |||
| megdnn_throw(ssprintf("unsupported Conv(%s, %s) -> %s with cmode = %d", | |||
| src.layout.dtype.name(), | |||
| filter.layout.dtype.name(), | |||
| dst.layout.dtype.name(), | |||
| static_cast<int>(param().compute_mode))); | |||
| } | |||
| MIDOUT_END(); | |||
| } | |||
| size_t ConvolutionBackwardDataImpl::get_workspace_in_bytes(const TensorLayout& filter, | |||
| @@ -10,6 +10,7 @@ | |||
| */ | |||
| #pragma once | |||
| #include "megdnn/oprs.h" | |||
| #include "src/common/utils.h" | |||
| namespace megdnn { | |||
| namespace naive { | |||
| @@ -17,10 +18,10 @@ namespace naive { | |||
| class ConvolutionForwardImpl: public ConvolutionForward { | |||
| public: | |||
| using ConvolutionForward::ConvolutionForward; | |||
| void exec(_megdnn_tensor_in src, | |||
| _megdnn_tensor_in filter, | |||
| _megdnn_tensor_out dst, | |||
| _megdnn_workspace workspace) override; | |||
| void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
| _megdnn_tensor_out dst, | |||
| const PreprocessedFilter* preprocessed_filter, | |||
| _megdnn_workspace workspace) override; | |||
| std::vector<Algorithm *> get_all_algorithms(const TensorLayout &src, | |||
| const TensorLayout &filter, | |||
| const TensorLayout &dst) override; | |||
| @@ -30,10 +31,29 @@ class ConvolutionForwardImpl: public ConvolutionForward { | |||
| size_t workspace_limit_in_bytes, | |||
| bool reproducible) override; | |||
| size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&, | |||
| const TensorLayout&) override { | |||
| const TensorLayout&, | |||
| const PreprocessedFilter*) override { | |||
| return 0; | |||
| } | |||
| size_t get_preprocess_workspace_in_bytes(const TensorLayout&, | |||
| const TensorLayout&, | |||
| const TensorLayout&) override { | |||
| return 0; | |||
| } | |||
| void exec_preprocess(const TensorLayout&, _megdnn_tensor_in, | |||
| const TensorLayout&, PreprocessedFilter*, | |||
| _megdnn_workspace) override { | |||
| megdnn_throw("convolution exec_preprocess in not impl yet"); | |||
| } | |||
| SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||
| const TensorLayout& , const TensorLayout& , | |||
| const TensorLayout& )override{ | |||
| return {}; | |||
| } | |||
| const char* get_algorithm_set_name() const override; | |||
| }; | |||
| @@ -97,7 +97,7 @@ void ConvPoolingForwardImpl::exec(const _megdnn_in TensorND src, | |||
| TensorND conv_dst((float*)(workspace.raw_ptr), conv_dst_layout); | |||
| //convFwd->check_layout(src.layout, filter.layout, workspace.layout, empty_wsp.layout); | |||
| check_layout(src.layout, filter.layout, bias.layout, dst.layout, workspace.size); | |||
| convFwd->exec(src, filter, conv_dst, empty_wsp); | |||
| convFwd->exec(src, filter, conv_dst, nullptr, empty_wsp); | |||
| // calculate bias | |||
| int conv_dst_batch = conv_dst.layout.shape[0]; | |||
| @@ -80,7 +80,7 @@ void MaskConvForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
| _megdnn_workspace workspace) { | |||
| MEGDNN_MARK_USED_VAR(mask); | |||
| m_conv_opr->param() = this->param(); | |||
| m_conv_opr->exec(src, filter, dst, workspace); | |||
| m_conv_opr->exec(src, filter, dst, nullptr, workspace); | |||
| #define cb(DType) \ | |||
| if (mask.layout.dtype == DType()) { \ | |||
| using ctype = typename DTypeTrait<DType>::ctype; \ | |||
| @@ -99,7 +99,7 @@ size_t MaskConvForwardImpl::get_workspace_in_bytes(const TensorLayout& src, | |||
| const TensorLayout& dst) { | |||
| MEGDNN_MARK_USED_VAR(mask); | |||
| m_conv_opr->param() = this->param(); | |||
| return m_conv_opr->get_workspace_in_bytes(src, filter, dst); | |||
| return m_conv_opr->get_workspace_in_bytes(src, filter, dst, nullptr); | |||
| } | |||
| void MaskPropagateImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, | |||
| @@ -103,7 +103,7 @@ void SeparableConvForwardImpl::exec(_megdnn_tensor_in src, | |||
| ConvolutionForwardImpl* convOptr = new ConvolutionForwardImpl(this->handle()); | |||
| Workspace empty_wsp; | |||
| convOptr->exec(src, filter2d, dst, empty_wsp); | |||
| convOptr->exec(src, filter2d, dst, nullptr, empty_wsp); | |||
| delete(convOptr); | |||
| free(filter2d_buf); | |||
| @@ -664,7 +664,7 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD) { | |||
| conv_bias_opr->get_workspace_in_bytes( | |||
| tensors[0].layout, filter_transform_layout, | |||
| tensors[2].layout, tensors[3].layout, | |||
| tensors[4].layout); | |||
| tensors[4].layout, nullptr); | |||
| WorkspaceBundle wb(nullptr, {filter_transform_layout.span().dist_byte(), | |||
| conv_bias_workspace_in_bytes, | |||
| @@ -676,7 +676,8 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD) { | |||
| winograd_preprocess_opr->exec(tensors[1], filter_transform_tensor, | |||
| wb.get_workspace(2)); | |||
| conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2], | |||
| tensors[3], tensors[4], wb.get_workspace(1)); | |||
| tensors[3], tensors[4], nullptr, | |||
| wb.get_workspace(1)); | |||
| free(wb.ptr()); | |||
| }; | |||
| @@ -1008,7 +1008,7 @@ void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m, | |||
| conv_bias_opr->param().output_block_size = m; | |||
| size_t conv_bias_workspace_in_bytes = conv_bias_opr->get_workspace_in_bytes( | |||
| tensors[0].layout, filter_transform_layout, tensors[2].layout, | |||
| tensors[3].layout, tensors[4].layout); | |||
| tensors[3].layout, tensors[4].layout, nullptr); | |||
| WorkspaceBundle wb(nullptr, {filter_transform_layout.span().dist_byte(), | |||
| conv_bias_workspace_in_bytes, | |||
| @@ -1020,7 +1020,7 @@ void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m, | |||
| winograd_preprocess_opr->exec(tensors[1], filter_transform_tensor, | |||
| wb.get_workspace(2)); | |||
| conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2], | |||
| tensors[3], tensors[4], wb.get_workspace(1)); | |||
| tensors[3], tensors[4], nullptr, wb.get_workspace(1)); | |||
| free(wb.ptr()); | |||
| }; | |||
| @@ -200,15 +200,70 @@ struct OprProxyProfilingTernary : public OprProxyProfilingBase<Opr, 3> { | |||
| using OprProxyProfilingTernary<c>::OprProxyProfilingTernary; \ | |||
| } | |||
| DEF_PROF3(ConvolutionForward); | |||
| DEF_PROF3(ConvolutionBackwardData); | |||
| DEF_PROF3(ConvolutionBackwardFilter); | |||
| DEF_PROF3(LocalShareForward); | |||
| DEF_PROF3(LocalShareBackwardData); | |||
| DEF_PROF3(LocalShareBackwardFilter); | |||
| #undef DEF_PROF3 | |||
| //! TODO: it should adapt weight preprocess later | |||
| template <> | |||
| struct OprProxy<ConvolutionForward> | |||
| : public OprProxyProfilingTernary<ConvolutionForward> { | |||
| using OprProxyProfilingTernary<ConvolutionForward>::OprProxyProfilingTernary; | |||
| void exec(ConvolutionForward* opr, const TensorNDArray& tensors) { | |||
| megdnn_assert(tensors.size() == 3); | |||
| if (!Base::W.valid()) { | |||
| Base::W = WorkspaceWrapper(opr->handle(), 0); | |||
| } | |||
| if (Base::m_profiling && !Base::target_algo) { | |||
| size_t min_time = std::numeric_limits<size_t>::max(); | |||
| for (auto algo : | |||
| opr->get_all_algorithms(tensors[0].layout, tensors[1].layout, | |||
| tensors[2].layout)) { | |||
| opr->execution_policy().algorithm = algo; | |||
| auto workspace_size = opr->get_workspace_in_bytes( | |||
| tensors[0].layout, tensors[1].layout, tensors[2].layout, | |||
| nullptr); | |||
| Base::W.update(workspace_size); | |||
| for (size_t times = 0; times < Base::warmup_times; ++times) | |||
| opr->exec(tensors[0], tensors[1], tensors[2], nullptr, | |||
| Base::W.workspace()); | |||
| megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||
| Timer timer; | |||
| timer.start(); | |||
| for (size_t times = 0; times < Base::exec_times; ++times) { | |||
| opr->exec(tensors[0], tensors[1], tensors[2], nullptr, | |||
| Base::W.workspace()); | |||
| } | |||
| megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||
| timer.stop(); | |||
| printf("%.3fms %s\n", timer.get_time_in_us() / 1e3, | |||
| algo->name()); | |||
| if (min_time > timer.get_time_in_us()) { | |||
| min_time = timer.get_time_in_us(); | |||
| Base::target_algo = algo; | |||
| } | |||
| } | |||
| opr->execution_policy().algorithm = Base::target_algo; | |||
| auto workspace_size = opr->get_workspace_in_bytes( | |||
| tensors[0].layout, tensors[1].layout, tensors[2].layout, nullptr); | |||
| Base::W.update(workspace_size); | |||
| } | |||
| if (!Base::target_algo) { | |||
| auto workspace_size = opr->get_workspace_in_bytes( | |||
| tensors[0].layout, tensors[1].layout, tensors[2].layout, | |||
| nullptr); | |||
| Base::W.update(workspace_size); | |||
| } | |||
| opr->exec(tensors[0], tensors[1], tensors[2], nullptr, | |||
| Base::W.workspace()); | |||
| } | |||
| }; | |||
| template <class Opr> | |||
| struct OprProxyProfiling5 : public OprProxyProfilingBase<Opr, 5> { | |||
| using Base = OprProxyProfilingBase<Opr, 5>; | |||
| @@ -274,10 +329,67 @@ struct OprProxyProfiling5 : public OprProxyProfilingBase<Opr, 5> { | |||
| DEF_PROF5(DeformableConvForward); | |||
| DEF_PROF5(DeformableConvBackwardFilter); | |||
| DEF_PROF5(ConvBiasForward); | |||
| //DEF_PROF5(ConvBiasForward); | |||
| DEF_PROF5(BatchConvBiasForward); | |||
| #undef DEF_PROF5 | |||
| //! TODO: it should adapt weight preprocess later | |||
| template <> | |||
| struct OprProxy<ConvBiasForward> : public OprProxyProfiling5<ConvBiasForward> { | |||
| using OprProxyProfiling5<ConvBiasForward>::OprProxyProfiling5; | |||
| void exec(ConvBiasForward* opr, const TensorNDArray& tensors) { | |||
| megdnn_assert(tensors.size() == 5); | |||
| if (!Base::W.valid()) { | |||
| Base::W = WorkspaceWrapper(opr->handle(), 0); | |||
| } | |||
| if (Base::m_profiling && !Base::target_algo) { | |||
| size_t min_time = std::numeric_limits<size_t>::max(); | |||
| for (auto algo : | |||
| opr->get_all_algorithms(tensors[0].layout, tensors[1].layout, | |||
| tensors[2].layout, tensors[3].layout, | |||
| tensors[4].layout)) { | |||
| opr->execution_policy().algorithm = algo; | |||
| auto workspace_size = opr->get_workspace_in_bytes( | |||
| tensors[0].layout, tensors[1].layout, tensors[2].layout, | |||
| tensors[3].layout, tensors[4].layout, nullptr); | |||
| Base::W.update(workspace_size); | |||
| for (size_t times = 0; times < Base::warmup_times; ++times) | |||
| opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], | |||
| tensors[4], nullptr, Base::W.workspace()); | |||
| megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||
| Timer timer; | |||
| timer.start(); | |||
| for (size_t times = 0; times < Base::exec_times; ++times) { | |||
| opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], | |||
| tensors[4], nullptr, Base::W.workspace()); | |||
| } | |||
| megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||
| timer.stop(); | |||
| printf("%.3fms %s\n", timer.get_time_in_us() / 1e3, | |||
| algo->name()); | |||
| if (min_time > timer.get_time_in_us()) { | |||
| min_time = timer.get_time_in_us(); | |||
| Base::target_algo = algo; | |||
| } | |||
| } | |||
| opr->execution_policy().algorithm = Base::target_algo; | |||
| auto workspace_size = opr->get_workspace_in_bytes( | |||
| tensors[0].layout, tensors[1].layout, tensors[2].layout, | |||
| tensors[3].layout, tensors[4].layout, nullptr); | |||
| Base::W.update(workspace_size); | |||
| } | |||
| if (!Base::target_algo) { | |||
| auto workspace_size = opr->get_workspace_in_bytes( | |||
| tensors[0].layout, tensors[1].layout, tensors[2].layout, | |||
| tensors[3].layout, tensors[4].layout, nullptr); | |||
| Base::W.update(workspace_size); | |||
| } | |||
| opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], tensors[4], | |||
| nullptr, Base::W.workspace()); | |||
| } | |||
| }; | |||
| template <class Opr> | |||
| struct OprProxyProfiling8 : public OprProxyProfilingBase<Opr, 8> { | |||
| using Base = OprProxyProfilingBase<Opr, 8>; | |||
| @@ -75,8 +75,8 @@ TEST_F(CPU, MASK_PROPAGATE) { | |||
| auto dst = TensorND{dst_ptr, dst_layout}; | |||
| WorkspaceWrapper workspace{ | |||
| handle(), opr->get_workspace_in_bytes(src.layout, filter.layout, | |||
| dst.layout)}; | |||
| opr->exec(src, filter, dst, workspace.workspace()); | |||
| dst.layout, nullptr)}; | |||
| opr->exec(src, filter, dst, nullptr, workspace.workspace()); | |||
| for (size_t i = 0; i < dst.layout.total_nr_elems(); ++i) { | |||
| mask_dst.ptr<int>()[i] = dst_ptr[i] > 0; | |||
| } | |||
| @@ -176,6 +176,46 @@ public: | |||
| } | |||
| } | |||
| //! special for weight preprocess | |||
| void exec_convolution(ConvolutionForward* opr0, ConvolutionForward* opr1) { | |||
| opr0->param().pad_h = pad_h; | |||
| opr0->param().pad_w = pad_w; | |||
| opr1->param() = opr0->param(); | |||
| opr1->param().sparse = param::Convolution::Sparse::GROUP; | |||
| TensorND a0, b0, c0, a1, b1, c1; | |||
| std::tie(a0, b0, c0) = shuffle(std::make_tuple( | |||
| src0->tensornd(), flt0->tensornd(), dst0->tensornd())); | |||
| std::tie(a1, b1, c1) = shuffle(std::make_tuple( | |||
| src1->tensornd(), flt1->tensornd(), dst1->tensornd())); | |||
| WorkspaceWrapper wk( | |||
| handle, | |||
| std::max(opr0->get_workspace_in_bytes(a0.layout, b0.layout, | |||
| c0.layout, nullptr), | |||
| opr1->get_workspace_in_bytes(a1.layout, b1.layout, | |||
| c1.layout, nullptr))); | |||
| cudaProfilerStart(); | |||
| cudaEventRecord(cuda_ev[0], cuda_stream); | |||
| opr0->exec(a0, b0, c0, nullptr, wk.workspace()); | |||
| cudaEventRecord(cuda_ev[1], cuda_stream); | |||
| opr1->exec(a1, b1, c1, nullptr, wk.workspace()); | |||
| cudaEventRecord(cuda_ev[2], cuda_stream); | |||
| cudaProfilerStop(); | |||
| if (getenv("MEGDNN_CHANWISE_CONV_VERBOSE") || | |||
| getenv("MEGDNN_CHANWISE_CONV_FULLBENCH")) { | |||
| cudaStreamSynchronize(cuda_stream); | |||
| float t0 = -1, t1 = -1; | |||
| cudaEventElapsedTime(&t0, cuda_ev[0], cuda_ev[1]); | |||
| cudaEventElapsedTime(&t1, cuda_ev[1], cuda_ev[2]); | |||
| printf("%s;%s;%s: cudnn/megdnn: %.3fms/%.3fms=%.3f\n", | |||
| lsrc.TensorShape::to_string().c_str(), | |||
| lflt1.TensorShape::to_string().c_str(), | |||
| ldst.TensorShape::to_string().c_str(), | |||
| t0, t1, t0 / t1); | |||
| } | |||
| } | |||
| void cmp_dst() { | |||
| Tensor<> dst0_cpu(handle_cpu, ldst), dst1_cpu(handle_cpu, ldst); | |||
| megdnn_memcpy_D2H(handle, | |||
| @@ -399,7 +439,7 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_BENCH_CHECK) { | |||
| benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW); | |||
| benv.fill_src(); | |||
| benv.fill_flt(); | |||
| benv.exec(conv0.get(), conv1.get()); | |||
| benv.exec_convolution(conv0.get(), conv1.get()); | |||
| benv.cmp_dst(); | |||
| }; | |||
| @@ -30,10 +30,10 @@ TEST(DISPATCHER, NULL_DISPATCHER) | |||
| auto layout = TensorLayout({1, 1, 1, 1}, dtype::Float32()); | |||
| TensorND src(nullptr, layout), filter(nullptr, layout), dst(nullptr, layout); | |||
| auto wsize = opr->get_workspace_in_bytes(layout, layout, layout); | |||
| auto wsize = opr->get_workspace_in_bytes(layout, layout, layout, nullptr); | |||
| Workspace workspace(nullptr, wsize); | |||
| opr->exec(src, filter, dst, workspace); | |||
| opr->exec(src, filter, dst, nullptr, workspace); | |||
| } | |||
| #endif | |||
| @@ -217,11 +217,11 @@ TEST_F(NAIVE, CONV_BIAS_QUANTIZED8x8x32_NCHW32) { | |||
| size_t ws_size = conv_opr->get_workspace_in_bytes( | |||
| src_layout_4, filter_layout_4, bias_layout_4, z_layout_4, | |||
| dst_layout_4); | |||
| dst_layout_4, nullptr); | |||
| WorkspaceWrapper ws{handle(), ws_size}; | |||
| conv_opr->exec(src_ts_4.tensornd(), filter_ts_4.tensornd(), | |||
| bias_ts_4.tensornd(), z_ts_4.tensornd(), dst_ts_4.tensornd(), | |||
| ws.workspace()); | |||
| nullptr, ws.workspace()); | |||
| TensorLayout src_layout_32{{N, IC / 32, IH, IW, 32}, | |||
| dtype::QuantizedS8(0.1f)}; | |||
| @@ -209,7 +209,8 @@ TEST_F(NAIVE, CONVOLUTION_WITH_NCHW4) { | |||
| } | |||
| auto workspace_size = conv->get_workspace_in_bytes( | |||
| tensors[0].layout, tensors[1].layout, tensors[2].layout); | |||
| tensors[0].layout, tensors[1].layout, tensors[2].layout, | |||
| nullptr); | |||
| dt_byte* workspace_ptr = static_cast<dt_byte*>(malloc(workspace_size)); | |||
| Workspace workspace{workspace_ptr, workspace_size}; | |||
| @@ -217,7 +218,7 @@ TEST_F(NAIVE, CONVOLUTION_WITH_NCHW4) { | |||
| relayout->exec(nchw4_tensors[0], nchw_tensors[0]); | |||
| relayout->exec(nchw4_tensors[1], nchw_tensors[1]); | |||
| conv->exec(nchw_tensors[0], nchw_tensors[1], nchw_tensors[2], | |||
| conv->exec(nchw_tensors[0], nchw_tensors[1], nchw_tensors[2], nullptr, | |||
| workspace); | |||
| relayout->exec(nchw_tensors[2], nchw4_tensors[2]); | |||
| @@ -1334,8 +1334,8 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_WEIGHT_PREPROCESS) { | |||
| size_t conv_bias_workspace_in_bytes = | |||
| conv_bias_opr->get_workspace_in_bytes( | |||
| tensors[0].layout, filter_transform_layout, | |||
| tensors[2].layout, tensors[3].layout, | |||
| tensors[4].layout); | |||
| tensors[2].layout, tensors[3].layout, tensors[4].layout, | |||
| nullptr); | |||
| WorkspaceBundle wb(nullptr, {filter_transform_layout.span().dist_byte(), | |||
| conv_bias_workspace_in_bytes, | |||
| @@ -1347,7 +1347,8 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_WEIGHT_PREPROCESS) { | |||
| winograd_preprocess_opr->exec(tensors[1], filter_transform_tensor, | |||
| wb.get_workspace(2)); | |||
| conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2], | |||
| tensors[3], tensors[4], wb.get_workspace(1)); | |||
| tensors[3], tensors[4], nullptr, | |||
| wb.get_workspace(1)); | |||
| free(wb.ptr()); | |||
| }; | |||