GitOrigin-RevId: ca860f487e
tags/v0.6.0
| @@ -234,10 +234,10 @@ public: | |||||
| const TensorLayout& dst) = 0; | const TensorLayout& dst) = 0; | ||||
| protected: | protected: | ||||
| CanonizedFilterMeta check_exec(const TensorLayout& src, | |||||
| const TensorLayout& filter, | |||||
| const TensorLayout& dst, | |||||
| size_t workspace_in_bytes); | |||||
| CanonizedFilterMeta check_exec( | |||||
| const TensorLayout& src, const TensorLayout& filter, | |||||
| const TensorLayout& dst, size_t workspace_in_bytes, | |||||
| const PreprocessedFilter* preprocessed_filter); | |||||
| }; | }; | ||||
| using Convolution = ConvolutionForward; | using Convolution = ConvolutionForward; | ||||
| @@ -408,12 +408,11 @@ public: | |||||
| static WinogradParam parse_winograd_name(const std::string& algo_name); | static WinogradParam parse_winograd_name(const std::string& algo_name); | ||||
| protected: | protected: | ||||
| CanonizedFilterMeta check_exec(const TensorLayout& src, | |||||
| const TensorLayout& filter, | |||||
| const TensorLayout& bias, | |||||
| const TensorLayout& z, | |||||
| const TensorLayout& dst, | |||||
| size_t workspace_in_bytes); | |||||
| CanonizedFilterMeta check_exec( | |||||
| const TensorLayout& src, const TensorLayout& filter, | |||||
| const TensorLayout& bias, const TensorLayout& z, | |||||
| const TensorLayout& dst, size_t workspace_in_bytes, | |||||
| const PreprocessedFilter* preprocessed_filter); | |||||
| }; | }; | ||||
| using ConvBias = ConvBiasForward; | using ConvBias = ConvBiasForward; | ||||
| @@ -32,7 +32,8 @@ void ConvBiasForward::deduce_layout(const TensorLayout& src, | |||||
| ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( | ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( | ||||
| const TensorLayout& src, const TensorLayout& filter, | const TensorLayout& src, const TensorLayout& filter, | ||||
| const TensorLayout& bias, const TensorLayout& z, | const TensorLayout& bias, const TensorLayout& z, | ||||
| const TensorLayout& dst, size_t workspace_in_bytes) { | |||||
| const TensorLayout& dst, size_t workspace_in_bytes, | |||||
| const PreprocessedFilter* preprocessed_filter) { | |||||
| if ((param().format == param::ConvBias::Format::NCHW_WINOGRAD || | if ((param().format == param::ConvBias::Format::NCHW_WINOGRAD || | ||||
| param().format == param::ConvBias::Format::NCHW88_WINOGRAD || | param().format == param::ConvBias::Format::NCHW88_WINOGRAD || | ||||
| param().format == param::ConvBias::Format::NCHW44_WINOGRAD) && | param().format == param::ConvBias::Format::NCHW44_WINOGRAD) && | ||||
| @@ -82,9 +83,11 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( | |||||
| auto ret = check_layout_fwd(src, filter, dst); | auto ret = check_layout_fwd(src, filter, dst); | ||||
| megdnn_assert_contiguous(bias); | megdnn_assert_contiguous(bias); | ||||
| auto required_workspace_in_bytes = | |||||
| get_workspace_in_bytes(src, filter, bias, z, dst, nullptr); | |||||
| megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); | |||||
| auto required_workspace_in_bytes = get_workspace_in_bytes( | |||||
| src, filter, bias, z, dst, preprocessed_filter); | |||||
| megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes, | |||||
| "worksapce have size of %zu, but need %zu", | |||||
| workspace_in_bytes, required_workspace_in_bytes); | |||||
| if (bias.ndim != 0) { | if (bias.ndim != 0) { | ||||
| //! bias.layout == dst.layout failed, no assert information | //! bias.layout == dst.layout failed, no assert information | ||||
| auto check_eq = [](const TensorLayout& bias, const TensorLayout& dst) { | auto check_eq = [](const TensorLayout& bias, const TensorLayout& dst) { | ||||
| @@ -1028,10 +1028,11 @@ void ConvolutionForward::deduce_layout(const TensorLayout& src, | |||||
| ConvolutionForward::CanonizedFilterMeta ConvolutionForward::check_exec( | ConvolutionForward::CanonizedFilterMeta ConvolutionForward::check_exec( | ||||
| const TensorLayout& src, const TensorLayout& filter, | const TensorLayout& src, const TensorLayout& filter, | ||||
| const TensorLayout& dst, size_t workspace_in_bytes) { | |||||
| const TensorLayout& dst, size_t workspace_in_bytes, | |||||
| const PreprocessedFilter* preprocessed_filter) { | |||||
| auto ret = check_layout_fwd(src, filter, dst); | auto ret = check_layout_fwd(src, filter, dst); | ||||
| auto required_workspace_in_bytes = | auto required_workspace_in_bytes = | ||||
| get_workspace_in_bytes(src, filter, dst, nullptr); | |||||
| get_workspace_in_bytes(src, filter, dst, preprocessed_filter); | |||||
| megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); | megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); | ||||
| return ret; | return ret; | ||||
| } | } | ||||
| @@ -25,10 +25,10 @@ namespace cuda { | |||||
| void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | ||||
| _megdnn_tensor_in bias, _megdnn_tensor_in z, | _megdnn_tensor_in bias, _megdnn_tensor_in z, | ||||
| _megdnn_tensor_out dst, | _megdnn_tensor_out dst, | ||||
| const PreprocessedFilter*, | |||||
| const PreprocessedFilter* preprocessed_filter, | |||||
| _megdnn_workspace workspace) { | _megdnn_workspace workspace) { | ||||
| check_exec(src.layout, filter.layout, bias.layout, z.layout, dst.layout, | check_exec(src.layout, filter.layout, bias.layout, z.layout, dst.layout, | ||||
| workspace.size); | |||||
| workspace.size, preprocessed_filter); | |||||
| AlgoBase::ExecArgs args(this, src, filter, bias, z, dst, workspace); | AlgoBase::ExecArgs args(this, src, filter, bias, z, dst, workspace); | ||||
| auto algo = get_algorithm(this, src.layout, filter.layout, bias.layout, | auto algo = get_algorithm(this, src.layout, filter.layout, bias.layout, | ||||
| z.layout, dst.layout); | z.layout, dst.layout); | ||||
| @@ -52,13 +52,10 @@ public: | |||||
| const TensorLayout&, const TensorLayout&) override { | const TensorLayout&, const TensorLayout&) override { | ||||
| return {}; | return {}; | ||||
| } | } | ||||
| void exec_preprocess(const TensorLayout& , | |||||
| _megdnn_tensor_in , | |||||
| const TensorLayout& , | |||||
| const TensorLayout& , | |||||
| const TensorLayout& , | |||||
| PreprocessedFilter* , | |||||
| _megdnn_workspace ) override { | |||||
| void exec_preprocess(const TensorLayout&, _megdnn_tensor_in, | |||||
| const TensorLayout&, const TensorLayout&, | |||||
| const TensorLayout&, PreprocessedFilter*, | |||||
| _megdnn_workspace) override { | |||||
| megdnn_throw("cuda conv_bias exec_preprocess has not implemeted yet"); | megdnn_throw("cuda conv_bias exec_preprocess has not implemeted yet"); | ||||
| } | } | ||||
| @@ -119,17 +119,22 @@ SmallVector<ConvBiasImpl::AlgoBase*> ConvBiasImpl::algo_pack() { | |||||
| bool ConvBiasImpl::is_naive_algo(ConvBiasImpl::Algorithm* algo) { | bool ConvBiasImpl::is_naive_algo(ConvBiasImpl::Algorithm* algo) { | ||||
| return algo == nullptr || strcmp(algo->name(), "DEFAULT") == 0; | return algo == nullptr || strcmp(algo->name(), "DEFAULT") == 0; | ||||
| } | } | ||||
| #define NCB_ALGO_FUNC(name, algo, param) \ | |||||
| static_cast<AlgoBase*>(algo)->name(this, param) | |||||
| void ConvBiasImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | void ConvBiasImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | ||||
| _megdnn_tensor_in bias, _megdnn_tensor_in z, | _megdnn_tensor_in bias, _megdnn_tensor_in z, | ||||
| _megdnn_tensor_out dst, | _megdnn_tensor_out dst, | ||||
| const PreprocessedFilter* preprocessed_filter, | const PreprocessedFilter* preprocessed_filter, | ||||
| _megdnn_workspace workspace) { | _megdnn_workspace workspace) { | ||||
| check_exec(src.layout, filter.layout, bias.layout, z.layout, dst.layout, | check_exec(src.layout, filter.layout, bias.layout, z.layout, dst.layout, | ||||
| workspace.size); | |||||
| auto fparam = make_ncb_kern_param(src, filter, bias, dst, workspace); | |||||
| workspace.size, preprocessed_filter); | |||||
| auto fparam = make_ncb_kern_param(src, filter, bias, dst, workspace, | |||||
| preprocessed_filter); | |||||
| ConvBiasImpl::Algorithm* algo = get_algorithm(fparam, workspace.size); | ConvBiasImpl::Algorithm* algo = get_algorithm(fparam, workspace.size); | ||||
| if (!is_naive_algo(algo) && | if (!is_naive_algo(algo) && | ||||
| ncb_algo_get_workspace(algo, fparam) <= workspace.size) { | |||||
| NCB_ALGO_FUNC(get_workspace, algo, fparam) <= workspace.size) { | |||||
| exec_with_ncb_kern(fparam, algo); | exec_with_ncb_kern(fparam, algo); | ||||
| } else { | } else { | ||||
| naive::ConvBiasForwardImpl::exec(src, filter, bias, z, dst, | naive::ConvBiasForwardImpl::exec(src, filter, bias, z, dst, | ||||
| @@ -137,18 +142,71 @@ void ConvBiasImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||||
| } | } | ||||
| } | } | ||||
| void ConvBiasImpl::exec_preprocess(const TensorLayout& src_layout, | |||||
| _megdnn_tensor_in filter, | |||||
| const TensorLayout& bias_layout, | |||||
| const TensorLayout& z_layout, | |||||
| const TensorLayout& dst_layout, | |||||
| PreprocessedFilter* preprocessed_filter, | |||||
| _megdnn_workspace workspace) { | |||||
| //! exec_preprocess currently only support preprocess weights before exec, | |||||
| //! src/dst/bias/z will be ignored, just set to nullptr | |||||
| TensorND src{nullptr, src_layout}, dst{nullptr, dst_layout}, | |||||
| bias{nullptr, bias_layout}; | |||||
| auto fparam = make_ncb_kern_param(src, filter, bias, dst, workspace, | |||||
| preprocessed_filter); | |||||
| ConvolutionImpl::Algorithm* algo = get_algorithm(fparam, workspace.size); | |||||
| if (!is_naive_algo(algo) && NCB_ALGO_FUNC(get_preprocess_workspace, algo, | |||||
| fparam) <= workspace.size) { | |||||
| exec_preprocess_with_ncb_kern(fparam, algo); | |||||
| } else { | |||||
| naive::ConvBiasForwardImpl::exec_preprocess( | |||||
| src_layout, filter, bias_layout, z_layout, dst_layout, | |||||
| preprocessed_filter, workspace); | |||||
| } | |||||
| } | |||||
| size_t ConvBiasImpl::get_workspace_in_bytes( | size_t ConvBiasImpl::get_workspace_in_bytes( | ||||
| const TensorLayout& src, const TensorLayout& filter, | const TensorLayout& src, const TensorLayout& filter, | ||||
| const TensorLayout& bias, const TensorLayout& z, | const TensorLayout& bias, const TensorLayout& z, | ||||
| const TensorLayout& dst, | const TensorLayout& dst, | ||||
| const PreprocessedFilter* preprocessed_filter) { | const PreprocessedFilter* preprocessed_filter) { | ||||
| auto fparam = make_ncb_kern_size_param(src, filter, bias, dst); | |||||
| auto fparam = make_ncb_kern_size_param(src, filter, bias, dst, | |||||
| preprocessed_filter); | |||||
| ConvBiasImpl::Algorithm* algo = get_algorithm(fparam); | ConvBiasImpl::Algorithm* algo = get_algorithm(fparam); | ||||
| if (is_naive_algo(algo)) { | if (is_naive_algo(algo)) { | ||||
| return naive::ConvBiasForwardImpl::get_workspace_in_bytes( | return naive::ConvBiasForwardImpl::get_workspace_in_bytes( | ||||
| src, filter, bias, z, dst, preprocessed_filter); | src, filter, bias, z, dst, preprocessed_filter); | ||||
| } else { | } else { | ||||
| return ncb_algo_get_workspace(algo, fparam); | |||||
| return NCB_ALGO_FUNC(get_workspace, algo, fparam); | |||||
| } | |||||
| } | |||||
| size_t ConvBiasImpl::get_preprocess_workspace_in_bytes( | |||||
| const TensorLayout& src, const TensorLayout& filter, | |||||
| const TensorLayout& bias, const TensorLayout& z, | |||||
| const TensorLayout& dst) { | |||||
| auto fparam = make_ncb_kern_size_param(src, filter, bias, dst, nullptr); | |||||
| Algorithm* algo = get_algorithm(fparam); | |||||
| if (is_naive_algo(algo)) { | |||||
| return naive::ConvBiasForwardImpl::get_preprocess_workspace_in_bytes( | |||||
| src, filter, bias, z, dst); | |||||
| } else { | |||||
| return NCB_ALGO_FUNC(get_preprocess_workspace, algo, fparam); | |||||
| } | |||||
| } | |||||
| SmallVector<TensorLayout> ConvBiasImpl::deduce_preprocessed_filter_layout( | |||||
| const TensorLayout& src, const TensorLayout& filter, | |||||
| const TensorLayout& bias, const TensorLayout& z, | |||||
| const TensorLayout& dst) { | |||||
| auto fparam = make_ncb_kern_size_param(src, filter, bias, dst, nullptr); | |||||
| Algorithm* algo = get_algorithm(fparam); | |||||
| if (is_naive_algo(algo)) { | |||||
| return naive::ConvBiasForwardImpl::deduce_preprocessed_filter_layout( | |||||
| src, filter, bias, z, dst); | |||||
| } else { | |||||
| return NCB_ALGO_FUNC(deduce_preprocessed_filter_layout, algo, fparam); | |||||
| } | } | ||||
| } | } | ||||
| @@ -156,7 +214,7 @@ std::vector<ConvBiasImpl::Algorithm*> ConvBiasImpl::get_all_algorithms( | |||||
| const TensorLayout& src, const TensorLayout& filter, | const TensorLayout& src, const TensorLayout& filter, | ||||
| const TensorLayout& bias, const TensorLayout& z, | const TensorLayout& bias, const TensorLayout& z, | ||||
| const TensorLayout& dst) { | const TensorLayout& dst) { | ||||
| auto fparam = make_ncb_kern_size_param(src, filter, bias, dst); | |||||
| auto fparam = make_ncb_kern_size_param(src, filter, bias, dst, nullptr); | |||||
| auto ret = get_all_algorithms_with_ncb(fparam); | auto ret = get_all_algorithms_with_ncb(fparam); | ||||
| if (ret.empty()) { | if (ret.empty()) { | ||||
| return naive::ConvBiasForwardImpl::get_all_algorithms(src, filter, bias, | return naive::ConvBiasForwardImpl::get_all_algorithms(src, filter, bias, | ||||
| @@ -170,7 +228,7 @@ ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm_heuristic( | |||||
| const TensorLayout& bias, const TensorLayout& z, | const TensorLayout& bias, const TensorLayout& z, | ||||
| const TensorLayout& dst, size_t workspace_limit_in_bytes, | const TensorLayout& dst, size_t workspace_limit_in_bytes, | ||||
| bool reproducible) { | bool reproducible) { | ||||
| auto fparam = make_ncb_kern_size_param(src, filter, bias, dst); | |||||
| auto fparam = make_ncb_kern_size_param(src, filter, bias, dst, nullptr); | |||||
| auto result = get_algorithm_heuristic_with_ncb( | auto result = get_algorithm_heuristic_with_ncb( | ||||
| fparam, workspace_limit_in_bytes, reproducible); | fparam, workspace_limit_in_bytes, reproducible); | ||||
| if (result == nullptr) { | if (result == nullptr) { | ||||
| @@ -181,9 +239,25 @@ ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm_heuristic( | |||||
| return result; | return result; | ||||
| } | } | ||||
| ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm_heuristic_with_ncb( | |||||
| const NCBKernSizeParam& param, size_t workspace_limit_in_bytes, | |||||
| bool reproducible) { | |||||
| for (auto i : get_all_algorithms_with_ncb(param)) { | |||||
| size_t need_workspace = NCB_ALGO_FUNC(get_workspace, i, param); | |||||
| if (static_cast<AlgoBase*>(i)->usable_reproducible( | |||||
| this, param, AlgoSelectionStrategy::HEURISTIC, | |||||
| reproducible) && | |||||
| need_workspace <= workspace_limit_in_bytes) { | |||||
| return i; | |||||
| } | |||||
| } | |||||
| return nullptr; | |||||
| } | |||||
| ConvBiasImpl::NCBKernSizeParam ConvBiasImpl::make_ncb_kern_size_param( | ConvBiasImpl::NCBKernSizeParam ConvBiasImpl::make_ncb_kern_size_param( | ||||
| const TensorLayout& src, const TensorLayout& filter, | const TensorLayout& src, const TensorLayout& filter, | ||||
| const TensorLayout& bias, const TensorLayout& dst) { | |||||
| const TensorLayout& bias, const TensorLayout& dst, | |||||
| const PreprocessedFilter* preprocessed_filter) { | |||||
| auto safe_u32 = [](size_t v) -> uint32_t { | auto safe_u32 = [](size_t v) -> uint32_t { | ||||
| megdnn_assert(v <= std::numeric_limits<uint32_t>::max(), | megdnn_assert(v <= std::numeric_limits<uint32_t>::max(), | ||||
| "value too large: %zu", v); | "value too large: %zu", v); | ||||
| @@ -258,7 +332,9 @@ ConvBiasImpl::NCBKernSizeParam ConvBiasImpl::make_ncb_kern_size_param( | |||||
| {src.stride[0], src.stride[1], src.stride[2], src.stride[3]}, | {src.stride[0], src.stride[1], src.stride[2], src.stride[3]}, | ||||
| {dst.stride[0], dst.stride[1], dst.stride[2], dst.stride[3]}, | {dst.stride[0], dst.stride[1], dst.stride[2], dst.stride[3]}, | ||||
| param().compute_mode, | param().compute_mode, | ||||
| nr_threads}, | |||||
| nr_threads, | |||||
| reinterpret_cast<const ConvolutionForward::PreprocessedFilter*>( | |||||
| preprocessed_filter)}, | |||||
| param().output_block_size, | param().output_block_size, | ||||
| format, | format, | ||||
| bias.dtype, | bias.dtype, | ||||
| @@ -269,10 +345,12 @@ ConvBiasImpl::NCBKernSizeParam ConvBiasImpl::make_ncb_kern_size_param( | |||||
| ConvBiasImpl::NCBKernParam ConvBiasImpl::make_ncb_kern_param( | ConvBiasImpl::NCBKernParam ConvBiasImpl::make_ncb_kern_param( | ||||
| _megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_in bias, | _megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_in bias, | ||||
| _megdnn_tensor_out dst, _megdnn_workspace workspace) { | |||||
| _megdnn_tensor_out dst, _megdnn_workspace workspace, | |||||
| const PreprocessedFilter* preprocessed_filter) { | |||||
| NCBKernParam ret; | NCBKernParam ret; | ||||
| static_cast<NCBKernSizeParam&>(ret) = make_ncb_kern_size_param( | |||||
| src.layout, filter.layout, bias.layout, dst.layout); | |||||
| static_cast<NCBKernSizeParam&>(ret) = | |||||
| make_ncb_kern_size_param(src.layout, filter.layout, bias.layout, | |||||
| dst.layout, preprocessed_filter); | |||||
| ret.src_ptr = src.raw_ptr; | ret.src_ptr = src.raw_ptr; | ||||
| ret.filter_ptr = filter.raw_ptr; | ret.filter_ptr = filter.raw_ptr; | ||||
| ret.bias_ptr = bias.raw_ptr; | ret.bias_ptr = bias.raw_ptr; | ||||
| @@ -284,7 +362,7 @@ ConvBiasImpl::NCBKernParam ConvBiasImpl::make_ncb_kern_param( | |||||
| void ConvBiasImpl::exec_with_ncb_kern(const NCBKernParam& param, | void ConvBiasImpl::exec_with_ncb_kern(const NCBKernParam& param, | ||||
| ConvBiasImpl::Algorithm* algo) { | ConvBiasImpl::Algorithm* algo) { | ||||
| auto ncb_kerns = ncb_algo_dispatch_kerns(algo, param); | |||||
| auto ncb_kerns = NCB_ALGO_FUNC(dispatch_kerns, algo, param); | |||||
| for (auto&& kernel : ncb_kerns) { | for (auto&& kernel : ncb_kerns) { | ||||
| auto run = [kernel, param](size_t index, size_t thread_id) { | auto run = [kernel, param](size_t index, size_t thread_id) { | ||||
| CpuNDRange ndrange_id(kernel.global_size, index); | CpuNDRange ndrange_id(kernel.global_size, index); | ||||
| @@ -295,21 +373,17 @@ void ConvBiasImpl::exec_with_ncb_kern(const NCBKernParam& param, | |||||
| } | } | ||||
| } | } | ||||
| ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm_heuristic_with_ncb( | |||||
| const NCBKernSizeParam& param, size_t workspace_limit_in_bytes, | |||||
| bool reproducible) { | |||||
| return ncb_algo_get_algorithm_heuristic(param, workspace_limit_in_bytes, | |||||
| reproducible); | |||||
| } | |||||
| size_t ConvBiasImpl::ncb_algo_get_workspace(Algorithm* algo, | |||||
| const NCBKernSizeParam& param) { | |||||
| return static_cast<AlgoBase*>(algo)->get_workspace(this, param); | |||||
| } | |||||
| SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::ncb_algo_dispatch_kerns( | |||||
| Algorithm* algo, const NCBKernSizeParam& param) { | |||||
| return static_cast<AlgoBase*>(algo)->dispatch_kerns(this, param); | |||||
| void ConvBiasImpl::exec_preprocess_with_ncb_kern( | |||||
| const NCBKernParam& param, ConvBiasImpl::Algorithm* algo) { | |||||
| auto ncb_kerns = NCB_ALGO_FUNC(dispatch_preprocess_kerns, algo, param); | |||||
| for (auto&& kernel : ncb_kerns) { | |||||
| auto run = [kernel, param](size_t index, size_t thread_id) { | |||||
| CpuNDRange ndrange_id(kernel.global_size, index); | |||||
| kernel.kern(param, {thread_id, ndrange_id}); | |||||
| }; | |||||
| static_cast<naive::HandleImpl*>(handle())->dispatch_kern( | |||||
| run, kernel.global_size.total_size()); | |||||
| } | |||||
| } | } | ||||
| std::vector<ConvBiasImpl::Algorithm*> ConvBiasImpl::get_all_algorithms_with_ncb( | std::vector<ConvBiasImpl::Algorithm*> ConvBiasImpl::get_all_algorithms_with_ncb( | ||||
| @@ -332,20 +406,6 @@ std::vector<ConvBiasImpl::Algorithm*> ConvBiasImpl::get_all_algorithms_with_ncb( | |||||
| return algos; | return algos; | ||||
| } | } | ||||
| ConvBiasImpl::Algorithm* ConvBiasImpl::ncb_algo_get_algorithm_heuristic( | |||||
| const NCBKernSizeParam& param, size_t workspace_limit_in_bytes, | |||||
| bool reproducible) { | |||||
| for (auto i : get_all_algorithms_with_ncb(param)) { | |||||
| if (static_cast<AlgoBase*>(i)->usable_reproducible( | |||||
| this, param, AlgoSelectionStrategy::HEURISTIC, | |||||
| reproducible) && | |||||
| ncb_algo_get_workspace(i, param) <= workspace_limit_in_bytes) { | |||||
| return i; | |||||
| } | |||||
| } | |||||
| return nullptr; | |||||
| } | |||||
| ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm( | ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm( | ||||
| const NCBKernSizeParam& param, size_t workspace_size) { | const NCBKernSizeParam& param, size_t workspace_size) { | ||||
| if (auto set = execution_policy().algorithm) { | if (auto set = execution_policy().algorithm) { | ||||
| @@ -51,6 +51,25 @@ public: | |||||
| _megdnn_tensor_out dst, const PreprocessedFilter*, | _megdnn_tensor_out dst, const PreprocessedFilter*, | ||||
| _megdnn_workspace workspace) override; | _megdnn_workspace workspace) override; | ||||
| void exec_preprocess(const TensorLayout& src_layout, | |||||
| _megdnn_tensor_in filter, | |||||
| const TensorLayout& bias_layout, | |||||
| const TensorLayout& z_layout, | |||||
| const TensorLayout& dst_layout, | |||||
| PreprocessedFilter* preprocessed_filter, | |||||
| _megdnn_workspace workspace) override; | |||||
| SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||||
| const TensorLayout& src, const TensorLayout& filter, | |||||
| const TensorLayout& bias, const TensorLayout& z, | |||||
| const TensorLayout& dst) override; | |||||
| size_t get_preprocess_workspace_in_bytes(const TensorLayout& src, | |||||
| const TensorLayout& filter, | |||||
| const TensorLayout& bias, | |||||
| const TensorLayout& z, | |||||
| const TensorLayout& dst) override; | |||||
| //! implemented by get_workspace_with_ncb() | //! implemented by get_workspace_with_ncb() | ||||
| size_t get_workspace_in_bytes(const TensorLayout& src, | size_t get_workspace_in_bytes(const TensorLayout& src, | ||||
| const TensorLayout& filter, | const TensorLayout& filter, | ||||
| @@ -198,6 +217,23 @@ public: | |||||
| virtual SmallVector<NCBKern> dispatch_kerns( | virtual SmallVector<NCBKern> dispatch_kerns( | ||||
| ConvBiasImpl* opr, const NCBKernSizeParam& param) const = 0; | ConvBiasImpl* opr, const NCBKernSizeParam& param) const = 0; | ||||
| virtual SmallVector<NCBKern> dispatch_preprocess_kerns( | |||||
| ConvBiasImpl*, const NCBKernSizeParam&) const { | |||||
| return {}; | |||||
| }; | |||||
| //! get the layouts of weight_prerocess dst | |||||
| virtual SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||||
| ConvBiasImpl*, const NCBKernSizeParam&) const { | |||||
| return {}; | |||||
| }; | |||||
| //! get the workspace when weight_prerocess | |||||
| virtual size_t get_preprocess_workspace(ConvBiasImpl*, | |||||
| const NCBKernSizeParam&) const { | |||||
| return 0_z; | |||||
| }; | |||||
| //! Temporarily used to identify whether the matmul algorithm is | //! Temporarily used to identify whether the matmul algorithm is | ||||
| //! is_preferred. | //! is_preferred. | ||||
| virtual bool is_preferred(ConvBiasImpl*, | virtual bool is_preferred(ConvBiasImpl*, | ||||
| @@ -219,40 +255,19 @@ public: | |||||
| virtual SmallVector<AlgoBase*> algo_pack(); | virtual SmallVector<AlgoBase*> algo_pack(); | ||||
| protected: | protected: | ||||
| //! default impl calls ncb_algo_dispatch_kern() | |||||
| virtual void exec_with_ncb_kern(const NCBKernParam& param, | virtual void exec_with_ncb_kern(const NCBKernParam& param, | ||||
| ConvBiasImpl::Algorithm* algo); | ConvBiasImpl::Algorithm* algo); | ||||
| //! default impl calls ncb_algo_get_all_algorithms() | |||||
| virtual void exec_preprocess_with_ncb_kern(const NCBKernParam& param, | |||||
| Algorithm* algo); | |||||
| virtual std::vector<Algorithm*> get_all_algorithms_with_ncb( | virtual std::vector<Algorithm*> get_all_algorithms_with_ncb( | ||||
| const NCBKernSizeParam& param); | const NCBKernSizeParam& param); | ||||
| //! default impl calls ncb_algo_get_algorithm_heuristic() | |||||
| virtual Algorithm* get_algorithm_heuristic_with_ncb( | virtual Algorithm* get_algorithm_heuristic_with_ncb( | ||||
| const NCBKernSizeParam& param, size_t workspace_limit_in_bytes, | const NCBKernSizeParam& param, size_t workspace_limit_in_bytes, | ||||
| bool reproducible = false); | bool reproducible = false); | ||||
| /** | |||||
| * \brief get kernel pointer for non-contiguous batch kernel or | |||||
| * simply conv bias kernel. | |||||
| * | |||||
| * whether the kernel processing batch 1-group is decided by the | |||||
| * algo. | |||||
| */ | |||||
| virtual SmallVector<NCBKern> ncb_algo_dispatch_kerns( | |||||
| Algorithm* algo, const NCBKernSizeParam& param); | |||||
| virtual size_t ncb_algo_get_workspace(Algorithm* algo, | |||||
| const NCBKernSizeParam& param); | |||||
| /*! | |||||
| * the default impl iterates over all ncb_algo_get_all_algorithms() | |||||
| * and return the first one whose workspace does not exceed the limit. | |||||
| */ | |||||
| virtual Algorithm* ncb_algo_get_algorithm_heuristic( | |||||
| const NCBKernSizeParam& param, size_t workspace_limit_in_bytes, | |||||
| bool reproducible = false); | |||||
| const char* get_algorithm_set_name() const override; | const char* get_algorithm_set_name() const override; | ||||
| private: | private: | ||||
| @@ -276,16 +291,16 @@ private: | |||||
| const NCBKernSizeParam& param, | const NCBKernSizeParam& param, | ||||
| size_t workspace_size = std::numeric_limits<size_t>::max()); | size_t workspace_size = std::numeric_limits<size_t>::max()); | ||||
| NCBKernSizeParam make_ncb_kern_size_param(const TensorLayout& src, | |||||
| const TensorLayout& filter, | |||||
| const TensorLayout& bias, | |||||
| const TensorLayout& dst); | |||||
| NCBKernParam make_ncb_kern_param(_megdnn_tensor_in src, | |||||
| _megdnn_tensor_in filter, | |||||
| _megdnn_tensor_in bias, | |||||
| _megdnn_tensor_out dst, | |||||
| _megdnn_workspace workspace); | |||||
| NCBKernSizeParam make_ncb_kern_size_param( | |||||
| const TensorLayout& src, const TensorLayout& filter, | |||||
| const TensorLayout& bias, const TensorLayout& dst, | |||||
| const PreprocessedFilter* preprocessed_filter); | |||||
| NCBKernParam make_ncb_kern_param( | |||||
| _megdnn_tensor_in src, _megdnn_tensor_in filter, | |||||
| _megdnn_tensor_in bias, _megdnn_tensor_out dst, | |||||
| _megdnn_workspace workspace, | |||||
| const PreprocessedFilter* preprocessed_filter); | |||||
| }; | }; | ||||
| } // namespace fallback | } // namespace fallback | ||||
| @@ -376,7 +376,67 @@ size_t ConvolutionImpl::AlgoDefault::get_workspace( | |||||
| return get_bundle(param).total_size_in_bytes(); | return get_bundle(param).total_size_in_bytes(); | ||||
| } | } | ||||
| //! Return the implment kernel | |||||
| size_t ConvolutionImpl::AlgoDefault::get_preprocess_workspace( | |||||
| ConvolutionImpl*, const NCBKernSizeParam& param) const { | |||||
| ::ConvBiasImpl::NCBKernSizeParam conv_bias_param = | |||||
| init_convbias_opr_and_param(m_conv_bias_opr, param); | |||||
| m_conv_bias_opr->execution_policy() = {m_algorithm}; | |||||
| return m_algorithm->get_preprocess_workspace(m_conv_bias_opr, | |||||
| conv_bias_param); | |||||
| } | |||||
| SmallVector<TensorLayout> | |||||
| ConvolutionImpl::AlgoDefault::deduce_preprocessed_filter_layout( | |||||
| ConvolutionImpl*, const NCBKernSizeParam& param) const { | |||||
| ::ConvBiasImpl::NCBKernSizeParam conv_bias_param = | |||||
| init_convbias_opr_and_param(m_conv_bias_opr, param); | |||||
| m_conv_bias_opr->execution_policy() = {m_algorithm}; | |||||
| return m_algorithm->deduce_preprocessed_filter_layout(m_conv_bias_opr, | |||||
| conv_bias_param); | |||||
| } | |||||
| //! Return the implement preprocess kernel | |||||
| SmallVector<ConvolutionImpl::NCBKern> | |||||
| ConvolutionImpl::AlgoDefault::get_preprocess_kimpl( | |||||
| ::ConvBiasImpl* conv_bias_opr, ConvBiasImpl::AlgoBase* algo, | |||||
| const NCBKernSizeParam& param) { | |||||
| MIDOUT_BEGIN(megdnn_fallback_conv, midout_iv("get_preprocess_kimpl"_hash)) { | |||||
| // construct the conv_bias kern param | |||||
| ::ConvBiasImpl::NCBKernParam conv_bias_param; | |||||
| ::ConvBiasImpl::NCBKernSizeParam conv_bias_size_param = | |||||
| init_convbias_opr_and_param(conv_bias_opr, param); | |||||
| static_cast<::ConvBiasImpl::NCBKernSizeParam&>(conv_bias_param) = | |||||
| conv_bias_size_param; | |||||
| auto conv_bias_preprocess_kerns = | |||||
| algo->dispatch_preprocess_kerns(conv_bias_opr, conv_bias_param); | |||||
| SmallVector<ConvolutionImpl::NCBKern> convolution_preprocess_kerns; | |||||
| //! Set the conv_bias param using convolution param | |||||
| auto set_copy_param_filter_workspace_ptr = | |||||
| [](const NCBKernParam& conv_param, | |||||
| ::ConvBiasImpl::NCBKernParam& copied_param) { | |||||
| copied_param.filter_ptr = conv_param.filter_ptr; | |||||
| copied_param.workspace_ptr = conv_param.workspace_ptr; | |||||
| copied_param.workspace_size = conv_param.workspace_size; | |||||
| }; | |||||
| for (size_t i = 0; i < conv_bias_preprocess_kerns.size(); i++) { | |||||
| auto kernel = conv_bias_preprocess_kerns[i]; | |||||
| //! If the kerenl batch parallel | |||||
| auto run = [=](const NCBKernParam& p, | |||||
| const NCBKernIndex& ncb_index) { | |||||
| auto copy_param = conv_bias_param; | |||||
| set_copy_param_filter_workspace_ptr(p, copy_param); | |||||
| kernel.kern(copy_param, | |||||
| {ncb_index.thread_id, ncb_index.ndrange_id}); | |||||
| }; | |||||
| convolution_preprocess_kerns.push_back({run, kernel.global_size}); | |||||
| } | |||||
| return convolution_preprocess_kerns; | |||||
| } | |||||
| MIDOUT_END(); | |||||
| } | |||||
| //! Return the implement kernel | |||||
| SmallVector<ConvolutionImpl::NCBKern> ConvolutionImpl::AlgoDefault::get_kimpl( | SmallVector<ConvolutionImpl::NCBKern> ConvolutionImpl::AlgoDefault::get_kimpl( | ||||
| ::ConvBiasImpl* conv_bias_opr, ConvBiasImpl::AlgoBase* algo, | ::ConvBiasImpl* conv_bias_opr, ConvBiasImpl::AlgoBase* algo, | ||||
| const NCBKernSizeParam& param) { | const NCBKernSizeParam& param) { | ||||
| @@ -392,7 +452,7 @@ SmallVector<ConvolutionImpl::NCBKern> ConvolutionImpl::AlgoDefault::get_kimpl( | |||||
| SmallVector<ConvolutionImpl::NCBKern> convolution_kerns; | SmallVector<ConvolutionImpl::NCBKern> convolution_kerns; | ||||
| //! Set the conv_bias param using convolution param | //! Set the conv_bias param using convolution param | ||||
| auto set_copy_param_run_time_address = | |||||
| auto set_copy_param_compute_address = | |||||
| [](const NCBKernParam& conv_param, | [](const NCBKernParam& conv_param, | ||||
| ::ConvBiasImpl::NCBKernParam& copied_param) { | ::ConvBiasImpl::NCBKernParam& copied_param) { | ||||
| copied_param.src_ptr = conv_param.src_ptr; | copied_param.src_ptr = conv_param.src_ptr; | ||||
| @@ -407,7 +467,7 @@ SmallVector<ConvolutionImpl::NCBKern> ConvolutionImpl::AlgoDefault::get_kimpl( | |||||
| auto run = [=](const NCBKernParam& p, | auto run = [=](const NCBKernParam& p, | ||||
| const NCBKernIndex& ncb_index) { | const NCBKernIndex& ncb_index) { | ||||
| auto copy_param = conv_bias_param; | auto copy_param = conv_bias_param; | ||||
| set_copy_param_run_time_address(p, copy_param); | |||||
| set_copy_param_compute_address(p, copy_param); | |||||
| kernel.kern(copy_param, | kernel.kern(copy_param, | ||||
| {ncb_index.thread_id, ncb_index.ndrange_id}); | {ncb_index.thread_id, ncb_index.ndrange_id}); | ||||
| }; | }; | ||||
| @@ -110,6 +110,9 @@ class ConvolutionImpl::AlgoDefault final : public AlgoBase { | |||||
| static SmallVector<NCBKern> get_kimpl(ConvBiasImpl* conv_bias_opr, | static SmallVector<NCBKern> get_kimpl(ConvBiasImpl* conv_bias_opr, | ||||
| ConvBiasImpl::AlgoBase* algo, | ConvBiasImpl::AlgoBase* algo, | ||||
| const NCBKernSizeParam& param); | const NCBKernSizeParam& param); | ||||
| static SmallVector<NCBKern> get_preprocess_kimpl( | |||||
| ConvBiasImpl* conv_bias_opr, ConvBiasImpl::AlgoBase* algo, | |||||
| const NCBKernSizeParam& param); | |||||
| public: | public: | ||||
| AlgoDefault(fallback::ConvBiasImpl* conv_bias_opr, ConvBiasImpl::AlgoBase*); | AlgoDefault(fallback::ConvBiasImpl* conv_bias_opr, ConvBiasImpl::AlgoBase*); | ||||
| @@ -121,6 +124,17 @@ public: | |||||
| size_t get_workspace(ConvolutionImpl* opr, | size_t get_workspace(ConvolutionImpl* opr, | ||||
| const NCBKernSizeParam& param) const override; | const NCBKernSizeParam& param) const override; | ||||
| size_t get_preprocess_workspace(ConvolutionImpl*, | |||||
| const NCBKernSizeParam&) const override; | |||||
| SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||||
| ConvolutionImpl*, const NCBKernSizeParam&) const override; | |||||
| SmallVector<NCBKern> dispatch_preprocess_kern( | |||||
| ConvolutionImpl*, const NCBKernSizeParam& param) const override { | |||||
| return get_preprocess_kimpl(m_conv_bias_opr, m_algorithm, param); | |||||
| } | |||||
| SmallVector<NCBKern> dispatch_kern( | SmallVector<NCBKern> dispatch_kern( | ||||
| ConvolutionImpl* /*opr*/, | ConvolutionImpl* /*opr*/, | ||||
| const NCBKernSizeParam& param) const override { | const NCBKernSizeParam& param) const override { | ||||
| @@ -80,14 +80,19 @@ SmallVector<ConvolutionImpl::AlgoBase*> ConvolutionImpl::algo_pack() { | |||||
| bool ConvolutionImpl::is_naive_algo(ConvolutionImpl::Algorithm* algo) { | bool ConvolutionImpl::is_naive_algo(ConvolutionImpl::Algorithm* algo) { | ||||
| return algo == nullptr || strcmp(algo->name(), "DEFAULT") == 0; | return algo == nullptr || strcmp(algo->name(), "DEFAULT") == 0; | ||||
| } | } | ||||
//! Call the AlgoBase member function \p name on \p algo, passing the
//! enclosing operator (`this`) and \p param. Must be expanded inside a
//! non-static member function. The \p param argument is forwarded as given,
//! so the caller's variable does not need any particular name.
#define NCB_ALGO_FUNC(name, algo, param) \
    static_cast<AlgoBase*>(algo)->name(this, param)
| void ConvolutionImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | void ConvolutionImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | ||||
| _megdnn_tensor_out dst, | _megdnn_tensor_out dst, | ||||
| const PreprocessedFilter* preprocessed_filter, | const PreprocessedFilter* preprocessed_filter, | ||||
| _megdnn_workspace workspace) { | _megdnn_workspace workspace) { | ||||
| auto fparam = make_ncb_kern_param(src, filter, dst, workspace); | |||||
| auto fparam = make_ncb_kern_param(src, filter, dst, preprocessed_filter, | |||||
| workspace); | |||||
| ConvolutionImpl::Algorithm* algo = get_algorithm(fparam, workspace.size); | ConvolutionImpl::Algorithm* algo = get_algorithm(fparam, workspace.size); | ||||
| if (!is_naive_algo(algo) && | if (!is_naive_algo(algo) && | ||||
| ncb_algo_get_workspace(algo, fparam) <= workspace.size) { | |||||
| NCB_ALGO_FUNC(get_workspace, algo, fparam) <= workspace.size) { | |||||
| exec_with_ncb_kern(fparam, algo); | exec_with_ncb_kern(fparam, algo); | ||||
| } else { | } else { | ||||
| naive::ConvolutionForwardImpl::exec(src, filter, dst, | naive::ConvolutionForwardImpl::exec(src, filter, dst, | ||||
| @@ -95,24 +100,73 @@ void ConvolutionImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||||
| } | } | ||||
| } | } | ||||
| void ConvolutionImpl::exec_preprocess(const TensorLayout& src_layout, | |||||
| _megdnn_tensor_in filter, | |||||
| const TensorLayout& dst_layout, | |||||
| PreprocessedFilter* preprocessed_filter, | |||||
| _megdnn_workspace workspace) { | |||||
| //! exec_preprocess currently only support preprocess weights before exec, | |||||
| //! src/dst will be ignored, just set to nullptr | |||||
| TensorND src{nullptr, src_layout}, dst{nullptr, dst_layout}; | |||||
| auto fparam = make_ncb_kern_param(src, filter, dst, preprocessed_filter, | |||||
| workspace); | |||||
| ConvolutionImpl::Algorithm* algo = get_algorithm(fparam, workspace.size); | |||||
| if (!is_naive_algo(algo) && NCB_ALGO_FUNC(get_preprocess_workspace, algo, | |||||
| fparam) <= workspace.size) { | |||||
| exec_preprocess_with_ncb_kern(fparam, algo); | |||||
| } else { | |||||
| naive::ConvolutionForwardImpl::exec_preprocess( | |||||
| src_layout, filter, dst_layout, preprocessed_filter, workspace); | |||||
| } | |||||
| } | |||||
| size_t ConvolutionImpl::get_workspace_in_bytes( | size_t ConvolutionImpl::get_workspace_in_bytes( | ||||
| const TensorLayout& src, const TensorLayout& filter, | const TensorLayout& src, const TensorLayout& filter, | ||||
| const TensorLayout& dst, | const TensorLayout& dst, | ||||
| const PreprocessedFilter* preprocessed_filter) { | const PreprocessedFilter* preprocessed_filter) { | ||||
| auto fparam = make_ncb_kern_size_param(src, filter, dst); | |||||
| auto fparam = | |||||
| make_ncb_kern_size_param(src, filter, dst, preprocessed_filter); | |||||
| Algorithm* algo = get_algorithm(fparam); | Algorithm* algo = get_algorithm(fparam); | ||||
| if (is_naive_algo(algo)) { | if (is_naive_algo(algo)) { | ||||
| return naive::ConvolutionForwardImpl::get_workspace_in_bytes( | return naive::ConvolutionForwardImpl::get_workspace_in_bytes( | ||||
| src, filter, dst, preprocessed_filter); | src, filter, dst, preprocessed_filter); | ||||
| } else { | } else { | ||||
| return ncb_algo_get_workspace(algo, fparam); | |||||
| return static_cast<AlgoBase*>(algo)->get_workspace(this, fparam); | |||||
| } | |||||
| } | |||||
| size_t ConvolutionImpl::get_preprocess_workspace_in_bytes( | |||||
| const TensorLayout& src, const TensorLayout& filter, | |||||
| const TensorLayout& dst) { | |||||
| auto fparam = make_ncb_kern_size_param(src, filter, dst, nullptr); | |||||
| Algorithm* algo = get_algorithm(fparam); | |||||
| if (is_naive_algo(algo)) { | |||||
| return naive::ConvolutionForwardImpl::get_preprocess_workspace_in_bytes( | |||||
| src, filter, dst); | |||||
| } else { | |||||
| return static_cast<AlgoBase*>(algo)->get_preprocess_workspace(this, | |||||
| fparam); | |||||
| } | |||||
| } | |||||
| SmallVector<TensorLayout> ConvolutionImpl::deduce_preprocessed_filter_layout( | |||||
| const TensorLayout& src, const TensorLayout& filter, | |||||
| const TensorLayout& dst){ | |||||
| auto fparam = make_ncb_kern_size_param(src, filter, dst, nullptr); | |||||
| Algorithm* algo = get_algorithm(fparam); | |||||
| if (is_naive_algo(algo)) { | |||||
| return naive::ConvolutionForwardImpl::deduce_preprocessed_filter_layout( | |||||
| src, filter, dst); | |||||
| } else { | |||||
| return static_cast<AlgoBase*>(algo)->deduce_preprocessed_filter_layout( | |||||
| this, fparam); | |||||
| } | } | ||||
| } | } | ||||
| std::vector<ConvolutionImpl::Algorithm*> ConvolutionImpl::get_all_algorithms( | std::vector<ConvolutionImpl::Algorithm*> ConvolutionImpl::get_all_algorithms( | ||||
| const TensorLayout& src, const TensorLayout& filter, | const TensorLayout& src, const TensorLayout& filter, | ||||
| const TensorLayout& dst) { | const TensorLayout& dst) { | ||||
| auto fparam = make_ncb_kern_size_param(src, filter, dst); | |||||
| auto fparam = make_ncb_kern_size_param(src, filter, dst, nullptr); | |||||
| auto ret = get_all_algorithms_with_ncb(fparam); | auto ret = get_all_algorithms_with_ncb(fparam); | ||||
| if (ret.empty()) { | if (ret.empty()) { | ||||
| return naive::ConvolutionForwardImpl::get_all_algorithms(src, filter, | return naive::ConvolutionForwardImpl::get_all_algorithms(src, filter, | ||||
| @@ -125,7 +179,7 @@ ConvolutionImpl::Algorithm* ConvolutionImpl::get_algorithm_heuristic( | |||||
| const TensorLayout& src, const TensorLayout& filter, | const TensorLayout& src, const TensorLayout& filter, | ||||
| const TensorLayout& dst, size_t workspace_limit_in_bytes, | const TensorLayout& dst, size_t workspace_limit_in_bytes, | ||||
| bool reproducible) { | bool reproducible) { | ||||
| auto fparam = make_ncb_kern_size_param(src, filter, dst); | |||||
| auto fparam = make_ncb_kern_size_param(src, filter, dst, nullptr); | |||||
| auto result = get_algorithm_heuristic_with_ncb( | auto result = get_algorithm_heuristic_with_ncb( | ||||
| fparam, workspace_limit_in_bytes, reproducible); | fparam, workspace_limit_in_bytes, reproducible); | ||||
| if (result == nullptr) { | if (result == nullptr) { | ||||
| @@ -137,7 +191,8 @@ ConvolutionImpl::Algorithm* ConvolutionImpl::get_algorithm_heuristic( | |||||
| ConvolutionImpl::NCBKernSizeParam ConvolutionImpl::make_ncb_kern_size_param( | ConvolutionImpl::NCBKernSizeParam ConvolutionImpl::make_ncb_kern_size_param( | ||||
| const TensorLayout& src, const TensorLayout& filter, | const TensorLayout& src, const TensorLayout& filter, | ||||
| const TensorLayout& dst) { | |||||
| const TensorLayout& dst, | |||||
| const PreprocessedFilter* preprocessed_filter) { | |||||
| auto safe_u32 = [](size_t v) -> uint32_t { | auto safe_u32 = [](size_t v) -> uint32_t { | ||||
| megdnn_assert(v <= std::numeric_limits<uint32_t>::max(), | megdnn_assert(v <= std::numeric_limits<uint32_t>::max(), | ||||
| "value too large: %zu", v); | "value too large: %zu", v); | ||||
| @@ -175,15 +230,17 @@ ConvolutionImpl::NCBKernSizeParam ConvolutionImpl::make_ncb_kern_size_param( | |||||
| {src.stride[0], src.stride[1], src.stride[2], src.stride[3]}, | {src.stride[0], src.stride[1], src.stride[2], src.stride[3]}, | ||||
| {dst.stride[0], dst.stride[1], dst.stride[2], dst.stride[3]}, | {dst.stride[0], dst.stride[1], dst.stride[2], dst.stride[3]}, | ||||
| param().compute_mode, | param().compute_mode, | ||||
| nr_threads}; | |||||
| nr_threads, | |||||
| preprocessed_filter}; | |||||
| } | } | ||||
| ConvolutionImpl::NCBKernParam ConvolutionImpl::make_ncb_kern_param( | ConvolutionImpl::NCBKernParam ConvolutionImpl::make_ncb_kern_param( | ||||
| _megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_out dst, | _megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_out dst, | ||||
| const PreprocessedFilter* preprocessed_filter, | |||||
| _megdnn_workspace workspace) { | _megdnn_workspace workspace) { | ||||
| NCBKernParam ret; | NCBKernParam ret; | ||||
| static_cast<NCBKernSizeParam&>(ret) = | |||||
| make_ncb_kern_size_param(src.layout, filter.layout, dst.layout); | |||||
| static_cast<NCBKernSizeParam&>(ret) = make_ncb_kern_size_param( | |||||
| src.layout, filter.layout, dst.layout, preprocessed_filter); | |||||
| ret.src_ptr = src.raw_ptr; | ret.src_ptr = src.raw_ptr; | ||||
| ret.filter_ptr = filter.raw_ptr; | ret.filter_ptr = filter.raw_ptr; | ||||
| ret.dst_ptr = dst.raw_ptr; | ret.dst_ptr = dst.raw_ptr; | ||||
| @@ -192,9 +249,30 @@ ConvolutionImpl::NCBKernParam ConvolutionImpl::make_ncb_kern_param( | |||||
| return ret; | return ret; | ||||
| } | } | ||||
| void ConvolutionImpl::exec_preprocess_with_ncb_kern(const NCBKernParam& param, | |||||
| Algorithm* algo) { | |||||
| auto kerns = | |||||
| static_cast<AlgoBase*>(algo)->dispatch_preprocess_kern(this, param); | |||||
| auto fallback_handle = handle(); | |||||
| for (auto kernel : kerns) { | |||||
| megdnn_assert( | |||||
| param.filter_meta.format == Param::Format::NCHW || | |||||
| param.filter_meta.format == Param::Format::NHWC || | |||||
| param.filter_meta.format == Param::Format::NCHW88 || | |||||
| param.filter_meta.format == Param::Format::NCHW44, | |||||
| "invalid conv format"); | |||||
| auto run = [param, kernel](size_t index, size_t thread_id) { | |||||
| CpuNDRange ndrange_id(kernel.global_size, index); | |||||
| kernel.kern(param, {thread_id, ndrange_id}); | |||||
| }; | |||||
| static_cast<naive::HandleImpl*>(fallback_handle) | |||||
| ->dispatch_kern(run, kernel.global_size.total_size()); | |||||
| } | |||||
| } | |||||
| void ConvolutionImpl::exec_with_ncb_kern(const NCBKernParam& param, | void ConvolutionImpl::exec_with_ncb_kern(const NCBKernParam& param, | ||||
| Algorithm* algo) { | Algorithm* algo) { | ||||
| auto kerns = ncb_algo_dispatch_kern(algo, param); | |||||
| auto kerns = static_cast<AlgoBase*>(algo)->dispatch_kern(this, param); | |||||
| auto fallback_handle = handle(); | auto fallback_handle = handle(); | ||||
| for (auto kernel : kerns) { | for (auto kernel : kerns) { | ||||
| megdnn_assert(param.filter_meta.format == Param::Format::NCHW || | megdnn_assert(param.filter_meta.format == Param::Format::NCHW || | ||||
| @@ -215,10 +293,13 @@ ConvolutionImpl::Algorithm* ConvolutionImpl::get_algorithm_heuristic_with_ncb( | |||||
| const NCBKernSizeParam& param, size_t workspace_limit_in_bytes, | const NCBKernSizeParam& param, size_t workspace_limit_in_bytes, | ||||
| bool reproducible) { | bool reproducible) { | ||||
| for (auto i : get_all_algorithms_with_ncb(param)) { | for (auto i : get_all_algorithms_with_ncb(param)) { | ||||
| if (static_cast<AlgoBase*>(i)->usable_reproducible( | |||||
| this, param, AlgoSelectionStrategy::HEURISTIC, | |||||
| reproducible) && | |||||
| ncb_algo_get_workspace(i, param) <= workspace_limit_in_bytes) { | |||||
| size_t need_workspace = | |||||
| static_cast<AlgoBase*>(i)->get_workspace(this, param); | |||||
| bool usable_reproducible = | |||||
| static_cast<AlgoBase*>(i)->usable_reproducible( | |||||
| this, param, AlgoSelectionStrategy::HEURISTIC, | |||||
| reproducible); | |||||
| if (usable_reproducible && need_workspace <= workspace_limit_in_bytes) { | |||||
| return i; | return i; | ||||
| } | } | ||||
| } | } | ||||
| @@ -39,12 +39,26 @@ public: | |||||
| _megdnn_tensor_out dst, const PreprocessedFilter*, | _megdnn_tensor_out dst, const PreprocessedFilter*, | ||||
| _megdnn_workspace workspace) override; | _megdnn_workspace workspace) override; | ||||
| void exec_preprocess(const TensorLayout& src_layout, | |||||
| _megdnn_tensor_in filter, | |||||
| const TensorLayout& dst_layout, | |||||
| PreprocessedFilter* preprocessed_filter, | |||||
| _megdnn_workspace workspace) override; | |||||
| //! implemented by get_workspace_with_ncb() | //! implemented by get_workspace_with_ncb() | ||||
| size_t get_workspace_in_bytes(const TensorLayout& src, | size_t get_workspace_in_bytes(const TensorLayout& src, | ||||
| const TensorLayout& filter, | const TensorLayout& filter, | ||||
| const TensorLayout& dst, | const TensorLayout& dst, | ||||
| const PreprocessedFilter*) override; | const PreprocessedFilter*) override; | ||||
| SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||||
| const TensorLayout& src, const TensorLayout& filter, | |||||
| const TensorLayout& dst) override; | |||||
| size_t get_preprocess_workspace_in_bytes(const TensorLayout& src, | |||||
| const TensorLayout& filter, | |||||
| const TensorLayout& dst) override; | |||||
| //! implemented by get_all_algorithms_with_ncb() | //! implemented by get_all_algorithms_with_ncb() | ||||
| std::vector<Algorithm*> get_all_algorithms( | std::vector<Algorithm*> get_all_algorithms( | ||||
| const TensorLayout& src, const TensorLayout& filter, | const TensorLayout& src, const TensorLayout& filter, | ||||
| @@ -70,6 +84,8 @@ public: | |||||
| ptrdiff_t inp_s[4], out_s[4]; | ptrdiff_t inp_s[4], out_s[4]; | ||||
| Param::ComputeMode compute_mode; | Param::ComputeMode compute_mode; | ||||
| size_t nr_threads; | size_t nr_threads; | ||||
| //! weight_preprocess info | |||||
| const PreprocessedFilter* preprocessed_filter; | |||||
| }; | }; | ||||
| //! memory param for kernels with non-contiguous batch | //! memory param for kernels with non-contiguous batch | ||||
| @@ -169,6 +185,23 @@ public: | |||||
| virtual SmallVector<NCBKern> dispatch_kern( | virtual SmallVector<NCBKern> dispatch_kern( | ||||
| ConvolutionImpl* opr, const NCBKernSizeParam& param) const = 0; | ConvolutionImpl* opr, const NCBKernSizeParam& param) const = 0; | ||||
| virtual SmallVector<NCBKern> dispatch_preprocess_kern( | |||||
| ConvolutionImpl*, const NCBKernSizeParam&) const { | |||||
| return {}; | |||||
| }; | |||||
| //! get the layouts of weight_prerocess dst | |||||
| virtual SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||||
| ConvolutionImpl*, const NCBKernSizeParam&) const { | |||||
| return {}; | |||||
| }; | |||||
| //! get the workspace when weight_prerocess | |||||
| virtual size_t get_preprocess_workspace(ConvolutionImpl*, | |||||
| const NCBKernSizeParam&) const { | |||||
| return 0_z; | |||||
| }; | |||||
| //! Temporarily used to identify whether the matmul algorithm is | //! Temporarily used to identify whether the matmul algorithm is | ||||
| //! is_preferred. | //! is_preferred. | ||||
| virtual bool is_preferred(ConvolutionImpl*, | virtual bool is_preferred(ConvolutionImpl*, | ||||
| @@ -192,6 +225,9 @@ public: | |||||
| protected: | protected: | ||||
| virtual void exec_with_ncb_kern(const NCBKernParam& param, Algorithm* algo); | virtual void exec_with_ncb_kern(const NCBKernParam& param, Algorithm* algo); | ||||
| virtual void exec_preprocess_with_ncb_kern(const NCBKernParam& param, | |||||
| Algorithm* algo); | |||||
| virtual std::vector<Algorithm*> get_all_algorithms_with_ncb( | virtual std::vector<Algorithm*> get_all_algorithms_with_ncb( | ||||
| const NCBKernSizeParam& param); | const NCBKernSizeParam& param); | ||||
| @@ -199,21 +235,6 @@ protected: | |||||
| const NCBKernSizeParam& param, size_t workspace_limit_in_bytes, | const NCBKernSizeParam& param, size_t workspace_limit_in_bytes, | ||||
| bool reproducible = false); | bool reproducible = false); | ||||
| //! get kernel pointer | |||||
| virtual SmallVector<NCBKern> ncb_algo_dispatch_kern( | |||||
| Algorithm* algo, const NCBKernSizeParam& param) { | |||||
| return static_cast<AlgoBase*>(algo)->dispatch_kern(this, param); | |||||
| } | |||||
| //! get algo workspace | |||||
| virtual size_t ncb_algo_get_workspace(Algorithm* algo, | |||||
| const NCBKernSizeParam& param) { | |||||
| return static_cast<AlgoBase*>(algo)->get_workspace(this, param); | |||||
| } | |||||
| /*! | |||||
| * the default impl iterates over all ncb_1g_get_all_algorithms() | |||||
| * and return the first one whose workspace does not exceed the limit. | |||||
| */ | |||||
| const char* get_algorithm_set_name() const override; | const char* get_algorithm_set_name() const override; | ||||
| class AlgoFallback; | class AlgoFallback; | ||||
| @@ -231,14 +252,16 @@ private: | |||||
| const NCBKernSizeParam& param, | const NCBKernSizeParam& param, | ||||
| size_t workspace_size = std::numeric_limits<size_t>::max()); | size_t workspace_size = std::numeric_limits<size_t>::max()); | ||||
| NCBKernSizeParam make_ncb_kern_size_param(const TensorLayout& src, | |||||
| const TensorLayout& filter, | |||||
| const TensorLayout& dst); | |||||
| NCBKernParam make_ncb_kern_param(_megdnn_tensor_in src, | |||||
| _megdnn_tensor_in filter, | |||||
| _megdnn_tensor_out dst, | |||||
| _megdnn_workspace workspace); | |||||
| NCBKernSizeParam make_ncb_kern_size_param( | |||||
| const TensorLayout& src, const TensorLayout& filter, | |||||
| const TensorLayout& dst, | |||||
| const PreprocessedFilter* preprocessed_filter); | |||||
| NCBKernParam make_ncb_kern_param( | |||||
| _megdnn_tensor_in src, _megdnn_tensor_in filter, | |||||
| _megdnn_tensor_out dst, | |||||
| const PreprocessedFilter* preprocessed_filter, | |||||
| _megdnn_workspace workspace); | |||||
| }; | }; | ||||
| class ConvolutionBackwardDataImpl : public naive::ConvolutionBackwardDataImpl { | class ConvolutionBackwardDataImpl : public naive::ConvolutionBackwardDataImpl { | ||||
| @@ -80,14 +80,15 @@ size_t ConvBiasForwardImpl::get_workspace_in_bytes(const TensorLayout& src, | |||||
| void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | ||||
| _megdnn_tensor_in bias, _megdnn_tensor_in z, | _megdnn_tensor_in bias, _megdnn_tensor_in z, | ||||
| _megdnn_tensor_out dst, | _megdnn_tensor_out dst, | ||||
| const PreprocessedFilter*, | |||||
| const PreprocessedFilter* preprocessed_filter, | |||||
| _megdnn_workspace workspace) { | _megdnn_workspace workspace) { | ||||
| MIDOUT_BEGIN(megdnn_naive_conv_bias_fwd) { | MIDOUT_BEGIN(megdnn_naive_conv_bias_fwd) { | ||||
| dt_byte *workspace_ptr = workspace.raw_ptr; | dt_byte *workspace_ptr = workspace.raw_ptr; | ||||
| // ============================w * f + b================================ | // ============================w * f + b================================ | ||||
| auto filter_meta = check_exec(src.layout, filter.layout, bias.layout, | |||||
| z.layout, dst.layout, workspace.size); | |||||
| auto filter_meta = | |||||
| check_exec(src.layout, filter.layout, bias.layout, z.layout, | |||||
| dst.layout, workspace.size, preprocessed_filter); | |||||
| auto sfb = dst; | auto sfb = dst; | ||||
| if (bias.layout.dtype.enumv() != dst.layout.dtype.enumv()) { | if (bias.layout.dtype.enumv() != dst.layout.dtype.enumv()) { | ||||
| // intermediate result | // intermediate result | ||||
| @@ -61,9 +61,7 @@ public: | |||||
| void exec_preprocess(const TensorLayout&, _megdnn_tensor_in, | void exec_preprocess(const TensorLayout&, _megdnn_tensor_in, | ||||
| const TensorLayout&, const TensorLayout&, | const TensorLayout&, const TensorLayout&, | ||||
| const TensorLayout&, PreprocessedFilter*, | const TensorLayout&, PreprocessedFilter*, | ||||
| _megdnn_workspace) override{ | |||||
| megdnn_throw("conv_bias exec_preprocess is not impl yet"); | |||||
| } | |||||
| _megdnn_workspace) override {} | |||||
| const char* get_algorithm_set_name() const override; | const char* get_algorithm_set_name() const override; | ||||
| }; | }; | ||||
| @@ -28,11 +28,11 @@ using namespace naive; | |||||
| void ConvolutionForwardImpl::exec(_megdnn_tensor_in src, | void ConvolutionForwardImpl::exec(_megdnn_tensor_in src, | ||||
| _megdnn_tensor_in filter, | _megdnn_tensor_in filter, | ||||
| _megdnn_tensor_out dst, | _megdnn_tensor_out dst, | ||||
| const PreprocessedFilter*, | |||||
| const PreprocessedFilter* preprocessed_filter, | |||||
| _megdnn_workspace workspace) { | _megdnn_workspace workspace) { | ||||
| MIDOUT_BEGIN(megdnn_naive_conv_fwd) { | MIDOUT_BEGIN(megdnn_naive_conv_fwd) { | ||||
| auto filter_meta = check_exec(src.layout, filter.layout, dst.layout, | auto filter_meta = check_exec(src.layout, filter.layout, dst.layout, | ||||
| workspace.size); | |||||
| workspace.size, preprocessed_filter); | |||||
| using ComputeMode = Param::ComputeMode; | using ComputeMode = Param::ComputeMode; | ||||
| #define DISPATCH_CMODE(in_dt, out_dt, in_ct, out_ct, comp_ct, cmode) \ | #define DISPATCH_CMODE(in_dt, out_dt, in_ct, out_ct, comp_ct, cmode) \ | ||||
| do { \ | do { \ | ||||
| @@ -44,9 +44,7 @@ class ConvolutionForwardImpl: public ConvolutionForward { | |||||
| void exec_preprocess(const TensorLayout&, _megdnn_tensor_in, | void exec_preprocess(const TensorLayout&, _megdnn_tensor_in, | ||||
| const TensorLayout&, PreprocessedFilter*, | const TensorLayout&, PreprocessedFilter*, | ||||
| _megdnn_workspace) override { | |||||
| megdnn_throw("convolution exec_preprocess in not impl yet"); | |||||
| } | |||||
| _megdnn_workspace) override {} | |||||
| SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | ||||
| const TensorLayout& , const TensorLayout& , | const TensorLayout& , const TensorLayout& , | ||||
| @@ -18,6 +18,9 @@ | |||||
| #include "test/common/workspace_wrapper.h" | #include "test/common/workspace_wrapper.h" | ||||
| #include <algorithm> | #include <algorithm> | ||||
| #include <memory> | |||||
| namespace megdnn { | namespace megdnn { | ||||
| namespace test { | namespace test { | ||||
| @@ -32,6 +35,9 @@ struct OprProxyDefaultImpl | |||||
| template <typename Opr> | template <typename Opr> | ||||
| struct OprProxy : public OprProxyDefaultImpl<Opr> {}; | struct OprProxy : public OprProxyDefaultImpl<Opr> {}; | ||||
| template <typename Opr> | |||||
| struct OprWeightPreprocessProxy : public OprProxyDefaultImpl<Opr> {}; | |||||
| template <typename Opr> | template <typename Opr> | ||||
| struct OprProxyVectorToSingle {}; | struct OprProxyVectorToSingle {}; | ||||
| @@ -139,6 +145,28 @@ struct OprProxyProfilingBase | |||||
| typename Opr::Algorithm* target_algo = nullptr; | typename Opr::Algorithm* target_algo = nullptr; | ||||
| OprProxyProfilingBase(bool profile = false) { m_profiling = profile; } | OprProxyProfilingBase(bool profile = false) { m_profiling = profile; } | ||||
| //! used for alloc tensor for weight preprocess | |||||
| static std::shared_ptr<TensorNDArray> alloc_tensors( | |||||
| Handle* handle, const TensorLayoutArray& layouts) { | |||||
| auto deleter = [handle](TensorNDArray* ptr) { | |||||
| for (auto&& i : *ptr) { | |||||
| auto pdata = static_cast<dt_byte*>(i.raw_ptr) + | |||||
| i.layout.span().low_byte; | |||||
| megdnn_free(handle, pdata); | |||||
| } | |||||
| delete ptr; | |||||
| }; | |||||
| std::shared_ptr<TensorNDArray> ret{new TensorNDArray, deleter}; | |||||
| for (size_t i = 0; i < layouts.size(); ++i) { | |||||
| auto span = layouts[i].span(); | |||||
| ret->emplace_back(static_cast<dt_byte*>( | |||||
| megdnn_malloc(handle, span.dist_byte())) - | |||||
| span.low_byte, | |||||
| layouts[i]); | |||||
| } | |||||
| return ret; | |||||
| } | |||||
| }; | }; | ||||
| template <class Opr> | template <class Opr> | ||||
| @@ -207,7 +235,6 @@ DEF_PROF3(LocalShareBackwardData); | |||||
| DEF_PROF3(LocalShareBackwardFilter); | DEF_PROF3(LocalShareBackwardFilter); | ||||
| #undef DEF_PROF3 | #undef DEF_PROF3 | ||||
| //! TODO: it should adapt weight preprocess later | |||||
| template <> | template <> | ||||
| struct OprProxy<ConvolutionForward> | struct OprProxy<ConvolutionForward> | ||||
| : public OprProxyProfilingTernary<ConvolutionForward> { | : public OprProxyProfilingTernary<ConvolutionForward> { | ||||
| @@ -263,6 +290,100 @@ struct OprProxy<ConvolutionForward> | |||||
| } | } | ||||
| }; | }; | ||||
| template <> | |||||
| struct OprWeightPreprocessProxy<ConvolutionForward> | |||||
| : public OprProxyProfilingTernary<ConvolutionForward> { | |||||
| using OprProxyProfilingTernary<ConvolutionForward>::OprProxyProfilingTernary; | |||||
| void exec(ConvolutionForward* opr, const TensorNDArray& tensors) { | |||||
| megdnn_assert(tensors.size() == 3); | |||||
| if (!Base::W.valid()) { | |||||
| Base::W = WorkspaceWrapper(opr->handle(), 0); | |||||
| } | |||||
| if (Base::m_profiling && !Base::target_algo) { | |||||
| size_t min_time = std::numeric_limits<size_t>::max(); | |||||
| for (auto algo : | |||||
| opr->get_all_algorithms(tensors[0].layout, tensors[1].layout, | |||||
| tensors[2].layout)) { | |||||
| opr->execution_policy().algorithm = algo; | |||||
| auto preprocess_tensors = weight_prerocess(opr, tensors, algo); | |||||
| megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||||
| ConvolutionForward::PreprocessedFilter preprocessed_filter{ | |||||
| algo, *preprocess_tensors}; | |||||
| auto workspace_size = opr->get_workspace_in_bytes( | |||||
| tensors[0].layout, tensors[1].layout, tensors[2].layout, | |||||
| &preprocessed_filter); | |||||
| Base::W.update(workspace_size); | |||||
| for (size_t times = 0; times < Base::warmup_times; ++times) | |||||
| opr->exec(tensors[0], tensors[1], tensors[2], | |||||
| &preprocessed_filter, Base::W.workspace()); | |||||
| megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||||
| Timer timer; | |||||
| timer.start(); | |||||
| for (size_t times = 0; times < Base::exec_times; ++times) { | |||||
| opr->exec(tensors[0], tensors[1], tensors[2], | |||||
| &preprocessed_filter, Base::W.workspace()); | |||||
| } | |||||
| megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||||
| timer.stop(); | |||||
| printf("%.3fms %s\n", timer.get_time_in_us() / 1e3, | |||||
| algo->name()); | |||||
| if (min_time > timer.get_time_in_us()) { | |||||
| min_time = timer.get_time_in_us(); | |||||
| Base::target_algo = algo; | |||||
| } | |||||
| } | |||||
| opr->execution_policy().algorithm = Base::target_algo; | |||||
| auto preprocess_tensors = | |||||
| weight_prerocess(opr, tensors, Base::target_algo); | |||||
| megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||||
| ConvolutionForward::PreprocessedFilter preprocessed_filter{ | |||||
| Base::target_algo, *preprocess_tensors}; | |||||
| auto workspace_size = opr->get_workspace_in_bytes( | |||||
| tensors[0].layout, tensors[1].layout, tensors[2].layout, | |||||
| &preprocessed_filter); | |||||
| Base::W.update(workspace_size); | |||||
| } | |||||
| auto preprocess_tensors = | |||||
| weight_prerocess(opr, tensors, Base::target_algo); | |||||
| megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||||
| ConvolutionForward::PreprocessedFilter preprocessed_filter{ | |||||
| Base::target_algo, *preprocess_tensors}; | |||||
| if (!Base::target_algo) { | |||||
| auto workspace_size = opr->get_workspace_in_bytes( | |||||
| tensors[0].layout, tensors[1].layout, tensors[2].layout, | |||||
| &preprocessed_filter); | |||||
| Base::W.update(workspace_size); | |||||
| } | |||||
| opr->exec(tensors[0], tensors[1], tensors[2], &preprocessed_filter, | |||||
| Base::W.workspace()); | |||||
| } | |||||
| //! handle weight preprocess | |||||
| std::shared_ptr<TensorNDArray> weight_prerocess( | |||||
| ConvolutionForward* opr, const TensorNDArray& tensors, | |||||
| ConvolutionForward::Algorithm* algo) { | |||||
| auto weight_perprocess_layouts = opr->deduce_preprocessed_filter_layout( | |||||
| tensors[0].layout, tensors[1].layout, tensors[2].layout); | |||||
| auto preprocessed_filter_tensors_ptr = | |||||
| alloc_tensors(opr->handle(), weight_perprocess_layouts); | |||||
| ConvolutionForward::PreprocessedFilter preprocessed_filter{ | |||||
| algo, *preprocessed_filter_tensors_ptr}; | |||||
| size_t preprocess_workspace_size = | |||||
| opr->get_preprocess_workspace_in_bytes(tensors[0].layout, | |||||
| tensors[1].layout, | |||||
| tensors[2].layout); | |||||
| WorkspaceWrapper preprocess_workspace(opr->handle(), | |||||
| preprocess_workspace_size); | |||||
| opr->exec_preprocess(tensors[0].layout, tensors[1], tensors[2].layout, | |||||
| &preprocessed_filter, | |||||
| preprocess_workspace.workspace()); | |||||
| return preprocessed_filter_tensors_ptr; | |||||
| } | |||||
| }; | |||||
| template <class Opr> | template <class Opr> | ||||
| struct OprProxyProfiling5 : public OprProxyProfilingBase<Opr, 5> { | struct OprProxyProfiling5 : public OprProxyProfilingBase<Opr, 5> { | ||||
| @@ -329,11 +450,9 @@ struct OprProxyProfiling5 : public OprProxyProfilingBase<Opr, 5> { | |||||
| DEF_PROF5(DeformableConvForward); | DEF_PROF5(DeformableConvForward); | ||||
| DEF_PROF5(DeformableConvBackwardFilter); | DEF_PROF5(DeformableConvBackwardFilter); | ||||
| //DEF_PROF5(ConvBiasForward); | |||||
| DEF_PROF5(BatchConvBiasForward); | DEF_PROF5(BatchConvBiasForward); | ||||
| #undef DEF_PROF5 | #undef DEF_PROF5 | ||||
| //! TODO: it should adapt weight preprocess later | |||||
| template <> | template <> | ||||
| struct OprProxy<ConvBiasForward> : public OprProxyProfiling5<ConvBiasForward> { | struct OprProxy<ConvBiasForward> : public OprProxyProfiling5<ConvBiasForward> { | ||||
| using OprProxyProfiling5<ConvBiasForward>::OprProxyProfiling5; | using OprProxyProfiling5<ConvBiasForward>::OprProxyProfiling5; | ||||
| @@ -390,6 +509,106 @@ struct OprProxy<ConvBiasForward> : public OprProxyProfiling5<ConvBiasForward> { | |||||
| } | } | ||||
| }; | }; | ||||
| template <> | |||||
| struct OprWeightPreprocessProxy<ConvBiasForward> | |||||
| : public OprProxyProfiling5<ConvBiasForward> { | |||||
| using OprProxyProfiling5<ConvBiasForward>::OprProxyProfiling5; | |||||
| void exec(ConvBiasForward* opr, const TensorNDArray& tensors) { | |||||
| megdnn_assert(tensors.size() == 5); | |||||
| if (!Base::W.valid()) { | |||||
| Base::W = WorkspaceWrapper(opr->handle(), 0); | |||||
| } | |||||
| if (Base::m_profiling && !Base::target_algo) { | |||||
| size_t min_time = std::numeric_limits<size_t>::max(); | |||||
| for (auto algo : | |||||
| opr->get_all_algorithms(tensors[0].layout, tensors[1].layout, | |||||
| tensors[2].layout, tensors[3].layout, | |||||
| tensors[4].layout)) { | |||||
| opr->execution_policy().algorithm = algo; | |||||
| auto preprocess_tensors = weight_prerocess(opr, tensors, algo); | |||||
| megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||||
| ConvBiasForward::PreprocessedFilter preprocessed_filter{ | |||||
| algo, *preprocess_tensors}; | |||||
| auto workspace_size = opr->get_workspace_in_bytes( | |||||
| tensors[0].layout, tensors[1].layout, tensors[2].layout, | |||||
| tensors[3].layout, tensors[4].layout, | |||||
| &preprocessed_filter); | |||||
| Base::W.update(workspace_size); | |||||
| for (size_t times = 0; times < Base::warmup_times; ++times) | |||||
| opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], | |||||
| tensors[4], &preprocessed_filter, | |||||
| Base::W.workspace()); | |||||
| megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||||
| Timer timer; | |||||
| timer.start(); | |||||
| for (size_t times = 0; times < Base::exec_times; ++times) { | |||||
| opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], | |||||
| tensors[4], &preprocessed_filter, | |||||
| Base::W.workspace()); | |||||
| } | |||||
| megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||||
| timer.stop(); | |||||
| printf("%.3fms %s\n", timer.get_time_in_us() / 1e3, | |||||
| algo->name()); | |||||
| if (min_time > timer.get_time_in_us()) { | |||||
| min_time = timer.get_time_in_us(); | |||||
| Base::target_algo = algo; | |||||
| } | |||||
| } | |||||
| opr->execution_policy().algorithm = Base::target_algo; | |||||
| auto preprocess_tensors = | |||||
| weight_prerocess(opr, tensors, Base::target_algo); | |||||
| megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||||
| ConvBiasForward::PreprocessedFilter preprocessed_filter{ | |||||
| Base::target_algo, *preprocess_tensors}; | |||||
| auto workspace_size = opr->get_workspace_in_bytes( | |||||
| tensors[0].layout, tensors[1].layout, tensors[2].layout, | |||||
| tensors[3].layout, tensors[4].layout, &preprocessed_filter); | |||||
| Base::W.update(workspace_size); | |||||
| } | |||||
| auto preprocess_tensors = | |||||
| weight_prerocess(opr, tensors, Base::target_algo); | |||||
| megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||||
| ConvBiasForward::PreprocessedFilter preprocessed_filter{ | |||||
| Base::target_algo, *preprocess_tensors}; | |||||
| if (!Base::target_algo) { | |||||
| auto workspace_size = opr->get_workspace_in_bytes( | |||||
| tensors[0].layout, tensors[1].layout, tensors[2].layout, | |||||
| tensors[3].layout, tensors[4].layout, &preprocessed_filter); | |||||
| Base::W.update(workspace_size); | |||||
| } | |||||
| opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], tensors[4], | |||||
| &preprocessed_filter, Base::W.workspace()); | |||||
| } | |||||
| //! handle weight preprocess | |||||
| std::shared_ptr<TensorNDArray> weight_prerocess( | |||||
| ConvBiasForward* opr, const TensorNDArray& tensors, | |||||
| ConvBiasForward::Algorithm* algo) { | |||||
| auto weight_perprocess_layouts = opr->deduce_preprocessed_filter_layout( | |||||
| tensors[0].layout, tensors[1].layout, tensors[2].layout, | |||||
| tensors[3].layout, tensors[4].layout); | |||||
| auto preprocessed_filter_tensors_ptr = | |||||
| alloc_tensors(opr->handle(), weight_perprocess_layouts); | |||||
| ConvBiasForward::PreprocessedFilter preprocessed_filter{ | |||||
| algo, *preprocessed_filter_tensors_ptr}; | |||||
| size_t preprocess_workspace_size = | |||||
| opr->get_preprocess_workspace_in_bytes( | |||||
| tensors[0].layout, tensors[1].layout, tensors[2].layout, | |||||
| tensors[3].layout, tensors[4].layout); | |||||
| WorkspaceWrapper preprocess_workspace(opr->handle(), | |||||
| preprocess_workspace_size); | |||||
| opr->exec_preprocess(tensors[0].layout, tensors[1], tensors[2].layout, | |||||
| tensors[3].layout, tensors[4].layout, | |||||
| &preprocessed_filter, | |||||
| preprocess_workspace.workspace()); | |||||
| return preprocessed_filter_tensors_ptr; | |||||
| } | |||||
| }; | |||||
| template <class Opr> | template <class Opr> | ||||
| struct OprProxyProfiling8 : public OprProxyProfilingBase<Opr, 8> { | struct OprProxyProfiling8 : public OprProxyProfilingBase<Opr, 8> { | ||||
| using Base = OprProxyProfilingBase<Opr, 8>; | using Base = OprProxyProfilingBase<Opr, 8>; | ||||