GitOrigin-RevId: 1797f3b91c
tags/v1.11.1
@@ -14,7 +14,7 @@
 namespace megdnn {
 namespace test {
 
-template <typename Opr, typename T>
+template <typename Opr, typename T, typename Proxy = OprProxy<Opr>>
 class BenchmarkerBase {
 public:
     using Param = typename Opr::Param;

@@ -28,7 +28,7 @@ public:
               m_handle(handle),
               m_default_rng(new NormalRNG()),
               m_param(Param()),
-              m_proxy{new OprProxy<Opr>()} {}
+              m_proxy{new Proxy()} {}
 
     const Handle* handle() const { return m_handle; }

@@ -81,12 +81,12 @@ public:
         }
         return layouts;
     }
-    BenchmarkerBase& set_proxy(std::unique_ptr<OprProxy<Opr>>& proxy) {
+    BenchmarkerBase& set_proxy(std::unique_ptr<Proxy>& proxy) {
         m_proxy.reset(nullptr);
         m_proxy = std::move(proxy);
         return *this;
     }
-    std::unique_ptr<OprProxy<Opr>>& proxy() { return m_proxy; }
+    std::unique_ptr<Proxy>& proxy() { return m_proxy; }
     BenchmarkerBase& set_times(size_t times) {
         m_times = times;
         return *this;
     }

@@ -135,14 +135,14 @@ private:
     std::map<size_t, DType> m_dtype;
     std::map<size_t, TensorFormat> m_fmt;
     Param m_param;
-    std::unique_ptr<OprProxy<Opr>> m_proxy;
+    std::unique_ptr<Proxy> m_proxy;
     BeforeExecCallback m_before_exec_callback;
     std::unique_ptr<Opr> m_opr;
     TensorsConstriant m_tensor_constraint;
 };
 
-template <typename Opr, typename T>
-float BenchmarkerBase<Opr, T>::exec(TensorLayoutArray layouts) {
+template <typename Opr, typename T, typename OprProxy>
+float BenchmarkerBase<Opr, T, OprProxy>::exec(TensorLayoutArray layouts) {
     auto opr = this->opr();
     opr->param() = m_param;
     auto user_layouts = layouts;

@@ -196,6 +196,8 @@ float BenchmarkerBase<Opr, T>::exec(TensorLayoutArray layouts) {
     if (m_before_exec_callback) {
         m_before_exec_callback(opr, tensors_cur);
     }
+    //! init weights
+    m_proxy->init(opr, tensors_cur);
     // run
     // warm up
     m_proxy->exec(opr, tensors_cur);

@@ -246,8 +248,8 @@ float BenchmarkerBase<Opr, T>::exec(TensorLayoutArray layouts) {
     return time_in_ms;
 }
 
-template <typename Opr, typename T>
-float BenchmarkerBase<Opr, T>::exect(const TensorValueArray& testcase_in) {
+template <typename Opr, typename T, typename Proxy>
+float BenchmarkerBase<Opr, T, Proxy>::exect(const TensorValueArray& testcase_in) {
     auto opr = this->opr();
     opr->param() = m_param;
     TensorLayoutArray layouts;

@@ -295,6 +297,8 @@ float BenchmarkerBase<Opr, T>::exect(const TensorValueArray& testcase_in) {
     if (m_before_exec_callback) {
         m_before_exec_callback(opr, tensors_cur);
     }
+    //! init weights
+    m_proxy->init(opr, tensors_cur);
     //! run
     //! warm up
     m_proxy->exec(opr, tensors_cur);

@@ -344,19 +348,16 @@ float BenchmarkerBase<Opr, T>::exect(const TensorValueArray& testcase_in) {
     return time_in_ms;
 }
 
-template <typename Opr, typename T = Timer>
-class Benchmarker;
-
-template <typename Opr>
-class Benchmarker<Opr, Timer> : public BenchmarkerBase<Opr, Timer> {
+template <typename Opr, typename T = Timer, typename Proxy = OprProxy<Opr>>
+class Benchmarker : public BenchmarkerBase<Opr, T, Proxy> {
 public:
-    Benchmarker(Handle* handle) : BenchmarkerBase<Opr, Timer>{handle, Timer{}} {}
+    Benchmarker(Handle* handle) : BenchmarkerBase<Opr, T, Proxy>{handle, Timer{}} {}
 };
 
 ////////////////// Algo Benchmark ////////////////////////
 template <typename Opr, typename Proxy = OprProxy<Opr>, typename T = Timer>
 float algo_benchmark(
-        Benchmarker<Opr, T>& benchmark, TensorLayoutArray layouts,
+        Benchmarker<Opr, T, Proxy>& benchmark, TensorLayoutArray layouts,
         const std::string& algo_base) {
     Proxy proxy;
     auto opr = benchmark.opr();

@@ -381,7 +382,7 @@ float algo_benchmark(
 template <typename Opr, typename Proxy = OprProxy<Opr>, typename T = Timer>
 float algo_benchmark(
-        Benchmarker<Opr, T>& benchmark, TensorShapeArray shapes,
+        Benchmarker<Opr, T, Proxy>& benchmark, TensorShapeArray shapes,
         const std::string& algo_base) {
     return algo_benchmark(benchmark, benchmark.make_layouts(shapes), algo_base);
 }
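Because the new Proxy template parameter defaults to OprProxy<Opr>, existing Benchmarker<Opr> call sites keep compiling unchanged; a benchmark can opt into a different proxy either through the third template argument or through set_proxy(). A minimal sketch of both options, assuming a fragment inside the megdnn::test namespace with <memory> included and a valid Handle* named handle:

// Default proxy, behaviour unchanged from before this patch:
Benchmarker<ConvBias> plain(handle);

// Weight pre-processing proxy selected via the new template argument:
Benchmarker<ConvBias, Timer, OprWeightPreprocessBenchmarkProxy<ConvBias>>
        preprocessed(handle);

// Or swap the proxy on an existing benchmarker:
auto proxy = std::make_unique<OprProxy<ConvBias>>();
plain.set_proxy(proxy);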
@@ -995,6 +995,52 @@ void benchmark_winograd(
             used / used_winograd);
     }
 }
+
+// winograd benchmark with weight pre-processing
+void benchmark_winograd_weight_preprocess(
+        const char* algo_name, megdnn::Handle* handle, size_t kernel,
+        size_t pack_size) {
+    auto&& args = get_winograd_benchmark_args(kernel, pack_size);
+    using namespace conv_bias;
+    constexpr size_t RUN = 10;
+    //! use the weight pre-processing proxy so the filter transform runs once
+    //! in init() and is excluded from the timed loop
+    Benchmarker<ConvBias, Timer, OprWeightPreprocessBenchmarkProxy<ConvBias>>
+            benchmark_winograd(handle);
+    benchmark_winograd.set_display(false);
+    benchmark_winograd.set_times(RUN);
+
+    for (auto&& arg : args) {
+        TensorLayout dst_layout;
+        auto opr = handle->create_operator<ConvBias>();
+        opr->param() = arg.param;
+        opr->deduce_layout(
+                {arg.src, dtype::Float32()}, {arg.filter, dtype::Float32()},
+                {arg.bias, dtype::Float32()}, {}, dst_layout);
+        //! dst.nr_elems * IC * FH * FW * 2
+        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
+                             arg.filter[2] * arg.filter[3] * 2.0 /
+                             (1024 * 1024 * 1024) * 1e3;
+        param::Convolution conv_param;
+        conv_param.pad_h = arg.param.pad_h;
+        conv_param.pad_w = arg.param.pad_w;
+        conv_param.stride_h = arg.param.stride_h;
+        conv_param.stride_w = arg.param.stride_w;
+
+        benchmark_winograd.set_param(arg.param);
+        auto used_winograd =
+                algo_benchmark<
+                        ConvBias, OprWeightPreprocessBenchmarkProxy<ConvBias>, Timer>(
+                        benchmark_winograd, {arg.src, arg.filter, {}, {}, {}},
+                        algo_name) /
+                RUN;
+
+        printf("%s %s: %s: %f ms %f Gflops\n", arg.src.to_string().c_str(),
+               arg.filter.to_string().c_str(), algo_name, used_winograd,
+               computations / used_winograd);
+    }
+}
 #endif  // MEGDNN_WITH_BENCHMARK
 
 template <class Checker>
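A hedged sketch of how the new entry point might be exercised from a backend benchmark test; the fixture name, the algo string, and the kernel/pack_size values below are placeholders for illustration and are not taken from this diff:

#if MEGDNN_WITH_BENCHMARK
// Fixture and algo name are assumptions; substitute the algo name reported by
// the target backend (e.g. an AArch64 winograd MK4 variant).
TEST_F(ARM_COMMON, BENCHMARK_WINOGRAD_WEIGHT_PREPROCESS) {
    benchmark_winograd_weight_preprocess("<winograd algo name>", handle(), 3, 4);
}
#endif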
@@ -66,6 +66,9 @@ std::vector<conv_bias::TestArg> get_winograd_benchmark_args(
 void benchmark_winograd(
         const char* algo_name, megdnn::Handle* handle, size_t kernel,
         size_t pack_size = 1);
+void benchmark_winograd_weight_preprocess(
+        const char* algo_name, megdnn::Handle* handle, size_t kernel,
+        size_t pack_size = 1);
 #endif  // MEGDNN_WITH_BENCHMARK
 
 template <class Checker>
 void check_winograd(
@@ -114,7 +114,10 @@ template <
         bool has_workspace = OprTrait<Opr>::has_workspace,
         bool can_deduce_layout = OprTrait<Opr>::can_deduce_layout>
 struct OprProxyDefaultImpl : public DeduceLayoutProxy<Opr, arity, can_deduce_layout>,
-                             public ExecProxy<Opr, arity, has_workspace> {};
+                             public ExecProxy<Opr, arity, has_workspace> {
+    virtual void init(Opr*, const TensorNDArray&) {}
+    virtual ~OprProxyDefaultImpl() {}
+};
 
 template <typename Opr>
 struct OprProxy : public OprProxyDefaultImpl<Opr> {};

@@ -122,6 +125,9 @@ struct OprProxy : public OprProxyDefaultImpl<Opr> {};
 template <typename Opr>
 struct OprWeightPreprocessProxy : public OprProxyDefaultImpl<Opr> {};
 
+template <typename Opr>
+struct OprWeightPreprocessBenchmarkProxy : OprProxyDefaultImpl<Opr> {};
+
 template <typename Opr>
 struct OprProxyVectorToSingle {};

@@ -134,6 +140,8 @@ struct OprProxy<ElemwiseForward> {
         opr->deduce_layout(inp, layouts.back());
     }
 
+    static void init(ElemwiseForward*, const TensorNDArray&) {}
+
     static void exec(ElemwiseForward* opr, const TensorNDArray& tensors) {
         megdnn_assert(tensors.size() >= 2);
         auto inp = tensors;

@@ -151,6 +159,8 @@ struct OprProxy<ElemwiseMultiType> {
         opr->deduce_layout(inp, layouts.back());
     }
 
+    static void init(ElemwiseMultiType*, const TensorNDArray&) {}
+
     static void exec(ElemwiseMultiType* opr, const TensorNDArray& tensors) {
         megdnn_assert(tensors.size() >= 2);
         auto inp = tensors;

@@ -169,6 +179,8 @@ struct OprProxy<ConcatForward> {
         opr->deduce_layout(inp, layouts.back());
     }
 
+    static void init(ConcatForward*, const TensorNDArray&) {}
+
     void exec(ConcatForward* opr, const TensorNDArray& tensors) {
         if (!W.valid()) {
             W = WorkspaceWrapper(opr->handle(), 0);

@@ -200,6 +212,8 @@ struct OprProxy<CheckNonFinite> {
         opr->deduce_layout(inp, layouts.back());
     }
 
+    static void init(CheckNonFinite*, const TensorNDArray&) {}
+
     static void exec(CheckNonFinite* opr, const TensorNDArray& tensors) {
         megdnn_assert(tensors.size() >= 2);
         auto inps = tensors;

@@ -220,6 +234,9 @@ struct OprProxy<CheckNonFinite> {
 template <>
 struct OprProxy<SplitForward> : DeduceLayoutProxy<SplitForward, 0, false> {
     WorkspaceWrapper W;
+
+    void init(SplitForward*, const TensorNDArray&) {}
+
     void exec(SplitForward* opr, const TensorNDArray& tensors) {
         megdnn_assert(tensors.size() >= 2);
         if (!W.valid()) {

@@ -428,7 +445,9 @@ struct OprProxyProfilingBase
                 best_algo);
     }
 
-    void exec(Opr* opr, const TensorNDArray& tensors) {
+    virtual void init(Opr*, const TensorNDArray&) {}
+
+    virtual void exec(Opr* opr, const TensorNDArray& tensors) {
         megdnn_assert(tensors.size() == arity);
         if (!W.valid()) {
             W = WorkspaceWrapper(opr->handle(), 0);

@@ -463,6 +482,8 @@ struct OprProxyProfilingBase
         }
         AlgoProxy<Opr, arity>::exec(opr, tensors, W.workspace());
     }
+
+    virtual ~OprProxyProfilingBase() {}
 };
 
 #define DEF_PROF(c) \

@@ -491,7 +512,7 @@ template <class Opr>
 struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr> {
     using Base = OprProxyProfilingBase<Opr>;
     static constexpr int arity = OprTrait<Opr>::arity;
-    void exec(Opr* opr, const TensorNDArray& tensors) {
+    void exec(Opr* opr, const TensorNDArray& tensors) override {
         megdnn_assert(tensors.size() == arity);
         if (!Base::W.valid()) {
             Base::W = WorkspaceWrapper(opr->handle(), 0);

@@ -584,11 +605,55 @@ struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr> {
     }
 };
 
+template <class Opr>
+struct OprWeightPreprocessProxyBenchmarkImpl
+        : public OprWeightPreprocessProxyImpl<Opr> {
+    using Base = OprProxyProfilingBase<Opr>;
+    static constexpr int arity = OprTrait<Opr>::arity;
+
+    void init(Opr* opr, const TensorNDArray& tensors) override {
+        megdnn_assert(tensors.size() == arity);
+        if (!Base::W.valid()) {
+            Base::W = WorkspaceWrapper(opr->handle(), 0);
+        }
+        TensorLayoutArray layouts;
+        for (auto&& tensor : tensors) {
+            layouts.push_back(tensor.layout);
+        }
+        m_preprocessed_tensors = this->weight_prerocess(
+                opr, tensors, Base::target_execution_policy.algo);
+        megcoreSynchronize(opr->handle()->megcore_computing_handle());
+        typename Opr::PreprocessedFilter preprocessed_filter{
+                nullptr, *m_preprocessed_tensors};
+        if (!Base::target_execution_policy.algo.valid()) {
+            auto workspace_size = AlgoProxy<Opr, arity>::get_workspace_in_bytes(
+                    opr, layouts, &preprocessed_filter);
+            Base::W.update(workspace_size);
+        }
+    }
+
+    void exec(Opr* opr, const TensorNDArray& tensors) override {
+        megdnn_assert(tensors.size() == arity);
+        typename Opr::PreprocessedFilter preprocessed_filter{
+                nullptr, *m_preprocessed_tensors};
+        AlgoProxy<Opr, arity>::exec(
+                opr, tensors, &preprocessed_filter, Base::W.workspace());
+    }
+
+public:
+    std::shared_ptr<TensorNDArray> m_preprocessed_tensors;
+};
+
 #define DEF_PROF(c) \
     template <> \
     struct OprWeightPreprocessProxy<c> : public OprWeightPreprocessProxyImpl<c> { \
         using OprWeightPreprocessProxyImpl<c>::OprWeightPreprocessProxyImpl; \
-    }
+    }; \
+    template <> \
+    struct OprWeightPreprocessBenchmarkProxy<c> \
+            : public OprWeightPreprocessProxyBenchmarkImpl<c> { \
+        using OprWeightPreprocessProxyBenchmarkImpl< \
+                c>::OprWeightPreprocessProxyBenchmarkImpl; \
+    };
 
 DEF_PROF(ConvolutionForward);
 DEF_PROF(ConvBias);
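The init()/exec() split is what keeps the filter transform out of the measured region: the benchmarker calls the proxy's init() once per layout set before the timer starts, then only exec() inside the timed loop, where the preprocessed filter is reused. A minimal sketch of that control flow with assumed names (the real logic lives in BenchmarkerBase<Opr, T, Proxy>::exec above, which additionally synchronizes the compute handle before reading the timer):

#include <chrono>

// Sketch only; Opr, Proxy and TensorNDArray come from the megdnn::test headers.
template <typename Opr, typename Proxy>
float benchmark_two_phase(Opr* opr, Proxy& proxy, const TensorNDArray& tensors,
                          size_t times) {
    proxy.init(opr, tensors);  // weight pre-processing: run once, not timed
    proxy.exec(opr, tensors);  // warm up, not timed
    auto start = std::chrono::steady_clock::now();
    for (size_t i = 0; i < times; ++i) {
        proxy.exec(opr, tensors);  // reuses the preprocessed filter
    }
    auto stop = std::chrono::steady_clock::now();
    return std::chrono::duration<float, std::milli>(stop - start).count();
}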
@@ -16,6 +16,7 @@ private:
 public:
     OprProxy() = default;
     OprProxy(int k) : m_k{k} {}
+    void init(TopK*, const TensorLayoutArray&) {}
     void deduce_layout(TopK* opr, TensorLayoutArray& layouts) {
         if (layouts.size() == 3) {