GitOrigin-RevId: 1797f3b91c
tags/v1.11.1
| @@ -14,7 +14,7 @@ | |||
| namespace megdnn { | |||
| namespace test { | |||
| template <typename Opr, typename T> | |||
| template <typename Opr, typename T, typename Proxy = OprProxy<Opr>> | |||
| class BenchmarkerBase { | |||
| public: | |||
| using Param = typename Opr::Param; | |||
| @@ -28,7 +28,7 @@ public: | |||
| m_handle(handle), | |||
| m_default_rng(new NormalRNG()), | |||
| m_param(Param()), | |||
| m_proxy{new OprProxy<Opr>()} {} | |||
| m_proxy{new Proxy()} {} | |||
| const Handle* handle() const { return m_handle; } | |||
| @@ -81,12 +81,12 @@ public: | |||
| } | |||
| return layouts; | |||
| } | |||
| BenchmarkerBase& set_proxy(std::unique_ptr<OprProxy<Opr>>& proxy) { | |||
| BenchmarkerBase& set_proxy(std::unique_ptr<Proxy>& proxy) { | |||
| m_proxy.reset(nullptr); | |||
| m_proxy = std::move(proxy); | |||
| return *this; | |||
| } | |||
| std::unique_ptr<OprProxy<Opr>>& proxy() { return m_proxy; } | |||
| std::unique_ptr<Proxy>& proxy() { return m_proxy; } | |||
| BenchmarkerBase& set_times(size_t times) { | |||
| m_times = times; | |||
| return *this; | |||
| @@ -135,14 +135,14 @@ private: | |||
| std::map<size_t, DType> m_dtype; | |||
| std::map<size_t, TensorFormat> m_fmt; | |||
| Param m_param; | |||
| std::unique_ptr<OprProxy<Opr>> m_proxy; | |||
| std::unique_ptr<Proxy> m_proxy; | |||
| BeforeExecCallback m_before_exec_callback; | |||
| std::unique_ptr<Opr> m_opr; | |||
| TensorsConstriant m_tensor_constraint; | |||
| }; | |||
| template <typename Opr, typename T> | |||
| float BenchmarkerBase<Opr, T>::exec(TensorLayoutArray layouts) { | |||
| template <typename Opr, typename T, typename OprProxy> | |||
| float BenchmarkerBase<Opr, T, OprProxy>::exec(TensorLayoutArray layouts) { | |||
| auto opr = this->opr(); | |||
| opr->param() = m_param; | |||
| auto user_layouts = layouts; | |||
| @@ -196,6 +196,8 @@ float BenchmarkerBase<Opr, T>::exec(TensorLayoutArray layouts) { | |||
| if (m_before_exec_callback) { | |||
| m_before_exec_callback(opr, tensors_cur); | |||
| } | |||
| //! init weights | |||
| m_proxy->init(opr, tensors_cur); | |||
| // run | |||
| // warm up | |||
| m_proxy->exec(opr, tensors_cur); | |||
| @@ -246,8 +248,8 @@ float BenchmarkerBase<Opr, T>::exec(TensorLayoutArray layouts) { | |||
| return time_in_ms; | |||
| } | |||
| template <typename Opr, typename T> | |||
| float BenchmarkerBase<Opr, T>::exect(const TensorValueArray& testcase_in) { | |||
| template <typename Opr, typename T, typename Proxy> | |||
| float BenchmarkerBase<Opr, T, Proxy>::exect(const TensorValueArray& testcase_in) { | |||
| auto opr = this->opr(); | |||
| opr->param() = m_param; | |||
| TensorLayoutArray layouts; | |||
| @@ -295,6 +297,8 @@ float BenchmarkerBase<Opr, T>::exect(const TensorValueArray& testcase_in) { | |||
| if (m_before_exec_callback) { | |||
| m_before_exec_callback(opr, tensors_cur); | |||
| } | |||
| //! init weights | |||
| m_proxy->init(opr, tensors_cur); | |||
| //! run | |||
| //! warm up | |||
| m_proxy->exec(opr, tensors_cur); | |||
| @@ -344,19 +348,16 @@ float BenchmarkerBase<Opr, T>::exect(const TensorValueArray& testcase_in) { | |||
| return time_in_ms; | |||
| } | |||
| template <typename Opr, typename T = Timer> | |||
| class Benchmarker; | |||
| template <typename Opr> | |||
| class Benchmarker<Opr, Timer> : public BenchmarkerBase<Opr, Timer> { | |||
| template <typename Opr, typename T = Timer, typename Proxy = OprProxy<Opr>> | |||
| class Benchmarker : public BenchmarkerBase<Opr, T, Proxy> { | |||
| public: | |||
| Benchmarker(Handle* handle) : BenchmarkerBase<Opr, Timer>{handle, Timer{}} {} | |||
| //! Create a benchmarker bound to \p handle, using a default-constructed timer. | |||
| //! The timer is built as T{} rather than a hard-coded Timer{} so that the | |||
| //! argument always has the timer type this class was instantiated with; for | |||
| //! the default T = Timer the behaviour is unchanged. | |||
| //! NOTE(review): assumes the BenchmarkerBase constructor takes the timer as a | |||
| //! T -- its declaration is outside this excerpt, please confirm. | |||
| Benchmarker(Handle* handle) : BenchmarkerBase<Opr, T, Proxy>{handle, T{}} {} | |||
| }; | |||
| ////////////////// Algo Benchmark //////////////////////// | |||
| template <typename Opr, typename Proxy = OprProxy<Opr>, typename T = Timer> | |||
| float algo_benchmark( | |||
| Benchmarker<Opr, T>& benchmark, TensorLayoutArray layouts, | |||
| Benchmarker<Opr, T, Proxy>& benchmark, TensorLayoutArray layouts, | |||
| const std::string& algo_base) { | |||
| Proxy proxy; | |||
| auto opr = benchmark.opr(); | |||
| @@ -381,7 +382,7 @@ float algo_benchmark( | |||
| template <typename Opr, typename Proxy = OprProxy<Opr>, typename T = Timer> | |||
| float algo_benchmark( | |||
| Benchmarker<Opr, T>& benchmark, TensorShapeArray shapes, | |||
| Benchmarker<Opr, T, Proxy>& benchmark, TensorShapeArray shapes, | |||
| const std::string& algo_base) { | |||
| return algo_benchmark(benchmark, benchmark.make_layouts(shapes), algo_base); | |||
| } | |||
| @@ -995,6 +995,52 @@ void benchmark_winograd( | |||
| used / used_winograd); | |||
| } | |||
| } | |||
| //! Benchmark a winograd ConvBias algorithm using the weight pre-processing proxy. | |||
| //! | |||
| //! Every case returned by get_winograd_benchmark_args(kernel, pack_size) is run | |||
| //! through a Benchmarker whose proxy is OprWeightPreprocessBenchmarkProxy, and | |||
| //! one line per case (shapes, algorithm name, time in ms, Gflops) is printed | |||
| //! to stdout. | |||
| //! | |||
| //! \param algo_name  algorithm name forwarded to algo_benchmark() | |||
| //! \param handle     handle used to create the operator and the benchmarker | |||
| //! \param kernel     forwarded to get_winograd_benchmark_args() | |||
| //! \param pack_size  forwarded to get_winograd_benchmark_args() | |||
| void benchmark_winograd_weight_preprocess( | |||
|         const char* algo_name, megdnn::Handle* handle, size_t kernel, | |||
|         size_t pack_size) { | |||
|     auto&& args = get_winograd_benchmark_args(kernel, pack_size); | |||
|     using namespace conv_bias; | |||
|     constexpr size_t RUN = 10; | |||
|     //! the third template argument selects the proxy whose init() hook does | |||
|     //! the weight pre-processing ahead of the exec() calls | |||
|     Benchmarker<ConvBias, Timer, OprWeightPreprocessBenchmarkProxy<ConvBias>> | |||
|             benchmarker(handle); | |||
|     benchmarker.set_display(false); | |||
|     benchmarker.set_times(RUN); | |||
|     for (auto&& arg : args) { | |||
|         //! a throw-away operator is only needed to deduce the dst layout | |||
|         TensorLayout dst_layout; | |||
|         auto opr = handle->create_operator<ConvBias>(); | |||
|         opr->param() = arg.param; | |||
|         opr->deduce_layout( | |||
|                 {arg.src, dtype::Float32()}, {arg.filter, dtype::Float32()}, | |||
|                 {arg.bias, dtype::Float32()}, {}, dst_layout); | |||
|         //! dst.nr_elems * IC * FH * FW * 2, pre-scaled by 1e3 / 2^30 so that | |||
|         //! dividing by a time in ms gives the Gflops value printed below | |||
|         float computations = dst_layout.total_nr_elems() * arg.filter[1] * | |||
|                              arg.filter[2] * arg.filter[3] * 2.0 / | |||
|                              (1024 * 1024 * 1024) * 1e3; | |||
|         benchmarker.set_param(arg.param); | |||
|         //! time reported by algo_benchmark() divided by the configured run count | |||
|         auto used_winograd = | |||
|                 algo_benchmark< | |||
|                         ConvBias, OprWeightPreprocessBenchmarkProxy<ConvBias>, Timer>( | |||
|                         benchmarker, {arg.src, arg.filter, {}, {}, {}}, | |||
|                         algo_name) / | |||
|                 RUN; | |||
|         printf("%s %s: %s: %f ms %f Gflops\n", arg.src.to_string().c_str(), | |||
|                arg.filter.to_string().c_str(), algo_name, used_winograd, | |||
|                computations / used_winograd); | |||
|     } | |||
| } | |||
| #endif // MEGDNN_WITH_BENCHMARK | |||
| template <class Checker> | |||
| @@ -66,6 +66,9 @@ std::vector<conv_bias::TestArg> get_winograd_benchmark_args( | |||
| void benchmark_winograd( | |||
| const char* algo_name, megdnn::Handle* handle, size_t kernel, | |||
| size_t pack_size = 1); | |||
| void benchmark_winograd_weight_preprocess( | |||
| const char* algo_name, megdnn::Handle* handle, size_t kernel, | |||
| size_t pack_size = 1); | |||
| #endif // MEGDNN_WITH_BENCHMARK | |||
| template <class Checker> | |||
| void check_winograd( | |||
| @@ -114,7 +114,10 @@ template < | |||
| bool has_workspace = OprTrait<Opr>::has_workspace, | |||
| bool can_deduce_layout = OprTrait<Opr>::can_deduce_layout> | |||
| struct OprProxyDefaultImpl : public DeduceLayoutProxy<Opr, arity, can_deduce_layout>, | |||
| public ExecProxy<Opr, arity, has_workspace> {}; | |||
| public ExecProxy<Opr, arity, has_workspace> { | |||
| virtual void init(Opr*, const TensorNDArray&) {} | |||
| virtual ~OprProxyDefaultImpl() {} | |||
| }; | |||
| template <typename Opr> | |||
| struct OprProxy : public OprProxyDefaultImpl<Opr> {}; | |||
| @@ -122,6 +125,9 @@ struct OprProxy : public OprProxyDefaultImpl<Opr> {}; | |||
| template <typename Opr> | |||
| struct OprWeightPreprocessProxy : public OprProxyDefaultImpl<Opr> {}; | |||
| template <typename Opr> | |||
| struct OprWeightPreprocessBenchmarkProxy : OprProxyDefaultImpl<Opr> {}; | |||
| template <typename Opr> | |||
| struct OprProxyVectorToSingle {}; | |||
| @@ -134,6 +140,8 @@ struct OprProxy<ElemwiseForward> { | |||
| opr->deduce_layout(inp, layouts.back()); | |||
| } | |||
| static void init(ElemwiseForward*, const TensorNDArray&) {} | |||
| static void exec(ElemwiseForward* opr, const TensorNDArray& tensors) { | |||
| megdnn_assert(tensors.size() >= 2); | |||
| auto inp = tensors; | |||
| @@ -151,6 +159,8 @@ struct OprProxy<ElemwiseMultiType> { | |||
| opr->deduce_layout(inp, layouts.back()); | |||
| } | |||
| static void init(ElemwiseMultiType*, const TensorNDArray&) {} | |||
| static void exec(ElemwiseMultiType* opr, const TensorNDArray& tensors) { | |||
| megdnn_assert(tensors.size() >= 2); | |||
| auto inp = tensors; | |||
| @@ -169,6 +179,8 @@ struct OprProxy<ConcatForward> { | |||
| opr->deduce_layout(inp, layouts.back()); | |||
| } | |||
| static void init(ConcatForward*, const TensorNDArray&) {} | |||
| void exec(ConcatForward* opr, const TensorNDArray& tensors) { | |||
| if (!W.valid()) { | |||
| W = WorkspaceWrapper(opr->handle(), 0); | |||
| @@ -200,6 +212,8 @@ struct OprProxy<CheckNonFinite> { | |||
| opr->deduce_layout(inp, layouts.back()); | |||
| } | |||
| static void init(CheckNonFinite*, const TensorNDArray&) {} | |||
| static void exec(CheckNonFinite* opr, const TensorNDArray& tensors) { | |||
| megdnn_assert(tensors.size() >= 2); | |||
| auto inps = tensors; | |||
| @@ -220,6 +234,9 @@ struct OprProxy<CheckNonFinite> { | |||
| template <> | |||
| struct OprProxy<SplitForward> : DeduceLayoutProxy<SplitForward, 0, false> { | |||
| WorkspaceWrapper W; | |||
| void init(SplitForward*, const TensorNDArray&) {} | |||
| void exec(SplitForward* opr, const TensorNDArray& tensors) { | |||
| megdnn_assert(tensors.size() >= 2); | |||
| if (!W.valid()) { | |||
| @@ -428,7 +445,9 @@ struct OprProxyProfilingBase | |||
| best_algo); | |||
| } | |||
| void exec(Opr* opr, const TensorNDArray& tensors) { | |||
| virtual void init(Opr*, const TensorNDArray&) {} | |||
| virtual void exec(Opr* opr, const TensorNDArray& tensors) { | |||
| megdnn_assert(tensors.size() == arity); | |||
| if (!W.valid()) { | |||
| W = WorkspaceWrapper(opr->handle(), 0); | |||
| @@ -463,6 +482,8 @@ struct OprProxyProfilingBase | |||
| } | |||
| AlgoProxy<Opr, arity>::exec(opr, tensors, W.workspace()); | |||
| } | |||
| virtual ~OprProxyProfilingBase() {} | |||
| }; | |||
| #define DEF_PROF(c) \ | |||
| @@ -491,7 +512,7 @@ template <class Opr> | |||
| struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr> { | |||
| using Base = OprProxyProfilingBase<Opr>; | |||
| static constexpr int arity = OprTrait<Opr>::arity; | |||
| void exec(Opr* opr, const TensorNDArray& tensors) { | |||
| void exec(Opr* opr, const TensorNDArray& tensors) override { | |||
| megdnn_assert(tensors.size() == arity); | |||
| if (!Base::W.valid()) { | |||
| Base::W = WorkspaceWrapper(opr->handle(), 0); | |||
| @@ -584,11 +605,55 @@ struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr> { | |||
| } | |||
| }; | |||
| //! Weight pre-processing proxy intended for benchmarking. | |||
| //! | |||
| //! init() calls weight_prerocess() once and caches the returned tensors in | |||
| //! m_preprocessed_tensors; exec() then runs the operator with that cached | |||
| //! pre-processed filter, so the pre-processing is not repeated per exec(). | |||
| template <class Opr> | |||
| struct OprWeightPreprocessProxyBenchmarkImpl | |||
|         : public OprWeightPreprocessProxyImpl<Opr> { | |||
|     using Base = OprProxyProfilingBase<Opr>; | |||
|     static constexpr int arity = OprTrait<Opr>::arity; | |||
|     //! Pre-process the filter and, when no target algorithm is set, size | |||
|     //! the workspace. | |||
|     //! \param opr      operator to prepare | |||
|     //! \param tensors  operator tensors; must contain exactly \c arity entries | |||
|     void init(Opr* opr, const TensorNDArray& tensors) override { | |||
|         megdnn_assert(tensors.size() == arity); | |||
|         if (!Base::W.valid()) { | |||
|             Base::W = WorkspaceWrapper(opr->handle(), 0); | |||
|         } | |||
|         m_preprocessed_tensors = this->weight_prerocess( | |||
|                 opr, tensors, Base::target_execution_policy.algo); | |||
|         //! wait for the pre-processing to finish before anything is timed | |||
|         megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||
|         if (!Base::target_execution_policy.algo.valid()) { | |||
|             //! layouts and filter descriptor are only needed for this query | |||
|             TensorLayoutArray layouts; | |||
|             for (auto&& tensor : tensors) { | |||
|                 layouts.push_back(tensor.layout); | |||
|             } | |||
|             typename Opr::PreprocessedFilter preprocessed_filter{ | |||
|                     nullptr, *m_preprocessed_tensors}; | |||
|             auto workspace_size = AlgoProxy<Opr, arity>::get_workspace_in_bytes( | |||
|                     opr, layouts, &preprocessed_filter); | |||
|             Base::W.update(workspace_size); | |||
|         } | |||
|     } | |||
|     //! Run the operator with the filter cached by init(). | |||
|     //! \param opr      operator to execute | |||
|     //! \param tensors  operator tensors; must contain exactly \c arity entries | |||
|     void exec(Opr* opr, const TensorNDArray& tensors) override { | |||
|         megdnn_assert(tensors.size() == arity); | |||
|         //! init() must have run first, otherwise there is nothing to dereference | |||
|         megdnn_assert(m_preprocessed_tensors != nullptr); | |||
|         typename Opr::PreprocessedFilter preprocessed_filter{ | |||
|                 nullptr, *m_preprocessed_tensors}; | |||
|         AlgoProxy<Opr, arity>::exec( | |||
|                 opr, tensors, &preprocessed_filter, Base::W.workspace()); | |||
|     } | |||
|     //! tensors returned by weight_prerocess() in init(); null until init() runs | |||
|     std::shared_ptr<TensorNDArray> m_preprocessed_tensors; | |||
| }; | |||
| #define DEF_PROF(c) \ | |||
| template <> \ | |||
| struct OprWeightPreprocessProxy<c> : public OprWeightPreprocessProxyImpl<c> { \ | |||
| using OprWeightPreprocessProxyImpl<c>::OprWeightPreprocessProxyImpl; \ | |||
| } | |||
| }; \ | |||
| template <> \ | |||
| struct OprWeightPreprocessBenchmarkProxy<c> \ | |||
| : public OprWeightPreprocessProxyBenchmarkImpl<c> { \ | |||
| using OprWeightPreprocessProxyBenchmarkImpl< \ | |||
| c>::OprWeightPreprocessProxyBenchmarkImpl; \ | |||
| }; | |||
| DEF_PROF(ConvolutionForward); | |||
| DEF_PROF(ConvBias); | |||
| @@ -16,6 +16,7 @@ private: | |||
| public: | |||
| OprProxy() = default; | |||
| OprProxy(int k) : m_k{k} {} | |||
| //! No-op initialisation hooks (nothing to prepare for TopK). | |||
| //! The TensorNDArray overload has the same parameter types as the init() | |||
| //! hooks of the other proxies; the TensorLayoutArray overload is kept so | |||
| //! any existing caller keeps compiling. | |||
| //! NOTE(review): presumably BenchmarkerBase passes a TensorNDArray to | |||
| //! m_proxy->init() -- confirm against the benchmarker. | |||
| void init(TopK*, const TensorLayoutArray&) {} | |||
| void init(TopK*, const TensorNDArray&) {} | |||
| void deduce_layout(TopK* opr, TensorLayoutArray& layouts) { | |||
| if (layouts.size() == 3) { | |||