Improve the performance of the profiling procedure and make the layout transform test case stable: profiling results in the CI environment are now cached in files.
GitOrigin-RevId: ba2743f35f
tags/v1.7.0
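The diff below adds a file-backed ProfilerCache and a CachedProfiler so that global-layout-transform profiling results can be reused across runs. A minimal usage sketch based on the APIs introduced in this diff (graph construction and solver wiring follow the tests below; the cache file path is hypothetical):

    // Profile with results persisted to a file; passing nullptr instead keeps
    // the cache purely in memory (the default InMemoryPersistentCache).
    auto profiler = ProfilerBase::make_cached_profiler("layout_transform.cache");
    std::unique_ptr<SolverBase> solver{
            new DynamicProgrammingSolver(std::move(profiler))};
    // CachedProfiler::profile() consults ProfilerCache before measuring and
    // dumps the updated cache back to the file when profiling finishes.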
| @@ -9,6 +9,7 @@ dnn/src/cuda/matrix_mul/fp32_simt/kimpl/* binary | |||||
| dnn/src/cuda/sass/prebuilt/map_defs.cpp binary | dnn/src/cuda/sass/prebuilt/map_defs.cpp binary | ||||
| dnn/src/cuda/convolution/backward_data/int8/kimpl/* binary | dnn/src/cuda/convolution/backward_data/int8/kimpl/* binary | ||||
| dnn/src/cuda/elemwise_multi_type/kimpl/* binary | dnn/src/cuda/elemwise_multi_type/kimpl/* binary | ||||
| src/gopt/test/cache_data.h binary | |||||
| tools/mlir/mlir-tblgen filter=lfs diff=lfs merge=lfs -text | tools/mlir/mlir-tblgen filter=lfs diff=lfs merge=lfs -text | ||||
| imperative/python/test/integration/data/*.mge filter=lfs diff=lfs merge=lfs -text | imperative/python/test/integration/data/*.mge filter=lfs diff=lfs merge=lfs -text | ||||
| ci/resource/models/float/mobilenet_v2.pkl filter=lfs diff=lfs merge=lfs -text | ci/resource/models/float/mobilenet_v2.pkl filter=lfs diff=lfs merge=lfs -text | ||||
| @@ -2,13 +2,11 @@ cc_library( | |||||
| name = "mgblar", | name = "mgblar", | ||||
| copts = ["-std=c++14"], | copts = ["-std=c++14"], | ||||
| srcs = [ | srcs = [ | ||||
| "src/infile_persistent_cache.cpp", | |||||
| "src/mgblar.cpp", | "src/mgblar.cpp", | ||||
| "src/json_loader.cpp", | "src/json_loader.cpp", | ||||
| "src/text_table.cpp", | "src/text_table.cpp", | ||||
| ], | ], | ||||
| hdrs = [ | hdrs = [ | ||||
| "src/infile_persistent_cache.h", | |||||
| "src/mgblar.h", | "src/mgblar.h", | ||||
| "src/json_loader.h", | "src/json_loader.h", | ||||
| "src/text_table.h", | "src/text_table.h", | ||||
| @@ -57,11 +55,9 @@ cc_megvii_binary( | |||||
| cc_library( | cc_library( | ||||
| name = "megbrain_ios_lar_lib", | name = "megbrain_ios_lar_lib", | ||||
| srcs = [ | srcs = [ | ||||
| "src/infile_persistent_cache.cpp", | |||||
| "src/mgblar.cpp", | "src/mgblar.cpp", | ||||
| ], | ], | ||||
| hdrs = [ | hdrs = [ | ||||
| "src/infile_persistent_cache.h", | |||||
| "src/mgblar.h", | "src/mgblar.h", | ||||
| ], | ], | ||||
| copts = ["-DMGB_NO_MAIN=1"], | copts = ["-DMGB_NO_MAIN=1"], | ||||
| @@ -10,7 +10,6 @@ | |||||
| */ | */ | ||||
| #include "./mgblar.h" | #include "./mgblar.h" | ||||
| #include "./infile_persistent_cache.h" | |||||
| #include "./json_loader.h" | #include "./json_loader.h" | ||||
| #include "./npy.h" | #include "./npy.h" | ||||
| #include "./text_table.h" | #include "./text_table.h" | ||||
| @@ -30,6 +29,7 @@ | |||||
| #include "megbrain/serialization/extern_c_opr.h" | #include "megbrain/serialization/extern_c_opr.h" | ||||
| #include "megbrain/serialization/serializer.h" | #include "megbrain/serialization/serializer.h" | ||||
| #include "megbrain/utils/debug.h" | #include "megbrain/utils/debug.h" | ||||
| #include "megbrain/utils/infile_persistent_cache.h" | |||||
| #include "megbrain/system.h" | #include "megbrain/system.h" | ||||
| #include "megbrain/version.h" | #include "megbrain/version.h" | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | /** | ||||
| * \file sdk/load-and-run/src/infile_persistent_cache.cpp | |||||
| * \file src/core/impl/utils/infile_persistent_cache.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | ||||
| * | * | ||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | ||||
| @@ -9,7 +9,7 @@ | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
| */ | */ | ||||
| #include "./infile_persistent_cache.h" | |||||
| #include "megbrain/utils/infile_persistent_cache.h" | |||||
| #if defined(_WIN32) | #if defined(_WIN32) | ||||
| #include <io.h> | #include <io.h> | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | /** | ||||
| * \file sdk/load-and-run/src/infile_persistent_cache.h | |||||
| * \file src/core/include/megbrain/utils/infile_persistent_cache.h | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | ||||
| * | * | ||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | ||||
| @@ -70,6 +70,7 @@ public: | |||||
| Maybe<Blob> get(const std::string& category, const Blob& key) override; | Maybe<Blob> get(const std::string& category, const Blob& key) override; | ||||
| void put(const std::string& category, const Blob& key, | void put(const std::string& category, const Blob& key, | ||||
| const Blob& value) override; | const Blob& value) override; | ||||
| bool support_dump_cache() override { return true; } | |||||
| }; | }; | ||||
| } // namespace mgb | } // namespace mgb | ||||
| @@ -39,6 +39,8 @@ public: | |||||
| virtual void put( | virtual void put( | ||||
| const std::string& category, const Blob& key, const Blob& value) = 0; | const std::string& category, const Blob& key, const Blob& value) = 0; | ||||
| virtual bool support_dump_cache() { return false; } | |||||
| //! set an implementation; return the original implementation | //! set an implementation; return the original implementation | ||||
| static std::shared_ptr<PersistentCache> set_impl( | static std::shared_ptr<PersistentCache> set_impl( | ||||
| std::shared_ptr<PersistentCache> impl); | std::shared_ptr<PersistentCache> impl); | ||||
| @@ -0,0 +1,96 @@ | |||||
| /** | |||||
| * \file src/gopt/impl/global_layout_transform/opr_safe_dump.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| #include "./opr_safe_dump.h" | |||||
| #include "megbrain/opr/basic_arith.h" | |||||
| #include "megbrain/opr/dnn/convolution.h" | |||||
| #include "megbrain/opr/dnn/pooling.h" | |||||
| #include "megbrain/opr/imgproc.h" | |||||
| #include "megbrain/opr/nn_int.h" | |||||
| #include "megbrain/opr/tensor_manip.h" | |||||
| #include "midout.h" | |||||
| MIDOUT_DECL(megbrain_opr_safe_dump) | |||||
| #define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_safe_dump, __VA_ARGS__) { | |||||
| #define MIDOUT_E \ | |||||
| } \ | |||||
| MIDOUT_END(); | |||||
| using namespace mgb; | |||||
| using namespace opr; | |||||
| namespace { | |||||
| template <typename Param> | |||||
| void write_param(std::string& data, const Param& param) { | |||||
| megdnn::Algorithm::serialize_write_pod(param, data); | |||||
| } | |||||
| template <> | |||||
| void write_param(std::string& /* data */, const DType& /* dtype */) {} | |||||
| template <class Opr> | |||||
| struct OprDumpImpl { | |||||
| static std::string dump(const cg::OperatorNodeBase* opr_) { | |||||
| MIDOUT_B(Opr) | |||||
| auto&& opr = opr_->cast_final_safe<Opr>(); | |||||
| std::string data; | |||||
| write_param(data, opr.param()); | |||||
| return data; | |||||
| MIDOUT_E | |||||
| } | |||||
| }; | |||||
| #define INST(_Opr) \ | |||||
| template <> \ | |||||
| struct OprDumpImpl<_Opr> { \ | |||||
| static std::string dump(const cg::OperatorNodeBase* opr_) { \ | |||||
| MIDOUT_B(_Opr) \ | |||||
| auto&& opr = opr_->cast_final_safe<_Opr>(); \ | |||||
| std::string data; \ | |||||
| write_param(data, opr.param()); \ | |||||
| using ExecutionPolicy = megdnn::param::ExecutionPolicy; \ | |||||
| ExecutionPolicy policy{ \ | |||||
| opr.execution_policy_transient().strategy, \ | |||||
| opr.execution_policy_transient().workspace_limit}; \ | |||||
| write_param(data, policy); \ | |||||
| return data; \ | |||||
| MIDOUT_E \ | |||||
| } \ | |||||
| }; | |||||
| INST(Convolution); | |||||
| INST(ConvBiasForward); | |||||
| INST(ConvolutionBackwardData); | |||||
| INST(PoolingForward); | |||||
| #undef INST | |||||
| } // namespace | |||||
| namespace mgb { | |||||
| namespace gopt { | |||||
| namespace intl { | |||||
| std::string opr_safe_dump(const cg::OperatorNodeBase* opr) { | |||||
| #define cb(_Opr) \ | |||||
| if (opr->dyn_typeinfo() == _Opr::typeinfo()) { \ | |||||
| return OprDumpImpl<_Opr>::dump(opr); \ | |||||
| } else | |||||
| FOREACH_SUPPORTED_OPR(cb) { | |||||
| mgb_throw(InternalError, "unsupported operator(got:%s)", | |||||
| opr->dyn_typeinfo()->name); | |||||
| } | |||||
| #undef cb | |||||
| } | |||||
| } // namespace intl | |||||
| } // namespace gopt | |||||
| } // namespace mgb | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,30 @@ | |||||
| /** | |||||
| * \file src/gopt/impl/global_layout_transform/opr_safe_dump.h | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| #pragma once | |||||
| #include "megbrain/graph.h" | |||||
| namespace mgb { | |||||
| namespace gopt { | |||||
| namespace intl { | |||||
| #define FOREACH_SUPPORTED_OPR(cb) \ | |||||
| cb(Convolution) cb(ConvBiasForward) cb(ConvolutionBackwardData) \ | |||||
| cb(PoolingForward) cb(WarpPerspective) cb(Resize) cb(Elemwise) \ | |||||
| cb(ElemwiseMultiType) cb(Concat) cb(PowC) cb(TypeCvt) | |||||
| std::string opr_safe_dump(const cg::OperatorNodeBase* opr); | |||||
| } // namespace intl | |||||
| } // namespace gopt | |||||
| } // namespace mgb | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,184 @@ | |||||
| /** | |||||
| * \file src/gopt/impl/profiler_cache.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| #include "./opr_safe_dump.h" | |||||
| #include "megbrain/gopt/profiler.h" | |||||
| #include "megbrain/comp_node_env.h" | |||||
| using namespace mgb; | |||||
| using namespace gopt; | |||||
| using ReformatKey = ReformatManager::ReformatKey; | |||||
| // =================== ProfilerCache ====================== | |||||
| void ProfilerCache::Key::build_blob_from_opr() { | |||||
| auto&& opr = m_key_impl.opr_key.opr; | |||||
| // process opr type | |||||
| auto type = opr->dyn_typeinfo()->name; | |||||
| size_t type_size = strlen(type); | |||||
| // process opr param | |||||
| auto data = intl::opr_safe_dump(opr); | |||||
| size_t param_size = data.size(); | |||||
| size_t nr_inputs = opr->input().size(); | |||||
| size_t nr_outputs = opr->usable_output().size(); | |||||
| size_t nr_layouts = nr_inputs + nr_outputs; | |||||
| m_blob_storage.reserve(sizeof(TensorLayout) * 3 * nr_layouts + type_size + | |||||
| param_size); | |||||
| // serialize opr type | |||||
| m_blob_storage.append(type, type_size); | |||||
| // serialize param | |||||
| const char* data_ptr = reinterpret_cast<const char*>(data.data()); | |||||
| m_blob_storage.append(data_ptr, param_size); | |||||
| // serialize layouts | |||||
| auto append_layout = [this](const VarNode* v) { | |||||
| TensorLayout ly{v->shape(), v->dtype(), v->format()}; | |||||
| for (size_t i = 0; i < ly.ndim; ++i) { | |||||
| if (i) | |||||
| m_blob_storage.push_back(','); | |||||
| m_blob_storage.append(std::to_string(ly.shape[i])); | |||||
| } | |||||
| if (!ly.is_contiguous()) { | |||||
| m_blob_storage.push_back(';'); | |||||
| for (size_t i = 0; i < ly.ndim; ++i) { | |||||
| if (i) | |||||
| m_blob_storage.push_back(','); | |||||
| m_blob_storage.append(std::to_string(ly.stride[i])); | |||||
| } | |||||
| } | |||||
| m_blob_storage.push_back(';'); | |||||
| m_blob_storage.append(ly.dtype.name()); | |||||
| m_blob_storage.push_back('|'); | |||||
| }; | |||||
| for (size_t i = 0; i < nr_inputs; ++i) { | |||||
| append_layout(opr->input(i)); | |||||
| } | |||||
| for (size_t i = 0; i < nr_outputs; ++i) { | |||||
| append_layout(opr->output(i)); | |||||
| } | |||||
| // serialize opr_format | |||||
| m_blob_storage.append(std::to_string( | |||||
| static_cast<uint32_t>(m_key_impl.opr_key.opr_format))); | |||||
| // serialize extra_attribute | |||||
| m_blob_storage.append(std::to_string( | |||||
| static_cast<uint32_t>(m_key_impl.opr_key.extra_attribute))); | |||||
| } | |||||
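For reference, the key blob assembled above is a flat byte string: the opr type name, the raw param bytes from intl::opr_safe_dump, one group per input/output layout, then the numeric opr_format and extra_attribute codes. A hypothetical Float32 var of shape (64, 3, 224, 224) would contribute a layout group of the form:

    // contiguous:     64,3,224,224;Float32|
    // non-contiguous: 64,3,224,224;<s0>,<s1>,<s2>,<s3>;Float32|
    // (shape first; strides are inserted only when the layout is not
    //  contiguous; then the dtype name and a '|' terminator)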
| void ProfilerCache::Key::build_category(CompNode cn) { | |||||
| m_category = "layout_transform_profile:"; | |||||
| auto&& env = CompNodeEnv::from_comp_node(cn); | |||||
| switch (env.property().type) { | |||||
| #if MGB_CUDA | |||||
| case CompNode::DeviceType::CUDA: { | |||||
| auto&& prop = env.cuda_env().device_prop; | |||||
| m_category += ssprintf("plat=cuda;dev=%s;cap=%d.%d", prop.name, | |||||
| prop.major, prop.minor); | |||||
| break; | |||||
| } | |||||
| #endif | |||||
| case CompNode::DeviceType::CPU: | |||||
| m_category += "plat=cpu"; | |||||
| break; | |||||
| default: | |||||
| mgb_throw(MegBrainError, | |||||
| "unsupported comp node for global layout transform " | |||||
| "profiler cache category"); | |||||
| } | |||||
| } | |||||
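Concretely, the category built here looks as follows (the CUDA device name and compute capability are hypothetical):

    // CUDA: layout_transform_profile:plat=cuda;dev=<device name>;cap=7.5
    // CPU:  layout_transform_profile:plat=cpu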
| void ProfilerCache::Key::build_blob_from_var() { | |||||
| auto v = m_key_impl.var_key.var; | |||||
| // serialize layouts | |||||
| auto append_layout = [this](const VarNode* v) { | |||||
| TensorLayout ly{v->shape(), v->dtype(), v->format()}; | |||||
| for (size_t i = 0; i < ly.ndim; ++i) { | |||||
| if (i) | |||||
| m_blob_storage.push_back(','); | |||||
| m_blob_storage.append(std::to_string(ly.shape[i])); | |||||
| } | |||||
| if (!ly.is_contiguous()) { | |||||
| m_blob_storage.push_back(';'); | |||||
| for (size_t i = 0; i < ly.ndim; ++i) { | |||||
| if (i) | |||||
| m_blob_storage.push_back(','); | |||||
| m_blob_storage.append(std::to_string(ly.stride[i])); | |||||
| } | |||||
| } | |||||
| m_blob_storage.push_back(';'); | |||||
| m_blob_storage.append(ly.dtype.name()); | |||||
| m_blob_storage.push_back('|'); | |||||
| }; | |||||
| append_layout(v); | |||||
| // serialize reformat key | |||||
| m_blob_storage.append(m_key_impl.var_key.key.to_string()); | |||||
| } | |||||
| const std::string& ProfilerCache::Key::category() const { | |||||
| mgb_assert(!m_category.empty()); | |||||
| return m_category; | |||||
| } | |||||
| PersistentCache::Blob ProfilerCache::Key::blob() const { | |||||
| mgb_assert(!m_blob_storage.empty()); | |||||
| return {m_blob_storage.data(), m_blob_storage.size()}; | |||||
| } | |||||
| ProfilerCache& ProfilerCache::inst() { | |||||
| static ProfilerCache inst; | |||||
| return inst; | |||||
| } | |||||
| ProfilerCache& ProfilerCache::set_impl(std::unique_ptr<PersistentCache> impl) { | |||||
| mgb_assert(impl != nullptr); | |||||
| m_impl.swap(impl); | |||||
| return *this; | |||||
| } | |||||
| void ProfilerCache::dump_cache(const char* path) { | |||||
| mgb_assert(m_impl->support_dump_cache(), | |||||
| "current impl of ProfilerCache does not support dump cache to " | |||||
| "file."); | |||||
| auto cache = static_cast<InFilePersistentCache*>(m_impl.get()); | |||||
| cache->dump_cache(path); | |||||
| } | |||||
| Maybe<ProfilerCache::Result> ProfilerCache::get(const Key& key) { | |||||
| auto raw_buf = m_impl->get(key.category(), key.blob()); | |||||
| if (!raw_buf.valid()) | |||||
| return None; | |||||
| // data type of cost is float | |||||
| auto buf = static_cast<const uint8_t*>(raw_buf->ptr); | |||||
| auto size = raw_buf->size; | |||||
| mgb_assert(buf && size == sizeof(float), | |||||
| "ProfileCache invalid value: ptr=%p, size=%zu", buf, size); | |||||
| auto read_f32 = [&]() { | |||||
| auto ret = *reinterpret_cast<const float*>(buf); | |||||
| return ret; | |||||
| }; | |||||
| auto cost = read_f32(); | |||||
| return cost; | |||||
| } | |||||
| void ProfilerCache::put(const Key& key, Result& result) { | |||||
| std::string val; | |||||
| megdnn::Algorithm::serialize_write_pod(result, val); | |||||
| m_impl->put(key.category(), key.blob(), {val.data(), val.size()}); | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -154,69 +154,61 @@ void MarkInputContiguous::init_output_static_infer_desc() { | |||||
| } // namespace | } // namespace | ||||
| /* ================== ProfilerImpl =================*/ | /* ================== ProfilerImpl =================*/ | ||||
| class ProfilerImpl final : public ProfilerBase { | |||||
| public: | |||||
| ProfilerImpl(int runs = 10) : m_runs{runs} {}; | |||||
| ~ProfilerImpl() = default; | |||||
| ProfilingResult profile(const Problem& problem) const override; | |||||
| private: | |||||
| static constexpr float PROFILE_TIME_OUT = 1e7; | |||||
| using ReformatAttribute = ReformatKey::Attribute; | |||||
| /*! | |||||
| * \brief profile opr format agnostic operators (like elemwise, elemwise | |||||
| * multi type, typecvt etc.) | |||||
| * | |||||
| * \param opr pointer to the operator node to be profiled | |||||
| * \param base_format the original tensor format of the operator node. | |||||
| * \param available_tensor_formats the available tensor formats | |||||
| * \return the operator node record | |||||
| */ | |||||
| OperatorNodeRecord profile_operator( | |||||
| const OperatorNodeBase* opr, TensorFormats base_format, | |||||
| const SmallVector<TensorFormats>& available_tensor_formats, | |||||
| ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const; | |||||
| float profile_operator( | |||||
| const OperatorNodeBase* opr, TensorFormats base_format, | |||||
| TensorFormats tensor_format, | |||||
| ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const; | |||||
| /*! | |||||
| * \brief profile opr format aware operators (like conv, deconv, conv_bias, | |||||
| * etc.) | |||||
| * | |||||
| * \param opr pointer to the operator node to be profiled | |||||
| * \param base_config the tensor formats configuration of base opr format | |||||
| * \param config all the available configuration | |||||
| * \return the operator node record | |||||
| */ | |||||
| OperatorNodeRecord profile_operator( | |||||
| const OperatorNodeBase* opr, | |||||
| const OprTensorFormatsConfiguration& base_config, | |||||
| const SmallVector<OprTensorFormatsConfiguration>& available_configs, | |||||
| ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const; | |||||
| float profile_operator( | |||||
| const OperatorNodeBase* opr, | |||||
| const OprTensorFormatsConfiguration& base_config, | |||||
| const OprTensorFormatsConfiguration& config, | |||||
| ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const; | |||||
| /*! | |||||
| * \brief profile layout transform of the var node | |||||
| * | |||||
| * \param var pointer to the var node to be profiled | |||||
| * \param base_format the original tensor formats in which the var node is | |||||
| * stored \param available_tensor_formats the available tensor formats | |||||
| * \param extra_attribute the extra attributes (options) of the problem | |||||
| * \return the var node record | |||||
| */ | |||||
| VarNodeRecord profile_var_node( | |||||
| const VarNode* var, TensorFormats base_format, | |||||
| const SmallVector<TensorFormats>& available_tensor_formats, | |||||
| ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const; | |||||
| float profile_var_node( | |||||
| const VarNode* var, TensorFormats base_format, | |||||
| const ReformatKey& key) const; | |||||
| int m_runs; /// sample times of the profiler | |||||
| }; | |||||
| ProfilerImpl::ProfilerImpl(int runs, float opr_threshold, | |||||
| float var_node_threshold) | |||||
| : m_opr_threshold{opr_threshold}, | |||||
| m_var_node_threshold{var_node_threshold}, | |||||
| m_runs{runs} { | |||||
| m_opr_filter = [this](const OperatorNodeBase* opr, | |||||
| OperatorNodeBase* new_opr) { | |||||
| /// \note: for performance reasons, we skip the naive NCHW kernels for | |||||
| /// conv bias on the CUDA platform. TODO: remove this later | |||||
| if (auto conv = try_cast_as_op<opr::ConvBiasForward>(new_opr)) { | |||||
| if (conv->output(0)->comp_node().device_type() == | |||||
| CompNode::DeviceType::CUDA && | |||||
| conv->input(0)->dtype().category() == | |||||
| DTypeCategory::QUANTIZED && | |||||
| conv->param().format == OprFormat::NCHW) { | |||||
| return false; | |||||
| } | |||||
| } | |||||
| float comp1 = m_opr_footprint.get_computation( | |||||
| const_cast<OperatorNodeBase*>(opr)); | |||||
| float comp2 = m_opr_footprint.get_computation(new_opr); | |||||
| if (comp2 > m_opr_threshold * comp1) | |||||
| return false; | |||||
| return true; | |||||
| }; | |||||
| m_var_node_filter = [this](const VarNode* var, TensorShape from, | |||||
| TensorShape to, ReformatKey key) { | |||||
| /// \note: due to the alignment requirement of low-bit tensor, we skip | |||||
| /// some layout transform for low-bit tensors. The skipped layout | |||||
| /// transforms do not have corresponding dnn kernel and cannot be | |||||
| /// implemented by tensor manip operators (like reshape, dimshuffle, | |||||
| /// subtensor, etc.). | |||||
| if (var->dtype().enumv() == DTypeEnum::QuantizedS4 || | |||||
| var->dtype().enumv() == DTypeEnum::Quantized4Asymm) { | |||||
| if (key.input_format == TensorFormats::NCHW && | |||||
| key.output_format != TensorFormats::NHWC && | |||||
| key.output_format != TensorFormats::NCHWc64) { | |||||
| return false; | |||||
| } | |||||
| if (key.output_format == TensorFormats::NCHW && | |||||
| key.input_format != TensorFormats::NHWC && | |||||
| key.input_format != TensorFormats::NCHWc64) { | |||||
| return false; | |||||
| } | |||||
| } | |||||
| TensorLayout orig_ly = {var->shape(), var->dtype()}, | |||||
| from_ly = {from, var->dtype()}, to_ly = {to, var->dtype()}; | |||||
| float orig_memory = orig_ly.span().dist_byte() * 2.f; | |||||
| float reformat_memory = | |||||
| from_ly.span().dist_byte() + to_ly.span().dist_byte(); | |||||
| if (reformat_memory > orig_memory * m_var_node_threshold) | |||||
| return false; | |||||
| return true; | |||||
| }; | |||||
| } | |||||
| ProfilerImpl::OperatorNodeRecord ProfilerImpl::profile_operator( | ProfilerImpl::OperatorNodeRecord ProfilerImpl::profile_operator( | ||||
| const OperatorNodeBase* opr, TensorFormats base_format, | const OperatorNodeBase* opr, TensorFormats base_format, | ||||
| @@ -507,56 +499,6 @@ ProfilerImpl::ProfilingResult ProfilerImpl::profile(const Problem& problem) cons | |||||
| } | } | ||||
| /* ================== ProfilerBase =================*/ | /* ================== ProfilerBase =================*/ | ||||
| ProfilerBase::ProfilerBase(float opr_threshold, float var_node_threshold) | |||||
| : m_opr_threshold{opr_threshold}, m_var_node_threshold{var_node_threshold} { | |||||
| m_opr_filter = [this](const OperatorNodeBase* opr, OperatorNodeBase* new_opr) { | |||||
| /// \note: for the considerations of performance, we skip nchw(naive) | |||||
| /// kernels for conv bias on CUDA platform. to remove this later | |||||
| if (auto conv = try_cast_as_op<opr::ConvBiasForward>(new_opr)) { | |||||
| if (conv->output(0)->comp_node().device_type() == | |||||
| CompNode::DeviceType::CUDA && | |||||
| conv->input(0)->dtype().category() == DTypeCategory::QUANTIZED && | |||||
| conv->param().format == OprFormat::NCHW) { | |||||
| return false; | |||||
| } | |||||
| } | |||||
| float comp1 = | |||||
| m_opr_footprint.get_computation(const_cast<OperatorNodeBase*>(opr)); | |||||
| float comp2 = m_opr_footprint.get_computation(new_opr); | |||||
| if (comp2 > m_opr_threshold * comp1) | |||||
| return false; | |||||
| return true; | |||||
| }; | |||||
| m_var_node_filter = [this](const VarNode* var, TensorShape from, TensorShape to, | |||||
| ReformatKey key) { | |||||
| /// \note: due to the alignment requirement of low-bit tensor, we skip | |||||
| /// some layout transform for low-bit tensors. The skipped layout | |||||
| /// transforms do not have corresponding dnn kernel and cannot be | |||||
| /// implemented by tensor manip operators (like reshape, dimshuffle, | |||||
| /// subtensor, etc.). | |||||
| if (var->dtype().enumv() == DTypeEnum::QuantizedS4 || | |||||
| var->dtype().enumv() == DTypeEnum::Quantized4Asymm) { | |||||
| if (key.input_format == TensorFormats::NCHW && | |||||
| key.output_format != TensorFormats::NHWC && | |||||
| key.output_format != TensorFormats::NCHWc64) { | |||||
| return false; | |||||
| } | |||||
| if (key.output_format == TensorFormats::NCHW && | |||||
| key.input_format != TensorFormats::NHWC && | |||||
| key.input_format != TensorFormats::NCHWc64) { | |||||
| return false; | |||||
| } | |||||
| } | |||||
| TensorLayout orig_ly = {var->shape(), var->dtype()}, | |||||
| from_ly = {from, var->dtype()}, to_ly = {to, var->dtype()}; | |||||
| float orig_memory = orig_ly.span().dist_byte() * 2.f; | |||||
| float reformat_memory = from_ly.span().dist_byte() + to_ly.span().dist_byte(); | |||||
| if (reformat_memory > orig_memory * m_var_node_threshold) | |||||
| return false; | |||||
| return true; | |||||
| }; | |||||
| } | |||||
| std::string ProfilerBase::OperatorNodeRecord::to_string() const { | std::string ProfilerBase::OperatorNodeRecord::to_string() const { | ||||
| auto str = ssprintf( | auto str = ssprintf( | ||||
| "\nopr type: %s\nopr name: %s\ninputs:\n", opr->dyn_typeinfo()->name, | "\nopr type: %s\nopr name: %s\ninputs:\n", opr->dyn_typeinfo()->name, | ||||
| @@ -595,4 +537,68 @@ std::unique_ptr<ProfilerBase> ProfilerBase::make_profiler() { | |||||
| return std::make_unique<ProfilerImpl>(); | return std::make_unique<ProfilerImpl>(); | ||||
| } | } | ||||
| std::unique_ptr<ProfilerBase> ProfilerBase::make_cached_profiler( | |||||
| const char* path) { | |||||
| return std::make_unique<CachedProfiler>(path); | |||||
| } | |||||
| /* ================== CachedProfiler =================*/ | |||||
| CachedProfiler::CachedProfiler(const char* path, int runs, float opr_threshold, | |||||
| float var_node_threshold) | |||||
| : ProfilerImpl(runs, opr_threshold, var_node_threshold), m_path{path} { | |||||
| if (m_path != nullptr) { // file cache | |||||
| ProfilerCache::inst().set_impl( | |||||
| std::make_unique<InFilePersistentCache>(m_path)); | |||||
| } | |||||
| } | |||||
| CachedProfiler::ProfilingResult CachedProfiler::profile( | |||||
| const Problem& problem) const { | |||||
| auto ret = ProfilerImpl::profile(problem); | |||||
| if (m_path != nullptr) | |||||
| ProfilerCache::inst().dump_cache(m_path); | |||||
| return ret; | |||||
| } | |||||
| float CachedProfiler::profile_operator( | |||||
| const OperatorNodeBase* opr, TensorFormats base_format, | |||||
| TensorFormats tensor_format, ReformatAttribute extra_attribute) const { | |||||
| ProfilerCache::Key key{opr, tensor_formats_to_opr_format(tensor_format), | |||||
| extra_attribute}; | |||||
| auto ret = ProfilerCache::inst().get(key); | |||||
| if (ret.valid()) | |||||
| return ret.val(); | |||||
| auto rst = ProfilerImpl::profile_operator(opr, base_format, tensor_format, | |||||
| extra_attribute); | |||||
| ProfilerCache::inst().put(key, rst); | |||||
| return rst; | |||||
| } | |||||
| float CachedProfiler::profile_operator( | |||||
| const OperatorNodeBase* opr, | |||||
| const OprTensorFormatsConfiguration& base_config, | |||||
| const OprTensorFormatsConfiguration& config, | |||||
| ReformatAttribute extra_attribute) const { | |||||
| ProfilerCache::Key key{opr, config.opr_format, extra_attribute}; | |||||
| auto ret = ProfilerCache::inst().get(key); | |||||
| if (ret.valid()) | |||||
| return ret.val(); | |||||
| auto rst = ProfilerImpl::profile_operator(opr, base_config, config, | |||||
| extra_attribute); | |||||
| ProfilerCache::inst().put(key, rst); | |||||
| return rst; | |||||
| } | |||||
| float CachedProfiler::profile_var_node(const VarNode* var, | |||||
| TensorFormats base_format, | |||||
| const ReformatKey& key) const { | |||||
| ProfilerCache::Key pf_key{var, key}; | |||||
| auto ret = ProfilerCache::inst().get(pf_key); | |||||
| if (ret.valid()) | |||||
| return ret.val(); | |||||
| auto rst = ProfilerImpl::profile_var_node(var, base_format, key); | |||||
| ProfilerCache::inst().put(pf_key, rst); | |||||
| return rst; | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | // vim: syntax=cpp.doxygen | ||||
| @@ -18,11 +18,13 @@ | |||||
| #include "megbrain/gopt/subgraph_extractor.h" | #include "megbrain/gopt/subgraph_extractor.h" | ||||
| #include "megbrain/opr/dnn/convolution.h" | #include "megbrain/opr/dnn/convolution.h" | ||||
| #include "megbrain/plugin/opr_footprint.h" | #include "megbrain/plugin/opr_footprint.h" | ||||
| #include "megbrain/utils/infile_persistent_cache.h" | |||||
| namespace mgb { | namespace mgb { | ||||
| namespace gopt { | namespace gopt { | ||||
| class Problem; | class Problem; | ||||
| class CachedProfiler; | |||||
| /*! | /*! | ||||
| * \brief A profiler that collects all the performance data to describe the | * \brief A profiler that collects all the performance data to describe the | ||||
| @@ -75,22 +77,245 @@ public: | |||||
| using VarNodeFilter = thin_function<bool( | using VarNodeFilter = thin_function<bool( | ||||
| const VarNode*, TensorShape, TensorShape, ReformatManager::ReformatKey)>; | const VarNode*, TensorShape, TensorShape, ReformatManager::ReformatKey)>; | ||||
| ProfilerBase(float opr_threshold = 2.f, float var_node_threshold = 2.f); | |||||
| ProfilerBase(OprFilter opr_filter, VarNodeFilter var_node_filter = {}) | |||||
| : m_opr_filter{std::move(opr_filter)}, | |||||
| m_var_node_filter{std::move(var_node_filter)} {} | |||||
| ProfilerBase() = default; | |||||
| virtual ~ProfilerBase() = default; | virtual ~ProfilerBase() = default; | ||||
| virtual ProfilingResult profile(const Problem& problem) const = 0; | virtual ProfilingResult profile(const Problem& problem) const = 0; | ||||
| ProfilerBase& set_opr_filter(const OprFilter& opr_filter) { | |||||
| m_opr_filter = opr_filter; | |||||
| return *this; | |||||
| } | |||||
| ProfilerBase& set_var_node_filter(const VarNodeFilter& var_node_filter) { | |||||
| m_var_node_filter = var_node_filter; | |||||
| return *this; | |||||
| } | |||||
| static std::unique_ptr<ProfilerBase> make_profiler(); | static std::unique_ptr<ProfilerBase> make_profiler(); | ||||
| static std::unique_ptr<ProfilerBase> make_cached_profiler( | |||||
| const char* path = nullptr); | |||||
| protected: | protected: | ||||
| OprFilter m_opr_filter; | OprFilter m_opr_filter; | ||||
| VarNodeFilter m_var_node_filter; | VarNodeFilter m_var_node_filter; | ||||
| float m_opr_threshold; | |||||
| float m_var_node_threshold; | |||||
| }; | |||||
| private: | |||||
| /*! \brief A default profiler impl | |||||
| */ | |||||
| class ProfilerImpl : public ProfilerBase { | |||||
| public: | |||||
| ProfilerImpl(int runs = 10, float opr_threshold = 2.f, | |||||
| float var_node_threshold = 2.f); | |||||
| ~ProfilerImpl() = default; | |||||
| ProfilingResult profile(const Problem& problem) const override; | |||||
| protected: | |||||
| static constexpr float PROFILE_TIME_OUT = 1e7; | |||||
| using ReformatKey = ReformatManager::ReformatKey; | |||||
| using ReformatAttribute = ReformatKey::Attribute; | |||||
| /*! | |||||
| * \brief profile opr format agnostic operators (like elemwise, elemwise | |||||
| * multi type, typecvt etc.) | |||||
| * | |||||
| * \param opr pointer to the operator node to be profiled | |||||
| * \param base_format the original tensor format of the operator node. | |||||
| * \param available_tensor_formats the available tensor formats | |||||
| * \return the operator node record | |||||
| */ | |||||
| OperatorNodeRecord profile_operator( | |||||
| const OperatorNodeBase* opr, TensorFormats base_format, | |||||
| const SmallVector<TensorFormats>& available_tensor_formats, | |||||
| ReformatAttribute extra_attribute = | |||||
| ReformatAttribute::DEFAULT) const; | |||||
| /*! | |||||
| * \brief profile opr format agnostic operators (like elemwise, elemwise multi type, typecvt etc.) | |||||
| * | |||||
| * \param opr pointer to the operator to be profiled | |||||
| * \param base_format the original tensor format of the operator node. | |||||
| * \param tensor_format the tensor format to be profiled | |||||
| * \param extra_attribute indicates whether to use image objects on OpenCL or to automatically pad the NHWC layout | |||||
| * \return elapsed time of operator in the given tensor format configuration | |||||
| */ | |||||
| virtual float profile_operator( | |||||
| const OperatorNodeBase* opr, TensorFormats base_format, | |||||
| TensorFormats tensor_format, | |||||
| ReformatAttribute extra_attribute = | |||||
| ReformatAttribute::DEFAULT) const; | |||||
| /*! | |||||
| * \brief profile opr format aware operators (like conv, deconv, conv_bias, | |||||
| * etc.) | |||||
| * | |||||
| * \param opr pointer to the operator node to be profiled | |||||
| * \param base_config the tensor formats configuration of base opr format | |||||
| * \param available_configs all the available configurations | |||||
| * \return the operator node record | |||||
| */ | |||||
| OperatorNodeRecord profile_operator( | |||||
| const OperatorNodeBase* opr, | |||||
| const OprTensorFormatsConfiguration& base_config, | |||||
| const SmallVector<OprTensorFormatsConfiguration>& available_configs, | |||||
| ReformatAttribute extra_attribute = | |||||
| ReformatAttribute::DEFAULT) const; | |||||
| /*! | |||||
| * \brief profile opr format aware operators (like conv, deconv, conv_bias, resize, warp etc.) | |||||
| * | |||||
| * \param opr pointer to the operator to be profiled | |||||
| * \param base_config the original opr format configuration of the operator node, | |||||
| * \param config the opr format configuration to be profiled | |||||
| * \param extra_attribute indicates whether to use image objects on OpenCL or to automatically pad the NHWC layout | |||||
| * \return elapsed time of operator in the given opr format configuration | |||||
| */ | |||||
| virtual float profile_operator(const OperatorNodeBase* opr, | |||||
| const OprTensorFormatsConfiguration& base_config, | |||||
| const OprTensorFormatsConfiguration& config, | |||||
| ReformatAttribute extra_attribute = | |||||
| ReformatAttribute::DEFAULT) const; | |||||
| /*! | |||||
| * \brief profile layout transform of the var node | |||||
| * | |||||
| * \param var pointer to the var node to be profiled | |||||
| * \param base_format the original tensor formats in which the var node is | |||||
| * stored | |||||
| * \param available_tensor_formats the available tensor formats | |||||
| * \param extra_attribute the extra attributes (options) of the problem | |||||
| * \return the var node record | |||||
| */ | |||||
| VarNodeRecord profile_var_node( | |||||
| const VarNode* var, TensorFormats base_format, | |||||
| const SmallVector<TensorFormats>& available_tensor_formats, | |||||
| ReformatAttribute extra_attribute = | |||||
| ReformatAttribute::DEFAULT) const; | |||||
| /*! | |||||
| * \brief profile layout transform of the var node | |||||
| * | |||||
| * \param var pointer to the var node to be profiled | |||||
| * \param base_format the original tensor formats in which the var node is | |||||
| * stored | |||||
| * \param key a ReformatKey that identifies the information/attributes of the layout transform | |||||
| * \return elapsed time of the layout transform | |||||
| */ | |||||
| virtual float profile_var_node(const VarNode* var, | |||||
| TensorFormats base_format, | |||||
| const ReformatKey& key) const; | |||||
| OprFootprint m_opr_footprint; | OprFootprint m_opr_footprint; | ||||
| float m_opr_threshold; /// a threshold: when the computation of the newly | |||||
| /// created operator built in some opr format | |||||
| /// configuration is more than m_opr_threshold | |||||
| /// times that of the original operator, the opr | |||||
| /// format configuration is skipped (i.e. its | |||||
| /// cost is treated as infinite) | |||||
| float m_var_node_threshold; /// a threshold: when the memory footprint of | |||||
| /// the layout transform of the var node is more | |||||
| /// than m_var_node_threshold times that of the | |||||
| /// var node itself, the layout transform is | |||||
| /// skipped (i.e. its cost is treated as infinite) | |||||
| int m_runs; /// number of runs sampled by the profiler | |||||
| }; | |||||
| /*! | |||||
| * \brief a ProfilerCache that manages the profiling results of operator in | |||||
| * different layouts and of layout transform of var nodes. | |||||
| */ | |||||
| class ProfilerCache : public NonCopyableObj { | |||||
| ProfilerCache() : m_impl{std::make_unique<InMemoryPersistentCache>()} {}; | |||||
| public: | |||||
| using ReformatKey = ReformatManager::ReformatKey; | |||||
| using ReformatAttribute = ReformatKey::Attribute; | |||||
| using OprFormat = ProfilerBase::OprFormat; | |||||
| class Key final : public NonCopyableObj { | |||||
| std::string m_blob_storage; | |||||
| std::string m_category; | |||||
| struct OprKey { | |||||
| const OperatorNodeBase* opr; | |||||
| OprFormat opr_format; | |||||
| ReformatAttribute extra_attribute; | |||||
| }; | |||||
| struct VarKey { | |||||
| const VarNode* var; | |||||
| ReformatKey key; | |||||
| }; | |||||
| union KeyImpl { | |||||
| OprKey opr_key; | |||||
| VarKey var_key; | |||||
| KeyImpl() { std::memset(this, 0, sizeof(KeyImpl)); } | |||||
| }; | |||||
| KeyImpl m_key_impl; | |||||
| void build_blob_from_opr(); | |||||
| void build_blob_from_var(); | |||||
| void build_category(CompNode cn); | |||||
| public: | |||||
| Key(const OperatorNodeBase* opr, OprFormat opr_format, | |||||
| ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) { | |||||
| m_key_impl.opr_key = {opr, opr_format, extra_attribute}; | |||||
| build_blob_from_opr(); | |||||
| mgb_assert( | |||||
| opr->node_prop().contain( | |||||
| cg::OperatorNodeProp::Flag::SINGLE_COMP_NODE), | |||||
| "operator with multiple comp node is not supported(opr:%s)", | |||||
| opr->cname()); | |||||
| // here, we assume that the operator to be profiled has only one | |||||
| // comp node | |||||
| build_category(opr->output(0)->comp_node()); | |||||
| } | |||||
| Key(const VarNode* var, ReformatKey key) { | |||||
| m_key_impl.var_key = {var, key}; | |||||
| build_blob_from_var(); | |||||
| build_category(var->comp_node()); | |||||
| } | |||||
| const std::string& category() const; | |||||
| PersistentCache::Blob blob() const; | |||||
| }; | |||||
| using Result = float; | |||||
| public: | |||||
| static ProfilerCache& inst(); | |||||
| ProfilerCache& set_impl(std::unique_ptr<PersistentCache> impl); | |||||
| void dump_cache(const char* path); | |||||
| Maybe<Result> get(const Key& key); | |||||
| void put(const Key& key, Result& result); | |||||
| private: | |||||
| std::unique_ptr<PersistentCache> m_impl; | |||||
| }; | |||||
| class CachedProfiler final : public ProfilerImpl { | |||||
| public: | |||||
| CachedProfiler(const char* path = nullptr, int runs = 10, | |||||
| float opr_threshold = 2.f, float var_node_threshold = 2.f); | |||||
| ProfilingResult profile(const Problem& problem) const override; | |||||
| private: | |||||
| float profile_operator(const OperatorNodeBase* opr, | |||||
| TensorFormats base_format, | |||||
| TensorFormats tensor_format, | |||||
| ReformatAttribute extra_attribute = | |||||
| ReformatAttribute::DEFAULT) const override; | |||||
| float profile_operator(const OperatorNodeBase* opr, | |||||
| const OprTensorFormatsConfiguration& base_config, | |||||
| const OprTensorFormatsConfiguration& config, | |||||
| ReformatAttribute extra_attribute = | |||||
| ReformatAttribute::DEFAULT) const override; | |||||
| float profile_var_node(const VarNode* var, TensorFormats base_format, | |||||
| const ReformatKey& key) const override; | |||||
| const char* m_path; | |||||
| }; | }; | ||||
| } // namespace gopt | } // namespace gopt | ||||
| @@ -0,0 +1,93 @@ | |||||
| #!/usr/bin/env python3 | |||||
| # MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| # | |||||
| # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, | |||||
| # software distributed under the License is distributed on an | |||||
| # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # To keep the profiling results used by the global graph optimization immune to the CI environment, | |||||
| # the fixed profiling results are stored in a cache; each test run reads the profiling results from | |||||
| # the in-memory cache and performs the global graph optimization based on them. | |||||
| # This script converts the dumped cache files into a cache header file, from which the tests read their data. | |||||
| # If you add tests related to the global graph optimization in src/gopt/test/layout_transform_pass.cpp, | |||||
| # consider using this script to regenerate the profiling data: | |||||
| # 1. First, change `#define MGB_WITH_CACHED_TEST 1` in src/gopt/test/layout_transform_pass.cpp to | |||||
| # `#define MGB_WITH_CACHED_TEST 0` | |||||
| # 2. Build megbrain_test and run all tests related to the global graph optimization: | |||||
| # ./megbrain_test --gtest_filter="*LayoutTransform*" | |||||
| # 3. Pack all the cache files together with this script: | |||||
| # python3 embed_cache.py -o cache_data.h $(ls /path/to/cache/*.cache) | |||||
| # 4. Revert the define from step 1 so that the profiling pass uses the cached data, then rebuild | |||||
| # megbrain_test and verify that the tests pass. | |||||
| import os.path | |||||
| import logging | |||||
| import hashlib | |||||
| import argparse | |||||
| import struct | |||||
| import itertools | |||||
| import sys | |||||
| import subprocess | |||||
| logger = logging.getLogger(__name__) | |||||
| logging.basicConfig(level=logging.WARNING, format='%(asctime)-15s %(message)s') | |||||
| CHAR_MAP = {i: r'{}'.format(i) for i in range(256)} | |||||
| def _u32(data): | |||||
| return struct.unpack('<I', data)[0] | |||||
| class CacheDataGenerator: | |||||
| _cache_files = None | |||||
| def __init__(self, cache_files): | |||||
| self._cache_files = cache_files | |||||
| def _get_hash(self): | |||||
| return _u32(self._hash.digest()[:4]) | |||||
| def gen_cache_data(self, fpath): | |||||
| fname = os.path.basename(fpath) | |||||
| with open(fpath, 'rb') as fcache: | |||||
| cache_data = fcache.read() | |||||
| cache_data = struct.unpack( | |||||
| "<{}B".format(len(cache_data)), cache_data) | |||||
| ret = list(map(CHAR_MAP.__getitem__, cache_data)) | |||||
| for i in range(50, len(ret), 50): | |||||
| ret[i] = '\n' + ret[i] | |||||
| return ','.join(ret) | |||||
| def gen_cache_data_header(self, fout, src_map): | |||||
| fout.write('// generated by embed_cache.py\n') | |||||
| fout.write('#include <vector>\n') | |||||
| fout.write('#include <stdint.h>\n') | |||||
| for k, v in sorted(src_map.items()): | |||||
| fout.write(""" | |||||
| static const std::vector<uint8_t> {} = {{ | |||||
| """.format(k.replace('.', '_'))) | |||||
| fout.write('{}'.format(v)) | |||||
| fout.write('};\n') | |||||
| def invoke(self, output): | |||||
| logger.info('generate cache_data.h ...') | |||||
| fname2cache_data = {} | |||||
| for fname in self._cache_files: | |||||
| base, ext = os.path.splitext(os.path.basename(fname)) | |||||
| assert ext == ".cache", "ext: {}, fname {}".format(ext, fname) | |||||
| assert base not in fname2cache_data, "duplicated kernel: " + base | |||||
| fname2cache_data[base] = self.gen_cache_data(fname) | |||||
| with open(output, 'w') as fout: | |||||
| self.gen_cache_data_header(fout, fname2cache_data) | |||||
| logger.info('done') | |||||
| if __name__ == '__main__': | |||||
| parser = argparse.ArgumentParser( | |||||
| description='embed cache into cache header file', | |||||
| formatter_class=argparse.ArgumentDefaultsHelpFormatter) | |||||
| parser.add_argument('-o', '--output', help='output source file', | |||||
| required=True) | |||||
| parser.add_argument('cache', help='cache files to be embedded', nargs='+') | |||||
| args = parser.parse_args() | |||||
| cache_generator = CacheDataGenerator(args.cache) | |||||
| cache_generator.invoke(args.output) | |||||
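For orientation, the header emitted by this script defines one byte array per input .cache file, named after the file with dots replaced by underscores; this is how the tests below can reference identifiers such as TestLayoutTransform_Resnet18_QS8. A schematic of the generated cache_data.h (byte values elided):

    // generated by embed_cache.py
    #include <vector>
    #include <stdint.h>

    static const std::vector<uint8_t> TestLayoutTransform_Resnet18_QS8 = {
        /* raw InFilePersistentCache image: decimal bytes, newline every 50 */
    };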
| @@ -23,6 +23,12 @@ | |||||
| #include "megbrain/plugin/profiler.h" | #include "megbrain/plugin/profiler.h" | ||||
| #include "megbrain/serialization/serializer.h" | #include "megbrain/serialization/serializer.h" | ||||
| #define MGB_WITH_CACHED_TEST 1 | |||||
| #if MGB_WITH_CACHED_TEST | |||||
| #include "./cache_data.h" | |||||
| #endif | |||||
| using namespace mgb; | using namespace mgb; | ||||
| using namespace gopt; | using namespace gopt; | ||||
| using namespace serialization; | using namespace serialization; | ||||
| @@ -53,6 +59,78 @@ size_t find_opr_num(SymbolVar endpoint) { | |||||
| cg::DepOprIter{cb}.add(endpoint.node()->owner_opr()); | cg::DepOprIter{cb}.add(endpoint.node()->owner_opr()); | ||||
| return opr_num; | return opr_num; | ||||
| } | } | ||||
| using OprFormat = Problem::OprFormat; | |||||
| OprFormat tensor_formats_to_opr_format(TensorFormats tensor_format) { | |||||
| switch (tensor_format) { | |||||
| case TensorFormats::NCHW: | |||||
| return OprFormat::NCHW; | |||||
| case TensorFormats::NCHWc4: | |||||
| return OprFormat::NCHW4; | |||||
| case TensorFormats::NCHWc8: | |||||
| return OprFormat::NCHW8; | |||||
| case TensorFormats::NCHWc32: | |||||
| return OprFormat::NCHW32; | |||||
| case TensorFormats::NCHWc64: | |||||
| return OprFormat::NCHW64; | |||||
| case TensorFormats::NHWC: | |||||
| return OprFormat::NHWC; | |||||
| case TensorFormats::CHWNc4: | |||||
| return OprFormat::CHWN4; | |||||
| default: | |||||
| mgb_throw(MegBrainError, "tensor format(%u) is not supported", | |||||
| static_cast<uint32_t>(tensor_format)); | |||||
| } | |||||
| } | |||||
| class ProfilerMock : public ProfilerImpl { | |||||
| public: | |||||
| ProfilerMock(const uint8_t* bin, size_t size) { | |||||
| mgb_assert(bin != nullptr); | |||||
| ProfilerCache::inst().set_impl( | |||||
| std::make_unique<InFilePersistentCache>(bin, size)); | |||||
| } | |||||
| ~ProfilerMock() { | |||||
| // reset in memory cache | |||||
| ProfilerCache::inst().set_impl( | |||||
| std::make_unique<InMemoryPersistentCache>()); | |||||
| } | |||||
| private: | |||||
| float profile_operator(const OperatorNodeBase* opr, | |||||
| TensorFormats base_format, | |||||
| TensorFormats tensor_format, | |||||
| ReformatAttribute extra_attribute = | |||||
| ReformatAttribute::DEFAULT) const override { | |||||
| ProfilerCache::Key key{opr, tensor_formats_to_opr_format(tensor_format), | |||||
| extra_attribute}; | |||||
| auto ret = ProfilerCache::inst().get(key); | |||||
| if (ret.valid()) | |||||
| return ret.val(); | |||||
| mgb_assert(false); | |||||
| } | |||||
| float profile_operator(const OperatorNodeBase* opr, | |||||
| const OprTensorFormatsConfiguration& base_config, | |||||
| const OprTensorFormatsConfiguration& config, | |||||
| ReformatAttribute extra_attribute = | |||||
| ReformatAttribute::DEFAULT) const override { | |||||
| ProfilerCache::Key key{opr, config.opr_format, extra_attribute}; | |||||
| std::string tmp; | |||||
| tmp.reserve(key.blob().size); | |||||
| auto ret = ProfilerCache::inst().get(key); | |||||
| if (ret.valid()) | |||||
| return ret.val(); | |||||
| mgb_assert(false); | |||||
| } | |||||
| float profile_var_node(const VarNode* var, TensorFormats base_format, | |||||
| const ReformatKey& key) const override { | |||||
| ProfilerCache::Key pf_key{var, key}; | |||||
| auto ret = ProfilerCache::inst().get(pf_key); | |||||
| if (ret.valid()) | |||||
| return ret.val(); | |||||
| mgb_assert(false); | |||||
| } | |||||
| }; | |||||
| } // namespace | } // namespace | ||||
| #if MGB_CUDA | #if MGB_CUDA | ||||
| @@ -96,15 +174,23 @@ TEST(TestLayoutTransform, Resnet18_QS8) { | |||||
| OprFormat::NCHW, TensorFormats::NCHW, Target::UNSPEC, | OprFormat::NCHW, TensorFormats::NCHW, Target::UNSPEC, | ||||
| ReformatAttribute::AUTO_PADDING_NHWC}; | ReformatAttribute::AUTO_PADDING_NHWC}; | ||||
| auto ctx = std::make_unique<LayoutTransformContext>( | auto ctx = std::make_unique<LayoutTransformContext>( | ||||
| std::move(opr_list), std::move(available_tensor_formats), attribute); | |||||
| ctx->add_opr_config( | |||||
| opr::ConvBiasForward::typeinfo(), | |||||
| {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4, OprFormat::NHWC}) | |||||
| .add_opr_config( | |||||
| opr::PoolingForward::typeinfo(), | |||||
| {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NHWC, | |||||
| OprFormat::CHWN4}); | |||||
| auto profiler = ProfilerBase::make_profiler(); | |||||
| std::move(opr_list), std::move(available_tensor_formats), | |||||
| attribute); | |||||
| ctx->add_opr_config(opr::ConvBiasForward::typeinfo(), | |||||
| {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4, | |||||
| OprFormat::NHWC}) | |||||
| .add_opr_config(opr::PoolingForward::typeinfo(), | |||||
| {OprFormat::NCHW4, OprFormat::NCHW32, | |||||
| OprFormat::NHWC, OprFormat::CHWN4}); | |||||
| #if MGB_WITH_CACHED_TEST | |||||
| auto profiler = std::make_unique<ProfilerMock>( | |||||
| static_cast<const uint8_t*>( | |||||
| TestLayoutTransform_Resnet18_QS8.data()), | |||||
| TestLayoutTransform_Resnet18_QS8.size()); | |||||
| #else | |||||
| auto profiler = ProfilerBase::make_cached_profiler( | |||||
| "TestLayoutTransform.Resnet18_QS8.cache"); | |||||
| #endif | |||||
| std::unique_ptr<SolverBase> solver{ | std::unique_ptr<SolverBase> solver{ | ||||
| new DynamicProgrammingSolver(std::move(profiler))}; | new DynamicProgrammingSolver(std::move(profiler))}; | ||||
| auto new_output = | auto new_output = | ||||
| @@ -190,7 +276,15 @@ TEST(TestLayoutTransform, Resnet18_QS4) { | |||||
| opr::PoolingForward::typeinfo(), | opr::PoolingForward::typeinfo(), | ||||
| {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NCHW64, | {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NCHW64, | ||||
| OprFormat::NHWC, OprFormat::CHWN4}); | OprFormat::NHWC, OprFormat::CHWN4}); | ||||
| auto profiler = ProfilerBase::make_profiler(); | |||||
| #if MGB_WITH_CACHED_TEST | |||||
| auto profiler = std::make_unique<ProfilerMock>( | |||||
| static_cast<const uint8_t*>( | |||||
| TestLayoutTransform_Resnet18_QS4.data()), | |||||
| TestLayoutTransform_Resnet18_QS4.size()); | |||||
| #else | |||||
| auto profiler = ProfilerBase::make_cached_profiler( | |||||
| "TestLayoutTransform.Resnet18_QS4.cache"); | |||||
| #endif | |||||
| std::unique_ptr<SolverBase> solver{ | std::unique_ptr<SolverBase> solver{ | ||||
| new DynamicProgrammingSolver(std::move(profiler))}; | new DynamicProgrammingSolver(std::move(profiler))}; | ||||
| auto new_output = | auto new_output = | ||||
| @@ -305,7 +399,15 @@ TEST(TestLayoutTransform, Detection_QS8) { | |||||
| opr::PoolingForward::typeinfo(), | opr::PoolingForward::typeinfo(), | ||||
| {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NCHW64, | {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NCHW64, | ||||
| OprFormat::NHWC, OprFormat::CHWN4}); | OprFormat::NHWC, OprFormat::CHWN4}); | ||||
| auto profiler = ProfilerBase::make_profiler(); | |||||
| #if MGB_WITH_CACHED_TEST | |||||
| auto profiler = std::make_unique<ProfilerMock>( | |||||
| static_cast<const uint8_t*>( | |||||
| TestLayoutTransform_Detection_QS8.data()), | |||||
| TestLayoutTransform_Detection_QS8.size()); | |||||
| #else | |||||
| auto profiler = ProfilerBase::make_cached_profiler( | |||||
| "TestLayoutTransform.Detection_QS8.cache"); | |||||
| #endif | |||||
| std::unique_ptr<SolverBase> solver{ | std::unique_ptr<SolverBase> solver{ | ||||
| new DynamicProgrammingSolver(std::move(profiler))}; | new DynamicProgrammingSolver(std::move(profiler))}; | ||||
| auto new_outputs = | auto new_outputs = | ||||
| @@ -375,7 +477,15 @@ TEST(TestLayoutTransform, Detection_QS4) { | |||||
| opr::PoolingForward::typeinfo(), | opr::PoolingForward::typeinfo(), | ||||
| {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NCHW64, | {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NCHW64, | ||||
| OprFormat::NHWC, OprFormat::CHWN4}); | OprFormat::NHWC, OprFormat::CHWN4}); | ||||
| auto profiler = ProfilerBase::make_profiler(); | |||||
| #if MGB_WITH_CACHED_TEST | |||||
| auto profiler = std::make_unique<ProfilerMock>( | |||||
| static_cast<const uint8_t*>( | |||||
| TestLayoutTransform_Detection_QS4.data()), | |||||
| TestLayoutTransform_Detection_QS4.size()); | |||||
| #else | |||||
| auto profiler = ProfilerBase::make_cached_profiler( | |||||
| "TestLayoutTransform.Detection_QS4.cache"); | |||||
| #endif | |||||
| std::unique_ptr<SolverBase> solver{ | std::unique_ptr<SolverBase> solver{ | ||||
| new DynamicProgrammingSolver(std::move(profiler))}; | new DynamicProgrammingSolver(std::move(profiler))}; | ||||
| auto new_outputs = | auto new_outputs = | ||||
| @@ -443,10 +553,18 @@ TEST(TestLayoutTransform, Wide) { | |||||
| OprFormat::NCHW, TensorFormats::NCHW, Target::UNSPEC, | OprFormat::NCHW, TensorFormats::NCHW, Target::UNSPEC, | ||||
| ReformatAttribute::DEFAULT}; | ReformatAttribute::DEFAULT}; | ||||
| auto ctx = std::make_unique<LayoutTransformContext>( | auto ctx = std::make_unique<LayoutTransformContext>( | ||||
| std::move(opr_list), std::move(available_tensor_formats), attribute); | |||||
| ctx->add_opr_config( | |||||
| opr::ConvBiasForward::typeinfo(), {OprFormat::NCHW, OprFormat::NHWC}); | |||||
| auto profiler = ProfilerBase::make_profiler(); | |||||
| std::move(opr_list), std::move(available_tensor_formats), | |||||
| attribute); | |||||
| ctx->add_opr_config(opr::ConvBiasForward::typeinfo(), | |||||
| {OprFormat::NCHW, OprFormat::NHWC}); | |||||
| #if MGB_WITH_CACHED_TEST | |||||
| auto profiler = std::make_unique<ProfilerMock>( | |||||
| static_cast<const uint8_t*>(TestLayoutTransform_Wide.data()), | |||||
| TestLayoutTransform_Wide.size()); | |||||
| #else | |||||
| auto profiler = ProfilerBase::make_cached_profiler( | |||||
| "TestLayoutTransform.Wide.cache"); | |||||
| #endif | |||||
| std::unique_ptr<SolverBase> solver{ | std::unique_ptr<SolverBase> solver{ | ||||
| new DynamicProgrammingSolver(std::move(profiler))}; | new DynamicProgrammingSolver(std::move(profiler))}; | ||||
| auto v = gopt::GraphOptimizer{} | auto v = gopt::GraphOptimizer{} | ||||
| @@ -463,12 +581,8 @@ TEST(TestLayoutTransform, Wide) { | |||||
| auto func = network.graph->compile({{sym_o, {}}}); | auto func = network.graph->compile({{sym_o, {}}}); | ||||
| func->execute(); | func->execute(); | ||||
| gprof.to_json_full(func.get())->writeto_fpath(output_file("wide.json")); | gprof.to_json_full(func.get())->writeto_fpath(output_file("wide.json")); | ||||
| /// check global layout transform pass, no dimshuffle | |||||
| /// disable the following check, to make ci stable. | |||||
| #if 0 | |||||
| auto nr_dimshuffle = find_opr_num<opr::Dimshuffle>(sym_o); | auto nr_dimshuffle = find_opr_num<opr::Dimshuffle>(sym_o); | ||||
| ASSERT_EQ(nr_dimshuffle, 0u); | ASSERT_EQ(nr_dimshuffle, 0u); | ||||
| #endif | |||||
| auto nr_param_merge = find_opr_num<opr::MultipleDeviceTensorHolder>(sym_o); | auto nr_param_merge = find_opr_num<opr::MultipleDeviceTensorHolder>(sym_o); | ||||
| ASSERT_EQ(nr_param_merge, 1u); | ASSERT_EQ(nr_param_merge, 1u); | ||||
| /// check first conv format | /// check first conv format | ||||
| @@ -477,48 +591,6 @@ TEST(TestLayoutTransform, Wide) { | |||||
| ASSERT_EQ(cast.param().format, opr::ConvBias::Param::Format::NCHW); | ASSERT_EQ(cast.param().format, opr::ConvBias::Param::Format::NCHW); | ||||
| } | } | ||||
| TEST(TestLayoutTransform, ElemwiseMultiType) { | |||||
| REQUIRE_GPU(1); | |||||
| auto cn = CompNode::load("gpu0"); | |||||
| Network network(cn); | |||||
| auto x = network.add_var("x", {64, 64, 1, 2}); | |||||
| auto y = network.add_var("y", {64, 64, 1, 2}); | |||||
| x = network.add_type_cvt(x, dtype::QuantizedS4{1.f}); | |||||
| y = network.add_type_cvt(y, dtype::QuantizedS4{1.f}); | |||||
| auto x_ = network.add_type_cvt(x, dtype::Float32()); | |||||
| auto y_ = network.add_type_cvt(y, dtype::Float32()); | |||||
| auto z = network.add_elemwise( | |||||
| {x_, y_}, dtype::Float32(), opr::Elemwise::Mode::FUSE_ADD_RELU); | |||||
| z = network.add_type_cvt(z, dtype::QuantizedS4{1.f}); | |||||
| z = network.add_type_cvt(z, dtype::Float32()); | |||||
| auto z2 = network.add_elemwise( | |||||
| {x, y}, dtype::QuantizedS4{1.f}, opr::Elemwise::Mode::FUSE_ADD_RELU); | |||||
| z2 = network.add_type_cvt(z2, dtype::Float32()); | |||||
| HostTensorND t1; | |||||
| auto func1 = network.graph->compile({make_callback_copy(z, t1)}); | |||||
| func1->execute(); | |||||
| HostTensorND t3; | |||||
| auto func3 = network.graph->compile({make_callback_copy(z2, t3)}); | |||||
| func3->execute(); | |||||
| auto alter_x = opr::RelayoutFormat::make( | |||||
| x, megdnn::param::RelayoutFormat::Mode::NCHW_NCHW64); | |||||
| auto alter_y = opr::RelayoutFormat::make( | |||||
| y, megdnn::param::RelayoutFormat::Mode::NCHW_NCHW64); | |||||
| auto alter_z = network.add_elemwise( | |||||
| {alter_x, alter_y}, dtype::QuantizedS4{1.f}, | |||||
| opr::Elemwise::Mode::FUSE_ADD_RELU); | |||||
| alter_z = opr::RelayoutFormat::make( | |||||
| alter_z, megdnn::param::RelayoutFormat::Mode::NCHW64_NCHW); | |||||
| alter_z = network.add_type_cvt(alter_z, dtype::Float32()); | |||||
| HostTensorND t2; | |||||
| auto func2 = network.graph->compile({make_callback_copy(alter_z, t2)}); | |||||
| func2->execute(); | |||||
| // MGB_ASSERT_TENSOR_EQ(t1, t3); | |||||
| MGB_ASSERT_TENSOR_EQ(t2, t3); | |||||
| } | |||||
| #if CUDA_VERSION >= 10020 | #if CUDA_VERSION >= 10020 | ||||
| TEST(TestLayoutTransform, DetectionHead) { | TEST(TestLayoutTransform, DetectionHead) { | ||||
| REQUIRE_GPU(1); | REQUIRE_GPU(1); | ||||
| @@ -600,8 +672,15 @@ TEST(TestLayoutTransform, DetectionHead) { | |||||
| .add_opr_config( | .add_opr_config( | ||||
| opr::WarpPerspectiveForward::typeinfo(), | opr::WarpPerspectiveForward::typeinfo(), | ||||
| {OprFormat::NHWC, OprFormat::NCHW4, OprFormat::NCHW64}); | {OprFormat::NHWC, OprFormat::NCHW4, OprFormat::NCHW64}); | ||||
| auto profiler = ProfilerBase::make_profiler(); | |||||
| #if MGB_WITH_CACHED_TEST | |||||
| auto profiler = std::make_unique<ProfilerMock>( | |||||
| static_cast<const uint8_t*>( | |||||
| TestLayoutTransform_DetectionHead.data()), | |||||
| TestLayoutTransform_DetectionHead.size()); | |||||
| #else | |||||
| auto profiler = ProfilerBase::make_cached_profiler( | |||||
| "TestLayoutTransform.DetectionHead.cache"); | |||||
| #endif | |||||
| std::unique_ptr<SolverBase> solver{ | std::unique_ptr<SolverBase> solver{ | ||||
| new DynamicProgrammingSolver(std::move(profiler))}; | new DynamicProgrammingSolver(std::move(profiler))}; | ||||
| auto new_out_vars = | auto new_out_vars = | ||||