- #include "layout_trans_options.h"
- #include <gflags/gflags.h>
- #include "megbrain/serialization/serializer.h"
- #include "misc.h"
- #include "models/model_lite.h"
- #include "models/model_mdl.h"
- namespace lar {
-
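- //! ModelLite: when the network has just been created, pick the device type
- //! that matches the requested target and enable lite's global layout
- //! transform; during the global optimization stage, dump the transformed
- //! model if a dump file was requested.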
- template <>
- void GoptLayoutOption::config_model_internel<ModelLite>(
- RuntimeParam& runtime_param, std::shared_ptr<ModelLite> model) {
- if (runtime_param.stage == RunStage::AFTER_NETWORK_CREATED) {
- if (m_layout_transform) {
- LITE_LOG("using global layout transform optimization\n");
- if (m_layout_transform_target ==
- mgb::gopt::GraphTuningOptions::Target::CPU) {
- model->get_config().device_type = LiteDeviceType::LITE_CPU;
- }
- #if LITE_WITH_CUDA
- else if (
- m_layout_transform_target ==
- mgb::gopt::GraphTuningOptions::Target::CUDA) {
- model->get_config().device_type = LiteDeviceType::LITE_CUDA;
- }
- #endif
- LITE_LOG("enable layout transform while load model for lite");
- auto&& lite_network = model->get_lite_network();
- lite::Runtime::enable_global_layout_transform(lite_network);
- }
- } else if (runtime_param.stage == RunStage::GLOBAL_OPTIMIZATION) {
- if (m_layout_transform) {
- auto&& network = model->get_lite_network();
- if (!m_layout_transform_dump_file.empty()) {
- lite::Runtime::dump_layout_transform_model(
- network, m_layout_transform_dump_file);
- }
- }
- }
- }
-
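- //! ModelMdl: after the model is loaded, optionally force the input batch
- //! size and rebuild the Host2DeviceCopy nodes so the graph sees the updated
- //! shapes; during the global optimization stage, run the layout transform
- //! passes and optionally dump the optimized graph together with its test cases.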
- template <>
- void GoptLayoutOption::config_model_internel<ModelMdl>(
- RuntimeParam& runtime_param, std::shared_ptr<ModelMdl> model) {
- if (runtime_param.stage == RunStage::AFTER_MODEL_LOAD) {
- if (m_layout_transform) {
- mgb_log_debug("update input shape for global layout transform\n");
- auto&& load_result = model->get_mdl_load_result();
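- //! if --layout-transform-batch-size is given, resize every input tensor to
- //! that batch size and fill it with dummy data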
- if (m_force_batch_size > 0) {
- for (auto&& i : load_result.tensor_map) {
- auto& in = i.second;
- mgb::TensorShape new_shape = in->shape();
- new_shape[0] = m_force_batch_size;
- mgb::HostTensorND new_tensor;
- new_tensor.comp_node(mgb::CompNode::default_cpu(), true)
- .dtype(in->dtype())
- .resize(new_shape);
- mgb::dt_byte* raw_ptr = new_tensor.raw_ptr();
- //! fill every byte of the resized tensor with dummy data
- memset(raw_ptr, 1, new_tensor.layout().span().dist_byte());
- in->copy_from(new_tensor);
- }
- }
- for (auto&& item : load_result.output_var_list) {
- if (item.shape()[0] > 1) {
- mgb_log_warn(
- "the model may be dumped with multiple batches, which makes "
- "profiling during global layout transform very slow\n");
- }
- }
- //! rebuild the output var list since the input shapes may have changed (the
- //! execution time of some passes depends on the shape of the initial input)
- mgb::thin_hash_table::ThinHashMap<mgb::cg::SymbolVar, mgb::cg::SymbolVar>
- varmap;
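- //! replace every Host2DeviceCopy with a copy whose host tensor shares the
- //! original storage but carries the (possibly updated) shape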
- mgb::cg::DepOprIter dep([&](mgb::cg::OperatorNodeBase* opr) {
- if (auto h2d = opr->try_cast_final<mgb::opr::Host2DeviceCopy>()) {
- auto param = h2d->param();
- mgb::TensorShape new_shape = h2d->host_data()->shape();
- std::shared_ptr<mgb::HostTensorND> new_tensor =
- std::make_shared<mgb::HostTensorND>(
- h2d->host_data()->comp_node(), new_shape,
- h2d->host_data()->dtype());
- new_tensor->only_reset_raw_storage(h2d->host_data()->storage());
- auto h2d_opr = mgb::opr::Host2DeviceCopy::make(
- *h2d->owner_graph(), new_tensor, param, h2d->config());
- varmap[h2d->output(0)] = h2d_opr;
- }
- });
-
- for (auto&& i : load_result.output_var_list)
- dep.add(i);
-
- if (!varmap.empty()) {
- auto output_vars =
- mgb::cg::replace_vars(load_result.output_var_list, varmap);
- for (size_t i = 0; i < load_result.output_var_list.size(); ++i) {
- output_vars[i].rename(
- load_result.output_var_list[i].node()->name());
- }
- load_result.output_var_list = output_vars;
- }
- }
- } else if (runtime_param.stage == RunStage::GLOBAL_OPTIMIZATION) {
- if (m_layout_transform) {
- mgb_log("using global layout transform optimization\n");
- auto&& load_result = model->get_mdl_load_result();
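- //! run the global layout transform passes on the graph outputs for the
- //! requested target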
- load_result.output_var_list = mgb::gopt::layout_transform(
- load_result.output_var_list, m_layout_transform_target);
-
- if (!m_layout_transform_dump_file.empty()) {
- auto out_file = mgb::serialization::OutputFile::make_fs(
- m_layout_transform_dump_file.c_str(), 'w');
- auto testcase_num = model->get_testcase_num();
-
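- //! models dumped together with test cases start with a magic string and the
- //! number of cases; write that header first so the dumped file keeps the
- //! same format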
- if (testcase_num) {
- constexpr char magic[] = "mgbtest0";
- constexpr size_t len = sizeof(magic) - 1;  // 8 bytes, without the trailing '\0'
- out_file->write(magic, len);
- out_file->write(&testcase_num, sizeof(testcase_num));
- }
-
- using DumpConfig = mgb::serialization::GraphDumper::DumpConfig;
- DumpConfig config{1, false, false};
- auto dumper = model->get_dumper(std::move(out_file));
- dumper->dump(load_result.output_var_list, config);
-
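- //! append every input test case after the optimized model: reload each case
- //! from the original file and dump it into the same output file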
- if (testcase_num) {
- auto input_file = model->get_loader()->reset_file();
- auto current_offset = input_file->tell();
- auto loader = model->reset_loader(std::move(input_file));
- auto testcase = loader->load(model->get_mdl_config(), false);
- for (size_t i = 0; i < testcase_num; ++i) {
- auto casefile = mgb::serialization::OutputFile::make_fs(
- m_layout_transform_dump_file.c_str(), 'a');
- auto casedumper = model->get_dumper(std::move(casefile));
- casedumper->dump(testcase.output_var_list, config);
- if (i != testcase_num - 1) {
- loader = model->reset_loader();
- testcase = loader->load(model->get_mdl_config(), false);
- }
- }
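- //! restore the loader to the file offset recorded before the test cases
- //! were consumed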
- input_file = model->get_loader()->reset_file();
- input_file->rewind();
- input_file->skip(current_offset);
- model->reset_loader(std::move(input_file));
- }
- }
- }
- }
- }
-
- } // namespace lar
-
- using namespace lar;
- bool GoptLayoutOption::m_valid;
- void GoptLayoutOption::update() {
- m_option_name = "gopt_layout";
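- //! enable the transform only when --layout-transform names a supported
- //! target, and remember that target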
- if (FLAGS_layout_transform != "cpu"
- #if LITE_WITH_CUDA
- && FLAGS_layout_transform != "cuda"
- #endif
- ) {
- m_layout_transform = false;
- m_layout_transform_target = mgb::gopt::GraphTuningOptions::Target::UNSPEC;
-
- } else {
- m_layout_transform = true;
-
- if (FLAGS_layout_transform == "cpu") {
- m_layout_transform_target = mgb::gopt::GraphTuningOptions::Target::CPU;
- }
- #if LITE_WITH_CUDA
- else if (FLAGS_layout_transform == "cuda") {
- m_layout_transform_target = mgb::gopt::GraphTuningOptions::Target::CUDA;
- }
- #endif
- }
- m_layout_transform_dump_file = FLAGS_layout_transform_dump;
-
- m_force_batch_size = FLAGS_layout_transform_batch_size;
-
- m_option = {
- {"layout_transform", lar::String::make("")},
- };
- std::static_pointer_cast<lar::String>(m_option["layout_transform"])
- ->set_value(FLAGS_layout_transform);
- }
-
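- //! the option takes effect when --layout-transform names a supported target,
- //! when --layout-transform-dump is set, or when it was enabled through the
- //! option map (m_valid)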
- bool GoptLayoutOption::is_valid() {
- bool ret = false;
- if (!FLAGS_layout_transform.empty()) {
- if (FLAGS_layout_transform != "cpu"
- #if LITE_WITH_CUDA
- && FLAGS_layout_transform != "cuda"
- #endif
- ) {
- mgb_assert(
- false,
- "unsupported target(got:%s) for global layout "
- "transform",
- FLAGS_layout_transform.c_str());
- ret = false;
- } else {
- ret = true;
- }
- }
- ret = ret || !FLAGS_layout_transform_dump.empty();
- if (FLAGS_layout_transform_batch_size > 0) {
- mgb_assert(
- !FLAGS_layout_transform.empty(),
- "\"layout-transform-batch-size\" should be used together with "
- "\"layout-transform\"");
- ret = true;
- }
- return ret || m_valid;
- }
-
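- //! the option object is a singleton; it is refreshed from the command line
- //! flags and handed out only when is_valid() reports the related flags are set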
- std::shared_ptr<OptionBase> GoptLayoutOption::create_option() {
- static std::shared_ptr<GoptLayoutOption> option(new GoptLayoutOption);
- if (GoptLayoutOption::is_valid()) {
- option->update();
- return std::static_pointer_cast<OptionBase>(option);
- } else {
- return nullptr;
- }
- }
-
- void GoptLayoutOption::config_model(
- RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) {
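- //! re-read the target from the runtime option map (its value may differ from
- //! the parsed flag), then dispatch to the per-model specializations above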
- auto value = std::static_pointer_cast<lar::String>(m_option["layout_transform"])
- ->get_value();
- if (value.empty()) {
- return;
- }
- if (value == "cpu") {
- m_layout_transform = true;
- m_layout_transform_target = mgb::gopt::GraphTuningOptions::Target::CPU;
- }
- #if LITE_WITH_CUDA
- else if (value == "cuda") {
- m_layout_transform = true;
- m_layout_transform_target = mgb::gopt::GraphTuningOptions::Target::CUDA;
- }
- #endif
- else {
- mgb_throw(
- mgb::AssertionError, "invalid options of global layout transform %s",
- value.c_str());
- }
- CONFIG_MODEL_FUN;
- }
-
- DEFINE_string(
- layout_transform, "",
- "Enable global layout transform optimization for the computing graph. The "
- "user should specify the device target for the optimization, and a series "
- "of passes will be applied to the computing graph. The passes benchmark "
- "the elapsed time of operators on different tensor layouts and select the "
- "fastest implementation for each operator, so the optimization itself takes "
- "some time. The default target is unspec, in which case all available "
- "layouts for the operators are profiled and the optimization takes longer.");
- DEFINE_string(
- layout_transform_dump, "",
- "The computing graph after global layout transform will be dumped to the given "
- "file path.");
-
- DEFINE_int32(
- layout_transform_batch_size, -1,
- "The input batch size that the global layout transform optimization works "
- "on.");
- REGIST_OPTION_CREATOR(gopt_layout, lar::GoptLayoutOption::create_option);
- REGIST_OPTION_VALIDATER(gopt_layout, lar::GoptLayoutOption::set_valid);