
utility.cpp 16 kB

/**
 * \file imperative/src/impl/ops/utility.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "megbrain/imperative/ops/autogen.h"
#include "megbrain/imperative/ops/utility.h"
#include "megbrain/imperative/ops/opr_attr.h"
#include "megbrain/imperative/graph_cache.h"
#include "megbrain/imperative/subgraph_detail.h"
#include "megbrain/imperative/opr_utility.h"
#include "megbrain/opr/utility.h"
#include "megbrain/opr/tensor_gen.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/opr/io.h"
#include "../op_trait.h"

namespace mgb::imperative {

MGB_DYN_TYPE_OBJ_FINAL_IMPL(GenericPyOp);

OP_TRAIT_REG(GenericPyOp, GenericPyOp).fallback();
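
// FastpathCopy is a no-op at graph level: applying it on var nodes simply
// forwards the inputs, so the copy is elided when tracing into a graph.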
namespace { namespace fastpathcopy {

auto apply_on_var_node(
        const OpDef& def,
        const VarNodeArray& inputs) {
    return inputs;
}

OP_TRAIT_REG(FastpathCopy, FastpathCopy)
    .apply_on_var_node(apply_on_var_node)
    .fallback();

}} // fastpathcopy
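
// ShapeInfer wraps an inner OpDef whose runtime inputs are shape tensors
// (Int32 vectors). It reconstructs logical tensor descriptors from those
// shapes, runs the inner op's shape inference, and returns the inferred
// output shapes as Int32 tensors again.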
namespace { namespace shape_infer {

auto apply_on_physical_tensor(
        const OpDef& def,
        const SmallVector<TensorPtr>& inputs) {
    auto& op = def.cast_final_safe<ShapeInfer>();
    size_t nr_inputs = inputs.size();
    mgb_assert(nr_inputs > 0, "no inputs for ShapeInfer");
    SmallVector<LogicalTensorDesc> input_descs;
    for (size_t i = 0; i < nr_inputs; ++i) {
        auto input = inputs[i]->get_value();
        TensorLayout layout;
        layout.ndim = input.shape(0);
        for (size_t i = 0; i < layout.ndim; ++i) {
            layout[i] = input.ptr<int32_t>()[i];
        }
        layout.dtype = op.dtypes[i];
        layout.init_contiguous_stride();
        input_descs.push_back({layout, op.devices[i]});
    }
    auto [output_descs, valid] =
            OpDef::infer_output_attrs_fallible(*op.op, input_descs);
    mgb_assert(valid, "shape inference incomplete");
    SmallVector<TensorPtr> outputs;
    for (auto&& output_desc : output_descs) {
        HostTensorND shape_tensor{
                output_desc.comp_node, {output_desc.layout.ndim}, dtype::Int32()};
        for (size_t i = 0; i < output_desc.layout.ndim; ++i) {
            shape_tensor.ptr<int32_t>()[i] = output_desc.layout[i];
        }
        auto output = Tensor::make(shape_tensor);
        outputs.push_back(output);
    }
    return outputs;
}

auto apply_on_var_node(
        const OpDef& def,
        const VarNodeArray& inputs) {
    auto& op = def.cast_final_safe<ShapeInfer>();
    size_t nr_inputs = inputs.size();
    VarNodeArray input_values, outputs;
    mgb_assert(nr_inputs > 0, "no inputs for ShapeInfer");
    for (size_t i = 0; i < nr_inputs; ++i) {
        auto input_value = opr::Alloc::make(
                SymbolVar(inputs[i]), op.dtypes[i], {op.devices[i]});
        input_values.push_back(input_value.node());
    }
    auto output_values = OpDef::apply_on_var_node(*op.op, input_values);
    for (auto&& output_value : output_values) {
        outputs.push_back(opr::GetVarShape::make(output_value).node());
    }
    return outputs;
}

auto infer_output_attrs_fallible(
        const OpDef& def,
        const SmallVector<LogicalTensorDesc>& input_descs) {
    auto& op = def.cast_final_safe<ShapeInfer>();
    SmallVector<LogicalTensorDesc> input_shape_descs;
    size_t nr_inputs = op.devices.size();
    mgb_assert(op.dtypes.size() == nr_inputs,
               "number of input devices and dtypes mismatch");
    for (size_t i = 0; i < nr_inputs; ++i) {
        LogicalTensorDesc input_shape_desc;
        input_shape_desc.comp_node = op.devices[i];
        input_shape_desc.layout.ndim = 0;
        input_shape_desc.layout.dtype = op.dtypes[i];
        input_shape_descs.push_back(input_shape_desc);
    }
    auto [output_shape_descs, _] =
            OpDef::infer_output_attrs_fallible(*op.op, input_shape_descs);
    SmallVector<LogicalTensorDesc> output_descs;
    for (auto&& output_shape_desc : output_shape_descs) {
        LogicalTensorDesc output_desc;
        output_desc.comp_node = output_shape_desc.comp_node;
        output_desc.layout.ndim = 1;
        output_desc.layout.dtype = dtype::Int32();
        output_descs.push_back(output_desc);
    }
    return std::make_tuple(output_descs, false);
}

auto props(const OpDef& def) {
    auto& op = def.cast_final_safe<ShapeInfer>();
    return OpDef::props(*op.op);
}

auto make_name(const OpDef& def) {
    auto& op = def.cast_final_safe<ShapeInfer>();
    MGB_MARK_USED_VAR(op);
    return ssprintf("ShapeInfer[%s]", op.op->make_name().c_str());
}

auto hash(const OpDef& def) {
    auto& op = def.cast_final_safe<ShapeInfer>();
    return op.op->hash();
}

auto is_same_st(const OpDef& def, const OpDef& another) {
    if (!another.same_type<ShapeInfer>()) {
        return false;
    }
    auto& lhs = def.cast_final_safe<ShapeInfer>();
    auto& rhs = another.cast_final_safe<ShapeInfer>();
    if (!lhs.op->is_same(*rhs.op)) {
        return false;
    }
    return std::tie(lhs.devices, lhs.dtypes) ==
           std::tie(rhs.devices, rhs.dtypes);
}

OP_TRAIT_REG(ShapeInfer, ShapeInfer)
    .apply_on_var_node(apply_on_var_node)
    .apply_on_physical_tensor(apply_on_physical_tensor)
    .infer_output_attrs_fallible(infer_output_attrs_fallible)
    .make_name(make_name)
    .props(props)
    .hash(hash)
    .is_same_st(is_same_st)
    .fallback();

}} // shape_infer

MGB_DYN_TYPE_OBJ_FINAL_IMPL(ShapeInfer);
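
// Identity forwards its single input unchanged. On physical tensors it
// returns the input tensor pointer directly, so no copy is performed.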
namespace { namespace identity {

auto apply_on_var_node(
        const OpDef& def,
        const VarNodeArray& inputs) {
    auto&& op = def.cast_final_safe<Identity>();
    mgb_assert(inputs.size() == 1);
    OperatorNodeConfig config{op.make_name()};
    return opr::Identity::make(inputs[0], config);
}

auto apply_on_physical_tensor(
        const OpDef& def,
        const SmallVector<TensorPtr>& inputs) {
    return SmallVector<TensorPtr>{inputs[0]};
}

OP_TRAIT_REG(Identity, Identity)
    .apply_on_var_node(apply_on_var_node)
    .apply_on_physical_tensor(apply_on_physical_tensor)
    .fallback();

}} // identity
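
// SubgraphOp carries a user-defined subgraph. Its forward graph is the stored
// subgraph itself; the backward graph is derived after masking outputs that
// do not require grad, and is wrapped in another SubgraphOp named "<name>Grad".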
namespace { namespace subgraph {

EncodedSubraph make_forward_graph(
        const OpDef& def, SmallVector<LogicalTensorDesc> inputs) {
    return EncodedSubraph::make(def.cast_final_safe<SubgraphOp>().graph);
}

EncodedSubraph make_backward_graph(
        const OpDef& def,
        const SmallVector<LogicalTensorDesc>& inputs,
        const SmallVector<bool>& input_requires_grad,
        SmallVector<bool> output_has_grad) {
    auto& op = def.cast_final_safe<SubgraphOp>();
    mgb_assert(output_has_grad.size() == op.output_grad_mask.size());
    for (size_t i = 0; i < output_has_grad.size(); ++i) {
        if (!op.output_grad_mask[i]) {
            output_has_grad[i] = false;
        }
    }
    auto bgraph = subgraph_detail::make_backward_graph(
            def, inputs, input_requires_grad, output_has_grad);
    return EncodedSubraph::make_single(
            SubgraphOp::make(op.name + "Grad", bgraph.graph),
            bgraph.input_mask, bgraph.output_mask);
}

std::vector<std::pair<const char*, std::string>> props(const OpDef& def) {
    auto& op = def.cast_final_safe<SubgraphOp>();
    return {
        {"name", op.name},
        {"inputs", mgb::imperative::to_string(op.graph.inputs)},
        {"exprs", mgb::imperative::to_string(op.graph.exprs)},
        {"outputs", mgb::imperative::to_string(op.graph.outputs)},
    };
}

std::string make_name(const OpDef& def) {
    auto& op = def.cast_final_safe<SubgraphOp>();
    if (op.name.empty()) {
        return "SubgraphOp";
    } else {
        return op.name;
    }
}

auto hash(const OpDef& def) {
    auto& op = def.cast_final_safe<SubgraphOp>();
    if (!op.graph_key) {
        return (size_t)reinterpret_cast<uintptr_t>(&op.graph);
    }
    return op.graph_key->hash();
}

auto is_same_st(const OpDef& def, const OpDef& another) {
    if (!another.same_type<SubgraphOp>()) {
        return false;
    }
    auto& lhs = def.cast_final_safe<SubgraphOp>();
    auto& rhs = another.cast_final_safe<SubgraphOp>();
    auto has_graph_key = bool(lhs.graph_key);
    bool graph_same = false;
    if (has_graph_key) {
        graph_same = rhs.graph_key && lhs.graph_key->is_same(*rhs.graph_key);
    } else {
        graph_same = !rhs.graph_key && &lhs.graph == &rhs.graph;
    }
    return graph_same;
}

OP_TRAIT_REG(SubgraphOp, SubgraphOp)
    .make_forward_graph(make_forward_graph)
    .make_backward_graph(make_backward_graph)
    .props(props)
    .make_name(make_name)
    .hash(hash)
    .is_same_st(is_same_st)
    .fallback();

}} // subgraph
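
// CompiledOp wraps an inner op and executes it through a dedicated
// ComputingGraph, so graph-level optimizations (gopt_level) apply to an
// imperative call. A custom allocator routes dynamic allocations back to the
// op, and compiled graphs are cached per (op, input descriptors).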
namespace { namespace compiled_op {

struct DeviceMemoryAllocatorImpl : cg::DeviceMemoryAllocator {
    std::shared_ptr<OpDef> current_op;
    void alloc_static(
            ComputingGraph* graph, DeviceTensorStorage& dest, size_t size) override {
        mgb_assert(0, "alloc_static is not allowed in CompiledOp");
    }
    void alloc_dynamic(
            VarNode* var, DeviceTensorStorage& dest, size_t size) override {
        auto comp_node = var->comp_node();
        auto storage = current_op->allocate(comp_node, size);
        dest.reset(comp_node, size, storage);
    }
};

struct ComputingGraphHolder {
    std::shared_ptr<ComputingGraph> graph;
    std::unique_ptr<cg::AsyncExecutable> executable;
    SmallVector<std::shared_ptr<DeviceTensorND>> inputs;
    SmallVector<std::shared_ptr<DeviceTensorND>> outputs;
    std::shared_ptr<DeviceMemoryAllocatorImpl> allocator;
};
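
// One compiled graph is cached per thread, keyed by the op and the input
// descriptors. get_computing_graph builds the graph lazily: InputCallback
// nodes feed the cached input tensors, and output callbacks capture the
// results into the holder.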
thread_local OpMethResultCache<ComputingGraphHolder> cg_cache;

ComputingGraphHolder& get_computing_graph(
        std::shared_ptr<OpDef> compiled_op, SmallVector<LogicalTensorDesc> descs) {
    OpMethArgs<> key = {compiled_op, descs};
    auto& cg_holder = cg_cache[key];
    if (!cg_holder.graph) {
        cg_holder.allocator = std::make_shared<DeviceMemoryAllocatorImpl>();
        cg_holder.graph = ComputingGraph::make();
        cg_holder.graph->options().force_dynamic_alloc = true;
        cg_holder.graph->options().async_exec_level = 0;
        cg_holder.graph->options().graph_opt_level =
                compiled_op->cast_final_safe<CompiledOp>().gopt_level;
        cg_holder.graph->options().enable_var_mem_defragment = false;
        cg_holder.graph->options().comp_seq_sync_device = false;
        cg_holder.graph->set_device_memory_allocator(cg_holder.allocator);
        // cg_holder.graph->options().graph_opt.jit = 2;
        VarNodeArray input_vars;
        for (auto&& desc : descs) {
            auto input_device_nd = std::make_shared<DeviceTensorND>();
            input_device_nd->dtype(desc.layout.dtype);
            input_device_nd->comp_node(desc.comp_node);
            input_device_nd->resize(desc.layout);
            cg_holder.inputs.push_back(input_device_nd);
            auto callback = [input_device_nd] {
                return *input_device_nd;
            };
            auto* input_var = opr::InputCallback::make(
                    *cg_holder.graph, callback, desc.comp_node, desc.layout.dtype,
                    TensorShape())[0].node();
            input_vars.push_back(input_var);
        }
        // forward to inner op
        auto output_vars = OpDef::apply_on_var_node(*compiled_op, input_vars);
        ComputingGraph::OutputSpec output_spec;
        size_t nr_outputs = output_vars.size();
        for (size_t i = 0; i < nr_outputs; ++i) {
            auto* output_var = output_vars[i];
            auto output_ptr = std::make_shared<DeviceTensorND>();
            auto callback = [output_ptr](DeviceTensorND output) {
                output_ptr->reset(output.storage(), output.layout());
            };
            output_spec.push_back({output_var, callback});
            cg_holder.outputs.push_back(output_ptr);
        }
        cg_holder.executable = cg_holder.graph->compile(output_spec);
    }
    return cg_holder;
}
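
// Execution path: bind the caller's input storages into the cached graph,
// run the executable, then detach the cached input/output DeviceTensorNDs so
// the cache does not keep references to caller memory between calls.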
auto apply_on_physical_tensor(
        const OpDef& def,
        const SmallVector<TensorPtr>& inputs) {
    SmallVector<LogicalTensorDesc> input_descs;
    for (auto&& input : inputs) {
        input_descs.push_back({input->layout(), input->comp_node()});
    }
    size_t nr_inputs = inputs.size();
    auto shared_def = const_cast<OpDef&>(def).shared_from_this();
    auto& cg_holder = get_computing_graph(shared_def, input_descs);
    for (size_t i = 0; i < nr_inputs; ++i) {
        auto input_dev_tensor = inputs[i]->dev_tensor();
        cg_holder.inputs[i]->reset(
                input_dev_tensor.storage(), input_dev_tensor.layout());
    }
    cg_holder.allocator->current_op = shared_def;
    cg_holder.executable->execute();
    cg_holder.executable->wait();
    SmallVector<TensorPtr> outputs;
    for (auto input_nd : cg_holder.inputs) {
        *input_nd = {};
    }
    for (auto output_nd : cg_holder.outputs) {
        outputs.push_back(Tensor::make(*output_nd));
        *output_nd = {};
    }
    cg_holder.executable->clear_device_memory();
    cg_holder.allocator->current_op = nullptr;
    return outputs;
}

auto apply_on_var_node(
        const OpDef& def,
        const VarNodeArray& inputs) {
    return OpDef::apply_on_var_node(*def.cast_final_safe<CompiledOp>().op, inputs);
}

auto infer_output_attrs_fallible(
        const OpDef& def,
        const SmallVector<LogicalTensorDesc>& input_descs) {
    return OpDef::infer_output_attrs_fallible(
            *def.cast_final_safe<CompiledOp>().op, input_descs);
}

auto props(const OpDef& def) {
    return OpDef::props(*def.cast_final_safe<CompiledOp>().op);
}

auto make_name(const OpDef& def) {
    auto& op = def.cast_final_safe<CompiledOp>();
    MGB_MARK_USED_VAR(op);
    return ssprintf("CompiledOp[%s]", op.op->make_name().c_str());
}

std::tuple<SmallVector<MemoryDesc>, SmallVector<MemoryDesc>> infer_output_mem_desc(
        const OpDef& def,
        const SmallVector<TensorPtr>& inputs_tensors,
        const SmallVector<MemoryDesc>& inputs_mems) {
    return {};
}

EncodedSubraph make_backward_graph(
        const OpDef& def,
        const SmallVector<LogicalTensorDesc>& inputs,
        const SmallVector<bool>& input_requires_grad,
        const SmallVector<bool>& output_has_grad) {
    auto& op = def.cast_final_safe<CompiledOp>();
    auto backward_graph = OpDef::make_backward_graph(
            *op.op, inputs, input_requires_grad, output_has_grad);
    auto name = def.trait()->make_name(def);
    auto key = std::make_shared<BackwardOpKey>();
    key->op = op.op;
    key->inputs = inputs;
    key->extras = {input_requires_grad, output_has_grad};
    SmallVector<bool> grad_outputs_has_grad(backward_graph.graph.outputs.size(), true);
    std::shared_ptr<OpDef> bgraph_op;
    if (backward_graph.graph.is_single()) {
        bgraph_op = backward_graph.graph.as_single();
    } else {
        bgraph_op = SubgraphOp::make(
                name + "Grad", backward_graph.graph, grad_outputs_has_grad, key);
    }
    auto compiled_op = CompiledOp::make(bgraph_op, op.gopt_level);
    auto encoded_graph = EncodedSubraph::make_single(
            compiled_op, backward_graph.input_mask, backward_graph.output_mask);
    return encoded_graph;
}

auto hash(const OpDef& def) {
    auto& op = def.cast_final_safe<CompiledOp>();
    return mgb::hash_pair_combine(op.op->hash(), op.gopt_level);
}

auto is_same_st(const OpDef& def, const OpDef& another) {
    if (!another.same_type<CompiledOp>()) {
        return false;
    }
    auto& lhs = def.cast_final_safe<CompiledOp>();
    auto& rhs = another.cast_final_safe<CompiledOp>();
    return lhs.op->is_same(*rhs.op) && lhs.gopt_level == rhs.gopt_level;
}

OP_TRAIT_REG(CompiledOp, CompiledOp)
    .apply_on_var_node(apply_on_var_node)
    .apply_on_physical_tensor(apply_on_physical_tensor)
    .infer_output_attrs_fallible(infer_output_attrs_fallible)
    .make_backward_graph(make_backward_graph)
    .make_name(make_name)
    .infer_output_mem_desc(infer_output_mem_desc)
    .props(props)
    .hash(hash)
    .is_same_st(is_same_st)
    .fallback();

}} // compiled_op

MGB_DYN_TYPE_OBJ_FINAL_IMPL(SubgraphOp);
MGB_DYN_TYPE_OBJ_FINAL_IMPL(BackwardOpKey);
MGB_DYN_TYPE_OBJ_FINAL_IMPL(CompiledOp);

} // namespace mgb::imperative
