You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

graph_compiler.h 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219
  1. /**
  2. * Copyright 2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_GRAPH_COMPILER_H_
  17. #define MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_GRAPH_COMPILER_H_
  18. #include <vector>
  19. #include <memory>
  20. #include <string>
  21. #include <map>
  22. #include <set>
  23. #include "utils/hash_map.h"
  24. #include "runtime/hardware/device_context.h"
  25. #include "runtime/graph_scheduler/actor/actor_common.h"
  26. #include "runtime/graph_scheduler/control_node_parser.h"
  27. #include "backend/common/session/session_basic.h"
  28. #include "backend/common/session/session_factory.h"
  29. #include "ir/tensor.h"
  30. #include "include/backend/visible.h"
namespace mindspore {
// Aliases imported from the device and session modules for brevity in this header.
using device::DeviceContext;
using session::CallBackFunc;
using session::GraphOutputInfo;
using session::InputTensorInfo;
using session::KernelGraph;
using session::KernelWithIndex;
using session::OpRunInfo;
using tensor::TensorPtr;
namespace runtime {
// Position of kernel with index, the value pair<branch_id, vector<pos>> means the branch id of the kernel and the pos
// of the kernel. Generally, there is only one branch, and the branch id is 0 at this time. In control flow, there are
// multiple branch scenarios, and pos represents the position of the kernel in the branch.
using KernelMapPosition = std::map<KernelWithIndex, std::vector<size_t>, session::KernelWithIndexCmp>;
// The graph compiler info generated by graph compiler is the express of executable graph.
// The device context is unified interface of interaction with device of corresponding graph.
// The tensors mask is used to distinguish input tensor's type.
// The input tensor is used to link graphs in the dynamic build scenario.
// The control node is used to link graphs in the control flow scenario.
// The control node parser is used to parse the edge info in control nodes.
// The origin parameters order is used to correspond to the input args.
// The origin outputs order is used to correspond to the output args.
// The need_erase means need erase this GraphCompilerInfo object after run actor set.
struct BACKEND_EXPORT GraphCompilerInfo {
  GraphCompilerInfo(const std::vector<KernelGraphPtr> &graphs, const std::vector<DeviceContext *> &device_contexts,
                    const std::vector<std::vector<int64_t> *> &tensors_mask,
                    const std::vector<std::vector<TensorPtr> *> &input_tensors,
                    const std::vector<AnfNodePtr> &control_nodes,
                    const std::vector<AnfNodePtr> &origin_parameters_order, const ControlNodeParserPtr &parser,
                    const KernelMapPosition &origin_outputs_order, const size_t outputs_num, const std::string &name,
                    bool need_erase, GraphExecutionStrategy strategy)
      : graphs_(graphs),
        device_contexts_(device_contexts),
        tensors_mask_(tensors_mask),
        input_tensors_(input_tensors),
        control_nodes_(control_nodes),
        control_node_parser_(parser),
        origin_parameters_order_(origin_parameters_order),
        origin_outputs_order_(origin_outputs_order),
        outputs_num_(outputs_num),
        name_(name),
        need_erase_(need_erase),
        strategy_(strategy) {}
  // Non-default destructor, defined out of line (in the corresponding .cc file).
  ~GraphCompilerInfo();
  // Compiled kernel graphs of this executable graph.
  std::vector<KernelGraphPtr> graphs_;
  // One device context per graph; unified interface of interaction with the device.
  std::vector<DeviceContext *> device_contexts_;
  // Per-graph masks distinguishing the input tensors' types.
  std::vector<std::vector<int64_t> *> tensors_mask_;
  // Input tensors used to link graphs in the dynamic build scenario.
  std::vector<std::vector<TensorPtr> *> input_tensors_;
  // Control nodes used to link graphs in the control flow scenario.
  std::vector<AnfNodePtr> control_nodes_;
  // Parses the edge info in the control nodes above.
  ControlNodeParserPtr control_node_parser_;
  // Corresponds to the input args of the executable graph.
  std::vector<AnfNodePtr> origin_parameters_order_;
  // Corresponds to the output args of the executable graph.
  KernelMapPosition origin_outputs_order_;
  size_t outputs_num_;
  std::string name_;
  // When true, this GraphCompilerInfo is erased after the actor set runs.
  bool need_erase_;
  GraphExecutionStrategy strategy_;
};
  88. class GraphCompiler {
  89. public:
  90. GraphCompiler() { session_ = session::SessionFactory::Get().Create(kSessionBasic); }
  91. ~GraphCompiler() = default;
  92. // Construct kernel graph from anf nodes list and compile kernel graph in Graph mode,
  93. // the detailed implementation of compiling graph is in 'CompileGraphImpl'.
  94. GraphId CompileGraph(const GraphSegmentPtr &segment, const AnfNodePtrList &outputs,
  95. const DeviceContext *device_context, bool run_in_pynative = false);
  96. // Construct kernel graph from function graph and compile kernel graph in Graph mode,
  97. // the detailed implementation of compiling graph is in 'CompileGraphImpl'.
  98. GraphId CompileGraph(const FuncGraphPtr &func_graph, const DeviceContext *device_context);
  99. // Construct single op kernel graph and compile the kernel graph in PyNative mode.
  100. GraphId CompileGraph(const session::OpRunInfo &op_run_info, bool *single_op_cache_hit,
  101. const DeviceContext *device_context);
  102. // Create kernel and Create workspace for graphs in PyNative mode.
  103. void BuildSingleOpGraphs(const std::vector<KernelGraphPtr> &graphs, const DeviceContext *device_context) const;
  104. // Get graph by graph id, if not exist return nullptr, used in Graph mode.
  105. KernelGraphPtr Fetch(GraphId graph_id) const;
  106. // Get graph by graph info, if not exist return nullptr, used in PyNative mode.
  107. KernelGraphPtr Fetch(const GraphInfo &graph_info) const;
  108. // The following four methods used in PyNative back propagation to split complete kernel graph to single
  109. // op graph, and these methods will be removed to class MindRTBackend after deleting session module.
  110. // Cache index for all parameter and output nodes of kernel graph, used to get parameter of single op and
  111. // recover output of original complete back propagation kernel graph.
  112. void GetParamAndOutputIndex(const KernelGraphPtr &graph, const std::vector<TensorPtr> &inputs,
  113. VectorRef *const outputs, std::map<AnfNodePtr, size_t> *parameter_index,
  114. std::map<KernelWithIndex, std::vector<std::vector<size_t>>> *output_indexes);
  115. // Get input tensors for single op compile and run, input tensors may convert from value node and parameter in graph
  116. // and prev kernel node's output.
  117. void GetSingleOpInputTensors(const CNodePtr &kernel, const std::map<KernelWithIndex, TensorPtr> &op_output,
  118. const std::map<AnfNodePtr, size_t> &parameter_index,
  119. const std::vector<TensorPtr> &graph_inputs, InputTensorInfo *const input_tensor_info);
  120. // Get one input tensor for single control op, such as bprop_cut.
  121. TensorPtr GetSingleOpInputTensorByIndex(const CNodePtr &kernel, const std::map<KernelWithIndex, TensorPtr> &op_output,
  122. const std::map<AnfNodePtr, size_t> &parameter_index,
  123. const std::vector<TensorPtr> &graph_inputs,
  124. InputTensorInfo *const input_tensor_info, size_t input_index);
  125. // Get OpRunInfo and GraphInfo for single op compile and run.
  126. void GetSingleOpRunInfoAndGraphInfo(const CNodePtr &kernel, const InputTensorInfo &tensor_info, OpRunInfo *run_info,
  127. GraphInfo *graph_info, GraphOutputInfo *const graph_output_info);
  128. // Calculate ref count of PyNative back propagation operators.
  129. void CalculateRefCount(const KernelGraphPtr &graph, std::map<KernelWithIndex, size_t> *ref_count) const;
  130. // Calculate forward op output ref count of PyNative back graph.
  131. void CalculateForwardOpOutputCount(const KernelGraphPtr &graph, const std::vector<tensor::TensorPtr> &inputs,
  132. std::map<std::string, size_t> *forward_op_output_tensor_id) const;
  133. // Update ref count of PyNative back propagation operators.
  134. void UpdateRefCount(const std::set<KernelWithIndex> &input_kernels_with_index,
  135. std::map<KernelWithIndex, size_t> *ref_count,
  136. std::map<KernelWithIndex, tensor::TensorPtr> *op_output_map) const;
  137. // Update forward op output ref count of PyNative back graph.
  138. void UpdateForwardOpOutputRefCount(const std::vector<tensor::TensorPtr> &input_tensor,
  139. std::map<std::string, size_t> *forward_op_output_tensor_id) const;
  140. // Handle single op output tensor and recover output of original complete kernel graph.
  141. void RecoverGraphOutput(const AnfNodePtr &kernel, const VectorRef &op_outputs,
  142. const std::map<KernelWithIndex, size_t> &ref_count,
  143. std::map<KernelWithIndex, TensorPtr> *op_output_map,
  144. GraphOutputInfo *const graph_output_info) const;
  145. // Collect output tensors of back propagation graph for allreduce operators to average gradient,
  146. // used in PyNative distributed training mode.
  147. void AddGradAddrToBucket(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &grad_tensor);
  148. // Clear resource in bucket, such as useless tensors and device memory of all communication operators,
  149. // Bucket is used in PyNative distributed training mode, one bucket handles all resource to launch and sync allreduce
  150. // operator.
  151. void ClearAllBucket(const GraphId &graph_id);
  152. const std::vector<KernelWithIndex> &GetGraphOutputNodes(GraphId graph_id) const;
  153. // Register a summary callback function, which is called in the final stages of summary.
  154. void RegisterSummaryCallBackFunc(const CallBackFunc &callback) const;
  155. // Execute graph summary.
  156. void Summary(const std::vector<KernelGraphPtr> &graphs) const;
  157. // Remove single op kernel graph cache and output nodes cache.
  158. void EraseSingleOpCache(const GraphInfo &graph_info, const GraphId &graph_id);
  159. private:
  160. DISABLE_COPY_AND_ASSIGN(GraphCompiler);
  161. // The implementation of compiling graph in Graph Mode, including optimizing graph,
  162. // setting operator info, creating kernel and transforming kernel graph to ActorSet.
  163. GraphId CompileGraphImpl(const KernelGraphPtr &graph, const DeviceContext *device_context) const;
  164. // Add operators' output and input reference map to the graph.
  165. void AddOutInRefToGraph(const KernelGraphPtr &graph) const;
  166. // Create device address for all anf nodes of graph.
  167. void CreateDeviceAddress(const KernelGraphPtr &graph, const DeviceContext *device_context,
  168. bool is_gradient_out) const;
  169. // Create device address for input and output of ops.
  170. void CreateDeviceAddressWithoutWorkspace(const KernelGraphPtr &graph, const DeviceContext *device_context,
  171. bool is_gradient_out) const;
  172. // Set Graph's dependencies for pre_graph and post_graph.
  173. void SetGraphDependency(const KernelGraphPtr &graph, const GraphSegmentPtr &segment) const;
  174. // Single op kernel graph cache for PyNative mode.
  175. mindspore::HashMap<GraphInfo, KernelGraphPtr> run_op_graphs_;
  176. // Single op kernel graph output nodes cache for PyNative mode.
  177. mindspore::HashMap<GraphId, std::vector<KernelWithIndex>> run_op_graph_output_nodes_;
  178. // The member variable 'session_' will be removed after removing session module.
  179. // Now all the GraphCompiler share the same 'session_'.
  180. session::SessionPtr session_;
  181. };
}  // namespace runtime
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_GRAPH_COMPILER_H_