
graph_compiler.h

/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_GRAPH_COMPILER_H_
#define MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_GRAPH_COMPILER_H_

#include <vector>
#include <memory>
#include <string>
#include <unordered_map>
#include <map>
#include <set>
#include "runtime/hardware/device_context.h"
#include "backend/session/session_basic.h"
#include "backend/session/session_factory.h"
#include "ir/tensor.h"

namespace mindspore {
using device::DeviceContext;
using session::CallBackFunc;
using session::GraphOutputInfo;
using session::InputTensorInfo;
using session::KernelGraph;
using session::KernelWithIndex;
using session::OpRunInfo;
using tensor::TensorPtr;

namespace runtime {
class GraphCompiler {
 public:
  GraphCompiler() { session_ = session::SessionFactory::Get().Create(kSessionBasic); }
  ~GraphCompiler() = default;

  // Construct a kernel graph from the anf node list and compile it in Graph mode;
  // the detailed implementation of graph compilation is in 'CompileGraphImpl'.
  GraphId CompileGraph(const AnfNodePtrList &nodes, const AnfNodePtrList &outputs, const DeviceContext *device_context);

  // Construct a single op kernel graph and compile it in PyNative mode.
  GraphId CompileGraph(const session::OpRunInfo &op_run_info, const GraphInfo &graph_info,
                       const std::vector<int64_t> *tensors_mask, std::vector<TensorPtr> *const input_tensors,
                       bool *single_op_cache_hit, const DeviceContext *device_context);

  // Get the graph by graph id; returns nullptr if the graph does not exist. Used in Graph mode.
  KernelGraphPtr Fetch(GraphId graph_id) const;

  // Get the graph by graph info; returns nullptr if the graph does not exist. Used in PyNative mode.
  KernelGraphPtr Fetch(const GraphInfo &graph_info) const;
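
  // Usage sketch (Graph mode), a minimal hypothetical example; 'nodes', 'outputs' and
  // 'device_context' are assumed to be supplied by the calling backend:
  //
  //   GraphCompiler compiler;
  //   GraphId graph_id = compiler.CompileGraph(nodes, outputs, device_context);
  //   KernelGraphPtr graph = compiler.Fetch(graph_id);  // nullptr if the id is unknown.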

  // The following four methods are used in PyNative back propagation to split the complete kernel
  // graph into single op graphs; they will be moved to class MindRTBackend after the session module
  // is deleted.
  // Cache the indexes of all parameter and output nodes of the kernel graph; the indexes are used to
  // fetch the parameters of a single op and to recover the output of the original complete back
  // propagation kernel graph.
  void GetParamAndOutputIndex(const KernelGraphPtr &graph, const std::vector<TensorPtr> &inputs,
                              VectorRef *const outputs, std::map<AnfNodePtr, size_t> *parameter_index,
                              std::map<KernelWithIndex, std::vector<std::vector<size_t>>> *output_indexes);

  // Get the input tensors for single op compilation and execution; the input tensors may be converted
  // from value nodes and parameters in the graph, or from the previous kernel node's output.
  void GetSingleOpInputTensors(const CNodePtr &kernel, const std::map<KernelWithIndex, TensorPtr> &op_output,
                               const std::map<AnfNodePtr, size_t> &parameter_index,
                               const std::vector<TensorPtr> &graph_inputs, InputTensorInfo *const input_tensor_info);

  // Get one input tensor for a single control op, such as bprop_cut.
  TensorPtr GetSingleOpInputTensorByIndex(const CNodePtr &kernel, const std::map<KernelWithIndex, TensorPtr> &op_output,
                                          const std::map<AnfNodePtr, size_t> &parameter_index,
                                          const std::vector<TensorPtr> &graph_inputs,
                                          InputTensorInfo *const input_tensor_info, size_t input_index);

  // Get the OpRunInfo and GraphInfo for single op compilation and execution.
  void GetSingleOpRunInfoAndGraphInfo(const CNodePtr &kernel, const std::vector<TensorPtr> &input_tensors,
                                      OpRunInfo *const run_info, GraphInfo *const graph_info);
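
  // Usage sketch (PyNative back propagation), a hypothetical outline of the per-op flow;
  // 'compiler', 'graph', 'inputs', 'outputs' and 'op_output_map' are assumed to come from
  // the surrounding backend code:
  //
  //   std::map<AnfNodePtr, size_t> parameter_index;
  //   std::map<KernelWithIndex, std::vector<std::vector<size_t>>> output_indexes;
  //   compiler.GetParamAndOutputIndex(graph, inputs, &outputs, &parameter_index, &output_indexes);
  //   for (const auto &kernel : graph->execution_order()) {
  //     InputTensorInfo input_tensor_info;
  //     compiler.GetSingleOpInputTensors(kernel, op_output_map, parameter_index, inputs, &input_tensor_info);
  //     OpRunInfo run_info;
  //     GraphInfo graph_info;
  //     compiler.GetSingleOpRunInfoAndGraphInfo(kernel, input_tensor_info.input_tensors, &run_info, &graph_info);
  //     // ... compile and launch via the single op CompileGraph overload above.
  //   }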

  // Calculate the ref count of PyNative back propagation operators.
  void CalculateRefCount(const KernelGraphPtr &graph, std::map<KernelWithIndex, size_t> *ref_count) const;

  // Update the ref count of PyNative back propagation operators.
  void UpdateRefCount(const std::set<KernelWithIndex> &input_kernels_with_index,
                      std::map<KernelWithIndex, size_t> *ref_count,
                      std::map<KernelWithIndex, tensor::TensorPtr> *op_output_map) const;

  // Handle the single op output tensor and recover the output of the original complete kernel graph.
  void RecoverGraphOutput(const AnfNodePtr &kernel, const VectorRef &op_outputs,
                          const std::map<KernelWithIndex, size_t> &ref_count,
                          std::map<KernelWithIndex, TensorPtr> *op_output_map,
                          GraphOutputInfo *const graph_output_info) const;
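
  // Usage sketch (ref counting), a hypothetical outline; the ref count is computed once per graph,
  // then updated after each single op run so cached op outputs can be released as soon as their
  // last consumer has used them ('input_tensor_info.input_kernel' is an assumption here):
  //
  //   std::map<KernelWithIndex, size_t> ref_count;
  //   compiler.CalculateRefCount(graph, &ref_count);
  //   // ... after one single op has run:
  //   compiler.UpdateRefCount(input_tensor_info.input_kernel, &ref_count, &op_output_map);
  //   compiler.RecoverGraphOutput(kernel, op_outputs, ref_count, &op_output_map, &graph_output_info);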

  // Collect the output tensors of the back propagation graph for allreduce operators to average
  // gradients; used in PyNative distributed training mode.
  void AddGradAddrToBucket(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &grad_tensor);

  // Clear the resources in buckets, such as stale tensors and the device memory of all communication
  // operators. Buckets are used in PyNative distributed training mode; one bucket holds all the
  // resources needed to launch and synchronize an allreduce operator.
  void ClearAllBucket(const GraphId &graph_id);
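
  // Usage sketch (gradient buckets), a hypothetical outline of one PyNative distributed
  // training step:
  //
  //   compiler.AddGradAddrToBucket(graph_id, grad_tensors);  // Per batch of gradient tensors.
  //   // ... launch and synchronize the allreduce operators ...
  //   compiler.ClearAllBucket(graph_id);                     // At the end of the step.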

  const std::vector<KernelWithIndex> &GetGraphOutputNodes(GraphId graph_id) const;

  // Register a summary callback function, which is called in the final stages of summary.
  void RegisterSummaryCallBackFunc(const CallBackFunc &callback) const;

  // Execute graph summary.
  void Summary(const std::vector<KernelGraphPtr> &graphs) const;
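
  // Usage sketch (summary), a hypothetical example; 'SummaryCallback' stands in for a
  // CallBackFunc implemented by the caller:
  //
  //   compiler.RegisterSummaryCallBackFunc(SummaryCallback);
  //   compiler.Summary({graph});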

  // Remove the single op kernel graph cache and the output nodes cache.
  void EraseSingleOpCache(const GraphInfo &graph_info, const GraphId &graph_id);

 private:
  DISABLE_COPY_AND_ASSIGN(GraphCompiler);

  // The implementation of compiling a graph in Graph mode, including optimizing the graph,
  // setting operator info, creating kernels and transforming the kernel graph to an ActorSet.
  GraphId CompileGraphImpl(const KernelGraphPtr &graph, const DeviceContext *device_context) const;

  // Create device addresses for all anf nodes of the graph.
  void CreateDeviceAddress(const KernelGraphPtr &graph, const DeviceContext *device_context) const;

  // Single op kernel graph cache for PyNative mode.
  std::unordered_map<GraphInfo, KernelGraphPtr> run_op_graphs_;

  // Single op kernel graph output nodes cache for PyNative mode.
  std::unordered_map<GraphId, std::vector<KernelWithIndex>> run_op_graph_output_nodes_;

  // The member variable 'session_' will be removed after the session module is removed.
  // For now, all GraphCompilers share the same 'session_'.
  session::SessionPtr session_;
};
}  // namespace runtime
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_GRAPH_COMPILER_H_