You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

gpu_session.cc 9.9 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253
  1. /**
  2. * Copyright 2019-2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "session/gpu_session.h"
  17. #include "device/gpu/kernel_info_setter.h"
  18. #include "device/gpu/gpu_kernel_build.h"
  19. #include "device/gpu/gpu_kernel_runtime.h"
  20. #include "device/gpu/gpu_stream_assign.h"
  21. #include "pre_activate/common/optimizer.h"
  22. #include "pre_activate/common/pass_manager.h"
  23. #include "pre_activate/common/helper.h"
  24. #include "pre_activate/pass/communication_op_fusion.h"
  25. #include "pre_activate/pass/getitem_tuple.h"
  26. #include "device/kernel_runtime_manager.h"
  27. #include "predict/predict.h"
  28. #include "common/utils.h"
  29. #include "common/trans.h"
  30. #include "utils/context/ms_context.h"
  31. #include "utils/base_ref_extends.h"
  32. namespace mindspore {
  33. namespace session {
  34. namespace gpu {
  35. using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm;
  36. void GPUSession::SelectKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  37. MS_EXCEPTION_IF_NULL(kernel_graph);
  38. for (const auto &kernel_node : kernel_graph->execution_order()) {
  39. MS_EXCEPTION_IF_NULL(kernel_node);
  40. device::gpu::SetKernelInfo(kernel_node);
  41. }
  42. }
  43. void GPUSession::StartKernelRT() const {
  44. auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  45. MS_EXCEPTION_IF_NULL(runtime_instance);
  46. if (!runtime_instance->Init()) {
  47. MS_LOG(EXCEPTION) << "GPU start kernel runtime failed";
  48. }
  49. }
  50. void GPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
  51. MS_EXCEPTION_IF_NULL(kernel_graph);
  52. auto optimizer = std::make_shared<opt::GraphOptimizer>();
  53. auto pm = std::make_shared<opt::PassManager>();
  54. pm->AddPass(std::make_shared<opt::AllReduceFusion>());
  55. pm->AddPass(std::make_shared<opt::GetitemTuple>());
  56. optimizer->AddPassManager(pm);
  57. (void)optimizer->Optimize(kernel_graph);
  58. kernel_graph->SetExecOrderByDefault();
  59. }
  60. void GPUSession::AssignStream(const std::shared_ptr<KernelGraph> &kernel_graph) {
  61. MS_EXCEPTION_IF_NULL(kernel_graph);
  62. device::gpu::AssignGpuStream(kernel_graph);
  63. }
  64. void GPUSession::BuildKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  65. device::gpu::GpuBuild(kernel_graph);
  66. }
  67. void GPUSession::AllocateMemory(KernelGraph *kernel_graph) const {
  68. MS_EXCEPTION_IF_NULL(kernel_graph);
  69. auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  70. MS_EXCEPTION_IF_NULL(runtime_instance);
  71. runtime_instance->AssignMemory(kernel_graph);
  72. }
  73. void GPUSession::RunOpAllocateMemory(const std::vector<tensor::TensorPtr> &input_tensors,
  74. KernelGraph *kernel_graph) const {
  75. MS_EXCEPTION_IF_NULL(kernel_graph);
  76. auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  77. MS_EXCEPTION_IF_NULL(runtime_instance);
  78. runtime_instance->RunOpAssignMemory(input_tensors, kernel_graph);
  79. }
  80. void GPUSession::RunOpClearMemory(KernelGraph *kernel_graph) const {
  81. MS_EXCEPTION_IF_NULL(kernel_graph);
  82. auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  83. MS_EXCEPTION_IF_NULL(runtime_instance);
  84. runtime_instance->RunOpClearMemory(kernel_graph);
  85. }
  86. void GPUSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
  87. const std::vector<tensor::TensorPtr> &inputs_const) const {
  88. std::vector<tensor::TensorPtr> inputs(inputs_const);
  89. MS_EXCEPTION_IF_NULL(kernel_graph);
  90. auto input_nodes = kernel_graph->inputs();
  91. auto ms_context = MsContext::GetInstance();
  92. MS_EXCEPTION_IF_NULL(ms_context);
  93. for (size_t i = 0; i < inputs.size(); ++i) {
  94. auto tensor = inputs[i];
  95. MS_EXCEPTION_IF_NULL(tensor);
  96. auto input_node = input_nodes[i];
  97. MS_EXCEPTION_IF_NULL(input_node);
  98. if (input_node->isa<Parameter>() && AnfAlgo::OutputAddrExist(input_node, 0)) {
  99. auto pk_node = input_node->cast<ParameterPtr>();
  100. auto device_address = AnfAlgo::GetMutableOutputAddr(pk_node, 0);
  101. auto tensor_address = tensor->device_address();
  102. bool need_sync = false;
  103. if (ms_context->enable_pynative_infer()) {
  104. if (tensor_address.get() == nullptr || tensor_address != device_address) {
  105. need_sync = true;
  106. }
  107. } else if (tensor->is_dirty()) {
  108. need_sync = true;
  109. } else if (tensor_address != device_address) {
  110. if (tensor_address->DeviceType() == device_address->DeviceType()) {
  111. AnfAlgo::SetOutputAddr(tensor_address, 0, pk_node.get());
  112. } else {
  113. need_sync = true;
  114. }
  115. }
  116. if (need_sync) {
  117. tensor->set_device_address(device_address);
  118. MS_EXCEPTION_IF_NULL(device_address);
  119. if (!device_address->SyncHostToDevice(trans::GetRuntimePaddingShape(pk_node, 0),
  120. LongToSize(tensor->data().nbytes()), tensor->data_type(),
  121. tensor->data_c(false))) {
  122. MS_LOG(EXCEPTION) << "SyncHostToDevice failed.";
  123. }
  124. }
  125. }
  126. tensor->set_dirty(false);
  127. }
  128. }
  129. void GPUSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  130. auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  131. MS_EXCEPTION_IF_NULL(runtime_instance);
  132. if (!runtime_instance->Run(kernel_graph.get())) {
  133. MS_LOG(EXCEPTION) << "GPU execute graph failed!";
  134. }
  135. }
// Compiles the given anf node list into an executable GPU kernel graph and
// returns its graph id. The pipeline stages below are strictly ordered:
// kernel selection must precede build, NopNode hiding must precede build,
// execution-order reorder must precede memory allocation, and NopNode
// removal must happen before memory is assigned.
GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) {
// Construct graph; if successful, graph_sum_ is incremented (inside
// ConstructKernelGraph), so capture the id first.
auto graph_id = graph_sum_;
auto graph = ConstructKernelGraph(lst, outputs);
MS_EXCEPTION_IF_NULL(graph);
// Select a kernel build info for every node.
SelectKernel(graph);
// Convert kernel graph to the predict model representation.
predictmodel::StepConvertGraph(graph);
// Start the GPU kernel runtime (device init must precede stream assignment).
StartKernelRT();
// AllReduce fusion and related graph optimizations.
Optimize(graph);
// Assign CUDA streams.
AssignStream(graph);
// Hide NopNodes from the execution graph so they are not built as kernels.
opt::HideNopNode(graph.get());
// Build a kernel for each cnode.
BuildKernel(graph);
// Set graph execution order before memory alloc, ensure that memory alloc is according to the reorder graph
auto execution_order = graph->execution_order();
Reorder(&execution_order);
graph->set_execution_order(execution_order);
// Get summary nodes.
GetSummaryNodes(graph.get());
// Remove NopNodes from the execution graph.
opt::RemoveNopNode(graph.get());
// Alloc memory, including static memory and dynamic memory.
AllocateMemory(graph.get());
MS_EXCEPTION_IF_NULL(context_);
// Register a manager so the graph participates in func-graph bookkeeping.
// NOTE(review): MakeManager may return null; AddManager is called with the
// possibly-null manager before the null check below — confirm AddManager
// tolerates null.
FuncGraphManagerPtr manager = MakeManager({graph});
context_->AddManager(manager);
if (manager) {
manager->AddFuncGraph(graph);
graph->set_manager(manager);
}
return graph_id;
}
  174. void GPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) {
  175. auto &kernel_graph = graphs_[graph_id];
  176. // Load input data from user input
  177. LoadInputData(kernel_graph, inputs);
  178. MS_EXCEPTION_IF_NULL(kernel_graph);
  179. // Convert inputs to model
  180. predictmodel::StepConvertWeight(inputs);
  181. {
  182. py::gil_scoped_release gil_release;
  183. // Run graph on GPU
  184. Execute(kernel_graph);
  185. }
  186. // Get result from GPU
  187. UpdateOutputs(kernel_graph, outputs, inputs);
  188. // Summary
  189. auto context_ptr = MsContext::GetInstance();
  190. MS_EXCEPTION_IF_NULL(context_ptr);
  191. if (context_ptr->enable_gpu_summary()) {
  192. Summary(kernel_graph.get());
  193. }
  194. }
  195. void GPUSession::BuildOp(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
  196. const std::vector<tensor::TensorPtr> &input_tensors, const std::vector<int> &tensors_mask) {
  197. // Check if the graph cache exists.
  198. if (run_op_graphs_.find(graph_info) != run_op_graphs_.end()) {
  199. return;
  200. }
  201. // Prepare the graph
  202. auto kernel_graph = ConstructSingleOpGraph(op_run_info, input_tensors, tensors_mask);
  203. MS_EXCEPTION_IF_NULL(kernel_graph);
  204. SelectKernel(kernel_graph);
  205. StartKernelRT();
  206. // Hide NoOp from execution graph
  207. opt::HideNopNode(kernel_graph.get());
  208. BuildKernel(kernel_graph);
  209. run_op_graphs_[graph_info] = kernel_graph;
  210. }
  211. py::tuple GPUSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
  212. const std::vector<tensor::TensorPtr> &input_tensors) {
  213. auto kernel_graph = run_op_graphs_[graph_info];
  214. MS_EXCEPTION_IF_NULL(kernel_graph);
  215. // Remove NoOp from execution graph
  216. opt::RemoveNopNode(kernel_graph.get());
  217. RunOpAllocateMemory(input_tensors, kernel_graph.get());
  218. // Execute the computation
  219. LoadInputData(kernel_graph, input_tensors);
  220. Execute(kernel_graph);
  221. // Fetch outputs
  222. VectorRef outputs;
  223. UpdateOutputs(kernel_graph, &outputs, input_tensors);
  224. // Trans output to tuple
  225. auto output_tensors = TransformBaseRefListToTuple(outputs);
  226. if (!utils::isa<PyObjectRef>(output_tensors) ||
  227. !py::isinstance<py::tuple>(utils::cast<PyObjectRef>(output_tensors).object_)) {
  228. MS_EXCEPTION(NotSupportError) << "The output tensors should be a tuple !";
  229. }
  230. py::object tuple_obj = utils::cast<PyObjectRef>(output_tensors).object_;
  231. py::tuple tuple_tensors = py::cast<py::tuple>(tuple_obj);
  232. RunOpClearMemory(kernel_graph.get());
  233. return tuple_tensors;
  234. }
  235. } // namespace gpu
  236. } // namespace session
  237. } // namespace mindspore