You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

gpu_session.cc 6.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170
  1. /**
  2. * Copyright 2019 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "session/gpu_session.h"
  17. #include "device/gpu/kernel_info_setter.h"
  18. #include "device/gpu/gpu_kernel_build.h"
  19. #include "device/gpu/gpu_kernel_runtime.h"
  20. #include "device/gpu/gpu_stream_assign.h"
  21. #include "pre_activate/common/optimizer.h"
  22. #include "pre_activate/common/pass_manager.h"
  23. #include "pre_activate/pass/communication_op_fusion.h"
  24. #include "device/kernel_runtime_manager.h"
  25. #include "predict/predict.h"
  26. #include "common/utils.h"
  27. #include "utils/context/ms_context.h"
  28. namespace mindspore {
  29. namespace session {
  30. namespace gpu {
  31. using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm;
  32. void GPUSession::SelectKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  33. MS_EXCEPTION_IF_NULL(kernel_graph);
  34. for (const auto &kernel_node : kernel_graph->execution_order()) {
  35. MS_EXCEPTION_IF_NULL(kernel_node);
  36. device::gpu::SetKernelInfo(kernel_node);
  37. }
  38. }
  39. void GPUSession::StartKernelRT() const {
  40. auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  41. MS_EXCEPTION_IF_NULL(runtime_instance);
  42. if (!runtime_instance->Init()) {
  43. MS_LOG(EXCEPTION) << "GPU start kernel runtime failed";
  44. }
  45. }
  46. void GPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
  47. auto optimizer = std::make_shared<opt::GraphOptimizer>();
  48. auto pm = std::make_shared<opt::PassManager>();
  49. pm->AddPass(std::make_shared<opt::AllReduceFusion>());
  50. optimizer->AddPassManager(pm);
  51. (void)optimizer->Optimize(kernel_graph);
  52. kernel_graph->SetExecOrderByDefault();
  53. }
  54. void GPUSession::AssignStream(const std::shared_ptr<KernelGraph> &kernel_graph) {
  55. MS_EXCEPTION_IF_NULL(kernel_graph);
  56. device::gpu::AssignGpuStream(kernel_graph);
  57. }
  58. void GPUSession::BuildKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  59. device::gpu::GpuBuild(kernel_graph);
  60. }
  61. void GPUSession::AllocateMemory(KernelGraph *kernel_graph) const {
  62. MS_EXCEPTION_IF_NULL(kernel_graph);
  63. auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  64. MS_EXCEPTION_IF_NULL(runtime_instance);
  65. runtime_instance->AssignMemory(kernel_graph);
  66. }
  67. void GPUSession::RunOpAllocateMemory(const std::vector<tensor::TensorPtr> &input_tensors,
  68. KernelGraph *kernel_graph) const {
  69. MS_EXCEPTION_IF_NULL(kernel_graph);
  70. auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  71. MS_EXCEPTION_IF_NULL(runtime_instance);
  72. runtime_instance->RunOpAssignMemory(input_tensors, kernel_graph);
  73. }
  74. void GPUSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  75. auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  76. MS_EXCEPTION_IF_NULL(runtime_instance);
  77. if (!runtime_instance->Run(kernel_graph.get())) {
  78. MS_LOG(EXCEPTION) << "GPU execute graph failed!";
  79. }
  80. }
  81. GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) {
  82. // Construct graph, if successfully, graph_sum_ + 1
  83. auto graph_id = graph_sum_;
  84. auto graph = ConstructKernelGraph(lst, outputs);
  85. // Select kernel build info
  86. SelectKernel(graph);
  87. // Convert kernel Graph to model
  88. predictmodel::StepConvertGraph(graph);
  89. // Start gpu kernel runtime
  90. StartKernelRT();
  91. // AllReduce Optimize
  92. Optimize(graph);
  93. // Assign CUDA streams
  94. AssignStream(graph);
  95. // Build kernel if node is cnode
  96. BuildKernel(graph);
  97. // Set graph execution order before memory alloc, ensure that memory alloc is according to the reorder graph
  98. auto execution_order = graph->execution_order();
  99. Reorder(&execution_order);
  100. graph->set_execution_order(execution_order);
  101. // Alloc memory, including static memory and dynamic memory
  102. AllocateMemory(graph.get());
  103. return graph_id;
  104. }
  105. void GPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) {
  106. auto &kernel_graph = graphs_[graph_id];
  107. // Load input data from user input
  108. LoadInputData(kernel_graph, inputs);
  109. MS_EXCEPTION_IF_NULL(kernel_graph);
  110. // Convert inputs to model
  111. predictmodel::StepConvertWeight(inputs);
  112. // Run graph on GPU
  113. Execute(kernel_graph);
  114. // Get result from GPU
  115. UpdateOutputs(kernel_graph, outputs, inputs);
  116. // Summary
  117. auto context_ptr = MsContext::GetInstance();
  118. MS_EXCEPTION_IF_NULL(context_ptr);
  119. if (context_ptr->enable_gpu_summary()) {
  120. Summary(kernel_graph.get());
  121. }
  122. }
  123. void GPUSession::BuildOp(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
  124. const std::vector<tensor::TensorPtr> &input_tensors, const std::vector<bool> &tensors_mask) {
  125. // Prepare the graph
  126. auto kernel_graph = ConstructSingleOpGraph(op_run_info, input_tensors, tensors_mask);
  127. MS_EXCEPTION_IF_NULL(kernel_graph);
  128. SelectKernel(kernel_graph);
  129. StartKernelRT();
  130. BuildKernel(kernel_graph);
  131. run_op_graphs_[graph_info] = kernel_graph;
  132. }
  133. py::tuple GPUSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
  134. const std::vector<tensor::TensorPtr> &input_tensors) {
  135. auto kernel_graph = run_op_graphs_[graph_info];
  136. MS_EXCEPTION_IF_NULL(kernel_graph);
  137. RunOpAllocateMemory(input_tensors, kernel_graph.get());
  138. // Execute the computation
  139. LoadInputData(kernel_graph, input_tensors);
  140. Execute(kernel_graph);
  141. // Fetch outputs
  142. VectorRef outputs;
  143. UpdateOutputs(kernel_graph, &outputs, input_tensors);
  144. // Trans output to tuple
  145. auto output_tensors = TransformBaseRefListToTuple(outputs);
  146. if (!utils::isa<PyObjectRef>(output_tensors) ||
  147. !py::isinstance<py::tuple>(utils::cast<PyObjectRef>(output_tensors).object_)) {
  148. MS_EXCEPTION(NotSupportError) << "The output tensors should be a tuple !";
  149. }
  150. py::object tuple_obj = utils::cast<PyObjectRef>(output_tensors).object_;
  151. py::tuple tuple_tensors = py::cast<py::tuple>(tuple_obj);
  152. run_op_graphs_.clear();
  153. return tuple_tensors;
  154. }
  155. } // namespace gpu
  156. } // namespace session
  157. } // namespace mindspore