You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

gpu_session.cc 6.6 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167
  1. /**
  2. * Copyright 2019 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "session/gpu_session.h"
  17. #include "device/gpu/kernel_info_setter.h"
  18. #include "device/gpu/gpu_kernel_build.h"
  19. #include "device/gpu/gpu_kernel_runtime.h"
  20. #include "pre_activate/common/optimizer.h"
  21. #include "pre_activate/common/pass_manager.h"
  22. #include "pre_activate/common/ir_fusion/allreduce_fusion.h"
  23. #include "device/kernel_runtime_manager.h"
  24. #include "predict/predict.h"
  25. #include "common/utils.h"
  26. #include "utils/context/ms_context.h"
  27. namespace mindspore {
  28. namespace session {
  29. namespace gpu {
  30. using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm;
  31. void GPUSession::SelectKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  32. MS_EXCEPTION_IF_NULL(kernel_graph);
  33. for (const auto &kernel_node : kernel_graph->execution_order()) {
  34. MS_EXCEPTION_IF_NULL(kernel_node);
  35. device::gpu::SetKernelInfo(kernel_node);
  36. }
  37. }
  38. void GPUSession::StartKernelRT() const {
  39. auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  40. MS_EXCEPTION_IF_NULL(runtime_instance);
  41. if (!runtime_instance->Init()) {
  42. MS_LOG(EXCEPTION) << "GPU start kernel runtime failed";
  43. }
  44. }
  45. void GPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
  46. auto optimizer = std::make_shared<opt::GraphOptimizer>();
  47. auto pm = std::make_shared<opt::PassManager>();
  48. pm->AddPass(std::make_shared<opt::AllReduceFusion>());
  49. optimizer->AddPassManager(pm);
  50. (void)optimizer->Optimize(kernel_graph);
  51. kernel_graph->SetExecOrderByDefault();
  52. }
  53. void GPUSession::BuildKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  54. device::gpu::GpuBuild(kernel_graph);
  55. }
  56. void GPUSession::AllocateMemory(KernelGraph *kernel_graph) const {
  57. MS_EXCEPTION_IF_NULL(kernel_graph);
  58. auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  59. MS_EXCEPTION_IF_NULL(runtime_instance);
  60. runtime_instance->AssignMemory(kernel_graph);
  61. }
  62. void GPUSession::RunOpAllocateMemory(const std::vector<tensor::TensorPtr> &input_tensors,
  63. KernelGraph *kernel_graph) const {
  64. MS_EXCEPTION_IF_NULL(kernel_graph);
  65. auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  66. MS_EXCEPTION_IF_NULL(runtime_instance);
  67. runtime_instance->RunOpAssignMemory(input_tensors, kernel_graph);
  68. }
  69. void GPUSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  70. auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  71. MS_EXCEPTION_IF_NULL(runtime_instance);
  72. if (!runtime_instance->Run(kernel_graph.get())) {
  73. MS_LOG(EXCEPTION) << "GPU execute graph failed!";
  74. }
  75. }
  76. GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) {
  77. // Construct graph, if construct successs, graph_sum_ + 1
  78. auto graph_id = graph_sum_;
  79. auto graph = ConstructKernelGraph(lst, outputs);
  80. // Select kernel build info
  81. SelectKernel(graph);
  82. // Convert kernel Graph to model
  83. predictmodel::StepConvertGraph(graph);
  84. // Start gpu kernel runtime
  85. StartKernelRT();
  86. // AllReduce Optimize
  87. Optimize(graph);
  88. // Build kernel if node is cnode
  89. BuildKernel(graph);
  90. // Set graph execution order before memory alloc, ensure that memory alloc is according to the reorder graph
  91. auto execution_order = graph->execution_order();
  92. Reorder(&execution_order);
  93. graph->set_execution_order(execution_order);
  94. // Alloc memeory, include static memory and dynamic memory
  95. AllocateMemory(graph.get());
  96. // Reset memory resource
  97. auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  98. MS_EXCEPTION_IF_NULL(runtime_instance);
  99. runtime_instance->FreeHostMemory();
  100. return graph_id;
  101. }
  102. void GPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) {
  103. auto &kernel_graph = graphs_[graph_id];
  104. // Load input data from user input
  105. LoadInputData(kernel_graph, inputs);
  106. MS_EXCEPTION_IF_NULL(kernel_graph);
  107. // Convert inputs to model
  108. predictmodel::StepConvertWeight(inputs);
  109. // Run graph on GPU
  110. Execute(kernel_graph);
  111. // Get result from GPU
  112. UpdateOutputs(kernel_graph, outputs, inputs);
  113. // Summary
  114. auto context_ptr = MsContext::GetInstance();
  115. MS_EXCEPTION_IF_NULL(context_ptr);
  116. if (context_ptr->enable_gpu_summary()) {
  117. Summary(kernel_graph.get());
  118. }
  119. }
  120. void GPUSession::BuildOp(const OpRunInfo &op_run_info, const GraphInfo &graph_info) {
  121. // Prepare the graph
  122. auto kernel_graph = ConstructSingleOpGraph(op_run_info);
  123. MS_EXCEPTION_IF_NULL(kernel_graph);
  124. SelectKernel(kernel_graph);
  125. StartKernelRT();
  126. BuildKernel(kernel_graph);
  127. run_op_graphs_[graph_info] = kernel_graph;
  128. }
// Execute a single op that was previously prepared by BuildOp for the same
// graph_info, and return its outputs as a Python tuple.
// Note: the graph cache is fully cleared at the end, so every RunOp must be
// preceded by a matching BuildOp.
py::tuple GPUSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &graph_info) {
// Look up the graph cached by BuildOp. NOTE(review): operator[] inserts a
// null entry if graph_info is absent; the following null check then raises.
auto kernel_graph = run_op_graphs_[graph_info];
MS_EXCEPTION_IF_NULL(kernel_graph);
// Extract the op's input tensors (and their const/parameter mask) from op_run_info.
std::vector<tensor::TensorPtr> input_tensors = {};
std::vector<bool> tensors_mask = {};
ToTensorPtr(op_run_info, &input_tensors, &tensors_mask);
// Device memory must be assigned before inputs can be loaded.
RunOpAllocateMemory(input_tensors, kernel_graph.get());
// Execute the computation
LoadInputData(kernel_graph, input_tensors);
Execute(kernel_graph);
// Fetch outputs
VectorRef outputs;
UpdateOutputs(kernel_graph, &outputs, input_tensors);
// Trans output to tuple
auto output_tensors = TransformBaseRefListToTuple(outputs);
// The transformed result must be a Python tuple wrapped in a PyObjectRef;
// anything else is unsupported.
if (!utils::isa<PyObjectRef>(output_tensors) ||
!py::isinstance<py::tuple>(utils::cast<PyObjectRef>(output_tensors).object_)) {
MS_EXCEPTION(NotSupportError) << "The output tensors should be a tuple !";
}
py::object tuple_obj = utils::cast<PyObjectRef>(output_tensors).object_;
py::tuple tuple_tensors = py::cast<py::tuple>(tuple_obj);
// Drop all cached single-op graphs now that this op has run.
run_op_graphs_.clear();
return tuple_tensors;
}
  153. } // namespace gpu
  154. } // namespace session
  155. } // namespace mindspore