You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

gpu_session.cc 9.1 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232
  1. /**
  2. * Copyright 2019-2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "session/gpu_session.h"
  17. #include "device/gpu/kernel_info_setter.h"
  18. #include "device/gpu/gpu_kernel_build.h"
  19. #include "device/gpu/gpu_kernel_runtime.h"
  20. #include "device/gpu/gpu_stream_assign.h"
  21. #include "pre_activate/common/optimizer.h"
  22. #include "pre_activate/common/pass_manager.h"
  23. #include "pre_activate/common/helper.h"
  24. #include "pre_activate/pass/communication_op_fusion.h"
  25. #include "pre_activate/pass/getitem_tuple.h"
  26. #include "device/kernel_runtime_manager.h"
  27. #include "predict/predict.h"
  28. #include "common/utils.h"
  29. #include "common/trans.h"
  30. #include "utils/context/ms_context.h"
  31. #include "utils/base_ref_extends.h"
  32. namespace mindspore {
  33. namespace session {
  34. namespace gpu {
  35. using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm;
  36. void GPUSession::SelectKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  37. MS_EXCEPTION_IF_NULL(kernel_graph);
  38. for (const auto &kernel_node : kernel_graph->execution_order()) {
  39. MS_EXCEPTION_IF_NULL(kernel_node);
  40. device::gpu::SetKernelInfo(kernel_node);
  41. }
  42. }
  43. void GPUSession::StartKernelRT() const {
  44. auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  45. MS_EXCEPTION_IF_NULL(runtime_instance);
  46. if (!runtime_instance->Init()) {
  47. MS_LOG(EXCEPTION) << "GPU start kernel runtime failed";
  48. }
  49. }
  50. void GPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
  51. auto optimizer = std::make_shared<opt::GraphOptimizer>();
  52. auto pm = std::make_shared<opt::PassManager>();
  53. pm->AddPass(std::make_shared<opt::AllReduceFusion>());
  54. pm->AddPass(std::make_shared<opt::GetitemTuple>());
  55. optimizer->AddPassManager(pm);
  56. (void)optimizer->Optimize(kernel_graph);
  57. kernel_graph->SetExecOrderByDefault();
  58. }
  59. void GPUSession::AssignStream(const std::shared_ptr<KernelGraph> &kernel_graph) {
  60. MS_EXCEPTION_IF_NULL(kernel_graph);
  61. device::gpu::AssignGpuStream(kernel_graph);
  62. }
  63. void GPUSession::BuildKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  64. device::gpu::GpuBuild(kernel_graph);
  65. }
  66. void GPUSession::AllocateMemory(KernelGraph *kernel_graph) const {
  67. MS_EXCEPTION_IF_NULL(kernel_graph);
  68. auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  69. MS_EXCEPTION_IF_NULL(runtime_instance);
  70. // opt::RemoveNopNode(kernel_graph);
  71. runtime_instance->AssignMemory(kernel_graph);
  72. }
  73. void GPUSession::RunOpAllocateMemory(const std::vector<tensor::TensorPtr> &input_tensors,
  74. KernelGraph *kernel_graph) const {
  75. MS_EXCEPTION_IF_NULL(kernel_graph);
  76. auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  77. MS_EXCEPTION_IF_NULL(runtime_instance);
  78. // opt::RemoveNopNode(kernel_graph);
  79. runtime_instance->RunOpAssignMemory(input_tensors, kernel_graph);
  80. }
// Copy user-supplied input tensors into the device memory bound to the
// graph's parameter nodes. A host-to-device sync happens only when the
// tensor is dirty (non-pynative) or its device address does not match the
// parameter's (pynative); a clean tensor that already lives on device gets
// its address rebound to the parameter instead of being copied.
void GPUSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
                               const std::vector<tensor::TensorPtr> &inputs_const) const {
  std::vector<tensor::TensorPtr> inputs(inputs_const);
  MS_EXCEPTION_IF_NULL(kernel_graph);
  auto input_nodes = kernel_graph->inputs();
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  // NOTE(review): input_nodes is indexed by the tensor index — assumes
  // inputs.size() <= input_nodes.size(); confirm with callers.
  for (size_t i = 0; i < inputs.size(); ++i) {
    auto tensor = inputs[i];
    MS_EXCEPTION_IF_NULL(tensor);
    auto input_node = input_nodes[i];
    MS_EXCEPTION_IF_NULL(input_node);
    // Only parameters that already have an output device address are loaded.
    if (input_node->isa<Parameter>() && AnfAlgo::OutputAddrExist(input_node, 0)) {
      auto pk_node = input_node->cast<ParameterPtr>();
      auto device_address = AnfAlgo::GetMutableOutputAddr(pk_node, 0);
      bool need_sync = false;
      if (ms_context->enable_pynative_infer()) {
        // PyNative inference: sync whenever the tensor has no device address
        // or its address differs from the parameter's.
        if (tensor->device_address().get() == nullptr || tensor->device_address() != device_address) {
          need_sync = true;
        }
      } else {
        if (tensor->is_dirty()) {
          // Host data changed since the last sync; must copy.
          need_sync = true;
        } else if (tensor->device_address() != device_address) {
          // Clean tensor already on device: rebind the parameter's output
          // address to the tensor's address instead of copying.
          AnfAlgo::SetOutputAddr(tensor->device_address(), 0, pk_node.get());
          need_sync = false;
        }
      }
      if (need_sync) {
        // Point the tensor at the parameter's device memory, then copy the
        // host payload over (padded shape comes from the runtime).
        tensor->set_device_address(device_address);
        MS_EXCEPTION_IF_NULL(device_address);
        if (!device_address->SyncHostToDevice(trans::GetRuntimePaddingShape(pk_node, 0),
                                              LongToSize(tensor->data().nbytes()), tensor->data_type(),
                                              tensor->data_c(false))) {
          MS_LOG(EXCEPTION) << "SyncHostToDevice failed.";
        }
      }
    }
    // Mark every input clean once processed, whether or not it was synced.
    tensor->set_dirty(false);
  }
}
  122. void GPUSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  123. auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  124. MS_EXCEPTION_IF_NULL(runtime_instance);
  125. if (!runtime_instance->Run(kernel_graph.get())) {
  126. MS_LOG(EXCEPTION) << "GPU execute graph failed!";
  127. }
  128. }
// Compile an ANF node list into an executable GPU kernel graph: select
// kernels, convert to the predict model, init the runtime, optimize, assign
// streams, build kernels, reorder, and allocate memory. Returns the id
// assigned to the newly constructed graph.
GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) {
  // Construct graph, if successfully, graph_sum_ + 1
  auto graph_id = graph_sum_;
  auto graph = ConstructKernelGraph(lst, outputs);
  // Select kernel build info
  SelectKernel(graph);
  // Convert kernel Graph to model
  predictmodel::StepConvertGraph(graph);
  // Start gpu kernel runtime
  StartKernelRT();
  // AllReduce Optimize
  Optimize(graph);
  // Assign CUDA streams
  AssignStream(graph);
  // Remove NoOp from execution graph
  // opt::HideNopNode(graph.get());
  // Build kernel if node is cnode
  BuildKernel(graph);
  // Set graph execution order before memory alloc, ensure that memory alloc is according to the reorder graph
  auto execution_order = graph->execution_order();
  Reorder(&execution_order);
  graph->set_execution_order(execution_order);
  // Alloc memory, including static memory and dynamic memory
  AllocateMemory(graph.get());
  MS_EXCEPTION_IF_NULL(context_);
  // Attach a FuncGraph manager so later transformations can track node users.
  FuncGraphManagerPtr manager = MakeManager({graph});
  // NOTE(review): AddManager is invoked before the null check below — confirm
  // it tolerates a null manager.
  context_->AddManager(manager);
  if (manager) {
    manager->AddFuncGraph(graph);
    graph->set_manager(manager);
  }
  return graph_id;
}
  162. void GPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) {
  163. auto &kernel_graph = graphs_[graph_id];
  164. // Load input data from user input
  165. LoadInputData(kernel_graph, inputs);
  166. MS_EXCEPTION_IF_NULL(kernel_graph);
  167. // Convert inputs to model
  168. predictmodel::StepConvertWeight(inputs);
  169. {
  170. py::gil_scoped_release gil_release;
  171. // Run graph on GPU
  172. Execute(kernel_graph);
  173. }
  174. // Get result from GPU
  175. UpdateOutputs(kernel_graph, outputs, inputs);
  176. // Summary
  177. auto context_ptr = MsContext::GetInstance();
  178. MS_EXCEPTION_IF_NULL(context_ptr);
  179. if (context_ptr->enable_gpu_summary()) {
  180. Summary(kernel_graph.get());
  181. }
  182. }
  183. void GPUSession::BuildOp(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
  184. const std::vector<tensor::TensorPtr> &input_tensors, const std::vector<int> &tensors_mask) {
  185. // Prepare the graph
  186. auto kernel_graph = ConstructSingleOpGraph(op_run_info, input_tensors, tensors_mask);
  187. MS_EXCEPTION_IF_NULL(kernel_graph);
  188. SelectKernel(kernel_graph);
  189. StartKernelRT();
  190. BuildKernel(kernel_graph);
  191. run_op_graphs_[graph_info] = kernel_graph;
  192. }
// Execute a cached single-op graph (built by BuildOp) in PyNative mode and
// return its outputs as a Python tuple.
// NOTE(review): op_run_info is unused here — the graph is looked up purely
// by graph_info; confirm this is intended.
py::tuple GPUSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
                            const std::vector<tensor::TensorPtr> &input_tensors) {
  auto kernel_graph = run_op_graphs_[graph_info];
  MS_EXCEPTION_IF_NULL(kernel_graph);
  RunOpAllocateMemory(input_tensors, kernel_graph.get());
  // Execute the computation
  LoadInputData(kernel_graph, input_tensors);
  Execute(kernel_graph);
  // Fetch outputs
  VectorRef outputs;
  UpdateOutputs(kernel_graph, &outputs, input_tensors);
  // Trans output to tuple
  auto output_tensors = TransformBaseRefListToTuple(outputs);
  if (!utils::isa<PyObjectRef>(output_tensors) ||
      !py::isinstance<py::tuple>(utils::cast<PyObjectRef>(output_tensors).object_)) {
    MS_EXCEPTION(NotSupportError) << "The output tensors should be a tuple !";
  }
  py::object tuple_obj = utils::cast<PyObjectRef>(output_tensors).object_;
  py::tuple tuple_tensors = py::cast<py::tuple>(tuple_obj);
  // NOTE(review): this clears EVERY cached op graph, not just graph_info's —
  // confirm the whole-cache flush is intended.
  run_op_graphs_.clear();
  return tuple_tensors;
}
  215. } // namespace gpu
  216. } // namespace session
  217. } // namespace mindspore