You cannot select more than 25 topics. A topic must start with a Chinese character, a letter, or a number; it may include dashes ('-') and can be up to 35 characters long.

gpu_session.cc 18 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415
  1. /**
  2. * Copyright 2019-2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "backend/session/gpu_session.h"
  17. #include "backend/optimizer/common/helper.h"
  18. #include "backend/optimizer/common/optimizer.h"
  19. #include "backend/optimizer/common/pass_manager.h"
  20. #include "backend/optimizer/gpu/adam_weight_decay_fusion.h"
  21. #include "backend/optimizer/gpu/adam_fusion.h"
  22. #include "backend/optimizer/gpu/apply_momentum_weight_scale_fusion.h"
  23. #include "backend/optimizer/gpu/apply_momentum_scale_fusion.h"
  24. #include "backend/optimizer/gpu/batch_norm_relu_fusion.h"
  25. #include "backend/optimizer/gpu/batch_norm_relu_grad_fusion.h"
  26. #include "backend/optimizer/gpu/batch_norm_add_relu_fusion.h"
  27. #include "backend/optimizer/gpu/batch_norm_add_relu_grad_fusion.h"
  28. #include "backend/optimizer/gpu/cudnn_inplace_fusion.h"
  29. #include "backend/optimizer/gpu/insert_format_transform_op.h"
  30. #include "backend/optimizer/gpu/replace_momentum_cast_fusion.h"
  31. #include "backend/optimizer/gpu/replace_addn_fusion.h"
  32. #include "backend/optimizer/gpu/remove_format_transform_pair.h"
  33. #include "backend/optimizer/gpu/remove_redundant_format_transform.h"
  34. #include "backend/optimizer/gpu/reduce_precision_fusion.h"
  35. #include "backend/optimizer/graph_kernel/arithmetic_simplify.h"
  36. #include "backend/optimizer/graph_kernel/basic_ops_fusion.h"
  37. #include "backend/optimizer/graph_kernel/composite_ops_fusion.h"
  38. #include "backend/optimizer/graph_kernel/graph_kernel_splitter.h"
  39. #include "backend/optimizer/graph_kernel/graph_kernel_expander.h"
  40. #include "backend/optimizer/graph_kernel/graph_kernel_cse.h"
  41. #include "backend/optimizer/graph_kernel/value_graph_binder.h"
  42. #include "backend/optimizer/pass/communication_op_fusion.h"
  43. #include "backend/optimizer/pass/getitem_tuple.h"
  44. #include "common/trans.h"
  45. #include "debug/data_dump/e2e_dump_util.h"
  46. #include "debug/tensor_load.h"
  47. #include "debug/dump_proto.h"
  48. #include "runtime/device/gpu/gpu_kernel_build.h"
  49. #include "runtime/device/gpu/gpu_kernel_runtime.h"
  50. #include "runtime/device/gpu/gpu_stream_assign.h"
  51. #include "runtime/device/gpu/kernel_info_setter.h"
  52. #include "runtime/device/kernel_runtime_manager.h"
  53. #include "utils/ms_utils.h"
  54. #include "utils/config_manager.h"
  55. #include "utils/ms_context.h"
  56. namespace mindspore {
  57. namespace session {
  58. namespace gpu {
  59. using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm;
  60. void GPUSession::SelectKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  61. MS_EXCEPTION_IF_NULL(kernel_graph);
  62. device::gpu::FormatTransformChecker::GetInstance().CheckSupportFormatTransform(kernel_graph);
  63. for (const auto &kernel_node : kernel_graph->execution_order()) {
  64. MS_EXCEPTION_IF_NULL(kernel_node);
  65. device::gpu::SetKernelInfo(kernel_node);
  66. }
  67. }
  68. void GPUSession::StartKernelRT() const {
  69. auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  70. MS_EXCEPTION_IF_NULL(runtime_instance);
  71. if (!runtime_instance->Init()) {
  72. MS_LOG(EXCEPTION) << "GPU start kernel runtime failed";
  73. }
  74. }
  75. void GPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
  76. MS_EXCEPTION_IF_NULL(kernel_graph);
  77. auto context_ptr = MsContext::GetInstance();
  78. MS_EXCEPTION_IF_NULL(context_ptr);
  79. auto optimizer = std::make_shared<opt::GraphOptimizer>();
  80. auto pm = std::make_shared<opt::PassManager>();
  81. pm->AddPass(std::make_shared<opt::AdamWeightDecayFusion>());
  82. pm->AddPass(std::make_shared<opt::AdamFusion>());
  83. pm->AddPass(std::make_shared<opt::ReplaceMomentumCastFusion>());
  84. pm->AddPass(std::make_shared<opt::ReplaceAddNFusion>());
  85. optimizer->AddPassManager(pm);
  86. (void)optimizer->Optimize(kernel_graph);
  87. kernel_graph->SetExecOrderByDefault();
  88. }
  89. void GPUSession::HardwareOptimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
  90. auto optimizer = std::make_shared<opt::GraphOptimizer>();
  91. auto pm = std::make_shared<opt::PassManager>();
  92. pm->AddPass(std::make_shared<opt::BatchNormReluFusion>());
  93. pm->AddPass(std::make_shared<opt::BatchNormReluGradFusion>());
  94. pm->AddPass(std::make_shared<opt::BatchNormAddReluFusion>());
  95. pm->AddPass(std::make_shared<opt::InsertFormatTransformOp>());
  96. pm->AddPass(std::make_shared<opt::RemoveFormatTransformPair>());
  97. pm->AddPass(std::make_shared<opt::RemoveRedundantFormatTransform>());
  98. pm->AddPass(std::make_shared<opt::AllReduceFusion>());
  99. pm->AddPass(std::make_shared<opt::GetitemTuple>());
  100. pm->AddPass(std::make_shared<opt::ReducePrecisionFusion>("reduce_precision"));
  101. optimizer->AddPassManager(pm);
  102. (void)optimizer->Optimize(kernel_graph);
  103. kernel_graph->SetExecOrderByDefault();
  104. }
  105. void GPUSession::GraphKernelOptimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
  106. auto context_ptr = MsContext::GetInstance();
  107. MS_EXCEPTION_IF_NULL(context_ptr);
  108. if (!(context_ptr->get_param<bool>(MS_CTX_ENABLE_GRAPH_KERNEL))) {
  109. return;
  110. }
  111. auto optimizer = std::make_shared<opt::GraphOptimizer>();
  112. auto pm = std::make_shared<opt::PassManager>("graph_kernel_pm");
  113. pm->AddPass(std::make_shared<opt::GraphKernelExpander>());
  114. pm->AddPass(std::make_shared<opt::BasicOpsFusion>());
  115. pm->AddPass(std::make_shared<opt::CompositeOpsFusion>());
  116. pm->AddPass(std::make_shared<opt::GraphKernelCSE>());
  117. pm->AddPass(std::make_shared<opt::ArithmeticSimplify>());
  118. pm->AddPass(std::make_shared<opt::GraphKernelCSE>());
  119. pm->AddPass(std::make_shared<opt::GraphKernelSplitter>());
  120. // After Simplify and Splitter, a lot of redundant getitem/maketuple
  121. // will be exposed, use GetitemTuple Pass to delete them.
  122. pm->AddPass(std::make_shared<opt::GetitemTuple>());
  123. pm->AddPass(std::make_shared<opt::BindValueToGraph>());
  124. optimizer->AddPassManager(pm);
  125. (void)optimizer->Optimize(kernel_graph);
  126. kernel_graph->SetExecOrderByDefault();
  127. }
// Assign CUDA streams to the graph's kernels (delegates to the GPU runtime's
// stream-assignment logic).
void GPUSession::AssignStream(const std::shared_ptr<KernelGraph> &kernel_graph) {
  MS_EXCEPTION_IF_NULL(kernel_graph);
  device::gpu::AssignGpuStream(kernel_graph);
}
  132. void GPUSession::BuildKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  133. device::gpu::GpuBuild(kernel_graph);
  134. }
  135. void GPUSession::AllocateMemory(KernelGraph *kernel_graph) const {
  136. MS_EXCEPTION_IF_NULL(kernel_graph);
  137. auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  138. MS_EXCEPTION_IF_NULL(runtime_instance);
  139. runtime_instance->AssignMemory(kernel_graph);
  140. }
  141. void GPUSession::RunOpAllocateMemory(const ValuePtr &pre_output_value,
  142. const std::vector<tensor::TensorPtr> &input_tensors,
  143. KernelGraph *kernel_graph) const {
  144. MS_EXCEPTION_IF_NULL(kernel_graph);
  145. auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  146. MS_EXCEPTION_IF_NULL(runtime_instance);
  147. runtime_instance->RunOpAssignMemory(pre_output_value, input_tensors, kernel_graph);
  148. }
  149. void GPUSession::RunOpClearMemory(KernelGraph *kernel_graph) const {
  150. MS_EXCEPTION_IF_NULL(kernel_graph);
  151. auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  152. MS_EXCEPTION_IF_NULL(runtime_instance);
  153. runtime_instance->RunOpClearMemory(kernel_graph);
  154. }
// Copy host input tensors into the device addresses of the graph's parameters.
// inputs_const[i] corresponds to the i-th flattened graph input node; raises
// if the counts disagree. Host-to-device copies are skipped when the tensor
// already owns a compatible device address.
void GPUSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
                               const std::vector<tensor::TensorPtr> &inputs_const) const {
  std::vector<tensor::TensorPtr> inputs(inputs_const);
  MS_EXCEPTION_IF_NULL(kernel_graph);
  // Flatten the graph inputs: one input node may expand to several outputs.
  std::vector<AnfNodePtr> input_nodes;
  for (const auto &input_node : kernel_graph->inputs()) {
    auto params = AnfAlgo::GetAllOutput(input_node);
    std::copy(params.begin(), params.end(), std::back_inserter(input_nodes));
  }
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  if (inputs.size() != input_nodes.size()) {
    MS_LOG(EXCEPTION) << "Tensor input:" << inputs.size() << " is not equal graph inputs:" << input_nodes.size();
  }
  for (size_t i = 0; i < inputs.size(); ++i) {
    auto tensor = inputs[i];
    MS_EXCEPTION_IF_NULL(tensor);
    auto input_node = input_nodes[i];
    MS_EXCEPTION_IF_NULL(input_node);
    // Only parameters that already have a device address can receive data.
    if (input_node->isa<Parameter>() && AnfAlgo::OutputAddrExist(input_node, 0)) {
      auto pk_node = input_node->cast<ParameterPtr>();
      auto device_address = AnfAlgo::GetMutableOutputAddr(pk_node, 0);
      auto tensor_address = std::dynamic_pointer_cast<device::DeviceAddress>(tensor->device_address());
      bool need_sync = false;
      if (ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER)) {
        // PyNative inference: sync unless the tensor already lives at exactly
        // this parameter's device address.
        if (tensor_address == nullptr || tensor_address != device_address) {
          need_sync = true;
        }
      } else if (tensor->NeedSyncHostToDevice() || tensor_address == nullptr) {
        need_sync = true;
      } else if (tensor_address != device_address) {
        if (tensor_address->DeviceType() == device_address->DeviceType()) {
          // Same device type: rebind the parameter to the tensor's existing
          // device memory instead of copying.
          AnfAlgo::SetOutputAddr(tensor_address, 0, pk_node.get());
        } else {
          need_sync = true;
        }
      }
      if (need_sync) {
        if (AnfAlgo::IsParameterWeight(input_node->cast<ParameterPtr>())) {
          // Weights adopt the graph's device address so later iterations can
          // skip the host copy.
          tensor->set_device_address(device_address);
        }
        MS_EXCEPTION_IF_NULL(device_address);
        if (!device_address->SyncHostToDevice(trans::GetRuntimePaddingShape(pk_node, 0),
                                              LongToSize(tensor->data().nbytes()), tensor->data_type(),
                                              tensor->data_c())) {
          MS_LOG(EXCEPTION) << "SyncHostToDevice failed.";
        }
      }
    }
    // Mark the tensor as synced regardless of whether a copy happened.
    tensor->set_sync_status(kNoNeedSync);
  }
}
  207. void GPUSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  208. auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  209. MS_EXCEPTION_IF_NULL(runtime_instance);
  210. if (!runtime_instance->Run(kernel_graph.get(), false)) {
  211. MS_LOG(EXCEPTION) << "GPU execute graph failed!";
  212. }
  213. }
// Compile a kernel graph for the GPU backend: construct, optimize, select
// kernels, build, reorder, and allocate memory. Returns the new graph id.
GraphId GPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) {
  // Construct graph, if successfully, graph_sum_ + 1
  auto graph_id = graph_sum_;
  auto graph = ConstructKernelGraph(lst, outputs);
  MS_EXCEPTION_IF_NULL(graph);
  // Prepare ms context info for dump .pb graph
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
  // Dump .pb graph before graph optimization
  if (save_graphs) {
    DumpIRProto(graph, "before_opt_" + std::to_string(graph_id));
  }
  // Graph optimization irrelevant to device data format
  Optimize(graph);
  // Select kernel build info
  SelectKernel(graph);
  // Graph optimization relevant to device data format
  HardwareOptimize(graph);
  // Graph kernel fusion optimization
  GraphKernelOptimize(graph);
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
  // Assign parameter keys.
  AssignParamKey(graph);
#endif
  // Start gpu kernel runtime
  StartKernelRT();
  // Assign CUDA streams
  AssignStream(graph);
  // Hide NopOp from execution graph
  opt::HideNopNode(graph.get());
  // Build kernel if node is cnode
  BuildKernel(graph);
  // Set graph execution order before memory alloc, ensure that memory alloc is according to the reorder graph
  auto execution_order = graph->execution_order();
  Reorder(&execution_order);
  graph->set_execution_order(execution_order);
  // Get summary nodes.
  SetSummaryNodes(graph.get());
  // Remove NopOp from execution graph
  opt::RemoveNopNode(graph.get());
  // Dump .pb graph after graph optimization
  if (save_graphs) {
    DumpIRProto(graph, "after_opt_" + std::to_string(graph_id));
  }
  // Set graph manager.
  MS_EXCEPTION_IF_NULL(context_);
  FuncGraphManagerPtr manager = MakeManager({graph});
  // NOTE(review): AddManager is called before the `if (manager)` null test
  // below — confirm AddManager tolerates a null manager, or hoist the check.
  context_->AddManager(manager);
  if (manager) {
    manager->AddFuncGraph(graph);
    graph->set_manager(manager);
  }
  // Alloc memory, including static memory and dynamic memory
  AllocateMemory(graph.get());
  return graph_id;
}
  271. void GPUSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs,
  272. VectorRef *outputs) {
  273. auto &kernel_graph = graphs_[graph_id];
  274. // Load input data from user input
  275. LoadInputData(kernel_graph, inputs);
  276. PreIterationDbg(kernel_graph);
  277. #if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
  278. // Initialize parameter server
  279. InitPSParamAndOptim(kernel_graph, inputs);
  280. #endif
  281. MS_EXCEPTION_IF_NULL(kernel_graph);
  282. Execute(kernel_graph);
  283. PostLoadTensor(kernel_graph);
  284. // Summary
  285. auto context_ptr = MsContext::GetInstance();
  286. MS_EXCEPTION_IF_NULL(context_ptr);
  287. if (context_ptr->get_param<bool>(MS_CTX_ENABLE_GPU_SUMMARY)) {
  288. Summary(kernel_graph.get());
  289. }
  290. PostIterationDbg(kernel_graph);
  291. }
  292. void GPUSession::BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
  293. const std::vector<tensor::TensorPtr> &input_tensors,
  294. const std::vector<int> &tensors_mask) {
  295. // Check if the graph cache exists.
  296. if (run_op_graphs_.find(graph_info) != run_op_graphs_.end()) {
  297. return;
  298. }
  299. // Prepare the graph
  300. auto kernel_graph = ConstructSingleOpGraph(op_run_info, input_tensors, tensors_mask);
  301. MS_EXCEPTION_IF_NULL(kernel_graph);
  302. SelectKernel(kernel_graph);
  303. StartKernelRT();
  304. // Hide NopOp from execution graph
  305. opt::HideNopNode(kernel_graph.get());
  306. BuildKernel(kernel_graph);
  307. run_op_graphs_[graph_info] = kernel_graph;
  308. }
  309. void GPUSession::RunOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
  310. const std::vector<tensor::TensorPtr> &input_tensors, VectorRef *outputs) {
  311. auto kernel_graph = run_op_graphs_[graph_info];
  312. MS_EXCEPTION_IF_NULL(kernel_graph);
  313. // Remove NopOp from execution graph
  314. opt::RemoveNopNode(kernel_graph.get());
  315. RunOpAllocateMemory(op_run_info.value, input_tensors, kernel_graph.get());
  316. // Execute the computation
  317. LoadInputData(kernel_graph, input_tensors);
  318. Execute(kernel_graph);
  319. // Fetch outputs
  320. UpdateOutputs(kernel_graph, outputs, input_tensors);
  321. RunOpClearMemory(kernel_graph.get());
  322. }
  323. void GPUSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  324. if (debugger_->DebuggerBackendEnabled()) {
  325. MS_EXCEPTION_IF_NULL(kernel_graph);
  326. E2eDumpUtil::DumpData(kernel_graph.get(), device_id_, debugger_.get());
  327. } else {
  328. DumpJsonParser::GetInstance().UpdateDumpIter();
  329. }
  330. }
  331. bool GPUSession::DumpDataEnabledIteration() const {
  332. auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  333. MS_EXCEPTION_IF_NULL(runtime_instance);
  334. return runtime_instance->DumpDataEnabledIteration();
  335. }
  336. void GPUSession::PreIterationDbg(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  337. if (debugger_) {
  338. debugger_->PreExecute(kernel_graph);
  339. }
  340. PreLoadTensor(kernel_graph);
  341. }
  342. void GPUSession::PostIterationDbg(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  343. bool dump_enabled = DumpDataEnabledIteration();
  344. // debug used for dump
  345. if (debugger_ && dump_enabled) {
  346. Dump(kernel_graph);
  347. } else {
  348. DumpJsonParser::GetInstance().UpdateDumpIter();
  349. }
  350. if (debugger_) {
  351. debugger_->PostExecute();
  352. }
  353. }
  354. void GPUSession::PreLoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  355. // check the dump_enabled and dataset_sink_mode
  356. bool dump_enabled = DumpDataEnabledIteration();
  357. auto context_ptr = MsContext::GetInstance();
  358. MS_EXCEPTION_IF_NULL(context_ptr);
  359. if (dump_enabled && ConfigManager::GetInstance().dataset_mode() == DS_SINK_MODE) {
  360. MS_EXCEPTION(NotSupportError) << "Don't support set dataset_sink_mode to True when using e2e_dump";
  361. }
  362. if (!(debugger_ && (debugger_->debugger_enabled() || dump_enabled))) {
  363. return;
  364. }
  365. MS_EXCEPTION_IF_NULL(kernel_graph);
  366. auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  367. MS_EXCEPTION_IF_NULL(runtime_instance);
  368. DebugServices *debug_services = debugger_->debug_services();
  369. TensorLoader *tensor_loader = debug_services->tensor_loader();
  370. tensor_loader->EmptyTensor();
  371. uint32_t iter_num = tensor_loader->GetIterNum();
  372. tensor_loader->set_iter_num(++iter_num);
  373. }
  374. void GPUSession::PostLoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  375. bool dump_enabled = DumpDataEnabledIteration();
  376. if (!(debugger_ && (debugger_->debugger_enabled() || dump_enabled))) {
  377. return;
  378. }
  379. MS_EXCEPTION_IF_NULL(kernel_graph);
  380. auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  381. MS_EXCEPTION_IF_NULL(runtime_instance);
  382. DebugServices *debug_services = debugger_->debug_services();
  383. TensorLoader *tensor_loader = debug_services->tensor_loader();
  384. tensor_loader->EmptyPrevTensor();
  385. }
  386. } // namespace gpu
  387. } // namespace session
  388. } // namespace mindspore