You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

gpu_stream_assign.cc 8.8 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "device/gpu/gpu_stream_assign.h"
  17. #include <set>
  18. #include <string>
  19. #include <memory>
  20. #include <algorithm>
  21. #include "device/gpu/gpu_common.h"
  22. #include "device/gpu/kernel_info_setter.h"
  23. #include "device/gpu/gpu_device_manager.h"
  24. namespace mindspore {
  25. namespace device {
  26. namespace gpu {
  27. void AssignGpuStream(const std::shared_ptr<session::KernelGraph> &kernel_graph) {
  28. MS_EXCEPTION_IF_NULL(kernel_graph);
  29. std::vector<CNodePtr> allreduce_kernels;
  30. auto execution_kernels = kernel_graph->execution_order();
  31. for (auto kernel_node : execution_kernels) {
  32. std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
  33. if (kernel_name == kAllReduceOpName) {
  34. allreduce_kernels.emplace_back(kernel_node);
  35. } else {
  36. DeviceStream compute_stream = GPUDeviceManager::GetInstance().default_stream();
  37. MS_EXCEPTION_IF_NULL(compute_stream);
  38. AnfAlgo::SetNodeAttr(kAttrStreamId, MakeValue(reinterpret_cast<uintptr_t>(compute_stream)), kernel_node);
  39. }
  40. }
  41. if (allreduce_kernels.size() > 1) {
  42. // Assign multiple streams only when there're multiple AllReduce nodes.
  43. std::vector<SendRecvPair> send_recv_pairs;
  44. if (FindAllReduceStreamSwitchPos(kernel_graph, &send_recv_pairs)) {
  45. DeviceStream comm_stream = nullptr;
  46. GPUDeviceManager::GetInstance().CreateStream(&comm_stream);
  47. std::transform(
  48. allreduce_kernels.begin(), allreduce_kernels.end(), allreduce_kernels.begin(), [&](CNodePtr allreduce_kernel) {
  49. AnfAlgo::SetNodeAttr(kAttrStreamId, MakeValue(reinterpret_cast<uintptr_t>(comm_stream)), allreduce_kernel);
  50. return allreduce_kernel;
  51. });
  52. InsertStreamSwitchNode(kernel_graph, send_recv_pairs);
  53. } else {
  54. return;
  55. }
  56. }
  57. }
// For every AllReduce kernel in the execution order, records two send/recv
// pairs into *send_recv_pairs:
//   pair1: (mock send = kernel immediately preceding the AllReduce, per
//          FindSendNodePos) -> (mock recv = the AllReduce), so the comm
//          stream waits until the AllReduce's inputs are ready;
//   pair2: (mock send = the AllReduce) -> (mock recv = first later kernel
//          that consumes the AllReduce's output, per FindRecvNodePos), so
//          the compute stream waits for the AllReduce to finish.
// The stored offsets are the execution-order positions where the generated
// Send/Recv nodes will later be inserted: a Send goes right AFTER its mock
// send node (hence the +1), a Recv goes right BEFORE its mock recv node.
// Returns false if any AllReduce's consumer cannot be found; a missing
// producer only logs a warning and skips that AllReduce.
bool FindAllReduceStreamSwitchPos(const std::shared_ptr<session::KernelGraph> &kernel_graph,
                                  std::vector<SendRecvPair> *send_recv_pairs) {
  auto execution_kernels = kernel_graph->execution_order();
  std::vector<CNodePtr>::iterator iter, iter_begin;
  iter = iter_begin = execution_kernels.begin();
  std::vector<CNodePtr>::iterator iter_end = execution_kernels.end();
  for (; iter != execution_kernels.end(); ++iter) {
    std::string kernel_name = AnfAlgo::GetCNodeName(*iter);
    if (kernel_name == kAllReduceOpName) {
      // Find AllReduce node's last input node.
      // Search range is [begin, AllReduce]; "not found" is signaled by the
      // past-the-range iterator iter + 1.
      std::vector<CNodePtr>::iterator mock_send_node_iter =
        FindSendNodePos(iter_begin, iter + 1, *iter, kAllReduceStreamSwitch);
      if (mock_send_node_iter == iter + 1) {
        MS_LOG(WARNING) << "Can't find send node place before AllReduce node.";
        continue;
      }
      SendRecvPair pair1 = {kAllReduceStreamSwitch, *mock_send_node_iter, *iter,
                            IntToSize(mock_send_node_iter - iter_begin + 1), IntToSize(iter - iter_begin)};
      send_recv_pairs->push_back(pair1);
      // Find node which uses AllReduce as input[0].
      std::vector<CNodePtr>::iterator mock_recv_node_iter =
        FindRecvNodePos(iter, iter_end, *iter, kAllReduceStreamSwitch);
      if (mock_recv_node_iter == iter_end) {
        // Unlike the send case, a missing consumer aborts the whole search.
        MS_LOG(WARNING) << "Can't find recv node place after AllReduce node.";
        return false;
      }
      SendRecvPair pair2 = {kAllReduceStreamSwitch, *iter, *mock_recv_node_iter, IntToSize(iter - iter_begin + 1),
                            IntToSize(mock_recv_node_iter - iter_begin)};
      send_recv_pairs->push_back(pair2);
    }
  }
  return true;
}
  91. std::vector<CNodePtr>::iterator FindSendNodePos(std::vector<CNodePtr>::iterator begin,
  92. std::vector<CNodePtr>::iterator end, const CNodePtr mock_recv_node,
  93. StreamSwitchType stream_switch_type) {
  94. MS_EXCEPTION_IF_NULL(mock_recv_node);
  95. if (stream_switch_type == kAllReduceStreamSwitch) {
  96. for (auto iter = begin; iter != end; iter++) {
  97. if (*(iter + 1) == mock_recv_node) {
  98. return iter;
  99. }
  100. }
  101. }
  102. return end;
  103. }
  104. std::vector<CNodePtr>::iterator FindRecvNodePos(std::vector<CNodePtr>::iterator begin,
  105. std::vector<CNodePtr>::iterator end, const CNodePtr mock_send_node,
  106. StreamSwitchType stream_switch_type) {
  107. MS_EXCEPTION_IF_NULL(mock_send_node);
  108. for (auto iter = begin; iter != end; iter++) {
  109. auto node = *iter;
  110. if (stream_switch_type == kAllReduceStreamSwitch) {
  111. for (auto input : node->inputs()) {
  112. if (mock_send_node == AnfAlgo::VisitKernel(input, 0).first) {
  113. return iter;
  114. }
  115. }
  116. }
  117. }
  118. return end;
  119. }
  120. void InsertStreamSwitchNode(const std::shared_ptr<session::KernelGraph> &kernel_graph,
  121. const std::vector<SendRecvPair> &send_recv_pairs) {
  122. std::set<StreamSwitchNode> ordered_stream_switch_nodes;
  123. for (SendRecvPair pair : send_recv_pairs) {
  124. StreamSwitchType stream_switch_type = pair.stream_switch_type;
  125. CNodePtr mock_send_node = pair.mock_send_node;
  126. CNodePtr mock_recv_node = pair.mock_recv_node;
  127. size_t send_node_offset = pair.send_node_offset;
  128. size_t recv_node_offset = pair.recv_node_offset;
  129. CNodePtr send_node = nullptr;
  130. CNodePtr recv_node = nullptr;
  131. // Step 1: generate Send and Recv CNodes.
  132. if (stream_switch_type == kAllReduceStreamSwitch) {
  133. if (!GenSendRecvCNodesForAllReduce(kernel_graph, mock_send_node, mock_recv_node, &send_node, &recv_node)) {
  134. MS_LOG(EXCEPTION) << "Generating CNodes for send and recv failed. Stream switch type: kAllReduceStreamSwitch";
  135. }
  136. }
  137. // Step 2: sort send and recv CNodes by offset.
  138. ordered_stream_switch_nodes.insert({send_node_offset, send_node});
  139. ordered_stream_switch_nodes.insert({recv_node_offset, recv_node});
  140. }
  141. // Step 3: insert stream switch CNodes into execution kernel list.
  142. auto execution_kernels = kernel_graph->execution_order();
  143. for (auto node = ordered_stream_switch_nodes.rbegin(); node != ordered_stream_switch_nodes.rend(); node++) {
  144. execution_kernels.insert(execution_kernels.begin() + node->offset, node->cnode);
  145. }
  146. kernel_graph->set_execution_order(execution_kernels);
  147. }
// Creates one Send and one Recv stream-switch CNode that synchronize two
// streams through a shared CUDA event:
//   - the Send node records `event` on the mock send node's stream;
//   - the Recv node waits on the same `event` from the mock recv node's
//     stream.
// Raw event/stream handles travel through node attributes as uintptr_t
// values. Both mock nodes must already carry kAttrStreamId (set in
// AssignGpuStream). Always returns true; failures abort via the
// MS_EXCEPTION/CHECK macros instead.
// NOTE(review): the cudaEvent_t created here is never destroyed in this
// file; presumably whoever consumes the attributes owns it — confirm to
// rule out an event leak.
bool GenSendRecvCNodesForAllReduce(const std::shared_ptr<session::KernelGraph> &kernel_graph,
                                   const CNodePtr &mock_send_node, const CNodePtr &mock_recv_node, CNodePtr *send_node,
                                   CNodePtr *recv_node) {
  *send_node = CreateStreamSwitchNode(kernel_graph, kSendOpName);
  MS_EXCEPTION_IF_NULL(*send_node);
  *recv_node = CreateStreamSwitchNode(kernel_graph, kRecvOpName);
  MS_EXCEPTION_IF_NULL(*recv_node);
  cudaEvent_t event = nullptr;
  // Timing is disabled: the event is used purely for ordering.
  CHECK_CUDA_RET_WITH_EXCEPT(cudaEventCreate(&event, cudaEventDisableTiming), "Creating cuda event failed.");
  AnfAlgo::SetNodeAttr(kAttrRecordEvent, MakeValue(reinterpret_cast<uintptr_t>(event)), *send_node);
  AnfAlgo::SetNodeAttr(kAttrWaitEvent, MakeValue(reinterpret_cast<uintptr_t>(event)), *recv_node);
  uintptr_t send_stream = AnfAlgo::GetNodeAttr<uintptr_t>(mock_send_node, kAttrStreamId);
  AnfAlgo::SetNodeAttr(kAttrRecordEventStream, MakeValue(send_stream), *send_node);
  uintptr_t recv_stream = AnfAlgo::GetNodeAttr<uintptr_t>(mock_recv_node, kAttrStreamId);
  AnfAlgo::SetNodeAttr(kAttrWaitEventStream, MakeValue(recv_stream), *recv_node);
  return true;
}
  165. CNodePtr CreateStreamSwitchNode(const std::shared_ptr<session::KernelGraph> &kernel_graph, const std::string &name) {
  166. auto op = std::make_shared<Primitive>(name);
  167. MS_EXCEPTION_IF_NULL(op);
  168. auto apply = std::make_shared<ValueNode>(op);
  169. MS_EXCEPTION_IF_NULL(apply);
  170. std::vector<AnfNodePtr> input_list = {apply};
  171. CNodePtr node = kernel_graph->NewCNode(input_list);
  172. MS_EXCEPTION_IF_NULL(node);
  173. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder;
  174. AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), node.get());
  175. auto abstract_none = std::make_shared<abstract::AbstractNone>();
  176. MS_EXCEPTION_IF_NULL(abstract_none);
  177. node->set_abstract(abstract_none);
  178. SetKernelInfo(node);
  179. return node;
  180. }
  181. } // namespace gpu
  182. } // namespace device
  183. } // namespace mindspore