You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

kernel_adjust.cc 43 kB

5 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "runtime/device/kernel_adjust.h"
  17. #include <map>
  18. #include <algorithm>
  19. #include <string>
  20. #include <vector>
  21. #include <utility>
  22. #include "backend/session/anf_runtime_algorithm.h"
  23. #include "utils/ms_context.h"
  24. #include "common/trans.h"
  25. #include "utils/config_manager.h"
  26. #include "utils/ms_utils.h"
  27. #include "backend/kernel_compiler/kernel_build_info.h"
  28. #include "utils/utils.h"
  29. #include "runtime/device/ascend/profiling/profiling_manager.h"
  30. #include "runtime/base.h"
  31. #include "runtime/device/ascend/ascend_stream_assign.h"
  32. #include "utils/shape_utils.h"
  namespace {
  // File-local string constant; its consumers are not visible in this part of the file.
  constexpr const char *kProfilingGraphId = "PROFILING_GRAPH_ID";
  }  // namespace
  36. namespace mindspore {
  37. namespace device {
  38. using device::ascend::ProfilingUtils;
  39. void KernelAdjust::ReorderGetNext(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) {
  40. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  41. const std::vector<CNodePtr> &origin_cnode_list = kernel_graph_ptr->execution_order();
  42. std::vector<CNodePtr> getnext_list;
  43. std::vector<CNodePtr> other_list;
  44. for (const auto &cnode : origin_cnode_list) {
  45. if (AnfAlgo::GetCNodeName(cnode) == kGetNextOpName) {
  46. getnext_list.emplace_back(cnode);
  47. } else {
  48. other_list.emplace_back(cnode);
  49. }
  50. }
  51. std::vector<CNodePtr> new_order_list;
  52. new_order_list.insert(new_order_list.end(), getnext_list.begin(), getnext_list.end());
  53. new_order_list.insert(new_order_list.end(), other_list.begin(), other_list.end());
  54. kernel_graph_ptr->set_execution_order(new_order_list);
  55. }
  56. bool KernelAdjust::NeedInsertSwitch() {
  57. auto context_ptr = MsContext::GetInstance();
  58. MS_EXCEPTION_IF_NULL(context_ptr);
  59. return (context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK) &&
  60. context_ptr->get_param<bool>(MS_CTX_ENABLE_LOOP_SINK) && ConfigManager::GetInstance().iter_num() > 1);
  61. }
  62. CNodePtr KernelAdjust::CreateSendApplyKernel(const std::shared_ptr<session::KernelGraph> &graph_ptr,
  63. uint32_t event_id) {
  64. MS_EXCEPTION_IF_NULL(graph_ptr);
  65. auto send_op = std::make_shared<Primitive>(kSendOpName);
  66. MS_EXCEPTION_IF_NULL(send_op);
  67. auto send_apply = std::make_shared<ValueNode>(send_op);
  68. MS_EXCEPTION_IF_NULL(send_apply);
  69. std::vector<AnfNodePtr> send_input_list = {send_apply};
  70. CNodePtr send_node_ptr = graph_ptr->NewCNode(send_input_list);
  71. MS_EXCEPTION_IF_NULL(send_node_ptr);
  72. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder;
  73. selected_kernel_builder.SetKernelType(KernelType::RT_KERNEL);
  74. AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), send_node_ptr.get());
  75. AnfAlgo::SetNodeAttr(kAttrEventId, MakeValue(event_id), send_node_ptr);
  76. auto abstract_none = std::make_shared<abstract::AbstractNone>();
  77. MS_EXCEPTION_IF_NULL(abstract_none);
  78. send_node_ptr->set_abstract(abstract_none);
  79. return send_node_ptr;
  80. }
  81. CNodePtr KernelAdjust::CreateRecvApplyKernel(const std::shared_ptr<session::KernelGraph> &graph_ptr,
  82. uint32_t event_id) {
  83. MS_EXCEPTION_IF_NULL(graph_ptr);
  84. auto recv_op = std::make_shared<Primitive>(kRecvOpName);
  85. MS_EXCEPTION_IF_NULL(recv_op);
  86. auto recv_apply = std::make_shared<ValueNode>(recv_op);
  87. MS_EXCEPTION_IF_NULL(recv_apply);
  88. std::vector<AnfNodePtr> recv_input_list = {recv_apply};
  89. CNodePtr recv_node_ptr = graph_ptr->NewCNode(recv_input_list);
  90. MS_EXCEPTION_IF_NULL(recv_node_ptr);
  91. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder;
  92. selected_kernel_builder.SetKernelType(KernelType::RT_KERNEL);
  93. AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), recv_node_ptr.get());
  94. AnfAlgo::SetNodeAttr(kAttrEventId, MakeValue(event_id), recv_node_ptr);
  95. auto abstract_none = std::make_shared<abstract::AbstractNone>();
  96. MS_EXCEPTION_IF_NULL(abstract_none);
  97. recv_node_ptr->set_abstract(abstract_none);
  98. return recv_node_ptr;
  99. }
  100. bool KernelAdjust::ExistGetNext(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) {
  101. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  102. const std::vector<CNodePtr> &cnode_list = kernel_graph_ptr->execution_order();
  103. for (const auto &cnode : cnode_list) {
  104. if (AnfAlgo::GetCNodeName(cnode) == kGetNextOpName) {
  105. return true;
  106. }
  107. }
  108. return false;
  109. }
  110. bool KernelAdjust::ExistIndependent(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) {
  111. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  112. const auto &exe_orders = kernel_graph_ptr->execution_order();
  113. for (const auto &node : exe_orders) {
  114. if (AnfAlgo::IsIndependentNode(node) && AnfAlgo::GetGraphId(node.get()) == kernel_graph_ptr->graph_id()) {
  115. MS_LOG(INFO) << "graph exit independent node";
  116. return true;
  117. }
  118. }
  119. return false;
  120. }
  121. void KernelAdjust::InsertIndepentParallel(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  122. const std::map<std::string, mindspore::ParameterPtr> &switch_loop_input,
  123. std::vector<CNodePtr> *exec_order) {
  124. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  125. MS_EXCEPTION_IF_NULL(exec_order);
  126. device::ascend::AscendResourceMng &resource_manager = device::ascend::AscendResourceMng::GetInstance();
  127. CNodePtr independent_switch_app = CreateStreamSwitchOp(kernel_graph_ptr, switch_loop_input, kIndependentStreamSwitch);
  128. MS_EXCEPTION_IF_NULL(independent_switch_app);
  129. uint32_t independent_switch_stream_id = resource_manager.ApplyNewStream();
  130. AnfAlgo::SetStreamId(independent_switch_stream_id, independent_switch_app.get());
  131. AnfAlgo::SetNodeAttr(kStreamNeedActivedFirst, MakeValue<bool>(true), independent_switch_app);
  132. AnfAlgo::SetNodeAttr(kAttrStreamSwitchKind, MakeValue<uint32_t>(kIndependentStreamSwitch), independent_switch_app);
  133. (*exec_order).push_back(independent_switch_app);
  134. MS_LOG(INFO) << "Independent op loop insert Stream Switch " << independent_switch_app->fullname_with_scope();
  135. }
  136. void KernelAdjust::InsertFpBpLoopStreamSwitch(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  137. const std::map<std::string, mindspore::ParameterPtr> &switch_loop_input,
  138. std::vector<CNodePtr> *exec_order, uint32_t *fpbp_stream_id,
  139. uint32_t *fpbp_switch_stream_id) {
  140. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  141. MS_EXCEPTION_IF_NULL(exec_order);
  142. MS_EXCEPTION_IF_NULL(fpbp_stream_id);
  143. MS_EXCEPTION_IF_NULL(fpbp_switch_stream_id);
  144. device::ascend::AscendResourceMng &resource_manager = device::ascend::AscendResourceMng::GetInstance();
  145. *fpbp_switch_stream_id = resource_manager.ApplyNewStream();
  146. *fpbp_stream_id = resource_manager.ApplyNewStream();
  147. CNodePtr fpbp_switch_app = CreateStreamSwitchOp(kernel_graph_ptr, switch_loop_input, kFpBpStreamSwitch);
  148. MS_EXCEPTION_IF_NULL(fpbp_switch_app);
  149. AnfAlgo::SetStreamId(*fpbp_switch_stream_id, fpbp_switch_app.get());
  150. AnfAlgo::SetNodeAttr(kStreamNeedActivedFirst, MakeValue<bool>(true), fpbp_switch_app);
  151. // update fpbp loop stream switch true_branch_stream attr
  152. AnfAlgo::SetNodeAttr(kAttrTrueBranchStream, MakeValue<uint32_t>(*fpbp_stream_id), fpbp_switch_app);
  153. AnfAlgo::SetNodeAttr(kAttrStreamSwitchKind, MakeValue<uint32_t>(kFpBpStreamSwitch), fpbp_switch_app);
  154. (*exec_order).push_back(fpbp_switch_app);
  155. MS_LOG(INFO) << "FpBp loop insert Stream Switch " << fpbp_switch_app->fullname_with_scope();
  156. }
  157. void KernelAdjust::CopyMemcpyList(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  158. const std::vector<CNodePtr> &orders, size_t order_index,
  159. std::vector<CNodePtr> *memcpy_list, std::vector<CNodePtr> *other_list) {
  160. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  161. MS_EXCEPTION_IF_NULL(memcpy_list);
  162. MS_EXCEPTION_IF_NULL(other_list);
  163. CNodePtr cur_cnode = nullptr;
  164. for (size_t idx = order_index + 1; idx < orders.size(); idx++) {
  165. cur_cnode = orders[idx];
  166. if (AnfAlgo::HasNodeAttr(kAttrLabelForInsertStreamActive, cur_cnode)) {
  167. auto pre_node = orders[idx - 1];
  168. auto pre_kernel_name = AnfAlgo::GetCNodeName(pre_node);
  169. if (pre_kernel_name == kAtomicAddrCleanOpName) {
  170. (*other_list).pop_back();
  171. (*memcpy_list).push_back(pre_node);
  172. }
  173. (*memcpy_list).emplace_back(cur_cnode);
  174. } else {
  175. (*other_list).emplace_back(cur_cnode);
  176. }
  177. }
  178. }
  179. void KernelAdjust::InsertEosDoneRecv(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  180. std::vector<CNodePtr> *exec_order, uint32_t eos_done_event_id,
  181. uint32_t fpbp_stream_id) {
  182. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  183. MS_EXCEPTION_IF_NULL(exec_order);
  184. CNodePtr eos_done_recv = CreateRecvApplyKernel(kernel_graph_ptr, eos_done_event_id);
  185. AnfAlgo::SetStreamId(fpbp_stream_id, eos_done_recv.get());
  186. (*exec_order).push_back(eos_done_recv);
  187. MS_LOG(INFO) << "FpBp loop insert EoS done Recv " << eos_done_recv->fullname_with_scope();
  188. }
  189. void KernelAdjust::InsertGetNextLoopStreamActive(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  190. std::vector<CNodePtr> *exec_order,
  191. const std::vector<uint32_t> &getnext_active_streams) {
  192. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  193. MS_EXCEPTION_IF_NULL(exec_order);
  194. CNodePtr getnext_active_app = CreateStreamActiveOp(kernel_graph_ptr);
  195. MS_EXCEPTION_IF_NULL(getnext_active_app);
  196. AnfAlgo::SetNodeAttr(kAttrActiveStreamList, MakeValue<std::vector<uint32_t>>(getnext_active_streams),
  197. getnext_active_app);
  198. (*exec_order).push_back(getnext_active_app);
  199. MS_LOG(INFO) << "FpBp loop insert GetNext loop Stream Active " << getnext_active_app->fullname_with_scope();
  200. }
  201. void KernelAdjust::InsertFpBpStartRecv(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  202. std::vector<CNodePtr> *exec_order, uint32_t fpbp_start_event_id,
  203. uint32_t fpbp_stream_id) {
  204. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  205. MS_EXCEPTION_IF_NULL(exec_order);
  206. CNodePtr fpbp_start_recv = CreateRecvApplyKernel(kernel_graph_ptr, fpbp_start_event_id);
  207. AnfAlgo::SetStreamId(fpbp_stream_id, fpbp_start_recv.get());
  208. (*exec_order).push_back(fpbp_start_recv);
  209. MS_LOG(INFO) << "FpBp loop insert FpBp start Recv " << fpbp_start_recv->fullname_with_scope();
  210. }
  211. void KernelAdjust::InsertNextLoopAssignAdd(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  212. std::vector<CNodePtr> *exec_order,
  213. const std::map<std::string, mindspore::ParameterPtr> &switch_loop_input,
  214. uint32_t fpbp_stream_id) {
  215. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  216. MS_EXCEPTION_IF_NULL(exec_order);
  217. CNodePtr assign_add_one = CreateStreamAssignAddnOP(kernel_graph_ptr, switch_loop_input, false);
  218. MS_EXCEPTION_IF_NULL(assign_add_one);
  219. AnfAlgo::SetStreamId(fpbp_stream_id, assign_add_one.get());
  220. (*exec_order).push_back(assign_add_one);
  221. MS_LOG(INFO) << "FpBp loop insert next loop AssignAdd " << assign_add_one->fullname_with_scope();
  222. }
  223. void KernelAdjust::InsertCurrentLoopAssignAdd(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  224. std::vector<CNodePtr> *exec_order,
  225. const std::map<std::string, mindspore::ParameterPtr> &switch_loop_input) {
  226. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  227. MS_EXCEPTION_IF_NULL(exec_order);
  228. CNodePtr cur_assign_add = CreateStreamAssignAddnOP(kernel_graph_ptr, switch_loop_input, true);
  229. MS_EXCEPTION_IF_NULL(cur_assign_add);
  230. AnfAlgo::SetNodeAttr(kAttrFpBpEnd, MakeValue<bool>(true), cur_assign_add);
  231. (*exec_order).push_back(cur_assign_add);
  232. MS_LOG(INFO) << "FpBp loop insert current loop AssignAdd " << cur_assign_add->fullname_with_scope();
  233. }
  234. void KernelAdjust::InsertFpBpAndEosLoopStreamActive(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  235. std::vector<CNodePtr> *exec_order,
  236. const std::vector<uint32_t> &fpbp_active_streams) {
  237. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  238. MS_EXCEPTION_IF_NULL(exec_order);
  239. CNodePtr fpbp_active_app = CreateStreamActiveOp(kernel_graph_ptr);
  240. MS_EXCEPTION_IF_NULL(fpbp_active_app);
  241. AnfAlgo::SetNodeAttr(kAttrActiveStreamList, MakeValue<std::vector<uint32_t>>(fpbp_active_streams), fpbp_active_app);
  242. (*exec_order).push_back(fpbp_active_app);
  243. MS_LOG(INFO) << "FpBp loop insert FpBp loop and Eos loop Stream Active " << fpbp_active_app->fullname_with_scope();
  244. }
  245. void KernelAdjust::InsertSwitchLoopInput(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  246. const std::map<std::string, mindspore::ParameterPtr> &switch_loop_input) {
  247. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  248. std::vector<AnfNodePtr> *mute_inputs = kernel_graph_ptr->MutableInputs();
  249. MS_EXCEPTION_IF_NULL(mute_inputs);
  250. mute_inputs->push_back(switch_loop_input.at(kCurLoopCountParamName));
  251. mute_inputs->push_back(switch_loop_input.at(kNextLoopCountParamName));
  252. mute_inputs->push_back(switch_loop_input.at(kEpochParamName));
  253. mute_inputs->push_back(switch_loop_input.at(kIterLoopParamName));
  254. mute_inputs->push_back(switch_loop_input.at(kOneParamName));
  255. for (const auto &input : kernel_graph_ptr->inputs()) {
  256. MS_EXCEPTION_IF_NULL(input);
  257. if (input->isa<Parameter>()) {
  258. ParameterPtr param_ptr = input->cast<ParameterPtr>();
  259. if (param_ptr == nullptr) {
  260. MS_EXCEPTION(NotSupportError) << "Cast to parameter point failed !";
  261. }
  262. }
  263. }
  264. }
  265. void KernelAdjust::InsertGetNextLoopStreamSwitch(
  266. const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr, std::vector<CNodePtr> *exec_order,
  267. uint32_t *getnext_switch_stream_id, uint32_t *getnext_stream_id,
  268. const std::map<std::string, mindspore::ParameterPtr> &switch_loop_input) {
  269. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  270. MS_EXCEPTION_IF_NULL(exec_order);
  271. MS_EXCEPTION_IF_NULL(getnext_switch_stream_id);
  272. MS_EXCEPTION_IF_NULL(getnext_stream_id);
  273. device::ascend::AscendResourceMng &resource_manager = device::ascend::AscendResourceMng::GetInstance();
  274. *getnext_switch_stream_id = resource_manager.ApplyNewStream();
  275. *getnext_stream_id = resource_manager.ApplyNewStream();
  276. CNodePtr getnext_switch_app = CreateStreamSwitchOp(kernel_graph_ptr, switch_loop_input, kGetNextStreamSwitch);
  277. MS_EXCEPTION_IF_NULL(getnext_switch_app);
  278. AnfAlgo::SetStreamId(*getnext_switch_stream_id, getnext_switch_app.get());
  279. // update getnext loop stream switch true_branch_stream attr
  280. AnfAlgo::SetNodeAttr(kStreamNeedActivedFirst, MakeValue<bool>(true), getnext_switch_app);
  281. AnfAlgo::SetNodeAttr(kAttrTrueBranchStream, MakeValue<uint32_t>(*getnext_stream_id), getnext_switch_app);
  282. AnfAlgo::SetNodeAttr(kAttrStreamSwitchKind, MakeValue<uint32_t>(kGetNextStreamSwitch), getnext_switch_app);
  283. (*exec_order).push_back(getnext_switch_app);
  284. MS_LOG(INFO) << "GetNext loop insert Stream Switch " << getnext_switch_app->fullname_with_scope();
  285. }
  286. void KernelAdjust::SetBeforeGetNextStreamID(std::vector<CNodePtr> *exec_order, const std::vector<CNodePtr> &orders,
  287. size_t *order_index, CNodePtr getnext_cnode, uint32_t getnext_stream_id) {
  288. MS_EXCEPTION_IF_NULL(exec_order);
  289. MS_EXCEPTION_IF_NULL(order_index);
  290. for (; *order_index < orders.size(); (*order_index)++) {
  291. auto node = orders[*order_index];
  292. (*exec_order).push_back(node);
  293. AnfAlgo::SetStreamId(getnext_stream_id, (*exec_order)[(*exec_order).size() - 1].get());
  294. if (AnfAlgo::GetCNodeName(node) == kGetNextOpName) {
  295. getnext_cnode = node;
  296. break;
  297. }
  298. }
  299. }
  300. void KernelAdjust::InsertGetNextLoopFpBpStartSend(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  301. std::vector<CNodePtr> *exec_order, uint32_t *fpbp_start_event_id,
  302. uint32_t getnext_stream_id) {
  303. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  304. MS_EXCEPTION_IF_NULL(exec_order);
  305. MS_EXCEPTION_IF_NULL(fpbp_start_event_id);
  306. device::ascend::AscendResourceMng &resource_manager = device::ascend::AscendResourceMng::GetInstance();
  307. *fpbp_start_event_id = resource_manager.ApplyNewEvent();
  308. CNodePtr fpbp_start_send = CreateSendApplyKernel(kernel_graph_ptr, *fpbp_start_event_id);
  309. AnfAlgo::SetStreamId(getnext_stream_id, fpbp_start_send.get());
  310. (*exec_order).push_back(fpbp_start_send);
  311. MS_LOG(INFO) << "GetNext loop insert FpBp start Send " << fpbp_start_send->fullname_with_scope();
  312. }
  313. void KernelAdjust::InsertGetNextLoopEosStartSend(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  314. std::vector<CNodePtr> *exec_order, uint32_t *eos_start_event_id,
  315. uint32_t getnext_stream_id) {
  316. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  317. MS_EXCEPTION_IF_NULL(exec_order);
  318. MS_EXCEPTION_IF_NULL(eos_start_event_id);
  319. device::ascend::AscendResourceMng &resource_manager = device::ascend::AscendResourceMng::GetInstance();
  320. *eos_start_event_id = resource_manager.ApplyNewEvent();
  321. CNodePtr eos_start_send = CreateSendApplyKernel(kernel_graph_ptr, *eos_start_event_id);
  322. AnfAlgo::SetStreamId(getnext_stream_id, eos_start_send.get());
  323. (*exec_order).push_back(eos_start_send);
  324. MS_LOG(INFO) << "GetNext loop insert EoS start Send " << eos_start_send->fullname_with_scope();
  325. }
  326. void KernelAdjust::InsertEosStreamSwitch(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  327. const std::map<std::string, mindspore::ParameterPtr> &switch_loop_input,
  328. std::vector<CNodePtr> *exec_order, uint32_t *eos_switch_stream_id,
  329. uint32_t *eos_stream_id) {
  330. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  331. MS_EXCEPTION_IF_NULL(exec_order);
  332. MS_EXCEPTION_IF_NULL(eos_switch_stream_id);
  333. MS_EXCEPTION_IF_NULL(eos_stream_id);
  334. device::ascend::AscendResourceMng &resource_manager = device::ascend::AscendResourceMng::GetInstance();
  335. *eos_switch_stream_id = resource_manager.ApplyNewStream();
  336. *eos_stream_id = resource_manager.ApplyNewStream();
  337. CNodePtr eos_switch_app = CreateStreamSwitchOp(kernel_graph_ptr, switch_loop_input, kEosStreamSwitch);
  338. MS_EXCEPTION_IF_NULL(eos_switch_app);
  339. AnfAlgo::SetStreamId(*eos_switch_stream_id, eos_switch_app.get());
  340. AnfAlgo::SetNodeAttr(kStreamNeedActivedFirst, MakeValue<bool>(true), eos_switch_app);
  341. // update eos loop stream switch true_branch_stream attr
  342. AnfAlgo::SetNodeAttr(kAttrTrueBranchStream, MakeValue<uint32_t>(*eos_stream_id), eos_switch_app);
  343. AnfAlgo::SetNodeAttr(kAttrStreamSwitchKind, MakeValue<uint32_t>(kEosStreamSwitch), eos_switch_app);
  344. (*exec_order).push_back(eos_switch_app);
  345. MS_LOG(INFO) << "EoS loop insert Stream Switch " << eos_switch_app->fullname_with_scope();
  346. }
  347. void KernelAdjust::InsertGetNextLoopEosStartRecv(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  348. std::vector<CNodePtr> *exec_order, uint32_t eos_start_event_id,
  349. uint32_t eos_stream_id) {
  350. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  351. MS_EXCEPTION_IF_NULL(exec_order);
  352. CNodePtr eos_start_recv = CreateRecvApplyKernel(kernel_graph_ptr, eos_start_event_id);
  353. AnfAlgo::SetStreamId(eos_stream_id, eos_start_recv.get());
  354. (*exec_order).push_back(eos_start_recv);
  355. MS_LOG(INFO) << "EoS loop insert EoS Recv " << eos_start_recv->fullname_with_scope();
  356. }
  357. void KernelAdjust::InsertEosOp(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  358. std::vector<CNodePtr> *exec_order, const CNodePtr &getnext_cnode,
  359. uint32_t eos_stream_id) {
  360. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  361. MS_EXCEPTION_IF_NULL(exec_order);
  362. MS_EXCEPTION_IF_NULL(getnext_cnode);
  363. CNodePtr end_of_sequence_op = CreateEndOfSequenceOP(kernel_graph_ptr, getnext_cnode);
  364. MS_EXCEPTION_IF_NULL(end_of_sequence_op);
  365. AnfAlgo::SetStreamId(eos_stream_id, end_of_sequence_op.get());
  366. (*exec_order).push_back(end_of_sequence_op);
  367. MS_LOG(INFO) << "EoS loop insert Eos Op " << end_of_sequence_op->fullname_with_scope();
  368. }
  369. void KernelAdjust::InsertEosDoneSend(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  370. std::vector<CNodePtr> *exec_order, uint32_t *eos_done_event_id,
  371. uint32_t eos_stream_id) {
  372. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  373. MS_EXCEPTION_IF_NULL(exec_order);
  374. MS_EXCEPTION_IF_NULL(eos_done_event_id);
  375. device::ascend::AscendResourceMng &resource_manager = device::ascend::AscendResourceMng::GetInstance();
  376. *eos_done_event_id = resource_manager.ApplyNewEvent();
  377. CNodePtr eos_done_send = CreateSendApplyKernel(kernel_graph_ptr, *eos_done_event_id);
  378. AnfAlgo::SetStreamId(eos_stream_id, eos_done_send.get());
  379. (*exec_order).push_back(eos_done_send);
  380. MS_LOG(INFO) << "EoS loop insert EoS done Send " << eos_done_send->fullname_with_scope();
  381. }
  382. void KernelAdjust::InsertSwitchLoop(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) {
  383. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  384. device::ascend::AscendResourceMng &resource_manager = device::ascend::AscendResourceMng::GetInstance();
  385. resource_manager.ResetResource();
  386. if (!NeedInsertSwitch()) {
  387. return;
  388. }
  389. if (kernel_graph_ptr->is_dynamic_shape()) {
  390. MS_LOG(INFO) << "KernelGraph:" << kernel_graph_ptr->graph_id() << " is dynamic shape, skip InsertSwitchLoop";
  391. return;
  392. }
  393. bool exist_getnext = ExistGetNext(kernel_graph_ptr);
  394. bool eos_mode = ConfigManager::GetInstance().iter_num() == INT32_MAX && exist_getnext;
  395. MS_LOG(INFO) << "GetNext exist:" << exist_getnext << " End of Sequence mode:" << eos_mode
  396. << " iter num:" << ConfigManager::GetInstance().iter_num();
  397. if (exist_getnext) {
  398. ReorderGetNext(kernel_graph_ptr);
  399. }
  400. std::map<std::string, mindspore::ParameterPtr> switch_loop_input;
  401. CreateSwitchOpParameters(kernel_graph_ptr, &switch_loop_input);
  402. InsertSwitchLoopInput(kernel_graph_ptr, switch_loop_input);
  403. const std::vector<CNodePtr> &orders = kernel_graph_ptr->execution_order();
  404. if (orders.empty()) {
  405. MS_LOG(EXCEPTION) << "graph execution order is empty";
  406. }
  407. std::vector<CNodePtr> exec_order;
  408. CNodePtr getnext_cnode;
  409. uint32_t getnext_switch_stream_id = UINT32_MAX;
  410. uint32_t fpbp_start_event_id = UINT32_MAX;
  411. uint32_t eos_start_event_id = UINT32_MAX;
  412. uint32_t getnext_stream_id = UINT32_MAX;
  413. size_t order_index = 0;
  414. if (exist_getnext) {
  415. InsertGetNextLoopStreamSwitch(kernel_graph_ptr, &exec_order, &getnext_switch_stream_id, &getnext_stream_id,
  416. switch_loop_input);
  417. SetBeforeGetNextStreamID(&exec_order, orders, &order_index, getnext_cnode, getnext_stream_id);
  418. InsertGetNextLoopFpBpStartSend(kernel_graph_ptr, &exec_order, &fpbp_start_event_id, getnext_stream_id);
  419. if (eos_mode) {
  420. InsertGetNextLoopEosStartSend(kernel_graph_ptr, &exec_order, &eos_start_event_id, getnext_stream_id);
  421. }
  422. }
  423. uint32_t eos_switch_stream_id = UINT32_MAX;
  424. uint32_t eos_stream_id = UINT32_MAX;
  425. uint32_t eos_done_event_id = UINT32_MAX;
  426. std::vector<uint32_t> fpbp_active_streams;
  427. if (eos_mode) {
  428. InsertEosStreamSwitch(kernel_graph_ptr, switch_loop_input, &exec_order, &eos_switch_stream_id, &eos_stream_id);
  429. InsertGetNextLoopEosStartRecv(kernel_graph_ptr, &exec_order, eos_start_event_id, eos_stream_id);
  430. InsertEosOp(kernel_graph_ptr, &exec_order, getnext_cnode, eos_stream_id);
  431. InsertEosDoneSend(kernel_graph_ptr, &exec_order, &eos_done_event_id, eos_stream_id);
  432. fpbp_active_streams.push_back(eos_switch_stream_id);
  433. }
  434. bool exist_independent = ExistIndependent(kernel_graph_ptr);
  435. if (exist_independent) {
  436. InsertIndepentParallel(kernel_graph_ptr, switch_loop_input, &exec_order);
  437. }
  438. uint32_t fpbp_stream_id = UINT32_MAX;
  439. uint32_t fpbp_switch_stream_id = UINT32_MAX;
  440. InsertFpBpLoopStreamSwitch(kernel_graph_ptr, switch_loop_input, &exec_order, &fpbp_stream_id, &fpbp_switch_stream_id);
  441. if (exist_getnext) {
  442. InsertFpBpStartRecv(kernel_graph_ptr, &exec_order, fpbp_start_event_id, fpbp_stream_id);
  443. }
  444. InsertNextLoopAssignAdd(kernel_graph_ptr, &exec_order, switch_loop_input, fpbp_stream_id);
  445. std::vector<CNodePtr> memcpy_list;
  446. std::vector<CNodePtr> other_list;
  447. if (exist_getnext) {
  448. CopyMemcpyList(kernel_graph_ptr, orders, order_index, &memcpy_list, &other_list);
  449. (void)std::copy(memcpy_list.begin(), memcpy_list.end(), std::back_inserter(exec_order));
  450. } else {
  451. other_list = orders;
  452. }
  453. if (eos_mode) {
  454. InsertEosDoneRecv(kernel_graph_ptr, &exec_order, eos_done_event_id, fpbp_stream_id);
  455. }
  456. std::vector<uint32_t> getnext_active_streams;
  457. if (exist_getnext) {
  458. // small loop active
  459. getnext_active_streams.push_back(getnext_switch_stream_id);
  460. InsertGetNextLoopStreamActive(kernel_graph_ptr, &exec_order, getnext_active_streams);
  461. }
  462. (void)std::copy(other_list.begin(), other_list.end(), std::back_inserter(exec_order));
  463. InsertCurrentLoopAssignAdd(kernel_graph_ptr, &exec_order, switch_loop_input);
  464. // big loop active
  465. fpbp_active_streams.push_back(fpbp_switch_stream_id);
  466. InsertFpBpAndEosLoopStreamActive(kernel_graph_ptr, &exec_order, fpbp_active_streams);
  467. kernel_graph_ptr->set_execution_order(exec_order);
  468. }
  469. void KernelAdjust::CreateSwitchOpParameters(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  470. std::map<std::string, mindspore::ParameterPtr> *switch_loop_input) {
  471. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  472. MS_EXCEPTION_IF_NULL(switch_loop_input);
  473. ShapeVector shp = {1};
  474. tensor::TensorPtr tensor_ptr = std::make_shared<tensor::Tensor>(kInt32->type_id(), shp);
  475. MS_EXCEPTION_IF_NULL(tensor_ptr);
  476. mindspore::abstract::AbstractBasePtr paremeter_abstract_ptr = tensor_ptr->ToAbstract();
  477. if (paremeter_abstract_ptr == nullptr) {
  478. MS_LOG(EXCEPTION) << "create abstract before insert switch op failed!";
  479. }
  480. ParameterPtr cur_loop_count = std::make_shared<Parameter>(kernel_graph_ptr);
  481. MS_EXCEPTION_IF_NULL(cur_loop_count);
  482. cur_loop_count->set_name(kCurLoopCountParamName);
  483. cur_loop_count->set_abstract(paremeter_abstract_ptr);
  484. ParameterPtr loop_count_cur = kernel_graph_ptr->NewParameter(cur_loop_count);
  485. (*switch_loop_input)[kCurLoopCountParamName] = loop_count_cur;
  486. ParameterPtr next_loop_count = std::make_shared<Parameter>(kernel_graph_ptr);
  487. MS_EXCEPTION_IF_NULL(next_loop_count);
  488. next_loop_count->set_name(kNextLoopCountParamName);
  489. next_loop_count->set_abstract(paremeter_abstract_ptr);
  490. ParameterPtr loop_count_next = kernel_graph_ptr->NewParameter(next_loop_count);
  491. (*switch_loop_input)[kNextLoopCountParamName] = loop_count_next;
  492. ParameterPtr iter_loop = std::make_shared<Parameter>(kernel_graph_ptr);
  493. iter_loop->set_name(kIterLoopParamName);
  494. iter_loop->set_abstract(paremeter_abstract_ptr);
  495. ParameterPtr iter_loop_new = kernel_graph_ptr->NewParameter(iter_loop);
  496. (*switch_loop_input)[kIterLoopParamName] = iter_loop_new;
  497. ParameterPtr one = std::make_shared<Parameter>(kernel_graph_ptr);
  498. one->set_name(kOneParamName);
  499. one->set_abstract(paremeter_abstract_ptr);
  500. ParameterPtr one_new = kernel_graph_ptr->NewParameter(one);
  501. (*switch_loop_input)[kOneParamName] = one_new;
  502. ParameterPtr epoch = std::make_shared<Parameter>(kernel_graph_ptr);
  503. MS_EXCEPTION_IF_NULL(epoch);
  504. epoch->set_name(kEpochParamName);
  505. epoch->set_abstract(paremeter_abstract_ptr);
  506. ParameterPtr epoch_new = kernel_graph_ptr->NewParameter(epoch);
  507. (*switch_loop_input)[kEpochParamName] = epoch_new;
  508. }
  509. kernel::KernelBuildInfo::KernelBuildInfoBuilder KernelAdjust::CreateMngKernelBuilder(
  510. const std::vector<std::string> &formats, const std::vector<TypeId> &type_ids) {
  511. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder;
  512. selected_kernel_builder.SetInputsFormat(formats);
  513. selected_kernel_builder.SetInputsDeviceType(type_ids);
  514. selected_kernel_builder.SetFusionType(kernel::FusionType::OPAQUE);
  515. selected_kernel_builder.SetProcessor(kernel::Processor::AICORE);
  516. selected_kernel_builder.SetKernelType(KernelType::RT_KERNEL);
  517. return selected_kernel_builder;
  518. }
  519. CNodePtr KernelAdjust::CreateStreamSwitchOp(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  520. const std::map<std::string, mindspore::ParameterPtr> &switch_loop_input,
  521. StreamSwitchKind kind) {
  522. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder = CreateMngKernelBuilder(
  523. {kOpFormat_DEFAULT, kOpFormat_DEFAULT}, {TypeId::kNumberTypeInt32, TypeId::kNumberTypeInt32});
  524. auto typeNone_abstract = std::make_shared<abstract::AbstractNone>();
  525. auto stream_switch = std::make_shared<Primitive>(kStreamSwitchOpName);
  526. std::vector<AnfNodePtr> inputs;
  527. inputs.push_back(NewValueNode(stream_switch));
  528. if (kind == kFpBpStreamSwitch || kind == kEosStreamSwitch) {
  529. inputs.push_back(switch_loop_input.at(kNextLoopCountParamName));
  530. } else if (kind == kGetNextStreamSwitch || kind == kIndependentStreamSwitch) {
  531. inputs.push_back(switch_loop_input.at(kNextLoopCountParamName));
  532. } else {
  533. MS_LOG(ERROR) << "unknown stream switch kind";
  534. }
  535. inputs.push_back(switch_loop_input.at(kIterLoopParamName));
  536. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  537. CNodePtr stream_switch_app = kernel_graph_ptr->NewCNode(inputs);
  538. MS_EXCEPTION_IF_NULL(stream_switch_app);
  539. AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), stream_switch_app.get());
  540. stream_switch_app->set_abstract(typeNone_abstract);
  541. // set attr: cond_ RT_LESS
  542. int condition = static_cast<int>(RT_LESS);
  543. ValuePtr cond = MakeValue(condition);
  544. AnfAlgo::SetNodeAttr(kAttrSwitchCondition, cond, stream_switch_app);
  545. // set attr:data_type
  546. int data_type = static_cast<int>(RT_SWITCH_INT64);
  547. ValuePtr dt = MakeValue(data_type);
  548. AnfAlgo::SetNodeAttr(kAttrDataType, dt, stream_switch_app);
  549. // set distinction label and graph id
  550. return stream_switch_app;
  551. }
  552. CNodePtr KernelAdjust::CreateStreamActiveOp(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) {
  553. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder = CreateMngKernelBuilder(
  554. {kOpFormat_DEFAULT, kOpFormat_DEFAULT}, {TypeId::kNumberTypeInt32, TypeId::kNumberTypeInt32});
  555. abstract::AbstractBasePtr typeNone_abstract = std::make_shared<abstract::AbstractNone>();
  556. auto stream_active_others = std::make_shared<Primitive>(kStreamActiveOpName);
  557. std::vector<AnfNodePtr> inputs;
  558. inputs.push_back(NewValueNode(stream_active_others));
  559. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  560. CNodePtr stream_active_others_app = kernel_graph_ptr->NewCNode(inputs);
  561. MS_EXCEPTION_IF_NULL(stream_active_others_app);
  562. AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), stream_active_others_app.get());
  563. stream_active_others_app->set_abstract(typeNone_abstract);
  564. return stream_active_others_app;
  565. }
  566. CNodePtr KernelAdjust::CreatTupleGetItemNode(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  567. const CNodePtr &node, size_t output_idx) {
  568. auto idx = NewValueNode(SizeToLong(output_idx));
  569. MS_EXCEPTION_IF_NULL(idx);
  570. auto imm = std::make_shared<Int64Imm>(SizeToInt(output_idx));
  571. auto abstract_scalar = std::make_shared<abstract::AbstractScalar>(imm);
  572. idx->set_abstract(abstract_scalar);
  573. CNodePtr tuple_getitem = kernel_graph_ptr->NewCNode({NewValueNode(prim::kPrimTupleGetItem), node, idx});
  574. MS_EXCEPTION_IF_NULL(tuple_getitem);
  575. tuple_getitem->set_scope(node->scope());
  576. std::vector<size_t> origin_shape = AnfAlgo::GetOutputInferShape(node, output_idx);
  577. TypeId origin_type = AnfAlgo::GetOutputInferDataType(node, output_idx);
  578. AnfAlgo::SetOutputInferTypeAndShape({origin_type}, {origin_shape}, tuple_getitem.get());
  579. return tuple_getitem;
  580. }
  581. CNodePtr KernelAdjust::CreateEndOfSequenceOP(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  582. const CNodePtr &getnext_cnode) {
  583. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  584. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder;
  585. selected_kernel_builder.SetInputsFormat({kOpFormat_DEFAULT});
  586. selected_kernel_builder.SetInputsDeviceType({kNumberTypeUInt8});
  587. selected_kernel_builder.SetFusionType(kernel::FusionType::OPAQUE);
  588. selected_kernel_builder.SetProcessor(kernel::Processor::AICPU);
  589. selected_kernel_builder.SetKernelType(KernelType::AICPU_KERNEL);
  590. selected_kernel_builder.SetOutputsFormat({kOpFormat_DEFAULT});
  591. selected_kernel_builder.SetOutputsDeviceType({kNumberTypeUInt8});
  592. // EndOfSequence
  593. auto end_of_sequence = std::make_shared<Primitive>(kEndOfSequence);
  594. std::vector<AnfNodePtr> inputs;
  595. inputs.push_back(NewValueNode(end_of_sequence));
  596. // GetNext output 0 is EndOfSequence's input
  597. auto tuple_get_item = CreatTupleGetItemNode(kernel_graph_ptr, getnext_cnode, 0);
  598. inputs.push_back(tuple_get_item);
  599. CNodePtr end_of_sequence_node = kernel_graph_ptr->NewCNode(inputs);
  600. MS_EXCEPTION_IF_NULL(end_of_sequence_node);
  601. AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), end_of_sequence_node.get());
  602. std::vector<std::string> input_names = {"x"};
  603. ValuePtr input_names_v = MakeValue(input_names);
  604. AnfAlgo::SetNodeAttr("input_names", input_names_v, end_of_sequence_node);
  605. std::vector<std::string> output_names = {"y"};
  606. ValuePtr output_names_v = MakeValue(output_names);
  607. AnfAlgo::SetNodeAttr("output_names", output_names_v, end_of_sequence_node);
  608. end_of_sequence_node->set_abstract(tuple_get_item->abstract());
  609. return end_of_sequence_node;
  610. }
  611. CNodePtr KernelAdjust::CreateStreamAssignAddnOP(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  612. const std::map<std::string, mindspore::ParameterPtr> &switch_loop_input,
  613. bool cur_loop) {
  614. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  615. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder = CreateMngKernelBuilder(
  616. {kOpFormat_DEFAULT, kOpFormat_DEFAULT}, {TypeId::kNumberTypeInt32, TypeId::kNumberTypeInt32});
  617. selected_kernel_builder.SetOutputsFormat({kOpFormat_DEFAULT});
  618. selected_kernel_builder.SetOutputsDeviceType({kNumberTypeInt32});
  619. // AssignAdd
  620. auto assign_add = std::make_shared<Primitive>(kAssignAddOpName);
  621. std::vector<AnfNodePtr> inputs;
  622. inputs.push_back(NewValueNode(assign_add));
  623. if (cur_loop) {
  624. inputs.push_back(switch_loop_input.at(kCurLoopCountParamName));
  625. } else {
  626. inputs.push_back(switch_loop_input.at(kNextLoopCountParamName));
  627. }
  628. inputs.push_back(switch_loop_input.at(kOneParamName));
  629. CNodePtr assign_add_one = kernel_graph_ptr->NewCNode(inputs);
  630. MS_EXCEPTION_IF_NULL(assign_add_one);
  631. AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), assign_add_one.get());
  632. std::vector<std::string> input_names = {"ref", "value"};
  633. std::vector<std::string> output_names = {"output"};
  634. ValuePtr input_names_v = MakeValue(input_names);
  635. ValuePtr output_names_v = MakeValue(output_names);
  636. AnfAlgo::SetNodeAttr("input_names", input_names_v, assign_add_one);
  637. AnfAlgo::SetNodeAttr("output_names", output_names_v, assign_add_one);
  638. selected_kernel_builder.SetKernelType(KernelType::TBE_KERNEL);
  639. MS_EXCEPTION_IF_NULL(switch_loop_input.at(kCurLoopCountParamName));
  640. assign_add_one->set_abstract(switch_loop_input.at(kCurLoopCountParamName)->abstract());
  641. // add AssignAdd op to kernel ref node map
  642. session::AnfWithOutIndex final_pair = std::make_pair(assign_add_one, 0);
  643. session::KernelWithIndex kernel_with_index = AnfAlgo::VisitKernel(AnfAlgo::GetInputNode(assign_add_one, 0), 0);
  644. kernel_graph_ptr->AddRefCorrespondPairs(final_pair, kernel_with_index);
  645. return assign_add_one;
  646. }
  647. bool KernelAdjust::StepLoadCtrlInputs(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) {
  648. if (!NeedInsertSwitch()) {
  649. return true;
  650. }
  651. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  652. if (kernel_graph_ptr->is_dynamic_shape()) {
  653. MS_LOG(INFO) << "Skip StepLoadCtrlInputs";
  654. return true;
  655. }
  656. auto input_nodes = kernel_graph_ptr->inputs();
  657. std::vector<tensor::TensorPtr> inputs;
  658. LoadSwitchInputs(&inputs);
  659. std::shared_ptr<std::vector<tensor::TensorPtr>> inputsPtr = std::make_shared<std::vector<tensor::TensorPtr>>(inputs);
  660. kernel_graph_ptr->set_input_ctrl_tensors(inputsPtr);
  661. size_t input_ctrl_size = inputs.size();
  662. // inputs_node:include four ctrl nodes in the back. such as:conv,loop_cnt, ites_loop, zero, one.
  663. // deal four ctrl nodes.
  664. for (size_t i = 0; i < inputs.size(); ++i) {
  665. auto tensor = inputs[i];
  666. size_t deal_index = input_nodes.size() - input_ctrl_size + i;
  667. if (deal_index >= input_nodes.size()) {
  668. MS_LOG(EXCEPTION) << "deal_index[" << deal_index << "] out of range";
  669. }
  670. auto input_node = input_nodes[deal_index];
  671. bool need_sync = false;
  672. MS_EXCEPTION_IF_NULL(input_node);
  673. if (input_node->isa<Parameter>()) {
  674. auto pk_node = input_node->cast<ParameterPtr>();
  675. MS_EXCEPTION_IF_NULL(tensor);
  676. MS_EXCEPTION_IF_NULL(pk_node);
  677. if (tensor->NeedSyncHostToDevice() || !pk_node->has_default()) {
  678. need_sync = true;
  679. }
  680. }
  681. if (need_sync) {
  682. auto pk_node = input_node->cast<ParameterPtr>();
  683. MS_EXCEPTION_IF_NULL(pk_node);
  684. auto device_address = AnfAlgo::GetMutableOutputAddr(pk_node, 0);
  685. MS_EXCEPTION_IF_NULL(device_address);
  686. tensor->set_device_address(device_address);
  687. if (!device_address->SyncHostToDevice(trans::GetRuntimePaddingShape(pk_node, 0),
  688. LongToSize(tensor->data().nbytes()), tensor->data_type(), tensor->data_c(),
  689. tensor->device_info().host_format_)) {
  690. MS_LOG(INFO) << "SyncHostToDevice failed.";
  691. return false;
  692. }
  693. }
  694. tensor->set_sync_status(kNoNeedSync);
  695. }
  696. return true;
  697. }
  698. void KernelAdjust::LoadSwitchInputs(std::vector<tensor::TensorPtr> *inputs) {
  699. MS_LOG(INFO) << "---------------- LoadSwitchInputs---";
  700. MS_EXCEPTION_IF_NULL(inputs);
  701. // current loop count
  702. ShapeVector shp = {1};
  703. tensor::TensorPtr cur_loop_count = std::make_shared<tensor::Tensor>(kInt32->type_id(), shp);
  704. MS_EXCEPTION_IF_NULL(cur_loop_count);
  705. int32_t *val = nullptr;
  706. val = static_cast<int32_t *>(cur_loop_count->data_c());
  707. MS_EXCEPTION_IF_NULL(val);
  708. *val = 0;
  709. inputs->push_back(cur_loop_count);
  710. // next loop count
  711. tensor::TensorPtr next_loop_count = std::make_shared<tensor::Tensor>(kInt32->type_id(), shp);
  712. MS_EXCEPTION_IF_NULL(next_loop_count);
  713. val = static_cast<int32_t *>(next_loop_count->data_c());
  714. MS_EXCEPTION_IF_NULL(val);
  715. *val = 0;
  716. inputs->push_back(next_loop_count);
  717. // Epoch in device
  718. tensor::TensorPtr epoch_tensor = std::make_shared<tensor::Tensor>(kInt32->type_id(), shp);
  719. MS_EXCEPTION_IF_NULL(epoch_tensor);
  720. val = static_cast<int32_t *>(epoch_tensor->data_c());
  721. MS_EXCEPTION_IF_NULL(val);
  722. *val = 0;
  723. inputs->push_back(epoch_tensor);
  724. // total loop count per iter
  725. tensor::TensorPtr iter_loop_tensor = std::make_shared<tensor::Tensor>(kInt32->type_id(), shp);
  726. MS_EXCEPTION_IF_NULL(iter_loop_tensor);
  727. val = static_cast<int32_t *>(iter_loop_tensor->data_c());
  728. MS_EXCEPTION_IF_NULL(val);
  729. *val = SizeToInt(LongToSize(ConfigManager::GetInstance().iter_num()));
  730. MS_LOG(INFO) << "iter_loop_tensor = " << *val;
  731. inputs->push_back(iter_loop_tensor);
  732. tensor::TensorPtr one_tensor = std::make_shared<tensor::Tensor>(kInt32->type_id(), shp);
  733. MS_EXCEPTION_IF_NULL(one_tensor);
  734. val = static_cast<int32_t *>(one_tensor->data_c());
  735. MS_EXCEPTION_IF_NULL(val);
  736. *val = 1;
  737. inputs->push_back(one_tensor);
  738. MS_LOG(INFO) << "---------------- LoadSwitchInputs End--";
  739. }
  740. void KernelAdjust::Profiling(NotNull<session::KernelGraph *> kernel_graph_ptr) {
  741. if (!ascend::ProfilingManager::GetInstance().IsProfiling()) {
  742. MS_LOG(INFO) << "No need to profiling";
  743. return;
  744. }
  745. auto graph_id_env = std::getenv(kProfilingGraphId);
  746. if (graph_id_env != nullptr) {
  747. auto graph_id = std::stoul(graph_id_env);
  748. if (graph_id != kernel_graph_ptr->graph_id()) {
  749. MS_LOG(WARNING) << "Get PROFILING_GRAPH_ID " << graph_id
  750. << " Not Match Current Graph Id:" << kernel_graph_ptr->graph_id();
  751. return;
  752. }
  753. }
  754. ProfilingTraceInfo profiling_trace_info = ProfilingUtils::GenerateProfilingTrace(*kernel_graph_ptr);
  755. if (!profiling_trace_info.IsValid()) {
  756. MS_LOG(INFO) << "[profiling] no profiling node found!";
  757. return;
  758. }
  759. InsertProfilingKernel(profiling_trace_info, kernel_graph_ptr);
  760. }
  761. void KernelAdjust::InsertProfilingKernel(const ProfilingTraceInfo &profiling_trace_info,
  762. NotNull<session::KernelGraph *> kernel_graph_ptr) {
  763. MS_LOG(INFO) << "[profiling] Insert profiling kernel start";
  764. if (!profiling_trace_info.IsValid()) {
  765. MS_LOG(WARNING) << "Profiling trace point not found";
  766. return;
  767. }
  768. std::vector<CNodePtr> new_cnode_list;
  769. std::vector<CNodePtr> cnode_ptr_list = kernel_graph_ptr->execution_order();
  770. if (cnode_ptr_list.empty()) {
  771. MS_LOG(ERROR) << "No CNode in graph";
  772. return;
  773. }
  774. for (const auto &cnode_ptr : cnode_ptr_list) {
  775. ProfilingUtils::InsertProfilingTraceFp(cnode_ptr, profiling_trace_info, kernel_graph_ptr,
  776. NOT_NULL(&new_cnode_list));
  777. new_cnode_list.emplace_back(cnode_ptr);
  778. ProfilingUtils::InsertProfilingCustomOp(cnode_ptr, profiling_trace_info, kernel_graph_ptr,
  779. NOT_NULL(&new_cnode_list));
  780. ProfilingUtils::InsertProfilingTraceBpEnd(cnode_ptr, profiling_trace_info, kernel_graph_ptr,
  781. NOT_NULL(&new_cnode_list));
  782. ProfilingUtils::InsertProfilingTraceIterEnd(cnode_ptr, profiling_trace_info, kernel_graph_ptr,
  783. NOT_NULL(&new_cnode_list));
  784. }
  785. kernel_graph_ptr->set_execution_order(new_cnode_list);
  786. }
  787. } // namespace device
  788. } // namespace mindspore