You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

kernel_adjust.cc 55 kB

5 years ago
4 years ago
5 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
4 years ago
6 years ago
6 years ago
5 years ago
6 years ago
6 years ago
6 years ago
4 years ago
5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "runtime/device/kernel_adjust.h"
  17. #include <map>
  18. #include <algorithm>
  19. #include <string>
  20. #include <vector>
  21. #include <utility>
  22. #include "backend/session/anf_runtime_algorithm.h"
  23. #include "utils/ms_context.h"
  24. #include "common/trans.h"
  25. #include "utils/config_manager.h"
  26. #include "utils/ms_utils.h"
  27. #include "backend/kernel_compiler/kernel_build_info.h"
  28. #include "utils/utils.h"
  29. #include "runtime/device/ascend/profiling/profiling_manager.h"
  30. #include "runtime/base.h"
  31. #include "runtime/device/ascend/ascend_stream_assign.h"
  32. #include "utils/shape_utils.h"
  namespace {
  // File-local constants (internal linkage through the anonymous namespace).
  constexpr auto kProfilingGraphId = "PROFILING_GRAPH_ID";
  constexpr auto kGradients = "Gradients";
  constexpr auto kSpecifyParameter = "accu_status";
  // Declared constexpr like its siblings so it is a true constant and cannot be modified at runtime.
  constexpr size_t kNPUShape = 8;
  }  // namespace
  39. namespace mindspore {
  40. namespace device {
  41. using device::ascend::ProfilingUtils;
  42. void KernelAdjust::ReorderGetNext(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) {
  43. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  44. const std::vector<CNodePtr> &origin_cnode_list = kernel_graph_ptr->execution_order();
  45. std::vector<CNodePtr> getnext_list;
  46. std::vector<CNodePtr> other_list;
  47. for (const auto &cnode : origin_cnode_list) {
  48. if (AnfAlgo::GetCNodeName(cnode) == kGetNextOpName) {
  49. getnext_list.emplace_back(cnode);
  50. } else {
  51. other_list.emplace_back(cnode);
  52. }
  53. }
  54. std::vector<CNodePtr> new_order_list;
  55. new_order_list.insert(new_order_list.end(), getnext_list.begin(), getnext_list.end());
  56. new_order_list.insert(new_order_list.end(), other_list.begin(), other_list.end());
  57. kernel_graph_ptr->set_execution_order(new_order_list);
  58. }
  59. bool KernelAdjust::NeedInsertSwitch() {
  60. auto context_ptr = MsContext::GetInstance();
  61. MS_EXCEPTION_IF_NULL(context_ptr);
  62. return (context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK) &&
  63. context_ptr->get_param<bool>(MS_CTX_ENABLE_LOOP_SINK) && ConfigManager::GetInstance().iter_num() > 1);
  64. }
  65. CNodePtr KernelAdjust::CreateSendApplyKernel(const std::shared_ptr<session::KernelGraph> &graph_ptr,
  66. uint32_t event_id) {
  67. MS_EXCEPTION_IF_NULL(graph_ptr);
  68. auto send_op = std::make_shared<Primitive>(kSendOpName);
  69. MS_EXCEPTION_IF_NULL(send_op);
  70. auto send_apply = std::make_shared<ValueNode>(send_op);
  71. MS_EXCEPTION_IF_NULL(send_apply);
  72. std::vector<AnfNodePtr> send_input_list = {send_apply};
  73. CNodePtr send_node_ptr = graph_ptr->NewCNode(send_input_list);
  74. MS_EXCEPTION_IF_NULL(send_node_ptr);
  75. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder;
  76. selected_kernel_builder.SetKernelType(KernelType::RT_KERNEL);
  77. AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), send_node_ptr.get());
  78. AnfAlgo::SetNodeAttr(kAttrEventId, MakeValue(event_id), send_node_ptr);
  79. auto abstract_none = std::make_shared<abstract::AbstractNone>();
  80. MS_EXCEPTION_IF_NULL(abstract_none);
  81. send_node_ptr->set_abstract(abstract_none);
  82. return send_node_ptr;
  83. }
  84. CNodePtr KernelAdjust::CreateRecvApplyKernel(const std::shared_ptr<session::KernelGraph> &graph_ptr,
  85. uint32_t event_id) {
  86. MS_EXCEPTION_IF_NULL(graph_ptr);
  87. auto recv_op = std::make_shared<Primitive>(kRecvOpName);
  88. MS_EXCEPTION_IF_NULL(recv_op);
  89. auto recv_apply = std::make_shared<ValueNode>(recv_op);
  90. MS_EXCEPTION_IF_NULL(recv_apply);
  91. std::vector<AnfNodePtr> recv_input_list = {recv_apply};
  92. CNodePtr recv_node_ptr = graph_ptr->NewCNode(recv_input_list);
  93. MS_EXCEPTION_IF_NULL(recv_node_ptr);
  94. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder;
  95. selected_kernel_builder.SetKernelType(KernelType::RT_KERNEL);
  96. AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), recv_node_ptr.get());
  97. AnfAlgo::SetNodeAttr(kAttrEventId, MakeValue(event_id), recv_node_ptr);
  98. auto abstract_none = std::make_shared<abstract::AbstractNone>();
  99. MS_EXCEPTION_IF_NULL(abstract_none);
  100. recv_node_ptr->set_abstract(abstract_none);
  101. return recv_node_ptr;
  102. }
  103. bool KernelAdjust::ExistGetNext(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) {
  104. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  105. const std::vector<CNodePtr> &cnode_list = kernel_graph_ptr->execution_order();
  106. for (const auto &cnode : cnode_list) {
  107. if (AnfAlgo::GetCNodeName(cnode) == kGetNextOpName) {
  108. return true;
  109. }
  110. }
  111. return false;
  112. }
  113. bool KernelAdjust::ExistIndependent(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) {
  114. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  115. const auto &exe_orders = kernel_graph_ptr->execution_order();
  116. for (const auto &node : exe_orders) {
  117. if (AnfAlgo::IsIndependentNode(node) && AnfAlgo::GetGraphId(node.get()) == kernel_graph_ptr->graph_id()) {
  118. MS_LOG(INFO) << "graph exit independent node";
  119. return true;
  120. }
  121. }
  122. return false;
  123. }
  124. void KernelAdjust::InsertIndepentParallel(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  125. const std::map<std::string, mindspore::ParameterPtr> &switch_loop_input,
  126. std::vector<CNodePtr> *exec_order) {
  127. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  128. MS_EXCEPTION_IF_NULL(exec_order);
  129. device::ascend::AscendResourceMng &resource_manager = device::ascend::AscendResourceMng::GetInstance();
  130. CNodePtr independent_switch_app = CreateStreamSwitchOp(kernel_graph_ptr, switch_loop_input, kIndependentStreamSwitch);
  131. MS_EXCEPTION_IF_NULL(independent_switch_app);
  132. uint32_t independent_switch_stream_id = resource_manager.ApplyNewStream();
  133. AnfAlgo::SetStreamId(independent_switch_stream_id, independent_switch_app.get());
  134. AnfAlgo::SetNodeAttr(kStreamNeedActivedFirst, MakeValue<bool>(true), independent_switch_app);
  135. AnfAlgo::SetNodeAttr(kAttrStreamSwitchKind, MakeValue<uint32_t>(kIndependentStreamSwitch), independent_switch_app);
  136. (*exec_order).push_back(independent_switch_app);
  137. MS_LOG(INFO) << "Independent op loop insert Stream Switch " << independent_switch_app->fullname_with_scope();
  138. }
  139. void KernelAdjust::InsertFpBpLoopStreamSwitch(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  140. const std::map<std::string, mindspore::ParameterPtr> &switch_loop_input,
  141. std::vector<CNodePtr> *exec_order, uint32_t *fpbp_stream_id,
  142. uint32_t *fpbp_switch_stream_id) {
  143. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  144. MS_EXCEPTION_IF_NULL(exec_order);
  145. MS_EXCEPTION_IF_NULL(fpbp_stream_id);
  146. MS_EXCEPTION_IF_NULL(fpbp_switch_stream_id);
  147. device::ascend::AscendResourceMng &resource_manager = device::ascend::AscendResourceMng::GetInstance();
  148. *fpbp_switch_stream_id = resource_manager.ApplyNewStream();
  149. *fpbp_stream_id = resource_manager.ApplyNewStream();
  150. CNodePtr fpbp_switch_app = CreateStreamSwitchOp(kernel_graph_ptr, switch_loop_input, kFpBpStreamSwitch);
  151. MS_EXCEPTION_IF_NULL(fpbp_switch_app);
  152. AnfAlgo::SetStreamId(*fpbp_switch_stream_id, fpbp_switch_app.get());
  153. AnfAlgo::SetNodeAttr(kStreamNeedActivedFirst, MakeValue<bool>(true), fpbp_switch_app);
  154. // update fpbp loop stream switch true_branch_stream attr
  155. AnfAlgo::SetNodeAttr(kAttrTrueBranchStream, MakeValue<uint32_t>(*fpbp_stream_id), fpbp_switch_app);
  156. AnfAlgo::SetNodeAttr(kAttrStreamSwitchKind, MakeValue<uint32_t>(kFpBpStreamSwitch), fpbp_switch_app);
  157. (*exec_order).push_back(fpbp_switch_app);
  158. MS_LOG(INFO) << "FpBp loop insert Stream Switch " << fpbp_switch_app->fullname_with_scope();
  159. }
  160. void KernelAdjust::CopyMemcpyList(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  161. const std::vector<CNodePtr> &orders, size_t order_index,
  162. std::vector<CNodePtr> *memcpy_list, std::vector<CNodePtr> *other_list) {
  163. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  164. MS_EXCEPTION_IF_NULL(memcpy_list);
  165. MS_EXCEPTION_IF_NULL(other_list);
  166. CNodePtr cur_cnode = nullptr;
  167. for (size_t idx = order_index + 1; idx < orders.size(); idx++) {
  168. cur_cnode = orders[idx];
  169. if (AnfAlgo::HasNodeAttr(kAttrLabelForInsertStreamActive, cur_cnode)) {
  170. auto pre_node = orders[idx - 1];
  171. auto pre_kernel_name = AnfAlgo::GetCNodeName(pre_node);
  172. if (pre_kernel_name == kAtomicAddrCleanOpName) {
  173. (*other_list).pop_back();
  174. (*memcpy_list).push_back(pre_node);
  175. }
  176. (*memcpy_list).emplace_back(cur_cnode);
  177. } else {
  178. (*other_list).emplace_back(cur_cnode);
  179. }
  180. }
  181. }
  182. void KernelAdjust::InsertEosDoneRecv(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  183. std::vector<CNodePtr> *exec_order, uint32_t eos_done_event_id,
  184. uint32_t fpbp_stream_id) {
  185. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  186. MS_EXCEPTION_IF_NULL(exec_order);
  187. CNodePtr eos_done_recv = CreateRecvApplyKernel(kernel_graph_ptr, eos_done_event_id);
  188. AnfAlgo::SetStreamId(fpbp_stream_id, eos_done_recv.get());
  189. (*exec_order).push_back(eos_done_recv);
  190. MS_LOG(INFO) << "FpBp loop insert EoS done Recv " << eos_done_recv->fullname_with_scope();
  191. }
  192. void KernelAdjust::InsertGetNextLoopStreamActive(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  193. std::vector<CNodePtr> *exec_order,
  194. const std::vector<uint32_t> &getnext_active_streams) {
  195. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  196. MS_EXCEPTION_IF_NULL(exec_order);
  197. CNodePtr getnext_active_app = CreateStreamActiveOp(kernel_graph_ptr);
  198. MS_EXCEPTION_IF_NULL(getnext_active_app);
  199. AnfAlgo::SetNodeAttr(kAttrActiveStreamList, MakeValue<std::vector<uint32_t>>(getnext_active_streams),
  200. getnext_active_app);
  201. (*exec_order).push_back(getnext_active_app);
  202. MS_LOG(INFO) << "FpBp loop insert GetNext loop Stream Active " << getnext_active_app->fullname_with_scope();
  203. }
  204. void KernelAdjust::InsertFpBpStartRecv(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  205. std::vector<CNodePtr> *exec_order, uint32_t fpbp_start_event_id,
  206. uint32_t fpbp_stream_id) {
  207. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  208. MS_EXCEPTION_IF_NULL(exec_order);
  209. CNodePtr fpbp_start_recv = CreateRecvApplyKernel(kernel_graph_ptr, fpbp_start_event_id);
  210. AnfAlgo::SetStreamId(fpbp_stream_id, fpbp_start_recv.get());
  211. (*exec_order).push_back(fpbp_start_recv);
  212. MS_LOG(INFO) << "FpBp loop insert FpBp start Recv " << fpbp_start_recv->fullname_with_scope();
  213. }
  214. void KernelAdjust::InsertNextLoopAssignAdd(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  215. std::vector<CNodePtr> *exec_order,
  216. const std::map<std::string, mindspore::ParameterPtr> &switch_loop_input,
  217. uint32_t fpbp_stream_id) {
  218. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  219. MS_EXCEPTION_IF_NULL(exec_order);
  220. CNodePtr assign_add_one = CreateStreamAssignAddnOP(kernel_graph_ptr, switch_loop_input, false);
  221. MS_EXCEPTION_IF_NULL(assign_add_one);
  222. AnfAlgo::SetStreamId(fpbp_stream_id, assign_add_one.get());
  223. (*exec_order).push_back(assign_add_one);
  224. MS_LOG(INFO) << "FpBp loop insert next loop AssignAdd " << assign_add_one->fullname_with_scope();
  225. }
  226. void KernelAdjust::InsertCurrentLoopAssignAdd(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  227. std::vector<CNodePtr> *exec_order,
  228. const std::map<std::string, mindspore::ParameterPtr> &switch_loop_input) {
  229. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  230. MS_EXCEPTION_IF_NULL(exec_order);
  231. CNodePtr cur_assign_add = CreateStreamAssignAddnOP(kernel_graph_ptr, switch_loop_input, true);
  232. MS_EXCEPTION_IF_NULL(cur_assign_add);
  233. AnfAlgo::SetNodeAttr(kAttrFpBpEnd, MakeValue<bool>(true), cur_assign_add);
  234. (*exec_order).push_back(cur_assign_add);
  235. MS_LOG(INFO) << "FpBp loop insert current loop AssignAdd " << cur_assign_add->fullname_with_scope();
  236. }
  237. void KernelAdjust::InsertFpBpAndEosLoopStreamActive(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  238. std::vector<CNodePtr> *exec_order,
  239. const std::vector<uint32_t> &fpbp_active_streams) {
  240. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  241. MS_EXCEPTION_IF_NULL(exec_order);
  242. CNodePtr fpbp_active_app = CreateStreamActiveOp(kernel_graph_ptr);
  243. MS_EXCEPTION_IF_NULL(fpbp_active_app);
  244. AnfAlgo::SetNodeAttr(kAttrActiveStreamList, MakeValue<std::vector<uint32_t>>(fpbp_active_streams), fpbp_active_app);
  245. (*exec_order).push_back(fpbp_active_app);
  246. MS_LOG(INFO) << "FpBp loop insert FpBp loop and Eos loop Stream Active " << fpbp_active_app->fullname_with_scope();
  247. }
  248. void KernelAdjust::InsertSwitchLoopInput(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  249. const std::map<std::string, mindspore::ParameterPtr> &switch_loop_input) {
  250. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  251. std::vector<AnfNodePtr> *mute_inputs = kernel_graph_ptr->MutableInputs();
  252. MS_EXCEPTION_IF_NULL(mute_inputs);
  253. mute_inputs->push_back(switch_loop_input.at(kCurLoopCountParamName));
  254. mute_inputs->push_back(switch_loop_input.at(kNextLoopCountParamName));
  255. mute_inputs->push_back(switch_loop_input.at(kEpochParamName));
  256. mute_inputs->push_back(switch_loop_input.at(kIterLoopParamName));
  257. mute_inputs->push_back(switch_loop_input.at(kOneParamName));
  258. for (const auto &input : kernel_graph_ptr->inputs()) {
  259. MS_EXCEPTION_IF_NULL(input);
  260. if (input->isa<Parameter>()) {
  261. ParameterPtr param_ptr = input->cast<ParameterPtr>();
  262. if (param_ptr == nullptr) {
  263. MS_EXCEPTION(NotSupportError) << "Cast to parameter point failed !";
  264. }
  265. }
  266. }
  267. }
  268. void KernelAdjust::InsertGetNextLoopStreamSwitch(
  269. const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr, std::vector<CNodePtr> *exec_order,
  270. uint32_t *getnext_switch_stream_id, uint32_t *getnext_stream_id,
  271. const std::map<std::string, mindspore::ParameterPtr> &switch_loop_input) {
  272. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  273. MS_EXCEPTION_IF_NULL(exec_order);
  274. MS_EXCEPTION_IF_NULL(getnext_switch_stream_id);
  275. MS_EXCEPTION_IF_NULL(getnext_stream_id);
  276. device::ascend::AscendResourceMng &resource_manager = device::ascend::AscendResourceMng::GetInstance();
  277. *getnext_switch_stream_id = resource_manager.ApplyNewStream();
  278. *getnext_stream_id = resource_manager.ApplyNewStream();
  279. CNodePtr getnext_switch_app = CreateStreamSwitchOp(kernel_graph_ptr, switch_loop_input, kGetNextStreamSwitch);
  280. MS_EXCEPTION_IF_NULL(getnext_switch_app);
  281. AnfAlgo::SetStreamId(*getnext_switch_stream_id, getnext_switch_app.get());
  282. // update getnext loop stream switch true_branch_stream attr
  283. AnfAlgo::SetNodeAttr(kStreamNeedActivedFirst, MakeValue<bool>(true), getnext_switch_app);
  284. AnfAlgo::SetNodeAttr(kAttrTrueBranchStream, MakeValue<uint32_t>(*getnext_stream_id), getnext_switch_app);
  285. AnfAlgo::SetNodeAttr(kAttrStreamSwitchKind, MakeValue<uint32_t>(kGetNextStreamSwitch), getnext_switch_app);
  286. (*exec_order).push_back(getnext_switch_app);
  287. MS_LOG(INFO) << "GetNext loop insert Stream Switch " << getnext_switch_app->fullname_with_scope();
  288. }
  289. void KernelAdjust::SetBeforeGetNextStreamID(std::vector<CNodePtr> *exec_order, const std::vector<CNodePtr> &orders,
  290. size_t *order_index, CNodePtr getnext_cnode, uint32_t getnext_stream_id) {
  291. MS_EXCEPTION_IF_NULL(exec_order);
  292. MS_EXCEPTION_IF_NULL(order_index);
  293. for (; *order_index < orders.size(); (*order_index)++) {
  294. auto node = orders[*order_index];
  295. (*exec_order).push_back(node);
  296. AnfAlgo::SetStreamId(getnext_stream_id, (*exec_order)[(*exec_order).size() - 1].get());
  297. if (AnfAlgo::GetCNodeName(node) == kGetNextOpName) {
  298. getnext_cnode = node;
  299. break;
  300. }
  301. }
  302. }
  303. void KernelAdjust::InsertGetNextLoopFpBpStartSend(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  304. std::vector<CNodePtr> *exec_order, uint32_t *fpbp_start_event_id,
  305. uint32_t getnext_stream_id) {
  306. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  307. MS_EXCEPTION_IF_NULL(exec_order);
  308. MS_EXCEPTION_IF_NULL(fpbp_start_event_id);
  309. device::ascend::AscendResourceMng &resource_manager = device::ascend::AscendResourceMng::GetInstance();
  310. *fpbp_start_event_id = resource_manager.ApplyNewEvent();
  311. CNodePtr fpbp_start_send = CreateSendApplyKernel(kernel_graph_ptr, *fpbp_start_event_id);
  312. AnfAlgo::SetStreamId(getnext_stream_id, fpbp_start_send.get());
  313. (*exec_order).push_back(fpbp_start_send);
  314. MS_LOG(INFO) << "GetNext loop insert FpBp start Send " << fpbp_start_send->fullname_with_scope();
  315. }
  316. void KernelAdjust::InsertGetNextLoopEosStartSend(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  317. std::vector<CNodePtr> *exec_order, uint32_t *eos_start_event_id,
  318. uint32_t getnext_stream_id) {
  319. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  320. MS_EXCEPTION_IF_NULL(exec_order);
  321. MS_EXCEPTION_IF_NULL(eos_start_event_id);
  322. device::ascend::AscendResourceMng &resource_manager = device::ascend::AscendResourceMng::GetInstance();
  323. *eos_start_event_id = resource_manager.ApplyNewEvent();
  324. CNodePtr eos_start_send = CreateSendApplyKernel(kernel_graph_ptr, *eos_start_event_id);
  325. AnfAlgo::SetStreamId(getnext_stream_id, eos_start_send.get());
  326. (*exec_order).push_back(eos_start_send);
  327. MS_LOG(INFO) << "GetNext loop insert EoS start Send " << eos_start_send->fullname_with_scope();
  328. }
  329. void KernelAdjust::InsertEosStreamSwitch(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  330. const std::map<std::string, mindspore::ParameterPtr> &switch_loop_input,
  331. std::vector<CNodePtr> *exec_order, uint32_t *eos_switch_stream_id,
  332. uint32_t *eos_stream_id) {
  333. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  334. MS_EXCEPTION_IF_NULL(exec_order);
  335. MS_EXCEPTION_IF_NULL(eos_switch_stream_id);
  336. MS_EXCEPTION_IF_NULL(eos_stream_id);
  337. device::ascend::AscendResourceMng &resource_manager = device::ascend::AscendResourceMng::GetInstance();
  338. *eos_switch_stream_id = resource_manager.ApplyNewStream();
  339. *eos_stream_id = resource_manager.ApplyNewStream();
  340. CNodePtr eos_switch_app = CreateStreamSwitchOp(kernel_graph_ptr, switch_loop_input, kEosStreamSwitch);
  341. MS_EXCEPTION_IF_NULL(eos_switch_app);
  342. AnfAlgo::SetStreamId(*eos_switch_stream_id, eos_switch_app.get());
  343. AnfAlgo::SetNodeAttr(kStreamNeedActivedFirst, MakeValue<bool>(true), eos_switch_app);
  344. // update eos loop stream switch true_branch_stream attr
  345. AnfAlgo::SetNodeAttr(kAttrTrueBranchStream, MakeValue<uint32_t>(*eos_stream_id), eos_switch_app);
  346. AnfAlgo::SetNodeAttr(kAttrStreamSwitchKind, MakeValue<uint32_t>(kEosStreamSwitch), eos_switch_app);
  347. (*exec_order).push_back(eos_switch_app);
  348. MS_LOG(INFO) << "EoS loop insert Stream Switch " << eos_switch_app->fullname_with_scope();
  349. }
  350. void KernelAdjust::InsertGetNextLoopEosStartRecv(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  351. std::vector<CNodePtr> *exec_order, uint32_t eos_start_event_id,
  352. uint32_t eos_stream_id) {
  353. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  354. MS_EXCEPTION_IF_NULL(exec_order);
  355. CNodePtr eos_start_recv = CreateRecvApplyKernel(kernel_graph_ptr, eos_start_event_id);
  356. AnfAlgo::SetStreamId(eos_stream_id, eos_start_recv.get());
  357. (*exec_order).push_back(eos_start_recv);
  358. MS_LOG(INFO) << "EoS loop insert EoS Recv " << eos_start_recv->fullname_with_scope();
  359. }
  360. void KernelAdjust::InsertEosOp(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  361. std::vector<CNodePtr> *exec_order, const CNodePtr &getnext_cnode,
  362. uint32_t eos_stream_id) {
  363. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  364. MS_EXCEPTION_IF_NULL(exec_order);
  365. MS_EXCEPTION_IF_NULL(getnext_cnode);
  366. CNodePtr end_of_sequence_op = CreateEndOfSequenceOP(kernel_graph_ptr, getnext_cnode);
  367. MS_EXCEPTION_IF_NULL(end_of_sequence_op);
  368. AnfAlgo::SetStreamId(eos_stream_id, end_of_sequence_op.get());
  369. (*exec_order).push_back(end_of_sequence_op);
  370. MS_LOG(INFO) << "EoS loop insert Eos Op " << end_of_sequence_op->fullname_with_scope();
  371. }
  372. void KernelAdjust::InsertEosDoneSend(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  373. std::vector<CNodePtr> *exec_order, uint32_t *eos_done_event_id,
  374. uint32_t eos_stream_id) {
  375. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  376. MS_EXCEPTION_IF_NULL(exec_order);
  377. MS_EXCEPTION_IF_NULL(eos_done_event_id);
  378. device::ascend::AscendResourceMng &resource_manager = device::ascend::AscendResourceMng::GetInstance();
  379. *eos_done_event_id = resource_manager.ApplyNewEvent();
  380. CNodePtr eos_done_send = CreateSendApplyKernel(kernel_graph_ptr, *eos_done_event_id);
  381. AnfAlgo::SetStreamId(eos_stream_id, eos_done_send.get());
  382. (*exec_order).push_back(eos_done_send);
  383. MS_LOG(INFO) << "EoS loop insert EoS done Send " << eos_done_send->fullname_with_scope();
  384. }
  385. void KernelAdjust::InsertSwitchLoop(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) {
  386. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  387. device::ascend::AscendResourceMng &resource_manager = device::ascend::AscendResourceMng::GetInstance();
  388. resource_manager.ResetResource();
  389. if (!NeedInsertSwitch()) {
  390. return;
  391. }
  392. if (kernel_graph_ptr->is_dynamic_shape()) {
  393. MS_LOG(INFO) << "KernelGraph:" << kernel_graph_ptr->graph_id() << " is dynamic shape, skip InsertSwitchLoop";
  394. return;
  395. }
  396. bool exist_getnext = ExistGetNext(kernel_graph_ptr);
  397. bool eos_mode = ConfigManager::GetInstance().iter_num() == INT32_MAX && exist_getnext;
  398. MS_LOG(INFO) << "GetNext exist:" << exist_getnext << " End of Sequence mode:" << eos_mode
  399. << " iter num:" << ConfigManager::GetInstance().iter_num();
  400. if (exist_getnext) {
  401. ReorderGetNext(kernel_graph_ptr);
  402. }
  403. std::map<std::string, mindspore::ParameterPtr> switch_loop_input;
  404. CreateSwitchOpParameters(kernel_graph_ptr, &switch_loop_input);
  405. InsertSwitchLoopInput(kernel_graph_ptr, switch_loop_input);
  406. const std::vector<CNodePtr> &orders = kernel_graph_ptr->execution_order();
  407. if (orders.empty()) {
  408. MS_LOG(EXCEPTION) << "graph " << kernel_graph_ptr->graph_id() << " execution order is empty";
  409. }
  410. std::vector<CNodePtr> exec_order;
  411. CNodePtr getnext_cnode;
  412. uint32_t getnext_switch_stream_id = UINT32_MAX;
  413. uint32_t fpbp_start_event_id = UINT32_MAX;
  414. uint32_t eos_start_event_id = UINT32_MAX;
  415. uint32_t getnext_stream_id = UINT32_MAX;
  416. size_t order_index = 0;
  417. if (exist_getnext) {
  418. InsertGetNextLoopStreamSwitch(kernel_graph_ptr, &exec_order, &getnext_switch_stream_id, &getnext_stream_id,
  419. switch_loop_input);
  420. SetBeforeGetNextStreamID(&exec_order, orders, &order_index, getnext_cnode, getnext_stream_id);
  421. InsertGetNextLoopFpBpStartSend(kernel_graph_ptr, &exec_order, &fpbp_start_event_id, getnext_stream_id);
  422. if (eos_mode) {
  423. InsertGetNextLoopEosStartSend(kernel_graph_ptr, &exec_order, &eos_start_event_id, getnext_stream_id);
  424. }
  425. }
  426. uint32_t eos_switch_stream_id = UINT32_MAX;
  427. uint32_t eos_stream_id = UINT32_MAX;
  428. uint32_t eos_done_event_id = UINT32_MAX;
  429. std::vector<uint32_t> fpbp_active_streams;
  430. if (eos_mode) {
  431. InsertEosStreamSwitch(kernel_graph_ptr, switch_loop_input, &exec_order, &eos_switch_stream_id, &eos_stream_id);
  432. InsertGetNextLoopEosStartRecv(kernel_graph_ptr, &exec_order, eos_start_event_id, eos_stream_id);
  433. InsertEosOp(kernel_graph_ptr, &exec_order, getnext_cnode, eos_stream_id);
  434. InsertEosDoneSend(kernel_graph_ptr, &exec_order, &eos_done_event_id, eos_stream_id);
  435. fpbp_active_streams.push_back(eos_switch_stream_id);
  436. }
  437. bool exist_independent = ExistIndependent(kernel_graph_ptr);
  438. if (exist_independent) {
  439. InsertIndepentParallel(kernel_graph_ptr, switch_loop_input, &exec_order);
  440. }
  441. uint32_t fpbp_stream_id = UINT32_MAX;
  442. uint32_t fpbp_switch_stream_id = UINT32_MAX;
  443. InsertFpBpLoopStreamSwitch(kernel_graph_ptr, switch_loop_input, &exec_order, &fpbp_stream_id, &fpbp_switch_stream_id);
  444. if (exist_getnext) {
  445. InsertFpBpStartRecv(kernel_graph_ptr, &exec_order, fpbp_start_event_id, fpbp_stream_id);
  446. }
  447. InsertNextLoopAssignAdd(kernel_graph_ptr, &exec_order, switch_loop_input, fpbp_stream_id);
  448. std::vector<CNodePtr> memcpy_list;
  449. std::vector<CNodePtr> other_list;
  450. if (exist_getnext) {
  451. CopyMemcpyList(kernel_graph_ptr, orders, order_index, &memcpy_list, &other_list);
  452. (void)std::copy(memcpy_list.begin(), memcpy_list.end(), std::back_inserter(exec_order));
  453. } else {
  454. other_list = orders;
  455. }
  456. if (eos_mode) {
  457. InsertEosDoneRecv(kernel_graph_ptr, &exec_order, eos_done_event_id, fpbp_stream_id);
  458. }
  459. std::vector<uint32_t> getnext_active_streams;
  460. if (exist_getnext) {
  461. // small loop active
  462. getnext_active_streams.push_back(getnext_switch_stream_id);
  463. InsertGetNextLoopStreamActive(kernel_graph_ptr, &exec_order, getnext_active_streams);
  464. }
  465. (void)std::copy(other_list.begin(), other_list.end(), std::back_inserter(exec_order));
  466. InsertCurrentLoopAssignAdd(kernel_graph_ptr, &exec_order, switch_loop_input);
  467. // big loop active
  468. fpbp_active_streams.push_back(fpbp_switch_stream_id);
  469. InsertFpBpAndEosLoopStreamActive(kernel_graph_ptr, &exec_order, fpbp_active_streams);
  470. kernel_graph_ptr->set_execution_order(exec_order);
  471. }
  472. void KernelAdjust::CreateSwitchOpParameters(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  473. std::map<std::string, mindspore::ParameterPtr> *switch_loop_input) {
  474. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  475. MS_EXCEPTION_IF_NULL(switch_loop_input);
  476. ShapeVector shp = {1};
  477. tensor::TensorPtr tensor_ptr = std::make_shared<tensor::Tensor>(kInt32->type_id(), shp);
  478. MS_EXCEPTION_IF_NULL(tensor_ptr);
  479. mindspore::abstract::AbstractBasePtr paremeter_abstract_ptr = tensor_ptr->ToAbstract();
  480. if (paremeter_abstract_ptr == nullptr) {
  481. MS_LOG(EXCEPTION) << "create abstract before insert switch op failed!";
  482. }
  483. ParameterPtr cur_loop_count = std::make_shared<Parameter>(kernel_graph_ptr);
  484. MS_EXCEPTION_IF_NULL(cur_loop_count);
  485. cur_loop_count->set_name(kCurLoopCountParamName);
  486. cur_loop_count->set_abstract(paremeter_abstract_ptr);
  487. ParameterPtr loop_count_cur = kernel_graph_ptr->NewParameter(cur_loop_count);
  488. (*switch_loop_input)[kCurLoopCountParamName] = loop_count_cur;
  489. ParameterPtr next_loop_count = std::make_shared<Parameter>(kernel_graph_ptr);
  490. MS_EXCEPTION_IF_NULL(next_loop_count);
  491. next_loop_count->set_name(kNextLoopCountParamName);
  492. next_loop_count->set_abstract(paremeter_abstract_ptr);
  493. ParameterPtr loop_count_next = kernel_graph_ptr->NewParameter(next_loop_count);
  494. (*switch_loop_input)[kNextLoopCountParamName] = loop_count_next;
  495. ParameterPtr iter_loop = std::make_shared<Parameter>(kernel_graph_ptr);
  496. iter_loop->set_name(kIterLoopParamName);
  497. iter_loop->set_abstract(paremeter_abstract_ptr);
  498. ParameterPtr iter_loop_new = kernel_graph_ptr->NewParameter(iter_loop);
  499. (*switch_loop_input)[kIterLoopParamName] = iter_loop_new;
  500. ParameterPtr one = std::make_shared<Parameter>(kernel_graph_ptr);
  501. one->set_name(kOneParamName);
  502. one->set_abstract(paremeter_abstract_ptr);
  503. ParameterPtr one_new = kernel_graph_ptr->NewParameter(one);
  504. (*switch_loop_input)[kOneParamName] = one_new;
  505. ParameterPtr epoch = std::make_shared<Parameter>(kernel_graph_ptr);
  506. MS_EXCEPTION_IF_NULL(epoch);
  507. epoch->set_name(kEpochParamName);
  508. epoch->set_abstract(paremeter_abstract_ptr);
  509. ParameterPtr epoch_new = kernel_graph_ptr->NewParameter(epoch);
  510. (*switch_loop_input)[kEpochParamName] = epoch_new;
  511. }
  512. kernel::KernelBuildInfo::KernelBuildInfoBuilder KernelAdjust::CreateMngKernelBuilder(
  513. const std::vector<std::string> &formats, const std::vector<TypeId> &type_ids) {
  514. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder;
  515. selected_kernel_builder.SetInputsFormat(formats);
  516. selected_kernel_builder.SetInputsDeviceType(type_ids);
  517. selected_kernel_builder.SetFusionType(kernel::FusionType::OPAQUE);
  518. selected_kernel_builder.SetProcessor(kernel::Processor::AICORE);
  519. selected_kernel_builder.SetKernelType(KernelType::RT_KERNEL);
  520. return selected_kernel_builder;
  521. }
  522. CNodePtr KernelAdjust::CreateStreamSwitchOp(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  523. const std::map<std::string, mindspore::ParameterPtr> &switch_loop_input,
  524. StreamSwitchKind kind) {
  525. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder = CreateMngKernelBuilder(
  526. {kOpFormat_DEFAULT, kOpFormat_DEFAULT}, {TypeId::kNumberTypeInt32, TypeId::kNumberTypeInt32});
  527. auto typeNone_abstract = std::make_shared<abstract::AbstractNone>();
  528. auto stream_switch = std::make_shared<Primitive>(kStreamSwitchOpName);
  529. std::vector<AnfNodePtr> inputs;
  530. inputs.push_back(NewValueNode(stream_switch));
  531. if (kind == kFpBpStreamSwitch || kind == kEosStreamSwitch) {
  532. inputs.push_back(switch_loop_input.at(kNextLoopCountParamName));
  533. } else if (kind == kGetNextStreamSwitch || kind == kIndependentStreamSwitch) {
  534. inputs.push_back(switch_loop_input.at(kNextLoopCountParamName));
  535. } else {
  536. MS_LOG(ERROR) << "unknown stream switch kind: " << kind;
  537. }
  538. inputs.push_back(switch_loop_input.at(kIterLoopParamName));
  539. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  540. CNodePtr stream_switch_app = kernel_graph_ptr->NewCNode(inputs);
  541. MS_EXCEPTION_IF_NULL(stream_switch_app);
  542. AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), stream_switch_app.get());
  543. stream_switch_app->set_abstract(typeNone_abstract);
  544. // set attr: cond_ RT_LESS
  545. int condition = static_cast<int>(RT_LESS);
  546. ValuePtr cond = MakeValue(condition);
  547. AnfAlgo::SetNodeAttr(kAttrSwitchCondition, cond, stream_switch_app);
  548. // set attr:data_type
  549. int data_type = static_cast<int>(RT_SWITCH_INT64);
  550. ValuePtr dt = MakeValue(data_type);
  551. AnfAlgo::SetNodeAttr(kAttrDataType, dt, stream_switch_app);
  552. // set distinction label and graph id
  553. return stream_switch_app;
  554. }
  555. CNodePtr KernelAdjust::CreateStreamActiveOp(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) {
  556. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder = CreateMngKernelBuilder(
  557. {kOpFormat_DEFAULT, kOpFormat_DEFAULT}, {TypeId::kNumberTypeInt32, TypeId::kNumberTypeInt32});
  558. abstract::AbstractBasePtr typeNone_abstract = std::make_shared<abstract::AbstractNone>();
  559. auto stream_active_others = std::make_shared<Primitive>(kStreamActiveOpName);
  560. std::vector<AnfNodePtr> inputs;
  561. inputs.push_back(NewValueNode(stream_active_others));
  562. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  563. CNodePtr stream_active_others_app = kernel_graph_ptr->NewCNode(inputs);
  564. MS_EXCEPTION_IF_NULL(stream_active_others_app);
  565. AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), stream_active_others_app.get());
  566. stream_active_others_app->set_abstract(typeNone_abstract);
  567. return stream_active_others_app;
  568. }
  569. CNodePtr KernelAdjust::CreatTupleGetItemNode(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  570. const CNodePtr &node, size_t output_idx) {
  571. auto idx = NewValueNode(SizeToLong(output_idx));
  572. MS_EXCEPTION_IF_NULL(idx);
  573. auto imm = std::make_shared<Int64Imm>(SizeToInt(output_idx));
  574. auto abstract_scalar = std::make_shared<abstract::AbstractScalar>(imm);
  575. idx->set_abstract(abstract_scalar);
  576. CNodePtr tuple_getitem = kernel_graph_ptr->NewCNode({NewValueNode(prim::kPrimTupleGetItem), node, idx});
  577. MS_EXCEPTION_IF_NULL(tuple_getitem);
  578. tuple_getitem->set_scope(node->scope());
  579. std::vector<size_t> origin_shape = AnfAlgo::GetOutputInferShape(node, output_idx);
  580. TypeId origin_type = AnfAlgo::GetOutputInferDataType(node, output_idx);
  581. AnfAlgo::SetOutputInferTypeAndShape({origin_type}, {origin_shape}, tuple_getitem.get());
  582. return tuple_getitem;
  583. }
  584. CNodePtr KernelAdjust::CreateEndOfSequenceOP(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  585. const CNodePtr &getnext_cnode) {
  586. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  587. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder;
  588. selected_kernel_builder.SetInputsFormat({kOpFormat_DEFAULT});
  589. selected_kernel_builder.SetInputsDeviceType({kNumberTypeUInt8});
  590. selected_kernel_builder.SetFusionType(kernel::FusionType::OPAQUE);
  591. selected_kernel_builder.SetProcessor(kernel::Processor::AICPU);
  592. selected_kernel_builder.SetKernelType(KernelType::AICPU_KERNEL);
  593. selected_kernel_builder.SetOutputsFormat({kOpFormat_DEFAULT});
  594. selected_kernel_builder.SetOutputsDeviceType({kNumberTypeUInt8});
  595. // EndOfSequence
  596. auto end_of_sequence = std::make_shared<Primitive>(kEndOfSequence);
  597. std::vector<AnfNodePtr> inputs;
  598. inputs.push_back(NewValueNode(end_of_sequence));
  599. // GetNext output 0 is EndOfSequence's input
  600. auto tuple_get_item = CreatTupleGetItemNode(kernel_graph_ptr, getnext_cnode, 0);
  601. inputs.push_back(tuple_get_item);
  602. CNodePtr end_of_sequence_node = kernel_graph_ptr->NewCNode(inputs);
  603. MS_EXCEPTION_IF_NULL(end_of_sequence_node);
  604. AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), end_of_sequence_node.get());
  605. std::vector<std::string> input_names = {"x"};
  606. ValuePtr input_names_v = MakeValue(input_names);
  607. AnfAlgo::SetNodeAttr("input_names", input_names_v, end_of_sequence_node);
  608. std::vector<std::string> output_names = {"y"};
  609. ValuePtr output_names_v = MakeValue(output_names);
  610. AnfAlgo::SetNodeAttr("output_names", output_names_v, end_of_sequence_node);
  611. end_of_sequence_node->set_abstract(tuple_get_item->abstract());
  612. return end_of_sequence_node;
  613. }
  614. CNodePtr KernelAdjust::CreateStreamAssignAddnOP(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  615. const std::map<std::string, mindspore::ParameterPtr> &switch_loop_input,
  616. bool cur_loop) {
  617. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  618. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder = CreateMngKernelBuilder(
  619. {kOpFormat_DEFAULT, kOpFormat_DEFAULT}, {TypeId::kNumberTypeInt32, TypeId::kNumberTypeInt32});
  620. selected_kernel_builder.SetOutputsFormat({kOpFormat_DEFAULT});
  621. selected_kernel_builder.SetOutputsDeviceType({kNumberTypeInt32});
  622. // AssignAdd
  623. auto assign_add = std::make_shared<Primitive>(kAssignAddOpName);
  624. std::vector<AnfNodePtr> inputs;
  625. inputs.push_back(NewValueNode(assign_add));
  626. if (cur_loop) {
  627. inputs.push_back(switch_loop_input.at(kCurLoopCountParamName));
  628. } else {
  629. inputs.push_back(switch_loop_input.at(kNextLoopCountParamName));
  630. }
  631. inputs.push_back(switch_loop_input.at(kOneParamName));
  632. CNodePtr assign_add_one = kernel_graph_ptr->NewCNode(inputs);
  633. MS_EXCEPTION_IF_NULL(assign_add_one);
  634. AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), assign_add_one.get());
  635. std::vector<std::string> input_names = {"ref", "value"};
  636. std::vector<std::string> output_names = {"output"};
  637. ValuePtr input_names_v = MakeValue(input_names);
  638. ValuePtr output_names_v = MakeValue(output_names);
  639. AnfAlgo::SetNodeAttr("input_names", input_names_v, assign_add_one);
  640. AnfAlgo::SetNodeAttr("output_names", output_names_v, assign_add_one);
  641. selected_kernel_builder.SetKernelType(KernelType::TBE_KERNEL);
  642. MS_EXCEPTION_IF_NULL(switch_loop_input.at(kCurLoopCountParamName));
  643. assign_add_one->set_abstract(switch_loop_input.at(kCurLoopCountParamName)->abstract());
  644. // add AssignAdd op to kernel ref node map
  645. session::AnfWithOutIndex final_pair = std::make_pair(assign_add_one, 0);
  646. session::KernelWithIndex kernel_with_index = AnfAlgo::VisitKernel(AnfAlgo::GetInputNode(assign_add_one, 0), 0);
  647. kernel_graph_ptr->AddRefCorrespondPairs(final_pair, kernel_with_index);
  648. return assign_add_one;
  649. }
  650. bool KernelAdjust::StepLoadCtrlInputs(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) {
  651. if (!NeedInsertSwitch()) {
  652. return true;
  653. }
  654. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  655. if (kernel_graph_ptr->is_dynamic_shape()) {
  656. MS_LOG(INFO) << "Skip StepLoadCtrlInputs";
  657. return true;
  658. }
  659. auto input_nodes = kernel_graph_ptr->inputs();
  660. std::vector<tensor::TensorPtr> inputs;
  661. LoadSwitchInputs(&inputs);
  662. std::shared_ptr<std::vector<tensor::TensorPtr>> inputsPtr = std::make_shared<std::vector<tensor::TensorPtr>>(inputs);
  663. kernel_graph_ptr->set_input_ctrl_tensors(inputsPtr);
  664. size_t input_ctrl_size = inputs.size();
  665. // inputs_node:include four ctrl nodes in the back. such as:conv,loop_cnt, ites_loop, zero, one.
  666. // deal four ctrl nodes.
  667. for (size_t i = 0; i < inputs.size(); ++i) {
  668. auto tensor = inputs[i];
  669. MS_EXCEPTION_IF_NULL(tensor);
  670. size_t deal_index = input_nodes.size() - input_ctrl_size + i;
  671. if (deal_index >= input_nodes.size()) {
  672. MS_LOG(EXCEPTION) << "deal_index[" << deal_index << "] out of range";
  673. }
  674. auto input_node = input_nodes[deal_index];
  675. bool need_sync = false;
  676. MS_EXCEPTION_IF_NULL(input_node);
  677. if (input_node->isa<Parameter>()) {
  678. auto pk_node = input_node->cast<ParameterPtr>();
  679. MS_EXCEPTION_IF_NULL(pk_node);
  680. if (tensor->NeedSyncHostToDevice() || !pk_node->has_default()) {
  681. need_sync = true;
  682. }
  683. }
  684. if (need_sync) {
  685. auto pk_node = input_node->cast<ParameterPtr>();
  686. MS_EXCEPTION_IF_NULL(pk_node);
  687. auto device_address = AnfAlgo::GetMutableOutputAddr(pk_node, 0);
  688. MS_EXCEPTION_IF_NULL(device_address);
  689. tensor->set_device_address(device_address);
  690. if (!device_address->SyncHostToDevice(trans::GetRuntimePaddingShape(pk_node, 0),
  691. LongToSize(tensor->data().nbytes()), tensor->data_type(), tensor->data_c(),
  692. tensor->device_info().host_format_)) {
  693. MS_LOG(INFO) << "SyncHostToDevice failed.";
  694. return false;
  695. }
  696. }
  697. tensor->set_sync_status(kNoNeedSync);
  698. }
  699. return true;
  700. }
  701. void KernelAdjust::LoadSwitchInputs(std::vector<tensor::TensorPtr> *inputs) {
  702. MS_LOG(INFO) << "---------------- LoadSwitchInputs---";
  703. MS_EXCEPTION_IF_NULL(inputs);
  704. // current loop count
  705. ShapeVector shp = {1};
  706. tensor::TensorPtr cur_loop_count = std::make_shared<tensor::Tensor>(kInt32->type_id(), shp);
  707. MS_EXCEPTION_IF_NULL(cur_loop_count);
  708. int32_t *val = nullptr;
  709. val = static_cast<int32_t *>(cur_loop_count->data_c());
  710. MS_EXCEPTION_IF_NULL(val);
  711. *val = 0;
  712. inputs->push_back(cur_loop_count);
  713. // next loop count
  714. tensor::TensorPtr next_loop_count = std::make_shared<tensor::Tensor>(kInt32->type_id(), shp);
  715. MS_EXCEPTION_IF_NULL(next_loop_count);
  716. val = static_cast<int32_t *>(next_loop_count->data_c());
  717. MS_EXCEPTION_IF_NULL(val);
  718. *val = 0;
  719. inputs->push_back(next_loop_count);
  720. // Epoch in device
  721. tensor::TensorPtr epoch_tensor = std::make_shared<tensor::Tensor>(kInt32->type_id(), shp);
  722. MS_EXCEPTION_IF_NULL(epoch_tensor);
  723. val = static_cast<int32_t *>(epoch_tensor->data_c());
  724. MS_EXCEPTION_IF_NULL(val);
  725. *val = 0;
  726. inputs->push_back(epoch_tensor);
  727. // total loop count per iter
  728. tensor::TensorPtr iter_loop_tensor = std::make_shared<tensor::Tensor>(kInt32->type_id(), shp);
  729. MS_EXCEPTION_IF_NULL(iter_loop_tensor);
  730. val = static_cast<int32_t *>(iter_loop_tensor->data_c());
  731. MS_EXCEPTION_IF_NULL(val);
  732. *val = SizeToInt(LongToSize(ConfigManager::GetInstance().iter_num()));
  733. MS_LOG(INFO) << "iter_loop_tensor = " << *val;
  734. inputs->push_back(iter_loop_tensor);
  735. tensor::TensorPtr one_tensor = std::make_shared<tensor::Tensor>(kInt32->type_id(), shp);
  736. MS_EXCEPTION_IF_NULL(one_tensor);
  737. val = static_cast<int32_t *>(one_tensor->data_c());
  738. MS_EXCEPTION_IF_NULL(val);
  739. *val = 1;
  740. inputs->push_back(one_tensor);
  741. MS_LOG(INFO) << "---------------- LoadSwitchInputs End--";
  742. }
  743. void KernelAdjust::Profiling(NotNull<session::KernelGraph *> kernel_graph_ptr) {
  744. if (!ascend::ProfilingManager::GetInstance().IsProfiling()) {
  745. MS_LOG(INFO) << "No need to profiling";
  746. return;
  747. }
  748. auto graph_id_env = std::getenv(kProfilingGraphId);
  749. if (graph_id_env != nullptr) {
  750. auto graph_id = std::stoul(graph_id_env);
  751. if (graph_id != kernel_graph_ptr->graph_id()) {
  752. MS_LOG(WARNING) << "Get PROFILING_GRAPH_ID " << graph_id
  753. << " Not Match Current Graph Id:" << kernel_graph_ptr->graph_id();
  754. return;
  755. }
  756. }
  757. ProfilingTraceInfo profiling_trace_info = ProfilingUtils::GenerateProfilingTrace(*kernel_graph_ptr);
  758. if (!profiling_trace_info.IsValid()) {
  759. MS_LOG(INFO) << "[profiling] no profiling node found!";
  760. return;
  761. }
  762. InsertProfilingKernel(profiling_trace_info, kernel_graph_ptr);
  763. }
  764. void KernelAdjust::InsertProfilingKernel(const ProfilingTraceInfo &profiling_trace_info,
  765. NotNull<session::KernelGraph *> kernel_graph_ptr) {
  766. MS_LOG(INFO) << "[profiling] Insert profiling kernel start";
  767. if (!profiling_trace_info.IsValid()) {
  768. MS_LOG(WARNING) << "Profiling trace point not found";
  769. return;
  770. }
  771. std::vector<CNodePtr> new_cnode_list;
  772. std::vector<CNodePtr> cnode_ptr_list = kernel_graph_ptr->execution_order();
  773. if (cnode_ptr_list.empty()) {
  774. MS_LOG(ERROR) << "No CNode in graph " << kernel_graph_ptr->graph_id();
  775. return;
  776. }
  777. for (const auto &cnode_ptr : cnode_ptr_list) {
  778. ProfilingUtils::InsertProfilingTraceFp(cnode_ptr, profiling_trace_info, kernel_graph_ptr,
  779. NOT_NULL(&new_cnode_list));
  780. new_cnode_list.emplace_back(cnode_ptr);
  781. ProfilingUtils::InsertProfilingCustomOp(cnode_ptr, profiling_trace_info, kernel_graph_ptr,
  782. NOT_NULL(&new_cnode_list));
  783. ProfilingUtils::InsertProfilingTraceBpEnd(cnode_ptr, profiling_trace_info, kernel_graph_ptr,
  784. NOT_NULL(&new_cnode_list));
  785. ProfilingUtils::InsertProfilingTraceIterEnd(cnode_ptr, profiling_trace_info, kernel_graph_ptr,
  786. NOT_NULL(&new_cnode_list));
  787. }
  788. kernel_graph_ptr->set_execution_order(new_cnode_list);
  789. }
  790. CNodePtr KernelAdjust::CreateNPUGetFloatStatus(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  791. const CNodePtr &npu_alloc_cnode) {
  792. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  793. MS_EXCEPTION_IF_NULL(npu_alloc_cnode);
  794. auto npu_get_primitive = std::make_shared<Primitive>(kNPUGetFloatStatusOpName);
  795. std::vector<AnfNodePtr> npu_get_inputs = {NewValueNode(npu_get_primitive), npu_alloc_cnode};
  796. auto npu_get_cnode = kernel_graph_ptr->NewCNode(npu_get_inputs);
  797. MS_EXCEPTION_IF_NULL(npu_get_cnode);
  798. npu_alloc_cnode->set_scope(kDefaultScope);
  799. npu_get_cnode->set_abstract(npu_alloc_cnode->abstract());
  800. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder;
  801. selected_kernel_builder.SetInputsFormat({kOpFormat_DEFAULT});
  802. selected_kernel_builder.SetInputsDeviceType({kNumberTypeFloat32});
  803. selected_kernel_builder.SetFusionType(kernel::FusionType::OPAQUE);
  804. selected_kernel_builder.SetProcessor(kernel::Processor::AICORE);
  805. selected_kernel_builder.SetKernelType(KernelType::TBE_KERNEL);
  806. selected_kernel_builder.SetOutputsFormat({kOpFormat_DEFAULT});
  807. selected_kernel_builder.SetOutputsDeviceType({kNumberTypeFloat32});
  808. AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), npu_get_cnode.get());
  809. return npu_get_cnode;
  810. }
  811. CNodePtr KernelAdjust::CreateNPUClearStatus(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  812. const CNodePtr &npu_alloc_cnode) {
  813. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  814. MS_EXCEPTION_IF_NULL(npu_alloc_cnode);
  815. auto npu_clear_primitive = std::make_shared<Primitive>(kNPUClearFloatStatusOpName);
  816. std::vector<AnfNodePtr> npu_clear_inputs = {NewValueNode(npu_clear_primitive), npu_alloc_cnode};
  817. auto npu_clear_cnode = kernel_graph_ptr->NewCNode(npu_clear_inputs);
  818. MS_EXCEPTION_IF_NULL(npu_clear_cnode);
  819. npu_alloc_cnode->set_scope(kDefaultScope);
  820. npu_clear_cnode->set_abstract(npu_alloc_cnode->abstract());
  821. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder;
  822. selected_kernel_builder.SetInputsFormat({kOpFormat_DEFAULT});
  823. selected_kernel_builder.SetInputsDeviceType({kNumberTypeFloat32});
  824. selected_kernel_builder.SetFusionType(kernel::FusionType::OPAQUE);
  825. selected_kernel_builder.SetProcessor(kernel::Processor::AICORE);
  826. selected_kernel_builder.SetKernelType(KernelType::TBE_KERNEL);
  827. selected_kernel_builder.SetOutputsFormat({kOpFormat_DEFAULT});
  828. selected_kernel_builder.SetOutputsDeviceType({kNumberTypeFloat32});
  829. AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), npu_clear_cnode.get());
  830. return npu_clear_cnode;
  831. }
  832. CNodePtr KernelAdjust::CreateNPUAllocStatus(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) {
  833. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  834. // create npu_alloc_cnode
  835. auto npu_alloc_primitive = std::make_shared<Primitive>(kNPUAllocFloatStatusOpName);
  836. std::vector<AnfNodePtr> npu_alloc_inputs = {NewValueNode(npu_alloc_primitive)};
  837. auto npu_alloc_cnode = kernel_graph_ptr->NewCNode(npu_alloc_inputs);
  838. MS_EXCEPTION_IF_NULL(npu_alloc_cnode);
  839. npu_alloc_cnode->set_scope(kDefaultScope);
  840. std::vector<size_t> npu_output_shape = {kNPUShape};
  841. AnfAlgo::SetOutputInferTypeAndShape({kNumberTypeFloat32}, {npu_output_shape}, npu_alloc_cnode.get());
  842. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder;
  843. selected_kernel_builder.SetFusionType(kernel::FusionType::OPAQUE);
  844. selected_kernel_builder.SetProcessor(kernel::Processor::AICORE);
  845. selected_kernel_builder.SetKernelType(KernelType::TBE_KERNEL);
  846. selected_kernel_builder.SetOutputsFormat({kOpFormat_DEFAULT});
  847. selected_kernel_builder.SetOutputsDeviceType({kNumberTypeFloat32});
  848. AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), npu_alloc_cnode.get());
  849. return npu_alloc_cnode;
  850. }
  851. CNodePtr KernelAdjust::CreateAssignAdd(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  852. const CNodePtr &npu_alloc_cnode, const AnfNodePtr &specify_para) {
  853. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  854. MS_EXCEPTION_IF_NULL(npu_alloc_cnode);
  855. MS_EXCEPTION_IF_NULL(specify_para);
  856. auto assign_add_primitive = std::make_shared<Primitive>(kAssignAddOpName);
  857. std::vector<AnfNodePtr> assign_add_inputs = {NewValueNode(assign_add_primitive), specify_para, npu_alloc_cnode};
  858. auto assign_add_cnode = kernel_graph_ptr->NewCNode(assign_add_inputs);
  859. MS_EXCEPTION_IF_NULL(assign_add_cnode);
  860. assign_add_cnode->set_scope(kDefaultScope);
  861. assign_add_cnode->set_abstract(specify_para->abstract());
  862. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder = CreateMngKernelBuilder(
  863. {kOpFormat_DEFAULT, kOpFormat_DEFAULT}, {TypeId::kNumberTypeFloat32, TypeId::kNumberTypeFloat32});
  864. selected_kernel_builder.SetOutputsFormat({kOpFormat_DEFAULT});
  865. selected_kernel_builder.SetOutputsDeviceType({kNumberTypeFloat32});
  866. AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), assign_add_cnode.get());
  867. std::vector<std::string> input_names = {"ref", "value"};
  868. std::vector<std::string> output_names = {"output"};
  869. ValuePtr input_names_v = MakeValue(input_names);
  870. ValuePtr output_names_v = MakeValue(output_names);
  871. AnfAlgo::SetNodeAttr("input_names", input_names_v, assign_add_cnode);
  872. AnfAlgo::SetNodeAttr("output_names", output_names_v, assign_add_cnode);
  873. selected_kernel_builder.SetKernelType(KernelType::TBE_KERNEL);
  874. session::AnfWithOutIndex final_pair = std::make_pair(assign_add_cnode, 0);
  875. session::KernelWithIndex kernel_with_index = AnfAlgo::VisitKernel(AnfAlgo::GetInputNode(assign_add_cnode, 0), 0);
  876. kernel_graph_ptr->AddRefCorrespondPairs(final_pair, kernel_with_index);
  877. return assign_add_cnode;
  878. }
  879. CNodePtr KernelAdjust::CreateAssign(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  880. const AnfNodePtr &specify_para) {
  881. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  882. MS_EXCEPTION_IF_NULL(specify_para);
  883. std::vector<float> reset(kNPUShape, 0.0);
  884. ShapeVector reset_shape({static_cast<int64_t>(kNPUShape)});
  885. auto shp_buf_size = sizeof(float) * reset.size();
  886. auto reset_tensor = std::make_shared<tensor::Tensor>(kNumberTypeFloat32, reset_shape, reset.data(), shp_buf_size);
  887. auto reset_value_node = std::make_shared<ValueNode>(reset_tensor);
  888. MS_EXCEPTION_IF_NULL(reset_value_node);
  889. reset_value_node->set_abstract(specify_para->abstract());
  890. kernel_graph_ptr->AddValueNodeToGraph(reset_value_node);
  891. auto kernel_info = std::make_shared<device::KernelInfo>();
  892. MS_EXCEPTION_IF_NULL(kernel_info);
  893. reset_value_node->set_kernel_info(kernel_info);
  894. kernel::KernelBuildInfo::KernelBuildInfoBuilder builder1;
  895. builder1.SetOutputsFormat({kOpFormat_DEFAULT});
  896. builder1.SetOutputsDeviceType({kNumberTypeFloat32});
  897. AnfAlgo::SetSelectKernelBuildInfo(builder1.Build(), reset_value_node.get());
  898. auto assign_primitive = std::make_shared<Primitive>(kAssignOpName);
  899. std::vector<AnfNodePtr> assign_inputs = {NewValueNode(assign_primitive), specify_para, reset_value_node};
  900. auto assign_cnode = kernel_graph_ptr->NewCNode(assign_inputs);
  901. MS_EXCEPTION_IF_NULL(assign_cnode);
  902. assign_cnode->set_scope(kDefaultScope);
  903. assign_cnode->set_abstract(specify_para->abstract());
  904. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder = CreateMngKernelBuilder(
  905. {kOpFormat_DEFAULT, kOpFormat_DEFAULT}, {TypeId::kNumberTypeFloat32, TypeId::kNumberTypeFloat32});
  906. selected_kernel_builder.SetOutputsFormat({kOpFormat_DEFAULT});
  907. selected_kernel_builder.SetOutputsDeviceType({kNumberTypeFloat32});
  908. AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), assign_cnode.get());
  909. std::vector<std::string> input_names = {"ref", "value"};
  910. std::vector<std::string> output_names = {"output"};
  911. ValuePtr input_names_v = MakeValue(input_names);
  912. ValuePtr output_names_v = MakeValue(output_names);
  913. AnfAlgo::SetNodeAttr("input_names", input_names_v, assign_cnode);
  914. AnfAlgo::SetNodeAttr("output_names", output_names_v, assign_cnode);
  915. selected_kernel_builder.SetKernelType(KernelType::TBE_KERNEL);
  916. session::AnfWithOutIndex final_pair = std::make_pair(assign_cnode, 0);
  917. session::KernelWithIndex kernel_with_index = AnfAlgo::VisitKernel(AnfAlgo::GetInputNode(assign_cnode, 0), 0);
  918. kernel_graph_ptr->AddRefCorrespondPairs(final_pair, kernel_with_index);
  919. return assign_cnode;
  920. }
  921. void KernelAdjust::InsertOverflowCheckOperations(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) {
  922. MS_LOG(INFO) << "Start Insert Overflow Check Operations.";
  923. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  924. auto parameters = kernel_graph_ptr->parameters();
  925. AnfNodePtr specify_para;
  926. bool not_find = true;
  927. for (size_t i = 0; i < parameters.size(); i++) {
  928. auto para_fullname = parameters[i]->fullname_with_scope();
  929. if (para_fullname.find(kSpecifyParameter) != std::string::npos) {
  930. not_find = false;
  931. specify_para = parameters[i];
  932. break;
  933. }
  934. }
  935. if (not_find) {
  936. MS_LOG(INFO) << "Not find parameter named " << kSpecifyParameter;
  937. return;
  938. }
  939. bool first_grad_op = true;
  940. CNodePtr npu_alloc_cnode;
  941. std::vector<CNodePtr> new_execution_order;
  942. auto execution_order = kernel_graph_ptr->execution_order();
  943. for (size_t i = 0; i < execution_order.size() - 1; i++) {
  944. new_execution_order.push_back(execution_order[i]);
  945. auto cur_full_name = execution_order[i]->fullname_with_scope();
  946. auto next_full_name = execution_order[i + 1]->fullname_with_scope();
  947. auto cur_stream_id = AnfAlgo::GetStreamId(execution_order[i]);
  948. auto next_stream_id = AnfAlgo::GetStreamId(execution_order[i + 1]);
  949. if (cur_full_name.find(kGradients) == std::string::npos && next_full_name.find(kGradients) != std::string::npos) {
  950. if (first_grad_op) {
  951. npu_alloc_cnode = CreateNPUAllocStatus(kernel_graph_ptr);
  952. auto npu_clear_cnode = CreateNPUClearStatus(kernel_graph_ptr, npu_alloc_cnode);
  953. auto assign_cnode = CreateAssign(kernel_graph_ptr, specify_para);
  954. AnfAlgo::SetStreamId(next_stream_id, npu_alloc_cnode.get());
  955. AnfAlgo::SetStreamId(next_stream_id, npu_clear_cnode.get());
  956. AnfAlgo::SetStreamId(next_stream_id, assign_cnode.get());
  957. new_execution_order.push_back(npu_alloc_cnode);
  958. new_execution_order.push_back(npu_clear_cnode);
  959. new_execution_order.push_back(assign_cnode);
  960. first_grad_op = false;
  961. } else {
  962. auto npu_clear_cnode = CreateNPUClearStatus(kernel_graph_ptr, npu_alloc_cnode);
  963. AnfAlgo::SetStreamId(next_stream_id, npu_clear_cnode.get());
  964. new_execution_order.push_back(npu_clear_cnode);
  965. }
  966. }
  967. if (cur_full_name.find(kGradients) != std::string::npos && next_full_name.find(kGradients) == std::string::npos) {
  968. auto npu_get_cnode = CreateNPUGetFloatStatus(kernel_graph_ptr, npu_alloc_cnode);
  969. auto assign_add_cnode = CreateAssignAdd(kernel_graph_ptr, npu_alloc_cnode, specify_para);
  970. AnfAlgo::SetStreamId(cur_stream_id, npu_get_cnode.get());
  971. AnfAlgo::SetStreamId(cur_stream_id, assign_add_cnode.get());
  972. new_execution_order.push_back(npu_get_cnode);
  973. new_execution_order.push_back(assign_add_cnode);
  974. }
  975. if (i == execution_order.size() - 2) {
  976. new_execution_order.push_back(execution_order[i + 1]);
  977. if (next_full_name.find(kGradients) != std::string::npos) {
  978. auto npu_get_cnode = CreateNPUGetFloatStatus(kernel_graph_ptr, npu_alloc_cnode);
  979. auto assign_add_cnode = CreateAssignAdd(kernel_graph_ptr, npu_alloc_cnode, specify_para);
  980. AnfAlgo::SetStreamId(cur_stream_id, npu_get_cnode.get());
  981. AnfAlgo::SetStreamId(cur_stream_id, assign_add_cnode.get());
  982. new_execution_order.push_back(npu_get_cnode);
  983. new_execution_order.push_back(assign_add_cnode);
  984. }
  985. }
  986. }
  987. kernel_graph_ptr->set_execution_order(new_execution_order);
  988. }
  989. } // namespace device
  990. } // namespace mindspore