You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

kernel_adjust.cc 27 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "runtime/device/kernel_adjust.h"
  17. #include <map>
  18. #include <algorithm>
  19. #include <string>
  20. #include <unordered_set>
  21. #include <unordered_map>
  22. #include <vector>
  23. #include "backend/session/anf_runtime_algorithm.h"
  24. #include "utils/context/ms_context.h"
  25. #include "common/trans.h"
  26. #include "utils/config_manager.h"
  27. #include "common/utils.h"
  28. #include "backend/kernel_compiler/kernel_build_info.h"
  29. #include "utils/utils.h"
  30. #include "runtime/device/ascend/profiling/profiling_manager.h"
  31. #include "runtime/device/ascend/kernel_select_ascend.h"
  32. #include "runtime/base.h"
  33. #include "runtime/device/ascend/ascend_stream_assign.h"
  34. namespace mindspore {
  35. namespace device {
  36. using device::ascend::ProfilingUtils;
  37. void KernelAdjust::ReorderGetNext(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) {
  38. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  39. const std::vector<CNodePtr> &origin_cnode_list = kernel_graph_ptr->execution_order();
  40. std::vector<CNodePtr> getnext_list;
  41. std::vector<CNodePtr> other_list;
  42. for (const auto &cnode : origin_cnode_list) {
  43. if (AnfAlgo::GetCNodeName(cnode) == kGetNextOpName) {
  44. getnext_list.emplace_back(cnode);
  45. } else {
  46. other_list.emplace_back(cnode);
  47. }
  48. }
  49. std::vector<CNodePtr> new_order_list;
  50. new_order_list.insert(new_order_list.end(), getnext_list.begin(), getnext_list.end());
  51. new_order_list.insert(new_order_list.end(), other_list.begin(), other_list.end());
  52. kernel_graph_ptr->set_execution_order(new_order_list);
  53. }
  54. bool KernelAdjust::NeedInsertSwitch() {
  55. auto context_ptr = MsContext::GetInstance();
  56. MS_EXCEPTION_IF_NULL(context_ptr);
  57. return (context_ptr->enable_task_sink() && context_ptr->loop_sink_flag() &&
  58. ConfigManager::GetInstance().iter_num() > 1);
  59. }
  60. CNodePtr KernelAdjust::CreateSendApplyKernel(const std::shared_ptr<session::KernelGraph> &graph_ptr,
  61. uint32_t event_id) {
  62. MS_EXCEPTION_IF_NULL(graph_ptr);
  63. auto send_op = std::make_shared<Primitive>(kSendOpName);
  64. MS_EXCEPTION_IF_NULL(send_op);
  65. auto send_apply = std::make_shared<ValueNode>(send_op);
  66. MS_EXCEPTION_IF_NULL(send_apply);
  67. std::vector<AnfNodePtr> send_input_list = {send_apply};
  68. CNodePtr send_node_ptr = graph_ptr->NewCNode(send_input_list);
  69. MS_EXCEPTION_IF_NULL(send_node_ptr);
  70. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder;
  71. selected_kernel_builder.SetKernelType(KernelType::RT_KERNEL);
  72. AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), send_node_ptr.get());
  73. AnfAlgo::SetNodeAttr(kAttrEventId, MakeValue(event_id), send_node_ptr);
  74. auto abstract_none = std::make_shared<abstract::AbstractNone>();
  75. MS_EXCEPTION_IF_NULL(abstract_none);
  76. send_node_ptr->set_abstract(abstract_none);
  77. return send_node_ptr;
  78. }
  79. CNodePtr KernelAdjust::CreateRecvApplyKernel(const std::shared_ptr<session::KernelGraph> &graph_ptr,
  80. uint32_t event_id) {
  81. MS_EXCEPTION_IF_NULL(graph_ptr);
  82. auto recv_op = std::make_shared<Primitive>(kRecvOpName);
  83. MS_EXCEPTION_IF_NULL(recv_op);
  84. auto recv_apply = std::make_shared<ValueNode>(recv_op);
  85. MS_EXCEPTION_IF_NULL(recv_apply);
  86. std::vector<AnfNodePtr> recv_input_list = {recv_apply};
  87. CNodePtr recv_node_ptr = graph_ptr->NewCNode(recv_input_list);
  88. MS_EXCEPTION_IF_NULL(recv_node_ptr);
  89. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder;
  90. selected_kernel_builder.SetKernelType(KernelType::RT_KERNEL);
  91. AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), recv_node_ptr.get());
  92. AnfAlgo::SetNodeAttr(kAttrEventId, MakeValue(event_id), recv_node_ptr);
  93. auto abstract_none = std::make_shared<abstract::AbstractNone>();
  94. MS_EXCEPTION_IF_NULL(abstract_none);
  95. recv_node_ptr->set_abstract(abstract_none);
  96. return recv_node_ptr;
  97. }
  98. void KernelAdjust::InsertSwitchLoop(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) {
  99. device::ascend::AscendResourceMng &resource_manager = device::ascend::AscendResourceMng::GetInstance();
  100. resource_manager.ResetResource();
  101. if (!NeedInsertSwitch()) {
  102. return;
  103. }
  104. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  105. bool eos_mode = ConfigManager::GetInstance().iter_num() == INT32_MAX;
  106. ReorderGetNext(kernel_graph_ptr);
  107. std::map<std::string, mindspore::ParameterPtr> switch_loop_input;
  108. CreateSwitchOpParameters(kernel_graph_ptr, &switch_loop_input);
  109. std::vector<AnfNodePtr> *mute_inputs = kernel_graph_ptr->MutableInputs();
  110. MS_EXCEPTION_IF_NULL(mute_inputs);
  111. mute_inputs->push_back(switch_loop_input[kLoopCountParamName]);
  112. mute_inputs->push_back(switch_loop_input[kEpochParamName]);
  113. mute_inputs->push_back(switch_loop_input[kIterLoopParamName]);
  114. mute_inputs->push_back(switch_loop_input[kZeroParamName]);
  115. mute_inputs->push_back(switch_loop_input[kOneParamName]);
  116. for (const auto &input : kernel_graph_ptr->inputs()) {
  117. MS_EXCEPTION_IF_NULL(input);
  118. if (input->isa<Parameter>()) {
  119. ParameterPtr param_ptr = input->cast<ParameterPtr>();
  120. if (param_ptr == nullptr) {
  121. MS_EXCEPTION(NotSupportError) << "Cast to parameter point failed !";
  122. }
  123. }
  124. }
  125. const std::vector<CNodePtr> &orders = kernel_graph_ptr->execution_order();
  126. if (orders.empty()) {
  127. MS_LOG(EXCEPTION) << "graph execution order is empty";
  128. }
  129. std::vector<CNodePtr> exec_order;
  130. std::vector<uint32_t> getnext_active_streams;
  131. std::vector<uint32_t> fpbp_active_streams;
  132. CNodePtr getnext_cnode;
  133. uint32_t eos_done_event_id = UINT32_MAX;
  134. // getnext loop process
  135. // getnext loop stream switch op
  136. CNodePtr getnext_switch_app = CreateStreamSwitchOp(kernel_graph_ptr, switch_loop_input);
  137. MS_EXCEPTION_IF_NULL(getnext_switch_app);
  138. uint32_t getnext_switch_stream_id = resource_manager.ApplyNewStream();
  139. AnfAlgo::SetStreamId(getnext_switch_stream_id, getnext_switch_app.get());
  140. exec_order.push_back(getnext_switch_app);
  141. // getnext op
  142. uint32_t getnext_stream_id = resource_manager.ApplyNewStream();
  143. size_t i = 0;
  144. for (; i < orders.size(); i++) {
  145. auto node = orders[i];
  146. exec_order.push_back(node);
  147. AnfAlgo::SetStreamId(getnext_stream_id, exec_order[exec_order.size() - 1].get());
  148. if (AnfAlgo::GetCNodeName(node) == kGetNextOpName) {
  149. getnext_cnode = node;
  150. break;
  151. }
  152. }
  153. // update getnext loop stream switch true_branch_stream attr
  154. AnfAlgo::SetNodeAttr(kAttrTrueBranchStream, MakeValue<uint32_t>(getnext_stream_id), getnext_switch_app);
  155. // getnext loop fpbp start send
  156. uint32_t fpbp_start_event_id = resource_manager.ApplyNewEvent();
  157. CNodePtr fpbp_start_send = CreateSendApplyKernel(kernel_graph_ptr, fpbp_start_event_id);
  158. AnfAlgo::SetStreamId(getnext_stream_id, fpbp_start_send.get());
  159. exec_order.push_back(fpbp_start_send);
  160. if (eos_mode) {
  161. // getnext loop eos start send
  162. uint32_t eos_start_event_id = resource_manager.ApplyNewEvent();
  163. CNodePtr eos_start_send = CreateSendApplyKernel(kernel_graph_ptr, eos_start_event_id);
  164. AnfAlgo::SetStreamId(getnext_stream_id, eos_start_send.get());
  165. exec_order.push_back(eos_start_send);
  166. // End Of Sequence loop process
  167. // eos loop stream switch
  168. CNodePtr eos_switch_app = CreateStreamSwitchOp(kernel_graph_ptr, switch_loop_input);
  169. MS_EXCEPTION_IF_NULL(eos_switch_app);
  170. uint32_t eos_switch_stream_id = resource_manager.ApplyNewStream();
  171. AnfAlgo::SetStreamId(eos_switch_stream_id, eos_switch_app.get());
  172. AnfAlgo::SetNodeAttr(kStreamNeedActivedFirst, MakeValue<bool>(true), eos_switch_app);
  173. exec_order.push_back(eos_switch_app);
  174. // eos loop eos start recv
  175. CNodePtr eos_start_recv = CreateRecvApplyKernel(kernel_graph_ptr, eos_start_event_id);
  176. uint32_t eos_stream_id = resource_manager.ApplyNewStream();
  177. AnfAlgo::SetStreamId(eos_stream_id, eos_start_recv.get());
  178. exec_order.push_back(eos_start_recv);
  179. // update eos loop stream switch true_branch_stream attr
  180. AnfAlgo::SetNodeAttr(kAttrTrueBranchStream, MakeValue<uint32_t>(eos_stream_id), eos_switch_app);
  181. // EndOfSequence op
  182. CNodePtr end_of_sequence_op = CreateEndOfSequenceOP(kernel_graph_ptr, getnext_cnode);
  183. MS_EXCEPTION_IF_NULL(end_of_sequence_op);
  184. AnfAlgo::SetStreamId(eos_stream_id, end_of_sequence_op.get());
  185. exec_order.push_back(end_of_sequence_op);
  186. // eos loop eos done send
  187. eos_done_event_id = resource_manager.ApplyNewEvent();
  188. CNodePtr eos_done_send = CreateSendApplyKernel(kernel_graph_ptr, eos_done_event_id);
  189. AnfAlgo::SetStreamId(eos_stream_id, eos_done_send.get());
  190. exec_order.push_back(eos_done_send);
  191. // eos loop stream active
  192. fpbp_active_streams.push_back(eos_switch_stream_id);
  193. }
  194. // fpbp loop process
  195. // fpbp loop stream switch
  196. CNodePtr fpbp_switch_app = CreateStreamSwitchOp(kernel_graph_ptr, switch_loop_input);
  197. MS_EXCEPTION_IF_NULL(fpbp_switch_app);
  198. uint32_t fpbp_switch_stream_id = resource_manager.ApplyNewStream();
  199. AnfAlgo::SetStreamId(fpbp_switch_stream_id, fpbp_switch_app.get());
  200. AnfAlgo::SetNodeAttr(kStreamNeedActivedFirst, MakeValue<bool>(true), fpbp_switch_app);
  201. exec_order.push_back(fpbp_switch_app);
  202. // fpbp loop fpbp start recv
  203. CNodePtr fpbp_start_recv = CreateRecvApplyKernel(kernel_graph_ptr, fpbp_start_event_id);
  204. uint32_t fpbp_stream_id = resource_manager.ApplyNewStream();
  205. AnfAlgo::SetStreamId(fpbp_stream_id, fpbp_start_recv.get());
  206. exec_order.push_back(fpbp_start_recv);
  207. // update fpbp loop stream switch true_branch_stream attr
  208. AnfAlgo::SetNodeAttr(kAttrTrueBranchStream, MakeValue<uint32_t>(fpbp_stream_id), fpbp_switch_app);
  209. // fpbp loop AssignAdd
  210. CNodePtr assign_add_one = CreateStreamAssignAddnOP(kernel_graph_ptr, switch_loop_input);
  211. MS_EXCEPTION_IF_NULL(assign_add_one);
  212. AnfAlgo::SetStreamId(fpbp_stream_id, assign_add_one.get());
  213. exec_order.push_back(assign_add_one);
  214. // fpbp memcpy
  215. std::vector<CNodePtr> memcpy_list;
  216. std::vector<CNodePtr> other_list;
  217. CNodePtr cur_cnode = nullptr;
  218. for (size_t idx = i + 1; idx < orders.size(); idx++) {
  219. cur_cnode = orders[idx];
  220. if (AnfAlgo::HasNodeAttr(kAttrLabelForInsertStreamActive, cur_cnode)) {
  221. memcpy_list.emplace_back(cur_cnode);
  222. } else {
  223. other_list.emplace_back(cur_cnode);
  224. }
  225. }
  226. (void)std::copy(memcpy_list.begin(), memcpy_list.end(), std::back_inserter(exec_order));
  227. // fpbp loop eos done recv
  228. if (eos_mode) {
  229. CNodePtr eos_done_recv = CreateRecvApplyKernel(kernel_graph_ptr, eos_done_event_id);
  230. AnfAlgo::SetStreamId(fpbp_stream_id, eos_done_recv.get());
  231. exec_order.push_back(eos_done_recv);
  232. }
  233. // stream active to activate getnext loop
  234. CNodePtr getnext_active_app = CreateStreamActiveOp(kernel_graph_ptr);
  235. MS_EXCEPTION_IF_NULL(getnext_active_app);
  236. getnext_active_streams.push_back(getnext_switch_stream_id);
  237. AnfAlgo::SetNodeAttr(kAttrActiveStreamList, MakeValue<std::vector<uint32_t>>(getnext_active_streams),
  238. getnext_active_app);
  239. exec_order.push_back(getnext_active_app);
  240. // fpbp loop other ops
  241. (void)std::copy(other_list.begin(), other_list.end(), std::back_inserter(exec_order));
  242. // stream active to activate fpbp loop and eos loop
  243. CNodePtr fpbp_active_app = CreateStreamActiveOp(kernel_graph_ptr);
  244. MS_EXCEPTION_IF_NULL(fpbp_active_app);
  245. fpbp_active_streams.push_back(fpbp_switch_stream_id);
  246. AnfAlgo::SetNodeAttr(kAttrActiveStreamList, MakeValue<std::vector<uint32_t>>(fpbp_active_streams), fpbp_active_app);
  247. exec_order.push_back(fpbp_active_app);
  248. kernel_graph_ptr->set_execution_order(exec_order);
  249. }
  250. void KernelAdjust::CreateSwitchOpParameters(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  251. std::map<std::string, mindspore::ParameterPtr> *switch_loop_input) {
  252. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  253. MS_EXCEPTION_IF_NULL(switch_loop_input);
  254. std::vector<int> shp = {1};
  255. tensor::TensorPtr tensor_ptr = std::make_shared<tensor::Tensor>(kInt32->type_id(), shp);
  256. MS_EXCEPTION_IF_NULL(tensor_ptr);
  257. mindspore::abstract::AbstractBasePtr paremeter_abstract_ptr = tensor_ptr->ToAbstract();
  258. if (paremeter_abstract_ptr == nullptr) {
  259. MS_LOG(EXCEPTION) << "create abstract before insert switch op failed!";
  260. }
  261. ParameterPtr loop_count = std::make_shared<Parameter>(kernel_graph_ptr);
  262. MS_EXCEPTION_IF_NULL(loop_count);
  263. loop_count->set_name(kLoopCountParamName);
  264. loop_count->set_abstract(paremeter_abstract_ptr);
  265. ParameterPtr loop_count_new = kernel_graph_ptr->NewParameter(loop_count);
  266. (*switch_loop_input)[kLoopCountParamName] = loop_count_new;
  267. ParameterPtr iter_loop = std::make_shared<Parameter>(kernel_graph_ptr);
  268. iter_loop->set_name(kIterLoopParamName);
  269. iter_loop->set_abstract(paremeter_abstract_ptr);
  270. ParameterPtr iter_loop_new = kernel_graph_ptr->NewParameter(iter_loop);
  271. (*switch_loop_input)[kIterLoopParamName] = iter_loop_new;
  272. ParameterPtr zero = std::make_shared<Parameter>(kernel_graph_ptr);
  273. zero->set_name(kZeroParamName);
  274. zero->set_abstract(paremeter_abstract_ptr);
  275. ParameterPtr zero_new = kernel_graph_ptr->NewParameter(zero);
  276. (*switch_loop_input)[kZeroParamName] = zero_new;
  277. ParameterPtr one = std::make_shared<Parameter>(kernel_graph_ptr);
  278. one->set_name(kOneParamName);
  279. one->set_abstract(paremeter_abstract_ptr);
  280. ParameterPtr one_new = kernel_graph_ptr->NewParameter(one);
  281. (*switch_loop_input)[kOneParamName] = one_new;
  282. ParameterPtr epoch = std::make_shared<Parameter>(kernel_graph_ptr);
  283. MS_EXCEPTION_IF_NULL(epoch);
  284. epoch->set_name(kEpochParamName);
  285. epoch->set_abstract(paremeter_abstract_ptr);
  286. ParameterPtr epoch_new = kernel_graph_ptr->NewParameter(epoch);
  287. (*switch_loop_input)[kEpochParamName] = epoch_new;
  288. }
  289. kernel::KernelBuildInfo::KernelBuildInfoBuilder KernelAdjust::CreateMngKernelBuilder(
  290. const std::vector<std::string> &formats, const std::vector<TypeId> &type_ids) {
  291. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder;
  292. selected_kernel_builder.SetInputsFormat(formats);
  293. selected_kernel_builder.SetInputsDeviceType(type_ids);
  294. selected_kernel_builder.SetFusionType(kernel::FusionType::OPAQUE);
  295. selected_kernel_builder.SetProcessor(kernel::Processor::AICORE);
  296. selected_kernel_builder.SetKernelType(KernelType::RT_KERNEL);
  297. return selected_kernel_builder;
  298. }
  299. CNodePtr KernelAdjust::CreateStreamSwitchOp(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  300. const std::map<std::string, mindspore::ParameterPtr> &switch_loop_input) {
  301. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder = CreateMngKernelBuilder(
  302. {kOpFormat_DEFAULT, kOpFormat_DEFAULT}, {TypeId::kNumberTypeInt32, TypeId::kNumberTypeInt32});
  303. auto typeNone_abstract = std::make_shared<abstract::AbstractNone>();
  304. auto stream_switch = std::make_shared<Primitive>(kStreamSwitchOpName);
  305. std::vector<AnfNodePtr> inputs;
  306. inputs.push_back(NewValueNode(stream_switch));
  307. inputs.push_back(switch_loop_input.at(kLoopCountParamName));
  308. inputs.push_back(switch_loop_input.at(kIterLoopParamName));
  309. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  310. CNodePtr stream_switch_app = kernel_graph_ptr->NewCNode(inputs);
  311. MS_EXCEPTION_IF_NULL(stream_switch_app);
  312. AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), stream_switch_app.get());
  313. stream_switch_app->set_abstract(typeNone_abstract);
  314. // set attr: cond_ RT_LESS
  315. int condition = static_cast<int>(RT_LESS);
  316. ValuePtr cond = MakeValue(condition);
  317. AnfAlgo::SetNodeAttr(kAttrSwitchCondition, cond, stream_switch_app);
  318. // set attr:data_type
  319. int data_type = static_cast<int>(RT_SWITCH_INT64);
  320. ValuePtr dt = MakeValue(data_type);
  321. AnfAlgo::SetNodeAttr(kAttrDataType, dt, stream_switch_app);
  322. // set distinction label and graph id
  323. return stream_switch_app;
  324. }
  325. CNodePtr KernelAdjust::CreateStreamActiveOp(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) {
  326. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder = CreateMngKernelBuilder(
  327. {kOpFormat_DEFAULT, kOpFormat_DEFAULT}, {TypeId::kNumberTypeInt32, TypeId::kNumberTypeInt32});
  328. abstract::AbstractBasePtr typeNone_abstract = std::make_shared<abstract::AbstractNone>();
  329. auto stream_active_others = std::make_shared<Primitive>(kStreamActiveOpName);
  330. std::vector<AnfNodePtr> inputs;
  331. inputs.push_back(NewValueNode(stream_active_others));
  332. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  333. CNodePtr stream_active_others_app = kernel_graph_ptr->NewCNode(inputs);
  334. MS_EXCEPTION_IF_NULL(stream_active_others_app);
  335. AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), stream_active_others_app.get());
  336. stream_active_others_app->set_abstract(typeNone_abstract);
  337. return stream_active_others_app;
  338. }
  339. CNodePtr KernelAdjust::CreatTupleGetItemNode(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  340. const CNodePtr &node, size_t output_idx) {
  341. auto idx = NewValueNode(SizeToInt(output_idx));
  342. MS_EXCEPTION_IF_NULL(idx);
  343. auto imm = std::make_shared<Int32Imm>(SizeToInt(output_idx));
  344. auto abstract_scalar = std::make_shared<abstract::AbstractScalar>(imm);
  345. idx->set_abstract(abstract_scalar);
  346. CNodePtr tuple_getitem = kernel_graph_ptr->NewCNode({NewValueNode(prim::kPrimTupleGetItem), node, idx});
  347. MS_EXCEPTION_IF_NULL(tuple_getitem);
  348. tuple_getitem->set_scope(node->scope());
  349. std::vector<size_t> origin_shape = AnfAlgo::GetOutputInferShape(node, output_idx);
  350. TypeId origin_type = AnfAlgo::GetOutputInferDataType(node, output_idx);
  351. AnfAlgo::SetOutputInferTypeAndShape({origin_type}, {origin_shape}, tuple_getitem.get());
  352. return tuple_getitem;
  353. }
  354. CNodePtr KernelAdjust::CreateEndOfSequenceOP(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  355. const CNodePtr &getnext_cnode) {
  356. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  357. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder;
  358. selected_kernel_builder.SetInputsFormat({kOpFormat_DEFAULT});
  359. selected_kernel_builder.SetInputsDeviceType({kNumberTypeUInt8});
  360. selected_kernel_builder.SetFusionType(kernel::FusionType::OPAQUE);
  361. selected_kernel_builder.SetProcessor(kernel::Processor::AICPU);
  362. selected_kernel_builder.SetKernelType(KernelType::AICPU_KERNEL);
  363. selected_kernel_builder.SetOutputsFormat({kOpFormat_DEFAULT});
  364. selected_kernel_builder.SetOutputsDeviceType({kNumberTypeUInt8});
  365. // EndOfSequence
  366. auto end_of_sequence = std::make_shared<Primitive>(kEndOfSequence);
  367. std::vector<AnfNodePtr> inputs;
  368. inputs.push_back(NewValueNode(end_of_sequence));
  369. // GetNext output 0 is EndOfSequence's input
  370. auto tuple_get_item = CreatTupleGetItemNode(kernel_graph_ptr, getnext_cnode, 0);
  371. inputs.push_back(tuple_get_item);
  372. CNodePtr end_of_sequence_node = kernel_graph_ptr->NewCNode(inputs);
  373. MS_EXCEPTION_IF_NULL(end_of_sequence_node);
  374. AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), end_of_sequence_node.get());
  375. std::vector<std::string> input_names = {"x"};
  376. ValuePtr input_names_v = MakeValue(input_names);
  377. AnfAlgo::SetNodeAttr("input_names", input_names_v, end_of_sequence_node);
  378. std::vector<std::string> output_names = {"y"};
  379. ValuePtr output_names_v = MakeValue(output_names);
  380. AnfAlgo::SetNodeAttr("output_names", output_names_v, end_of_sequence_node);
  381. end_of_sequence_node->set_abstract(tuple_get_item->abstract());
  382. return end_of_sequence_node;
  383. }
  384. CNodePtr KernelAdjust::CreateStreamAssignAddnOP(
  385. const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  386. const std::map<std::string, mindspore::ParameterPtr> &switch_loop_input) {
  387. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  388. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder = CreateMngKernelBuilder(
  389. {kOpFormat_DEFAULT, kOpFormat_DEFAULT}, {TypeId::kNumberTypeInt32, TypeId::kNumberTypeInt32});
  390. selected_kernel_builder.SetOutputsFormat({kOpFormat_DEFAULT});
  391. selected_kernel_builder.SetOutputsDeviceType({kNumberTypeInt32});
  392. // AssignAdd
  393. auto assign_add = std::make_shared<Primitive>(kAssignAddOpName);
  394. std::vector<AnfNodePtr> inputs;
  395. inputs.push_back(NewValueNode(assign_add));
  396. inputs.push_back(switch_loop_input.at(kLoopCountParamName));
  397. inputs.push_back(switch_loop_input.at(kOneParamName));
  398. CNodePtr assign_add_one = kernel_graph_ptr->NewCNode(inputs);
  399. MS_EXCEPTION_IF_NULL(assign_add_one);
  400. AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), assign_add_one.get());
  401. std::vector<std::string> input_names = {"ref", "value"};
  402. std::vector<std::string> output_names = {"output"};
  403. ValuePtr input_names_v = MakeValue(input_names);
  404. ValuePtr output_names_v = MakeValue(output_names);
  405. AnfAlgo::SetNodeAttr("input_names", input_names_v, assign_add_one);
  406. AnfAlgo::SetNodeAttr("output_names", output_names_v, assign_add_one);
  407. selected_kernel_builder.SetKernelType(KernelType::TBE_KERNEL);
  408. MS_EXCEPTION_IF_NULL(switch_loop_input.at(kLoopCountParamName));
  409. assign_add_one->set_abstract(switch_loop_input.at(kLoopCountParamName)->abstract());
  410. return assign_add_one;
  411. }
  412. bool KernelAdjust::StepLoadCtrlInputs(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) {
  413. if (!NeedInsertSwitch()) {
  414. return true;
  415. }
  416. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  417. auto input_nodes = kernel_graph_ptr->inputs();
  418. std::vector<tensor::TensorPtr> inputs;
  419. LoadSwitchInputs(&inputs);
  420. std::shared_ptr<std::vector<tensor::TensorPtr>> inputsPtr = std::make_shared<std::vector<tensor::TensorPtr>>(inputs);
  421. kernel_graph_ptr->set_input_ctrl_tensors(inputsPtr);
  422. size_t input_ctrl_size = inputs.size();
  423. // inputs_node:include four ctrl nodes in the back. such as:conv,loop_cnt, ites_loop, zero, one.
  424. // deal four ctrl nodes.
  425. for (size_t i = 0; i < inputs.size(); ++i) {
  426. auto tensor = inputs[i];
  427. size_t deal_index = input_nodes.size() - input_ctrl_size + i;
  428. if (deal_index >= input_nodes.size()) {
  429. MS_LOG(EXCEPTION) << "deal_index[" << deal_index << "] out of range";
  430. }
  431. auto input_node = input_nodes[deal_index];
  432. bool need_sync = false;
  433. MS_EXCEPTION_IF_NULL(input_node);
  434. if (input_node->isa<Parameter>()) {
  435. auto pk_node = input_node->cast<ParameterPtr>();
  436. MS_EXCEPTION_IF_NULL(tensor);
  437. MS_EXCEPTION_IF_NULL(pk_node);
  438. if (tensor->is_dirty() || !pk_node->has_default()) {
  439. need_sync = true;
  440. }
  441. }
  442. if (need_sync) {
  443. auto pk_node = input_node->cast<ParameterPtr>();
  444. MS_EXCEPTION_IF_NULL(pk_node);
  445. auto device_address = AnfAlgo::GetMutableOutputAddr(pk_node, 0);
  446. MS_EXCEPTION_IF_NULL(device_address);
  447. tensor->set_device_address(device_address);
  448. if (!device_address->SyncHostToDevice(trans::GetRuntimePaddingShape(pk_node, 0),
  449. LongToSize(tensor->data().nbytes()), tensor->data_type(),
  450. tensor->data_c())) {
  451. MS_LOG(INFO) << "SyncHostToDevice failed.";
  452. return false;
  453. }
  454. }
  455. tensor->set_dirty(false);
  456. }
  457. return true;
  458. }
  459. void KernelAdjust::LoadSwitchInputs(std::vector<tensor::TensorPtr> *inputs) {
  460. MS_LOG(INFO) << "---------------- LoadSwitchInputs---";
  461. MS_EXCEPTION_IF_NULL(inputs);
  462. std::vector<int> shp = {1};
  463. tensor::TensorPtr loop_count_tensor = std::make_shared<tensor::Tensor>(kInt32->type_id(), shp);
  464. MS_EXCEPTION_IF_NULL(loop_count_tensor);
  465. int32_t *val = nullptr;
  466. val = static_cast<int32_t *>(loop_count_tensor->data_c());
  467. MS_EXCEPTION_IF_NULL(val);
  468. *val = 0;
  469. inputs->push_back(loop_count_tensor);
  470. // Epoch in device
  471. tensor::TensorPtr epoch_tensor = std::make_shared<tensor::Tensor>(kInt32->type_id(), shp);
  472. MS_EXCEPTION_IF_NULL(epoch_tensor);
  473. val = static_cast<int32_t *>(epoch_tensor->data_c());
  474. MS_EXCEPTION_IF_NULL(val);
  475. *val = 0;
  476. inputs->push_back(epoch_tensor);
  477. tensor::TensorPtr iter_loop_tensor = std::make_shared<tensor::Tensor>(kInt32->type_id(), shp);
  478. MS_EXCEPTION_IF_NULL(iter_loop_tensor);
  479. val = static_cast<int32_t *>(iter_loop_tensor->data_c());
  480. MS_EXCEPTION_IF_NULL(val);
  481. *val = SizeToInt(LongToSize(ConfigManager::GetInstance().iter_num()));
  482. MS_LOG(INFO) << "iter_loop_tensor = " << *val;
  483. inputs->push_back(iter_loop_tensor);
  484. tensor::TensorPtr zero_tensor = std::make_shared<tensor::Tensor>(kInt32->type_id(), shp);
  485. MS_EXCEPTION_IF_NULL(zero_tensor);
  486. val = static_cast<int32_t *>(zero_tensor->data_c());
  487. MS_EXCEPTION_IF_NULL(val);
  488. *val = 0;
  489. inputs->push_back(zero_tensor);
  490. tensor::TensorPtr one_tensor = std::make_shared<tensor::Tensor>(kInt32->type_id(), shp);
  491. MS_EXCEPTION_IF_NULL(one_tensor);
  492. val = static_cast<int32_t *>(one_tensor->data_c());
  493. MS_EXCEPTION_IF_NULL(val);
  494. *val = 1;
  495. inputs->push_back(one_tensor);
  496. MS_LOG(INFO) << "---------------- LoadSwitchInputs End--";
  497. }
  498. void KernelAdjust::Profiling(NotNull<session::KernelGraph *> kernel_graph_ptr) {
  499. if (!ascend::ProfilingManager::GetInstance().IsProfiling()) {
  500. MS_LOG(INFO) << "No need to profiling";
  501. return;
  502. }
  503. ProfilingTraceInfo profiling_trace_info = ProfilingUtils::GetProfilingTraceFromEnv(kernel_graph_ptr);
  504. if (!profiling_trace_info.IsValid()) {
  505. MS_LOG(WARNING) << "[profiling] no profiling node found!";
  506. return;
  507. }
  508. InsertProfilingKernel(profiling_trace_info, kernel_graph_ptr);
  509. }
  510. void KernelAdjust::InsertProfilingKernel(const ProfilingTraceInfo &profiling_trace_info,
  511. NotNull<session::KernelGraph *> kernel_graph_ptr) {
  512. MS_LOG(INFO) << "[profiling] Insert profiling kernel start";
  513. if (!profiling_trace_info.IsValid()) {
  514. MS_LOG(WARNING) << "Profiling trace point not found";
  515. return;
  516. }
  517. std::vector<CNodePtr> new_cnode_list;
  518. std::vector<CNodePtr> cnode_ptr_list = kernel_graph_ptr->execution_order();
  519. if (cnode_ptr_list.empty()) {
  520. MS_LOG(ERROR) << "No CNode in graph";
  521. return;
  522. }
  523. for (const auto &cnode_ptr : cnode_ptr_list) {
  524. ProfilingUtils::ProfilingTraceFpStart(cnode_ptr, profiling_trace_info, kernel_graph_ptr, NOT_NULL(&new_cnode_list));
  525. new_cnode_list.emplace_back(cnode_ptr);
  526. ProfilingUtils::ProfilingCustomOp(cnode_ptr, profiling_trace_info, kernel_graph_ptr, NOT_NULL(&new_cnode_list));
  527. ProfilingUtils::ProfilingTraceBpEnd(cnode_ptr, profiling_trace_info, kernel_graph_ptr, NOT_NULL(&new_cnode_list));
  528. ProfilingUtils::ProfilingTraceEnd(cnode_ptr, profiling_trace_info, kernel_graph_ptr, NOT_NULL(&new_cnode_list));
  529. }
  530. kernel_graph_ptr->set_execution_order(new_cnode_list);
  531. }
  532. } // namespace device
  533. } // namespace mindspore