You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

kernel_adjust.cc 23 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "device/kernel_adjust.h"
  17. #include <map>
  18. #include <algorithm>
  19. #include <string>
  20. #include <unordered_set>
  21. #include <unordered_map>
  22. #include <vector>
  23. #include "session/anf_runtime_algorithm.h"
  24. #include "utils/context/ms_context.h"
  25. #include "common/trans.h"
  26. #include "utils/config_manager.h"
  27. #include "common/utils.h"
  28. #include "kernel/kernel_build_info.h"
  29. #include "utils/utils.h"
  30. #include "device/ascend/profiling/profiling_manager.h"
  31. #include "device/ascend/kernel_select_ascend.h"
  32. #include "runtime/base.h"
  33. #include "device/ascend/ascend_stream_assign.h"
  34. namespace mindspore {
  35. namespace device {
  36. using device::ascend::ProfilingUtils;
  37. void KernelAdjust::Reorder(const std::shared_ptr<session::KernelGraph> &kernel_graph) {
  38. MS_EXCEPTION_IF_NULL(kernel_graph);
  39. const std::vector<CNodePtr> &origin_cnode_list = kernel_graph->execution_order();
  40. std::vector<CNodePtr> momentum_list;
  41. std::vector<CNodePtr> other_list;
  42. for (const auto &cnode : origin_cnode_list) {
  43. if (kOptOperatorSet.find(AnfAlgo::GetCNodeName(cnode)) != kOptOperatorSet.end()) {
  44. momentum_list.emplace_back(cnode);
  45. } else {
  46. other_list.emplace_back(cnode);
  47. }
  48. }
  49. std::vector<CNodePtr> new_order_list;
  50. new_order_list.insert(new_order_list.end(), other_list.begin(), other_list.end());
  51. new_order_list.insert(new_order_list.end(), momentum_list.begin(), momentum_list.end());
  52. kernel_graph->set_execution_order(new_order_list);
  53. }
  54. void KernelAdjust::ReorderGetNext(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) {
  55. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  56. const std::vector<CNodePtr> &origin_cnode_list = kernel_graph_ptr->execution_order();
  57. std::vector<CNodePtr> getnext_list;
  58. std::vector<CNodePtr> other_list;
  59. for (const auto &cnode : origin_cnode_list) {
  60. if (AnfAlgo::GetCNodeName(cnode) == kGetNextOpName) {
  61. getnext_list.emplace_back(cnode);
  62. } else {
  63. other_list.emplace_back(cnode);
  64. }
  65. }
  66. std::vector<CNodePtr> new_order_list;
  67. new_order_list.insert(new_order_list.end(), getnext_list.begin(), getnext_list.end());
  68. new_order_list.insert(new_order_list.end(), other_list.begin(), other_list.end());
  69. kernel_graph_ptr->set_execution_order(new_order_list);
  70. }
  71. bool KernelAdjust::NeedInsertSwitch() {
  72. auto context_ptr = MsContext::GetInstance();
  73. MS_EXCEPTION_IF_NULL(context_ptr);
  74. return (context_ptr->enable_task_sink() && context_ptr->loop_sink_flag() &&
  75. ConfigManager::GetInstance().iter_num() > 1);
  76. }
  77. uint32_t KernelAdjust::FindFirstStreamSwitchLabel(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) {
  78. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  79. auto cnode_ptr_list = kernel_graph_ptr->execution_order();
  80. CNodePtr cur_cnode_ptr = nullptr;
  81. uint32_t label = kInvalidDistincLabel;
  82. for (uint32_t i = 0; i < cnode_ptr_list.size(); ++i) {
  83. cur_cnode_ptr = cnode_ptr_list[i];
  84. MS_EXCEPTION_IF_NULL(cur_cnode_ptr);
  85. if (AnfAlgo::GetCNodeName(cur_cnode_ptr) == kStreamSwitchOpName) {
  86. label = AnfAlgo::GetStreamDistinctionLabel(cur_cnode_ptr.get());
  87. break;
  88. }
  89. }
  90. return label;
  91. }
  92. CNodePtr KernelAdjust::CreateSendApplyKernel(const std::shared_ptr<session::KernelGraph> &graph_ptr,
  93. uint32_t event_id) {
  94. MS_EXCEPTION_IF_NULL(graph_ptr);
  95. auto send_op = std::make_shared<Primitive>(kSendOpName);
  96. MS_EXCEPTION_IF_NULL(send_op);
  97. auto send_apply = std::make_shared<ValueNode>(send_op);
  98. MS_EXCEPTION_IF_NULL(send_apply);
  99. std::vector<AnfNodePtr> send_input_list = {send_apply};
  100. CNodePtr send_node_ptr = graph_ptr->NewCNode(send_input_list);
  101. MS_EXCEPTION_IF_NULL(send_node_ptr);
  102. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder;
  103. selected_kernel_builder.SetKernelType(KernelType::RT_KERNEL);
  104. AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), send_node_ptr.get());
  105. AnfAlgo::SetNodeAttr(kAttrEventId, MakeValue(event_id), send_node_ptr);
  106. auto abstract_none = std::make_shared<abstract::AbstractNone>();
  107. MS_EXCEPTION_IF_NULL(abstract_none);
  108. send_node_ptr->set_abstract(abstract_none);
  109. return send_node_ptr;
  110. }
  111. CNodePtr KernelAdjust::CreateRecvApplyKernel(const std::shared_ptr<session::KernelGraph> &graph_ptr,
  112. uint32_t event_id) {
  113. MS_EXCEPTION_IF_NULL(graph_ptr);
  114. auto recv_op = std::make_shared<Primitive>(kRecvOpName);
  115. MS_EXCEPTION_IF_NULL(recv_op);
  116. auto recv_apply = std::make_shared<ValueNode>(recv_op);
  117. MS_EXCEPTION_IF_NULL(recv_apply);
  118. std::vector<AnfNodePtr> recv_input_list = {recv_apply};
  119. CNodePtr recv_node_ptr = graph_ptr->NewCNode(recv_input_list);
  120. MS_EXCEPTION_IF_NULL(recv_node_ptr);
  121. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder;
  122. selected_kernel_builder.SetKernelType(KernelType::RT_KERNEL);
  123. AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), recv_node_ptr.get());
  124. AnfAlgo::SetNodeAttr(kAttrEventId, MakeValue(event_id), recv_node_ptr);
  125. auto abstract_none = std::make_shared<abstract::AbstractNone>();
  126. MS_EXCEPTION_IF_NULL(abstract_none);
  127. recv_node_ptr->set_abstract(abstract_none);
  128. return recv_node_ptr;
  129. }
// Rewrite the graph's execution order so that data-sink iterations loop on
// device: two StreamSwitch ops guard a GetNext section and the main body, an
// AssignAdd increments the loop counter, and StreamActive ops re-arm the
// switches each iteration. No-op unless NeedInsertSwitch() holds (task sink +
// loop sink + iter_num > 1). The statement order below is significant: stream
// distinction labels and attrs must be set before the nodes enter exec_order.
void KernelAdjust::InsertSwitchLoop(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) {
  if (!NeedInsertSwitch()) {
    return;
  }
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  // Put all GetNext kernels at the front so the loop head is well defined.
  ReorderGetNext(kernel_graph_ptr);
  // Create the four loop-control parameters and append them to graph inputs:
  // loop_count, iter_loop, zero, one (order must match LoadSwitchInputs).
  std::map<std::string, mindspore::ParameterPtr> switch_loop_input;
  CreateSwitchOpParameters(kernel_graph_ptr, &switch_loop_input);
  std::vector<AnfNodePtr> *mute_inputs = kernel_graph_ptr->MutableInputs();
  MS_EXCEPTION_IF_NULL(mute_inputs);
  mute_inputs->push_back(switch_loop_input[kLoopCountParamName]);
  mute_inputs->push_back(switch_loop_input[kIterLoopParamName]);
  mute_inputs->push_back(switch_loop_input[kZeroParamName]);
  mute_inputs->push_back(switch_loop_input[kOneParamName]);
  // Sanity check: every Parameter-kind input must actually cast to Parameter.
  for (const auto &input : kernel_graph_ptr->inputs()) {
    MS_EXCEPTION_IF_NULL(input);
    if (input->isa<Parameter>()) {
      ParameterPtr param_ptr = input->cast<ParameterPtr>();
      if (param_ptr == nullptr) {
        MS_EXCEPTION(NotSupportError) << "Cast to parameter point failed !";
      }
    }
  }
  auto orders = kernel_graph_ptr->execution_order();
  if (orders.empty()) {
    MS_LOG(EXCEPTION) << "graph execution order is empty";
  }
  // Label of the first kernel — reused as the label of the main-body stream.
  uint32_t first_cnode_stream_label = AnfAlgo::GetStreamDistinctionLabel(orders[0].get());
  std::vector<CNodePtr> exec_order;
  // First switch: while loop_count < iter_loop, take the GetNext stream.
  CNodePtr first_stream_switch_app = CreateStreamSwitchOp(kernel_graph_ptr, switch_loop_input);
  MS_EXCEPTION_IF_NULL(first_stream_switch_app);
  AnfAlgo::SetStreamDistinctionLabel(kFirstStreamSwitchLabel, first_stream_switch_app.get());
  AnfAlgo::SetNodeAttr(kAttrTrueBranchStream, MakeValue<uint32_t>(kGetNextLabel), first_stream_switch_app);
  // Second switch: same condition, guards the main-body stream.
  CNodePtr second_stream_switch_app = CreateStreamSwitchOp(kernel_graph_ptr, switch_loop_input);
  MS_EXCEPTION_IF_NULL(second_stream_switch_app);
  AnfAlgo::SetStreamDistinctionLabel(kSecondStreamSwitchLabel, second_stream_switch_app.get());
  AnfAlgo::SetNodeAttr(kAttrTrueBranchStream, MakeValue<uint32_t>(first_cnode_stream_label), second_stream_switch_app);
  // add attr "stream_need_active"
  AnfAlgo::SetNodeAttr(kStreamNeedActivedFirst, MakeValue<bool>(true), second_stream_switch_app);
  // First StreamActive re-arms the first switch; it runs on the body stream.
  CNodePtr first_stream_active_app = CreateStreamActiveOp(kernel_graph_ptr);
  MS_EXCEPTION_IF_NULL(first_stream_active_app);
  AnfAlgo::SetStreamDistinctionLabel(first_cnode_stream_label, first_stream_active_app.get());
  std::vector<uint32_t> first_active_streams = {kFirstStreamSwitchLabel};
  AnfAlgo::SetNodeAttr(kAttrActiveStreamList, MakeValue<std::vector<uint32_t>>(first_active_streams),
                       first_stream_active_app);
  // Second StreamActive re-arms the second switch.
  CNodePtr second_stream_active_app = CreateStreamActiveOp(kernel_graph_ptr);
  MS_EXCEPTION_IF_NULL(second_stream_active_app);
  // specific deal for common ctrl stream policy
  // If the graph already contains a StreamSwitch, place this active op on
  // that switch's label; otherwise fall back to the body-stream label.
  uint32_t first_common_stream_switch_label = FindFirstStreamSwitchLabel(kernel_graph_ptr);
  if (first_common_stream_switch_label == kInvalidDistincLabel) {
    AnfAlgo::SetStreamDistinctionLabel(first_cnode_stream_label, second_stream_active_app.get());
  } else {
    AnfAlgo::SetStreamDistinctionLabel(first_common_stream_switch_label, second_stream_active_app.get());
  }
  std::vector<uint32_t> second_active_streams = {kSecondStreamSwitchLabel};
  AnfAlgo::SetNodeAttr(kAttrActiveStreamList, MakeValue<std::vector<uint32_t>>(second_active_streams),
                       second_stream_active_app);
  // loop_count += one, on the main-body stream.
  CNodePtr assign_add_one = CreateStreamAssignAddnOP(kernel_graph_ptr, switch_loop_input);
  MS_EXCEPTION_IF_NULL(assign_add_one);
  AnfAlgo::SetStreamDistinctionLabel(first_cnode_stream_label, assign_add_one.get());
  // Send/Recv pair with the same event id: Send sits on the GetNext stream,
  // Recv on the body stream — presumably ordering the two streams via the
  // runtime event; confirm against the RT kernel implementation.
  CNodePtr send = CreateSendApplyKernel(kernel_graph_ptr, kFirstEventId);
  AnfAlgo::SetStreamDistinctionLabel(kGetNextLabel, send.get());
  CNodePtr recv = CreateRecvApplyKernel(kernel_graph_ptr, kFirstEventId);
  AnfAlgo::SetStreamDistinctionLabel(first_cnode_stream_label, recv.get());
  // reorder graph orders
  exec_order.push_back(first_stream_switch_app);
  // Copy kernels up to and including the first GetNext, relabelling each one
  // onto the GetNext stream.
  size_t i = 0;
  for (; i < orders.size(); i++) {
    auto node = orders[i];
    exec_order.push_back(node);
    AnfAlgo::SetStreamDistinctionLabel(kGetNextLabel, exec_order[exec_order.size() - 1].get());
    if (AnfAlgo::GetCNodeName(node) == kGetNextOpName) {
      break;
    }
  }
  exec_order.push_back(send);
  exec_order.push_back(second_stream_switch_app);
  exec_order.push_back(recv);
  exec_order.push_back(assign_add_one);
  // Split the remaining kernels around the nodes tagged with
  // kAttrLabelForInsertStreamActive. Final layout:
  //   [before] [tagged] first_active [after] second_active
  std::vector<CNodePtr> memcpy_list;
  std::vector<CNodePtr> before_list;
  std::vector<CNodePtr> after_list;
  bool first_memcpy_found = false;
  CNodePtr cur_cnode = nullptr;
  for (size_t idx = i + 1; idx < orders.size(); idx++) {
    cur_cnode = orders[idx];
    if (AnfAlgo::HasNodeAttr(kAttrLabelForInsertStreamActive, cur_cnode)) {
      memcpy_list.emplace_back(cur_cnode);
      first_memcpy_found = true;
    } else if (first_memcpy_found) {
      after_list.emplace_back(cur_cnode);
    } else {
      before_list.emplace_back(cur_cnode);
    }
  }
  (void)std::copy(before_list.begin(), before_list.end(), std::back_inserter(exec_order));
  (void)std::copy(memcpy_list.begin(), memcpy_list.end(), std::back_inserter(exec_order));
  exec_order.push_back(first_stream_active_app);
  (void)std::copy(after_list.begin(), after_list.end(), std::back_inserter(exec_order));
  exec_order.push_back(second_stream_active_app);
  kernel_graph_ptr->set_execution_order(exec_order);
}
  232. void KernelAdjust::CreateSwitchOpParameters(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  233. std::map<std::string, mindspore::ParameterPtr> *switch_loop_input) {
  234. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  235. MS_EXCEPTION_IF_NULL(switch_loop_input);
  236. std::vector<int> shp = {1};
  237. tensor::TensorPtr tensor_ptr = std::make_shared<tensor::Tensor>(kInt32->type_id(), shp);
  238. MS_EXCEPTION_IF_NULL(tensor_ptr);
  239. mindspore::abstract::AbstractBasePtr paremeter_abstract_ptr = tensor_ptr->ToAbstract();
  240. if (paremeter_abstract_ptr == nullptr) {
  241. MS_LOG(EXCEPTION) << "create abstract before insert switch op failed!";
  242. }
  243. ParameterPtr loop_count = std::make_shared<Parameter>(kernel_graph_ptr);
  244. MS_EXCEPTION_IF_NULL(loop_count);
  245. loop_count->set_name(kLoopCountParamName);
  246. loop_count->set_abstract(paremeter_abstract_ptr);
  247. ParameterPtr loop_count_new = kernel_graph_ptr->NewParameter(loop_count);
  248. (*switch_loop_input)[kLoopCountParamName] = loop_count_new;
  249. ParameterPtr iter_loop = std::make_shared<Parameter>(kernel_graph_ptr);
  250. iter_loop->set_name(kIterLoopParamName);
  251. iter_loop->set_abstract(paremeter_abstract_ptr);
  252. ParameterPtr iter_loop_new = kernel_graph_ptr->NewParameter(iter_loop);
  253. (*switch_loop_input)[kIterLoopParamName] = iter_loop_new;
  254. ParameterPtr zero = std::make_shared<Parameter>(kernel_graph_ptr);
  255. zero->set_name(kZeroParamName);
  256. zero->set_abstract(paremeter_abstract_ptr);
  257. ParameterPtr zero_new = kernel_graph_ptr->NewParameter(zero);
  258. (*switch_loop_input)[kZeroParamName] = zero_new;
  259. ParameterPtr one = std::make_shared<Parameter>(kernel_graph_ptr);
  260. one->set_name(kOneParamName);
  261. one->set_abstract(paremeter_abstract_ptr);
  262. ParameterPtr one_new = kernel_graph_ptr->NewParameter(one);
  263. (*switch_loop_input)[kOneParamName] = one_new;
  264. }
  265. kernel::KernelBuildInfo::KernelBuildInfoBuilder KernelAdjust::CreateMngKernelBuilder(
  266. const std::vector<std::string> &formats, const std::vector<TypeId> &type_ids) {
  267. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder;
  268. selected_kernel_builder.SetInputsFormat(formats);
  269. selected_kernel_builder.SetInputsDeviceType(type_ids);
  270. selected_kernel_builder.SetFusionType(kernel::FusionType::OPAQUE);
  271. selected_kernel_builder.SetProcessor(kernel::Processor::AICORE);
  272. selected_kernel_builder.SetKernelType(KernelType::RT_KERNEL);
  273. return selected_kernel_builder;
  274. }
  275. CNodePtr KernelAdjust::CreateStreamSwitchOp(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  276. const std::map<std::string, mindspore::ParameterPtr> &switch_loop_input) {
  277. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder = CreateMngKernelBuilder(
  278. {kOpFormat_DEFAULT, kOpFormat_DEFAULT}, {TypeId::kNumberTypeInt32, TypeId::kNumberTypeInt32});
  279. auto typeNone_abstract = std::make_shared<abstract::AbstractNone>();
  280. auto stream_switch = std::make_shared<Primitive>(kStreamSwitchOpName);
  281. std::vector<AnfNodePtr> inputs;
  282. inputs.push_back(NewValueNode(stream_switch));
  283. inputs.push_back(switch_loop_input.at(kLoopCountParamName));
  284. inputs.push_back(switch_loop_input.at(kIterLoopParamName));
  285. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  286. CNodePtr stream_switch_app = kernel_graph_ptr->NewCNode(inputs);
  287. MS_EXCEPTION_IF_NULL(stream_switch_app);
  288. AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), stream_switch_app.get());
  289. stream_switch_app->set_abstract(typeNone_abstract);
  290. // set attr: cond_ RT_LESS
  291. int condition = static_cast<int>(RT_LESS);
  292. ValuePtr cond = MakeValue(condition);
  293. AnfAlgo::SetNodeAttr(kAttrSwitchCondition, cond, stream_switch_app);
  294. // set attr:data_type
  295. int data_type = static_cast<int>(RT_SWITCH_INT64);
  296. ValuePtr dt = MakeValue(data_type);
  297. AnfAlgo::SetNodeAttr(kAttrDataType, dt, stream_switch_app);
  298. // set distinction label and graph id
  299. return stream_switch_app;
  300. }
  301. CNodePtr KernelAdjust::CreateStreamActiveOp(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) {
  302. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder = CreateMngKernelBuilder(
  303. {kOpFormat_DEFAULT, kOpFormat_DEFAULT}, {TypeId::kNumberTypeInt32, TypeId::kNumberTypeInt32});
  304. abstract::AbstractBasePtr typeNone_abstract = std::make_shared<abstract::AbstractNone>();
  305. auto stream_active_others = std::make_shared<Primitive>(kStreamActiveOpName);
  306. std::vector<AnfNodePtr> inputs;
  307. inputs.push_back(NewValueNode(stream_active_others));
  308. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  309. CNodePtr stream_active_others_app = kernel_graph_ptr->NewCNode(inputs);
  310. MS_EXCEPTION_IF_NULL(stream_active_others_app);
  311. AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), stream_active_others_app.get());
  312. stream_active_others_app->set_abstract(typeNone_abstract);
  313. return stream_active_others_app;
  314. }
  315. CNodePtr KernelAdjust::CreateStreamAssignAddnOP(
  316. const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr,
  317. const std::map<std::string, mindspore::ParameterPtr> &switch_loop_input) {
  318. MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  319. kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder = CreateMngKernelBuilder(
  320. {kOpFormat_DEFAULT, kOpFormat_DEFAULT}, {TypeId::kNumberTypeInt32, TypeId::kNumberTypeInt32});
  321. selected_kernel_builder.SetOutputsFormat({kOpFormat_DEFAULT});
  322. selected_kernel_builder.SetOutputsDeviceType({kNumberTypeInt32});
  323. // AssignAdd
  324. auto assign_add = std::make_shared<Primitive>(kAssignAddOpName);
  325. std::vector<AnfNodePtr> inputs;
  326. inputs.push_back(NewValueNode(assign_add));
  327. inputs.push_back(switch_loop_input.at(kLoopCountParamName));
  328. inputs.push_back(switch_loop_input.at(kOneParamName));
  329. CNodePtr assign_add_one = kernel_graph_ptr->NewCNode(inputs);
  330. MS_EXCEPTION_IF_NULL(assign_add_one);
  331. AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), assign_add_one.get());
  332. std::vector<std::string> input_names = {"ref", "value"};
  333. std::vector<std::string> output_names = {"output"};
  334. ValuePtr input_names_v = MakeValue(input_names);
  335. ValuePtr output_names_v = MakeValue(output_names);
  336. AnfAlgo::SetNodeAttr("input_names", input_names_v, assign_add_one);
  337. AnfAlgo::SetNodeAttr("output_names", output_names_v, assign_add_one);
  338. selected_kernel_builder.SetKernelType(KernelType::TBE_KERNEL);
  339. MS_EXCEPTION_IF_NULL(switch_loop_input.at(kLoopCountParamName));
  340. assign_add_one->set_abstract(switch_loop_input.at(kLoopCountParamName)->abstract());
  341. return assign_add_one;
  342. }
// Load the host-side loop-control tensors (loop_count, iter_loop, zero, one)
// and sync each one into the device memory of the matching graph parameter.
// Returns true when no switch loop is needed or all syncs succeed; false only
// when a SyncHostToDevice call fails.
bool KernelAdjust::StepLoadCtrlInputs(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) {
  if (!NeedInsertSwitch()) {
    return true;
  }
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  auto input_nodes = kernel_graph_ptr->inputs();
  std::vector<tensor::TensorPtr> inputs;
  LoadSwitchInputs(&inputs);
  std::shared_ptr<std::vector<tensor::TensorPtr>> inputsPtr = std::make_shared<std::vector<tensor::TensorPtr>>(inputs);
  kernel_graph_ptr->set_input_ctrl_tensors(inputsPtr);
  size_t input_ctrl_size = inputs.size();
  // inputs_node:include four ctrl nodes in the back. such as:conv,loop_cnt, ites_loop, zero, one.
  // deal four ctrl nodes.
  for (size_t i = 0; i < inputs.size(); ++i) {
    auto tensor = inputs[i];
    // The ctrl parameters occupy the LAST input_ctrl_size graph inputs (they
    // were appended by InsertSwitchLoop), so tensor i maps onto that tail.
    size_t deal_index = input_nodes.size() - input_ctrl_size + i;
    // Also guards size_t wrap-around when input_ctrl_size > input_nodes.size().
    if (deal_index >= input_nodes.size()) {
      MS_LOG(EXCEPTION) << "deal_index[" << deal_index << "] out of range";
    }
    auto input_node = input_nodes[deal_index];
    bool need_sync = false;
    MS_EXCEPTION_IF_NULL(input_node);
    // Sync when the host tensor changed or the parameter has no default value.
    if (input_node->isa<Parameter>()) {
      auto pk_node = input_node->cast<ParameterPtr>();
      MS_EXCEPTION_IF_NULL(tensor);
      MS_EXCEPTION_IF_NULL(pk_node);
      if (tensor->is_dirty() || !pk_node->has_default()) {
        need_sync = true;
      }
    }
    if (need_sync) {
      auto pk_node = input_node->cast<ParameterPtr>();
      MS_EXCEPTION_IF_NULL(pk_node);
      auto device_address = AnfAlgo::GetMutableOutputAddr(pk_node, 0);
      MS_EXCEPTION_IF_NULL(device_address);
      tensor->set_device_address(device_address);
      // Copy the host value into the parameter's device memory.
      if (!device_address->SyncHostToDevice(trans::GetRuntimePaddingShape(pk_node, 0),
                                            LongToSize(tensor->data().nbytes()), tensor->data_type(),
                                            tensor->data_c(false))) {
        MS_LOG(INFO) << "SyncHostToDevice failed.";
        return false;
      }
    }
    tensor->set_dirty(false);
  }
  return true;
}
  390. void KernelAdjust::LoadSwitchInputs(std::vector<tensor::TensorPtr> *inputs) {
  391. MS_LOG(INFO) << "---------------- LoadSwitchInputs---";
  392. MS_EXCEPTION_IF_NULL(inputs);
  393. std::vector<int> shp = {1};
  394. tensor::TensorPtr loop_count_tensor = std::make_shared<tensor::Tensor>(kInt32->type_id(), shp);
  395. MS_EXCEPTION_IF_NULL(loop_count_tensor);
  396. int32_t *val = nullptr;
  397. val = static_cast<int32_t *>(loop_count_tensor->data_c(true));
  398. MS_EXCEPTION_IF_NULL(val);
  399. *val = 0;
  400. inputs->push_back(loop_count_tensor);
  401. tensor::TensorPtr iter_loop_tensor = std::make_shared<tensor::Tensor>(kInt32->type_id(), shp);
  402. MS_EXCEPTION_IF_NULL(iter_loop_tensor);
  403. val = static_cast<int32_t *>(iter_loop_tensor->data_c(true));
  404. MS_EXCEPTION_IF_NULL(val);
  405. *val = SizeToInt(LongToSize(ConfigManager::GetInstance().iter_num()));
  406. MS_LOG(INFO) << "iter_loop_tensor = " << *val;
  407. inputs->push_back(iter_loop_tensor);
  408. tensor::TensorPtr zero_tensor = std::make_shared<tensor::Tensor>(kInt32->type_id(), shp);
  409. MS_EXCEPTION_IF_NULL(zero_tensor);
  410. val = static_cast<int32_t *>(zero_tensor->data_c(true));
  411. MS_EXCEPTION_IF_NULL(val);
  412. *val = 0;
  413. inputs->push_back(zero_tensor);
  414. tensor::TensorPtr one_tensor = std::make_shared<tensor::Tensor>(kInt32->type_id(), shp);
  415. MS_EXCEPTION_IF_NULL(one_tensor);
  416. val = static_cast<int32_t *>(one_tensor->data_c(true));
  417. MS_EXCEPTION_IF_NULL(val);
  418. *val = 1;
  419. inputs->push_back(one_tensor);
  420. MS_LOG(INFO) << "---------------- LoadSwitchInputs End--";
  421. }
  422. void KernelAdjust::Profiling(NotNull<session::KernelGraph *> kernel_graph_ptr) {
  423. if (!ascend::ProfilingManager::GetInstance().IsProfiling()) {
  424. MS_LOG(INFO) << "No need to profiling";
  425. return;
  426. }
  427. ProfilingTraceInfo profiling_trace_info = ProfilingUtils::GetProfilingTraceFromEnv(kernel_graph_ptr);
  428. if (!profiling_trace_info.IsValid()) {
  429. MS_LOG(WARNING) << "[profiling] no profiling node found!";
  430. return;
  431. }
  432. InsertProfilingKernel(profiling_trace_info, kernel_graph_ptr);
  433. }
  434. void KernelAdjust::InsertProfilingKernel(const ProfilingTraceInfo &profiling_trace_info,
  435. NotNull<session::KernelGraph *> kernel_graph_ptr) {
  436. MS_LOG(INFO) << "[profiling] Insert profiling kernel start";
  437. if (!profiling_trace_info.IsValid()) {
  438. MS_LOG(WARNING) << "Profiling trace point not found";
  439. return;
  440. }
  441. std::vector<CNodePtr> new_cnode_list;
  442. std::vector<CNodePtr> cnode_ptr_list = kernel_graph_ptr->execution_order();
  443. if (cnode_ptr_list.empty()) {
  444. MS_LOG(ERROR) << "No CNode in graph";
  445. return;
  446. }
  447. for (const auto &cnode_ptr : cnode_ptr_list) {
  448. ProfilingUtils::ProfilingTraceFpStart(cnode_ptr, profiling_trace_info, kernel_graph_ptr, NOT_NULL(&new_cnode_list));
  449. new_cnode_list.emplace_back(cnode_ptr);
  450. ProfilingUtils::ProfilingCustomOp(cnode_ptr, profiling_trace_info, kernel_graph_ptr, NOT_NULL(&new_cnode_list));
  451. ProfilingUtils::ProfilingTraceBpEnd(cnode_ptr, profiling_trace_info, kernel_graph_ptr, NOT_NULL(&new_cnode_list));
  452. ProfilingUtils::ProfilingTraceEnd(cnode_ptr, profiling_trace_info, kernel_graph_ptr, NOT_NULL(&new_cnode_list));
  453. }
  454. kernel_graph_ptr->set_execution_order(new_cnode_list);
  455. }
  456. } // namespace device
  457. } // namespace mindspore