You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

cpu_session.cc 16 kB

5 years ago
5 years ago
4 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405
  1. /**
  2. * Copyright 2019-2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "backend/session/cpu_session.h"
  17. #include <algorithm>
  18. #include <sstream>
  19. #include <exception>
  20. #include "ir/anf.h"
  21. #include "utils/ms_utils.h"
  22. #include "utils/trace_base.h"
  23. #include "utils/context/graph_kernel_flags.h"
  24. #include "backend/session/anf_runtime_algorithm.h"
  25. #include "runtime/device/kernel_runtime.h"
  26. #include "backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.h"
  27. #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
  28. #include "runtime/device/cpu/kernel_select_cpu.h"
  29. #include "backend/optimizer/common/optimizer.h"
  30. #include "backend/optimizer/common/pass_manager.h"
  31. #include "backend/optimizer/cpu/insert_cast_cpu.h"
  32. #include "backend/optimizer/cpu/insert_format_transform_op.h"
  33. #include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
  34. #include "backend/optimizer/pass/replace_node_by_proxy.h"
  35. #include "backend/optimizer/pass/erase_visit_attr.h"
  36. #include "debug/anf_ir_dump.h"
  37. #include "backend/optimizer/common/common_backend_optimization.h"
  38. #include "debug/dump_proto.h"
  39. #ifndef ENABLE_SECURITY
  40. #include "debug/data_dump/dump_json_parser.h"
  41. #endif
  42. #if ((defined ENABLE_CPU) && (!defined _WIN32))
  43. #include "ps/util.h"
  44. #include "ps/ps_context.h"
  45. #endif
  46. #ifdef ENABLE_DUMP_IR
  47. #include "debug/rdr/graph_recorder.h"
  48. #include "debug/rdr/running_data_recorder.h"
  49. #endif
  50. namespace mindspore {
  51. namespace session {
  52. void CPUSession::Init(uint32_t device_id) {
  53. #ifndef ENABLE_SECURITY
  54. // Dump json config file if dump is enabled
  55. auto &json_parser = DumpJsonParser::GetInstance();
  56. json_parser.Parse();
  57. json_parser.CopyMSCfgJsonToDir(rank_id_);
  58. #endif
  59. InitExecutor(kCPUDevice, device_id);
  60. }
  61. ParameterPtr CPUSession::CreateNewParameterFromParameter(const AnfNodePtr &anf, KernelGraph *graph) {
  62. MS_EXCEPTION_IF_NULL(anf);
  63. MS_EXCEPTION_IF_NULL(graph);
  64. if (!anf->isa<Parameter>()) {
  65. MS_LOG(EXCEPTION) << "anf[" << anf->DebugString() << "] is not a parameter";
  66. }
  67. auto valid_inputs = graph->MutableValidInputs();
  68. MS_EXCEPTION_IF_NULL(valid_inputs);
  69. auto graph_inputs = graph->MutableInputs();
  70. MS_EXCEPTION_IF_NULL(graph_inputs);
  71. TraceManager::DebugTrace(std::make_shared<TraceCopy>(anf->debug_info()));
  72. ParameterPtr new_parameter = graph->NewParameter(anf->cast<ParameterPtr>());
  73. TraceManager::EndTrace();
  74. graph_inputs->push_back(new_parameter);
  75. valid_inputs->push_back(true);
  76. return new_parameter;
  77. }
  78. // Remove after PS feature finish adapting push/pull in auto_monad.
  79. void CPUSession::Reorder(std::vector<CNodePtr> *node_list) { AnfAlgo::ReorderPosteriorExecList(NOT_NULL(node_list)); }
  80. void CPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
  81. auto optimizer = std::make_shared<opt::GraphOptimizer>();
  82. auto pm = std::make_shared<opt::PassManager>();
  83. #if ((defined ENABLE_CPU) && (!defined _WIN32) && !defined(__APPLE__))
  84. auto ms_context = MsContext::GetInstance();
  85. MS_EXCEPTION_IF_NULL(ms_context);
  86. if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) != kPynativeMode && ps::PSContext::instance()->is_ps_mode()) {
  87. AssignParamKey(kernel_graph);
  88. if (ps::PSContext::instance()->is_worker()) {
  89. std::string pass_name = "replace_node_by_proxy";
  90. pass_name.append(std::to_string(graph_sum_));
  91. pm->AddPass(std::make_shared<opt::ReplaceNodeByProxy>(pass_name));
  92. }
  93. }
  94. #endif
  95. pm->AddPass(std::make_shared<opt::InsertFormatTransformOpCPU>("insert_format_transform_op_cpu"));
  96. pm->AddPass(std::make_shared<opt::InsertCastCPU>("insert_cast"));
  97. pm->AddPass(std::make_shared<opt::EraseVisitAttr>());
  98. optimizer->AddPassManager(pm);
  99. (void)optimizer->Optimize(kernel_graph);
  100. kernel_graph->SetExecOrderByDefault();
  101. }
  102. void CPUSession::GraphKernelOptimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
  103. #ifdef ENABLE_AKG
  104. if (!graphkernel::GraphKernelFlags::GetInstance().IsEnableGraphKernel()) {
  105. return;
  106. }
  107. graphkernel::GraphKernelOptimize(kernel_graph);
  108. kernel_graph->SetExecOrderByDefault();
  109. #endif
  110. }
GraphId CPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) {
  // Compile pipeline for a CPU kernel graph: construct the graph from the ANF
  // node list, select kernel info, run the optimization passes, build kernels,
  // then initialize the runtime and pre-assign kernel addresses.
  // Returns the id reserved for the newly constructed graph.
  auto graph_id = graph_sum_;
  auto graph = ConstructKernelGraph(lst, outputs);
  MS_EXCEPTION_IF_NULL(graph);
  opt::AddDynamicShapeAttrPass(graph);
  MS_LOG(INFO) << "Set kernel info";
  SetKernelInfo(graph.get());
  MS_LOG(INFO) << "Set kernel info end";
  Optimize(graph);
  FinalOptimize(graph);
  GraphKernelOptimize(graph);
  MS_LOG(INFO) << "Build kernel";
  BuildKernel(graph.get());
  // Remove reorder after PS feature finish adapting push/pull in auto_monad.
  auto execution_order = graph->execution_order();
  Reorder(&execution_order);
  graph->set_execution_order(execution_order);
#ifdef ENABLE_DUMP_IR
  // Record the built graph and its final execution order in the running data recorder.
  std::string name = "graph_build." + std::to_string(graph->graph_id());
  DumpGraphParams dump_params = {true, static_cast<int>(kWholeStack)};
  (void)mindspore::RDR::RecordAnfGraph(SubModuleId::SM_SESSION, name, graph, dump_params, ".ir");
  const std::vector<CNodePtr> &exec_order = graph->execution_order();
  std::string exec_order_name = "graph_exec_order." + std::to_string(graph->graph_id());
  (void)mindspore::RDR::RecordGraphExecOrder(SubModuleId::SM_SESSION, exec_order_name, exec_order);
#endif
  // runtime init
  if (!runtime_.Init()) {
    MS_LOG(EXCEPTION) << "Kernel runtime init error.";
  }
  MS_LOG(INFO) << "Assign kernel address";
  runtime_.AssignKernelAddress(graph.get());
  // set summary node
#ifndef ENABLE_SECURITY
  SetSummaryNodes(graph.get());
#endif
  // Summary outputs keep an extra ref so their memory survives until read.
  runtime_.IncreaseSummaryRefCount(graph->summary_nodes());
  DumpGraph(graph);
  return graph_id;
}
  150. void CPUSession::CreateOutputTensors(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &input_tensors,
  151. VectorRef *outputs,
  152. std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node,
  153. KernelMapTensor *) {
  154. auto kernel_graph = GetGraph(graph_id);
  155. MS_EXCEPTION_IF_NULL(kernel_graph);
  156. runtime_.CreateOutputTensors(kernel_graph.get(), input_tensors, outputs, tensor_to_node);
  157. }
  158. void CPUSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
  159. const std::vector<tensor::TensorPtr> &inputs_const) const {
  160. MS_EXCEPTION_IF_NULL(kernel_graph);
  161. auto &input_nodes = kernel_graph->input_nodes();
  162. if (input_nodes.size() != inputs_const.size()) {
  163. MS_LOG(EXCEPTION) << "Input size " << inputs_const.size() << " is not equal to input node size "
  164. << input_nodes.size();
  165. }
  166. for (size_t input_idx = 0; input_idx < input_nodes.size(); ++input_idx) {
  167. auto &input_node = input_nodes[input_idx];
  168. MS_EXCEPTION_IF_NULL(input_node);
  169. if (!input_node->isa<Parameter>() || HasAbstractMonad(input_node)) {
  170. continue;
  171. }
  172. auto address = AnfAlgo::GetMutableOutputAddr(input_node, 0);
  173. auto tensor = inputs_const[input_idx];
  174. auto tensor_address = tensor->device_address();
  175. MS_EXCEPTION_IF_NULL(address);
  176. MS_EXCEPTION_IF_NULL(tensor);
  177. if (tensor_address == nullptr || tensor_address == address) {
  178. continue;
  179. }
  180. auto input_param = input_node->cast<ParameterPtr>();
  181. if (AnfAlgo::IsParameterWeight(input_param) && !tensor->IsUpdatedByDevice()) {
  182. continue;
  183. }
  184. if (std::dynamic_pointer_cast<device::DeviceAddress>(tensor_address)->DeviceType() !=
  185. device::DeviceAddressType::kCPU) {
  186. tensor->data_sync(false);
  187. }
  188. }
  189. }
  190. void CPUSession::PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
  191. const std::vector<tensor::TensorPtr> &inputs, VectorRef *const outputs) {
  192. MS_LOG(INFO) << "Bind input output address";
  193. runtime_.BindInputOutput(kernel_graph.get(), inputs, outputs);
  194. #if ((defined ENABLE_CPU) && (!defined _WIN32) && !defined(__APPLE__))
  195. InitPSParamAndOptim(kernel_graph, inputs);
  196. #endif
  197. }
  198. void CPUSession::PostExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
  199. const std::vector<tensor::TensorPtr> &, VectorRef *const) {
  200. #ifndef ENABLE_SECURITY
  201. Summary(kernel_graph.get());
  202. #endif
  203. }
  204. void CPUSession::ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) {
  205. bool ret = runtime_.Run(*kernel_graph, false);
  206. if (!ret) {
  207. MS_LOG(EXCEPTION) << "Run graph failed";
  208. }
  209. }
  210. KernelGraphPtr CPUSession::BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
  211. const std::vector<tensor::TensorPtr> &input_tensors,
  212. const std::vector<int64_t> &tensors_mask) {
  213. // Check if the graph cache exists.
  214. auto it = run_op_graphs_.find(graph_info);
  215. if (it != run_op_graphs_.end()) {
  216. return it->second;
  217. }
  218. // Prepare the graph
  219. const auto &kernel_graph = ConstructSingleOpGraph(op_run_info, input_tensors, tensors_mask);
  220. MS_EXCEPTION_IF_NULL(kernel_graph);
  221. SetKernelInfo(kernel_graph.get());
  222. Optimize(kernel_graph);
  223. BuildKernel(kernel_graph.get());
  224. auto enable_op_graph_cache = MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE);
  225. if (enable_op_graph_cache) {
  226. run_op_graphs_[graph_info] = kernel_graph;
  227. }
  228. return kernel_graph;
  229. }
  230. void CPUSession::SetOutputFlags(const VectorRef &base_ref) {
  231. for (size_t i = 0; i < base_ref.size(); ++i) {
  232. if (utils::isa<VectorRef>(base_ref[i])) {
  233. auto ref_iter = utils::cast<VectorRef>(base_ref[i]);
  234. SetOutputFlags(ref_iter);
  235. } else if (utils::isa<tensor::TensorPtr>(base_ref[i])) {
  236. auto tensor_ptr = utils::cast<std::shared_ptr<tensor::Tensor>>(base_ref[i]);
  237. tensor_ptr->SetNeedWait(false);
  238. tensor_ptr->data_sync(false);
  239. }
  240. }
  241. }
  242. void CPUSession::UpdateDynamicOutputShape(const std::map<tensor::TensorPtr, KernelWithIndex> &tensor_to_node) {
  243. for (const auto &tensor_node : tensor_to_node) {
  244. if (AnfAlgo::IsDynamicShape(tensor_node.second.first)) {
  245. const auto &kernel = tensor_node.second.first;
  246. const auto &output_index = tensor_node.second.second;
  247. const auto &shape = AnfAlgo::GetOutputInferShape(kernel, output_index);
  248. std::vector<int64_t> refresh_shape;
  249. (void)std::copy(shape.begin(), shape.end(), std::back_inserter(refresh_shape));
  250. MS_EXCEPTION_IF_NULL(tensor_node.first);
  251. tensor_node.first->set_shape(refresh_shape);
  252. }
  253. }
  254. }
  255. void CPUSession::RunOpImplOrigin(const GraphInfo &graph_info, OpRunInfo *op_run_info,
  256. std::vector<tensor::TensorPtr> *input_tensors, VectorRef *outputs,
  257. const std::vector<int64_t> &tensors_mask) {
  258. RunOpImpl(graph_info, op_run_info, input_tensors, outputs, tensors_mask);
  259. }
void CPUSession::RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_info,
                           std::vector<tensor::TensorPtr> *input_tensors, VectorRef *outputs,
                           const std::vector<int64_t> &tensors_mask) {
  // Build (or fetch the cached) single-op graph for this op, then run it
  // synchronously on the CPU runtime and materialize the outputs.
  MS_EXCEPTION_IF_NULL(input_tensors);
  MS_EXCEPTION_IF_NULL(op_run_info);
  ProcessInputTensorsForHeterogeneous("CPU", *input_tensors);
  const auto &kernel_graph = BuildOpImpl(*op_run_info, graph_info, *input_tensors, tensors_mask);
  // Value-node inputs were folded into the graph; drop them from the runtime input list.
  EraseValueNodeTensor(tensors_mask, input_tensors);
  // Remove reorder after PS feature finish adapting push/pull in auto_monad.
  auto execution_order = kernel_graph->execution_order();
  Reorder(&execution_order);
  kernel_graph->set_execution_order(execution_order);
  // runtime init
  if (!runtime_.Init()) {
    MS_LOG(EXCEPTION) << "Kernel runtime init error.";
  }
  runtime_.AssignKernelAddress(kernel_graph.get());
  std::map<tensor::TensorPtr, session::KernelWithIndex> tensor_to_node;
  runtime_.CreateOutputTensors(kernel_graph.get(), *input_tensors, outputs, &tensor_to_node);
  runtime_.BindInputOutput(kernel_graph.get(), *input_tensors, outputs);
  bool ret = runtime_.Run(*kernel_graph, false);
  if (!ret) {
    MS_LOG(EXCEPTION) << "Run Op failed";
  }
  // Dynamic-shape kernels may have refined output shapes during Run; propagate them.
  UpdateDynamicOutputShape(tensor_to_node);
  // update output abstract of dynamic op to op_run_info
  if (op_run_info->is_dynamic_shape) {
    UpdateOutputAbstract(kernel_graph, op_run_info);
  }
  SetOutputFlags(*outputs);
  runtime_.RunOpClearMemory(*kernel_graph);
}
  292. void CPUSession::SetKernelInfo(const KernelGraph *kernel_graph) {
  293. MS_EXCEPTION_IF_NULL(kernel_graph);
  294. auto &kernel_nodes = kernel_graph->execution_order();
  295. for (const auto &kernel_node : kernel_nodes) {
  296. MS_EXCEPTION_IF_NULL(kernel_node);
  297. device::cpu::SetKernelInfo(kernel_node);
  298. }
  299. }
  300. namespace {
  301. void KernelNotSupportException(const AnfNodePtr &kernel_node) {
  302. std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
  303. std::stringstream operator_info;
  304. operator_info << "Operator[" << kernel_name << "] ";
  305. auto kernel_info = dynamic_cast<device::KernelInfo *>(kernel_node->kernel_info());
  306. if (kernel_info == nullptr) {
  307. operator_info << "is not support.";
  308. MS_LOG(EXCEPTION) << operator_info.str();
  309. }
  310. auto kernel_build_Info = kernel_info->select_kernel_build_info();
  311. if (kernel_build_Info == nullptr) {
  312. operator_info << "is not support.";
  313. MS_LOG(EXCEPTION) << operator_info.str();
  314. }
  315. size_t input_num = kernel_build_Info->GetInputNum();
  316. if (input_num > 0) {
  317. operator_info << " input(";
  318. for (size_t i = 0; i < input_num; ++i) {
  319. operator_info << TypeIdLabel(kernel_build_Info->GetInputDeviceType(i));
  320. if (i != input_num - 1) {
  321. operator_info << ",";
  322. }
  323. }
  324. operator_info << ") ";
  325. }
  326. size_t output_num = kernel_build_Info->GetOutputNum();
  327. if (output_num > 0) {
  328. operator_info << "output(";
  329. for (size_t i = 0; i < output_num; ++i) {
  330. operator_info << TypeIdLabel(kernel_build_Info->GetOutputDeviceType(i));
  331. if (i != kernel_build_Info->GetOutputNum() - 1) {
  332. operator_info << ",";
  333. }
  334. }
  335. operator_info << ") ";
  336. }
  337. operator_info << "is not support.";
  338. MS_LOG(EXCEPTION) << operator_info.str() << trace::DumpSourceLines(kernel_node);
  339. }
  340. } // namespace
void CPUSession::BuildKernel(const KernelGraph *kernel_graph) {
  // Create a kernel mod for every node in execution order. AKG nodes are
  // collected and compiled in parallel at the end; every other node gets a
  // native CPU kernel from the factory (or a not-supported exception).
  MS_EXCEPTION_IF_NULL(kernel_graph);
  auto &kernel_nodes = kernel_graph->execution_order();
  kernel::KernelMeta *bin_map = kernel::KernelMeta::GetInstance();
  MS_EXCEPTION_IF_NULL(bin_map);
  std::vector<AnfNodePtr> akg_nodes;
  for (const auto &kernel_node : kernel_nodes) {
    MS_EXCEPTION_IF_NULL(kernel_node);
    std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    MS_LOG(INFO) << "Cpu building operator[" << kernel_name << "].";
    if (session::AnfRuntimeAlgorithm::GetKernelType(kernel_node) == KernelType::AKG_KERNEL) {
      // Lazily initialize the kernel meta store used by the AKG build, once.
      if (!bin_map->initialized()) {
        bin_map->Initialize();
      }
      akg_nodes.push_back(kernel_node);
      continue;
    }
    std::shared_ptr<kernel::CPUKernel> cpu_kernel =
      kernel::CPUKernelFactory::GetInstance().Create(kernel_name, kernel_node);
    if (cpu_kernel == nullptr) {
      // Throws with a detailed input/output type message.
      KernelNotSupportException(kernel_node);
    }
    try {
      cpu_kernel->Init(kernel_node);
    } catch (std::exception &e) {
      // Attach the failing node's source location to the original error.
      MS_LOG(EXCEPTION) << e.what() << trace::DumpSourceLines(kernel_node);
    }
    AnfAlgo::SetKernelMod(cpu_kernel, kernel_node.get());
    MS_LOG(INFO) << "Cpu build success operator[" << kernel_name << "].";
  }
#ifdef ENABLE_AKG
  kernel::AkgCpuKernelBuilder akg_cpu_kernel_builder;
  (void)akg_cpu_kernel_builder.AkgKernelParallelBuild(akg_nodes);
#endif
}
  376. } // namespace session
  377. } // namespace mindspore