You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-'), and can be up to 35 characters long.

cpu_session.cc 15 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378
  1. /**
  2. * Copyright 2019-2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "backend/session/cpu_session.h"
  17. #include <algorithm>
  18. #include <sstream>
  19. #include <exception>
  20. #include "ir/anf.h"
  21. #include "utils/ms_utils.h"
  22. #include "utils/trace_base.h"
  23. #include "backend/session/anf_runtime_algorithm.h"
  24. #include "runtime/device/kernel_runtime.h"
  25. #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
  26. #include "runtime/device/cpu/kernel_select_cpu.h"
  27. #include "backend/optimizer/common/optimizer.h"
  28. #include "backend/optimizer/common/pass_manager.h"
  29. #include "backend/optimizer/cpu/insert_cast_cpu.h"
  30. #include "backend/optimizer/cpu/insert_format_transform_op.h"
  31. #include "backend/optimizer/pass/replace_node_by_proxy.h"
  32. #include "backend/optimizer/pass/erase_visit_attr.h"
  33. #include "debug/anf_ir_dump.h"
  34. #include "debug/dump_proto.h"
  35. #include "debug/data_dump/dump_json_parser.h"
  36. #if ((defined ENABLE_CPU) && (!defined _WIN32))
  37. #include "ps/util.h"
  38. #include "ps/ps_context.h"
  39. #endif
  40. #ifdef ENABLE_DUMP_IR
  41. #include "debug/rdr/graph_recorder.h"
  42. #include "debug/rdr/running_data_recorder.h"
  43. #endif
  44. namespace mindspore {
  45. namespace session {
  46. void CPUSession::Init(uint32_t device_id) {
  47. #ifndef ENABLE_SECURITY
  48. // Dump json config file if dump is enabled
  49. auto &json_parser = DumpJsonParser::GetInstance();
  50. json_parser.Parse();
  51. json_parser.CopyMSCfgJsonToDir(rank_id_);
  52. #endif
  53. InitExecutor(kCPUDevice, device_id);
  54. }
  55. ParameterPtr CPUSession::CreateNewParameterFromParameter(const AnfNodePtr &anf, KernelGraph *graph) {
  56. MS_EXCEPTION_IF_NULL(anf);
  57. MS_EXCEPTION_IF_NULL(graph);
  58. if (!anf->isa<Parameter>()) {
  59. MS_LOG(EXCEPTION) << "anf[" << anf->DebugString() << "] is not a parameter";
  60. }
  61. auto valid_inputs = graph->MutableValidInputs();
  62. MS_EXCEPTION_IF_NULL(valid_inputs);
  63. auto graph_inputs = graph->MutableInputs();
  64. MS_EXCEPTION_IF_NULL(graph_inputs);
  65. TraceManager::DebugTrace(std::make_shared<TraceCopy>(anf->debug_info()));
  66. ParameterPtr new_parameter = graph->NewParameter(anf->cast<ParameterPtr>());
  67. TraceManager::EndTrace();
  68. graph_inputs->push_back(new_parameter);
  69. valid_inputs->push_back(true);
  70. return new_parameter;
  71. }
  72. // Remove after PS feature finish adapting push/pull in auto_monad.
  73. void CPUSession::Reorder(std::vector<CNodePtr> *node_list) { AnfAlgo::ReorderPosteriorExecList(NOT_NULL(node_list)); }
  74. void CPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
  75. auto optimizer = std::make_shared<opt::GraphOptimizer>();
  76. auto pm = std::make_shared<opt::PassManager>();
  77. #if ((defined ENABLE_CPU) && (!defined _WIN32))
  78. auto ms_context = MsContext::GetInstance();
  79. MS_EXCEPTION_IF_NULL(ms_context);
  80. if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) != kPynativeMode && ps::PSContext::instance()->is_ps_mode()) {
  81. AssignParamKey(kernel_graph);
  82. if (ps::PSContext::instance()->is_worker()) {
  83. std::string pass_name = "replace_node_by_proxy";
  84. pass_name.append(std::to_string(graph_sum_));
  85. pm->AddPass(std::make_shared<opt::ReplaceNodeByProxy>(pass_name));
  86. }
  87. }
  88. #endif
  89. pm->AddPass(std::make_shared<opt::InsertFormatTransformOpCPU>("insert_format_transform_op_cpu"));
  90. optimizer->AddPassManager(pm);
  91. (void)optimizer->Optimize(kernel_graph);
  92. kernel_graph->SetExecOrderByDefault();
  93. }
  94. void CPUSession::ProcessCast(const std::shared_ptr<KernelGraph> &kernel_graph) {
  95. auto optimizer = std::make_shared<opt::GraphOptimizer>();
  96. auto pm = std::make_shared<opt::PassManager>();
  97. pm->AddPass(std::make_shared<opt::InsertCastCPU>("insert_cast_cpu"));
  98. MS_LOG(INFO) << "Insert cast pass";
  99. pm->AddPass(std::make_shared<opt::EraseVisitAttr>());
  100. optimizer->AddPassManager(pm);
  101. (void)optimizer->Optimize(kernel_graph);
  102. kernel_graph->SetExecOrderByDefault();
  103. }
  104. GraphId CPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) {
  105. auto graph_id = graph_sum_;
  106. auto graph = ConstructKernelGraph(lst, outputs);
  107. MS_EXCEPTION_IF_NULL(graph);
  108. UpdateGraphDynamicShapeAttr(NOT_NULL(graph));
  109. graph->UpdateGraphDynamicAttr();
  110. MS_LOG(INFO) << "Set kernel info";
  111. SetKernelInfo(graph.get());
  112. MS_LOG(INFO) << "Set kernel info end";
  113. Optimize(graph);
  114. FinalOptimize(graph);
  115. MS_LOG(INFO) << "Build kernel";
  116. BuildKernel(graph.get());
  117. ProcessCast(graph);
  118. // Remove reorder after PS feature finish adapting push/pull in auto_monad.
  119. auto execution_order = graph->execution_order();
  120. Reorder(&execution_order);
  121. graph->set_execution_order(execution_order);
  122. #ifdef ENABLE_DUMP_IR
  123. std::string name = "graph_build." + std::to_string(graph->graph_id());
  124. DumpGraphParams dump_params = {true, static_cast<int>(kWholeStack)};
  125. (void)mindspore::RDR::RecordAnfGraph(SubModuleId::SM_SESSION, name, graph, dump_params, ".ir");
  126. const std::vector<CNodePtr> &exec_order = graph->execution_order();
  127. std::string exec_order_name = "graph_exec_order." + std::to_string(graph->graph_id());
  128. (void)mindspore::RDR::RecordGraphExecOrder(SubModuleId::SM_SESSION, exec_order_name, exec_order);
  129. #endif
  130. // runtime init
  131. if (!runtime_.Init()) {
  132. MS_LOG(EXCEPTION) << "Kernel runtime init error.";
  133. }
  134. MS_LOG(INFO) << "Assign kernel address";
  135. runtime_.AssignKernelAddress(graph.get());
  136. // set summary node
  137. SetSummaryNodes(graph.get());
  138. runtime_.IncreaseSummaryRefCount(graph->summary_nodes());
  139. DumpGraph(graph);
  140. return graph_id;
  141. }
  142. void CPUSession::CreateOutputTensors(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &input_tensors,
  143. VectorRef *outputs,
  144. std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node) {
  145. auto kernel_graph = GetGraph(graph_id);
  146. MS_EXCEPTION_IF_NULL(kernel_graph);
  147. runtime_.CreateOutputTensors(kernel_graph.get(), input_tensors, outputs, tensor_to_node);
  148. }
  149. void CPUSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
  150. const std::vector<tensor::TensorPtr> &inputs_const) const {
  151. MS_EXCEPTION_IF_NULL(kernel_graph);
  152. auto &input_nodes = kernel_graph->inputs();
  153. if (input_nodes.size() != inputs_const.size()) {
  154. MS_LOG(EXCEPTION) << "Input size " << inputs_const.size() << " is not equal to input node size "
  155. << input_nodes.size();
  156. }
  157. for (size_t input_idx = 0; input_idx < input_nodes.size(); ++input_idx) {
  158. auto &input_node = input_nodes[input_idx];
  159. MS_EXCEPTION_IF_NULL(input_node);
  160. if (!input_node->isa<Parameter>() || HasAbstractMonad(input_node)) {
  161. continue;
  162. }
  163. auto address = AnfAlgo::GetMutableOutputAddr(input_node, 0);
  164. auto tensor = inputs_const[input_idx];
  165. auto tensor_address = tensor->device_address();
  166. MS_EXCEPTION_IF_NULL(address);
  167. MS_EXCEPTION_IF_NULL(tensor);
  168. if (tensor_address == nullptr || tensor_address == address) {
  169. continue;
  170. }
  171. auto input_param = input_node->cast<ParameterPtr>();
  172. if (AnfAlgo::IsParameterWeight(input_param) && !tensor->IsUpdatedByDevice()) {
  173. continue;
  174. }
  175. if (std::dynamic_pointer_cast<device::DeviceAddress>(tensor_address)->DeviceType() !=
  176. device::DeviceAddressType::kCPU) {
  177. tensor->data_sync(false);
  178. }
  179. }
  180. }
  181. void CPUSession::PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
  182. const std::vector<tensor::TensorPtr> &inputs, VectorRef *const outputs) {
  183. MS_LOG(INFO) << "Bind input output address";
  184. runtime_.BindInputOutput(kernel_graph.get(), inputs, outputs);
  185. #if ((defined ENABLE_CPU) && (!defined _WIN32))
  186. InitPSParamAndOptim(kernel_graph, inputs);
  187. #endif
  188. }
  189. void CPUSession::PostExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
  190. const std::vector<tensor::TensorPtr> &, VectorRef *const) {
  191. Summary(kernel_graph.get());
  192. }
  193. void CPUSession::ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) {
  194. bool ret = runtime_.Run(kernel_graph.get(), false);
  195. if (!ret) {
  196. MS_LOG(EXCEPTION) << "Run graph failed";
  197. }
  198. }
  199. KernelGraphPtr CPUSession::BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
  200. const std::vector<tensor::TensorPtr> &input_tensors,
  201. const std::vector<int64_t> &tensors_mask) {
  202. // Check if the graph cache exists.
  203. auto it = run_op_graphs_.find(graph_info);
  204. if (it != run_op_graphs_.end()) {
  205. return it->second;
  206. }
  207. // Prepare the graph
  208. const auto &kernel_graph = ConstructSingleOpGraph(op_run_info, input_tensors, tensors_mask);
  209. MS_EXCEPTION_IF_NULL(kernel_graph);
  210. SetKernelInfo(kernel_graph.get());
  211. Optimize(kernel_graph);
  212. BuildKernel(kernel_graph.get());
  213. ProcessCast(kernel_graph);
  214. auto enable_op_graph_cache = MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE);
  215. if (enable_op_graph_cache) {
  216. run_op_graphs_[graph_info] = kernel_graph;
  217. }
  218. return kernel_graph;
  219. }
  220. void CPUSession::SetOutputFlags(const VectorRef &base_ref) {
  221. for (size_t i = 0; i < base_ref.size(); ++i) {
  222. if (utils::isa<VectorRef>(base_ref[i])) {
  223. auto ref_iter = utils::cast<VectorRef>(base_ref[i]);
  224. SetOutputFlags(ref_iter);
  225. } else if (utils::isa<tensor::TensorPtr>(base_ref[i])) {
  226. auto tensor_ptr = utils::cast<std::shared_ptr<tensor::Tensor>>(base_ref[i]);
  227. tensor_ptr->SetNeedWait(false);
  228. tensor_ptr->data_sync(false);
  229. }
  230. }
  231. }
  232. void CPUSession::UpdateDynamicOutputShape(const std::map<tensor::TensorPtr, KernelWithIndex> &tensor_to_node) {
  233. for (const auto &tensor_node : tensor_to_node) {
  234. if (AnfAlgo::IsDynamicShape(tensor_node.second.first)) {
  235. const auto &kernel = tensor_node.second.first;
  236. const auto &output_index = tensor_node.second.second;
  237. const auto &shape = AnfAlgo::GetOutputInferShape(kernel, output_index);
  238. std::vector<int64_t> refresh_shape;
  239. (void)std::copy(shape.begin(), shape.end(), std::back_inserter(refresh_shape));
  240. tensor_node.first->set_shape(refresh_shape);
  241. }
  242. }
  243. }
  244. void CPUSession::RunOpImplOrigin(const GraphInfo &graph_info, OpRunInfo *op_run_info,
  245. std::vector<tensor::TensorPtr> *input_tensors, VectorRef *outputs,
  246. const std::vector<int64_t> &tensors_mask) {
  247. RunOpImpl(graph_info, op_run_info, input_tensors, outputs, tensors_mask);
  248. }
  249. void CPUSession::RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_info,
  250. std::vector<tensor::TensorPtr> *input_tensors, VectorRef *outputs,
  251. const std::vector<int64_t> &tensors_mask) {
  252. MS_EXCEPTION_IF_NULL(input_tensors);
  253. MS_EXCEPTION_IF_NULL(op_run_info);
  254. const auto &kernel_graph = BuildOpImpl(*op_run_info, graph_info, *input_tensors, tensors_mask);
  255. EraseValueNodeTensor(tensors_mask, input_tensors);
  256. // Remove reorder after PS feature finish adapting push/pull in auto_monad.
  257. auto execution_order = kernel_graph->execution_order();
  258. Reorder(&execution_order);
  259. kernel_graph->set_execution_order(execution_order);
  260. // runtime init
  261. if (!runtime_.Init()) {
  262. MS_LOG(EXCEPTION) << "Kernel runtime init error.";
  263. }
  264. runtime_.AssignKernelAddress(kernel_graph.get());
  265. std::map<tensor::TensorPtr, session::KernelWithIndex> tensor_to_node;
  266. runtime_.CreateOutputTensors(kernel_graph.get(), *input_tensors, outputs, &tensor_to_node);
  267. runtime_.BindInputOutput(kernel_graph.get(), *input_tensors, outputs);
  268. bool ret = runtime_.Run(kernel_graph.get(), false);
  269. if (!ret) {
  270. MS_LOG(EXCEPTION) << "Run Op failed";
  271. }
  272. UpdateDynamicOutputShape(tensor_to_node);
  273. // update output abstract of dynamic op to op_run_info
  274. if (op_run_info->is_dynamic_shape) {
  275. UpdateOutputAbstract(kernel_graph, op_run_info);
  276. }
  277. SetOutputFlags(*outputs);
  278. runtime_.RunOpClearMemory(kernel_graph.get());
  279. }
  280. void CPUSession::SetKernelInfo(const KernelGraph *kernel_graph) {
  281. MS_EXCEPTION_IF_NULL(kernel_graph);
  282. auto &kernel_nodes = kernel_graph->execution_order();
  283. for (const auto &kernel_node : kernel_nodes) {
  284. MS_EXCEPTION_IF_NULL(kernel_node);
  285. device::cpu::SetKernelInfo(kernel_node);
  286. }
  287. }
  288. namespace {
  289. void KernelNotSupportException(const AnfNodePtr &kernel_node) {
  290. std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
  291. std::stringstream operator_info;
  292. operator_info << "Operator[" << kernel_name << "] ";
  293. auto kernel_info = dynamic_cast<device::KernelInfo *>(kernel_node->kernel_info());
  294. if (kernel_info == nullptr) {
  295. operator_info << "is not support.";
  296. MS_LOG(EXCEPTION) << operator_info.str();
  297. }
  298. auto kernel_build_Info = kernel_info->select_kernel_build_info();
  299. if (kernel_build_Info == nullptr) {
  300. operator_info << "is not support.";
  301. MS_LOG(EXCEPTION) << operator_info.str();
  302. }
  303. size_t input_num = kernel_build_Info->GetInputNum();
  304. if (input_num > 0) {
  305. operator_info << " input(";
  306. for (size_t i = 0; i < input_num; ++i) {
  307. operator_info << TypeIdLabel(kernel_build_Info->GetInputDeviceType(i));
  308. if (i != input_num - 1) {
  309. operator_info << ",";
  310. }
  311. }
  312. operator_info << ") ";
  313. }
  314. size_t output_num = kernel_build_Info->GetOutputNum();
  315. if (output_num > 0) {
  316. operator_info << "output(";
  317. for (size_t i = 0; i < output_num; ++i) {
  318. operator_info << TypeIdLabel(kernel_build_Info->GetOutputDeviceType(i));
  319. if (i != kernel_build_Info->GetOutputNum() - 1) {
  320. operator_info << ",";
  321. }
  322. }
  323. operator_info << ") ";
  324. }
  325. operator_info << "is not support.";
  326. MS_LOG(EXCEPTION) << operator_info.str() << " Trace: " << trace::DumpSourceLines(kernel_node);
  327. }
  328. } // namespace
  329. void CPUSession::BuildKernel(const KernelGraph *kernel_graph) {
  330. MS_EXCEPTION_IF_NULL(kernel_graph);
  331. auto &kernel_nodes = kernel_graph->execution_order();
  332. for (const auto &kernel_node : kernel_nodes) {
  333. MS_EXCEPTION_IF_NULL(kernel_node);
  334. std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
  335. MS_LOG(INFO) << "Cpu building operator[" << kernel_name << "].";
  336. std::shared_ptr<kernel::CPUKernel> cpu_kernel =
  337. kernel::CPUKernelFactory::GetInstance().Create(kernel_name, kernel_node);
  338. if (cpu_kernel == nullptr) {
  339. KernelNotSupportException(kernel_node);
  340. }
  341. try {
  342. cpu_kernel->Init(kernel_node);
  343. } catch (std::exception &e) {
  344. MS_LOG(EXCEPTION) << e.what() << "\nTrace: " << trace::DumpSourceLines(kernel_node);
  345. }
  346. AnfAlgo::SetKernelMod(cpu_kernel, kernel_node.get());
  347. MS_LOG(INFO) << "Cpu build success operator[" << kernel_name << "].";
  348. }
  349. }
  350. } // namespace session
  351. } // namespace mindspore