/**
 * Copyright 2019-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "backend/common/session/cpu_session.h"
// NOTE(review): the targets of the three standard-library includes were lost to
// angle-bracket stripping; reconstructed from the std:: names used in this file
// (std::stringstream, std::copy, std::exception) — confirm against upstream.
#include <algorithm>
#include <sstream>
#include <exception>
#include "ir/anf.h"
#include "utils/ms_utils.h"
#include "utils/trace_base.h"
#include "include/common/utils/context/graph_kernel_flags.h"
#include "backend/common/session/anf_runtime_algorithm.h"
#include "include/common/utils/anfalgo.h"
#include "runtime/device/kernel_runtime.h"
#include "plugin/device/cpu/kernel/akg/akg_cpu_kernel_build.h"
#include "plugin/device/cpu/kernel/cpu_kernel_factory.h"
#include "plugin/device/cpu/hal/device/kernel_select_cpu.h"
#include "backend/common/optimizer/optimizer.h"
#include "backend/common/optimizer/pass_manager.h"
#include "plugin/device/cpu/optimizer/insert_cast_cpu.h"
#include "plugin/device/cpu/optimizer/insert_format_transform_op.h"
#include "common/graph_kernel/adapter/graph_kernel_optimization.h"
#include "backend/common/pass/replace_node_by_proxy.h"
#include "backend/common/pass/erase_visit_attr.h"
#include "debug/anf_ir_dump.h"
#include "backend/common/optimizer/common_backend_optimization.h"
#include "debug/dump_proto.h"
#ifndef ENABLE_SECURITY
#include "debug/data_dump/dump_json_parser.h"
#endif
#if ((defined ENABLE_CPU) && (!defined _WIN32))
#include "ps/util.h"
#include "ps/ps_context.h"
#endif
#ifdef ENABLE_DUMP_IR
#include "debug/rdr/graph_recorder.h"
#include "debug/rdr/running_data_recorder.h"
#endif
namespace mindspore {
namespace session {
// Initializes the CPU session: parses the dump-json configuration (when dump is
// compiled in) and creates the executor bound to the CPU device.
void CPUSession::Init(uint32_t device_id) {
#ifndef ENABLE_SECURITY
  // Dump json config file if dump is enabled
  auto &json_parser = DumpJsonParser::GetInstance();
  json_parser.Parse();
  json_parser.CopyMSCfgJsonToDir(rank_id_);
#endif
  InitExecutor(kCPUDevice, device_id);
}

// Clones a front-end Parameter node into the kernel graph and registers the
// clone as a valid graph input. Raises if `anf` is not actually a Parameter.
ParameterPtr CPUSession::CreateNewParameterFromParameter(const AnfNodePtr &anf, KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(anf);
  MS_EXCEPTION_IF_NULL(graph);
  if (!anf->isa<Parameter>()) {
    MS_LOG(EXCEPTION) << "anf[" << anf->DebugString() << "] is not a parameter";
  }
  auto valid_inputs = graph->MutableValidInputs();
  auto graph_inputs = graph->MutableInputs();
  MS_EXCEPTION_IF_NULL(graph_inputs);
  // Keep the debug trace pointing at the original node while cloning.
  TraceManager::DebugTrace(std::make_shared<TraceCopy>(anf->debug_info()));
  ParameterPtr new_parameter = graph->NewParameter(anf->cast<ParameterPtr>());
  TraceManager::EndTrace();
  graph_inputs->push_back(new_parameter);
  valid_inputs->push_back(true);
  return new_parameter;
}

// Remove after PS feature finish adapting push/pull in auto_monad.
void CPUSession::Reorder(std::vector<CNodePtr> *node_list) {
  common::AnfAlgo::ReorderPosteriorExecList(NOT_NULL(node_list));
}

// Runs the CPU backend optimization passes (format transform, cast insertion,
// visit-attr erasure, plus parameter-server proxy replacement in PS worker
// mode) and refreshes the graph execution order afterwards.
void CPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
  auto optimizer = std::make_shared<opt::GraphOptimizer>();
  auto pm = std::make_shared<opt::PassManager>();
#if ((defined ENABLE_CPU) && (!defined _WIN32) && !defined(__APPLE__))
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) != kPynativeMode &&
      ps::PSContext::instance()->is_ps_mode()) {
    AssignParamKey(kernel_graph);
    if (ps::PSContext::instance()->is_worker()) {
      // Suffix the pass name with the graph counter so each graph gets a
      // distinct pass instance.
      std::string pass_name = "replace_node_by_proxy";
      pass_name.append(std::to_string(graph_sum_));
      pm->AddPass(std::make_shared<opt::ReplaceNodeByProxy>(pass_name));
    }
  }
#endif
  pm->AddPass(std::make_shared<opt::InsertFormatTransformOpCPU>("insert_format_transform_op_cpu"));
  pm->AddPass(std::make_shared<opt::InsertCastCPU>("insert_cast"));
  pm->AddPass(std::make_shared<opt::EraseVisitAttr>());
  optimizer->AddPassManager(pm);
  (void)optimizer->Optimize(kernel_graph);
  kernel_graph->SetExecOrderByDefault();
}

// Graph-kernel (AKG) fusion optimization; no-op unless built with AKG support.
void
CPUSession::GraphKernelOptimize(const std::shared_ptr &kernel_graph) { #ifdef ENABLE_AKG if (!graphkernel::GraphKernelFlags::GetInstance().IsEnableGraphKernel()) { return; } graphkernel::GraphKernelOptimize(kernel_graph); kernel_graph->SetExecOrderByDefault(); #endif } GraphId CPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) { auto graph_id = graph_sum_; auto graph = ConstructKernelGraph(lst, outputs, DeviceAddressType::kCPU); MS_EXCEPTION_IF_NULL(graph); opt::AddDynamicShapeAttrPass(graph); MS_LOG(INFO) << "Set kernel info"; SetKernelInfo(graph.get()); MS_LOG(INFO) << "Set kernel info end"; Optimize(graph); FinalOptimize(graph); GraphKernelOptimize(graph); MS_LOG(INFO) << "Build kernel"; BuildKernel(graph.get()); // Remove reorder after PS feature finish adapting push/pull in auto_monad. auto execution_order = graph->execution_order(); Reorder(&execution_order); graph->set_execution_order(execution_order); // runtime init if (!runtime_.Init()) { MS_LOG(EXCEPTION) << "Kernel runtime init error."; } MS_LOG(INFO) << "Assign kernel graph address"; runtime_.AssignKernelGraphAddress(graph.get()); // set summary node #ifndef ENABLE_SECURITY SetSummaryNodes(graph.get()); #endif runtime_.IncreaseSummaryRefCount(graph->summary_nodes()); DumpGraphs({graph}); return graph_id; } void CPUSession::CreateOutputTensors(const GraphId &graph_id, const std::vector &input_tensors, VectorRef *outputs, std::map *tensor_to_node, KernelMapTensor *) { auto kernel_graph = GetGraph(graph_id); MS_EXCEPTION_IF_NULL(kernel_graph); runtime_.CreateOutputTensors(kernel_graph.get(), input_tensors, outputs, tensor_to_node); } void CPUSession::LoadInputData(const std::shared_ptr &kernel_graph, const std::vector &inputs_const) const { MS_EXCEPTION_IF_NULL(kernel_graph); auto &input_nodes = kernel_graph->input_nodes(); if (input_nodes.size() != inputs_const.size()) { MS_LOG(EXCEPTION) << "Input size " << inputs_const.size() << " is not equal to input node size " << 
input_nodes.size(); } for (size_t input_idx = 0; input_idx < input_nodes.size(); ++input_idx) { auto &input_node = input_nodes[input_idx]; MS_EXCEPTION_IF_NULL(input_node); if (!input_node->isa() || HasAbstractMonad(input_node)) { continue; } auto address = AnfAlgo::GetMutableOutputAddr(input_node, 0); auto tensor = inputs_const[input_idx]; auto tensor_address = tensor->device_address(); MS_EXCEPTION_IF_NULL(address); MS_EXCEPTION_IF_NULL(tensor); if (tensor_address == nullptr || tensor_address == address) { continue; } auto input_param = input_node->cast(); if (common::AnfAlgo::IsParameterWeight(input_param) && !tensor->IsUpdatedByDevice()) { continue; } if (std::dynamic_pointer_cast(tensor_address)->DeviceType() != device::DeviceAddressType::kCPU) { tensor->data_sync(false); } } } void CPUSession::PreExecuteGraph(const std::shared_ptr &kernel_graph, const std::vector &inputs, VectorRef *const outputs) { MS_LOG(INFO) << "Bind input output address"; runtime_.BindInputOutput(kernel_graph.get(), inputs, outputs); #if ((defined ENABLE_CPU) && (!defined _WIN32) && !defined(__APPLE__)) InitPSParamAndOptim(kernel_graph, inputs); #endif } void CPUSession::PostExecuteGraph(const std::shared_ptr &kernel_graph, const std::vector &, VectorRef *const) { #ifndef ENABLE_SECURITY Summary(kernel_graph.get()); #endif } void CPUSession::ExecuteGraph(const std::shared_ptr &kernel_graph) { bool ret = runtime_.Run(*kernel_graph, false); if (!ret) { MS_LOG(EXCEPTION) << "Run graph failed"; } } KernelGraphPtr CPUSession::BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info, const std::vector &input_tensors, const std::vector &tensors_mask) { // Check if the graph cache exists. 
auto it = run_op_graphs_.find(graph_info); if (it != run_op_graphs_.end()) { return it->second; } // Prepare the graph const auto &kernel_graph = ConstructSingleOpGraph(op_run_info, input_tensors, tensors_mask); MS_EXCEPTION_IF_NULL(kernel_graph); SetKernelInfo(kernel_graph.get()); Optimize(kernel_graph); BuildKernel(kernel_graph.get()); auto enable_op_graph_cache = MsContext::GetInstance()->get_param(MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE); if (enable_op_graph_cache) { run_op_graphs_[graph_info] = kernel_graph; } return kernel_graph; } void CPUSession::SetOutputFlags(const VectorRef &base_ref) { for (size_t i = 0; i < base_ref.size(); ++i) { if (utils::isa(base_ref[i])) { auto ref_iter = utils::cast(base_ref[i]); SetOutputFlags(ref_iter); } else if (utils::isa(base_ref[i])) { auto tensor_ptr = utils::cast>(base_ref[i]); tensor_ptr->SetNeedWait(false); tensor_ptr->data_sync(false); } } } void CPUSession::UpdateDynamicOutputShape(const std::map &tensor_to_node) { for (const auto &tensor_node : tensor_to_node) { if (common::AnfAlgo::IsDynamicShape(tensor_node.second.first)) { const auto &kernel = tensor_node.second.first; const auto &output_index = tensor_node.second.second; const auto &shape = common::AnfAlgo::GetOutputInferShape(kernel, output_index); std::vector refresh_shape; (void)std::copy(shape.begin(), shape.end(), std::back_inserter(refresh_shape)); MS_EXCEPTION_IF_NULL(tensor_node.first); tensor_node.first->set_shape(refresh_shape); } } } void CPUSession::RunOpImplOrigin(const GraphInfo &graph_info, OpRunInfo *op_run_info, std::vector *input_tensors, VectorRef *outputs, const std::vector &tensors_mask) { RunOpImpl(graph_info, op_run_info, input_tensors, outputs, tensors_mask); } void CPUSession::RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_info, std::vector *input_tensors, VectorRef *outputs, const std::vector &tensors_mask) { MS_EXCEPTION_IF_NULL(input_tensors); MS_EXCEPTION_IF_NULL(op_run_info); ProcessInputTensorsForHeterogeneous("CPU", 
*input_tensors); const auto &kernel_graph = BuildOpImpl(*op_run_info, graph_info, *input_tensors, tensors_mask); EraseValueNodeTensor(tensors_mask, input_tensors); // Remove reorder after PS feature finish adapting push/pull in auto_monad. auto execution_order = kernel_graph->execution_order(); Reorder(&execution_order); kernel_graph->set_execution_order(execution_order); // runtime init if (!runtime_.Init()) { MS_LOG(EXCEPTION) << "Kernel runtime init error."; } runtime_.AssignKernelGraphAddress(kernel_graph.get()); std::map tensor_to_node; runtime_.CreateOutputTensors(kernel_graph.get(), *input_tensors, outputs, &tensor_to_node); runtime_.BindInputOutput(kernel_graph.get(), *input_tensors, outputs); bool ret = runtime_.Run(*kernel_graph, false); if (!ret) { MS_LOG(EXCEPTION) << "Run Op failed"; } UpdateDynamicOutputShape(tensor_to_node); // update output abstract of dynamic op to op_run_info if (op_run_info->is_dynamic_shape) { UpdateOutputAbstract(kernel_graph, op_run_info); } SetOutputFlags(*outputs); runtime_.RunOpClearMemory(*kernel_graph); } void CPUSession::SetKernelInfo(const KernelGraph *kernel_graph) { MS_EXCEPTION_IF_NULL(kernel_graph); auto &kernel_nodes = kernel_graph->execution_order(); for (const auto &kernel_node : kernel_nodes) { MS_EXCEPTION_IF_NULL(kernel_node); device::cpu::SetKernelInfo(kernel_node); } } namespace { void KernelNotSupportException(const AnfNodePtr &kernel_node) { std::string kernel_name = common::AnfAlgo::GetCNodeName(kernel_node); std::stringstream operator_info; operator_info << "Operator[" << kernel_name << "] "; auto kernel_info = dynamic_cast(kernel_node->kernel_info()); if (kernel_info == nullptr) { operator_info << "is not support."; MS_LOG(EXCEPTION) << operator_info.str(); } auto kernel_build_Info = kernel_info->select_kernel_build_info(); if (kernel_build_Info == nullptr) { operator_info << "is not support."; MS_LOG(EXCEPTION) << operator_info.str(); } size_t input_num = kernel_build_Info->GetInputNum(); if (input_num 
> 0) { operator_info << " input("; for (size_t i = 0; i < input_num; ++i) { operator_info << TypeIdLabel(kernel_build_Info->GetInputDeviceType(i)); if (i != input_num - 1) { operator_info << ","; } } operator_info << ") "; } size_t output_num = kernel_build_Info->GetOutputNum(); if (output_num > 0) { operator_info << "output("; for (size_t i = 0; i < output_num; ++i) { operator_info << TypeIdLabel(kernel_build_Info->GetOutputDeviceType(i)); if (i != kernel_build_Info->GetOutputNum() - 1) { operator_info << ","; } } operator_info << ") "; } operator_info << "is not support."; MS_LOG(EXCEPTION) << operator_info.str() << trace::DumpSourceLines(kernel_node); } } // namespace void CPUSession::BuildKernel(const KernelGraph *kernel_graph) { MS_EXCEPTION_IF_NULL(kernel_graph); auto &kernel_nodes = kernel_graph->execution_order(); kernel::KernelMeta *bin_map = kernel::KernelMeta::GetInstance(); MS_EXCEPTION_IF_NULL(bin_map); std::vector akg_nodes; for (const auto &kernel_node : kernel_nodes) { MS_EXCEPTION_IF_NULL(kernel_node); std::string kernel_name = common::AnfAlgo::GetCNodeName(kernel_node); MS_LOG(INFO) << "Cpu building operator[" << kernel_name << "]."; if (session::AnfRuntimeAlgorithm::GetKernelType(kernel_node) == KernelType::AKG_KERNEL) { if (!bin_map->initialized()) { bin_map->Initialize(); } akg_nodes.push_back(kernel_node); continue; } std::shared_ptr cpu_kernel_mod = kernel::NativeCpuKernelModFactory::GetInstance().Create(kernel_name, kernel_node); if (cpu_kernel_mod == nullptr) { KernelNotSupportException(kernel_node); } try { cpu_kernel_mod->Init(kernel_node); } catch (std::exception &e) { MS_LOG(EXCEPTION) << e.what() << trace::DumpSourceLines(kernel_node); } AnfAlgo::SetKernelMod(cpu_kernel_mod, kernel_node.get()); MS_LOG(INFO) << "Cpu build success operator[" << kernel_name << "]."; } #ifdef ENABLE_AKG kernel::AkgCpuKernelBuilder akg_cpu_kernel_builder; (void)akg_cpu_kernel_builder.AkgKernelParallelBuild(akg_nodes); #endif } } // namespace session } // 
namespace mindspore