@@ -59,6 +59,7 @@
 #include "debug/data_dump/e2e_dump.h"
 #include "debug/anf_ir_dump.h"
 #include "debug/dump_proto.h"
+#include "abstract/utils.h"
 #ifdef ENABLE_DEBUGGER
 #include "debug/debugger/proto_exporter.h"
 #else
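
Note: the added include is what provides abstract::ShapeSize and abstract::TypeIdSize, which the LoadInputData override relocated into this file (below) uses to recompute input byte sizes for dynamic-shape parameters.
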
@@ -209,69 +210,6 @@ void GenOpOutputStubTensor(const KernelGraphPtr &single_op_graph, const CNodePtr
     (*op_output_info)[kernel_with_index] = output_tensor_info;
   }
 }
-}  // namespace
-
-void AscendSession::Init(uint32_t device_id) { InitExecutor(kAscendDevice, device_id); }
-
-void AscendSession::UnifyMindIR(const KernelGraphPtr &graph) {
-  auto context_ptr = MsContext::GetInstance();
-  MS_EXCEPTION_IF_NULL(context_ptr);
-  bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
-  if (save_graphs) {
-    std::string file_name = "hwopt_d_before_unify_mindir_graph_" + std::to_string(graph->graph_id()) + ".ir";
-    DumpIR(file_name, graph);
-    DumpIRProto(graph, "before_unify_mindir_hwopt_" + std::to_string(graph->graph_id()));
-  }
-  auto optimizer = std::make_shared<opt::GraphOptimizer>();
-  auto unify_mindir_pm = std::make_shared<opt::PassManager>("unify_mindir_pm");
-  unify_mindir_pm->AddPass(std::make_shared<opt::SpaceToBatchNDAttrUpdate>());
-  unify_mindir_pm->AddPass(std::make_shared<opt::BatchToSpaceNDAttrUpdate>());
-  unify_mindir_pm->AddPass(std::make_shared<opt::MaxPool2MaxPoolWithArgmax>());
-  unify_mindir_pm->AddPass(std::make_shared<opt::MaxPoolWithArgmaxUnifyMindIR>());
-  unify_mindir_pm->AddPass(std::make_shared<opt::MaxPoolGradWithArgmaxUnifyMindIR>());
-  unify_mindir_pm->AddPass(std::make_shared<opt::Conv2DUnifyMindIR>());
-  unify_mindir_pm->AddPass(std::make_shared<opt::Conv2DBackpropInputUnifyMindIR>());
-  unify_mindir_pm->AddPass(std::make_shared<opt::Conv2DBackpropFilterUnifyMindIR>());
-  unify_mindir_pm->AddPass(std::make_shared<opt::SliceGradUnifyMindIR>());
-  unify_mindir_pm->AddPass(std::make_shared<opt::AvgPoolGradUnifyMindIR>());
-  unify_mindir_pm->AddPass(std::make_shared<opt::FtrlUnifyOutput>());
-  unify_mindir_pm->AddPass(std::make_shared<opt::MomentumUnifyOutput>());
-  unify_mindir_pm->AddPass(std::make_shared<opt::RMSPropUnifyOutput>());
-  unify_mindir_pm->AddPass(std::make_shared<opt::CenteredRMSPropUnifyOutput>());
-  auto ms_context = MsContext::GetInstance();
-  MS_EXCEPTION_IF_NULL(ms_context);
-  if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kGraphMode) {
-    unify_mindir_pm->AddPass(std::make_shared<opt::DropoutAndDropoutGradUnifyMindIR>());
-    unify_mindir_pm->AddPass(std::make_shared<opt::DropoutUnifyMindIR0>());
-    unify_mindir_pm->AddPass(std::make_shared<opt::GradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
-    unify_mindir_pm->AddPass(std::make_shared<opt::GradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIRV2>());
-    unify_mindir_pm->AddPass(std::make_shared<opt::SparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
-  } else {
-    unify_mindir_pm->AddPass(std::make_shared<opt::PynativeSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
-    unify_mindir_pm->AddPass(std::make_shared<opt::PynativeGradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
-  }
-  unify_mindir_pm->AddPass(std::make_shared<opt::DropoutUnifyMindIR1>());
-  unify_mindir_pm->AddPass(std::make_shared<opt::DropoutGradUnifyMindIR>());
-  unify_mindir_pm->AddPass(std::make_shared<opt::BatchNormGradUnifyMindIR>());
-  optimizer->AddPassManager(unify_mindir_pm);
-  (void)optimizer->Optimize(graph);
-  graph->SetExecOrderByDefault();
-  if (save_graphs) {
-    std::string file_name = "hwopt_d_after_unify_mindir_graph_" + std::to_string(graph->graph_id()) + ".ir";
-    DumpIR(file_name, graph);
-  }
-}
-
-GraphId AscendSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) {
-  MS_LOG(INFO) << "Start";
-  // construct graph, if successfully, graph_sum_ + 1
-  auto graph = ConstructKernelGraph(lst, outputs);
-  auto graph_id = graph->graph_id();
-  InitAllBucket(graph);
-  MS_LOG(INFO) << "Compile graph " << graph_id << " success";
-  return graph_id;
-}
 bool IsBackward(const CNodePtr &cnode) {
   auto prim = GetValueNode<PrimitivePtr>(cnode->input(0));
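
Note: nothing in the block above is lost. Init, UnifyMindIR, and CompileGraphImpl reappear verbatim in the next hunk, after the helpers moved in from session_basic.cc, so that the file's anonymous namespace can be extended to enclose those helpers.
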
@@ -357,6 +295,183 @@ void ReorderSendRecv(std::vector<CNodePtr> *execution_order) {
   }
 }
+
+size_t LoadCtrlInputTensor(const std::shared_ptr<KernelGraph> &graph, std::vector<tensor::TensorPtr> *inputs) {
+  MS_EXCEPTION_IF_NULL(graph);
+  MS_LOG(INFO) << "Load kInputCtrlTensors";
+  auto inputs_params = graph->input_ctrl_tensors();
+  if (inputs_params == nullptr) {
+    return 0;
+  }
+  if (inputs_params->size() < 3) {
+    MS_LOG(EXCEPTION) << "Illegal inputs_params size";
+  }
+  // update current loop tensor to 0 per iterator
+  auto cur_loop_tensor = (*inputs_params)[0];
+  MS_EXCEPTION_IF_NULL(cur_loop_tensor);
+  auto *cur_val = static_cast<int32_t *>(cur_loop_tensor->data_c());
+  MS_EXCEPTION_IF_NULL(cur_val);
+  *cur_val = 0;
+  cur_loop_tensor->set_sync_status(kNeedSyncHostToDevice);
+  // set loop_count to zero
+  MS_EXCEPTION_IF_NULL(inputs);
+  inputs->push_back(cur_loop_tensor);
+  // update next loop tensor to 0 per iterator
+  auto next_loop_tensor = (*inputs_params)[1];
+  MS_EXCEPTION_IF_NULL(next_loop_tensor);
+  auto *next_val = static_cast<int32_t *>(next_loop_tensor->data_c());
+  MS_EXCEPTION_IF_NULL(next_val);
+  *next_val = 0;
+  next_loop_tensor->set_sync_status(kNeedSyncHostToDevice);
+  // set loop_count to zero
+  MS_EXCEPTION_IF_NULL(inputs);
+  inputs->push_back(next_loop_tensor);
+  auto epoch_tensor = (*inputs_params)[2];
+  MS_EXCEPTION_IF_NULL(epoch_tensor);
+  auto *epoch_val = static_cast<int32_t *>(epoch_tensor->data_c());
+  MS_EXCEPTION_IF_NULL(epoch_val);
+  *epoch_val = graph->current_epoch();
+  epoch_tensor->set_sync_status(kNeedSyncHostToDevice);
+  inputs->push_back(epoch_tensor);
+  MS_LOG(INFO) << "Load epoch_val:" << *epoch_val;
+  graph->set_current_epoch(graph->current_epoch() + 1);
+  return inputs_params->size();
+}
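
Note: LoadCtrlInputTensor arrives here unchanged from session_basic.cc (its removal appears in a later hunk). Index 0 of input_ctrl_tensors is the current loop count and index 1 the next loop count, both reset to zero on every run; index 2 is the epoch counter, written from graph->current_epoch(), which is then incremented for the next run.
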
+
+bool TensorNeedSync(const AnfNodePtr &parameter, const tensor::TensorPtr &tensor) {
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  auto device_address = AnfAlgo::GetMutableOutputAddr(parameter, 0);
+  if (ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER)) {
+    return tensor->device_address().get() == nullptr || tensor->device_address() != device_address;
+  }
+  if (tensor->NeedSyncHostToDevice()) {
+    return true;
+  }
+  auto tensor_address = tensor->device_address();
+  if (tensor_address != device_address) {
+    tensor->data_sync(false);
+    return true;
+  }
+  return false;
+}
+}  // namespace
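
Note: TensorNeedSync is likewise lifted verbatim from session_basic.cc. In PyNative-infer mode, a tensor is synced whenever it has no device address or its address differs from the parameter's; otherwise a sync happens when the tensor is flagged kNeedSyncHostToDevice, or when its address diverges from the parameter's, in which case the host copy is refreshed first via data_sync(false).
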
+
+void AscendSession::Init(uint32_t device_id) { InitExecutor(kAscendDevice, device_id); }
+
+void AscendSession::UnifyMindIR(const KernelGraphPtr &graph) {
+  auto context_ptr = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context_ptr);
+  bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
+  if (save_graphs) {
+    std::string file_name = "hwopt_d_before_unify_mindir_graph_" + std::to_string(graph->graph_id()) + ".ir";
+    DumpIR(file_name, graph);
+    DumpIRProto(graph, "before_unify_mindir_hwopt_" + std::to_string(graph->graph_id()));
+  }
+  auto optimizer = std::make_shared<opt::GraphOptimizer>();
+  auto unify_mindir_pm = std::make_shared<opt::PassManager>("unify_mindir_pm");
+  unify_mindir_pm->AddPass(std::make_shared<opt::SpaceToBatchNDAttrUpdate>());
+  unify_mindir_pm->AddPass(std::make_shared<opt::BatchToSpaceNDAttrUpdate>());
+  unify_mindir_pm->AddPass(std::make_shared<opt::MaxPool2MaxPoolWithArgmax>());
+  unify_mindir_pm->AddPass(std::make_shared<opt::MaxPoolWithArgmaxUnifyMindIR>());
+  unify_mindir_pm->AddPass(std::make_shared<opt::MaxPoolGradWithArgmaxUnifyMindIR>());
+  unify_mindir_pm->AddPass(std::make_shared<opt::Conv2DUnifyMindIR>());
+  unify_mindir_pm->AddPass(std::make_shared<opt::Conv2DBackpropInputUnifyMindIR>());
+  unify_mindir_pm->AddPass(std::make_shared<opt::Conv2DBackpropFilterUnifyMindIR>());
+  unify_mindir_pm->AddPass(std::make_shared<opt::SliceGradUnifyMindIR>());
+  unify_mindir_pm->AddPass(std::make_shared<opt::AvgPoolGradUnifyMindIR>());
+  unify_mindir_pm->AddPass(std::make_shared<opt::FtrlUnifyOutput>());
+  unify_mindir_pm->AddPass(std::make_shared<opt::MomentumUnifyOutput>());
+  unify_mindir_pm->AddPass(std::make_shared<opt::RMSPropUnifyOutput>());
+  unify_mindir_pm->AddPass(std::make_shared<opt::CenteredRMSPropUnifyOutput>());
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kGraphMode) {
+    unify_mindir_pm->AddPass(std::make_shared<opt::DropoutAndDropoutGradUnifyMindIR>());
+    unify_mindir_pm->AddPass(std::make_shared<opt::DropoutUnifyMindIR0>());
+    unify_mindir_pm->AddPass(std::make_shared<opt::GradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
+    unify_mindir_pm->AddPass(std::make_shared<opt::GradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIRV2>());
+    unify_mindir_pm->AddPass(std::make_shared<opt::SparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
+  } else {
+    unify_mindir_pm->AddPass(std::make_shared<opt::PynativeSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
+    unify_mindir_pm->AddPass(std::make_shared<opt::PynativeGradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
+  }
+  unify_mindir_pm->AddPass(std::make_shared<opt::DropoutUnifyMindIR1>());
+  unify_mindir_pm->AddPass(std::make_shared<opt::DropoutGradUnifyMindIR>());
+  unify_mindir_pm->AddPass(std::make_shared<opt::BatchNormGradUnifyMindIR>());
+  optimizer->AddPassManager(unify_mindir_pm);
+  (void)optimizer->Optimize(graph);
+  graph->SetExecOrderByDefault();
+  if (save_graphs) {
+    std::string file_name = "hwopt_d_after_unify_mindir_graph_" + std::to_string(graph->graph_id()) + ".ir";
+    DumpIR(file_name, graph);
+  }
+}
+
+void AscendSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
+                                  const std::vector<tensor::TensorPtr> &inputs_const) const {
+  std::vector<tensor::TensorPtr> inputs(inputs_const);
+  size_t input_ctrl_size = 3;
+  MS_EXCEPTION_IF_NULL(kernel_graph);
+  if (kernel_graph->input_ctrl_tensors()) {
+    input_ctrl_size = LoadCtrlInputTensor(kernel_graph, &inputs);
+  }
+  auto &input_nodes = kernel_graph->input_nodes();
+  auto extra_param_size = kernel_graph->GetExtraParamAndTensor().size();
+  if ((inputs.size() + input_ctrl_size) - 3 != input_nodes.size() - extra_param_size) {
+    MS_LOG(EXCEPTION) << "Tensor input:" << inputs.size() << " is not equal graph inputs:" << input_nodes.size()
+                      << ", input_ctrl_size:" << input_ctrl_size << ", extra_param_size:" << extra_param_size;
+  }
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    auto tensor = inputs[i];
+    MS_EXCEPTION_IF_NULL(tensor);
+    auto input_node = input_nodes[i];
+    MS_EXCEPTION_IF_NULL(input_node);
+    auto size = LongToSize(tensor->data().nbytes());
+    if (input_node->isa<Parameter>() && input_node->cast<ParameterPtr>()->is_used_by_dynamic_kernel()) {
+      auto tensor_shape = tensor->shape();
+      std::vector<size_t> shape_tmp;
+      (void)std::transform(tensor_shape.begin(), tensor_shape.end(), std::back_inserter(shape_tmp), IntToSize);
+      AnfAlgo::SetOutputInferTypeAndShape({AnfAlgo::GetOutputInferDataType(input_node, 0)}, {shape_tmp},
+                                          input_node.get());
+      size = abstract::ShapeSize(shape_tmp) * abstract::TypeIdSize(tensor->data_type());
+    }
+    if (input_node->isa<Parameter>() && AnfAlgo::OutputAddrExist(input_node, 0) && TensorNeedSync(input_node, tensor)) {
+#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
+      const std::string &param_name = input_node->fullname_with_scope();
+      if (ps::ps_cache_instance.IsHashTable(param_name)) {
+        continue;
+      }
+#endif
+      auto device_address = AnfAlgo::GetMutableOutputAddr(input_node, 0);
+      MS_EXCEPTION_IF_NULL(device_address);
+      if (size != 0 && !device_address->SyncHostToDevice(trans::GetRuntimePaddingShape(input_node, 0), size,
+                                                         tensor->data_type(), tensor->data_c())) {
+        MS_LOG(EXCEPTION) << "SyncHostToDevice failed.";
+      }
+      if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode ||
+          AnfAlgo::IsParameterWeight(input_node->cast<ParameterPtr>())) {
+        tensor->set_device_address(device_address);
+      }
+    }
+    tensor->set_sync_status(kNoNeedSync);
+  }
+}
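
Note: the dynamic-shape branch above recomputes the transfer size from the runtime shape and element type rather than the host tensor's buffer. A quick sanity check of the arithmetic (values illustrative):

    // A float32 input reshaped at runtime to {2, 3, 4}:
    //   abstract::ShapeSize({2, 3, 4})            -> 2 * 3 * 4 = 24 elements
    //   abstract::TypeIdSize(kNumberTypeFloat32)  -> 4 bytes per element
    //   size = 24 * 4 = 96 bytes
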
+
+GraphId AscendSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) {
+  MS_LOG(INFO) << "Start";
+  // construct graph, if successfully, graph_sum_ + 1
+  auto graph = ConstructKernelGraph(lst, outputs);
+  auto graph_id = graph->graph_id();
+  InitAllBucket(graph);
+  MS_LOG(INFO) << "Compile graph " << graph_id << " success";
+  return graph_id;
+}
 GraphId AscendSession::CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) {
   MS_LOG(INFO) << "Start";
   std::vector<KernelGraphPtr> all_graphs;
@@ -559,16 +674,8 @@ void AscendSession::CompileChildGraph(const KernelGraphPtr &child_graph) {
 bool AscendSession::IsSupportSummary() { return !device::KernelAdjust::NeedInsertSwitch(); }
-void AscendSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs,
-                                 VectorRef *const outputs) {
-  MS_LOG(INFO) << "Start";
-  auto kernel_graph = GetGraph(graph_id);
-  MS_EXCEPTION_IF_NULL(kernel_graph);
-  // if none of child graph and no anf output exists
-  if (!kernel_graph->executable()) {
-    MS_LOG(INFO) << "No child graph has anf output";
-    return;
-  }
+void AscendSession::PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
+                                    const std::vector<tensor::TensorPtr> &inputs, VectorRef *const outputs) {
   // load data to extra params
   std::set<KernelGraphPtr> memo;
   SyncDataToExtraParams(NOT_NULL(kernel_graph), NOT_NULL(&memo));
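
Note: the monolithic AscendSession::RunGraphImpl is dissolved starting here. Its graph lookup and executable() guard move into a shared SessionBasic::RunGraphImpl (added near the end of this diff); the device-specific pieces become the PreExecuteGraph, ExecuteGraph, and PostExecuteGraph hooks.
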
@@ -580,14 +687,14 @@ void AscendSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tens
   // Initialize parameter server
   InitPSParamAndOptim(kernel_graph, inputs);
   std::string channel_name;
-  if (ps::PsDataPrefetch::GetInstance().cache_enable() && IsGetNextGraph(graph_id, &channel_name)) {
+  if (ps::PsDataPrefetch::GetInstance().cache_enable() && IsGetNextGraph(kernel_graph, &channel_name)) {
     ps::ps_cache_instance.IncreaseGraphStep(channel_name);
   }
 #endif
-  {
-    // run task on device
-    Execute(kernel_graph, true);
-  }
+}
+
+void AscendSession::PostExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
+                                     const std::vector<tensor::TensorPtr> &inputs, VectorRef *const outputs) {
   // summary
   Summary(kernel_graph.get());
   // load tensor from device for debugger
@@ -598,9 +705,10 @@ void AscendSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tens
   if (debugger_) {
     debugger_->PostExecute();
   }
-  MS_LOG(INFO) << "Finish!";
 }
+
+void AscendSession::ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) { Execute(kernel_graph, true); }
 void AscendSession::RunOpHardwareOptimize(const std::shared_ptr<session::KernelGraph> &kernel_graph) const {
   MS_LOG(INFO) << "Start";
   // data layout optimization
@@ -50,7 +50,13 @@ class AscendSession : public SessionBasic {
   GraphId CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) override;
   GraphId CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) override;
   bool IsSupportSummary() override;
-  void RunGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) override;
+  void LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
+                     const std::vector<tensor::TensorPtr> &inputs_const) const override;
+  void PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph, const std::vector<tensor::TensorPtr> &inputs,
+                       VectorRef *const outputs) override;
+  void PostExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph, const std::vector<tensor::TensorPtr> &inputs,
+                        VectorRef *const outputs) override;
+  void ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) override;
   void BuildGraphImpl(GraphId) override;
   void BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
                    const std::vector<tensor::TensorPtr> &input_tensors,
@@ -102,20 +102,19 @@ GraphId CPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtr
   Optimize(graph);
   MS_LOG(INFO) << "Build kernel";
   BuildKernel(graph.get());
   // Remove reorder after PS feature finish adapting push/pull in auto_monad.
   auto execution_order = graph->execution_order();
   Reorder(&execution_order);
   graph->set_execution_order(execution_order);
   // runtime init
   if (!runtime_.Init()) {
     MS_LOG(EXCEPTION) << "Kernel runtime init error.";
   }
   MS_LOG(INFO) << "Assign kernel address";
   runtime_.AssignKernelAddress(graph.get());
-  // set summary node
-  SetSummaryNodes(graph.get());
-  runtime_.IncreaseSummaryRefCount(graph->summary_nodes());
   DumpGraph(graph);
   return graph_id;
 }
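
Note: summary-node registration no longer happens at compile time. With the run path split below, CPUSession::PostExecuteGraph calls Summary() directly and the IncreaseSummaryRefCount/DecreaseSummaryRefCount bookkeeping disappears from both paths.
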
@@ -154,38 +153,26 @@ void CPUSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
   }
 }
-void CPUSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs,
-                              VectorRef *outputs) {
-  auto kernel_graph = GetGraph(graph_id);
-  MS_EXCEPTION_IF_NULL(kernel_graph);
+void CPUSession::PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
+                                 const std::vector<tensor::TensorPtr> &inputs, VectorRef *const outputs) {
   MS_LOG(INFO) << "Bind input output address";
   runtime_.BindInputOutput(kernel_graph.get(), inputs, outputs);
 #if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
   InitPSParamAndOptim(kernel_graph, inputs);
 #endif
+}
-  MS_LOG(INFO) << "Run graph start";
-  bool enable_summary = summary_callback_ != nullptr;
-  NamedSummaryOutputs summary_outputs;
-  if (enable_summary) {
-    SetSummaryNodes(kernel_graph.get());
-    summary_outputs = kernel_graph->summary_nodes();
-    runtime_.IncreaseSummaryRefCount(summary_outputs);
-  }
+
+void CPUSession::PostExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
+                                  const std::vector<tensor::TensorPtr> &inputs, VectorRef *const outputs) {
+  Summary(kernel_graph.get());
+}
+
+void CPUSession::ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) {
   bool ret = runtime_.Run(kernel_graph.get(), false);
   if (!ret) {
     MS_LOG(EXCEPTION) << "Run graph failed";
   }
-  if (enable_summary) {
-    Summary(kernel_graph.get());
-    runtime_.DecreaseSummaryRefCount(summary_outputs);
-  }
-  MS_LOG(INFO) << "Run graph end";
 }
 void CPUSession::BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
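
Note: CPUSession is the one backend that needs outputs before execution (BindInputOutput), which is presumably why the Pre/Post hooks carry both inputs and outputs even where other overrides leave them unused.
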
@@ -36,7 +36,11 @@ class CPUSession : public SessionBasic {
   void CreateOutputTensors(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &input_tensors, VectorRef *,
                            std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node) override;
   GraphId CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) override;
-  void RunGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) override;
+  void PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph, const std::vector<tensor::TensorPtr> &inputs,
+                       VectorRef *const outputs) override;
+  void PostExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph, const std::vector<tensor::TensorPtr> &inputs,
+                        VectorRef *const outputs) override;
+  void ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) override;
   ParameterPtr CreateNewParameterFromParameter(const AnfNodePtr &anf, KernelGraph *graph) override;
   void Optimize(const std::shared_ptr<KernelGraph> &kernel_graph);
   void BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
@@ -236,6 +236,7 @@ void Executor::WorkerLoop() {
       done_tasks_.emplace_back(task);
     }
     if (task->type_ != kRunGraph || task->sync_run_) {
+      std::lock_guard<std::mutex> lock(task_mutex_);
      sync_run_task_finished_ = true;
      sync_cond_var_.notify_all();
    }
@@ -310,9 +311,9 @@ void Executor::ClearDoneTasks() {
 }
 void Executor::RunTask(const std::shared_ptr<Task> &task, bool sync, bool long_run) {
-  sync_run_task_finished_ = false;
   {
     std::lock_guard<std::mutex> lock(task_mutex_);
+    sync_run_task_finished_ = false;
     ready_tasks_.push(task);
   }
   task_cond_var_.notify_all();
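
Note: the two Executor hunks fix one data race: sync_run_task_finished_ was written, and the condition variable notified, without holding task_mutex_, so a thread testing the flag under the lock could block just as the notification fired (a lost wakeup). A minimal sketch of the invariant being restored (names illustrative, not the Executor API):

    std::mutex m;
    std::condition_variable cv;
    bool done = false;

    void signal_thread() {
      std::lock_guard<std::mutex> lock(m);
      done = true;       // the flag must change under the same mutex...
      cv.notify_all();   // ...that the waiter holds while testing it
    }

    void wait_thread() {
      std::unique_lock<std::mutex> lock(m);
      cv.wait(lock, [] { return done; });  // re-checks 'done' with 'm' held
    }
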
@@ -299,14 +299,6 @@ void GPUSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
   }
 }
-void GPUSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const {
-  auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
-  MS_EXCEPTION_IF_NULL(runtime_instance);
-  if (!runtime_instance->Run(kernel_graph.get(), false)) {
-    MS_LOG(EXCEPTION) << "GPU execute graph failed!";
-  }
-}
 GraphId GPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) {
   // Construct graph, if successfully, graph_sum_ + 1
   auto graph = ConstructKernelGraph(lst, outputs);
@@ -419,10 +411,8 @@ GraphId GPUSession::CompileGraphImpl(KernelGraphPtr graph) {
   return graph->graph_id();
 }
-void GPUSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs,
-                              VectorRef *outputs) {
-  auto &kernel_graph = graphs_[graph_id];
-  MS_LOG(INFO) << "RunGraph graph_id: " << graph_id;
+void GPUSession::PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
+                                 const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) {
   if (debugger_) {
     debugger_->PreExecute(kernel_graph, graph_sum_);
   }
@@ -430,26 +420,48 @@ void GPUSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tensor:
   // Initialize parameter server
   InitPSParamAndOptim(kernel_graph, inputs);
 #endif
-  MS_EXCEPTION_IF_NULL(kernel_graph);
-  // It's InitDataset graph if kernel_num == 1, skip the loop.
+}
+
+void GPUSession::PostExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
+                                  const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) {
+  // Summary
+  auto context_ptr = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context_ptr);
+  if (context_ptr->get_param<bool>(MS_CTX_ENABLE_GPU_SUMMARY)) {
+    Summary(kernel_graph.get());
+  }
+  bool dump_enabled = DumpDataEnabledIteration();
+  // debug used for dump
+  if (debugger_ && dump_enabled) {
+    Dump(kernel_graph);
+  } else {
+    DumpJsonParser::GetInstance().UpdateDumpIter();
+  }
+  if (debugger_) {
+    debugger_->PostExecute();
+  }
+}
+
+void GPUSession::ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) {
   int kernel_num = kernel_graph->execution_order().size();
   int64_t loopsize = (kernel_num > 1) ? ConfigManager::GetInstance().gpu_loopsink_size() : 1;
   for (int64_t i = 0; i < loopsize; i++) {
 #if ENABLE_CPU && ENABLE_GPU
     std::string channel_name;
-    if (ps::PsDataPrefetch::GetInstance().cache_enable() && IsGetNextGraph(graph_id, &channel_name)) {
+    if (ps::PsDataPrefetch::GetInstance().cache_enable() && IsGetNextGraph(kernel_graph, &channel_name)) {
      ps::ps_cache_instance.IncreaseGraphStep(channel_name);
    }
 #endif
     Execute(kernel_graph);
   }
-  // Summary
-  auto context_ptr = MsContext::GetInstance();
-  MS_EXCEPTION_IF_NULL(context_ptr);
-  if (context_ptr->get_param<bool>(MS_CTX_ENABLE_GPU_SUMMARY)) {
-    Summary(kernel_graph.get());
-  }
-  PostIterationDbg(kernel_graph);
 }
+
+void GPUSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const {
+  auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
+  MS_EXCEPTION_IF_NULL(runtime_instance);
+  if (!runtime_instance->Run(kernel_graph.get(), false)) {
+    MS_LOG(EXCEPTION) << "GPU execute graph failed!";
+  }
+}
 void GPUSession::BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
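
Note: GPUSession::PostIterationDbg is inlined into the new PostExecuteGraph above; its body, removed in the next hunk, reappears there verbatim. The private Execute helper keeps its definition and simply moves next to ExecuteGraph, which calls it once per loop-sink iteration.
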
@@ -519,19 +531,6 @@ bool GPUSession::DumpDataEnabledIteration() const {
   return runtime_instance->DumpDataEnabledIteration();
 }
-void GPUSession::PostIterationDbg(const std::shared_ptr<KernelGraph> &kernel_graph) const {
-  bool dump_enabled = DumpDataEnabledIteration();
-  // debug used for dump
-  if (debugger_ && dump_enabled) {
-    Dump(kernel_graph);
-  } else {
-    DumpJsonParser::GetInstance().UpdateDumpIter();
-  }
-  if (debugger_) {
-    debugger_->PostExecute();
-  }
-}
 void GPUSession::SyncStream() {
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
   MS_EXCEPTION_IF_NULL(runtime_instance);
@@ -39,7 +39,11 @@ class GPUSession : public SessionBasic {
   void UnifyMindIR(const KernelGraphPtr &graph) override { return; }
   GraphId CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) override;
   GraphId CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) override;
-  void RunGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) override;
+  void PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph, const std::vector<tensor::TensorPtr> &inputs,
+                       VectorRef *const outputs) override;
+  void PostExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph, const std::vector<tensor::TensorPtr> &inputs,
+                        VectorRef *const outputs) override;
+  void ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) override;
   void BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
                    const std::vector<tensor::TensorPtr> &input_tensors,
                    const std::vector<int64_t> &tensors_mask) override;
@@ -79,8 +83,6 @@ class GPUSession : public SessionBasic {
   bool DumpDataEnabledIteration() const;
-  void PostIterationDbg(const std::shared_ptr<KernelGraph> &kernel_graph) const;
   GraphId CompileGraphImpl(KernelGraphPtr kernel_graph);
 };
 using GPUSessionPtr = std::shared_ptr<GPUSession>;
@@ -252,52 +252,6 @@ ValueNodePtr CreateNewValueNode(const AnfNodePtr &anf, KernelGraph *graph) {
   return new_value_node;
 }
-size_t LoadCtrlInputTensor(const std::shared_ptr<KernelGraph> &graph, std::vector<tensor::TensorPtr> *inputs) {
-  MS_EXCEPTION_IF_NULL(graph);
-  MS_LOG(INFO) << "Load kInputCtrlTensors";
-  auto inputs_params = graph->input_ctrl_tensors();
-  if (inputs_params == nullptr) {
-    return 0;
-  }
-  if (inputs_params->size() < 3) {
-    MS_LOG(EXCEPTION) << "Illegal inputs_params size";
-  }
-  // update current loop tensor to 0 per iterator
-  auto cur_loop_tensor = (*inputs_params)[0];
-  MS_EXCEPTION_IF_NULL(cur_loop_tensor);
-  auto *cur_val = static_cast<int32_t *>(cur_loop_tensor->data_c());
-  MS_EXCEPTION_IF_NULL(cur_val);
-  *cur_val = 0;
-  cur_loop_tensor->set_sync_status(kNeedSyncHostToDevice);
-  // set loop_count to zero
-  MS_EXCEPTION_IF_NULL(inputs);
-  inputs->push_back(cur_loop_tensor);
-  // update next loop tensor to 0 per iterator
-  auto next_loop_tensor = (*inputs_params)[1];
-  MS_EXCEPTION_IF_NULL(next_loop_tensor);
-  auto *next_val = static_cast<int32_t *>(next_loop_tensor->data_c());
-  MS_EXCEPTION_IF_NULL(next_val);
-  *next_val = 0;
-  next_loop_tensor->set_sync_status(kNeedSyncHostToDevice);
-  // set loop_count to zero
-  MS_EXCEPTION_IF_NULL(inputs);
-  inputs->push_back(next_loop_tensor);
-  auto epoch_tensor = (*inputs_params)[2];
-  MS_EXCEPTION_IF_NULL(epoch_tensor);
-  auto *epoch_val = static_cast<int32_t *>(epoch_tensor->data_c());
-  MS_EXCEPTION_IF_NULL(epoch_val);
-  *epoch_val = graph->current_epoch();
-  epoch_tensor->set_sync_status(kNeedSyncHostToDevice);
-  inputs->push_back(epoch_tensor);
-  MS_LOG(INFO) << "Load epoch_val:" << *epoch_val;
-  graph->set_current_epoch(graph->current_epoch() + 1);
-  return inputs_params->size();
-}
 ValueNodePtr ConstructRunOpValueNode(const std::shared_ptr<KernelGraph> &graph, const tensor::TensorPtr &input_tensor) {
   MS_EXCEPTION_IF_NULL(graph);
   MS_EXCEPTION_IF_NULL(input_tensor);
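
Note: moved, not deleted. LoadCtrlInputTensor now lives in ascend_session.cc (first hunks above), since the LoadInputData override that calls it is Ascend-specific after this change.
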
@@ -1544,80 +1498,6 @@ void SessionBasic::AddParameterToGraphInputs(const std::vector<AnfNodePtr> &parameters
   }
 }
-namespace {
-bool TensorNeedSync(const AnfNodePtr &parameter, const tensor::TensorPtr &tensor) {
-  auto ms_context = MsContext::GetInstance();
-  MS_EXCEPTION_IF_NULL(ms_context);
-  auto device_address = AnfAlgo::GetMutableOutputAddr(parameter, 0);
-  if (ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER)) {
-    return tensor->device_address().get() == nullptr || tensor->device_address() != device_address;
-  }
-  if (tensor->NeedSyncHostToDevice()) {
-    return true;
-  }
-  auto tensor_address = tensor->device_address();
-  if (tensor_address != device_address) {
-    tensor->data_sync(false);
-    return true;
-  }
-  return false;
-}
-}  // namespace
-
-// run graph steps
-void SessionBasic::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
-                                 const std::vector<tensor::TensorPtr> &inputs_const) const {
-  std::vector<tensor::TensorPtr> inputs(inputs_const);
-  size_t input_ctrl_size = 3;
-  MS_EXCEPTION_IF_NULL(kernel_graph);
-  if (kernel_graph->input_ctrl_tensors()) {
-    input_ctrl_size = LoadCtrlInputTensor(kernel_graph, &inputs);
-  }
-  auto &input_nodes = kernel_graph->input_nodes();
-  auto extra_param_size = kernel_graph->GetExtraParamAndTensor().size();
-  if ((inputs.size() + input_ctrl_size) - 3 != input_nodes.size() - extra_param_size) {
-    MS_LOG(EXCEPTION) << "Tensor input:" << inputs.size() << " is not equal graph inputs:" << input_nodes.size()
-                      << ", input_ctrl_size:" << input_ctrl_size << ", extra_param_size:" << extra_param_size;
-  }
-  auto ms_context = MsContext::GetInstance();
-  MS_EXCEPTION_IF_NULL(ms_context);
-  for (size_t i = 0; i < inputs.size(); ++i) {
-    auto tensor = inputs[i];
-    MS_EXCEPTION_IF_NULL(tensor);
-    auto input_node = input_nodes[i];
-    MS_EXCEPTION_IF_NULL(input_node);
-    auto size = LongToSize(tensor->data().nbytes());
-    if (input_node->isa<Parameter>() && input_node->cast<ParameterPtr>()->is_used_by_dynamic_kernel()) {
-      auto tensor_shape = tensor->shape();
-      std::vector<size_t> shape_tmp;
-      (void)std::transform(tensor_shape.begin(), tensor_shape.end(), std::back_inserter(shape_tmp), IntToSize);
-      AnfAlgo::SetOutputInferTypeAndShape({AnfAlgo::GetOutputInferDataType(input_node, 0)}, {shape_tmp},
-                                          input_node.get());
-      size = abstract::ShapeSize(shape_tmp) * abstract::TypeIdSize(tensor->data_type());
-    }
-    if (input_node->isa<Parameter>() && AnfAlgo::OutputAddrExist(input_node, 0) && TensorNeedSync(input_node, tensor)) {
-#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
-      const std::string &param_name = input_node->fullname_with_scope();
-      if (ps::ps_cache_instance.IsHashTable(param_name)) {
-        continue;
-      }
-#endif
-      auto device_address = AnfAlgo::GetMutableOutputAddr(input_node, 0);
-      MS_EXCEPTION_IF_NULL(device_address);
-      if (size != 0 && !device_address->SyncHostToDevice(trans::GetRuntimePaddingShape(input_node, 0), size,
-                                                         tensor->data_type(), tensor->data_c())) {
-        MS_LOG(EXCEPTION) << "SyncHostToDevice failed.";
-      }
-      if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode ||
-          AnfAlgo::IsParameterWeight(input_node->cast<ParameterPtr>())) {
-        tensor->set_device_address(device_address);
-      }
-    }
-    tensor->set_sync_status(kNoNeedSync);
-  }
-}
 void SessionBasic::UpdateOutputs(const std::shared_ptr<KernelGraph> &kernel_graph, VectorRef *const outputs,
                                  const std::vector<tensor::TensorPtr> &input_tensors) const {
   MS_EXCEPTION_IF_NULL(kernel_graph);
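
Note: the base-class copies end here. TensorNeedSync and LoadInputData move to ascend_session.cc unchanged, and SessionBasic::LoadInputData becomes an empty default in the final header hunk.
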
@@ -2130,6 +2010,22 @@ void SessionBasic::RunGraphAsync(const GraphId &graph_id, const std::vector<tens
   executor_->RunGraphAsync(shared_from_this(), graph_id, inputs, outputs);
 }
+
+void SessionBasic::RunGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs,
+                                VectorRef *const outputs) {
+  MS_LOG(INFO) << "Run graph start, graph id: " << graph_id;
+  auto kernel_graph = GetGraph(graph_id);
+  MS_EXCEPTION_IF_NULL(kernel_graph);
+  // if none of child graph and no anf output exists
+  if (!kernel_graph->executable()) {
+    MS_LOG(INFO) << "No child graph has anf output";
+    return;
+  }
+  PreExecuteGraph(kernel_graph, inputs, outputs);
+  ExecuteGraph(kernel_graph);
+  PostExecuteGraph(kernel_graph, inputs, outputs);
+  MS_LOG(INFO) << "Run graph end, graph id: " << graph_id;
+}
 void SessionBasic::RunOpsInGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs,
                                      VectorRef *outputs) {
   MS_LOG(INFO) << "Start!";
@@ -2214,8 +2110,7 @@ void SessionBasic::UpdateGraphDynamicShapeAttr(const NotNull<KernelGraphPtr> &root_graph) {
   root_graph->UpdateGraphDynamicAttr();
 }
-bool SessionBasic::IsGetNextGraph(const GraphId &graph_id, std::string *channel_name) {
-  auto kernel_graph = graphs_[graph_id];
+bool SessionBasic::IsGetNextGraph(const std::shared_ptr<KernelGraph> &kernel_graph, std::string *channel_name) {
   MS_EXCEPTION_IF_NULL(kernel_graph);
   for (const auto &kernel_node : kernel_graph->execution_order()) {
     auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
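
Note: taking the KernelGraph directly spares callers that already hold the graph (the new hooks receive it as a parameter) a lookup through graphs_[graph_id].
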
@@ -114,7 +114,7 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> {
   virtual GraphId GetFinalRunGraph() const { return kInvalidGraphId; }
   void AssignParamKey(const KernelGraphPtr &kernel_graph);
   void InitPSParamAndOptim(const KernelGraphPtr &kernel_graph, const std::vector<tensor::TensorPtr> &inputs_const);
-  bool IsGetNextGraph(const GraphId &graph_id, std::string *channel_name);
+  bool IsGetNextGraph(const std::shared_ptr<KernelGraph> &kernel_graph, std::string *channel_name);
   virtual bool CheckModelInputs(uint32_t graph_id, const std::vector<tensor::TensorPtr> &inputs,
                                 std::string *error_msg) const {
     return true;
@@ -173,8 +173,12 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> {
   virtual GraphId CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) { return 0; }
   virtual GraphId CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) { return kInvalidGraphId; }
   virtual void BuildGraphImpl(GraphId) {}
-  virtual void RunGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) {
-  }
+  virtual void PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
+                               const std::vector<tensor::TensorPtr> &inputs, VectorRef *const outputs) {}
+  virtual void PostExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
+                                const std::vector<tensor::TensorPtr> &inputs, VectorRef *const outputs) {}
+  virtual void ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) {}
+  void RunGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs);
   virtual void BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
                            const std::vector<tensor::TensorPtr> &input_tensors,
                            const std::vector<int64_t> &tensors_mask) {}
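
Note: RunGraphImpl also loses its virtual qualifier here: backends can no longer replace the whole run sequence, only the three hooks declared above it.
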
@@ -195,7 +199,7 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> {
   }
   virtual void LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
-                             const std::vector<tensor::TensorPtr> &inputs_const) const;
+                             const std::vector<tensor::TensorPtr> &inputs_const) const {}
   void UpdateOutputs(const std::shared_ptr<KernelGraph> &kernel_graph, VectorRef *const outputs,
                      const std::vector<tensor::TensorPtr> &input_tensors) const;
   void UpdateOutputAbstract(const std::shared_ptr<KernelGraph> &kernel_graph, OpRunInfo *op_run_info) const;
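
Note: with the shared implementation moved into AscendSession, the base LoadInputData turns from a declaration into a no-op default; each backend supplies its own override.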