Merge pull request !25132 from caifubi/master-pynative-mindrt-gpu-async-buildtags/v1.6.0
| @@ -220,6 +220,7 @@ set(SUB_COMP | |||
| runtime/device | |||
| runtime/framework | |||
| runtime/hardware | |||
| runtime/op_builder | |||
| runtime/hccl_adapter | |||
| frontend/optimizer | |||
| frontend/parallel | |||
| @@ -70,11 +70,6 @@ bool DatasetIteratorKernel::Init(const CNodePtr &kernel_node) { | |||
| total_bytes_ += bytes; | |||
| } | |||
| handle_ = GpuBufferMgr::GetInstance().Open(0, queue_name_, output_size_list_); | |||
| if (handle_ == HandleMgr::INVALID_HANDLE) { | |||
| MS_LOG(EXCEPTION) << "Gpu Queue(" << queue_name_ << ") Open Failed"; | |||
| } | |||
| #ifndef ENABLE_SECURITY | |||
| auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(profiler_inst); | |||
| @@ -991,13 +991,13 @@ void AscendSession::BuildOpsInGraph(const GraphId &graph_id, const std::map<AnfN | |||
| InputTensorInfo input_tensor_info; | |||
| GetOpInputStubTensors(kernel, parameter_index, graph_inputs, op_output_info, &input_tensor_info); | |||
| // Get OpRunInfo and GraphInfo | |||
| OpRunInfo op_run_info; | |||
| GetSingleOpRunInfo(kernel, &op_run_info); | |||
| const GraphInfo &graph_info = GetSingleOpGraphInfo(kernel, input_tensor_info.input_tensors); | |||
| OpRunInfo op_run_info = GetSingleOpRunInfo(kernel, graph_info, input_tensor_info); | |||
| if (op_run_info.is_dynamic_shape) { | |||
| MS_LOG(INFO) << "BuildOpsInGraph stop, op " << op_run_info.op_name << " is dynamic shape."; | |||
| break; | |||
| } | |||
| const GraphInfo &graph_info = GetSingleOpGraphInfo(kernel, input_tensor_info.input_tensors); | |||
| const auto &single_op_graph_iter = run_op_graphs_.find(graph_info); | |||
| if (single_op_graph_iter != run_op_graphs_.end()) { | |||
| // if graph of same single op exists, the output tensor of current op should be generated | |||
| @@ -1218,20 +1218,29 @@ GraphInfo SessionBasic::GetSingleOpGraphInfo(const CNodePtr &kernel, | |||
| return graph_info; | |||
| } | |||
| void SessionBasic::GetSingleOpRunInfo(const CNodePtr cnode, OpRunInfo *run_info) { | |||
| OpRunInfo SessionBasic::GetSingleOpRunInfo(const CNodePtr &cnode, const GraphInfo &graph_info, | |||
| const InputTensorInfo &tensor_info) { | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| MS_EXCEPTION_IF_NULL(run_info); | |||
| auto primitive = AnfAlgo::GetCNodePrimitive(cnode); | |||
| run_info->primitive = primitive; | |||
| run_info->op_name = primitive->name(); | |||
| const auto &abstract = cnode->abstract(); | |||
| if (abstract == nullptr) { | |||
| MS_LOG(EXCEPTION) << "Abstract is nullptr, node = " << cnode->DebugString(); | |||
| } | |||
| run_info->abstract = abstract; | |||
| const auto &shape = abstract->BuildShape(); | |||
| MS_EXCEPTION_IF_NULL(shape); | |||
| run_info->is_dynamic_shape = shape->IsDynamic(); | |||
| OpRunInfo op_run_info = {.op_name = primitive->name(), | |||
| .primitive = primitive.get(), | |||
| .abstract = abstract, | |||
| .is_dynamic_shape = shape->IsDynamic(), | |||
| .is_auto_mixed_precision = false, | |||
| .lazy_build = false, | |||
| .next_op_name = std::string(), | |||
| .next_input_index = 0, | |||
| .graph_info = graph_info, | |||
| .tensor_mask = tensor_info.input_tensors_mask, | |||
| .input_tensors = tensor_info.input_tensors}; | |||
| return op_run_info; | |||
| } | |||
| void SessionBasic::GetParameterIndex(const KernelGraph *graph, const std::vector<tensor::TensorPtr> &inputs, | |||
| @@ -2143,7 +2152,7 @@ std::shared_ptr<KernelGraph> SessionBasic::ConstructSingleOpGraph(const OpRunInf | |||
| graph_sum_++; | |||
| std::vector<AnfNodePtr> inputs; | |||
| // set input[0] | |||
| PrimitivePtr op_prim = op_run_info.primitive; | |||
| auto op_prim = op_run_info.primitive; | |||
| MS_EXCEPTION_IF_NULL(op_prim); | |||
| // Decoupling of frontend PrimitivePy and backend Primitive | |||
| inputs.push_back(std::make_shared<ValueNode>(std::make_shared<Primitive>(*op_prim))); | |||
| @@ -2238,11 +2247,11 @@ void SessionBasic::BuildGraph(GraphId graph_id) { | |||
| executor_->BuildGraph(shared_from_this(), graph_id); | |||
| } | |||
| void SessionBasic::RunOp(OpRunInfo *op_run_info, const GraphInfo &graph_info, | |||
| std::vector<tensor::TensorPtr> *input_tensors, VectorRef *outputs, | |||
| const std::vector<int64_t> &tensors_mask) { | |||
| void SessionBasic::RunOp(OpRunInfo *op_run_info, VectorRef *outputs) { | |||
| MS_EXCEPTION_IF_NULL(executor_); | |||
| executor_->RunOp(shared_from_this(), op_run_info, graph_info, input_tensors, outputs, tensors_mask); | |||
| MS_EXCEPTION_IF_NULL(op_run_info); | |||
| executor_->RunOp(shared_from_this(), op_run_info, op_run_info->graph_info, &op_run_info->input_tensors, outputs, | |||
| op_run_info->tensor_mask); | |||
| } | |||
| void SessionBasic::RunOpsInGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, | |||
| @@ -2305,13 +2314,12 @@ void SessionBasic::RunOpsInGraphImpl(const GraphId &graph_id, const std::vector< | |||
| InputTensorInfo input_tensor_info; | |||
| GetOpInputTensors(kernel, op_output_map, parameter_index, inputs, &input_tensor_info); | |||
| VectorRef op_outputs; | |||
| // Get OpRunInfo and GraphInfo | |||
| OpRunInfo run_info; | |||
| GetSingleOpRunInfo(kernel, &run_info); | |||
| GraphInfo graph_info = GetSingleOpGraphInfo(kernel, input_tensor_info.input_tensors); | |||
| OpRunInfo run_info = GetSingleOpRunInfo(kernel, graph_info, input_tensor_info); | |||
| // Build and run current single op | |||
| VectorRef op_outputs; | |||
| RunOpImplOrigin(graph_info, &run_info, &input_tensor_info.input_tensors, &op_outputs, | |||
| input_tensor_info.input_tensors_mask); | |||
| graph_output_info.graph_output_tensors.clear(); | |||
| @@ -58,17 +58,20 @@ using AnyListPtr = std::shared_ptr<AnyList>; | |||
| struct OpRunInfo { | |||
| std::string op_name; | |||
| PrimitivePtr primitive; | |||
| Primitive *primitive; | |||
| AbstractBasePtr abstract; | |||
| bool is_dynamic_shape = false; | |||
| bool is_auto_mixed_precision = false; | |||
| bool lazy_build = false; | |||
| std::string next_op_name = ""; | |||
| std::string next_op_name; | |||
| #if defined(__APPLE__) | |||
| int next_input_index = 0; | |||
| #else | |||
| size_t next_input_index = 0; | |||
| #endif | |||
| std::string graph_info; | |||
| std::vector<int64_t> tensor_mask; | |||
| std::vector<tensor::TensorPtr> input_tensors; | |||
| }; | |||
| struct InputTensorInfo { | |||
| @@ -108,8 +111,7 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> { | |||
| void BuildGraph(GraphId graphId); | |||
| void RunGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs); | |||
| void RunGraphAsync(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs); | |||
| void RunOp(OpRunInfo *, const GraphInfo &, std::vector<tensor::TensorPtr> *input_tensors, VectorRef *outputs, | |||
| const std::vector<int64_t> &tensors_mask); | |||
| void RunOp(OpRunInfo *, VectorRef *outputs); | |||
| void RunOpsInGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs); | |||
| #ifndef ENABLE_SECURITY | |||
| @@ -276,7 +278,7 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> { | |||
| CNodePtr ConstructOutput(const AnfNodePtrList &outputs, const std::shared_ptr<KernelGraph> &graph); | |||
| // Generate graph info for a single op graph | |||
| GraphInfo GetSingleOpGraphInfo(const CNodePtr &kernel, const std::vector<tensor::TensorPtr> &input_tensors); | |||
| void GetSingleOpRunInfo(const CNodePtr cnode, OpRunInfo *run_info); | |||
| OpRunInfo GetSingleOpRunInfo(const CNodePtr &cnode, const GraphInfo &graph_info, const InputTensorInfo &tensor_info); | |||
| tensor::TensorPtr GetValueNodeOutputTensor(const AnfNodePtr &node, size_t output_index); | |||
| tensor::TensorPtr GetParameterOutputTensor(const AnfNodePtr &node, | |||
| const std::map<AnfNodePtr, size_t> ¶meter_index, | |||
| @@ -751,6 +751,9 @@ void UpdateTensorInfo(const tensor::TensorPtr &new_tensor, const std::vector<ten | |||
| pre_tensor->set_device_address(new_tensor->device_address()); | |||
| continue; | |||
| } | |||
| if (mind_rt_backend != nullptr) { | |||
| mind_rt_backend->SyncLazyTasks(); | |||
| } | |||
| // Replace data in device address when run in CPU device. | |||
| if (pre_tensor->device_address() != nullptr) { | |||
| auto old_device_address = std::dynamic_pointer_cast<device::DeviceAddress>(pre_tensor->device_address()); | |||
| @@ -811,6 +814,10 @@ py::object GetDstType(const TypeId &type_id) { | |||
| MS_EXCEPTION_IF_NULL(value); | |||
| return py::cast(value); | |||
| } | |||
| bool IsPyObjTypeInvalid(const py::object &obj) { | |||
| return !py::isinstance<tensor::Tensor>(obj) && !py::isinstance<py::int_>(obj) && !py::isinstance<py::float_>(obj); | |||
| } | |||
| } // namespace | |||
| py::object RealRunOp(const py::args &args) { | |||
| @@ -1276,7 +1283,7 @@ void ForwardExecutor::DoSignatureCast(const PrimitivePyPtr &prim, | |||
| continue; | |||
| } | |||
| if (!py::isinstance<tensor::Tensor>(obj) && !py::isinstance<py::int_>(obj) && !py::isinstance<py::float_>(obj)) { | |||
| if (IsPyObjTypeInvalid(obj)) { | |||
| MS_EXCEPTION(TypeError) << "For '" << prim->name() << "', the " << i << "th input " << signature[i].name | |||
| << " is a not support implicit conversion. " | |||
| << "Its type is " << py::cast<std::string>(obj.attr("__class__").attr("__name__")) | |||
| @@ -2013,28 +2020,35 @@ py::object ForwardExecutor::RunOpInMs(const OpExecInfoPtr &op_exec_info, Pynativ | |||
| ConvertAttrToUnifyMindIR(op_exec_info); | |||
| // get graph info for checking it whether existing in the cache | |||
| GetSingleOpGraphInfo(op_exec_info, input_tensors, tensors_mask, &graph_info); | |||
| VectorRef outputs; | |||
| #if defined(__APPLE__) | |||
| session::OpRunInfo op_run_info = {op_exec_info->op_name, | |||
| op_exec_info->py_primitive, | |||
| op_exec_info->py_primitive.get(), | |||
| op_exec_info->abstract, | |||
| op_exec_info->is_dynamic_shape, | |||
| op_exec_info->is_mixed_precision_cast, | |||
| op_exec_info->lazy_build, | |||
| op_exec_info->next_op_name, | |||
| static_cast<int>(op_exec_info->next_input_index)}; | |||
| static_cast<int>(op_exec_info->next_input_index), | |||
| graph_info, | |||
| tensors_mask, | |||
| input_tensors}; | |||
| #else | |||
| session::OpRunInfo op_run_info = {op_exec_info->op_name, | |||
| op_exec_info->py_primitive, | |||
| op_exec_info->py_primitive.get(), | |||
| op_exec_info->abstract, | |||
| op_exec_info->is_dynamic_shape, | |||
| op_exec_info->is_mixed_precision_cast, | |||
| op_exec_info->lazy_build, | |||
| op_exec_info->next_op_name, | |||
| op_exec_info->next_input_index}; | |||
| op_exec_info->next_input_index, | |||
| graph_info, | |||
| tensors_mask, | |||
| input_tensors}; | |||
| #endif | |||
| VectorRef outputs; | |||
| if (!ms_context->get_param<bool>(MS_CTX_ENABLE_MINDRT)) { | |||
| kSession->RunOp(&op_run_info, graph_info, &input_tensors, &outputs, tensors_mask); | |||
| kSession->RunOp(&op_run_info, &outputs); | |||
| } else { | |||
| if (mind_rt_backend == nullptr) { | |||
| const auto &device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET); | |||
| @@ -2043,9 +2057,7 @@ py::object ForwardExecutor::RunOpInMs(const OpExecInfoPtr &op_exec_info, Pynativ | |||
| } | |||
| mindspore::ScopedLongRunning long_running; | |||
| const compile::ActorInfo &actor_info = | |||
| mind_rt_backend->CompileGraph(op_run_info, graph_info, &tensors_mask, &input_tensors); | |||
| mind_rt_backend->RunGraph(actor_info, &op_run_info, &tensors_mask, &input_tensors, &outputs); | |||
| mind_rt_backend->RunOp(&op_run_info, &outputs); | |||
| } | |||
| if (op_exec_info->is_dynamic_shape) { | |||
| @@ -3222,6 +3234,9 @@ void PynativeExecutor::ClearGrad(const py::object &cell, const py::args &args) { | |||
| void PynativeExecutor::ClearRes() { | |||
| MS_LOG(DEBUG) << "Clear all res"; | |||
| session::PynativeTaskManager::GetInstance().Reset(); | |||
| if (mind_rt_backend != nullptr) { | |||
| mind_rt_backend->ClearOpBuilderResource(); | |||
| } | |||
| SetLazyBuild(false); | |||
| cell_depth_ = 0; | |||
| @@ -3279,6 +3294,8 @@ void PynativeExecutor::Sync() { | |||
| auto ms_context = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(ms_context); | |||
| ExecuteAllTask(); | |||
| if (!ms_context->get_param<bool>(MS_CTX_ENABLE_MINDRT)) { | |||
| if (kSession == nullptr) { | |||
| MS_EXCEPTION(NotExistsError) << "No session has been created!"; | |||
| @@ -3312,7 +3329,12 @@ void PynativeExecutor::ExitCell() { | |||
| bool PynativeExecutor::IsTopCell() const { return cell_depth_ == 0; } | |||
| void PynativeExecutor::ExecuteAllTask() { session::PynativeTaskManager::GetInstance().ExecuteRemainingTasks(); } | |||
| void PynativeExecutor::ExecuteAllTask() { | |||
| session::PynativeTaskManager::GetInstance().ExecuteRemainingTasks(); | |||
| if (mind_rt_backend != nullptr) { | |||
| mind_rt_backend->SyncLazyTasks(); | |||
| } | |||
| } | |||
| REGISTER_PYBIND_DEFINE(PynativeExecutor_, ([](const py::module *m) { | |||
| (void)py::class_<PynativeExecutor, std::shared_ptr<PynativeExecutor>>(*m, "PynativeExecutor_") | |||
| @@ -18,7 +18,9 @@ | |||
| #include <numeric> | |||
| #include <map> | |||
| #include <utility> | |||
| #include <algorithm> | |||
| #include "runtime/framework/graph_scheduler.h" | |||
| #include "runtime/op_builder/op_lazy_builder.h" | |||
| #include "runtime/device/device_address.h" | |||
| #include "common/trans.h" | |||
| #include "utils/convert_utils.h" | |||
| @@ -179,16 +181,16 @@ void CreateKernelOutputDeviceAddress(const DeviceContext *device_context, const | |||
| if (AnfAlgo::IsControlOpExecInBackend(kernel)) { | |||
| continue; | |||
| } | |||
| auto kernel_mod = AnfAlgo::GetKernelMod(kernel); | |||
| MS_EXCEPTION_IF_NULL(kernel_mod); | |||
| auto output_sizes = kernel_mod->GetOutputSizeList(); | |||
| for (size_t i = 0; i < output_sizes.size(); ++i) { | |||
| auto output_size = AnfAlgo::GetOutputTensorNum(kernel); | |||
| for (size_t i = 0; i < output_size; ++i) { | |||
| if (AnfAlgo::OutputAddrExist(kernel, i)) { | |||
| continue; | |||
| } | |||
| auto output_format = AnfAlgo::GetOutputFormat(kernel, i); | |||
| auto output_type = AnfAlgo::GetOutputDeviceDataType(kernel, i); | |||
| auto device_address = device_context->CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type); | |||
| auto address_size = AnfAlgo::GetOutputTensorMemSize(kernel, i); | |||
| auto device_address = device_context->CreateDeviceAddress(nullptr, address_size, output_format, output_type); | |||
| MS_LOG(DEBUG) << "Create addr for node:" << AnfAlgo::GetNodeDebugString(kernel) << " addr:" << device_address; | |||
| AnfAlgo::SetOutputAddr(device_address, i, kernel.get()); | |||
| } | |||
| @@ -465,13 +467,12 @@ GraphId GraphCompiler::CompileGraphImpl(const KernelGraphPtr &graph, const Devic | |||
| return graph->graph_id(); | |||
| } | |||
| GraphId GraphCompiler::CompileGraph(const session::OpRunInfo &op_run_info, const GraphInfo &graph_info, | |||
| const std::vector<int64_t> *tensors_mask, | |||
| std::vector<TensorPtr> *const input_tensors, bool *single_op_cache_hit, | |||
| GraphId GraphCompiler::CompileGraph(const session::OpRunInfo &op_run_info, bool *single_op_cache_hit, | |||
| const DeviceContext *device_context) { | |||
| // Check if the graph cache exists. | |||
| auto iter = run_op_graphs_.find(graph_info); | |||
| if (iter != run_op_graphs_.end()) { | |||
| auto iter = run_op_graphs_.find(op_run_info.graph_info); | |||
| auto &op_lazy_builder = runtime::OpLazyBuilder::GetInstance(); | |||
| if (iter != run_op_graphs_.end() && op_lazy_builder.QueueEmpty()) { | |||
| const auto &graph = iter->second; | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| *single_op_cache_hit = true; | |||
| @@ -480,22 +481,20 @@ GraphId GraphCompiler::CompileGraph(const session::OpRunInfo &op_run_info, const | |||
| *single_op_cache_hit = false; | |||
| // Generate kernel graph. | |||
| MS_EXCEPTION_IF_NULL(session_); | |||
| KernelGraphPtr graph = session_->ConstructSingleOpGraph(op_run_info, *input_tensors, *tensors_mask); | |||
| KernelGraphPtr graph = | |||
| session_->ConstructSingleOpGraph(op_run_info, op_run_info.input_tensors, op_run_info.tensor_mask); | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_EXCEPTION_IF_NULL(device_context); | |||
| device_context->OptimizeSingleOpGraph(graph); | |||
| // Generate 'KernelMod' for kernel in graph. | |||
| device_context->CreateKernel(graph->execution_order()); | |||
| device_context->PreprocessBeforeRunSingleOpGraph(graph); | |||
| // Create device address for all anf nodes of graph. | |||
| CreateDeviceAddress(graph, device_context); | |||
| CreateDeviceAddressWithoutWorkspace(graph, device_context); | |||
| graph->set_is_all_nop_node(opt::IsAllNopNode(graph.get())); | |||
| run_op_graphs_[graph_info] = graph; | |||
| run_op_graphs_[op_run_info.graph_info] = graph; | |||
| auto output_nodes = graph->outputs(); | |||
| auto &outputs_with_index = run_op_graph_output_nodes_[graph->graph_id()]; | |||
| @@ -509,6 +508,22 @@ GraphId GraphCompiler::CompileGraph(const session::OpRunInfo &op_run_info, const | |||
| return graph->graph_id(); | |||
| } | |||
| void GraphCompiler::BuildSingleOpGraphs(const std::vector<KernelGraphPtr> &graphs, | |||
| const DeviceContext *device_context) const { | |||
| MS_EXCEPTION_IF_NULL(device_context); | |||
| std::vector<CNodePtr> node_to_build; | |||
| for (const auto &graph : graphs) { | |||
| const auto &nodes = graph->execution_order(); | |||
| std::copy(nodes.begin(), nodes.end(), std::back_inserter(node_to_build)); | |||
| } | |||
| device_context->CreateKernel(node_to_build); | |||
| for (const auto &graph : graphs) { | |||
| CreateKernelWorkspaceDeviceAddress(device_context, graph); | |||
| } | |||
| } | |||
| KernelGraphPtr GraphCompiler::Fetch(GraphId graph_id) const { | |||
| MS_EXCEPTION_IF_NULL(session_); | |||
| return session_->GetGraph(graph_id); | |||
| @@ -532,6 +547,14 @@ void GraphCompiler::CreateDeviceAddress(const KernelGraphPtr &graph, const Devic | |||
| UpdateDeviceAddressForRefNode(graph); | |||
| } | |||
| void GraphCompiler::CreateDeviceAddressWithoutWorkspace(const KernelGraphPtr &graph, | |||
| const DeviceContext *device_context) const { | |||
| CreateParameterDeviceAddress(device_context, graph); | |||
| CreateValueNodeDeviceAddress(device_context, graph); | |||
| CreateKernelOutputDeviceAddress(device_context, graph); | |||
| UpdateDeviceAddressForInplaceNode(graph); | |||
| } | |||
| void GraphCompiler::GetParamAndOutputIndex( | |||
| const KernelGraphPtr &graph, const std::vector<TensorPtr> &inputs, VectorRef *const outputs, | |||
| std::map<AnfNodePtr, size_t> *parameter_index, | |||
| @@ -560,12 +583,13 @@ TensorPtr GraphCompiler::GetSingleOpInputTensorByIndex(const CNodePtr &kernel, | |||
| input_index); | |||
| } | |||
| void GraphCompiler::GetSingleOpRunInfoAndGraphInfo(const CNodePtr &kernel, const std::vector<TensorPtr> &input_tensors, | |||
| OpRunInfo *const run_info, GraphInfo *const graph_info) { | |||
| void GraphCompiler::GetSingleOpRunInfoAndGraphInfo(const CNodePtr &kernel, const InputTensorInfo &tensor_info, | |||
| OpRunInfo *run_info, GraphInfo *graph_info) { | |||
| MS_EXCEPTION_IF_NULL(session_); | |||
| MS_EXCEPTION_IF_NULL(graph_info); | |||
| session_->GetSingleOpRunInfo(kernel, run_info); | |||
| *graph_info = session_->GetSingleOpGraphInfo(kernel, input_tensors); | |||
| *graph_info = session_->GetSingleOpGraphInfo(kernel, tensor_info.input_tensors); | |||
| *run_info = session_->GetSingleOpRunInfo(kernel, *graph_info, tensor_info); | |||
| } | |||
| void GraphCompiler::CalculateRefCount(const KernelGraphPtr &graph, std::map<KernelWithIndex, size_t> *ref_count) const { | |||
| @@ -104,9 +104,11 @@ class GraphCompiler { | |||
| GraphId CompileGraph(const FuncGraphPtr &func_graph, const DeviceContext *device_context); | |||
| // Construct single op kernel graph and compile the kernel graph in PyNative mode. | |||
| GraphId CompileGraph(const session::OpRunInfo &op_run_info, const GraphInfo &graph_info, | |||
| const std::vector<int64_t> *tensors_mask, std::vector<TensorPtr> *const input_tensors, | |||
| bool *single_op_cache_hit, const DeviceContext *device_context); | |||
| GraphId CompileGraph(const session::OpRunInfo &op_run_info, bool *single_op_cache_hit, | |||
| const DeviceContext *device_context); | |||
| // Create kernel and Create workspace for graphs in PyNative mode. | |||
| void BuildSingleOpGraphs(const std::vector<KernelGraphPtr> &graphs, const DeviceContext *device_context) const; | |||
| // Get graph by graph id, if not exist return nullptr, used in Graph mode. | |||
| KernelGraphPtr Fetch(GraphId graph_id) const; | |||
| @@ -135,8 +137,8 @@ class GraphCompiler { | |||
| InputTensorInfo *const input_tensor_info, size_t input_index); | |||
| // Get OpRunInfo and GraphInfo for single op compile and run. | |||
| void GetSingleOpRunInfoAndGraphInfo(const CNodePtr &kernel, const std::vector<TensorPtr> &input_tensors, | |||
| OpRunInfo *const run_info, GraphInfo *const graph_info); | |||
| void GetSingleOpRunInfoAndGraphInfo(const CNodePtr &kernel, const InputTensorInfo &tensor_info, OpRunInfo *run_info, | |||
| GraphInfo *graph_info); | |||
| // Calculate ref count of PyNative back propagation operators. | |||
| void CalculateRefCount(const KernelGraphPtr &graph, std::map<KernelWithIndex, size_t> *ref_count) const; | |||
| @@ -181,6 +183,9 @@ class GraphCompiler { | |||
| // Create device address for all anf nodes of graph. | |||
| void CreateDeviceAddress(const KernelGraphPtr &graph, const DeviceContext *device_context) const; | |||
| // Create device address for input and output of ops. | |||
| void CreateDeviceAddressWithoutWorkspace(const KernelGraphPtr &graph, const DeviceContext *device_context) const; | |||
| // Single op kernel graph cache for PyNative mode. | |||
| std::unordered_map<GraphInfo, KernelGraphPtr> run_op_graphs_; | |||
| // Single op kernel graph output nodes cache for PyNative mode. | |||
| @@ -0,0 +1,5 @@ | |||
| file(GLOB_RECURSE BUILDER_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} | |||
| "op_lazy_builder.cc") | |||
| set_property(SOURCE ${BUILDER_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE) | |||
| add_library(_mindspore_runtime_op_builder_obj OBJECT ${BUILDER_SRC_LIST}) | |||
| @@ -0,0 +1,45 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "runtime/op_builder/op_lazy_builder.h" | |||
| namespace mindspore::runtime { | |||
| void OpLazyBuilder::Register(const std::function<void()> &callback) { | |||
| execute_callback_ = callback; | |||
| registered_ = true; | |||
| } | |||
| void OpLazyBuilder::Reset() { | |||
| ClearAllResources(); | |||
| execute_callback_ = nullptr; | |||
| registered_ = false; | |||
| } | |||
| void OpLazyBuilder::ClearAllResources() { | |||
| op_build_tasks.clear(); | |||
| std::queue<std::shared_ptr<OpTask>> empty; | |||
| std::swap(op_run_tasks, empty); | |||
| } | |||
| void OpLazyBuilder::ExecuteRemainingTasks() { | |||
| if (!executing_) { | |||
| ExecuteGuard guard; | |||
| if (execute_callback_ != nullptr) { | |||
| execute_callback_(); | |||
| } | |||
| } | |||
| } | |||
| } // namespace mindspore::runtime | |||
| @@ -0,0 +1,120 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_OP_BUILDER_OP_LAZY_BUILDER_H_ | |||
| #define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_OP_BUILDER_OP_LAZY_BUILDER_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include <queue> | |||
| #include <map> | |||
| #include <string> | |||
| #include <utility> | |||
| #include "backend/session/kernel_graph.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "runtime/hardware/device_context.h" | |||
| #include "runtime/framework/graph_scheduler.h" | |||
| namespace mindspore::runtime { | |||
// Immutable bundle of everything needed to later build and run one single-op
// graph: compiler info, the kernel graph, its output nodes, a snapshot of the
// OpRunInfo, and the device context to execute on.
class OpLazyBuilderContext {
 public:
  OpLazyBuilderContext(GraphCompilerInfo *graph_compiler_info, KernelGraphPtr graph,
                       std::vector<session::KernelWithIndex> output_nodes, const session::OpRunInfo &op_run_info,
                       device::DeviceContext *device_context)
      : graph_compiler_info_(graph_compiler_info),
        graph_(std::move(graph)),
        output_nodes_(std::move(output_nodes)),
        op_run_info_(op_run_info),
        device_context_(device_context) {}
  ~OpLazyBuilderContext() = default;

  GraphCompilerInfo *graph_compiler_info() const { return graph_compiler_info_; }
  const KernelGraphPtr &graph() const { return graph_; }
  const std::vector<session::KernelWithIndex> &output_nodes() const { return output_nodes_; }
  const session::OpRunInfo &op_run_info() const { return op_run_info_; }
  device::DeviceContext *device_context() const { return device_context_; }

 private:
  // Non-owning. NOTE(review): assumed to outlive this context — confirm owner.
  GraphCompilerInfo *graph_compiler_info_;
  KernelGraphPtr graph_;
  std::vector<session::KernelWithIndex> output_nodes_;
  // Stored by value so the context remains valid after the caller's
  // OpRunInfo goes out of scope.
  session::OpRunInfo op_run_info_;
  // Non-owning. NOTE(review): presumably managed by DeviceContextManager —
  // confirm lifetime.
  device::DeviceContext *device_context_;
};
// Base class for deferred single-op work items. Holds the shared context that
// describes which graph to build/run and on which device.
class OpTask {
 public:
  explicit OpTask(std::shared_ptr<OpLazyBuilderContext> context) : context_(std::move(context)) {}
  virtual ~OpTask() = default;
  // Accessor for the task's context (shared with sibling tasks for the same op).
  const std::shared_ptr<OpLazyBuilderContext> &context() { return context_; }

 protected:
  std::shared_ptr<OpLazyBuilderContext> context_;
};
// Deferred "build this op's kernel" item, queued via PushOpBuildTask and
// executed later in a batch.
class OpBuildTask : public OpTask {
 public:
  explicit OpBuildTask(std::shared_ptr<OpLazyBuilderContext> context) : OpTask(std::move(context)) {}
  ~OpBuildTask() override = default;
};
// Deferred "run this op" item, queued via PushOpRunTask after the matching
// build task.
class OpRunTask : public OpTask {
 public:
  explicit OpRunTask(std::shared_ptr<OpLazyBuilderContext> context) : OpTask(std::move(context)) {}
  ~OpRunTask() override = default;
};
| class OpLazyBuilder { | |||
| public: | |||
| static OpLazyBuilder &GetInstance() { | |||
| static OpLazyBuilder instance; | |||
| return instance; | |||
| } | |||
| class ExecuteGuard { | |||
| public: | |||
| ExecuteGuard() { OpLazyBuilder::GetInstance().executing_ = true; } | |||
| ~ExecuteGuard() { OpLazyBuilder::GetInstance().executing_ = false; } | |||
| }; | |||
| void Register(const std::function<void()> &callback); | |||
| const std::vector<std::shared_ptr<OpTask>> &GetOpBuildTasks() const { return op_build_tasks; } | |||
| const std::queue<std::shared_ptr<OpTask>> &GetOpRunTasks() const { return op_run_tasks; } | |||
| void ClearOpBuildTasks() { op_build_tasks.clear(); } | |||
| void Reset(); | |||
| void ClearAllResources(); | |||
| void ExecuteRemainingTasks(); | |||
| void PushOpBuildTask(const std::shared_ptr<OpTask> &op_build_task) { op_build_tasks.push_back(op_build_task); } | |||
| void PushOpRunTask(const std::shared_ptr<OpTask> &op_run_task) { op_run_tasks.push(op_run_task); } | |||
| void PopOpRunTask() { op_run_tasks.pop(); } | |||
| bool QueueEmpty() const { return op_run_tasks.empty() && op_build_tasks.empty(); } | |||
| bool QueueFull() const { return op_build_tasks.size() > kMaxQueueSize || op_run_tasks.size() > kMaxQueueSize; } | |||
| bool registered() const { return registered_; } | |||
| private: | |||
| OpLazyBuilder() = default; | |||
| ~OpLazyBuilder() = default; | |||
| DISABLE_COPY_AND_ASSIGN(OpLazyBuilder); | |||
| std::vector<std::shared_ptr<OpTask>> op_build_tasks; | |||
| std::queue<std::shared_ptr<OpTask>> op_run_tasks; | |||
| std::function<void()> execute_callback_{nullptr}; | |||
| inline static size_t kMaxQueueSize = 100; | |||
| bool executing_{false}; | |||
| bool registered_{false}; | |||
| }; | |||
| } // namespace mindspore::runtime | |||
| #endif // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_OP_BUILDER_OP_LAZY_BUILDER_H_ | |||
| @@ -21,6 +21,7 @@ | |||
| #include "vm/transform.h" | |||
| #include "backend/session/session_factory.h" | |||
| #include "runtime/op_builder/op_lazy_builder.h" | |||
| #include "backend/optimizer/common/helper.h" | |||
| #include "pipeline/pynative/pynative_execute.h" | |||
| #include "pipeline/jit/parse/data_converter.h" | |||
| @@ -207,6 +208,8 @@ TensorPtr CreateOutputTensor(const AnfNodePtr &output_node, size_t output_index) | |||
| MS_EXCEPTION_IF_NULL(device_tensor); | |||
| tensor->set_device_address(device_tensor); | |||
| // MindRT is disabled in the multi graphs scenario | |||
| // Delete tensor->data_sync() when MindRT is enabled in all scenes. | |||
| auto ms_context = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(ms_context); | |||
| if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) != kPynativeMode) { | |||
| @@ -490,45 +493,6 @@ void MindRTBackend::CompileGraph(const GraphSegmentPtr &segment, bool contain_mu | |||
| } | |||
| } | |||
| const ActorInfo &MindRTBackend::CompileGraph(const OpRunInfo &op_run_info, const GraphInfo &graph_info, | |||
| const std::vector<int64_t> *tensors_mask, | |||
| std::vector<tensor::TensorPtr> *input_tensors) { | |||
| MS_EXCEPTION_IF_NULL(graph_compiler_); | |||
| // Get the device context. | |||
| const auto &device_context = | |||
| device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_name_, device_id_}); | |||
| MS_EXCEPTION_IF_NULL(device_context); | |||
| device_context->Initialize(); | |||
| bool single_op_cache_hit = true; | |||
| auto graph_id = graph_compiler_->CompileGraph(op_run_info, graph_info, tensors_mask, input_tensors, | |||
| &single_op_cache_hit, device_context); | |||
| // The actor set name: graph_id + single operator name. | |||
| std::string actor_info = std::to_string(graph_id) + "_" + op_run_info.op_name; | |||
| if (single_op_cache_hit) { | |||
| auto iter = actor_to_graph_compiler_info_.find(actor_info); | |||
| if (iter == actor_to_graph_compiler_info_.end()) { | |||
| MS_LOG(EXCEPTION) << "Can not find graph compiler info for actor set: " << actor_info; | |||
| } | |||
| return iter->first; | |||
| } | |||
| graph_info_to_device_context_.clear(); | |||
| graph_info_to_device_context_[graph_info] = device_context; | |||
| auto context_ptr = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context_ptr); | |||
| bool enable_cache = context_ptr->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE); | |||
| auto graph_compiler_info = ConstructGraphCompilerInfo(actor_info, tensors_mask, input_tensors, !enable_cache); | |||
| const auto actor_set = runtime::GraphScheduler::GetInstance().Transform(*graph_compiler_info); | |||
| runtime::GraphScheduler::GetInstance().Schedule(actor_set); | |||
| MS_EXCEPTION_IF_NULL(graph_compiler_info); | |||
| graph_compiler_info->input_tensors_.clear(); | |||
| auto ret = actor_to_graph_compiler_info_.emplace(actor_info, std::move(graph_compiler_info)); | |||
| return ret.first->first; | |||
| } | |||
| namespace { | |||
| void GetControlOpInput(const std::shared_ptr<GraphCompiler> &graph_compiler, const CNodePtr &front_cnode, | |||
| const CNodePtr &backend_cnode, const std::map<KernelWithIndex, tensor::TensorPtr> &op_output_map, | |||
| @@ -776,6 +740,7 @@ void PushTupleTensor(const VectorRef &args, const std::vector<AnfNodePtr> ¶m | |||
| void MindRTBackend::RunGraphBySingleOp(const std::vector<KernelGraphPtr> &graphs, | |||
| const std::vector<std::vector<tensor::TensorPtr>> &inputs, VectorRef *outputs) { | |||
| runtime::OpLazyBuilder::GetInstance().ExecuteRemainingTasks(); | |||
| MS_EXCEPTION_IF_NULL(graph_compiler_); | |||
| for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { | |||
| const auto &graph = graphs[graph_index]; | |||
| @@ -810,16 +775,14 @@ void MindRTBackend::RunGraphBySingleOp(const std::vector<KernelGraphPtr> &graphs | |||
| GraphInfo graph_info; | |||
| graph_compiler_->GetSingleOpInputTensors(kernel, op_output_map, parameter_index, inputs[graph_index], | |||
| &input_tensor_info); | |||
| graph_compiler_->GetSingleOpRunInfoAndGraphInfo(kernel, input_tensor_info.input_tensors, &op_run_info, | |||
| &graph_info); | |||
| graph_compiler_->GetSingleOpRunInfoAndGraphInfo(kernel, input_tensor_info, &op_run_info, &graph_info); | |||
| const ActorInfo &actor_info = CompileGraph(op_run_info, graph_info, &input_tensor_info.input_tensors_mask, | |||
| &input_tensor_info.input_tensors); | |||
| RunGraph(actor_info, &op_run_info, &input_tensor_info.input_tensors_mask, &input_tensor_info.input_tensors, | |||
| &op_outputs); | |||
| RunOp(&op_run_info, &op_outputs); | |||
| } else { | |||
| RunControlOperator(graph_compiler_, graph, kernel, op_output_map, parameter_index, inputs[graph_index], | |||
| &input_tensor_info, &op_outputs); | |||
| // Execute remaining lazy tasks before PyNative hook exit. | |||
| runtime::OpLazyBuilder::GetInstance().ExecuteRemainingTasks(); | |||
| } | |||
| graph_compiler_->UpdateRefCount(input_tensor_info.input_kernel, &cnode_ref_count, &op_output_map); | |||
| @@ -999,6 +962,10 @@ void MindRTBackend::SetDebuggerInit() { | |||
| } | |||
| #endif | |||
| void MindRTBackend::SyncLazyTasks() const { runtime::OpLazyBuilder::GetInstance().ExecuteRemainingTasks(); } | |||
| void MindRTBackend::ClearOpBuilderResource() const { runtime::OpLazyBuilder::GetInstance().Reset(); } | |||
| std::unique_ptr<GraphCompilerInfo> MindRTBackend::ConstructGraphCompilerInfo(const FuncGraphPtr &root_graph) { | |||
| MS_EXCEPTION_IF_NULL(root_graph); | |||
| MS_EXCEPTION_IF_NULL(graph_compiler_); | |||
| @@ -1092,22 +1059,21 @@ void MindRTBackend::EraseSingleOpCache(const ActorInfo &actor_info, const Kernel | |||
| actor_to_graph_compiler_info_.erase(actor_info); | |||
| } | |||
| void MindRTBackend::RunGraph(const ActorInfo &actor_info, OpRunInfo *op_run_info, | |||
| const std::vector<int64_t> *tensors_mask, | |||
| const std::vector<tensor::TensorPtr> *input_tensors, VectorRef *outputs) { | |||
| MS_EXCEPTION_IF_NULL(input_tensors); | |||
| MS_EXCEPTION_IF_NULL(op_run_info); | |||
| MS_EXCEPTION_IF_NULL(tensors_mask); | |||
| void MindRTBackend::RunSingleOpGraph(const KernelGraphPtr &graph, | |||
| const std::vector<session::KernelWithIndex> &output_nodes, | |||
| const OpRunInfo &op_run_info, const GraphCompilerInfo *graph_compiler_info, | |||
| DeviceContext *device_context) { | |||
| // Erase value node tensor. | |||
| std::vector<tensor::TensorPtr> tensors_without_value_node; | |||
| if (input_tensors->size() != tensors_mask->size()) { | |||
| MS_LOG(EXCEPTION) << "Input tensors size " << input_tensors->size() << " should be equal to tensors mask size " | |||
| << tensors_mask->size(); | |||
| } | |||
| for (size_t index = 0; index < tensors_mask->size(); ++index) { | |||
| if (tensors_mask->at(index) != kValueNodeTensorMask) { | |||
| (void)tensors_without_value_node.emplace_back(input_tensors->at(index)); | |||
| const auto &input_tensors = op_run_info.input_tensors; | |||
| const auto &tensors_mask = op_run_info.tensor_mask; | |||
| if (input_tensors.size() != tensors_mask.size()) { | |||
| MS_LOG(EXCEPTION) << "Input tensors size " << input_tensors.size() << " should be equal to tensors mask size " | |||
| << tensors_mask.size(); | |||
| } | |||
| for (size_t index = 0; index < tensors_mask.size(); ++index) { | |||
| if (tensors_mask.at(index) != kValueNodeTensorMask) { | |||
| (void)tensors_without_value_node.emplace_back(input_tensors.at(index)); | |||
| } | |||
| } | |||
| @@ -1119,30 +1085,11 @@ void MindRTBackend::RunGraph(const ActorInfo &actor_info, OpRunInfo *op_run_info | |||
| } | |||
| // Run actor DAG. | |||
| const auto &actor_set = runtime::GraphScheduler::GetInstance().Fetch(actor_info); | |||
| const auto &actor_set = runtime::GraphScheduler::GetInstance().Fetch(graph_compiler_info->name_); | |||
| MS_EXCEPTION_IF_NULL(actor_set); | |||
| runtime::GraphScheduler::GetInstance().Run(actor_set, {}, {tensors_without_value_node}, *input_tensors, | |||
| runtime::GraphScheduler::GetInstance().Run(actor_set, {}, {tensors_without_value_node}, input_tensors, | |||
| runtime::GraphExecutionStrategy::kStep); | |||
| // Fetch outputs. | |||
| const auto &graph_iter = actor_to_graph_compiler_info_.find(actor_info); | |||
| if (graph_iter == actor_to_graph_compiler_info_.end()) { | |||
| MS_LOG(EXCEPTION) << "Can't find the graph compiler info."; | |||
| } | |||
| MS_EXCEPTION_IF_NULL(graph_iter->second); | |||
| const auto &graph_compiler_info = *(graph_iter->second); | |||
| const auto &graph = graph_compiler_info.graphs_.front(); | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_EXCEPTION_IF_NULL(graph_compiler_); | |||
| const auto &output_nodes = graph_compiler_->GetGraphOutputNodes(graph->graph_id()); | |||
| MS_EXCEPTION_IF_NULL(outputs); | |||
| UpdateOutput(output_nodes, outputs); | |||
| // Update output abstract of dynamic op to op_run_info | |||
| if (op_run_info->is_dynamic_shape) { | |||
| UpdateOutputAbstract(graph, op_run_info); | |||
| } | |||
| // Release the kernel resource. | |||
| const auto &kernels = graph->execution_order(); | |||
| for (const auto &kernel : kernels) { | |||
| @@ -1154,14 +1101,171 @@ void MindRTBackend::RunGraph(const ActorInfo &actor_info, OpRunInfo *op_run_info | |||
| } | |||
| } | |||
| } | |||
| } | |||
| void MindRTBackend::CompileSingleOpGraphs(const std::vector<std::shared_ptr<runtime::OpTask>> &build_tasks) { | |||
| if (build_tasks.empty()) { | |||
| return; | |||
| } | |||
| std::vector<KernelGraphPtr> graphs; | |||
| std::vector<GraphCompilerInfo *> graph_compiler_infos; | |||
| for (const auto &task : build_tasks) { | |||
| MS_EXCEPTION_IF_NULL(task); | |||
| const auto &context = task->context(); | |||
| MS_EXCEPTION_IF_NULL(context); | |||
| graphs.push_back(context->graph()); | |||
| graph_compiler_infos.push_back(context->graph_compiler_info()); | |||
| } | |||
| MS_EXCEPTION_IF_NULL(build_tasks[0]); | |||
| auto &task_context = build_tasks[0]->context(); | |||
| MS_EXCEPTION_IF_NULL(task_context); | |||
| auto device_context = task_context->device_context(); | |||
| graph_compiler_->BuildSingleOpGraphs(graphs, device_context); | |||
| for (const auto &graph_compiler_info : graph_compiler_infos) { | |||
| MS_EXCEPTION_IF_NULL(graph_compiler_info); | |||
| auto actor_set = runtime::GraphScheduler::GetInstance().Transform(*graph_compiler_info); | |||
| graph_compiler_info->input_tensors_.clear(); | |||
| runtime::GraphScheduler::GetInstance().Schedule(actor_set); | |||
| } | |||
| } | |||
| void MindRTBackend::LazyExecuteTaskCallback() { | |||
| auto &op_lazy_builder = runtime::OpLazyBuilder::GetInstance(); | |||
| if (op_lazy_builder.QueueEmpty()) { | |||
| return; | |||
| } | |||
| try { | |||
| MS_LOG(DEBUG) << "Start"; | |||
| auto ms_context = MsContext::GetInstance(); | |||
| auto infer_flag = ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER); | |||
| ms_context->set_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER, true); | |||
| CompileSingleOpGraphs(op_lazy_builder.GetOpBuildTasks()); | |||
| op_lazy_builder.ClearOpBuildTasks(); | |||
| // Run op one by one | |||
| auto &op_run_tasks = op_lazy_builder.GetOpRunTasks(); | |||
| while (!op_run_tasks.empty()) { | |||
| auto &op_run_task = op_run_tasks.front(); | |||
| const auto &context = op_run_task->context(); | |||
| RunSingleOpGraph(context->graph(), context->output_nodes(), context->op_run_info(), | |||
| context->graph_compiler_info(), context->device_context()); | |||
| UpdateOutputDeviceAddress(context->output_nodes(), context->device_context()); | |||
| UpdateInputDeviceAddress(context->graph()); | |||
| op_lazy_builder.PopOpRunTask(); | |||
| } | |||
| ms_context->set_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER, infer_flag); | |||
| MS_LOG(DEBUG) << "End"; | |||
| } catch (const std::exception &ex) { | |||
| op_lazy_builder.Reset(); | |||
| throw(std::runtime_error(ex.what())); | |||
| } catch (...) { | |||
| op_lazy_builder.Reset(); | |||
| std::string exName(abi::__cxa_current_exception_type()->name()); | |||
| MS_LOG(EXCEPTION) << "Error occurred when execute task in queue. Exception name: " << exName; | |||
| } | |||
| } | |||
| // Update device address for input and output of graph. | |||
| UpdateOutputDeviceAddress(output_nodes, graph_compiler_info.device_contexts_.front()); | |||
| UpdateInputDeviceAddress(graph); | |||
| void MindRTBackend::RunOpInternal(bool single_op_cache_hit, GraphCompilerInfo *graph_compiler_info, | |||
| OpRunInfo *op_run_info, VectorRef *outputs) { | |||
| MS_EXCEPTION_IF_NULL(op_run_info); | |||
| MS_EXCEPTION_IF_NULL(graph_compiler_info); | |||
| // Fetch outputs. | |||
| const auto &graph = graph_compiler_info->graphs_.front(); | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_EXCEPTION_IF_NULL(graph_compiler_); | |||
| const auto &output_nodes = graph_compiler_->GetGraphOutputNodes(graph->graph_id()); | |||
| MS_EXCEPTION_IF_NULL(outputs); | |||
| if (graph_compiler_info.need_erase_) { | |||
| EraseSingleOpCache(actor_info, graph); | |||
| auto device_context = graph_compiler_info->device_contexts_.front(); | |||
| auto &op_lazy_builder = runtime::OpLazyBuilder::GetInstance(); | |||
| bool lazy_build_disabled = graph_compiler_info->need_erase_ || | |||
| (single_op_cache_hit && op_lazy_builder.QueueEmpty()) || !op_run_info->lazy_build; | |||
| if (lazy_build_disabled) { | |||
| if (!single_op_cache_hit) { | |||
| CompileSingleOpGraph(graph, device_context, graph_compiler_info); | |||
| } | |||
| RunSingleOpGraph(graph, output_nodes, *op_run_info, graph_compiler_info, device_context); | |||
| UpdateOutput(output_nodes, outputs); | |||
| UpdateOutputDeviceAddress(output_nodes, device_context); | |||
| UpdateInputDeviceAddress(graph); | |||
| if (op_run_info->is_dynamic_shape) { | |||
| UpdateOutputAbstract(graph, op_run_info); | |||
| } | |||
| if (graph_compiler_info->need_erase_) { | |||
| EraseSingleOpCache(graph_compiler_info->name_, graph); | |||
| } | |||
| } else { | |||
| UpdateOutput(output_nodes, outputs); | |||
| auto run_op_context = std::make_shared<runtime::OpLazyBuilderContext>( | |||
| graph_compiler_info, graph, output_nodes, *op_run_info, graph_compiler_info->device_contexts_.front()); | |||
| if (!single_op_cache_hit) { | |||
| op_lazy_builder.PushOpBuildTask(std::make_shared<runtime::OpBuildTask>(run_op_context)); | |||
| } | |||
| op_lazy_builder.PushOpRunTask(std::make_shared<runtime::OpRunTask>(run_op_context)); | |||
| if (!op_lazy_builder.registered()) { | |||
| op_lazy_builder.Register([this]() { LazyExecuteTaskCallback(); }); | |||
| } | |||
| if (op_lazy_builder.QueueFull()) { | |||
| op_lazy_builder.ExecuteRemainingTasks(); | |||
| } | |||
| } | |||
| } | |||
| void MindRTBackend::RunOp(OpRunInfo *op_run_info, VectorRef *outputs) { | |||
| MS_EXCEPTION_IF_NULL(op_run_info); | |||
| MS_EXCEPTION_IF_NULL(graph_compiler_); | |||
| // Get the device context. | |||
| const auto &device_context = | |||
| device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_name_, device_id_}); | |||
| MS_EXCEPTION_IF_NULL(device_context); | |||
| device_context->Initialize(); | |||
| bool single_op_cache_hit = true; | |||
| auto graph_id = graph_compiler_->CompileGraph(*op_run_info, &single_op_cache_hit, device_context); | |||
| std::string actor_info = std::to_string(graph_id) + "_" + op_run_info->op_name; | |||
| GraphCompilerInfo *graph_compiler_info_ptr; | |||
| if (single_op_cache_hit) { | |||
| auto iter = actor_to_graph_compiler_info_.find(actor_info); | |||
| if (iter == actor_to_graph_compiler_info_.end()) { | |||
| MS_LOG(EXCEPTION) << "Can not find graph compiler info for actor set: " << actor_info; | |||
| } | |||
| graph_compiler_info_ptr = iter->second.get(); | |||
| } else { | |||
| graph_info_to_device_context_.clear(); | |||
| graph_info_to_device_context_[op_run_info->graph_info] = device_context; | |||
| auto context_ptr = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context_ptr); | |||
| bool enable_cache = context_ptr->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE); | |||
| auto graph_compiler_info = | |||
| ConstructGraphCompilerInfo(actor_info, &op_run_info->tensor_mask, &op_run_info->input_tensors, !enable_cache); | |||
| graph_compiler_info_ptr = graph_compiler_info.get(); | |||
| auto ret = actor_to_graph_compiler_info_.try_emplace(actor_info, std::move(graph_compiler_info)); | |||
| if (!ret.second) { | |||
| MS_LOG(WARNING) << "ActorInfo:" << actor_info << " already exist in the map."; | |||
| } | |||
| } | |||
| RunOpInternal(single_op_cache_hit, graph_compiler_info_ptr, op_run_info, outputs); | |||
| } | |||
| void MindRTBackend::CompileSingleOpGraph(const KernelGraphPtr &graph, const DeviceContext *device_context, | |||
| GraphCompilerInfo *graph_compiler_info) const { | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_EXCEPTION_IF_NULL(device_context); | |||
| graph_compiler_->BuildSingleOpGraphs({graph}, device_context); | |||
| MS_EXCEPTION_IF_NULL(graph_compiler_info); | |||
| auto actor_set = runtime::GraphScheduler::GetInstance().Transform(*graph_compiler_info); | |||
| graph_compiler_info->input_tensors_.clear(); | |||
| // Actor::Init() is called in Schedule. | |||
| // Workspace need to be initialized in Actor::Init(). | |||
| // So `Schedule` need to execute after `CreateKernelWorkspaceDeviceAddress`. | |||
| runtime::GraphScheduler::GetInstance().Schedule(actor_set); | |||
| } | |||
| } // namespace compile | |||
| } // namespace mindspore | |||
| @@ -32,6 +32,7 @@ | |||
| #include "backend/session/session_basic.h" | |||
| #include "runtime/hardware/device_context.h" | |||
| #include "runtime/framework/graph_scheduler.h" | |||
| #include "runtime/op_builder/op_lazy_builder.h" | |||
| namespace mindspore { | |||
| namespace compile { | |||
| @@ -108,21 +109,19 @@ class MindRTBackend : public Backend { | |||
| // all sub graphs to call CompileGraph. | |||
| const ActorInfo &CompileGraphs(const FuncGraphPtr &root_graph); | |||
| // Compile single op kernel graph in the pyNative mode. | |||
| const ActorInfo &CompileGraph(const OpRunInfo &op_run_info, const GraphInfo &graph_info, | |||
| const std::vector<int64_t> *tensors_mask, | |||
| std::vector<tensor::TensorPtr> *input_tensors); | |||
| // Run Graph in the graph mode. | |||
| void RunGraph(const ActorInfo &actor_info, const VectorRef &args, VectorRef *outputs); | |||
| // Run Graph in the pyNative mode. | |||
| void RunGraph(const ActorInfo &actor_info, OpRunInfo *op_run_info, const std::vector<int64_t> *tensors_mask, | |||
| const std::vector<tensor::TensorPtr> *input_tensors, VectorRef *outputs); | |||
| // Run single op in the PyNative mode. | |||
| void RunOp(OpRunInfo *op_run_info, VectorRef *outputs); | |||
| #ifdef ENABLE_DEBUGGER | |||
| void SetDebuggerInit(); | |||
| #endif | |||
| // Execute all tasks in queue when lazy build is enabled in PyNative mode. | |||
| void SyncLazyTasks() const; | |||
| // Clear resource when python exit. | |||
| void ClearOpBuilderResource() const; | |||
| private: | |||
| // The parameter func_graph is a graph, it can be either a root graph or a sub graph, | |||
| // The result of graph compiler is stored in graph_id_to_device_context_ and control_nodes_. | |||
| @@ -132,6 +131,13 @@ class MindRTBackend : public Backend { | |||
| // Compile the kernel graph by the segment which is from the function graph partition. | |||
| void CompileGraph(const GraphSegmentPtr &segment, bool contain_multi_target); | |||
| // CreateKernel, Transform and Schedule have not been finished when LazyBuild is enabled in PyNative mode. | |||
| void CompileSingleOpGraph(const KernelGraphPtr &graph, const DeviceContext *device_context, | |||
| GraphCompilerInfo *graph_compiler_info) const; | |||
| // Get saved OpBuildTask in OpLazyBuilder and build all the kernels together in PyNative mode. | |||
| void CompileSingleOpGraphs(const std::vector<std::shared_ptr<runtime::OpTask>> &build_tasks); | |||
| // Restore the outputs tuple by the origin funcGraph output node and output tensors. | |||
| void ConstructOutputs(const AnfNodePtr &output_node, const std::vector<tensor::TensorPtr> &output_tensors, | |||
| size_t *output_position, VectorRef *outputs); | |||
| @@ -149,6 +155,18 @@ class MindRTBackend : public Backend { | |||
| // so the latest single op cache should be erased when cache list size exceeds threshold value. | |||
| void EraseSingleOpCache(const ActorInfo &actor_info, const KernelGraphPtr &graph); | |||
| // Run op immediately when the single_op_cache hit and the queue of OpLazyBuilder is empty in PyNative mode. | |||
| void RunSingleOpGraph(const KernelGraphPtr &graph, const std::vector<session::KernelWithIndex> &output_nodes, | |||
| const OpRunInfo &op_run_info, const GraphCompilerInfo *graph_compiler_info, | |||
| DeviceContext *device_context); | |||
| // Execute OpBuildTask and OpRunTask when the OpLazyBuilder queue is full in PyNative mode. | |||
| void LazyExecuteTaskCallback(); | |||
| // Run op immediately or save OpBuildTask and OpRunTask in OpLazyBuilder. | |||
| void RunOpInternal(bool single_op_cache_hit, GraphCompilerInfo *graph_compiler_info, OpRunInfo *op_run_info, | |||
| VectorRef *outputs); | |||
| // Split complete kernel graph to single op graph in PyNative back | |||
| // propagation, then compile and run single op graph. | |||
| void RunGraphBySingleOp(const std::vector<KernelGraphPtr> &graphs, | |||
| @@ -482,18 +482,5 @@ void FinalVM::InstPushPrim(const VectorRef &args) { | |||
| MS_LOG(DEBUG) << "End"; | |||
| } | |||
| void FinalVM::SyncData(const py::object &arg) { | |||
| if (py::isinstance<py::tuple>(arg)) { | |||
| auto arg_list = py::cast<py::tuple>(arg); | |||
| for (size_t i = 0; i < arg_list.size(); i++) { | |||
| SyncData(arg_list[i]); | |||
| } | |||
| } | |||
| if (py::isinstance<tensor::Tensor>(arg)) { | |||
| auto tensor = py::cast<tensor::TensorPtr>(arg); | |||
| tensor->data_sync(); | |||
| } | |||
| } | |||
| } // namespace compile | |||
| } // namespace mindspore | |||
| @@ -130,7 +130,6 @@ class FinalVM { | |||
| void Pushsp(); | |||
| void Popsp(); | |||
| void DoJmp(const BaseRef &jmp); | |||
| void SyncData(const py::object &args); | |||
| private: | |||
| InstSet insts_; | |||