Merge pull request !25132 from caifubi/master-pynative-mindrt-gpu-async-buildtags/v1.6.0
| @@ -220,6 +220,7 @@ set(SUB_COMP | |||
| runtime/device | |||
| runtime/framework | |||
| runtime/hardware | |||
| runtime/op_builder | |||
| runtime/hccl_adapter | |||
| frontend/optimizer | |||
| frontend/parallel | |||
| @@ -70,11 +70,6 @@ bool DatasetIteratorKernel::Init(const CNodePtr &kernel_node) { | |||
| total_bytes_ += bytes; | |||
| } | |||
| handle_ = GpuBufferMgr::GetInstance().Open(0, queue_name_, output_size_list_); | |||
| if (handle_ == HandleMgr::INVALID_HANDLE) { | |||
| MS_LOG(EXCEPTION) << "Gpu Queue(" << queue_name_ << ") Open Failed"; | |||
| } | |||
| #ifndef ENABLE_SECURITY | |||
| auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(profiler_inst); | |||
| @@ -991,13 +991,13 @@ void AscendSession::BuildOpsInGraph(const GraphId &graph_id, const std::map<AnfN | |||
| InputTensorInfo input_tensor_info; | |||
| GetOpInputStubTensors(kernel, parameter_index, graph_inputs, op_output_info, &input_tensor_info); | |||
| // Get OpRunInfo and GraphInfo | |||
| OpRunInfo op_run_info; | |||
| GetSingleOpRunInfo(kernel, &op_run_info); | |||
| const GraphInfo &graph_info = GetSingleOpGraphInfo(kernel, input_tensor_info.input_tensors); | |||
| OpRunInfo op_run_info = GetSingleOpRunInfo(kernel, graph_info, input_tensor_info); | |||
| if (op_run_info.is_dynamic_shape) { | |||
| MS_LOG(INFO) << "BuildOpsInGraph stop, op " << op_run_info.op_name << " is dynamic shape."; | |||
| break; | |||
| } | |||
| const GraphInfo &graph_info = GetSingleOpGraphInfo(kernel, input_tensor_info.input_tensors); | |||
| const auto &single_op_graph_iter = run_op_graphs_.find(graph_info); | |||
| if (single_op_graph_iter != run_op_graphs_.end()) { | |||
| // if graph of same single op exists, the output tensor of current op should be generated | |||
| @@ -1218,20 +1218,29 @@ GraphInfo SessionBasic::GetSingleOpGraphInfo(const CNodePtr &kernel, | |||
| return graph_info; | |||
| } | |||
| void SessionBasic::GetSingleOpRunInfo(const CNodePtr cnode, OpRunInfo *run_info) { | |||
| OpRunInfo SessionBasic::GetSingleOpRunInfo(const CNodePtr &cnode, const GraphInfo &graph_info, | |||
| const InputTensorInfo &tensor_info) { | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| MS_EXCEPTION_IF_NULL(run_info); | |||
| auto primitive = AnfAlgo::GetCNodePrimitive(cnode); | |||
| run_info->primitive = primitive; | |||
| run_info->op_name = primitive->name(); | |||
| const auto &abstract = cnode->abstract(); | |||
| if (abstract == nullptr) { | |||
| MS_LOG(EXCEPTION) << "Abstract is nullptr, node = " << cnode->DebugString(); | |||
| } | |||
| run_info->abstract = abstract; | |||
| const auto &shape = abstract->BuildShape(); | |||
| MS_EXCEPTION_IF_NULL(shape); | |||
| run_info->is_dynamic_shape = shape->IsDynamic(); | |||
| OpRunInfo op_run_info = {.op_name = primitive->name(), | |||
| .primitive = primitive.get(), | |||
| .abstract = abstract, | |||
| .is_dynamic_shape = shape->IsDynamic(), | |||
| .is_auto_mixed_precision = false, | |||
| .lazy_build = false, | |||
| .next_op_name = std::string(), | |||
| .next_input_index = 0, | |||
| .graph_info = graph_info, | |||
| .tensor_mask = tensor_info.input_tensors_mask, | |||
| .input_tensors = tensor_info.input_tensors}; | |||
| return op_run_info; | |||
| } | |||
| void SessionBasic::GetParameterIndex(const KernelGraph *graph, const std::vector<tensor::TensorPtr> &inputs, | |||
| @@ -2143,7 +2152,7 @@ std::shared_ptr<KernelGraph> SessionBasic::ConstructSingleOpGraph(const OpRunInf | |||
| graph_sum_++; | |||
| std::vector<AnfNodePtr> inputs; | |||
| // set input[0] | |||
| PrimitivePtr op_prim = op_run_info.primitive; | |||
| auto op_prim = op_run_info.primitive; | |||
| MS_EXCEPTION_IF_NULL(op_prim); | |||
| // Decoupling of frontend PrimitivePy and backend Primitive | |||
| inputs.push_back(std::make_shared<ValueNode>(std::make_shared<Primitive>(*op_prim))); | |||
| @@ -2238,11 +2247,11 @@ void SessionBasic::BuildGraph(GraphId graph_id) { | |||
| executor_->BuildGraph(shared_from_this(), graph_id); | |||
| } | |||
| void SessionBasic::RunOp(OpRunInfo *op_run_info, const GraphInfo &graph_info, | |||
| std::vector<tensor::TensorPtr> *input_tensors, VectorRef *outputs, | |||
| const std::vector<int64_t> &tensors_mask) { | |||
| void SessionBasic::RunOp(OpRunInfo *op_run_info, VectorRef *outputs) { | |||
| MS_EXCEPTION_IF_NULL(executor_); | |||
| executor_->RunOp(shared_from_this(), op_run_info, graph_info, input_tensors, outputs, tensors_mask); | |||
| MS_EXCEPTION_IF_NULL(op_run_info); | |||
| executor_->RunOp(shared_from_this(), op_run_info, op_run_info->graph_info, &op_run_info->input_tensors, outputs, | |||
| op_run_info->tensor_mask); | |||
| } | |||
| void SessionBasic::RunOpsInGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, | |||
| @@ -2305,13 +2314,12 @@ void SessionBasic::RunOpsInGraphImpl(const GraphId &graph_id, const std::vector< | |||
| InputTensorInfo input_tensor_info; | |||
| GetOpInputTensors(kernel, op_output_map, parameter_index, inputs, &input_tensor_info); | |||
| VectorRef op_outputs; | |||
| // Get OpRunInfo and GraphInfo | |||
| OpRunInfo run_info; | |||
| GetSingleOpRunInfo(kernel, &run_info); | |||
| GraphInfo graph_info = GetSingleOpGraphInfo(kernel, input_tensor_info.input_tensors); | |||
| OpRunInfo run_info = GetSingleOpRunInfo(kernel, graph_info, input_tensor_info); | |||
| // Build and run current single op | |||
| VectorRef op_outputs; | |||
| RunOpImplOrigin(graph_info, &run_info, &input_tensor_info.input_tensors, &op_outputs, | |||
| input_tensor_info.input_tensors_mask); | |||
| graph_output_info.graph_output_tensors.clear(); | |||
| @@ -58,17 +58,20 @@ using AnyListPtr = std::shared_ptr<AnyList>; | |||
| struct OpRunInfo { | |||
| std::string op_name; | |||
| PrimitivePtr primitive; | |||
| Primitive *primitive; | |||
| AbstractBasePtr abstract; | |||
| bool is_dynamic_shape = false; | |||
| bool is_auto_mixed_precision = false; | |||
| bool lazy_build = false; | |||
| std::string next_op_name = ""; | |||
| std::string next_op_name; | |||
| #if defined(__APPLE__) | |||
| int next_input_index = 0; | |||
| #else | |||
| size_t next_input_index = 0; | |||
| #endif | |||
| std::string graph_info; | |||
| std::vector<int64_t> tensor_mask; | |||
| std::vector<tensor::TensorPtr> input_tensors; | |||
| }; | |||
| struct InputTensorInfo { | |||
| @@ -108,8 +111,7 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> { | |||
| void BuildGraph(GraphId graphId); | |||
| void RunGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs); | |||
| void RunGraphAsync(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs); | |||
| void RunOp(OpRunInfo *, const GraphInfo &, std::vector<tensor::TensorPtr> *input_tensors, VectorRef *outputs, | |||
| const std::vector<int64_t> &tensors_mask); | |||
| void RunOp(OpRunInfo *, VectorRef *outputs); | |||
| void RunOpsInGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs); | |||
| #ifndef ENABLE_SECURITY | |||
| @@ -276,7 +278,7 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> { | |||
| CNodePtr ConstructOutput(const AnfNodePtrList &outputs, const std::shared_ptr<KernelGraph> &graph); | |||
| // Generate graph info for a single op graph | |||
| GraphInfo GetSingleOpGraphInfo(const CNodePtr &kernel, const std::vector<tensor::TensorPtr> &input_tensors); | |||
| void GetSingleOpRunInfo(const CNodePtr cnode, OpRunInfo *run_info); | |||
| OpRunInfo GetSingleOpRunInfo(const CNodePtr &cnode, const GraphInfo &graph_info, const InputTensorInfo &tensor_info); | |||
| tensor::TensorPtr GetValueNodeOutputTensor(const AnfNodePtr &node, size_t output_index); | |||
| tensor::TensorPtr GetParameterOutputTensor(const AnfNodePtr &node, | |||
| const std::map<AnfNodePtr, size_t> ¶meter_index, | |||
| @@ -751,6 +751,9 @@ void UpdateTensorInfo(const tensor::TensorPtr &new_tensor, const std::vector<ten | |||
| pre_tensor->set_device_address(new_tensor->device_address()); | |||
| continue; | |||
| } | |||
| if (mind_rt_backend != nullptr) { | |||
| mind_rt_backend->SyncLazyTasks(); | |||
| } | |||
| // Replace data in device address when run in CPU device. | |||
| if (pre_tensor->device_address() != nullptr) { | |||
| auto old_device_address = std::dynamic_pointer_cast<device::DeviceAddress>(pre_tensor->device_address()); | |||
| @@ -811,6 +814,10 @@ py::object GetDstType(const TypeId &type_id) { | |||
| MS_EXCEPTION_IF_NULL(value); | |||
| return py::cast(value); | |||
| } | |||
| bool IsPyObjTypeInvalid(const py::object &obj) { | |||
| return !py::isinstance<tensor::Tensor>(obj) && !py::isinstance<py::int_>(obj) && !py::isinstance<py::float_>(obj); | |||
| } | |||
| } // namespace | |||
| py::object RealRunOp(const py::args &args) { | |||
| @@ -1276,7 +1283,7 @@ void ForwardExecutor::DoSignatureCast(const PrimitivePyPtr &prim, | |||
| continue; | |||
| } | |||
| if (!py::isinstance<tensor::Tensor>(obj) && !py::isinstance<py::int_>(obj) && !py::isinstance<py::float_>(obj)) { | |||
| if (IsPyObjTypeInvalid(obj)) { | |||
| MS_EXCEPTION(TypeError) << "For '" << prim->name() << "', the " << i << "th input " << signature[i].name | |||
| << " is a not support implicit conversion. " | |||
| << "Its type is " << py::cast<std::string>(obj.attr("__class__").attr("__name__")) | |||
| @@ -2013,28 +2020,35 @@ py::object ForwardExecutor::RunOpInMs(const OpExecInfoPtr &op_exec_info, Pynativ | |||
| ConvertAttrToUnifyMindIR(op_exec_info); | |||
| // get graph info for checking it whether existing in the cache | |||
| GetSingleOpGraphInfo(op_exec_info, input_tensors, tensors_mask, &graph_info); | |||
| VectorRef outputs; | |||
| #if defined(__APPLE__) | |||
| session::OpRunInfo op_run_info = {op_exec_info->op_name, | |||
| op_exec_info->py_primitive, | |||
| op_exec_info->py_primitive.get(), | |||
| op_exec_info->abstract, | |||
| op_exec_info->is_dynamic_shape, | |||
| op_exec_info->is_mixed_precision_cast, | |||
| op_exec_info->lazy_build, | |||
| op_exec_info->next_op_name, | |||
| static_cast<int>(op_exec_info->next_input_index)}; | |||
| static_cast<int>(op_exec_info->next_input_index), | |||
| graph_info, | |||
| tensors_mask, | |||
| input_tensors}; | |||
| #else | |||
| session::OpRunInfo op_run_info = {op_exec_info->op_name, | |||
| op_exec_info->py_primitive, | |||
| op_exec_info->py_primitive.get(), | |||
| op_exec_info->abstract, | |||
| op_exec_info->is_dynamic_shape, | |||
| op_exec_info->is_mixed_precision_cast, | |||
| op_exec_info->lazy_build, | |||
| op_exec_info->next_op_name, | |||
| op_exec_info->next_input_index}; | |||
| op_exec_info->next_input_index, | |||
| graph_info, | |||
| tensors_mask, | |||
| input_tensors}; | |||
| #endif | |||
| VectorRef outputs; | |||
| if (!ms_context->get_param<bool>(MS_CTX_ENABLE_MINDRT)) { | |||
| kSession->RunOp(&op_run_info, graph_info, &input_tensors, &outputs, tensors_mask); | |||
| kSession->RunOp(&op_run_info, &outputs); | |||
| } else { | |||
| if (mind_rt_backend == nullptr) { | |||
| const auto &device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET); | |||
| @@ -2043,9 +2057,7 @@ py::object ForwardExecutor::RunOpInMs(const OpExecInfoPtr &op_exec_info, Pynativ | |||
| } | |||
| mindspore::ScopedLongRunning long_running; | |||
| const compile::ActorInfo &actor_info = | |||
| mind_rt_backend->CompileGraph(op_run_info, graph_info, &tensors_mask, &input_tensors); | |||
| mind_rt_backend->RunGraph(actor_info, &op_run_info, &tensors_mask, &input_tensors, &outputs); | |||
| mind_rt_backend->RunOp(&op_run_info, &outputs); | |||
| } | |||
| if (op_exec_info->is_dynamic_shape) { | |||
| @@ -3222,6 +3234,9 @@ void PynativeExecutor::ClearGrad(const py::object &cell, const py::args &args) { | |||
| void PynativeExecutor::ClearRes() { | |||
| MS_LOG(DEBUG) << "Clear all res"; | |||
| session::PynativeTaskManager::GetInstance().Reset(); | |||
| if (mind_rt_backend != nullptr) { | |||
| mind_rt_backend->ClearOpBuilderResource(); | |||
| } | |||
| SetLazyBuild(false); | |||
| cell_depth_ = 0; | |||
| @@ -3279,6 +3294,8 @@ void PynativeExecutor::Sync() { | |||
| auto ms_context = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(ms_context); | |||
| ExecuteAllTask(); | |||
| if (!ms_context->get_param<bool>(MS_CTX_ENABLE_MINDRT)) { | |||
| if (kSession == nullptr) { | |||
| MS_EXCEPTION(NotExistsError) << "No session has been created!"; | |||
| @@ -3312,7 +3329,12 @@ void PynativeExecutor::ExitCell() { | |||
| bool PynativeExecutor::IsTopCell() const { return cell_depth_ == 0; } | |||
| void PynativeExecutor::ExecuteAllTask() { session::PynativeTaskManager::GetInstance().ExecuteRemainingTasks(); } | |||
| void PynativeExecutor::ExecuteAllTask() { | |||
| session::PynativeTaskManager::GetInstance().ExecuteRemainingTasks(); | |||
| if (mind_rt_backend != nullptr) { | |||
| mind_rt_backend->SyncLazyTasks(); | |||
| } | |||
| } | |||
| REGISTER_PYBIND_DEFINE(PynativeExecutor_, ([](const py::module *m) { | |||
| (void)py::class_<PynativeExecutor, std::shared_ptr<PynativeExecutor>>(*m, "PynativeExecutor_") | |||
| @@ -18,7 +18,9 @@ | |||
| #include <numeric> | |||
| #include <map> | |||
| #include <utility> | |||
| #include <algorithm> | |||
| #include "runtime/framework/graph_scheduler.h" | |||
| #include "runtime/op_builder/op_lazy_builder.h" | |||
| #include "runtime/device/device_address.h" | |||
| #include "common/trans.h" | |||
| #include "utils/convert_utils.h" | |||
| @@ -179,16 +181,16 @@ void CreateKernelOutputDeviceAddress(const DeviceContext *device_context, const | |||
| if (AnfAlgo::IsControlOpExecInBackend(kernel)) { | |||
| continue; | |||
| } | |||
| auto kernel_mod = AnfAlgo::GetKernelMod(kernel); | |||
| MS_EXCEPTION_IF_NULL(kernel_mod); | |||
| auto output_sizes = kernel_mod->GetOutputSizeList(); | |||
| for (size_t i = 0; i < output_sizes.size(); ++i) { | |||
| auto output_size = AnfAlgo::GetOutputTensorNum(kernel); | |||
| for (size_t i = 0; i < output_size; ++i) { | |||
| if (AnfAlgo::OutputAddrExist(kernel, i)) { | |||
| continue; | |||
| } | |||
| auto output_format = AnfAlgo::GetOutputFormat(kernel, i); | |||
| auto output_type = AnfAlgo::GetOutputDeviceDataType(kernel, i); | |||
| auto device_address = device_context->CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type); | |||
| auto address_size = AnfAlgo::GetOutputTensorMemSize(kernel, i); | |||
| auto device_address = device_context->CreateDeviceAddress(nullptr, address_size, output_format, output_type); | |||
| MS_LOG(DEBUG) << "Create addr for node:" << AnfAlgo::GetNodeDebugString(kernel) << " addr:" << device_address; | |||
| AnfAlgo::SetOutputAddr(device_address, i, kernel.get()); | |||
| } | |||
| @@ -465,13 +467,12 @@ GraphId GraphCompiler::CompileGraphImpl(const KernelGraphPtr &graph, const Devic | |||
| return graph->graph_id(); | |||
| } | |||
| GraphId GraphCompiler::CompileGraph(const session::OpRunInfo &op_run_info, const GraphInfo &graph_info, | |||
| const std::vector<int64_t> *tensors_mask, | |||
| std::vector<TensorPtr> *const input_tensors, bool *single_op_cache_hit, | |||
| GraphId GraphCompiler::CompileGraph(const session::OpRunInfo &op_run_info, bool *single_op_cache_hit, | |||
| const DeviceContext *device_context) { | |||
| // Check if the graph cache exists. | |||
| auto iter = run_op_graphs_.find(graph_info); | |||
| if (iter != run_op_graphs_.end()) { | |||
| auto iter = run_op_graphs_.find(op_run_info.graph_info); | |||
| auto &op_lazy_builder = runtime::OpLazyBuilder::GetInstance(); | |||
| if (iter != run_op_graphs_.end() && op_lazy_builder.QueueEmpty()) { | |||
| const auto &graph = iter->second; | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| *single_op_cache_hit = true; | |||
| @@ -480,22 +481,20 @@ GraphId GraphCompiler::CompileGraph(const session::OpRunInfo &op_run_info, const | |||
| *single_op_cache_hit = false; | |||
| // Generate kernel graph. | |||
| MS_EXCEPTION_IF_NULL(session_); | |||
| KernelGraphPtr graph = session_->ConstructSingleOpGraph(op_run_info, *input_tensors, *tensors_mask); | |||
| KernelGraphPtr graph = | |||
| session_->ConstructSingleOpGraph(op_run_info, op_run_info.input_tensors, op_run_info.tensor_mask); | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_EXCEPTION_IF_NULL(device_context); | |||
| device_context->OptimizeSingleOpGraph(graph); | |||
| // Generate 'KernelMod' for kernel in graph. | |||
| device_context->CreateKernel(graph->execution_order()); | |||
| device_context->PreprocessBeforeRunSingleOpGraph(graph); | |||
| // Create device address for all anf nodes of graph. | |||
| CreateDeviceAddress(graph, device_context); | |||
| CreateDeviceAddressWithoutWorkspace(graph, device_context); | |||
| graph->set_is_all_nop_node(opt::IsAllNopNode(graph.get())); | |||
| run_op_graphs_[graph_info] = graph; | |||
| run_op_graphs_[op_run_info.graph_info] = graph; | |||
| auto output_nodes = graph->outputs(); | |||
| auto &outputs_with_index = run_op_graph_output_nodes_[graph->graph_id()]; | |||
| @@ -509,6 +508,22 @@ GraphId GraphCompiler::CompileGraph(const session::OpRunInfo &op_run_info, const | |||
| return graph->graph_id(); | |||
| } | |||
| void GraphCompiler::BuildSingleOpGraphs(const std::vector<KernelGraphPtr> &graphs, | |||
| const DeviceContext *device_context) const { | |||
| MS_EXCEPTION_IF_NULL(device_context); | |||
| std::vector<CNodePtr> node_to_build; | |||
| for (const auto &graph : graphs) { | |||
| const auto &nodes = graph->execution_order(); | |||
| std::copy(nodes.begin(), nodes.end(), std::back_inserter(node_to_build)); | |||
| } | |||
| device_context->CreateKernel(node_to_build); | |||
| for (const auto &graph : graphs) { | |||
| CreateKernelWorkspaceDeviceAddress(device_context, graph); | |||
| } | |||
| } | |||
| KernelGraphPtr GraphCompiler::Fetch(GraphId graph_id) const { | |||
| MS_EXCEPTION_IF_NULL(session_); | |||
| return session_->GetGraph(graph_id); | |||
| @@ -532,6 +547,14 @@ void GraphCompiler::CreateDeviceAddress(const KernelGraphPtr &graph, const Devic | |||
| UpdateDeviceAddressForRefNode(graph); | |||
| } | |||
| void GraphCompiler::CreateDeviceAddressWithoutWorkspace(const KernelGraphPtr &graph, | |||
| const DeviceContext *device_context) const { | |||
| CreateParameterDeviceAddress(device_context, graph); | |||
| CreateValueNodeDeviceAddress(device_context, graph); | |||
| CreateKernelOutputDeviceAddress(device_context, graph); | |||
| UpdateDeviceAddressForInplaceNode(graph); | |||
| } | |||
| void GraphCompiler::GetParamAndOutputIndex( | |||
| const KernelGraphPtr &graph, const std::vector<TensorPtr> &inputs, VectorRef *const outputs, | |||
| std::map<AnfNodePtr, size_t> *parameter_index, | |||
| @@ -560,12 +583,13 @@ TensorPtr GraphCompiler::GetSingleOpInputTensorByIndex(const CNodePtr &kernel, | |||
| input_index); | |||
| } | |||
| void GraphCompiler::GetSingleOpRunInfoAndGraphInfo(const CNodePtr &kernel, const std::vector<TensorPtr> &input_tensors, | |||
| OpRunInfo *const run_info, GraphInfo *const graph_info) { | |||
| void GraphCompiler::GetSingleOpRunInfoAndGraphInfo(const CNodePtr &kernel, const InputTensorInfo &tensor_info, | |||
| OpRunInfo *run_info, GraphInfo *graph_info) { | |||
| MS_EXCEPTION_IF_NULL(session_); | |||
| MS_EXCEPTION_IF_NULL(graph_info); | |||
| session_->GetSingleOpRunInfo(kernel, run_info); | |||
| *graph_info = session_->GetSingleOpGraphInfo(kernel, input_tensors); | |||
| *graph_info = session_->GetSingleOpGraphInfo(kernel, tensor_info.input_tensors); | |||
| *run_info = session_->GetSingleOpRunInfo(kernel, *graph_info, tensor_info); | |||
| } | |||
| void GraphCompiler::CalculateRefCount(const KernelGraphPtr &graph, std::map<KernelWithIndex, size_t> *ref_count) const { | |||
| @@ -104,9 +104,11 @@ class GraphCompiler { | |||
| GraphId CompileGraph(const FuncGraphPtr &func_graph, const DeviceContext *device_context); | |||
| // Construct single op kernel graph and compile the kernel graph in PyNative mode. | |||
| GraphId CompileGraph(const session::OpRunInfo &op_run_info, const GraphInfo &graph_info, | |||
| const std::vector<int64_t> *tensors_mask, std::vector<TensorPtr> *const input_tensors, | |||
| bool *single_op_cache_hit, const DeviceContext *device_context); | |||
| GraphId CompileGraph(const session::OpRunInfo &op_run_info, bool *single_op_cache_hit, | |||
| const DeviceContext *device_context); | |||
| // Create kernel and Create workspace for graphs in PyNative mode. | |||
| void BuildSingleOpGraphs(const std::vector<KernelGraphPtr> &graphs, const DeviceContext *device_context) const; | |||
| // Get graph by graph id, if not exist return nullptr, used in Graph mode. | |||
| KernelGraphPtr Fetch(GraphId graph_id) const; | |||
| @@ -135,8 +137,8 @@ class GraphCompiler { | |||
| InputTensorInfo *const input_tensor_info, size_t input_index); | |||
| // Get OpRunInfo and GraphInfo for single op compile and run. | |||
| void GetSingleOpRunInfoAndGraphInfo(const CNodePtr &kernel, const std::vector<TensorPtr> &input_tensors, | |||
| OpRunInfo *const run_info, GraphInfo *const graph_info); | |||
| void GetSingleOpRunInfoAndGraphInfo(const CNodePtr &kernel, const InputTensorInfo &tensor_info, OpRunInfo *run_info, | |||
| GraphInfo *graph_info); | |||
| // Calculate ref count of PyNative back propagation operators. | |||
| void CalculateRefCount(const KernelGraphPtr &graph, std::map<KernelWithIndex, size_t> *ref_count) const; | |||
| @@ -181,6 +183,9 @@ class GraphCompiler { | |||
| // Create device address for all anf nodes of graph. | |||
| void CreateDeviceAddress(const KernelGraphPtr &graph, const DeviceContext *device_context) const; | |||
| // Create device address for input and output of ops. | |||
| void CreateDeviceAddressWithoutWorkspace(const KernelGraphPtr &graph, const DeviceContext *device_context) const; | |||
| // Single op kernel graph cache for PyNative mode. | |||
| std::unordered_map<GraphInfo, KernelGraphPtr> run_op_graphs_; | |||
| // Single op kernel graph output nodes cache for PyNative mode. | |||
| @@ -0,0 +1,5 @@ | |||
| file(GLOB_RECURSE BUILDER_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} | |||
| "op_lazy_builder.cc") | |||
| set_property(SOURCE ${BUILDER_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE) | |||
| add_library(_mindspore_runtime_op_builder_obj OBJECT ${BUILDER_SRC_LIST}) | |||
| @@ -0,0 +1,45 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "runtime/op_builder/op_lazy_builder.h" | |||
| namespace mindspore::runtime { | |||
| void OpLazyBuilder::Register(const std::function<void()> &callback) { | |||
| execute_callback_ = callback; | |||
| registered_ = true; | |||
| } | |||
| void OpLazyBuilder::Reset() { | |||
| ClearAllResources(); | |||
| execute_callback_ = nullptr; | |||
| registered_ = false; | |||
| } | |||
| void OpLazyBuilder::ClearAllResources() { | |||
| op_build_tasks.clear(); | |||
| std::queue<std::shared_ptr<OpTask>> empty; | |||
| std::swap(op_run_tasks, empty); | |||
| } | |||
| void OpLazyBuilder::ExecuteRemainingTasks() { | |||
| if (!executing_) { | |||
| ExecuteGuard guard; | |||
| if (execute_callback_ != nullptr) { | |||
| execute_callback_(); | |||
| } | |||
| } | |||
| } | |||
| } // namespace mindspore::runtime | |||
| @@ -0,0 +1,120 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_OP_BUILDER_OP_LAZY_BUILDER_H_ | |||
| #define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_OP_BUILDER_OP_LAZY_BUILDER_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include <queue> | |||
| #include <map> | |||
| #include <string> | |||
| #include <utility> | |||
| #include "backend/session/kernel_graph.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "runtime/hardware/device_context.h" | |||
| #include "runtime/framework/graph_scheduler.h" | |||
| namespace mindspore::runtime { | |||
// Immutable bundle of everything needed to later build and run one single-op
// graph: compiler info, the kernel graph, its output nodes, a snapshot of the
// OpRunInfo, and the device context to execute on.
class OpLazyBuilderContext {
 public:
  OpLazyBuilderContext(GraphCompilerInfo *graph_compiler_info, KernelGraphPtr graph,
                       std::vector<session::KernelWithIndex> output_nodes, const session::OpRunInfo &op_run_info,
                       device::DeviceContext *device_context)
      : graph_compiler_info_(graph_compiler_info),
        graph_(std::move(graph)),
        output_nodes_(std::move(output_nodes)),
        op_run_info_(op_run_info),
        device_context_(device_context) {}
  ~OpLazyBuilderContext() = default;

  GraphCompilerInfo *graph_compiler_info() const { return graph_compiler_info_; }
  const KernelGraphPtr &graph() const { return graph_; }
  const std::vector<session::KernelWithIndex> &output_nodes() const { return output_nodes_; }
  const session::OpRunInfo &op_run_info() const { return op_run_info_; }
  device::DeviceContext *device_context() const { return device_context_; }

 private:
  // Non-owning. NOTE(review): assumed to outlive this context — confirm owner.
  GraphCompilerInfo *graph_compiler_info_;
  KernelGraphPtr graph_;
  std::vector<session::KernelWithIndex> output_nodes_;
  // Stored by value so the context remains valid after the caller's
  // OpRunInfo goes out of scope.
  session::OpRunInfo op_run_info_;
  // Non-owning. NOTE(review): presumably managed by DeviceContextManager —
  // confirm lifetime.
  device::DeviceContext *device_context_;
};
// Base class for deferred single-op work items. Holds the shared context that
// describes which graph to build/run and on which device.
class OpTask {
 public:
  explicit OpTask(std::shared_ptr<OpLazyBuilderContext> context) : context_(std::move(context)) {}
  virtual ~OpTask() = default;
  // Accessor for the task's context (shared with sibling tasks for the same op).
  const std::shared_ptr<OpLazyBuilderContext> &context() { return context_; }

 protected:
  std::shared_ptr<OpLazyBuilderContext> context_;
};
// Deferred "build this op's kernel" item, queued via PushOpBuildTask and
// executed later in a batch.
class OpBuildTask : public OpTask {
 public:
  explicit OpBuildTask(std::shared_ptr<OpLazyBuilderContext> context) : OpTask(std::move(context)) {}
  ~OpBuildTask() override = default;
};
// Deferred "run this op" item, queued via PushOpRunTask after the matching
// build task.
class OpRunTask : public OpTask {
 public:
  explicit OpRunTask(std::shared_ptr<OpLazyBuilderContext> context) : OpTask(std::move(context)) {}
  ~OpRunTask() override = default;
};
| class OpLazyBuilder { | |||
| public: | |||
| static OpLazyBuilder &GetInstance() { | |||
| static OpLazyBuilder instance; | |||
| return instance; | |||
| } | |||
| class ExecuteGuard { | |||
| public: | |||
| ExecuteGuard() { OpLazyBuilder::GetInstance().executing_ = true; } | |||
| ~ExecuteGuard() { OpLazyBuilder::GetInstance().executing_ = false; } | |||
| }; | |||
| void Register(const std::function<void()> &callback); | |||
| const std::vector<std::shared_ptr<OpTask>> &GetOpBuildTasks() const { return op_build_tasks; } | |||
| const std::queue<std::shared_ptr<OpTask>> &GetOpRunTasks() const { return op_run_tasks; } | |||
| void ClearOpBuildTasks() { op_build_tasks.clear(); } | |||
| void Reset(); | |||
| void ClearAllResources(); | |||
| void ExecuteRemainingTasks(); | |||
| void PushOpBuildTask(const std::shared_ptr<OpTask> &op_build_task) { op_build_tasks.push_back(op_build_task); } | |||
| void PushOpRunTask(const std::shared_ptr<OpTask> &op_run_task) { op_run_tasks.push(op_run_task); } | |||
| void PopOpRunTask() { op_run_tasks.pop(); } | |||
| bool QueueEmpty() const { return op_run_tasks.empty() && op_build_tasks.empty(); } | |||
| bool QueueFull() const { return op_build_tasks.size() > kMaxQueueSize || op_run_tasks.size() > kMaxQueueSize; } | |||
| bool registered() const { return registered_; } | |||
| private: | |||
| OpLazyBuilder() = default; | |||
| ~OpLazyBuilder() = default; | |||
| DISABLE_COPY_AND_ASSIGN(OpLazyBuilder); | |||
| std::vector<std::shared_ptr<OpTask>> op_build_tasks; | |||
| std::queue<std::shared_ptr<OpTask>> op_run_tasks; | |||
| std::function<void()> execute_callback_{nullptr}; | |||
| inline static size_t kMaxQueueSize = 100; | |||
| bool executing_{false}; | |||
| bool registered_{false}; | |||
| }; | |||
| } // namespace mindspore::runtime | |||
| #endif // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_OP_BUILDER_OP_LAZY_BUILDER_H_ | |||
| @@ -21,6 +21,7 @@ | |||
| #include "vm/transform.h" | |||
| #include "backend/session/session_factory.h" | |||
| #include "runtime/op_builder/op_lazy_builder.h" | |||
| #include "backend/optimizer/common/helper.h" | |||
| #include "pipeline/pynative/pynative_execute.h" | |||
| #include "pipeline/jit/parse/data_converter.h" | |||
| @@ -207,6 +208,8 @@ TensorPtr CreateOutputTensor(const AnfNodePtr &output_node, size_t output_index) | |||
| MS_EXCEPTION_IF_NULL(device_tensor); | |||
| tensor->set_device_address(device_tensor); | |||
| // MindRT is disabled in the multi graphs scenario | |||
| // Delete tensor->data_sync() when MindRT is enabled in all scenes. | |||
| auto ms_context = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(ms_context); | |||
| if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) != kPynativeMode) { | |||
| @@ -490,45 +493,6 @@ void MindRTBackend::CompileGraph(const GraphSegmentPtr &segment, bool contain_mu | |||
| } | |||
| } | |||
| const ActorInfo &MindRTBackend::CompileGraph(const OpRunInfo &op_run_info, const GraphInfo &graph_info, | |||
| const std::vector<int64_t> *tensors_mask, | |||
| std::vector<tensor::TensorPtr> *input_tensors) { | |||
| MS_EXCEPTION_IF_NULL(graph_compiler_); | |||
| // Get the device context. | |||
| const auto &device_context = | |||
| device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_name_, device_id_}); | |||
| MS_EXCEPTION_IF_NULL(device_context); | |||
| device_context->Initialize(); | |||
| bool single_op_cache_hit = true; | |||
| auto graph_id = graph_compiler_->CompileGraph(op_run_info, graph_info, tensors_mask, input_tensors, | |||
| &single_op_cache_hit, device_context); | |||
| // The actor set name: graph_id + single operator name. | |||
| std::string actor_info = std::to_string(graph_id) + "_" + op_run_info.op_name; | |||
| if (single_op_cache_hit) { | |||
| auto iter = actor_to_graph_compiler_info_.find(actor_info); | |||
| if (iter == actor_to_graph_compiler_info_.end()) { | |||
| MS_LOG(EXCEPTION) << "Can not find graph compiler info for actor set: " << actor_info; | |||
| } | |||
| return iter->first; | |||
| } | |||
| graph_info_to_device_context_.clear(); | |||
| graph_info_to_device_context_[graph_info] = device_context; | |||
| auto context_ptr = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context_ptr); | |||
| bool enable_cache = context_ptr->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE); | |||
| auto graph_compiler_info = ConstructGraphCompilerInfo(actor_info, tensors_mask, input_tensors, !enable_cache); | |||
| const auto actor_set = runtime::GraphScheduler::GetInstance().Transform(*graph_compiler_info); | |||
| runtime::GraphScheduler::GetInstance().Schedule(actor_set); | |||
| MS_EXCEPTION_IF_NULL(graph_compiler_info); | |||
| graph_compiler_info->input_tensors_.clear(); | |||
| auto ret = actor_to_graph_compiler_info_.emplace(actor_info, std::move(graph_compiler_info)); | |||
| return ret.first->first; | |||
| } | |||
| namespace { | |||
| void GetControlOpInput(const std::shared_ptr<GraphCompiler> &graph_compiler, const CNodePtr &front_cnode, | |||
| const CNodePtr &backend_cnode, const std::map<KernelWithIndex, tensor::TensorPtr> &op_output_map, | |||
| @@ -776,6 +740,7 @@ void PushTupleTensor(const VectorRef &args, const std::vector<AnfNodePtr> ¶m | |||
| void MindRTBackend::RunGraphBySingleOp(const std::vector<KernelGraphPtr> &graphs, | |||
| const std::vector<std::vector<tensor::TensorPtr>> &inputs, VectorRef *outputs) { | |||
| runtime::OpLazyBuilder::GetInstance().ExecuteRemainingTasks(); | |||
| MS_EXCEPTION_IF_NULL(graph_compiler_); | |||
| for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { | |||
| const auto &graph = graphs[graph_index]; | |||
| @@ -810,16 +775,14 @@ void MindRTBackend::RunGraphBySingleOp(const std::vector<KernelGraphPtr> &graphs | |||
| GraphInfo graph_info; | |||
| graph_compiler_->GetSingleOpInputTensors(kernel, op_output_map, parameter_index, inputs[graph_index], | |||
| &input_tensor_info); | |||
| graph_compiler_->GetSingleOpRunInfoAndGraphInfo(kernel, input_tensor_info.input_tensors, &op_run_info, | |||
| &graph_info); | |||
| graph_compiler_->GetSingleOpRunInfoAndGraphInfo(kernel, input_tensor_info, &op_run_info, &graph_info); | |||
| const ActorInfo &actor_info = CompileGraph(op_run_info, graph_info, &input_tensor_info.input_tensors_mask, | |||
| &input_tensor_info.input_tensors); | |||
| RunGraph(actor_info, &op_run_info, &input_tensor_info.input_tensors_mask, &input_tensor_info.input_tensors, | |||
| &op_outputs); | |||
| RunOp(&op_run_info, &op_outputs); | |||
| } else { | |||
| RunControlOperator(graph_compiler_, graph, kernel, op_output_map, parameter_index, inputs[graph_index], | |||
| &input_tensor_info, &op_outputs); | |||
| // Execute remaining lazy tasks before PyNative hook exit. | |||
| runtime::OpLazyBuilder::GetInstance().ExecuteRemainingTasks(); | |||
| } | |||
| graph_compiler_->UpdateRefCount(input_tensor_info.input_kernel, &cnode_ref_count, &op_output_map); | |||
| @@ -999,6 +962,10 @@ void MindRTBackend::SetDebuggerInit() { | |||
| } | |||
| #endif | |||
| void MindRTBackend::SyncLazyTasks() const { runtime::OpLazyBuilder::GetInstance().ExecuteRemainingTasks(); } | |||
| void MindRTBackend::ClearOpBuilderResource() const { runtime::OpLazyBuilder::GetInstance().Reset(); } | |||
| std::unique_ptr<GraphCompilerInfo> MindRTBackend::ConstructGraphCompilerInfo(const FuncGraphPtr &root_graph) { | |||
| MS_EXCEPTION_IF_NULL(root_graph); | |||
| MS_EXCEPTION_IF_NULL(graph_compiler_); | |||
| @@ -1092,22 +1059,21 @@ void MindRTBackend::EraseSingleOpCache(const ActorInfo &actor_info, const Kernel | |||
| actor_to_graph_compiler_info_.erase(actor_info); | |||
| } | |||
| void MindRTBackend::RunGraph(const ActorInfo &actor_info, OpRunInfo *op_run_info, | |||
| const std::vector<int64_t> *tensors_mask, | |||
| const std::vector<tensor::TensorPtr> *input_tensors, VectorRef *outputs) { | |||
| MS_EXCEPTION_IF_NULL(input_tensors); | |||
| MS_EXCEPTION_IF_NULL(op_run_info); | |||
| MS_EXCEPTION_IF_NULL(tensors_mask); | |||
| void MindRTBackend::RunSingleOpGraph(const KernelGraphPtr &graph, | |||
| const std::vector<session::KernelWithIndex> &output_nodes, | |||
| const OpRunInfo &op_run_info, const GraphCompilerInfo *graph_compiler_info, | |||
| DeviceContext *device_context) { | |||
| // Erase value node tensor. | |||
| std::vector<tensor::TensorPtr> tensors_without_value_node; | |||
| if (input_tensors->size() != tensors_mask->size()) { | |||
| MS_LOG(EXCEPTION) << "Input tensors size " << input_tensors->size() << " should be equal to tensors mask size " | |||
| << tensors_mask->size(); | |||
| } | |||
| for (size_t index = 0; index < tensors_mask->size(); ++index) { | |||
| if (tensors_mask->at(index) != kValueNodeTensorMask) { | |||
| (void)tensors_without_value_node.emplace_back(input_tensors->at(index)); | |||
| const auto &input_tensors = op_run_info.input_tensors; | |||
| const auto &tensors_mask = op_run_info.tensor_mask; | |||
| if (input_tensors.size() != tensors_mask.size()) { | |||
| MS_LOG(EXCEPTION) << "Input tensors size " << input_tensors.size() << " should be equal to tensors mask size " | |||
| << tensors_mask.size(); | |||
| } | |||
| for (size_t index = 0; index < tensors_mask.size(); ++index) { | |||
| if (tensors_mask.at(index) != kValueNodeTensorMask) { | |||
| (void)tensors_without_value_node.emplace_back(input_tensors.at(index)); | |||
| } | |||
| } | |||
| @@ -1119,30 +1085,11 @@ void MindRTBackend::RunGraph(const ActorInfo &actor_info, OpRunInfo *op_run_info | |||
| } | |||
| // Run actor DAG. | |||
| const auto &actor_set = runtime::GraphScheduler::GetInstance().Fetch(actor_info); | |||
| const auto &actor_set = runtime::GraphScheduler::GetInstance().Fetch(graph_compiler_info->name_); | |||
| MS_EXCEPTION_IF_NULL(actor_set); | |||
| runtime::GraphScheduler::GetInstance().Run(actor_set, {}, {tensors_without_value_node}, *input_tensors, | |||
| runtime::GraphScheduler::GetInstance().Run(actor_set, {}, {tensors_without_value_node}, input_tensors, | |||
| runtime::GraphExecutionStrategy::kStep); | |||
| // Fetch outputs. | |||
| const auto &graph_iter = actor_to_graph_compiler_info_.find(actor_info); | |||
| if (graph_iter == actor_to_graph_compiler_info_.end()) { | |||
| MS_LOG(EXCEPTION) << "Can't find the graph compiler info."; | |||
| } | |||
| MS_EXCEPTION_IF_NULL(graph_iter->second); | |||
| const auto &graph_compiler_info = *(graph_iter->second); | |||
| const auto &graph = graph_compiler_info.graphs_.front(); | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_EXCEPTION_IF_NULL(graph_compiler_); | |||
| const auto &output_nodes = graph_compiler_->GetGraphOutputNodes(graph->graph_id()); | |||
| MS_EXCEPTION_IF_NULL(outputs); | |||
| UpdateOutput(output_nodes, outputs); | |||
| // Update output abstract of dynamic op to op_run_info | |||
| if (op_run_info->is_dynamic_shape) { | |||
| UpdateOutputAbstract(graph, op_run_info); | |||
| } | |||
| // Release the kernel resource. | |||
| const auto &kernels = graph->execution_order(); | |||
| for (const auto &kernel : kernels) { | |||
| @@ -1154,14 +1101,171 @@ void MindRTBackend::RunGraph(const ActorInfo &actor_info, OpRunInfo *op_run_info | |||
| } | |||
| } | |||
| } | |||
| } | |||
| void MindRTBackend::CompileSingleOpGraphs(const std::vector<std::shared_ptr<runtime::OpTask>> &build_tasks) { | |||
| if (build_tasks.empty()) { | |||
| return; | |||
| } | |||
| std::vector<KernelGraphPtr> graphs; | |||
| std::vector<GraphCompilerInfo *> graph_compiler_infos; | |||
| for (const auto &task : build_tasks) { | |||
| MS_EXCEPTION_IF_NULL(task); | |||
| const auto &context = task->context(); | |||
| MS_EXCEPTION_IF_NULL(context); | |||
| graphs.push_back(context->graph()); | |||
| graph_compiler_infos.push_back(context->graph_compiler_info()); | |||
| } | |||
| MS_EXCEPTION_IF_NULL(build_tasks[0]); | |||
| auto &task_context = build_tasks[0]->context(); | |||
| MS_EXCEPTION_IF_NULL(task_context); | |||
| auto device_context = task_context->device_context(); | |||
| graph_compiler_->BuildSingleOpGraphs(graphs, device_context); | |||
| for (const auto &graph_compiler_info : graph_compiler_infos) { | |||
| MS_EXCEPTION_IF_NULL(graph_compiler_info); | |||
| auto actor_set = runtime::GraphScheduler::GetInstance().Transform(*graph_compiler_info); | |||
| graph_compiler_info->input_tensors_.clear(); | |||
| runtime::GraphScheduler::GetInstance().Schedule(actor_set); | |||
| } | |||
| } | |||
| void MindRTBackend::LazyExecuteTaskCallback() { | |||
| auto &op_lazy_builder = runtime::OpLazyBuilder::GetInstance(); | |||
| if (op_lazy_builder.QueueEmpty()) { | |||
| return; | |||
| } | |||
| try { | |||
| MS_LOG(DEBUG) << "Start"; | |||
| auto ms_context = MsContext::GetInstance(); | |||
| auto infer_flag = ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER); | |||
| ms_context->set_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER, true); | |||
| CompileSingleOpGraphs(op_lazy_builder.GetOpBuildTasks()); | |||
| op_lazy_builder.ClearOpBuildTasks(); | |||
| // Run op one by one | |||
| auto &op_run_tasks = op_lazy_builder.GetOpRunTasks(); | |||
| while (!op_run_tasks.empty()) { | |||
| auto &op_run_task = op_run_tasks.front(); | |||
| const auto &context = op_run_task->context(); | |||
| RunSingleOpGraph(context->graph(), context->output_nodes(), context->op_run_info(), | |||
| context->graph_compiler_info(), context->device_context()); | |||
| UpdateOutputDeviceAddress(context->output_nodes(), context->device_context()); | |||
| UpdateInputDeviceAddress(context->graph()); | |||
| op_lazy_builder.PopOpRunTask(); | |||
| } | |||
| ms_context->set_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER, infer_flag); | |||
| MS_LOG(DEBUG) << "End"; | |||
| } catch (const std::exception &ex) { | |||
| op_lazy_builder.Reset(); | |||
| throw(std::runtime_error(ex.what())); | |||
| } catch (...) { | |||
| op_lazy_builder.Reset(); | |||
| std::string exName(abi::__cxa_current_exception_type()->name()); | |||
| MS_LOG(EXCEPTION) << "Error occurred when execute task in queue. Exception name: " << exName; | |||
| } | |||
| } | |||
| // Update device address for input and output of graph. | |||
| UpdateOutputDeviceAddress(output_nodes, graph_compiler_info.device_contexts_.front()); | |||
| UpdateInputDeviceAddress(graph); | |||
| void MindRTBackend::RunOpInternal(bool single_op_cache_hit, GraphCompilerInfo *graph_compiler_info, | |||
| OpRunInfo *op_run_info, VectorRef *outputs) { | |||
| MS_EXCEPTION_IF_NULL(op_run_info); | |||
| MS_EXCEPTION_IF_NULL(graph_compiler_info); | |||
| // Fetch outputs. | |||
| const auto &graph = graph_compiler_info->graphs_.front(); | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_EXCEPTION_IF_NULL(graph_compiler_); | |||
| const auto &output_nodes = graph_compiler_->GetGraphOutputNodes(graph->graph_id()); | |||
| MS_EXCEPTION_IF_NULL(outputs); | |||
| if (graph_compiler_info.need_erase_) { | |||
| EraseSingleOpCache(actor_info, graph); | |||
| auto device_context = graph_compiler_info->device_contexts_.front(); | |||
| auto &op_lazy_builder = runtime::OpLazyBuilder::GetInstance(); | |||
| bool lazy_build_disabled = graph_compiler_info->need_erase_ || | |||
| (single_op_cache_hit && op_lazy_builder.QueueEmpty()) || !op_run_info->lazy_build; | |||
| if (lazy_build_disabled) { | |||
| if (!single_op_cache_hit) { | |||
| CompileSingleOpGraph(graph, device_context, graph_compiler_info); | |||
| } | |||
| RunSingleOpGraph(graph, output_nodes, *op_run_info, graph_compiler_info, device_context); | |||
| UpdateOutput(output_nodes, outputs); | |||
| UpdateOutputDeviceAddress(output_nodes, device_context); | |||
| UpdateInputDeviceAddress(graph); | |||
| if (op_run_info->is_dynamic_shape) { | |||
| UpdateOutputAbstract(graph, op_run_info); | |||
| } | |||
| if (graph_compiler_info->need_erase_) { | |||
| EraseSingleOpCache(graph_compiler_info->name_, graph); | |||
| } | |||
| } else { | |||
| UpdateOutput(output_nodes, outputs); | |||
| auto run_op_context = std::make_shared<runtime::OpLazyBuilderContext>( | |||
| graph_compiler_info, graph, output_nodes, *op_run_info, graph_compiler_info->device_contexts_.front()); | |||
| if (!single_op_cache_hit) { | |||
| op_lazy_builder.PushOpBuildTask(std::make_shared<runtime::OpBuildTask>(run_op_context)); | |||
| } | |||
| op_lazy_builder.PushOpRunTask(std::make_shared<runtime::OpRunTask>(run_op_context)); | |||
| if (!op_lazy_builder.registered()) { | |||
| op_lazy_builder.Register([this]() { LazyExecuteTaskCallback(); }); | |||
| } | |||
| if (op_lazy_builder.QueueFull()) { | |||
| op_lazy_builder.ExecuteRemainingTasks(); | |||
| } | |||
| } | |||
| } | |||
| void MindRTBackend::RunOp(OpRunInfo *op_run_info, VectorRef *outputs) { | |||
| MS_EXCEPTION_IF_NULL(op_run_info); | |||
| MS_EXCEPTION_IF_NULL(graph_compiler_); | |||
| // Get the device context. | |||
| const auto &device_context = | |||
| device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_name_, device_id_}); | |||
| MS_EXCEPTION_IF_NULL(device_context); | |||
| device_context->Initialize(); | |||
| bool single_op_cache_hit = true; | |||
| auto graph_id = graph_compiler_->CompileGraph(*op_run_info, &single_op_cache_hit, device_context); | |||
| std::string actor_info = std::to_string(graph_id) + "_" + op_run_info->op_name; | |||
| GraphCompilerInfo *graph_compiler_info_ptr; | |||
| if (single_op_cache_hit) { | |||
| auto iter = actor_to_graph_compiler_info_.find(actor_info); | |||
| if (iter == actor_to_graph_compiler_info_.end()) { | |||
| MS_LOG(EXCEPTION) << "Can not find graph compiler info for actor set: " << actor_info; | |||
| } | |||
| graph_compiler_info_ptr = iter->second.get(); | |||
| } else { | |||
| graph_info_to_device_context_.clear(); | |||
| graph_info_to_device_context_[op_run_info->graph_info] = device_context; | |||
| auto context_ptr = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context_ptr); | |||
| bool enable_cache = context_ptr->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE); | |||
| auto graph_compiler_info = | |||
| ConstructGraphCompilerInfo(actor_info, &op_run_info->tensor_mask, &op_run_info->input_tensors, !enable_cache); | |||
| graph_compiler_info_ptr = graph_compiler_info.get(); | |||
| auto ret = actor_to_graph_compiler_info_.try_emplace(actor_info, std::move(graph_compiler_info)); | |||
| if (!ret.second) { | |||
| MS_LOG(WARNING) << "ActorInfo:" << actor_info << " already exist in the map."; | |||
| } | |||
| } | |||
| RunOpInternal(single_op_cache_hit, graph_compiler_info_ptr, op_run_info, outputs); | |||
| } | |||
| void MindRTBackend::CompileSingleOpGraph(const KernelGraphPtr &graph, const DeviceContext *device_context, | |||
| GraphCompilerInfo *graph_compiler_info) const { | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_EXCEPTION_IF_NULL(device_context); | |||
| graph_compiler_->BuildSingleOpGraphs({graph}, device_context); | |||
| MS_EXCEPTION_IF_NULL(graph_compiler_info); | |||
| auto actor_set = runtime::GraphScheduler::GetInstance().Transform(*graph_compiler_info); | |||
| graph_compiler_info->input_tensors_.clear(); | |||
| // Actor::Init() is called in Schedule. | |||
| // Workspace need to be initialized in Actor::Init(). | |||
| // So `Schedule` need to execute after `CreateKernelWorkspaceDeviceAddress`. | |||
| runtime::GraphScheduler::GetInstance().Schedule(actor_set); | |||
| } | |||
| } // namespace compile | |||
| } // namespace mindspore | |||
| @@ -32,6 +32,7 @@ | |||
| #include "backend/session/session_basic.h" | |||
| #include "runtime/hardware/device_context.h" | |||
| #include "runtime/framework/graph_scheduler.h" | |||
| #include "runtime/op_builder/op_lazy_builder.h" | |||
| namespace mindspore { | |||
| namespace compile { | |||
| @@ -108,21 +109,19 @@ class MindRTBackend : public Backend { | |||
| // all sub graphs to call CompileGraph. | |||
| const ActorInfo &CompileGraphs(const FuncGraphPtr &root_graph); | |||
| // Compile single op kernel graph in the pyNative mode. | |||
| const ActorInfo &CompileGraph(const OpRunInfo &op_run_info, const GraphInfo &graph_info, | |||
| const std::vector<int64_t> *tensors_mask, | |||
| std::vector<tensor::TensorPtr> *input_tensors); | |||
| // Run Graph in the graph mode. | |||
| void RunGraph(const ActorInfo &actor_info, const VectorRef &args, VectorRef *outputs); | |||
| // Run Graph in the pyNative mode. | |||
| void RunGraph(const ActorInfo &actor_info, OpRunInfo *op_run_info, const std::vector<int64_t> *tensors_mask, | |||
| const std::vector<tensor::TensorPtr> *input_tensors, VectorRef *outputs); | |||
| // Run single op in the PyNative mode. | |||
| void RunOp(OpRunInfo *op_run_info, VectorRef *outputs); | |||
| #ifdef ENABLE_DEBUGGER | |||
| void SetDebuggerInit(); | |||
| #endif | |||
| // Execute all tasks in queue when lazy build is enabled in PyNative mode. | |||
| void SyncLazyTasks() const; | |||
| // Clear resource when python exit. | |||
| void ClearOpBuilderResource() const; | |||
| private: | |||
| // The parameter func_graph is a graph, it can be either a root graph or a sub graph, | |||
| // The result of graph compiler is stored in graph_id_to_device_context_ and control_nodes_. | |||
| @@ -132,6 +131,13 @@ class MindRTBackend : public Backend { | |||
| // Compile the kernel graph by the segment which is from the function graph partition. | |||
| void CompileGraph(const GraphSegmentPtr &segment, bool contain_multi_target); | |||
| // CreateKernel, Transform and Schedule have not been finished when LazyBuild is enabled in PyNative mode. | |||
| void CompileSingleOpGraph(const KernelGraphPtr &graph, const DeviceContext *device_context, | |||
| GraphCompilerInfo *graph_compiler_info) const; | |||
| // Get saved OpBuildTask in OpLazyBuilder and build all the kernels together in PyNative mode. | |||
| void CompileSingleOpGraphs(const std::vector<std::shared_ptr<runtime::OpTask>> &build_tasks); | |||
| // Restore the outputs tuple by the origin funcGraph output node and output tensors. | |||
| void ConstructOutputs(const AnfNodePtr &output_node, const std::vector<tensor::TensorPtr> &output_tensors, | |||
| size_t *output_position, VectorRef *outputs); | |||
| @@ -149,6 +155,18 @@ class MindRTBackend : public Backend { | |||
| // so the latest single op cache should be erased when cache list size exceeds threshold value. | |||
| void EraseSingleOpCache(const ActorInfo &actor_info, const KernelGraphPtr &graph); | |||
| // Run op immediately when the single_op_cache hit and the queue of OpLazyBuilder is empty in PyNative mode. | |||
| void RunSingleOpGraph(const KernelGraphPtr &graph, const std::vector<session::KernelWithIndex> &output_nodes, | |||
| const OpRunInfo &op_run_info, const GraphCompilerInfo *graph_compiler_info, | |||
| DeviceContext *device_context); | |||
| // Execute OpBuildTask and OpRunTask when the OpLazyBuilder queue is full in PyNative mode. | |||
| void LazyExecuteTaskCallback(); | |||
| // Run op immediately or save OpBuildTask and OpRunTask in OpLazyBuilder. | |||
| void RunOpInternal(bool single_op_cache_hit, GraphCompilerInfo *graph_compiler_info, OpRunInfo *op_run_info, | |||
| VectorRef *outputs); | |||
| // Split complete kernel graph to single op graph in PyNative back | |||
| // propagation, then compile and run single op graph. | |||
| void RunGraphBySingleOp(const std::vector<KernelGraphPtr> &graphs, | |||
| @@ -482,18 +482,5 @@ void FinalVM::InstPushPrim(const VectorRef &args) { | |||
| MS_LOG(DEBUG) << "End"; | |||
| } | |||
| void FinalVM::SyncData(const py::object &arg) { | |||
| if (py::isinstance<py::tuple>(arg)) { | |||
| auto arg_list = py::cast<py::tuple>(arg); | |||
| for (size_t i = 0; i < arg_list.size(); i++) { | |||
| SyncData(arg_list[i]); | |||
| } | |||
| } | |||
| if (py::isinstance<tensor::Tensor>(arg)) { | |||
| auto tensor = py::cast<tensor::TensorPtr>(arg); | |||
| tensor->data_sync(); | |||
| } | |||
| } | |||
| } // namespace compile | |||
| } // namespace mindspore | |||
| @@ -130,7 +130,6 @@ class FinalVM { | |||
| void Pushsp(); | |||
| void Popsp(); | |||
| void DoJmp(const BaseRef &jmp); | |||
| void SyncData(const py::object &args); | |||
| private: | |||
| InstSet insts_; | |||