// zzy34407230 / mindspore2022
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "runtime/framework/graph_scheduler.h"
#include "runtime/framework/actor/memory_manager_actor.h"
#include "mindrt/src/actor/actormgr.h"
#include "mindrt/include/async/async.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "backend/optimizer/common/helper.h"
#include "utils/config_manager.h"
#include "utils/log_adapter.h"
#include "utils/convert_utils.h"
#include "common/trans.h"

namespace mindspore {
namespace runtime {
namespace {
// Returns true when the node is a CNode whose name equals kGetNextOpName, which is the kernel that the device queue
// data source actor is built around. Throws if node is null.
bool IsDeviceQueueDSActor(const AnfNodePtr &node) {
  MS_EXCEPTION_IF_NULL(node);
  return node->isa<CNode>() && (AnfAlgo::GetCNodeName(node) == kGetNextOpName);
}

// Returns true when the node is a Parameter that is not a weight, i.e. an input that is fed through the host queue
// data source actor. Throws if node is null.
bool IsHostQueueDSActor(const AnfNodePtr &node) {
  MS_EXCEPTION_IF_NULL(node);
  if (!node->isa<Parameter>()) {
    return false;
  }
  return !AnfAlgo::IsParameterWeight(node->cast<ParameterPtr>());
}

// Returns true when the node is a CNode other than the kGetNextOpName kernel, i.e. a node that is wrapped by a
// kernel actor. Throws if node is null.
bool IsKernelActor(const AnfNodePtr &node) {
  MS_EXCEPTION_IF_NULL(node);
  return node->isa<CNode>() && (AnfAlgo::GetCNodeName(node) != kGetNextOpName);
}

// Judge whether the device tensor of the node is persistent: this holds for value nodes and for weight parameters.
// Throws if node is null.
bool IsPersistentDeviceTensor(const AnfNodePtr &node) {
  MS_EXCEPTION_IF_NULL(node);
  return node->isa<ValueNode>() ||
         (node->isa<Parameter>() && AnfAlgo::IsParameterWeight(node->cast<ParameterPtr>()));
}

// Looks up the kernel actor registered under the given name.
// Returns a non-owning pointer to the actor, or nullptr when the map has no entry for the name.
KernelActor *FindKernelActor(const KernelMapActor &kernel_actors_map, const std::string &name) {
  const auto found = kernel_actors_map.find(name);
  return (found == kernel_actors_map.end()) ? nullptr : found->second.get();
}

// Returns the first data source actor whose name contains "_DeviceQueueDataSourceActor", cast to the device queue
// actor type, or nullptr when no actor name matches. Throws if any inspected actor is null.
DeviceQueueDataSourceActor *FindDeviceQueueDSActor(const std::vector<DataSourceActorPtr> &data_source_actors) {
  for (const auto &ds_actor : data_source_actors) {
    MS_EXCEPTION_IF_NULL(ds_actor);
    const bool name_matches = ds_actor->GetAID().Name().find("_DeviceQueueDataSourceActor") != std::string::npos;
    if (!name_matches) {
      continue;
    }
    return dynamic_cast<DeviceQueueDataSourceActor *>(ds_actor.get());
  }
  return nullptr;
}

// Returns the first data source actor whose name contains "_HostQueueDataSourceActor", cast to the host queue actor
// type, or nullptr when no actor name matches. Throws if any inspected actor is null.
HostQueueDataSourceActor *FindHostQueueDSActor(const std::vector<DataSourceActorPtr> &data_source_actors) {
  for (const auto &ds_actor : data_source_actors) {
    MS_EXCEPTION_IF_NULL(ds_actor);
    const bool name_matches = ds_actor->GetAID().Name().find("_HostQueueDataSourceActor") != std::string::npos;
    if (!name_matches) {
      continue;
    }
    return dynamic_cast<HostQueueDataSourceActor *>(ds_actor.get());
  }
  return nullptr;
}

// Update the reference count of the device tensor located at output_idx of node: the original reference count is
// increased by one and the current reference count is then reset.
// Throws if node or its output device tensor is null.
void UpdateRefCount(const AnfNodePtr &node, size_t output_idx) {
  MS_EXCEPTION_IF_NULL(node);
  const auto output_addr = AnfAlgo::GetMutableOutputAddr(node, output_idx);
  MS_EXCEPTION_IF_NULL(output_addr);
  output_addr->IncreaseOriginalRefCount();
  output_addr->ResetRefCount();
}

//  The branch processing of PrepareDataForValueNode that value type is tensor.
void PrepareDataForValueNodeTensor(const ValueNodePtr &node, const ValuePtr &node_value,
                                   const DeviceContext *device_context) {
  MS_EXCEPTION_IF_NULL(node);
  MS_EXCEPTION_IF_NULL(node_value);
  MS_EXCEPTION_IF_NULL(device_context);

  std::vector<TensorPtr> tensors;
  TensorValueToTensor(node_value, &tensors);

  for (size_t i = 0; i < tensors.size(); i++) {
    const auto &tensor = tensors[i];
    if (tensor == nullptr) {
      MS_LOG(WARNING) << "Tensor is null";
      return;
    }

    const auto &device_tensor = AnfAlgo::GetMutableOutputAddr(node, i);
    MS_EXCEPTION_IF_NULL(device_tensor);
    // If the ptr of device tensor is not nullptr, it indicates that the device data has been prepared.
    if (device_tensor->GetPtr() != nullptr) {
      return;
    }
    MS_LOG(INFO) << "Prepare device data for value node: " << node->fullname_with_scope() << ", output index: " << i;
    tensor->set_device_address(device_tensor);

    // Allocate device memory.
    if (!device_context->AllocateMemory(device_tensor.get(), device_tensor->GetSize())) {
      MS_LOG(EXCEPTION) << "Device memory isn't enough and alloc failed, node name: " << node->fullname_with_scope()
                        << ", alloc size: " << device_tensor->GetSize();
    }

    // Copy data from host tensor to device.
    if (!device_tensor->SyncHostToDevice(trans::GetRuntimePaddingShape(node, 0), LongToSize(tensor->data().nbytes()),
                                         tensor->data_type(), tensor->data_c())) {
      MS_LOG(EXCEPTION) << "SyncHostToDevice failed, node name: " << node->fullname_with_scope();
    }
  }
}

// Prepare the device data for persistent device tensor of value node.
void PrepareDataForValueNode(const ValueNodePtr &node, const DeviceContext *device_context) {
  MS_EXCEPTION_IF_NULL(node);
  MS_EXCEPTION_IF_NULL(device_context);
  auto &node_value = node->value();
  MS_EXCEPTION_IF_NULL(node_value);

  if (node_value->isa<tensor::Tensor>() || node_value->isa<ValueTuple>()) {
    //  The branch processing that value type is tensor.
    PrepareDataForValueNodeTensor(node, node_value, device_context);
  } else if (node_value->isa<StringImm>()) {
    const auto &device_tensor = AnfAlgo::GetMutableOutputAddr(node, 0);
    MS_EXCEPTION_IF_NULL(device_tensor);
    // If the ptr of device tensor is not nullptr, it indicates that the device data has been prepared.
    if (device_tensor->GetPtr() != nullptr) {
      return;
    }
    MS_LOG(INFO) << "Prepare device data for value node: " << node->fullname_with_scope();

    // Allocate device memory.
    if (!device_context->AllocateMemory(device_tensor.get(), device_tensor->GetSize())) {
      MS_LOG(EXCEPTION) << "Device memory isn't enough and alloc failed, node name: " << node->fullname_with_scope()
                        << ", alloc size: " << device_tensor->GetSize();
    }

    // Copy data from value to device.
    auto value = GetValue<std::string>(node_value);
    size_t tensor_size = value.size();
    ShapeVector shape = {1, SizeToLong(tensor_size)};
    if (!device_tensor->SyncHostToDevice(shape, tensor_size, kNumberTypeUInt8, value.data())) {
      MS_LOG(EXCEPTION) << "SyncHostToDevice failed, node name: " << node->fullname_with_scope();
    }
  }
}

// Prepare the device data for the persistent device tensor of a weight node from its host tensor.
// - If the host tensor already carries a device address, that address is treated as the newest one: when it differs
//   from the node's current output address it is installed on the node and inserted into the DeviceTensorStore,
//   and no copy is performed.
// - Otherwise, if the node's device tensor has no device memory yet, memory is allocated and the host data is copied.
// Throws if node, tensor or the node's device tensor is null, if device_context is null when memory has to be
// allocated, or if allocation / the host-to-device copy fails.
void PrepareDataForWeightNode(const AnfNodePtr &node, const TensorPtr &tensor, const DeviceContext *device_context) {
  MS_EXCEPTION_IF_NULL(node);
  MS_EXCEPTION_IF_NULL(tensor);
  const auto &device_tensor = AnfAlgo::GetMutableOutputAddr(node, 0);
  MS_EXCEPTION_IF_NULL(device_tensor);
  const auto &host_tensor_address = std::dynamic_pointer_cast<DeviceTensor>(tensor->device_address());
  // If the host tensor has the device address, it indicates that the device address of host tensor is new.
  if (host_tensor_address != nullptr) {
    if (host_tensor_address != device_tensor) {
      AnfAlgo::SetOutputAddr(host_tensor_address, 0, node.get());
      DeviceTensorStore::GetInstance().Insert(node.get(), host_tensor_address);
    }
    return;
  }

  // If the ptr of device tensor is not nullptr, it indicates that the device data has been prepared.
  if (device_tensor->GetPtr() != nullptr) {
    return;
  }

  // The device context is dereferenced from here on. It is validated at this point rather than at function entry so
  // that the early-return paths above, which never touch the context, behave exactly as before.
  MS_EXCEPTION_IF_NULL(device_context);

  MS_LOG(INFO) << "Prepare device data for weight node: " << node->fullname_with_scope();
  tensor->set_device_address(device_tensor);

  // Allocate device memory.
  if (!device_context->AllocateMemory(device_tensor.get(), device_tensor->GetSize())) {
    MS_LOG(EXCEPTION) << "Device memory isn't enough and alloc failed, node name: " << node->fullname_with_scope()
                      << ", alloc size: " << device_tensor->GetSize();
  }

  // Copy data from host tensor to device.
  if (!device_tensor->SyncHostToDevice(trans::GetRuntimePaddingShape(node, 0), LongToSize(tensor->data().nbytes()),
                                       tensor->data_type(), tensor->data_c())) {
    MS_LOG(EXCEPTION) << "SyncHostToDevice failed, node name: " << node->fullname_with_scope();
  }
}

// Creates the output object for one (node, output index) pair of a graph output.
// - Value node: returns the node's value.
// - Parameter: returns the entry of input_tensors at the parameter's position in graph->inputs().
// - Anything else: creates a new tensor with the output's type and inferred shape, binds the node's output device
//   tensor to it and sets that device tensor's original reference count to SIZE_MAX.
// Throws if the node is null, if a parameter is not found in the graph inputs, or if input_tensors has no entry at
// the parameter's position.
BaseRef CreateOutputTensor(const session::KernelWithIndex &node_output_pair, const KernelGraphPtr &graph,
                           const std::vector<tensor::TensorPtr> &input_tensors) {
  auto &node = node_output_pair.first;
  auto output_index = node_output_pair.second;
  MS_EXCEPTION_IF_NULL(node);

  if (node->isa<ValueNode>()) {
    // If node is a value node, return the value.
    auto value_node = node->cast<ValueNodePtr>();
    MS_EXCEPTION_IF_NULL(value_node);
    return value_node->value();
  }

  if (node->isa<Parameter>()) {
    // If node is a parameter node, return tensor from input_tensors.
    MS_EXCEPTION_IF_NULL(graph);
    const auto &input_nodes = graph->inputs();
    auto iter = std::find(input_nodes.begin(), input_nodes.end(), node);
    if (iter == input_nodes.end()) {
      MS_LOG(EXCEPTION) << "Parameter node: " << node->fullname_with_scope() << " is not exist.";
    }
    auto position = IntToSize(std::distance(input_nodes.begin(), iter));
    // Indexing a vector out of range is undefined behavior, so reject a too-short input tensor list explicitly.
    if (position >= input_tensors.size()) {
      MS_LOG(EXCEPTION) << "The position " << position << " of parameter node: " << node->fullname_with_scope()
                        << " is out of range of the input tensors, whose size is " << input_tensors.size();
    }
    return input_tensors[position];
  }

  // Create tensor. Fall back to the inferred data type when the device data type is unknown.
  TypeId type_id = AnfAlgo::GetOutputDeviceDataType(node, output_index);
  if (type_id == kTypeUnknown) {
    type_id = AnfAlgo::GetOutputInferDataType(node, output_index);
  }
  std::vector<int64_t> temp_shape;
  auto shape = AnfAlgo::GetOutputInferShape(node, output_index);
  (void)std::copy(shape.begin(), shape.end(), std::back_inserter(temp_shape));
  auto tensor = std::make_shared<tensor::Tensor>(type_id, temp_shape);
  MS_EXCEPTION_IF_NULL(tensor);
  tensor->set_padding_type(AnfAlgo::GetOutputReshapeType(node, output_index));

  // Set device address to tensor.
  const auto &device_tensor = AnfAlgo::GetMutableOutputAddr(node, output_index);
  MS_EXCEPTION_IF_NULL(device_tensor);
  tensor->set_device_address(device_tensor);
  device_tensor->set_original_ref_count(SIZE_MAX);
  device_tensor->ResetRefCount();
  return tensor;
}

// Creates the output object(s) for a graph output node.
// A MakeTuple output is expanded recursively into a VectorRef with one element per tuple input; a node with no
// output tensor yields an empty VectorRef; every other node is handled by CreateOutputTensor.
// Throws if output_node or the real kernel it resolves to is null.
BaseRef CreateOutputTensors(const AnfNodePtr &output_node, const KernelGraphPtr &graph,
                            const std::vector<tensor::TensorPtr> &input_tensors) {
  MS_EXCEPTION_IF_NULL(output_node);
  auto real_output = AnfAlgo::VisitKernelWithReturnType(output_node, 0);
  const auto &real_node = real_output.first;
  MS_EXCEPTION_IF_NULL(real_node);

  // Special handle for make tuple.
  if (AnfAlgo::CheckPrimitiveType(real_node, prim::kPrimMakeTuple)) {
    auto make_tuple = real_node->cast<CNodePtr>();
    MS_EXCEPTION_IF_NULL(make_tuple);
    VectorRef tuple_outputs;
    // Input 0 is skipped; the tuple elements start at input 1.
    const size_t input_count = make_tuple->inputs().size();
    for (size_t input_idx = 1; input_idx < input_count; ++input_idx) {
      tuple_outputs.push_back(CreateOutputTensors(make_tuple->input(input_idx), graph, input_tensors));
    }
    return tuple_outputs;
  }

  // If the node return nothing, return an empty vectorRef.
  const bool has_no_output = (AnfAlgo::GetOutputTensorNum(real_node) == 0);
  if (has_no_output) {
    return VectorRef();
  }

  return CreateOutputTensor(real_output, graph, input_tensors);
}

void AllocateContinuousMemoryForInput(const AnfNodePtr &kernel, const DeviceContext *device_context,
                                      bool is_all_nop_node) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(device_context);
  bool is_need_alloc_memory = false;
  size_t total_size = 0;
  std::vector<size_t> size_list;
  std::vector<DeviceTensorPtr> addr_list;

  const auto &kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  const auto &intput_sizes = kernel_mod->GetInputSizeList();
  for (size_t i = 0; i < intput_sizes.size(); ++i) {
    DeviceTensorPtr device_tensor;
    if (is_all_nop_node) {
      // Graph may be all nop nodes and not remove nop node, so this can not skip nop node.
      device_tensor = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false);
    } else {
      device_tensor = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true);
    }
    MS_EXCEPTION_IF_NULL(device_tensor);
    //  In the scene of communication op and computing op parallel multi stream, the input address of communication op
    //  can't be reused, so set the max reference count.
    device_tensor->set_original_ref_count(SIZE_MAX);
    device_tensor->ResetRefCount();

    if (device_tensor->GetPtr() == nullptr) {
      is_need_alloc_memory = true;
    }
    total_size += intput_sizes[i];
    size_list.emplace_back(intput_sizes[i]);
    addr_list.emplace_back(device_tensor);
  }

  if (is_need_alloc_memory) {
    auto ret = device_context->AllocateContinuousMemory(addr_list, total_size, size_list);
    if (!ret) {
      MS_LOG(EXCEPTION) << "Malloc device memory failed.";
    }
  }
}

void AllocateContinuousMemoryForOutput(const AnfNodePtr &kernel, const DeviceContext *device_context) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(device_context);
  bool is_need_alloc_memory = false;
  size_t total_size = 0;
  std::vector<size_t> size_list;
  std::vector<DeviceTensorPtr> addr_list;

  const auto &kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  const auto &output_sizes = kernel_mod->GetOutputSizeList();
  for (size_t i = 0; i < output_sizes.size(); ++i) {
    const auto &device_tensor = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
    MS_EXCEPTION_IF_NULL(device_tensor);
    // One time application for continuous memory, so set the max reference count.
    device_tensor->set_original_ref_count(SIZE_MAX);
    device_tensor->ResetRefCount();

    if (device_tensor->GetPtr() == nullptr) {
      is_need_alloc_memory = true;
    }
    total_size += output_sizes[i];
    size_list.emplace_back(output_sizes[i]);
    addr_list.emplace_back(device_tensor);
  }

  if (is_need_alloc_memory) {
    auto ret = device_context->AllocateContinuousMemory(addr_list, total_size, size_list);
    if (!ret) {
      MS_LOG(EXCEPTION) << "Malloc device memory failed.";
    }
  }
}
}  // namespace

void GraphScheduler::Initialize() {
  if (init_) {
    return;
  }
  init_ = true;

  auto actorMgr = ActorMgr::GetActorMgrRef();
  MS_EXCEPTION_IF_NULL(actorMgr);

  // Create the thread pool of actor runtime.
  auto max_thread_num = GetMaxThreadNum();
  MS_LOG(INFO) << "Max available thread number: " << max_thread_num;
  actorMgr->Initialize(max_thread_num);

  // Create memory manager actor.
  auto memory_manager_actor = std::make_shared<MemoryManagerActor>();
  MS_EXCEPTION_IF_NULL(memory_manager_actor);
  memory_manager_aid_ = memory_manager_actor->GetAID();
  // Schedule memory manager actor, bind single thread to response to memory alloc and free quickly.
  auto base_actor = static_cast<ActorReference>(memory_manager_actor);
  (void)actorMgr->Spawn(base_actor, false);
}

// Transforms every kernel graph into an actor set: persists the device tensors, builds the actors, records the
// graph-to-actors mapping, links the actors and validates the result.
// graphs and device_contexts are matched by index and must have the same, non-zero size.
// input_tensors and control_nodes are accepted for interface compatibility but are not used by this implementation.
// Returns a non-owning pointer to the actor set of the first graph.
// Throws if the sizes mismatch, if graphs is empty, if a graph is null, or if a built actor set is invalid.
ActorSet *GraphScheduler::Transform(const std::vector<KernelGraphPtr> &graphs,
                                    const std::vector<DeviceContext *> &device_contexts,
                                    const std::vector<TensorPtr> *input_tensors,
                                    const std::vector<AnfNodePtr> *control_nodes, GraphExecutionStrategy strategy) {
  if (graphs.size() != device_contexts.size()) {
    MS_LOG(EXCEPTION) << "The number of graphs is not equal to the number of device_contexts.";
  }
  // The first actor set is returned below, so an empty graph list can not be served.
  if (graphs.empty()) {
    MS_LOG(EXCEPTION) << "The graphs to be transformed are empty.";
  }
  Initialize();

  std::vector<ActorSetPtr> actor_sets;
  actor_sets.reserve(graphs.size());
  for (size_t i = 0; i < graphs.size(); ++i) {
    const auto &graph = graphs[i];
    auto device_context = device_contexts[i];
    MS_EXCEPTION_IF_NULL(graph);
    MS_LOG(INFO) << "Graph(" << graph->ToString() << ") transforms actor begin.";
    PersistDeviceTensor(graph);
    auto actor_set = Build(graph, device_context);
    actor_sets.emplace_back(actor_set);
    // The mapping must be recorded before Link, which looks the actor set up through Fetch.
    graph_to_actors_.emplace(graph, actor_set);
    Link(actor_set.get(), graph, strategy);

    if (!CheckActorValid(actor_set.get())) {
      MS_LOG(EXCEPTION) << "The actor set of " << graph->ToString() << " is invalid.";
    }
    MS_LOG(INFO) << "Graph(" << graph->ToString() << ") transforms actor end.";
  }
  return actor_sets.front().get();
}

void GraphScheduler::Schedule(const ActorSet *actor_set) {
  MS_EXCEPTION_IF_NULL(actor_set);
  auto actorMgr = ActorMgr::GetActorMgrRef();
  MS_EXCEPTION_IF_NULL(actorMgr);

  // Schedule dats source actors.
  for (auto &data_source_actor : actor_set->data_source_actors_) {
    MS_EXCEPTION_IF_NULL(data_source_actor);
    auto base_actor = static_cast<ActorReference>(data_source_actor);
    (void)actorMgr->Spawn(base_actor);
  }

  // Schedule kernel actors.
  for (auto &kernel_actor : actor_set->kernel_actors_) {
    MS_EXCEPTION_IF_NULL(kernel_actor);
    auto base_actor = static_cast<ActorReference>(kernel_actor);
    (void)actorMgr->Spawn(base_actor);
  }

  // Schedule loop count actor.
  if (actor_set->loop_count_actor_ != nullptr) {
    auto base_actor = static_cast<ActorReference>(actor_set->loop_count_actor_);
    (void)actorMgr->Spawn(base_actor);
  }
}

void GraphScheduler::PrepareRun(const KernelGraphPtr &graph, const std::vector<TensorPtr> *input_tensors,
                                VectorRef *const &outputs) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(input_tensors);
  MS_EXCEPTION_IF_NULL(outputs);
  // Get the device context for the first kernel actor.
  const auto &actor_set = Fetch(graph);
  MS_EXCEPTION_IF_NULL(actor_set);
  const auto &first_kernel_actor = actor_set->kernel_actors_[0];
  MS_EXCEPTION_IF_NULL(first_kernel_actor);
  const auto &device_context = first_kernel_actor->device_context_;

  // 1.Prepare the data of device tensor store(value nodes of graph).
  for (const auto &value_node : graph->graph_value_nodes()) {
    if (AnfAlgo::OutputAddrExist(value_node, 0)) {
      PrepareDataForValueNode(value_node, device_context);
    }
  }

  // 1.Prepare the data of device tensor store(weights of graph), and fill the host tensors for non weighted parameters.
  std::vector<TensorPtr> host_tensors;
  const auto &input_nodes = graph->input_nodes();
  for (size_t i = 0; i < input_nodes.size(); ++i) {
    const auto &input_node = input_nodes[i];
    const auto &input_tensor = (*input_tensors)[i];
    MS_EXCEPTION_IF_NULL(input_node);
    if (IsPersistentDeviceTensor(input_node)) {
      // Prepare the device data for weights.
      PrepareDataForWeightNode(input_node, input_tensor, device_context);
    } else {
      // Fill the host tensors for non weighted parameters.
      host_tensors.emplace_back(input_tensor);
    }
  }

  // 2.Prepare the data of host tensor queue(non weighted parameters of graph).
  const auto &host_tensor_queue = FetchHostQueue(graph);
  if (host_tensor_queue != nullptr) {
    host_tensor_queue->PushData(host_tensors);
  }

  // 3.Prepare the output tensor of graph.
  for (const auto &output_node : graph->outputs()) {
    MS_EXCEPTION_IF_NULL(output_node);
    MS_LOG(INFO) << "Create node output: " << output_node->fullname_with_scope();
    outputs->emplace_back(CreateOutputTensors(output_node, graph, *input_tensors));
  }

  // 4.Prepare the continuous memory for communication kernel.
  for (const auto &kernel : graph->execution_order()) {
    if (AnfAlgo::IsCommunicationOp(kernel)) {
      AllocateContinuousMemoryForInput(kernel, device_context, graph->is_all_nop_node());
      AllocateContinuousMemoryForOutput(kernel, device_context);
    }
  }
}

bool GraphScheduler::Run(const ActorSet *actor_set, GraphExecutionStrategy strategy) {
  MS_EXCEPTION_IF_NULL(actor_set);
  // Construct OpContext.
  OpContext<DeviceTensor> op_context;
  uuids::uuid sequential_num;
  std::vector<Promise<int>> result(1);
  op_context.sequential_num_ = &sequential_num;
  op_context.results_ = &result;

  // Trigger no input kernel actor running.
  for (auto &no_input_kernel_actor : actor_set->no_input_kernel_actors_) {
    MS_EXCEPTION_IF_NULL(no_input_kernel_actor);
    Async(no_input_kernel_actor->GetAID(), &KernelActor::RunOpControl, nullptr, &op_context);
  }

  // Trigger data source actor running.
  for (auto &data_source_actor : actor_set->data_source_actors_) {
    MS_EXCEPTION_IF_NULL(data_source_actor);
    Async(data_source_actor->GetAID(), &DataSourceActor::FetchData, &op_context);
  }

  // Trigger kernel actor running in the step execution strategy.
  if (strategy == GraphExecutionStrategy::kStep) {
    for (auto &kernel_actor : actor_set->kernel_actors_) {
      MS_EXCEPTION_IF_NULL(kernel_actor);
      Async(kernel_actor->GetAID(), &KernelActor::RunOpControl, nullptr, &op_context);
    }
  }

  // Get the run result.
  auto result_future = result[0].GetFuture();
  result_future.Wait();
  if (!result_future.IsOK()) {
    return false;
  }

  // Sync device stream.
  const auto &first_kernel_actor = actor_set->kernel_actors_[0];
  MS_EXCEPTION_IF_NULL(first_kernel_actor);
  const auto &device_context = first_kernel_actor->device_context_;
  MS_EXCEPTION_IF_NULL(device_context);
  if (!device_context->SyncStream()) {
    MS_LOG(ERROR) << "Sync stream failed.";
    return false;
  }

  return true;
}

// Returns a non-owning pointer to the actor set recorded for the graph, or nullptr (after logging an error) when
// the graph has no recorded actor set. Throws if graph is null.
ActorSet *GraphScheduler::Fetch(const KernelGraphPtr &graph) const {
  MS_EXCEPTION_IF_NULL(graph);
  const auto found = graph_to_actors_.find(graph);
  if (found == graph_to_actors_.end()) {
    MS_LOG(ERROR) << "Can't find the actors map of graph: " << graph->ToString();
    return nullptr;
  }
  return found->second.get();
}

// Builds the actor set of a graph: the data source actors, the kernel actors and the loop count actor, created in
// that order. The actors are not linked here.
ActorSetPtr GraphScheduler::Build(const KernelGraphPtr &graph, const DeviceContext *device_context) {
  auto result = std::make_shared<ActorSet>();
  MS_EXCEPTION_IF_NULL(result);

  result->data_source_actors_ = BuildDataSourceActor(graph, device_context);
  result->kernel_actors_ = BuildKernelActor(graph, device_context);
  result->loop_count_actor_ = BuildLoopCountActor(graph);

  return result;
}

// Links the actors of an actor set according to the graph.
// For every kernel in execution order that is wrapped by a kernel actor, the control arrows are linked first and
// then, per input, the auto monad control arrow and (for non monad inputs) the data arrow from the producing
// device queue data source actor, host queue data source actor or kernel actor.
// Afterwards the no-input kernel actors are collected and the control arrows of the loop count actor are linked.
// Throws if actor_set, graph or any kernel actor is null.
void GraphScheduler::Link(ActorSet *actor_set, const KernelGraphPtr &graph, GraphExecutionStrategy strategy) {
  MS_EXCEPTION_IF_NULL(actor_set);
  MS_EXCEPTION_IF_NULL(graph);

  // Index the kernel actors by their actor name so that they can be found by the kernel's full name.
  KernelMapActor name_to_kernel_actor;
  for (auto &actor : actor_set->kernel_actors_) {
    MS_EXCEPTION_IF_NULL(actor);
    name_to_kernel_actor.emplace(actor->GetAID().Name(), actor);
  }

  // Foreach the execution order to link the actors.
  auto execution_order = graph->execution_order();
  for (auto &kernel : execution_order) {
    if (!IsKernelActor(kernel)) {
      continue;
    }
    auto to_actor = FindKernelActor(name_to_kernel_actor, kernel->fullname_with_scope());
    // Link the control arrows of kernel actor.
    LinkControlArrowForKernelActor(to_actor, actor_set->loop_count_actor_.get(), graph, strategy);

    for (size_t input_idx = 0; input_idx < AnfAlgo::GetInputNum(kernel); ++input_idx) {
      auto input_node = AnfAlgo::GetInputNode(kernel, input_idx);
      // Link the control arrows of kernel actor by the auto monad, the inputs include monad node.
      LinkControlArrowByAutoMonad(to_actor, input_node, name_to_kernel_actor);
      if (HasAbstractMonad(input_node)) {
        continue;  // No data arrow for monad input.
      }

      KernelWithIndex producer_with_output_idx = AnfAlgo::VisitKernelWithReturnType(input_node, 0, true);
      KernelWithIndex consumer_with_input_idx = std::make_pair(kernel, input_idx);
      auto producer = producer_with_output_idx.first;

      if (IsDeviceQueueDSActor(producer)) {
        // Link the data arrows of device queue data source actor.
        auto device_ds_actor = FindDeviceQueueDSActor(actor_set->data_source_actors_);
        LinkDataArrowForDeviceDSActor(device_ds_actor, to_actor, producer_with_output_idx, consumer_with_input_idx);
        continue;
      }
      if (IsHostQueueDSActor(producer)) {
        // Link the data arrows of host queue data source actor.
        auto host_ds_actor = FindHostQueueDSActor(actor_set->data_source_actors_);
        LinkDataArrowForHostDSActor(host_ds_actor, to_actor, producer_with_output_idx, consumer_with_input_idx);
        continue;
      }
      // Link the data arrows of kernel actor.
      auto producer_actor = FindKernelActor(name_to_kernel_actor, producer->fullname_with_scope());
      LinkDataArrowForKernelActor(producer_actor, to_actor, producer_with_output_idx, consumer_with_input_idx);
    }
  }

  // BuildNoInputKernelActor depends on whether kernel actors have input, so must be behind the link of kernel actors.
  actor_set->no_input_kernel_actors_ = BuildNoInputKernelActor(graph);

  // Link the control arrows of loop count actor, which depends on the no input kernel actors.
  LinkControlArrowForLoopCountActor(actor_set->loop_count_actor_.get(), graph);
}

std::vector<DataSourceActorPtr> GraphScheduler::BuildDataSourceActor(const KernelGraphPtr &graph,
                                                                     const DeviceContext *device_context) {
  MS_EXCEPTION_IF_NULL(graph);
  std::vector<DataSourceActorPtr> data_source_actors;

  // Build host queue data source actor.
  HostQueueDSActorPtr host_queue_ds_actor = nullptr;
  for (auto &input_node : graph->input_nodes()) {
    MS_EXCEPTION_IF_NULL(input_node);
    if (IsHostQueueDSActor(input_node)) {
      if (host_queue_ds_actor == nullptr) {
        auto actor_name = graph->ToString() + "_" + "HostQueueDataSourceActor";
        MS_LOG(INFO) << "Create host queue data source actor: " << actor_name;
        auto host_queue = std::make_shared<HostTensorQueue>();
        graph_to_host_queue_.emplace(graph, host_queue);
        host_queue_ds_actor =
          std::make_shared<HostQueueDataSourceActor>(actor_name, 1, device_context, memory_manager_aid_, host_queue);
        data_source_actors.emplace_back(host_queue_ds_actor);
      }
      host_queue_ds_actor->data_nodes_.emplace_back(input_node);
    }
  }

  // Build device queue data source actor.
  auto execution_order = graph->execution_order();
  auto iter = std::find_if(execution_order.begin(), execution_order.end(),
                           [](const CNodePtr &node) { return IsDeviceQueueDSActor(node); });
  if (iter != execution_order.end()) {
    auto actor_name = graph->ToString() + "_" + "DeviceQueueDataSourceActor";
    MS_LOG(INFO) << "Create queue data source actor: " << actor_name;
    auto device_queue_ds_actor =
      std::make_shared<DeviceQueueDataSourceActor>(actor_name, 1, device_context, memory_manager_aid_);
    MS_EXCEPTION_IF_NULL(device_queue_ds_actor);
    data_source_actors.emplace_back(device_queue_ds_actor);
    device_queue_ds_actor->data_kernel_ = *iter;
  }
  return data_source_actors;
}

std::vector<KernelActorPtr> GraphScheduler::BuildKernelActor(const KernelGraphPtr &graph,
                                                             const DeviceContext *device_context) {
  MS_EXCEPTION_IF_NULL(graph);
  std::vector<KernelActorPtr> kernel_actors;

  auto execution_order = graph->execution_order();
  for (auto &kernel : execution_order) {
    if (IsKernelActor(kernel)) {
      auto kernel_actor =
        std::make_shared<KernelActor>(kernel->fullname_with_scope(), kernel, device_context, memory_manager_aid_);
      MS_EXCEPTION_IF_NULL(kernel_actor);
      kernel_actors.emplace_back(kernel_actor);
    }
  }
  return kernel_actors;
}

std::vector<KernelActorPtr> GraphScheduler::BuildNoInputKernelActor(const KernelGraphPtr &graph) {
  MS_EXCEPTION_IF_NULL(graph);
  std::vector<KernelActorPtr> no_input_kernel_actors;

  auto actor_set = Fetch(graph);
  MS_EXCEPTION_IF_NULL(actor_set);
  for (auto &kernel_actor : actor_set->kernel_actors_) {
    MS_EXCEPTION_IF_NULL(kernel_actor);
    if ((kernel_actor->input_datas_num_ == 0) && (kernel_actor->input_controls_num_ == 0)) {
      no_input_kernel_actors.emplace_back(kernel_actor);
      // The no input kernel actor will be triggered by loop count actor, so need set the input_controls_num_.
      kernel_actor->input_controls_num_ = 1;
    }
  }
  return no_input_kernel_actors;
}

// Builds the loop count actor of the graph, named "<graph>_LoopCountActor", with the loop count read from
// ConfigManager::iter_num(). Throws if graph is null.
LoopCountActorPtr GraphScheduler::BuildLoopCountActor(const KernelGraphPtr &graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto actor_name = graph->ToString() + "_" + "LoopCountActor";
  auto iter_num = ConfigManager::GetInstance().iter_num();
  auto result = std::make_shared<LoopCountActor>(actor_name, iter_num);
  MS_LOG(INFO) << "Create loop count actor: " << actor_name;
  MS_EXCEPTION_IF_NULL(result);
  return result;
}

// Links a data arrow from the device queue data source actor to a kernel actor.
// The arrow goes from the producer's output index to the consumer's input index; the consumer's data input count
// is increased and the reference count of the producer's output device tensor is updated.
// Throws if from_actor, to_actor or the producing kernel is null.
void GraphScheduler::LinkDataArrowForDeviceDSActor(DeviceQueueDataSourceActor *from_actor, KernelActor *to_actor,
                                                   KernelWithIndex from_kernel_with_output_idx,
                                                   KernelWithIndex to_kernel_with_input_idx) {
  MS_EXCEPTION_IF_NULL(from_actor);
  MS_EXCEPTION_IF_NULL(to_actor);

  auto producer = from_kernel_with_output_idx.first;
  MS_EXCEPTION_IF_NULL(producer);
  auto output_idx = from_kernel_with_output_idx.second;
  auto input_idx = to_kernel_with_input_idx.second;

  auto arrow = std::make_shared<OpArrow>(output_idx, to_actor->GetAID(), input_idx);
  from_actor->output_op_arrows_.emplace_back(arrow);
  to_actor->input_datas_num_++;

  // Update the reference count of device tensor.
  UpdateRefCount(producer, output_idx);
}

// Links a data arrow from the host queue data source actor to a kernel actor.
// The arrow's source index is the position of the producing parameter inside the actor's data_nodes_, not the
// parameter's own output index; the consumer's data input count is increased and the reference count of the
// producer's output device tensor is updated.
// Throws if from_actor, to_actor or the producing node is null, or if the producing node is not one of the actor's
// data nodes.
void GraphScheduler::LinkDataArrowForHostDSActor(HostQueueDataSourceActor *from_actor, KernelActor *to_actor,
                                                 KernelWithIndex from_kernel_with_output_idx,
                                                 KernelWithIndex to_kernel_with_input_idx) {
  MS_EXCEPTION_IF_NULL(from_actor);
  MS_EXCEPTION_IF_NULL(to_actor);

  auto from_kernel = from_kernel_with_output_idx.first;
  MS_EXCEPTION_IF_NULL(from_kernel);
  auto from_output_index = from_kernel_with_output_idx.second;
  auto to_input_index = to_kernel_with_input_idx.second;

  // Search the actor's data nodes in place; they are only read here, so no copy of the vector is needed.
  const auto &data_nodes = from_actor->data_nodes_;
  auto iter = std::find(data_nodes.begin(), data_nodes.end(), from_kernel);
  if (iter == data_nodes.end()) {
    MS_LOG(EXCEPTION) << "Parameter node: " << from_kernel->fullname_with_scope() << " is not exist.";
  }
  auto position = IntToSize(std::distance(data_nodes.begin(), iter));
  auto to_aid = to_actor->GetAID();
  auto op_arrow = std::make_shared<OpArrow>(position, to_aid, to_input_index);
  from_actor->output_op_arrows_.emplace_back(op_arrow);
  to_actor->input_datas_num_++;

  // Update the reference count of device tensor.
  UpdateRefCount(from_kernel, from_output_index);
}

// Links the data input of a kernel actor that is produced by a persistent device tensor or by another kernel.
// - Persistent producer (value node or weight): the consumer records a device tensor store key made of its input
//   index and the producing node; no arrow is created and from_actor is not used.
// - Kernel producer: a data arrow is created from from_actor to to_actor, the consumer's data input count is
//   increased and the reference count of the producer's output device tensor is updated.
// - Any other producer: nothing is linked.
// Throws if to_actor or the producing node is null, or if from_actor is null for a kernel producer.
void GraphScheduler::LinkDataArrowForKernelActor(KernelActor *from_actor, KernelActor *to_actor,
                                                 KernelWithIndex from_kernel_with_output_idx,
                                                 KernelWithIndex to_kernel_with_input_idx) {
  MS_EXCEPTION_IF_NULL(to_actor);
  auto producer = from_kernel_with_output_idx.first;
  MS_EXCEPTION_IF_NULL(producer);
  auto output_idx = from_kernel_with_output_idx.second;
  auto input_idx = to_kernel_with_input_idx.second;

  if (IsPersistentDeviceTensor(producer)) {
    to_actor->device_tensor_store_keys_.emplace_back(input_idx, static_cast<void *>(producer.get()));
    return;
  }
  if (!IsKernelActor(producer)) {
    return;
  }

  MS_EXCEPTION_IF_NULL(from_actor);
  auto arrow = std::make_shared<OpArrow>(output_idx, to_actor->GetAID(), input_idx);
  from_actor->output_op_arrows_.emplace_back(arrow);
  to_actor->input_datas_num_++;

  // Update the reference count of device tensor.
  UpdateRefCount(producer, output_idx);
}

// Link the control arrow from the kernel actor to the loop count actor.
// - from_actor: the kernel actor which may send the control; its kernel_ must not be null.
// - to_actor: the loop count actor which receives the control.
// - graph: the kernel graph that is queried to decide whether the kernel is really used by other nodes.
// - strategy: in the kStep strategy the kernel actor itself expects one more input control.
void GraphScheduler::LinkControlArrowForKernelActor(KernelActor *from_actor, LoopCountActor *to_actor,
                                                    const KernelGraphPtr &graph, GraphExecutionStrategy strategy) {
  MS_EXCEPTION_IF_NULL(from_actor);
  MS_EXCEPTION_IF_NULL(to_actor);
  MS_EXCEPTION_IF_NULL(graph);
  // Validate the kernel before it is handed to IsNotRealUsedByOthers, not after.
  MS_EXCEPTION_IF_NULL(from_actor->kernel_);

  if (strategy == GraphExecutionStrategy::kStep) {
    from_actor->input_controls_num_++;
  }

  // The manager of graph member is weak ptr, so need created and used in the function IsNotRealUsedByOthers.
  const auto &manager = Manage(graph, true);
  MS_EXCEPTION_IF_NULL(manager);
  if (opt::IsNotRealUsedByOthers(graph, from_actor->kernel_)) {
    MS_LOG(INFO) << from_actor->kernel_->fullname_with_scope() << " is not real used by other nodes.";
    auto to_aid = to_actor->GetAID();
    from_actor->output_op_controls_.emplace_back(to_aid);
    to_actor->input_controls_num_++;
  }
}

// Link the control arrows to to_actor which are implied by the auto monad inputs reachable from from_node.
// UpdateState, Load and MakeTuple nodes are walked through recursively until a real kernel node is found, then the
// kernel actor of that node (looked up in kernel_actors_map by its full name) sends a control to to_actor.
void GraphScheduler::LinkControlArrowByAutoMonad(KernelActor *to_actor, const AnfNodePtr &from_node,
                                                 const KernelMapActor &kernel_actors_map) {
  MS_EXCEPTION_IF_NULL(to_actor);
  MS_EXCEPTION_IF_NULL(from_node);
  if (!from_node->isa<CNode>()) {
    return;
  }

  // Find the real input node, include the monad node and make tuple node.
  const std::vector<PrimitivePtr> stop_types = {prim::kPrimUpdateState, prim::kPrimLoad, prim::kPrimMakeTuple};
  const auto &visited_with_idx = AnfAlgo::VisitKernelWithReturnType(from_node, 0, true, stop_types);
  const auto &visited_node = visited_with_idx.first;
  MS_EXCEPTION_IF_NULL(visited_node);
  if (!visited_node->isa<CNode>()) {
    return;
  }
  const auto monad_cnode = visited_node->cast<CNodePtr>();
  MS_EXCEPTION_IF_NULL(monad_cnode);

  // Make tuple node needs to be expanded: every element is handled on its own.
  if (AnfAlgo::CheckPrimitiveType(monad_cnode, prim::kPrimMakeTuple)) {
    for (size_t input_idx = 1; input_idx < monad_cnode->inputs().size(); ++input_idx) {
      LinkControlArrowByAutoMonad(to_actor, monad_cnode->input(input_idx), kernel_actors_map);
    }
    return;
  }

  // Get the real depend input by monad node which needs to link the control arrow.
  AnfNodePtr depend_node = nullptr;
  if (AnfAlgo::CheckPrimitiveType(monad_cnode, prim::kPrimUpdateState)) {
    depend_node = monad_cnode->input(kUpdateStateRealInput);
  } else if (AnfAlgo::CheckPrimitiveType(monad_cnode, prim::kPrimLoad)) {
    depend_node = monad_cnode->input(kLoadStateInput);
  } else {
    return;
  }

  MS_EXCEPTION_IF_NULL(depend_node);
  if (!depend_node->isa<CNode>()) {
    return;
  }

  // The monad node and make tuple node need recursion.
  const auto needs_recursion = [](const AnfNodePtr &node) {
    return AnfAlgo::CheckPrimitiveType(node, prim::kPrimUpdateState) ||
           AnfAlgo::CheckPrimitiveType(node, prim::kPrimLoad) ||
           AnfAlgo::CheckPrimitiveType(node, prim::kPrimMakeTuple);
  };
  if (needs_recursion(depend_node)) {
    LinkControlArrowByAutoMonad(to_actor, depend_node, kernel_actors_map);
    return;
  }

  // Link the control arrow between the kernel actors.
  auto control_sender = FindKernelActor(kernel_actors_map, depend_node->fullname_with_scope());
  MS_EXCEPTION_IF_NULL(control_sender);
  control_sender->output_op_controls_.emplace_back(to_actor->GetAID());
  to_actor->input_controls_num_++;
}

// Record in the loop count actor the AIDs of all the data source actors and all the no input kernel actors of the
// actor set which is fetched by the graph.
void GraphScheduler::LinkControlArrowForLoopCountActor(LoopCountActor *loop_count_actor, const KernelGraphPtr &graph) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(loop_count_actor);

  auto graph_actor_set = Fetch(graph);
  MS_EXCEPTION_IF_NULL(graph_actor_set);

  // Set the source data actor.
  for (const auto &ds_actor : graph_actor_set->data_source_actors_) {
    MS_EXCEPTION_IF_NULL(ds_actor);
    loop_count_actor->data_source_aids_.emplace_back(ds_actor->GetAID());
  }

  // Set the no input kernel actor.
  for (const auto &entry_kernel_actor : graph_actor_set->no_input_kernel_actors_) {
    MS_EXCEPTION_IF_NULL(entry_kernel_actor);
    loop_count_actor->no_input_kernel_aids_.emplace_back(entry_kernel_actor->GetAID());
  }
}

bool GraphScheduler::CheckActorValid(const ActorSet *actor_set) const {
  MS_EXCEPTION_IF_NULL(actor_set);
  // Check the data source actors.
  for (const auto &data_source_actor : actor_set->data_source_actors_) {
    MS_EXCEPTION_IF_NULL(data_source_actor);
    if (data_source_actor->output_op_arrows_.size() == 0) {
      MS_LOG(ERROR) << data_source_actor->GetAID().Name() << " has no user.";
      return false;
    }
  }

  // Check the kernel actors.
  for (const auto &kernel_actor : actor_set->kernel_actors_) {
    MS_EXCEPTION_IF_NULL(kernel_actor);
    if (kernel_actor->output_op_arrows_.size() + kernel_actor->output_op_controls_.size() == 0) {
      MS_LOG(ERROR) << kernel_actor->GetAID().Name() << " has no user.";
      return false;
    }

    auto input_num = AnfAlgo::GetInputTensorNum(kernel_actor->kernel_);
    auto input_data_num = kernel_actor->input_datas_num_;
    auto device_tensor_store_num = kernel_actor->device_tensor_store_keys_.size();
    if (input_data_num + device_tensor_store_num != input_num) {
      MS_LOG(ERROR) << "The input building of " << kernel_actor->GetAID().Name()
                    << " is wrong, input data num: " << input_data_num
                    << ", device tensor store num: " << device_tensor_store_num << ", total input num: " << input_num;
      return false;
    }
  }

  // Check the loop count actor.
  const auto &loop_count_actor = actor_set->loop_count_actor_;
  if (loop_count_actor != nullptr) {
    if (loop_count_actor->input_controls_num_ == 0) {
      MS_LOG(ERROR) << loop_count_actor->GetAID().Name() << " has no source.";
      return false;
    }
  }

  return true;
}

// Insert the device tensors of the graph value nodes and of the persistent graph inputs into the device tensor
// store, and set their original reference count to SIZE_MAX.
// Value nodes without an output address are skipped with an info log.
void GraphScheduler::PersistDeviceTensor(const KernelGraphPtr &graph) {
  MS_EXCEPTION_IF_NULL(graph);

  for (auto &value_node : graph->graph_value_nodes()) {
    MS_EXCEPTION_IF_NULL(value_node);
    if (!AnfAlgo::OutputAddrExist(value_node, 0)) {
      MS_LOG(INFO) << "The device address is not exist: " << value_node->ToString();
      continue;
    }
    auto device_tensor = AnfAlgo::GetMutableOutputAddr(value_node, 0);
    // Check the device tensor before it is dereferenced, the same as for the input nodes below.
    MS_EXCEPTION_IF_NULL(device_tensor);
    DeviceTensorStore::GetInstance().Insert(value_node.get(), device_tensor);
    device_tensor->set_original_ref_count(SIZE_MAX);
    device_tensor->ResetRefCount();
  }

  for (auto &input_node : graph->input_nodes()) {
    MS_EXCEPTION_IF_NULL(input_node);
    if (IsPersistentDeviceTensor(input_node)) {
      auto device_tensor = AnfAlgo::GetMutableOutputAddr(input_node, 0);
      MS_EXCEPTION_IF_NULL(device_tensor);
      DeviceTensorStore::GetInstance().Insert(input_node.get(), device_tensor);
      device_tensor->set_original_ref_count(SIZE_MAX);
      device_tensor->ResetRefCount();
    }
  }
}

// Look up the host tensor queue which is registered for the graph, return nullptr when there is none.
HostTensorQueue *GraphScheduler::FetchHostQueue(const KernelGraphPtr &graph) const {
  MS_EXCEPTION_IF_NULL(graph);
  const auto &queue_iter = graph_to_host_queue_.find(graph);
  if (queue_iter == graph_to_host_queue_.end()) {
    return nullptr;
  }
  return queue_iter->second.get();
}

void GraphScheduler::DumpActor(const KernelGraphPtr &graph) const {
  MS_EXCEPTION_IF_NULL(graph);
  const auto &actor_set = Fetch(graph);
  MS_EXCEPTION_IF_NULL(actor_set);
  std::string filename = "./actor_set_" + graph->ToString() + ".ir";
  std::ofstream ofs(filename);
  if (!ofs.is_open()) {
    MS_LOG(ERROR) << "Open file [" << filename << "] failed!";
    return;
  }

  ofs << "[Data source actors]\n";
  for (const auto &data_source_actor : actor_set->data_source_actors_) {
    DumpDSActor(data_source_actor.get(), ofs);
    ofs << "\n";
  }

  ofs << "\n[Kernel actors]\n";
  for (const auto &kernel_actor : actor_set->kernel_actors_) {
    DumpKernelActor(kernel_actor.get(), ofs);
    ofs << "\n";
  }

  ofs << "\n[No input kernel actors]\n";
  for (const auto &no_input_kernel_actor : actor_set->no_input_kernel_actors_) {
    DumpKernelActor(no_input_kernel_actor.get(), ofs);
    ofs << "\n";
  }

  ofs << "\n[Loop count actor]\n";
  const auto &loop_count_actor = actor_set->loop_count_actor_;
  if (loop_count_actor != nullptr) {
    DumpLoopCountActor(loop_count_actor.get(), ofs);
    ofs << "\n";
  }
}

// Dump the info of one data source actor: the name, the device context, the members which depend on the concrete
// actor type, and the output data arrows.
// The concrete actor type is selected by the actor name, so the result of each downcast is checked before use.
void GraphScheduler::DumpDSActor(const DataSourceActor *actor, std::ofstream &ofs) const {
  MS_EXCEPTION_IF_NULL(actor);
  const auto &actor_name = actor->GetAID().Name();

  MS_EXCEPTION_IF_NULL(actor->device_context_);
  ofs << "\tactor_name:" << actor_name << "\tdevice_context:" << actor->device_context_->device_context_key().ToString()
      << "\n";

  if (actor_name.find("_DeviceQueueDataSourceActor") != std::string::npos) {
    // Dump the member info of device queue data source actor.
    const auto *device_queue_ds_actor = dynamic_cast<const DeviceQueueDataSourceActor *>(actor);
    // The name matching does not guarantee the dynamic type, a failed cast returns nullptr.
    MS_EXCEPTION_IF_NULL(device_queue_ds_actor);
    const auto &data_kernel = device_queue_ds_actor->data_kernel_;
    MS_EXCEPTION_IF_NULL(data_kernel);
    const auto output_num = AnfAlgo::GetOutputTensorNum(data_kernel);
    ofs << "\t\tdata_kernel_name:" << data_kernel->fullname_with_scope()
        << "\tinput_number:" << AnfAlgo::GetInputTensorNum(data_kernel) << "\toutput_number:" << output_num << "\n";
    for (size_t i = 0; i < output_num; ++i) {
      const auto &device_tensor = AnfAlgo::GetMutableOutputAddr(data_kernel, i, false);
      MS_EXCEPTION_IF_NULL(device_tensor);
      ofs << "\t\t\toutput_index:" << i << "\tptr:" << device_tensor->GetPtr() << "\tsize:" << device_tensor->GetSize()
          << "\toriginal_ref_count:" << device_tensor->original_ref_count() << "\n ";
    }
  } else if (actor_name.find("_HostQueueDataSourceActor") != std::string::npos) {
    // Dump the member info of host queue data source actor.
    const auto *host_queue_ds_actor = dynamic_cast<const HostQueueDataSourceActor *>(actor);
    // The name matching does not guarantee the dynamic type, a failed cast returns nullptr.
    MS_EXCEPTION_IF_NULL(host_queue_ds_actor);
    ofs << "\t\tdata_nodes:" << host_queue_ds_actor->data_nodes_.size() << "\n";
    for (size_t i = 0; i < host_queue_ds_actor->data_nodes_.size(); ++i) {
      const auto &data_node = host_queue_ds_actor->data_nodes_[i];
      MS_EXCEPTION_IF_NULL(data_node);
      const auto &device_tensor = AnfAlgo::GetMutableOutputAddr(data_node, 0, false);
      MS_EXCEPTION_IF_NULL(device_tensor);
      ofs << "\t\t\tnode_order_number:" << i << "\tnode_name:" << data_node->fullname_with_scope()
          << "\tptr:" << device_tensor->GetPtr() << "\tsize:" << device_tensor->GetSize()
          << "\toriginal_ref_count:" << device_tensor->original_ref_count() << "\n ";
    }
  }

  ofs << "\t\toutput_data_arrows:" << actor->output_op_arrows_.size() << "\n ";
  for (const auto &data_arrow : actor->output_op_arrows_) {
    MS_EXCEPTION_IF_NULL(data_arrow);
    ofs << "\t\t\tfrom_output_index:" << data_arrow->from_output_index_
        << "\tto_actor_name:" << data_arrow->to_op_id_.Name() << "\tto_input_index:" << data_arrow->to_input_index_
        << "\n";
  }
}

// Dump the info of the loop count actor: the name, the loop count, the input controls number and the names of the
// data source actors and the no input kernel actors which it holds AIDs for.
void GraphScheduler::DumpLoopCountActor(const LoopCountActor *actor, std::ofstream &ofs) const {
  MS_EXCEPTION_IF_NULL(actor);
  ofs << "\tactor_name:" << actor->GetAID().Name();
  ofs << "\tloop_count:" << actor->loop_count_;
  ofs << "\tinput_controls_num:" << actor->input_controls_num_ << "\n";

  // The output controls are the data source actors followed by the no input kernel actors.
  const auto &ds_aids = actor->data_source_aids_;
  const auto &entry_kernel_aids = actor->no_input_kernel_aids_;
  ofs << "\t\toutput_control_arrows:" << (ds_aids.size() + entry_kernel_aids.size()) << "\n ";
  for (const auto &ds_aid : ds_aids) {
    ofs << "\t\t\tto_actor_name:" << ds_aid.Name() << "\n";
  }
  for (const auto &kernel_aid : entry_kernel_aids) {
    ofs << "\t\t\tto_actor_name:" << kernel_aid.Name() << "\n";
  }
}

// Dump the info of one kernel actor: the name, the device context, the input numbers, the kernel with its output
// device tensors, the device tensor store keys, the output data arrows and the output control arrows.
void GraphScheduler::DumpKernelActor(const KernelActor *actor, std::ofstream &ofs) const {
  MS_EXCEPTION_IF_NULL(actor);
  MS_EXCEPTION_IF_NULL(actor->device_context_);
  ofs << "\tactor_name:" << actor->GetAID().Name();
  ofs << "\tdevice_context:" << actor->device_context_->device_context_key().ToString();
  ofs << "\tinput_data_num:" << actor->input_datas_num_;
  ofs << "\tinput_controls_num:" << actor->input_controls_num_ << "\n";

  // The kernel and its output device tensors.
  const auto &kernel_node = actor->kernel_;
  MS_EXCEPTION_IF_NULL(kernel_node);
  ofs << "\t\tkernel_name:" << kernel_node->fullname_with_scope();
  ofs << "\tinput_number:" << AnfAlgo::GetInputTensorNum(kernel_node);
  ofs << "\toutput_number:" << AnfAlgo::GetOutputTensorNum(kernel_node) << "\n";
  for (size_t output_idx = 0; output_idx < AnfAlgo::GetOutputTensorNum(kernel_node); ++output_idx) {
    const auto &output_tensor = AnfAlgo::GetMutableOutputAddr(kernel_node, output_idx, false);
    MS_EXCEPTION_IF_NULL(output_tensor);
    ofs << "\t\t\toutput_index:" << output_idx << "\tptr:" << output_tensor->GetPtr();
    ofs << "\tsize:" << output_tensor->GetSize();
    ofs << "\toriginal_ref_count:" << output_tensor->original_ref_count() << "\n ";
  }

  // The inputs which are fetched from the device tensor store, the key holds the node as an untyped pointer.
  ofs << "\t\tdevice_tensor_stores:" << actor->device_tensor_store_keys_.size() << "\n ";
  for (const auto &store_key : actor->device_tensor_store_keys_) {
    auto *store_node = reinterpret_cast<AnfNode *>(store_key.second);
    MS_EXCEPTION_IF_NULL(store_node);
    ofs << "\t\t\tto_input_index:" << store_key.first;
    ofs << "\tfrom_node_name:" << store_node->fullname_with_scope() << "\n";
  }

  ofs << "\t\toutput_data_arrows:" << actor->output_op_arrows_.size() << "\n ";
  for (const auto &arrow : actor->output_op_arrows_) {
    MS_EXCEPTION_IF_NULL(arrow);
    ofs << "\t\t\tfrom_output_index:" << arrow->from_output_index_;
    ofs << "\tto_actor_name:" << arrow->to_op_id_.Name();
    ofs << "\tto_input_index:" << arrow->to_input_index_ << "\n";
  }

  ofs << "\t\toutput_control_arrows:" << actor->output_op_controls_.size() << "\n ";
  for (const auto &control_aid : actor->output_op_controls_) {
    ofs << "\t\t\tto_actor_name:" << control_aid.Name() << "\n";
  }
}

}  // namespace runtime
}  // namespace mindspore