| @@ -96,13 +96,13 @@ void AbstractActor::SendOutput(OpContext<DeviceTensor> *const context) { | |||
| MS_EXCEPTION_IF_NULL(context); | |||
| // Must be the execution order: send result --> send data --> send control, avoid the illegal timing problem. | |||
| // 1.Send graph output result. | |||
| if (output_result_arrows_.size() != output_nodes_.size()) { | |||
| if (output_result_arrows_.size() != output_result_nodes_.size()) { | |||
| SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "The size of output result arrows is not equal to the output nodes."); | |||
| } | |||
| size_t output_node_index = 0; | |||
| for (const auto &result_arrow : output_result_arrows_) { | |||
| MS_EXCEPTION_IF_NULL(result_arrow); | |||
| Async(result_arrow->to_op_id_, &OutputActor::CollectOutput, output_nodes_[output_node_index++], | |||
| Async(result_arrow->to_op_id_, &OutputActor::CollectOutput, output_result_nodes_[output_node_index++], | |||
| result_arrow->from_output_index_, result_arrow->to_input_index_, context); | |||
| } | |||
| @@ -53,6 +53,17 @@ class AbstractActor : public OpActor<DeviceTensor> { | |||
| // Get the position of node in the actor. | |||
| virtual size_t FetchNodePosition(const AnfNodePtr &node) const { return 0; } | |||
| // Get the member. | |||
| KernelTransformType type() const { return type_; } | |||
| const std::vector<const DeviceContext *> &device_contexts() const { return device_contexts_; } | |||
| const std::vector<AnfNodePtr> &output_result_nodes() const { return output_result_nodes_; } | |||
| const std::vector<DataArrowPtr> &output_result_arrows() const { return output_result_arrows_; } | |||
| const std::vector<std::pair<size_t, AnfNodePtr>> &device_tensor_store_keys() const { | |||
| return device_tensor_store_keys_; | |||
| } | |||
| const std::vector<AID> &input_data_arrow_aids() const { return input_data_arrow_aids_; } | |||
| const std::vector<AID> &input_control_arrow_aids() const { return input_control_arrow_aids_; } | |||
| protected: | |||
| friend class GraphScheduler; | |||
| @@ -84,7 +95,7 @@ class AbstractActor : public OpActor<DeviceTensor> { | |||
| std::vector<OpDataUniquePtr<DeviceTensor>> output_data_; | |||
| // The output nodes and output result arrows of graph output. | |||
| std::vector<AnfNodePtr> output_nodes_; | |||
| std::vector<AnfNodePtr> output_result_nodes_; | |||
| std::vector<DataArrowPtr> output_result_arrows_; | |||
| // The dependent device tensor stores, the dependent expression is pair<index, AnfNode>. | |||
| @@ -0,0 +1,303 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "runtime/framework/actor/actor_dump.h" | |||
| namespace mindspore { | |||
| namespace runtime { | |||
| namespace { | |||
| void DumpAbstractActor(const AbstractActor *actor, std::ofstream &ofs) { | |||
| MS_EXCEPTION_IF_NULL(actor); | |||
| if (actor->device_contexts().size() > 0) { | |||
| ofs << "\t\tdevice_contexts:" << actor->device_contexts().size() << "\n "; | |||
| for (const auto &device_context : actor->device_contexts()) { | |||
| if (device_context == nullptr) { | |||
| ofs << "\t\t\tdevice_context:" << device_context << "\n"; | |||
| continue; | |||
| } | |||
| ofs << "\t\t\tdevice_context:" << device_context->device_context_key().ToString() << "\n"; | |||
| } | |||
| } | |||
| if (actor->device_tensor_store_keys().size() > 0) { | |||
| ofs << "\t\tdevice_tensor_store_keys:" << actor->device_tensor_store_keys().size() << "\n "; | |||
| for (const auto &device_tensor_store_key : actor->device_tensor_store_keys()) { | |||
| MS_EXCEPTION_IF_NULL(device_tensor_store_key.second); | |||
| ofs << "\t\t\tto_input_index:" << device_tensor_store_key.first | |||
| << "\tfrom_node_name:" << device_tensor_store_key.second->fullname_with_scope() << "\n"; | |||
| } | |||
| } | |||
| if (actor->input_data_arrow_aids().size() > 0) { | |||
| ofs << "\t\tinput_data_arrow_actors:" << actor->input_data_arrow_aids().size() << "\n "; | |||
| for (const auto &input_data_arrow_aid : actor->input_data_arrow_aids()) { | |||
| ofs << "\t\t\tfrom_actor_name:" << input_data_arrow_aid.Name() << "\n"; | |||
| } | |||
| } | |||
| if (actor->input_control_arrow_aids().size() > 0) { | |||
| ofs << "\t\tinput_control_arrow_actors:" << actor->input_control_arrow_aids().size() << "\n "; | |||
| for (const auto &input_control_arrow_aid : actor->input_control_arrow_aids()) { | |||
| ofs << "\t\t\tfrom_actor_name:" << input_control_arrow_aid.Name() << "\n"; | |||
| } | |||
| } | |||
| const auto &output_data_arrows = actor->output_data_arrows(); | |||
| if (output_data_arrows.size() > 0) { | |||
| ofs << "\t\toutput_data_arrows:" << output_data_arrows.size() << "\n "; | |||
| for (const auto &data_arrow : output_data_arrows) { | |||
| MS_EXCEPTION_IF_NULL(data_arrow); | |||
| ofs << "\t\t\tfrom_output_index:" << data_arrow->from_output_index_ | |||
| << "\tto_actor_name:" << data_arrow->to_op_id_.Name() << "\tto_input_index:" << data_arrow->to_input_index_ | |||
| << "\n"; | |||
| } | |||
| } | |||
| const auto &output_control_arrows = actor->output_control_arrows(); | |||
| if (output_control_arrows.size() > 0) { | |||
| ofs << "\t\toutput_control_arrows:" << output_control_arrows.size() << "\n "; | |||
| for (const auto &aid : output_control_arrows) { | |||
| ofs << "\t\t\tto_actor_name:" << aid.Name() << "\n"; | |||
| } | |||
| } | |||
| if (actor->output_result_arrows().size() != actor->output_result_nodes().size()) { | |||
| MS_LOG(EXCEPTION) << "The size of output result arrows is not equal to the output nodes."; | |||
| } | |||
| if (actor->output_result_arrows().size() > 0) { | |||
| ofs << "\t\toutput_result_arrows:" << actor->output_result_arrows().size() << "\n "; | |||
| for (size_t i = 0; i < actor->output_result_arrows().size(); ++i) { | |||
| auto result_arrow = actor->output_result_arrows()[i]; | |||
| auto output_node = actor->output_result_nodes()[i]; | |||
| MS_EXCEPTION_IF_NULL(result_arrow); | |||
| MS_EXCEPTION_IF_NULL(output_node); | |||
| ofs << "\t\t\tfrom_output_node:" << output_node->fullname_with_scope() | |||
| << "\tfrom_output_index:" << result_arrow->from_output_index_ | |||
| << "\tto_actor_name:" << result_arrow->to_op_id_.Name() | |||
| << "\toutput_node_position:" << result_arrow->to_input_index_ << "\n"; | |||
| } | |||
| } | |||
| } | |||
| void DumpDSActor(const DataSourceActor *actor, std::ofstream &ofs) { | |||
| MS_EXCEPTION_IF_NULL(actor); | |||
| const auto &actor_name = actor->GetAID().Name(); | |||
| ofs << "\tactor_name:" << actor_name << "\n"; | |||
| if (actor->type() == KernelTransformType::kDeviceDataSourceActor) { | |||
| // Dump the member info of device queue data source actor. | |||
| const auto &device_queue_ds_actor = dynamic_cast<const DeviceQueueDataSourceActor *>(actor); | |||
| MS_EXCEPTION_IF_NULL(device_queue_ds_actor); | |||
| const auto &data_kernel = device_queue_ds_actor->data_kernel(); | |||
| MS_EXCEPTION_IF_NULL(data_kernel); | |||
| ofs << "\t\tdata_kernel_name:" << data_kernel->fullname_with_scope() | |||
| << "\tinput_number:" << AnfAlgo::GetInputTensorNum(data_kernel) | |||
| << "\toutput_number:" << AnfAlgo::GetOutputTensorNum(data_kernel) << "\n"; | |||
| for (size_t i = 0; i < AnfAlgo::GetOutputTensorNum(data_kernel); ++i) { | |||
| const auto &device_tensor = AnfAlgo::GetMutableOutputAddr(data_kernel, i, false); | |||
| MS_EXCEPTION_IF_NULL(device_tensor); | |||
| ofs << "\t\t\toutput_index:" << i << "\tptr:" << device_tensor->GetPtr() << "\tsize:" << device_tensor->GetSize() | |||
| << "\toriginal_ref_count:" << device_tensor->original_ref_count() << "\n "; | |||
| } | |||
| } else if (actor->type() == KernelTransformType::kHostDataSourceActor) { | |||
| // Dump the member info of host queue data source actor. | |||
| const auto &host_queue_ds_actor = dynamic_cast<const HostQueueDataSourceActor *>(actor); | |||
| MS_EXCEPTION_IF_NULL(host_queue_ds_actor); | |||
| ofs << "\t\tdata_nodes:" << host_queue_ds_actor->data_nodes().size() << "\n"; | |||
| for (size_t i = 0; i < host_queue_ds_actor->data_nodes().size(); ++i) { | |||
| const auto &data_node = host_queue_ds_actor->data_nodes()[i]; | |||
| MS_EXCEPTION_IF_NULL(data_node); | |||
| const auto &device_tensor = AnfAlgo::GetMutableOutputAddr(data_node, 0, false); | |||
| MS_EXCEPTION_IF_NULL(device_tensor); | |||
| ofs << "\t\t\tnode_order_number:" << i << "\tnode_name:" << data_node->fullname_with_scope() | |||
| << "\tptr:" << device_tensor->GetPtr() << "\tsize:" << device_tensor->GetSize() | |||
| << "\toriginal_ref_count:" << device_tensor->original_ref_count() << "\n"; | |||
| } | |||
| } | |||
| DumpAbstractActor(actor, ofs); | |||
| ofs << "\n"; | |||
| } | |||
| void DumpKernelActor(const KernelActor *actor, std::ofstream &ofs) { | |||
| MS_EXCEPTION_IF_NULL(actor); | |||
| ofs << "\tactor_name:" << actor->GetAID().Name() << "\n"; | |||
| const auto &kernel = actor->kernel(); | |||
| MS_EXCEPTION_IF_NULL(kernel); | |||
| ofs << "\t\tkernel_name:" << kernel->fullname_with_scope() << "\tinputs_num:" << AnfAlgo::GetInputTensorNum(kernel) | |||
| << "\toutputs_num:" << AnfAlgo::GetOutputTensorNum(kernel) << "\n"; | |||
| for (size_t i = 0; i < AnfAlgo::GetOutputTensorNum(kernel); ++i) { | |||
| const auto &device_tensor = AnfAlgo::GetMutableOutputAddr(kernel, i, false); | |||
| MS_EXCEPTION_IF_NULL(device_tensor); | |||
| ofs << "\t\t\toutput_index:" << i << "\tptr:" << device_tensor->GetPtr() << "\tsize:" << device_tensor->GetSize() | |||
| << "\toriginal_ref_count:" << device_tensor->original_ref_count() << "\n "; | |||
| } | |||
| DumpAbstractActor(actor, ofs); | |||
| ofs << "\n"; | |||
| } | |||
| void DumpSuperKernelActor(const SuperKernelActor *actor, std::ofstream &ofs) { | |||
| MS_EXCEPTION_IF_NULL(actor); | |||
| ofs << "\tactor_name:" << actor->GetAID().Name() << "\n"; | |||
| const auto &graph = actor->graph(); | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| ofs << "\t\tgraph_id:" << graph->graph_id() << "\tgraph_name:" << graph->ToString() | |||
| << "\tis_sink:" << graph->is_sink() << "\tinputs_num:" << (graph->input_nodes()).size() | |||
| << "\tkernels_num:" << (graph->execution_order()).size() << "\n"; | |||
| DumpAbstractActor(actor, ofs); | |||
| ofs << "\n"; | |||
| } | |||
| void DumpCopyActor(const CopyActor *actor, std::ofstream &ofs) { | |||
| MS_EXCEPTION_IF_NULL(actor); | |||
| ofs << "\tactor_name:" << actor->GetAID().Name() << "\n"; | |||
| auto device_tensor = actor->output(); | |||
| if (device_tensor != nullptr) { | |||
| ofs << "\t\toutput_index:" << 0 << "\tptr:" << device_tensor->GetPtr() << "\tsize:" << device_tensor->GetSize() | |||
| << "\toriginal_ref_count:" << device_tensor->original_ref_count() << "\n "; | |||
| } | |||
| DumpAbstractActor(actor, ofs); | |||
| ofs << "\n"; | |||
| } | |||
| void DumpGatherActor(const GatherActor *actor, std::ofstream &ofs) { | |||
| MS_EXCEPTION_IF_NULL(actor); | |||
| ofs << "\tactor_name:" << actor->GetAID().Name() << '\n'; | |||
| } | |||
| void DumpSwitchActor(const SwitchActor *actor, std::ofstream &ofs) { | |||
| MS_EXCEPTION_IF_NULL(actor); | |||
| ofs << "\tactor_name:" << actor->GetAID().Name() << '\n'; | |||
| } | |||
| } // namespace | |||
| void DumpDataPrepareActor(const DataPrepareActorPtr &actor, std::ofstream &ofs) { | |||
| ofs << "\n\n[Data prepare actor:" << (actor != nullptr ? 1 : 0) << "]\n"; | |||
| if (actor == nullptr) { | |||
| return; | |||
| } | |||
| ofs << "\tactor_name:" << actor->GetAID().Name() << "\n"; | |||
| DumpAbstractActor(actor.get(), ofs); | |||
| ofs << "\t\tcontinuous_memory_nodes:" << actor->continuous_memory_nodes().size() << "\n "; | |||
| for (const auto &iter : actor->continuous_memory_nodes()) { | |||
| MS_EXCEPTION_IF_NULL(iter.first.first); | |||
| MS_EXCEPTION_IF_NULL(iter.first.second); | |||
| ofs << "\t\t\tnode_name:" << iter.first.first->fullname_with_scope() | |||
| << "\tdevice_context:" << iter.first.second->device_context_key().ToString() | |||
| << "\tis_input_need:" << iter.second.first << "\tis_output_need:" << iter.second.second << "\n"; | |||
| } | |||
| } | |||
| void DumpLoopCountActor(const LoopCountActorPtr &actor, std::ofstream &ofs) { | |||
| ofs << "\n\n[Loop count actor:" << (actor != nullptr ? 1 : 0) << "]\n"; | |||
| if (actor == nullptr) { | |||
| return; | |||
| } | |||
| ofs << "\tactor_name:" << actor->GetAID().Name() << "\tloop_count:" << actor->loop_count() << "\n"; | |||
| DumpAbstractActor(actor.get(), ofs); | |||
| const size_t kOutputControlArrowsNum = 2; | |||
| ofs << "\t\toutput_control_arrows:" << kOutputControlArrowsNum << "\n "; | |||
| ofs << "\t\t\tto_actor_name:" << actor->output_aid().Name() << "\n"; | |||
| ofs << "\t\t\tto_actor_name:" << actor->data_prepare_aid().Name() << "\n"; | |||
| } | |||
| void DumpOutputActor(const OutputActorPtr &actor, std::ofstream &ofs) { | |||
| ofs << "\n\n[Output actor:" << (actor != nullptr ? 1 : 0) << "]\n"; | |||
| if (actor == nullptr) { | |||
| return; | |||
| } | |||
| ofs << "\tactor_name:" << actor->GetAID().Name() << "\tloop_count:" << actor->loop_count() | |||
| << "\toutputs_num:" << actor->outputs_num() << "\n"; | |||
| DumpAbstractActor(actor.get(), ofs); | |||
| ofs << "\t\tinput_result_arrows:" << actor->input_result_arrow_aids().size() << "\n "; | |||
| for (const auto &input_result_arrow_aid : actor->input_result_arrow_aids()) { | |||
| ofs << "\t\t\tfrom_actor_name:" << input_result_arrow_aid.Name() << "\n"; | |||
| } | |||
| } | |||
| void DumpDSActors(const std::vector<DataSourceActorPtr> &actors, std::ofstream &ofs) { | |||
| ofs << "\n\n[Data source actors:" << actors.size() << "]\n"; | |||
| for (const auto &data_source_actor : actors) { | |||
| DumpDSActor(data_source_actor.get(), ofs); | |||
| } | |||
| } | |||
| void DumpKernelActors(const std::vector<KernelActorPtr> &actors, std::ofstream &ofs) { | |||
| ofs << "\n\n[Kernel actors:" << actors.size() << "]\n"; | |||
| for (const auto &kernel_actor : actors) { | |||
| DumpKernelActor(kernel_actor.get(), ofs); | |||
| } | |||
| } | |||
| void DumpSuperKernelActors(const std::vector<SuperKernelActorPtr> &actors, std::ofstream &ofs) { | |||
| ofs << "\n\n[Super kernel actors:" << actors.size() << "]\n"; | |||
| for (const auto &super_kernel_actor : actors) { | |||
| DumpSuperKernelActor(super_kernel_actor.get(), ofs); | |||
| } | |||
| } | |||
| void DumpNoInputKernelActors(const std::vector<AbstractActorPtr> &actors, std::ofstream &ofs) { | |||
| ofs << "\n\n[No input kernel actors:" << actors.size() << "]\n"; | |||
| for (const auto &actor : actors) { | |||
| MS_EXCEPTION_IF_NULL(actor); | |||
| if (actor->type() == KernelTransformType::kKernelActor) { | |||
| auto kernel_actor = dynamic_cast<const KernelActor *>(actor.get()); | |||
| MS_EXCEPTION_IF_NULL(kernel_actor); | |||
| DumpKernelActor(kernel_actor, ofs); | |||
| } else if (actor->type() == KernelTransformType::kSuperKernelActor) { | |||
| auto super_kernel_actor = dynamic_cast<const SuperKernelActor *>(actor.get()); | |||
| MS_EXCEPTION_IF_NULL(super_kernel_actor); | |||
| DumpSuperKernelActor(super_kernel_actor, ofs); | |||
| } | |||
| } | |||
| } | |||
| void DumpCopyActors(const std::vector<CopyActorPtr> &actors, std::ofstream &ofs) { | |||
| ofs << "\n\n[Copy actors:" << actors.size() << "]\n"; | |||
| for (const auto ©_actor : actors) { | |||
| DumpCopyActor(copy_actor.get(), ofs); | |||
| } | |||
| } | |||
| void DumpGatherActors(const std::vector<GatherActorPtr> &actors, std::ofstream &ofs) { | |||
| ofs << "\n\n[Gather actors:" << actors.size() << "]\n"; | |||
| for (const auto &gather_actor : actors) { | |||
| DumpGatherActor(gather_actor.get(), ofs); | |||
| } | |||
| } | |||
| void DumpSwitchActors(const std::vector<SwitchActorPtr> &actors, std::ofstream &ofs) { | |||
| ofs << "\n\n[Switch actors:" << actors.size() << "]\n"; | |||
| for (const auto &switch_actor : actors) { | |||
| DumpSwitchActor(switch_actor.get(), ofs); | |||
| } | |||
| } | |||
| } // namespace runtime | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,51 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_ACTOR_DUMP_H_ | |||
| #define MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_ACTOR_DUMP_H_ | |||
| #include <vector> | |||
| #include <string> | |||
| #include <memory> | |||
| #include <utility> | |||
| #include <fstream> | |||
| #include "runtime/framework/actor/abstract_actor.h" | |||
| #include "runtime/framework/actor/data_prepare_actor.h" | |||
| #include "runtime/framework/actor/data_source_actor.h" | |||
| #include "runtime/framework/actor/loop_count_actor.h" | |||
| #include "runtime/framework/actor/kernel_actor.h" | |||
| #include "runtime/framework/actor/super_kernel_actor.h" | |||
| #include "runtime/framework/actor/output_actor.h" | |||
| #include "runtime/framework/actor/copy_actor.h" | |||
| #include "runtime/framework/actor/control_flow/switch_actor.h" | |||
| #include "runtime/framework/actor/control_flow/gather_actor.h" | |||
| namespace mindspore { | |||
| namespace runtime { | |||
| void DumpDataPrepareActor(const DataPrepareActorPtr &actor, std::ofstream &ofs); | |||
| void DumpLoopCountActor(const LoopCountActorPtr &actor, std::ofstream &ofs); | |||
| void DumpOutputActor(const OutputActorPtr &actor, std::ofstream &ofs); | |||
| void DumpDSActors(const std::vector<DataSourceActorPtr> &actors, std::ofstream &ofs); | |||
| void DumpKernelActors(const std::vector<KernelActorPtr> &actors, std::ofstream &ofs); | |||
| void DumpSuperKernelActors(const std::vector<SuperKernelActorPtr> &actors, std::ofstream &ofs); | |||
| void DumpNoInputKernelActors(const std::vector<AbstractActorPtr> &actors, std::ofstream &ofs); | |||
| void DumpCopyActors(const std::vector<CopyActorPtr> &actors, std::ofstream &ofs); | |||
| void DumpGatherActors(const std::vector<GatherActorPtr> &actors, std::ofstream &ofs); | |||
| void DumpSwitchActors(const std::vector<SwitchActorPtr> &actors, std::ofstream &ofs); | |||
| } // namespace runtime | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_ACTOR_DUMP_H_ | |||
| @@ -48,6 +48,8 @@ class CopyActor : public MemoryAwareActor { | |||
| // The copy processing after memory alloc finished. | |||
| void OnMemoryAllocFinish(OpContext<DeviceTensor> *const context) override; | |||
| const DeviceTensorPtr &output() const { return output_; } | |||
| protected: | |||
| void Run(OpContext<DeviceTensor> *const context) override; | |||
| void UpdateOutputData(OpData<DeviceTensor> *const output_data, const DataArrow *data_arrow, | |||
| @@ -61,6 +61,10 @@ class DataPrepareActor : public DebugAwareActor { | |||
| void SendMemoryAllocReq(OpContext<DeviceTensor> *const context) override; | |||
| void OnMemoryAllocFinish(OpContext<DeviceTensor> *const context) override; | |||
| const std::map<std::pair<CNodePtr, DeviceContext *>, std::pair<bool, bool>> &continuous_memory_nodes() const { | |||
| return continuous_memory_nodes_; | |||
| } | |||
| private: | |||
| friend class GraphScheduler; | |||
| @@ -89,6 +89,8 @@ class DeviceQueueDataSourceActor : public DataSourceActor { | |||
| void SendDebugReq(OpContext<DeviceTensor> *const context) override; | |||
| void OnDebugFinish(OpContext<DeviceTensor> *const context) override; | |||
| const CNodePtr &data_kernel() const { return data_kernel_; } | |||
| protected: | |||
| void FillDataBuffer() override; | |||
| void SendRecorderInfo(OpContext<DeviceTensor> *const context) const override; | |||
| @@ -73,6 +73,8 @@ class KernelActor : public DebugAwareActor { | |||
| // The callback after debug finished. | |||
| void OnDebugFinish(OpContext<DeviceTensor> *const context) override; | |||
| const CNodePtr &kernel() const { return kernel_; } | |||
| protected: | |||
| void Run(OpContext<DeviceTensor> *const context) override; | |||
| void SendRecorderInfo(OpContext<DeviceTensor> *const context) const override; | |||
| @@ -51,6 +51,11 @@ class LoopCountActor : public DebugAwareActor { | |||
| // The callback after debug finished. | |||
| void OnDebugFinish(OpContext<DeviceTensor> *const context) override; | |||
| // Get the member. | |||
| size_t loop_count() const { return loop_count_; } | |||
| const AID &data_prepare_aid() const { return data_prepare_aid_; } | |||
| const AID &output_aid() const { return output_aid_; } | |||
| protected: | |||
| void Run(OpContext<DeviceTensor> *const context) override; | |||
| void SendOutput(OpContext<DeviceTensor> *const context) override; | |||
| @@ -66,6 +66,10 @@ class OutputActor : public AbstractActor { | |||
| // context of tensor be rewritten in the next step or next loop. | |||
| void UpdateOutputDeviceAddress(); | |||
| // Get the member. | |||
| size_t loop_count() const { return loop_count_; } | |||
| size_t outputs_num() const { return outputs_num_; } | |||
| const std::vector<AID> &input_result_arrow_aids() const { return input_result_arrow_aids_; } | |||
| std::vector<TensorPtr> &outputs() { return outputs_; } | |||
| private: | |||
| @@ -42,6 +42,8 @@ class SuperKernelActor : public DebugAwareActor { | |||
| void Init() override; | |||
| const KernelGraphPtr &graph() const { return graph_; } | |||
| protected: | |||
| void Run(OpContext<DeviceTensor> *const context) override; | |||
| @@ -1479,7 +1479,7 @@ void GraphScheduler::LinkOutputResultArrowForOutputActor(OutputActor *to_actor, | |||
| } | |||
| auto op_arrow = std::make_shared<DataArrow>(output_with_index.second, to_actor->GetAID(), output_position); | |||
| (void)from_actor->output_result_arrows_.emplace_back(op_arrow); | |||
| (void)from_actor->output_nodes_.emplace_back(output_with_index.first); | |||
| (void)from_actor->output_result_nodes_.emplace_back(output_with_index.first); | |||
| (void)to_actor->input_result_arrow_aids_.emplace_back(from_actor->GetAID()); | |||
| // Update the real compute node in the host data source actor. | |||
| @@ -1488,7 +1488,7 @@ void GraphScheduler::LinkOutputResultArrowForOutputActor(OutputActor *to_actor, | |||
| MS_EXCEPTION_IF_NULL(host_queue_ds_actor); | |||
| auto position = host_queue_ds_actor->FetchNodePosition(output_with_index.first); | |||
| auto real_node = host_queue_ds_actor->FetchNode(position); | |||
| from_actor->output_nodes_[from_actor->output_nodes_.size() - 1] = real_node; | |||
| from_actor->output_result_nodes_[from_actor->output_result_nodes_.size() - 1] = real_node; | |||
| UpdateRefCount(real_node, output_with_index.second, true); | |||
| } | |||
| } | |||
| @@ -1757,277 +1757,22 @@ void GraphScheduler::DumpActor(const ActorSet *actor_set, const GraphCompilerInf | |||
| return; | |||
| } | |||
| ofs << "[Device tensor stores]\n"; | |||
| DumpDeviceTensorStore(graph_compiler_info, ofs); | |||
| const auto &data_prepare_actor = actor_set->data_prepare_actor_; | |||
| ofs << "\n\n[Data prepare actor:" << (data_prepare_actor != nullptr ? 1 : 0) << "]\n"; | |||
| if (data_prepare_actor != nullptr) { | |||
| DumpDataPrepareActor(data_prepare_actor.get(), ofs); | |||
| } | |||
| ofs << "\n\n[Data source actors:" << actor_set->data_source_actors_.size() << "]\n"; | |||
| for (const auto &data_source_actor : actor_set->data_source_actors_) { | |||
| DumpDSActor(data_source_actor.get(), ofs); | |||
| } | |||
| ofs << "\n\n[Kernel actors:" << actor_set->kernel_actors_.size() << "]\n"; | |||
| for (const auto &kernel_actor : actor_set->kernel_actors_) { | |||
| DumpKernelActor(kernel_actor.get(), ofs); | |||
| } | |||
| ofs << "\n\n[Super kernel actors:" << actor_set->super_kernel_actors_.size() << "]\n"; | |||
| for (const auto &super_kernel_actor : actor_set->super_kernel_actors_) { | |||
| DumpSuperKernelActor(super_kernel_actor.get(), ofs); | |||
| } | |||
| ofs << "\n\n[No input kernel actors:" << actor_set->no_input_kernel_actors_.size() << "]\n"; | |||
| for (const auto &no_input_kernel_actor : actor_set->no_input_kernel_actors_) { | |||
| DumpNoInputKernelActor(no_input_kernel_actor.get(), ofs); | |||
| } | |||
| ofs << "\n\n[Copy actors:" << actor_set->copy_actors_.size() << "]\n"; | |||
| for (const auto ©_actor : actor_set->copy_actors_) { | |||
| DumpCopyActor(copy_actor.get(), ofs); | |||
| } | |||
| ofs << "\n\n[Gather actors:" << actor_set->gather_actors_.size() << "]\n"; | |||
| for (const auto &gather_actor : actor_set->gather_actors_) { | |||
| DumpGatherActor(gather_actor.get(), ofs); | |||
| } | |||
| ofs << "\n\n[Switch actors:" << actor_set->switch_actors_.size() << "]\n"; | |||
| for (const auto &switch_actor : actor_set->switch_actors_) { | |||
| DumpSwitchActor(switch_actor.get(), ofs); | |||
| } | |||
| const auto &loop_count_actor = actor_set->loop_count_actor_; | |||
| ofs << "\n\n[Loop count actor:" << (loop_count_actor != nullptr ? 1 : 0) << "]\n"; | |||
| if (loop_count_actor != nullptr) { | |||
| DumpLoopCountActor(loop_count_actor.get(), ofs); | |||
| } | |||
| const auto &output_actor = actor_set->output_actor_; | |||
| ofs << "\n\n[Output actor:" << (output_actor != nullptr ? 1 : 0) << "]\n"; | |||
| if (output_actor != nullptr) { | |||
| DumpOutputActor(output_actor.get(), ofs); | |||
| } | |||
| } | |||
| void GraphScheduler::DumpAbstractActor(const AbstractActor *actor, std::ofstream &ofs) const { | |||
| MS_EXCEPTION_IF_NULL(actor); | |||
| if (actor->device_contexts_.size() > 0) { | |||
| ofs << "\t\tdevice_contexts:" << actor->device_contexts_.size() << "\n "; | |||
| for (const auto &device_context : actor->device_contexts_) { | |||
| if (device_context == nullptr) { | |||
| ofs << "\t\t\tdevice_context:" << device_context << "\n"; | |||
| continue; | |||
| } | |||
| ofs << "\t\t\tdevice_context:" << device_context->device_context_key().ToString() << "\n"; | |||
| } | |||
| } | |||
| if (actor->device_tensor_store_keys_.size() > 0) { | |||
| ofs << "\t\tdevice_tensor_store_keys:" << actor->device_tensor_store_keys_.size() << "\n "; | |||
| for (const auto &device_tensor_store_key : actor->device_tensor_store_keys_) { | |||
| MS_EXCEPTION_IF_NULL(device_tensor_store_key.second); | |||
| ofs << "\t\t\tto_input_index:" << device_tensor_store_key.first | |||
| << "\tfrom_node_name:" << device_tensor_store_key.second->fullname_with_scope() << "\n"; | |||
| } | |||
| } | |||
| if (actor->input_data_arrow_aids_.size() > 0) { | |||
| ofs << "\t\tinput_data_arrow_actors:" << actor->input_data_arrow_aids_.size() << "\n "; | |||
| for (const auto &input_data_arrow_aid : actor->input_data_arrow_aids_) { | |||
| ofs << "\t\t\tfrom_actor_name:" << input_data_arrow_aid.Name() << "\n"; | |||
| } | |||
| } | |||
| if (actor->input_control_arrow_aids_.size() > 0) { | |||
| ofs << "\t\tinput_control_arrow_actors:" << actor->input_control_arrow_aids_.size() << "\n "; | |||
| for (const auto &input_control_arrow_aid : actor->input_control_arrow_aids_) { | |||
| ofs << "\t\t\tfrom_actor_name:" << input_control_arrow_aid.Name() << "\n"; | |||
| } | |||
| } | |||
| const auto &output_data_arrows = actor->output_data_arrows(); | |||
| if (output_data_arrows.size() > 0) { | |||
| ofs << "\t\toutput_data_arrows:" << output_data_arrows.size() << "\n "; | |||
| for (const auto &data_arrow : output_data_arrows) { | |||
| MS_EXCEPTION_IF_NULL(data_arrow); | |||
| ofs << "\t\t\tfrom_output_index:" << data_arrow->from_output_index_ | |||
| << "\tto_actor_name:" << data_arrow->to_op_id_.Name() << "\tto_input_index:" << data_arrow->to_input_index_ | |||
| << "\n"; | |||
| } | |||
| } | |||
| const auto &output_control_arrows = actor->output_control_arrows(); | |||
| if (output_control_arrows.size() > 0) { | |||
| ofs << "\t\toutput_control_arrows:" << output_control_arrows.size() << "\n "; | |||
| for (const auto &aid : output_control_arrows) { | |||
| ofs << "\t\t\tto_actor_name:" << aid.Name() << "\n"; | |||
| } | |||
| } | |||
| if (actor->output_result_arrows_.size() != actor->output_nodes_.size()) { | |||
| MS_LOG(EXCEPTION) << "The size of output result arrows is not equal to the output nodes."; | |||
| } | |||
| if (actor->output_result_arrows_.size() > 0) { | |||
| ofs << "\t\toutput_result_arrows:" << actor->output_result_arrows_.size() << "\n "; | |||
| for (size_t i = 0; i < actor->output_result_arrows_.size(); ++i) { | |||
| auto result_arrow = actor->output_result_arrows_[i]; | |||
| auto output_node = actor->output_nodes_[i]; | |||
| MS_EXCEPTION_IF_NULL(result_arrow); | |||
| MS_EXCEPTION_IF_NULL(output_node); | |||
| ofs << "\t\t\tfrom_output_node:" << output_node->fullname_with_scope() | |||
| << "\tfrom_output_index:" << result_arrow->from_output_index_ | |||
| << "\tto_actor_name:" << result_arrow->to_op_id_.Name() | |||
| << "\toutput_node_position:" << result_arrow->to_input_index_ << "\n"; | |||
| } | |||
| } | |||
| } | |||
| void GraphScheduler::DumpDataPrepareActor(const DataPrepareActor *actor, std::ofstream &ofs) const { | |||
| MS_EXCEPTION_IF_NULL(actor); | |||
| ofs << "\tactor_name:" << actor->GetAID().Name() << "\n"; | |||
| DumpAbstractActor(actor, ofs); | |||
| ofs << "\t\tcontinuous_memory_nodes:" << actor->continuous_memory_nodes_.size() << "\n "; | |||
| for (const auto &iter : actor->continuous_memory_nodes_) { | |||
| MS_EXCEPTION_IF_NULL(iter.first.first); | |||
| MS_EXCEPTION_IF_NULL(iter.first.second); | |||
| ofs << "\t\t\tnode_name:" << iter.first.first->fullname_with_scope() | |||
| << "\tdevice_context:" << iter.first.second->device_context_key().ToString() | |||
| << "\tis_input_need:" << iter.second.first << "\tis_output_need:" << iter.second.second << "\n"; | |||
| } | |||
| } | |||
| void GraphScheduler::DumpDSActor(const DataSourceActor *actor, std::ofstream &ofs) const { | |||
| MS_EXCEPTION_IF_NULL(actor); | |||
| const auto &actor_name = actor->GetAID().Name(); | |||
| ofs << "\tactor_name:" << actor_name << "\n"; | |||
| if (actor->type_ == KernelTransformType::kDeviceDataSourceActor) { | |||
| // Dump the member info of device queue data source actor. | |||
| const auto &device_queue_ds_actor = dynamic_cast<const DeviceQueueDataSourceActor *>(actor); | |||
| MS_EXCEPTION_IF_NULL(device_queue_ds_actor); | |||
| const auto &data_kernel = device_queue_ds_actor->data_kernel_; | |||
| MS_EXCEPTION_IF_NULL(data_kernel); | |||
| ofs << "\t\tdata_kernel_name:" << data_kernel->fullname_with_scope() | |||
| << "\tinput_number:" << AnfAlgo::GetInputTensorNum(data_kernel) | |||
| << "\toutput_number:" << AnfAlgo::GetOutputTensorNum(data_kernel) << "\n"; | |||
| for (size_t i = 0; i < AnfAlgo::GetOutputTensorNum(data_kernel); ++i) { | |||
| const auto &device_tensor = AnfAlgo::GetMutableOutputAddr(data_kernel, i, false); | |||
| MS_EXCEPTION_IF_NULL(device_tensor); | |||
| ofs << "\t\t\toutput_index:" << i << "\tptr:" << device_tensor->GetPtr() << "\tsize:" << device_tensor->GetSize() | |||
| << "\toriginal_ref_count:" << device_tensor->original_ref_count() << "\n "; | |||
| } | |||
| } else if (actor->type_ == KernelTransformType::kHostDataSourceActor) { | |||
| // Dump the member info of host queue data source actor. | |||
| const auto &host_queue_ds_actor = dynamic_cast<const HostQueueDataSourceActor *>(actor); | |||
| MS_EXCEPTION_IF_NULL(host_queue_ds_actor); | |||
| ofs << "\t\tdata_nodes:" << host_queue_ds_actor->data_nodes_.size() << "\n"; | |||
| for (size_t i = 0; i < host_queue_ds_actor->data_nodes_.size(); ++i) { | |||
| const auto &data_node = host_queue_ds_actor->data_nodes_[i]; | |||
| MS_EXCEPTION_IF_NULL(data_node); | |||
| const auto &device_tensor = AnfAlgo::GetMutableOutputAddr(data_node, 0, false); | |||
| MS_EXCEPTION_IF_NULL(device_tensor); | |||
| ofs << "\t\t\tnode_order_number:" << i << "\tnode_name:" << data_node->fullname_with_scope() | |||
| << "\tptr:" << device_tensor->GetPtr() << "\tsize:" << device_tensor->GetSize() | |||
| << "\toriginal_ref_count:" << device_tensor->original_ref_count() << "\n"; | |||
| } | |||
| } | |||
| DumpAbstractActor(actor, ofs); | |||
| ofs << "\n"; | |||
| } | |||
| void GraphScheduler::DumpLoopCountActor(const LoopCountActor *actor, std::ofstream &ofs) const { | |||
| MS_EXCEPTION_IF_NULL(actor); | |||
| ofs << "\tactor_name:" << actor->GetAID().Name() << "\tloop_count:" << actor->loop_count_ << "\n"; | |||
| DumpAbstractActor(actor, ofs); | |||
| const size_t kOutputControlArrowsNum = 2; | |||
| ofs << "\t\toutput_control_arrows:" << kOutputControlArrowsNum << "\n "; | |||
| ofs << "\t\t\tto_actor_name:" << actor->output_aid_.Name() << "\n"; | |||
| ofs << "\t\t\tto_actor_name:" << actor->data_prepare_aid_.Name() << "\n"; | |||
| } | |||
| void GraphScheduler::DumpKernelActor(const KernelActor *actor, std::ofstream &ofs) const { | |||
| MS_EXCEPTION_IF_NULL(actor); | |||
| ofs << "\tactor_name:" << actor->GetAID().Name() << "\n"; | |||
| const auto &kernel = actor->kernel_; | |||
| MS_EXCEPTION_IF_NULL(kernel); | |||
| ofs << "\t\tkernel_name:" << kernel->fullname_with_scope() << "\tinputs_num:" << AnfAlgo::GetInputTensorNum(kernel) | |||
| << "\toutputs_num:" << AnfAlgo::GetOutputTensorNum(kernel) << "\n"; | |||
| for (size_t i = 0; i < AnfAlgo::GetOutputTensorNum(kernel); ++i) { | |||
| const auto &device_tensor = AnfAlgo::GetMutableOutputAddr(kernel, i, false); | |||
| MS_EXCEPTION_IF_NULL(device_tensor); | |||
| ofs << "\t\t\toutput_index:" << i << "\tptr:" << device_tensor->GetPtr() << "\tsize:" << device_tensor->GetSize() | |||
| << "\toriginal_ref_count:" << device_tensor->original_ref_count() << "\n "; | |||
| } | |||
| DumpAbstractActor(actor, ofs); | |||
| ofs << "\n"; | |||
| } | |||
| void GraphScheduler::DumpSuperKernelActor(const SuperKernelActor *actor, std::ofstream &ofs) const { | |||
| MS_EXCEPTION_IF_NULL(actor); | |||
| ofs << "\tactor_name:" << actor->GetAID().Name() << "\n"; | |||
| const auto &graph = actor->graph_; | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| ofs << "\t\tgraph_id:" << graph->graph_id() << "\tgraphl_name:" << graph->ToString() | |||
| << "\tis_sink:" << graph->is_sink() << "\tinputs_num:" << (graph->input_nodes()).size() | |||
| << "\tkernels_num:" << (graph->execution_order()).size() << "\n"; | |||
| DumpAbstractActor(actor, ofs); | |||
| ofs << "\n"; | |||
| } | |||
| void GraphScheduler::DumpNoInputKernelActor(const AbstractActor *actor, std::ofstream &ofs) const { | |||
| MS_EXCEPTION_IF_NULL(actor); | |||
| if (actor->type_ == KernelTransformType::kKernelActor) { | |||
| auto kernel_actor = dynamic_cast<const KernelActor *>(actor); | |||
| MS_EXCEPTION_IF_NULL(kernel_actor); | |||
| DumpKernelActor(kernel_actor, ofs); | |||
| } else if (actor->type_ == KernelTransformType::kSuperKernelActor) { | |||
| auto super_kernel_actor = dynamic_cast<const SuperKernelActor *>(actor); | |||
| MS_EXCEPTION_IF_NULL(super_kernel_actor); | |||
| DumpSuperKernelActor(super_kernel_actor, ofs); | |||
| } | |||
| } | |||
| void GraphScheduler::DumpOutputActor(const OutputActor *actor, std::ofstream &ofs) const { | |||
| MS_EXCEPTION_IF_NULL(actor); | |||
| ofs << "\tactor_name:" << actor->GetAID().Name() << "\tloop_count:" << actor->loop_count_ | |||
| << "\toutputs_num:" << actor->outputs_num_ << "\n"; | |||
| DumpAbstractActor(actor, ofs); | |||
| ofs << "\t\tinput_result_arrows:" << actor->input_result_arrow_aids_.size() << "\n "; | |||
| for (const auto &input_result_arrow_aid : actor->input_result_arrow_aids_) { | |||
| ofs << "\t\t\tfrom_actor_name:" << input_result_arrow_aid.Name() << "\n"; | |||
| } | |||
| } | |||
// Dump the copy actor: its output device tensor (only when already allocated) and the
// common abstract-actor info.
void GraphScheduler::DumpCopyActor(const CopyActor *actor, std::ofstream &ofs) const {
  MS_EXCEPTION_IF_NULL(actor);
  ofs << "\tactor_name:" << actor->GetAID().Name() << "\n";
  auto device_tensor = actor->output_;
  // A null output is legal here: the tensor may not have been created yet, so it is
  // skipped rather than treated as an error.
  if (device_tensor != nullptr) {
    ofs << "\t\toutput_index:" << 0 << "\tptr:" << device_tensor->GetPtr() << "\tsize:" << device_tensor->GetSize()
        << "\toriginal_ref_count:" << device_tensor->original_ref_count() << "\n ";
  }
  DumpAbstractActor(actor, ofs);
  ofs << "\n";
  // NOTE(review): everything below looks like a merge/extraction artifact — DumpCopyActor
  // is missing its closing brace before these calls, which appear to belong to the
  // actor-set dump routine (presumably DumpActor, where `actor_set` is in scope).
  // Confirm against the original file before building.
  DumpDataPrepareActor(actor_set->data_prepare_actor_, ofs);
  DumpDSActors(actor_set->data_source_actors_, ofs);
  DumpKernelActors(actor_set->kernel_actors_, ofs);
  DumpSuperKernelActors(actor_set->super_kernel_actors_, ofs);
  DumpNoInputKernelActors(actor_set->no_input_kernel_actors_, ofs);
  DumpCopyActors(actor_set->copy_actors_, ofs);
  DumpGatherActors(actor_set->gather_actors_, ofs);
  DumpSwitchActors(actor_set->switch_actors_, ofs);
  DumpLoopCountActor(actor_set->loop_count_actor_, ofs);
  DumpOutputActor(actor_set->output_actor_, ofs);
}
| void GraphScheduler::DumpDeviceTensorStore(const GraphCompilerInfo &graph_compiler_info, std::ofstream &ofs) const { | |||
| ofs << "[Device tensor stores]\n"; | |||
| for (const auto &graph : graph_compiler_info.graphs_) { | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| ofs << "\tgraph_id:" << graph->graph_id() << "\tis_sink:" << graph->is_sink() | |||
| @@ -2076,15 +1821,5 @@ void GraphScheduler::DumpDeviceTensorStore(const GraphCompilerInfo &graph_compil | |||
| ofs << "\n"; | |||
| } | |||
| } | |||
| void GraphScheduler::DumpGatherActor(const GatherActor *actor, std::ofstream &ofs) const { | |||
| MS_EXCEPTION_IF_NULL(actor); | |||
| ofs << "\tactor_name:" << actor->GetAID().Name() << '\n'; | |||
| } | |||
| void GraphScheduler::DumpSwitchActor(const SwitchActor *actor, std::ofstream &ofs) const { | |||
| MS_EXCEPTION_IF_NULL(actor); | |||
| ofs << "\tactor_name:" << actor->GetAID().Name() << '\n'; | |||
| } | |||
| } // namespace runtime | |||
| } // namespace mindspore | |||
| @@ -28,15 +28,7 @@ | |||
| #include <algorithm> | |||
| #include <fstream> | |||
| #include "runtime/framework/graph_compiler.h" | |||
| #include "runtime/framework/actor/data_prepare_actor.h" | |||
| #include "runtime/framework/actor/data_source_actor.h" | |||
| #include "runtime/framework/actor/loop_count_actor.h" | |||
| #include "runtime/framework/actor/kernel_actor.h" | |||
| #include "runtime/framework/actor/super_kernel_actor.h" | |||
| #include "runtime/framework/actor/output_actor.h" | |||
| #include "runtime/framework/actor/copy_actor.h" | |||
| #include "runtime/framework/actor/control_flow/switch_actor.h" | |||
| #include "runtime/framework/actor/control_flow/gather_actor.h" | |||
| #include "runtime/framework/actor/actor_dump.h" | |||
| #include "thread/actor_threadpool.h" | |||
| namespace mindspore { | |||
| @@ -223,17 +215,6 @@ class GraphScheduler { | |||
| // Display the actor information of corresponding kernel graph. | |||
| void DumpActor(const ActorSet *actor_set, const GraphCompilerInfo &graph_compiler_info) const; | |||
| void DumpAbstractActor(const AbstractActor *actor, std::ofstream &ofs) const; | |||
| void DumpDataPrepareActor(const DataPrepareActor *actor, std::ofstream &ofs) const; | |||
| void DumpDSActor(const DataSourceActor *actor, std::ofstream &ofs) const; | |||
| void DumpLoopCountActor(const LoopCountActor *actor, std::ofstream &ofs) const; | |||
| void DumpKernelActor(const KernelActor *actor, std::ofstream &ofs) const; | |||
| void DumpSuperKernelActor(const SuperKernelActor *actor, std::ofstream &ofs) const; | |||
| void DumpNoInputKernelActor(const AbstractActor *actor, std::ofstream &ofs) const; | |||
| void DumpOutputActor(const OutputActor *actor, std::ofstream &ofs) const; | |||
| void DumpCopyActor(const CopyActor *actor, std::ofstream &ofs) const; | |||
| void DumpGatherActor(const GatherActor *actor, std::ofstream &ofs) const; | |||
| void DumpSwitchActor(const SwitchActor *actor, std::ofstream &ofs) const; | |||
| void DumpDeviceTensorStore(const GraphCompilerInfo &graph_compiler_info, std::ofstream &ofs) const; | |||
| // The global maps, only be cleared in the deconstruction. | |||
| @@ -365,6 +365,7 @@ MindRTBackend::MindRTBackend(const std::string &backend_name, const std::string | |||
| const ActorInfo &MindRTBackend::CompileGraphs(const FuncGraphPtr &func_graph) { | |||
| MS_EXCEPTION_IF_NULL(graph_compiler_); | |||
| MS_EXCEPTION_IF_NULL(func_graph); | |||
| MS_LOG(INFO) << "Status record: start compile function graph: " << func_graph->ToString(); | |||
| auto root_graph = WrapPrimitives(func_graph); | |||
| MS_EXCEPTION_IF_NULL(root_graph); | |||
| root_graph_ = root_graph.get(); | |||
| @@ -403,6 +404,8 @@ const ActorInfo &MindRTBackend::CompileGraphs(const FuncGraphPtr &func_graph) { | |||
| MS_EXCEPTION_IF_NULL(graph_compiler_info); | |||
| const ActorInfo &actor_info = graph_compiler_info->name_; | |||
| (void)actor_to_graph_compiler_info_.emplace(graph_compiler_info->name_, std::move(graph_compiler_info)); | |||
| MS_LOG(INFO) << "Status record: end compile function graph: " << func_graph->ToString() | |||
| << ", produce actor: " << actor_info; | |||
| return actor_info; | |||
| } | |||
| @@ -787,7 +790,6 @@ void MindRTBackend::RunGraphBySingleOp(const std::vector<KernelGraphPtr> &graphs | |||
| } | |||
| void MindRTBackend::RunGraph(const ActorInfo &actor_info, const VectorRef &args, VectorRef *outputs) { | |||
| MS_LOG(INFO) << "Run actor begin, actor name: " << actor_info; | |||
| MS_EXCEPTION_IF_NULL(root_graph_); | |||
| if (IsGraphOutputValueNodeOrParameter(root_graph_->output(), args, outputs)) { | |||
| return; | |||
| @@ -800,6 +802,7 @@ void MindRTBackend::RunGraph(const ActorInfo &actor_info, const VectorRef &args, | |||
| return; | |||
| } | |||
| MS_LOG(INFO) << "Status record: start run actor: " << actor_info; | |||
| // Fetch the graph compiler info. | |||
| const auto &graph_iter = actor_to_graph_compiler_info_.find(actor_info); | |||
| if (graph_iter == actor_to_graph_compiler_info_.end()) { | |||
| @@ -837,6 +840,7 @@ void MindRTBackend::RunGraph(const ActorInfo &actor_info, const VectorRef &args, | |||
| // There will be more than one kernel graph in heterogeneous scenario in a ms function of PyNative Mode. | |||
| if (real_execution_mode_ == kPynativeMode) { | |||
| RunGraphBySingleOp(graph_compiler_info.graphs_, input_tensors, outputs); | |||
| MS_LOG(INFO) << "Status record: end run actor: " << actor_info; | |||
| return; | |||
| } | |||
| // Run actor DAG. | |||
| @@ -875,7 +879,7 @@ void MindRTBackend::RunGraph(const ActorInfo &actor_info, const VectorRef &args, | |||
| // Update device address for output node of graph. | |||
| actor_set->output_actor_->UpdateOutputDeviceAddress(); | |||
| MS_LOG(INFO) << "Run actor end, actor name: " << actor_info; | |||
| MS_LOG(INFO) << "Status record: end run actor: " << actor_info; | |||
| } | |||
| void MindRTBackend::ConstructOutputs(const AnfNodePtr &output_node, | |||