@@ -21,6 +21,7 @@
 #include <utility>
 #include <functional>
 #include <unordered_map>
+#include <set>
 #include "kernel/kernel.h"
 #include "device/cpu/cpu_device_address.h"
 #include "utils/context/ms_context.h"
@@ -139,8 +140,12 @@ DeviceAddressPtr CPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t
   return std::make_shared<CPUDeviceAddress>(device_ptr, device_size, format, type_id);
 }
 
-BaseRef CPUKernelRuntime::CreatTensorForOutput(const AnfNodePtr &input_node, size_t index,
-                                               const std::unordered_map<AnfNode *, tensor::TensorPtr> &input_map) {
+BaseRef CPUKernelRuntime::CreatTensorForOutput(const session::KernelWithIndex &kernel_with_index,
+                                               const std::unordered_map<AnfNode *, tensor::TensorPtr> &input_map,
+                                               std::set<DeviceAddressPtr> *bound_addresses,
+                                               std::vector<tensor::TensorPtr> *need_sync_outputs) {
+  auto &input_node = kernel_with_index.first;
+  auto index = kernel_with_index.second;
   MS_EXCEPTION_IF_NULL(input_node);
   if (input_node->isa<CNode>() && AnfAlgo::GetCNodeName(input_node) == prim::kPrimMakeTuple->name()) {
     auto cnode = input_node->cast<CNodePtr>();
@@ -148,7 +153,7 @@ BaseRef CPUKernelRuntime::CreatTensorForOutput(const AnfNodePtr &input_node, siz
     VectorRef ret;
     for (size_t i = 1; i < cnode->inputs().size(); i++) {
       auto item_with_index = AnfAlgo::VisitKernelWithReturnType(cnode->input(i), 0);
-      auto out = CreatTensorForOutput(item_with_index.first, item_with_index.second, input_map);
+      auto out = CreatTensorForOutput(item_with_index, input_map, bound_addresses, need_sync_outputs);
       ret.push_back(out);
     }
     return ret;
@@ -169,11 +174,13 @@ BaseRef CPUKernelRuntime::CreatTensorForOutput(const AnfNodePtr &input_node, siz
     type_id = GetCPUSupportOutputTypeId(type_id);
     tensor::TensorPtr tensor = std::make_shared<tensor::Tensor>(type_id, temp_shape);
     MS_EXCEPTION_IF_NULL(tensor);
-    if (address->ref_count_ > 0 && address->ptr_ != nullptr) {
+    if (bound_addresses->find(address) != bound_addresses->end()) {
       tensor->set_device_address(address);
+      need_sync_outputs->emplace_back(tensor);
     } else {
       address->ptr_ = tensor->data_c(true);
       address->ref_count_ = INIT_NODE_REF;
+      (void)bound_addresses->insert(address);
     }
     tensor->set_dirty(false);
     return tensor;
@@ -187,7 +194,8 @@ BaseRef CPUKernelRuntime::CreatTensorForOutput(const AnfNodePtr &input_node, siz
 }
 
 void CPUKernelRuntime::BindInputOutput(const session::KernelGraph *kernel_graph,
-                                       const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) {
+                                       const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs,
+                                       std::vector<tensor::TensorPtr> *need_sync_outputs) {
   MS_EXCEPTION_IF_NULL(kernel_graph);
   MS_EXCEPTION_IF_NULL(outputs);
   // bind input ptr
@@ -195,10 +203,8 @@ void CPUKernelRuntime::BindInputOutput(const session::KernelGraph *kernel_graph,
   if (input_nodes.size() != inputs.size()) {
     MS_LOG(EXCEPTION) << "Input size not equal to input node size!";
   }
-
   std::unordered_map<AnfNode *, tensor::TensorPtr> input_map;
   size_t input_idx = 0;
-  size_t type_size = sizeof(float);
   for (auto &item : input_nodes) {
     MS_EXCEPTION_IF_NULL(item);
     input_map[item.get()] = inputs[input_idx];
@@ -212,7 +218,8 @@ void CPUKernelRuntime::BindInputOutput(const session::KernelGraph *kernel_graph,
         (void)tensor->data_sync();
       }
       std::vector<int> data_shape = tensor->shape();
-      size_t tensor_size = std::accumulate(data_shape.begin(), data_shape.end(), type_size, std::multiplies<size_t>());
+      size_t tensor_size =
+        std::accumulate(data_shape.begin(), data_shape.end(), sizeof(float), std::multiplies<size_t>());
       if (tensor->data_type() == kNumberTypeFloat32 || tensor->data_type() == kNumberTypeInt32) {
         address->ptr_ = tensor->data_c(false);
       } else {
@@ -223,18 +230,17 @@ void CPUKernelRuntime::BindInputOutput(const session::KernelGraph *kernel_graph,
         }
         tensor->set_dirty(true);
       }
-
       address->ref_count_ = INIT_NODE_REF;
       tensor->set_device_address(address);
     }
     input_idx++;
   }
-
   // new output and bind ptr
+  std::set<DeviceAddressPtr> bound_addresses;
   auto output_nodes = kernel_graph->outputs();
   for (const auto &item : output_nodes) {
     auto item_with_index = AnfAlgo::VisitKernelWithReturnType(item, 0, true);
-    auto out = CreatTensorForOutput(item_with_index.first, item_with_index.second, input_map);
+    auto out = CreatTensorForOutput(item_with_index, input_map, &bound_addresses, need_sync_outputs);
     outputs->push_back(std::move(out));
   }
 }
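Note (not part of the patch): the core change above is that CreatTensorForOutput no longer infers aliasing from address->ref_count_; the caller threads a std::set of already-bound device addresses through the recursion. The first output tensor to meet an address takes ownership of its buffer, while any later tensor that resolves to the same address only keeps a reference and is queued in need_sync_outputs so its data can be copied out after the kernels run. A minimal standalone sketch of that bookkeeping, using invented stand-in types rather than MindSpore's DeviceAddressPtr/TensorPtr, might look like:

#include <cstddef>
#include <iostream>
#include <memory>
#include <set>
#include <vector>

// Hypothetical stand-ins for the runtime's device address and host tensor.
struct DeviceAddress { void *ptr = nullptr; };
using DeviceAddressPtr = std::shared_ptr<DeviceAddress>;

struct Tensor {
  std::vector<float> host;          // host-side buffer
  DeviceAddressPtr device_address;  // set when the tensor only references an existing buffer
};
using TensorPtr = std::shared_ptr<Tensor>;

// Bind one output: the first tensor to see an address owns its memory;
// later tensors just reference it and are queued for a post-run sync.
TensorPtr BindOutput(const DeviceAddressPtr &address, size_t elem_count,
                     std::set<DeviceAddressPtr> *bound_addresses,
                     std::vector<TensorPtr> *need_sync_outputs) {
  auto tensor = std::make_shared<Tensor>();
  tensor->host.resize(elem_count);
  if (bound_addresses->count(address) > 0) {
    // Address already backs another output: remember the tensor, sync it later.
    tensor->device_address = address;
    need_sync_outputs->push_back(tensor);
  } else {
    // First user: point the device address at this tensor's host buffer.
    address->ptr = tensor->host.data();
    bound_addresses->insert(address);
  }
  return tensor;
}

int main() {
  auto addr = std::make_shared<DeviceAddress>();
  std::set<DeviceAddressPtr> bound;
  std::vector<TensorPtr> need_sync;

  auto out1 = BindOutput(addr, 4, &bound, &need_sync);  // owns the buffer
  auto out2 = BindOutput(addr, 4, &bound, &need_sync);  // aliases out1, queued for sync

  std::cout << "owner buffer bound: " << (addr->ptr == out1->host.data()) << "\n";
  std::cout << "tensors needing post-run sync: " << need_sync.size() << "\n";
  return 0;
}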