!14052 add op atomic clean to clear input addr in launch allreduce

From: @lvchangquan Reviewed-by: @kisnwang,@chujinjin Signed-off-by: @chujinjin
4 years ago · a48785cdcc
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_bucket.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_bucket.cc
@@ -27,6 +27,7 @@
 #include "runtime/device/kernel_runtime_manager.h"
 #include "runtime/device/ascend/ascend_event.h"
 #include "runtime/device/ascend/ascend_launch_mul.h"
 #include "runtime/device/ascend/ascend_launch_atomic_clean.h"
 #include "utils/profile.h"

 #define CHECK_ASCEND_RT_WITH_EXCEPTION(expression, message)    \
@@ -90,16 +91,18 @@ void AscendBucket::FreeAllDeviceMem() {
    ar_output_addr_ = nullptr;
  }
  // clear launch mul device Memory
  if (launch_kernel != nullptr) {
    launch_kernel->FreeLaunchDeviceMem();
  if (launch_mul_ != nullptr) {
    launch_mul_->FreeLaunchDeviceMem();
  }
  // clear launch atomic clean device Memory
  if (launch_atomic_clean_ != nullptr) {
    launch_atomic_clean_->FreeLaunchDeviceMem();
  }
 }

 void AscendBucket::CopyTensorToContiguousMemory() {
  // Clean input addr
  CHECK_ASCEND_RT_WITH_EXCEPTION(rtMemsetAsync(ar_input_addr_, total_size_, 0, total_size_, compute_stream_),
                                 "Call rtMemsetAsync failed");

  // clear allreduce input addr
  CleanAllReduceInputAddr();
  for (size_t i = 0; i < bucket_size_; ++i) {
    MS_EXCEPTION_IF_NULL(memcpy_input_addrs_[i]);
    MS_EXCEPTION_IF_NULL(memcpy_output_addrs_[i]);
@@ -151,15 +154,36 @@ void AscendBucket::LaunchAllReduce() {
  }
 }

 std::shared_ptr<LaunchKernel> AscendBucket::CreateLaunchKernel() {
 // Clears the all-reduce input buffer by running the atomic-clean launcher over ar_input_addr_.
 // The launcher is created on the first call and cached in launch_atomic_clean_ for later calls.
 void AscendBucket::CleanAllReduceInputAddr() {
  const bool launcher_missing = (launch_atomic_clean_ == nullptr);
  if (launcher_missing) {
    launch_atomic_clean_ = CreateLaunchAtomicClean();
    MS_EXCEPTION_IF_NULL(launch_atomic_clean_);
  }
  // Point the launcher at the current all-reduce input buffer, then run the clean kernel.
  launch_atomic_clean_->SetInputAddr(ar_input_addr_);
  launch_atomic_clean_->LaunchOpKernel();
 }

 // Creates the AscendLaunchMul helper for this bucket, using stream_, the first entry of
 // tensor_type_list_ as dtype and total_size_ as the buffer size.
 // Returns the new launcher; MS_EXCEPTION_IF_NULL guards the result.
 std::shared_ptr<LaunchKernel> AscendBucket::CreateLaunchMul() {
  // NOTE(review): an empty tensor_type_list_ is only logged here; tensor_type_list_[0] below is
  // still evaluated afterwards.
  if (tensor_type_list_.empty()) {
    MS_LOG(ERROR) << "tensor_type_list_ is empty";
  }
  // NOTE(review): the two consecutive declarations of launch_mul look like the before/after lines
  // of this change (4-argument vs. 3-argument constructor) -- confirm that only one of them is kept.
  auto launch_mul = std::make_shared<AscendLaunchMul>(stream_, tensor_type_list_[0], total_size_, ar_output_addr_);
  auto launch_mul = std::make_shared<AscendLaunchMul>(stream_, tensor_type_list_[0], total_size_);
  MS_EXCEPTION_IF_NULL(launch_mul);
  return launch_mul;
 }

 std::shared_ptr<LaunchKernel> AscendBucket::CreateLaunchAtomicClean() {
  if (tensor_type_list_.empty()) {
    MS_LOG(ERROR) << "tensor_type_list_ is empty";
  }
  auto launch_atomic_clean =
    std::make_shared<AscendLaunchAtomicClean>(compute_stream_, tensor_type_list_[0], total_size_);
  MS_EXCEPTION_IF_NULL(launch_atomic_clean);
  return launch_atomic_clean;
 }

 void AscendBucket::Init() {
  pre_event_ = std::make_shared<AscendEvent>();
  post_event_ = std::make_shared<AscendEvent>();
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_bucket.h
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_bucket.h
@@ -34,7 +34,9 @@ class AscendBucket : public Bucket {
  void FreeDeviceMem(void *dev_ptr) override;
  void CopyTensorToContiguousMemory() override;
  void LaunchAllReduce() override;
  std::shared_ptr<LaunchKernel> CreateLaunchKernel() override;
  std::shared_ptr<LaunchKernel> CreateLaunchMul() override;
  std::shared_ptr<LaunchKernel> CreateLaunchAtomicClean();
  void CleanAllReduceInputAddr();
 };
 }  // namespace mindspore::device::ascend
 #endif  // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_BUCKET_H_
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_launch_atomic_clean.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_launch_atomic_clean.cc
@@ -0,0 +1,114 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "runtime/device/ascend/ascend_launch_atomic_clean.h"

 #include <memory>
 #include <vector>
 #include "abstract/utils.h"
 #include "backend/session/single_kernel_graph.h"
 #include "backend/session/anf_runtime_algorithm.h"
 #include "debug/anf_ir_dump.h"

 namespace mindspore::device::ascend {
 // Releases `addr` by delegating to the AscendLaunchKernel implementation.
 void AscendLaunchAtomicClean::FreeDeviceMem(void *addr) {
  AscendLaunchKernel::FreeDeviceMem(addr);
 }

 // Returns `size` aligned the way AscendLaunchKernel aligns launch-kernel buffers.
 size_t AscendLaunchAtomicClean::AlignSizeForLaunchKernel(size_t size) {
  const size_t aligned_size = AscendLaunchKernel::AlignSizeForLaunchKernel(size);
  return aligned_size;
 }

 // Allocates `size` bytes by delegating to the AscendLaunchKernel implementation.
 uint8_t *AscendLaunchAtomicClean::AllocDeviceMem(size_t size) {
  uint8_t *device_ptr = AscendLaunchKernel::AllocDeviceMem(size);
  return device_ptr;
 }

 // Kernel selection for `kernel_graph`; no atomic-clean specific logic, the base class does the work.
 void AscendLaunchAtomicClean::KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) {
  // Forward unchanged to AscendLaunchKernel.
  AscendLaunchKernel::KernelSelect(kernel_graph);
 }

 // Kernel build for `kernel_graph`; no atomic-clean specific logic, the base class does the work.
 void AscendLaunchAtomicClean::KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) {
  // Forward unchanged to AscendLaunchKernel.
  AscendLaunchKernel::KernelBuild(kernel_graph);
 }

 void AscendLaunchAtomicClean::LaunchOpKernel() {
  if (atomic_clean_graph_ == nullptr) {
    // construct atomic clean kernel graph and set attr
    ConstructKernelGraphAndSetAttr();
    // kernel build
    KernelBuild(atomic_clean_graph_);
  }
  // obtain kernel_mod
  if (atomic_clean_graph_->execution_order().size() != 1) {
    MS_LOG(ERROR) << "the execution order of the atomic clean graph should have only one node";
  }
  kernel_mod_ = AnfAlgo::GetKernelMod(atomic_clean_graph_->execution_order()[0]);
  MS_EXCEPTION_IF_NULL(kernel_mod_);
  // obtain kernel inputs
  std::vector<kernel::AddressPtr> kernel_inputs;
  auto input = std::make_shared<kernel::Address>();
  MS_EXCEPTION_IF_NULL(input);
  input->addr = input_addr_;
  MS_EXCEPTION_IF_NULL(input->addr);
  input->size = total_size_;
  kernel_inputs.push_back(input);
  // obtain kernel outputs
  auto kernel_outputs = ObtainKernelOutputs(kernel_mod_->GetOutputSizeList());
  // obtain kernel workspace
  auto kernel_workspaces = ObtainKernelWorkspaces(kernel_mod_->GetWorkspaceSizeList());
  // launch
  auto ret_status = kernel_mod_->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_);
  if (!ret_status) {
    MS_LOG(ERROR) << "Launch single kernel failed.";
  }
 }

 // Resets the launcher's input pointer and hands output/workspace cleanup to
 // FreeOutputAndWorkspaceDeviceMem().
 void AscendLaunchAtomicClean::FreeLaunchDeviceMem() {
  // input_addr_ is only reset to null here; this function does not free the buffer it pointed to.
  input_addr_ = nullptr;
  FreeOutputAndWorkspaceDeviceMem();
 }

 std::shared_ptr<session::KernelGraph> AscendLaunchAtomicClean::ObtainAtomicCleanKernelGraph() {
  std::vector<TypeId> input_dtypes = {dtype_};
  std::vector<TypeId> output_dtypes = {};
  // obtain input & output shapes
  size_t dtype_size = abstract::TypeIdSize(dtype_);
  int64_t shape = total_size_ / dtype_size;
  std::vector<std::vector<int64_t>> input_shapes = {{shape}};
  std::vector<std::vector<size_t>> output_shapes = {};
  auto atomic_clean_graph = session::SingleKernelGraph::ConstructKernelGraphBasedOnSingleOp(
    kAtomicAddrCleanOpName, input_dtypes, input_shapes, output_dtypes, output_shapes);
  MS_EXCEPTION_IF_NULL(atomic_clean_graph);
  return atomic_clean_graph;
 }

 void AscendLaunchAtomicClean::ConstructKernelGraphAndSetAttr() {
  // construct atomic clean kernel graph
  atomic_clean_graph_ = ObtainAtomicCleanKernelGraph();
  MS_EXCEPTION_IF_NULL(atomic_clean_graph_);
  // set atomic clean attr
  if (!atomic_clean_graph_->execution_order().empty()) {
    auto clean_node = atomic_clean_graph_->execution_order()[0];
    // set abstract
    AbstractBasePtr abstract = std::make_shared<abstract::AbstractNone>();
    MS_EXCEPTION_IF_NULL(abstract);
    clean_node->set_abstract(abstract);
    // set build info
    auto builder = std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>();
    builder->SetKernelType(KernelType::TBE_KERNEL);
    AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), clean_node.get());
    // set attr
    std::vector<size_t> clean_size = {total_size_};
    AnfAlgo::SetNodeAttr(kAttrAtomicAddMemSize, MakeValue(clean_size), clean_node);
  }
 }
 }  // namespace mindspore::device::ascend
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_launch_atomic_clean.h
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_launch_atomic_clean.h
@@ -0,0 +1,57 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_LAUNCH_ATOMIC_CLEAN_H_
 #define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_LAUNCH_ATOMIC_CLEAN_H_

 #include <vector>
 #include <memory>
 #include "runtime/device/ascend/ascend_launch_kernel.h"

 namespace mindspore::device::ascend {
 class AscendLaunchAtomicClean : public AscendLaunchKernel {
 public:
  AscendLaunchAtomicClean(void *stream, TypeId dtype, size_t total_size)
      : AscendLaunchKernel(stream),
        dtype_(dtype),
        total_size_(total_size),
        atomic_clean_graph_(nullptr),
        input_addr_(nullptr) {}
  ~AscendLaunchAtomicClean() override = default;

  void SetInputAddr(uint8_t *input_addr) override { input_addr_ = input_addr; }
  void FreeDeviceMem(void *addr) override;
  size_t AlignSizeForLaunchKernel(size_t size) override;
  uint8_t *AllocDeviceMem(size_t size) override;
  void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) override;
  void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) override;

  void LaunchOpKernel() override;
  void FreeLaunchDeviceMem() override;

 protected:
  TypeId dtype_;
  size_t total_size_;
  std::shared_ptr<session::KernelGraph> atomic_clean_graph_;
  uint8_t *input_addr_;

 private:
  std::shared_ptr<session::KernelGraph> ObtainAtomicCleanKernelGraph();
  void ConstructKernelGraphAndSetAttr();
 };
 }  // namespace mindspore::device::ascend

 #endif  // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_LAUNCH_ATOMIC_CLEAN_H_
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_launch_kernel.h
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_launch_kernel.h
@@ -33,6 +33,7 @@ class AscendLaunchKernel : public LaunchKernel {
  void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) override;
  void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) override;

  void SetInputAddr(uint8_t *input_addr) override = 0;
  void LaunchOpKernel() override = 0;
  void FreeLaunchDeviceMem() override = 0;
 };
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_launch_mul.h
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_launch_mul.h
@@ -25,10 +25,11 @@
 namespace mindspore::device::ascend {
 class AscendLaunchMul : public AscendLaunchKernel, public LaunchMul {
 public:
  AscendLaunchMul(void *stream, TypeId dtype, size_t total_size, uint8_t *input1_addr)
      : AscendLaunchKernel(stream), LaunchMul(dtype, total_size, input1_addr) {}
  AscendLaunchMul(void *stream, TypeId dtype, size_t total_size)
      : AscendLaunchKernel(stream), LaunchMul(dtype, total_size) {}
  ~AscendLaunchMul() override = default;

  void SetInputAddr(uint8_t *input1_addr) override { input1_addr_ = input1_addr; }
  void FreeDeviceMem(void *addr) override;
  size_t AlignSizeForLaunchKernel(size_t size) override;
  uint8_t *AllocDeviceMem(size_t size) override;
--- a/mindspore/ccsrc/runtime/device/bucket.cc
+++ b/mindspore/ccsrc/runtime/device/bucket.cc
@@ -94,12 +94,16 @@ void Bucket::CalculateMean() {
  if (!grad_mean) {
    return;
  }
  launch_kernel = CreateLaunchKernel();
  MS_EXCEPTION_IF_NULL(launch_kernel);
  if (launch_mul_ == nullptr) {
    launch_mul_ = CreateLaunchMul();
    MS_EXCEPTION_IF_NULL(launch_mul_);
  }
  // set mul input1 addr
  launch_mul_->SetInputAddr(ar_output_addr_);
  // launch mean
  launch_kernel->LaunchOpKernel();
  launch_mul_->LaunchOpKernel();
  // store output tensor addr
  auto launch_output = launch_kernel->GetKernelOutputAddr();
  auto launch_output = launch_mul_->GetKernelOutputAddr();
  if (launch_output.size() != 1) {
    MS_LOG(ERROR) << "launch mul outputs should have one output";
  }
--- a/mindspore/ccsrc/runtime/device/bucket.h
+++ b/mindspore/ccsrc/runtime/device/bucket.h
@@ -38,7 +38,8 @@ class Bucket {
        compute_stream_(nullptr),
        pre_event_(nullptr),
        post_event_(nullptr),
        launch_kernel(nullptr),
        launch_mul_(nullptr),
        launch_atomic_clean_(nullptr),
        total_size_(0),
        ar_input_addr_(nullptr),
        ar_output_addr_(nullptr) {}
@@ -60,7 +61,8 @@ class Bucket {

  std::shared_ptr<DeviceEvent> pre_event_;
  std::shared_ptr<DeviceEvent> post_event_;
  std::shared_ptr<LaunchKernel> launch_kernel;
  std::shared_ptr<LaunchKernel> launch_mul_;
  std::shared_ptr<LaunchKernel> launch_atomic_clean_;

  size_t total_size_;
  uint8_t *ar_input_addr_;
@@ -77,7 +79,7 @@ class Bucket {
  virtual void AllocateAllReduceAddr() = 0;
  void UpdateTensorAddr();
  void CalculateMean();
  virtual std::shared_ptr<LaunchKernel> CreateLaunchKernel() = 0;
  virtual std::shared_ptr<LaunchKernel> CreateLaunchMul() = 0;
  virtual void LaunchAllReduce() = 0;
  virtual void FreeAllDeviceMem() = 0;
  virtual void FreeDeviceMem(void *dev_ptr) = 0;
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_bucket.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_bucket.cc
@@ -92,8 +92,8 @@ void GPUBucket::FreeAllDeviceMem() {
    ar_output_addr_ = nullptr;
  }
  // clear launch mul device memory
  if (launch_kernel != nullptr) {
    launch_kernel->FreeLaunchDeviceMem();
  if (launch_mul_ != nullptr) {
    launch_mul_->FreeLaunchDeviceMem();
  }
  MS_LOG(INFO) << "end";
 }
@@ -156,11 +156,11 @@ void GPUBucket::LaunchAllReduce() {
  MS_LOG(INFO) << "end";
 }

 std::shared_ptr<LaunchKernel> GPUBucket::CreateLaunchKernel() {
 std::shared_ptr<LaunchKernel> GPUBucket::CreateLaunchMul() {
  if (tensor_type_list_.empty()) {
    MS_LOG(ERROR) << "tensor_type_list_ is empty";
  }
  auto launch_mul = std::make_shared<GPULaunchMul>(stream_, tensor_type_list_[0], total_size_, ar_output_addr_);
  auto launch_mul = std::make_shared<GPULaunchMul>(stream_, tensor_type_list_[0], total_size_);
  MS_EXCEPTION_IF_NULL(launch_mul);
  return launch_mul;
 }
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_bucket.h
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_bucket.h
@@ -34,7 +34,7 @@ class GPUBucket : public Bucket {
  void FreeDeviceMem(void *dev_ptr) override;
  void CopyTensorToContiguousMemory() override;
  void LaunchAllReduce() override;
  std::shared_ptr<LaunchKernel> CreateLaunchKernel() override;
  std::shared_ptr<LaunchKernel> CreateLaunchMul() override;
  const void *collective_handle_;
 };
 }  // namespace mindspore::device::gpu
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_launch_kernel.h
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_launch_kernel.h
@@ -33,6 +33,7 @@ class GPULaunchkernel : public LaunchKernel {
  void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) override;
  void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) override;

  void SetInputAddr(uint8_t *input_addr) override = 0;
  void LaunchOpKernel() override = 0;
  void FreeLaunchDeviceMem() override = 0;
 };
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_launch_mul.h
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_launch_mul.h
@@ -25,10 +25,10 @@
 namespace mindspore::device::gpu {
 class GPULaunchMul : public GPULaunchkernel, public LaunchMul {
 public:
  GPULaunchMul(void *stream, TypeId dtype, size_t total_size, uint8_t *input1_addr)
      : GPULaunchkernel(stream), LaunchMul(dtype, total_size, input1_addr) {}
  GPULaunchMul(void *stream, TypeId dtype, size_t total_size) : GPULaunchkernel(stream), LaunchMul(dtype, total_size) {}
  ~GPULaunchMul() override = default;

  void SetInputAddr(uint8_t *input1_addr) override { input1_addr_ = input1_addr; }
  void FreeDeviceMem(void *addr) override;
  size_t AlignSizeForLaunchKernel(size_t size) override;
  uint8_t *AllocDeviceMem(size_t size) override;
--- a/mindspore/ccsrc/runtime/device/launch_kernel.cc
+++ b/mindspore/ccsrc/runtime/device/launch_kernel.cc
@@ -83,7 +83,7 @@ void LaunchKernel::LaunchSingleKernel(const std::vector<uint8_t *> &inputs_addr)
  // launch
  auto ret_status = kernel_mod_->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_);
  if (!ret_status) {
    MS_LOG(ERROR) << "Launch mul kernel failed.";
    MS_LOG(ERROR) << "Launch single kernel failed.";
  }
 }

--- a/mindspore/ccsrc/runtime/device/launch_kernel.h
+++ b/mindspore/ccsrc/runtime/device/launch_kernel.h
@@ -37,6 +37,7 @@ class LaunchKernel {
  virtual void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) = 0;
  virtual void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) = 0;

  virtual void SetInputAddr(uint8_t *input_addr) = 0;
  virtual void LaunchOpKernel() = 0;
  virtual void FreeLaunchDeviceMem() = 0;

@@ -46,7 +47,6 @@ class LaunchKernel {
  std::vector<uint8_t *> outputs_addr_;
  std::vector<uint8_t *> workspaces_addr_;

 private:
  std::vector<kernel::AddressPtr> ObtainKernelAddress(const std::vector<size_t> &list, std::vector<uint8_t *> *addr);
  std::vector<kernel::AddressPtr> ObtainKernelInputs(const std::vector<size_t> &inputs_list,
                                                     const std::vector<uint8_t *> &inputs_addr);
--- a/mindspore/ccsrc/runtime/device/launch_mul.h
+++ b/mindspore/ccsrc/runtime/device/launch_mul.h
@@ -24,10 +24,10 @@
 namespace mindspore::device {
 class LaunchMul {
 public:
  LaunchMul(TypeId dtype, size_t total_size, uint8_t *input1_addr)
  LaunchMul(TypeId dtype, size_t total_size)
      : dtype_(dtype),
        total_size_(total_size),
        input1_addr_(input1_addr),
        input1_addr_(nullptr),
        input2_addr_(nullptr),
        input2_value_(0),
        mul_graph_(nullptr) {}
--- a/tests/ut/cpp/CMakeLists.txt
+++ b/tests/ut/cpp/CMakeLists.txt
@@ -106,6 +106,7 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
        "../../../mindspore/ccsrc/runtime/device/ascend/kernel_select_ascend.cc"
        "../../../mindspore/ccsrc/runtime/device/ascend/ascend_launch_kernel.cc"
        "../../../mindspore/ccsrc/runtime/device/ascend/ascend_launch_mul.cc"
        "../../../mindspore/ccsrc/runtime/device/ascend/ascend_launch_atomic_clean.cc"
        "../../../mindspore/ccsrc/runtime/device/ascend/kernel_select_graph_kernel.cc"
        "../../../mindspore/ccsrc/runtime/device/convert_tensor_utils.cc"
        "../../../mindspore/ccsrc/runtime/device/ascend/ascend_bucket.cc"