@@ -0,0 +1,85 @@
/**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <memory>
#include <shared_mutex>
#include "plugin/device/gpu/kernel/rl/batch_assign_gpu_kernel.h"
#include "kernel/common_utils.h"

namespace mindspore {
namespace kernel {
constexpr size_t kHalf = 2;
// Definition of the shared_mutex declared in the base class; it is shared by all kernel instances.
std::shared_mutex BatchAssignBaseMod::rw_mutex_;

BatchAssignKernelMod::BatchAssignKernelMod() : elements_num_(0), lock_(false) {}

bool BatchAssignKernelMod::Init(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  kernel_node_ = kernel_node;
  kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node);
  lock_ = GetAttr<bool>(kernel_node, "lock");
  size_t input_num = common::AnfAlgo::GetInputNum(kernel_node);
  elements_num_ = input_num / kHalf;
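  // The inputs come as two lists: the first half are the parameters to be updated, the second half the sources.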
  // Compute the byte size of each input. There are two input lists;
  // each list has the same number of elements, with matching shapes and types.
  for (size_t i = 0; i < elements_num_; i++) {
    auto type = AnfAlgo::GetInputDeviceDataType(kernel_node, i);
    size_t element_size = GetTypeByte(TypeIdToType(type));
    auto shape = AnfAlgo::GetInputDeviceShape(kernel_node, i);
    for (auto x : shape) {
      element_size *= LongToSize(x);
    }
    input_size_list_.push_back(element_size);
  }
  // Set the input sizes for the second input list (the same sizes as the first).
  for (size_t i = 0; i < elements_num_; i++) {
    input_size_list_.push_back(input_size_list_[i]);
  }
  // Set a single placeholder output.
  output_size_list_.push_back(sizeof(float));
  return true;
}

bool BatchAssignKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
                                  const std::vector<AddressPtr> &, void *stream) {
  auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
  // Use a shared lock for readers so that more than one reader can run at the same time.
  // Use an exclusive lock for the writer to ensure there is only one writer at a time.
  // Both locks are deferred so that whichever one is taken stays held until Launch returns;
  // it is released in the lock's destructor.
  std::unique_lock<std::shared_mutex> write_lock(rw_mutex_, std::defer_lock);
  std::shared_lock<std::shared_mutex> read_lock(rw_mutex_, std::defer_lock);
  if (lock_) {
    write_lock.lock();
  } else {
    read_lock.lock();
  }
  // We get two input lists: the first half are the weights to be updated, and the second half
  // are the sources. So we just copy each source to overwrite its destination.
  for (size_t i = 0; i < elements_num_; i++) {
    auto local_addr = GetDeviceAddress<int64_t>(inputs, i);
    auto source_addr = GetDeviceAddress<int64_t>(inputs, i + elements_num_);
    MS_ERROR_IF_NULL(local_addr);
    MS_ERROR_IF_NULL(source_addr);
    CHECK_CUDA_RET_WITH_EXCEPT(
      kernel_node_,
      cudaMemcpyAsync(local_addr, source_addr, input_size_list_[i], cudaMemcpyDeviceToDevice, cuda_stream),
      "Overwrite failed");
  }
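  // Synchronize the stream so all copies have completed before the lock is released and Launch returns.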
  CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaStreamSynchronize(cuda_stream),
                             "BatchAssignKernel cudaStreamSynchronized failed");
  return true;
}
}  // namespace kernel
}  // namespace mindspore