You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

gpu_bucket.cc 7.0 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188
  1. /**
  2. * Copyright 2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
#include "runtime/device/gpu/gpu_bucket.h"

#include <cuda_runtime_api.h>
#include <nccl.h>

#include <algorithm>
#include <memory>
#include <vector>

#include "abstract/utils.h"
#include "runtime/device/gpu/gpu_event.h"
#include "runtime/device/gpu/gpu_memory_allocator.h"
#include "runtime/device/gpu/gpu_device_manager.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "runtime/device/gpu/distribution/collective_init.h"
#include "runtime/device/gpu/gpu_launch_mul.h"
#include "backend/kernel_compiler/gpu/nccl/nccl_gpu_kernel.h"
#include "runtime/device/gpu/gpu_common.h"
  30. namespace {
  31. const size_t kCommunicationMemAlignSize = 16;
  32. size_t AlignMemorySize(size_t size) {
  33. if (size == 0) {
  34. return kCommunicationMemAlignSize;
  35. }
  36. return ((size + kCommunicationMemAlignSize - 1) / kCommunicationMemAlignSize) * kCommunicationMemAlignSize;
  37. }
  38. } // namespace
  39. namespace mindspore::device::gpu {
// Construct a bucket that fuses `bucket_size` gradient tensors into a single
// collective all-reduce; communication runs over the NCCL world group.
// The collective library handle is resolved lazily in LaunchAllReduce().
GPUBucket::GPUBucket(uint32_t id, uint32_t bucket_size) : Bucket(id, bucket_size), collective_handle_(nullptr) {
  group_ = kNcclWorldGroup;
}
  43. void GPUBucket::AllocateAllReduceAddr() {
  44. MS_LOG(INFO) << "start";
  45. if (grad_tensor_list_.size() != bucket_size_) {
  46. MS_LOG(EXCEPTION) << "grad tensor list size:" << grad_tensor_list_.size()
  47. << " is not equal to bucket size:" << bucket_size_;
  48. }
  49. auto total_size = 0;
  50. std::vector<size_t> size_list;
  51. for (auto &tensor : grad_tensor_list_) {
  52. MS_EXCEPTION_IF_NULL(tensor);
  53. tensor_type_list_.emplace_back(tensor->data_type());
  54. DeviceAddressPtr device_address = std::dynamic_pointer_cast<DeviceAddress>(tensor->device_address());
  55. MS_EXCEPTION_IF_NULL(device_address);
  56. auto origin_size = device_address->GetSize();
  57. auto align_size = AlignMemorySize(origin_size);
  58. size_list.emplace_back(origin_size);
  59. align_size_list_.emplace_back(align_size);
  60. total_size += align_size;
  61. memcpy_input_addrs_.emplace_back(
  62. std::make_shared<kernel::Address>(static_cast<uint8_t *>(device_address->GetMutablePtr()), origin_size));
  63. }
  64. total_size_ = total_size;
  65. ar_input_addr_ = static_cast<uint8_t *>(GPUMemoryAllocator::GetInstance().AllocTensorMem(total_size));
  66. ar_output_addr_ = static_cast<uint8_t *>(GPUMemoryAllocator::GetInstance().AllocTensorMem(total_size));
  67. uint8_t *memcpy_output = ar_input_addr_;
  68. for (size_t i = 0; i < bucket_size_; ++i) {
  69. memcpy_output_addrs_.emplace_back(std::make_shared<kernel::Address>(memcpy_output, size_list[i]));
  70. memcpy_output += align_size_list_[i];
  71. }
  72. MS_LOG(INFO) << "end";
  73. }
// Return one device allocation to the GPU memory pool (thin wrapper over the
// allocator singleton).
void GPUBucket::FreeDeviceMem(void *dev_ptr) { GPUMemoryAllocator::GetInstance().FreeTensorMem(dev_ptr); }
  75. void GPUBucket::FreeAllDeviceMem() {
  76. MS_LOG(INFO) << "start";
  77. if (ar_input_addr_ != nullptr) {
  78. FreeDeviceMem(ar_input_addr_);
  79. ar_input_addr_ = nullptr;
  80. }
  81. if (ar_output_addr_ != nullptr) {
  82. FreeDeviceMem(ar_output_addr_);
  83. ar_output_addr_ = nullptr;
  84. }
  85. // clear launch mul device memory
  86. if (launch_mul_ != nullptr) {
  87. launch_mul_->FreeLaunchDeviceMem();
  88. }
  89. MS_LOG(INFO) << "end";
  90. }
  91. void GPUBucket::CopyTensorToContiguousMemory() {
  92. MS_LOG(INFO) << "start";
  93. MS_EXCEPTION_IF_NULL(compute_stream_);
  94. // Clean allreduce input
  95. CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(
  96. cudaMemsetAsync(ar_input_addr_, 0, total_size_, static_cast<cudaStream_t>(compute_stream_)),
  97. "Call cudaMemsetAsync failed");
  98. for (size_t i = 0; i < bucket_size_; ++i) {
  99. MS_EXCEPTION_IF_NULL(memcpy_output_addrs_[i]);
  100. MS_EXCEPTION_IF_NULL(memcpy_input_addrs_[i]);
  101. if (!GPUDeviceManager::GetInstance().CopyDeviceMemToDeviceAsync(memcpy_output_addrs_[i]->addr,
  102. memcpy_input_addrs_[i]->addr,
  103. memcpy_output_addrs_[i]->size, compute_stream_)) {
  104. MS_LOG(EXCEPTION) << "Copy memory failed";
  105. }
  106. }
  107. MS_LOG(INFO) << "end";
  108. }
  109. void GPUBucket::LaunchAllReduce() {
  110. MS_LOG(INFO) << "start";
  111. collective_handle_ = device::gpu::CollectiveInitializer::instance().collective_handle();
  112. auto all_reduce_funcptr =
  113. reinterpret_cast<kernel::AllReduce>(dlsym(const_cast<void *>(collective_handle_), "AllReduce"));
  114. MS_EXCEPTION_IF_NULL(all_reduce_funcptr);
  115. MS_EXCEPTION_IF_NULL(stream_);
  116. if (tensor_type_list_.empty()) {
  117. MS_LOG(EXCEPTION) << "No tesnor type found";
  118. }
  119. auto type = tensor_type_list_[0];
  120. if (std::any_of(tensor_type_list_.begin(), tensor_type_list_.end(),
  121. [&type](TypeId tensor_type) { return type != tensor_type; })) {
  122. MS_LOG(EXCEPTION) << "AllReduce input have different dtype";
  123. }
  124. auto type_size = abstract::TypeIdSize(type);
  125. if (type_size == 0) {
  126. MS_LOG(EXCEPTION) << "Invalid type:" << type;
  127. }
  128. // typeid to nccl_data_type
  129. auto nccl_data_type_iter = kernel::kNcclDtypeMap.find(TypeIdLabel(type));
  130. if (nccl_data_type_iter == kernel::kNcclDtypeMap.end()) {
  131. MS_LOG(EXCEPTION) << "Invalid type:" << type;
  132. }
  133. auto nccl_result =
  134. (*all_reduce_funcptr)(ar_input_addr_, ar_output_addr_, total_size_ / type_size, nccl_data_type_iter->second,
  135. ncclRedOp_t::ncclSum, static_cast<cudaStream_t>(stream_), group_);
  136. if (nccl_result != ncclSuccess) {
  137. MS_LOG(EXCEPTION) << "AllReduce failed, ret:" << nccl_result;
  138. }
  139. MS_LOG(INFO) << "end";
  140. }
  141. std::shared_ptr<LaunchKernel> GPUBucket::CreateLaunchMul() {
  142. if (tensor_type_list_.empty()) {
  143. MS_LOG(ERROR) << "tensor_type_list_ is empty";
  144. }
  145. auto launch_mul = std::make_shared<GPULaunchMul>(stream_, tensor_type_list_[0], total_size_);
  146. MS_EXCEPTION_IF_NULL(launch_mul);
  147. return launch_mul;
  148. }
  149. void GPUBucket::Init(const std::vector<void *> &compute_streams, const std::vector<void *> &communication_streams) {
  150. pre_event_ = std::make_shared<GpuEvent>();
  151. post_event_ = std::make_shared<GpuEvent>();
  152. if (!compute_streams.empty()) {
  153. compute_stream_ = compute_streams.front();
  154. }
  155. if (!communication_streams.empty()) {
  156. stream_ = communication_streams.front();
  157. }
  158. MS_EXCEPTION_IF_NULL(compute_stream_);
  159. MS_EXCEPTION_IF_NULL(stream_);
  160. MS_EXCEPTION_IF_NULL(pre_event_);
  161. MS_EXCEPTION_IF_NULL(post_event_);
  162. pre_event_->set_record_stream(compute_stream_);
  163. pre_event_->set_wait_stream(stream_);
  164. post_event_->set_record_stream(stream_);
  165. post_event_->set_wait_stream(compute_stream_);
  166. }
  167. } // namespace mindspore::device::gpu