// zzy34407230 / mindspore2022

 
			
			   
				 
					
						
						
							
							/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "runtime/device/bucket.h"

#include "runtime/device/kernel_runtime_manager.h"
#include "frontend/parallel/context.h"
#include "utils/profile.h"

namespace mindspore::device {
// Appends one gradient tensor to this bucket.
//
// Raises an exception through MS_LOG(EXCEPTION) when the bucket already holds bucket_size_ tensors.
// When the insertion brings the number of stored tensors up to bucket_size_, full_ is set to true.
//
// tensor: gradient tensor to store. It is stored as given; no null check is performed here.
void Bucket::AddGradTensor(const tensor::TensorPtr &tensor) {
  // Rejecting the insertion up front means the list can never grow past bucket_size_, so no second
  // overflow check is required after the emplace_back below.
  if (grad_tensor_list_.size() >= bucket_size_) {
    MS_LOG(EXCEPTION) << "bucket is full";
  }
  grad_tensor_list_.emplace_back(tensor);
  MS_LOG(INFO) << "current bucket tensors size:" << grad_tensor_list_.size();
  // The last free slot has just been taken: flag the bucket as full.
  // NOTE(review): presumably the owner of the bucket polls this flag to decide when to call Launch() - confirm.
  if (grad_tensor_list_.size() == bucket_size_) {
    full_ = true;
  }
}

void Bucket::Launch() {
  auto start = GetTime();
  if (grad_tensor_list_.size() != bucket_size_) {
    MS_LOG(EXCEPTION) << "Bucket is not full, grad_tensor_list_ size:" << grad_tensor_list_.size()
                      << " bucket_size_:" << bucket_size_;
  }
  MS_LOG(INFO) << "Bucket is full, start to launch AllReduce";
  MS_EXCEPTION_IF_NULL(pre_event_);
  MS_EXCEPTION_IF_NULL(post_event_);
  AllocateAllReduceMemory();
  CopyTensorToContiguousMemory();
  pre_event_->RecordEvent();
  pre_event_->WaitEvent();
  LaunchAllReduce();
  post_event_->RecordEvent();
  UpdateTensorAddr();
  // pass event to the tensor
  for (auto &tensor : grad_tensor_list_) {
    MS_EXCEPTION_IF_NULL(tensor);
    tensor->SetDeviceEvent(post_event_);
  }
  MS_LOG(INFO) << "Bucket launch cost:" << (GetTime() - start) * 1e6 << " us";
}

void Bucket::AllocateAllReduceMemory() {
  // Check bucket is full
  if (grad_tensor_list_.size() != bucket_size_) {
    MS_LOG(EXCEPTION) << "Grad tensor list size:" << grad_tensor_list_.size()
                      << " is not equal to bucket size:" << bucket_size_;
  }

  size_t total_size = 0;
  std::vector<size_t> origin_size_list;
  for (auto &tensor : grad_tensor_list_) {
    MS_EXCEPTION_IF_NULL(tensor);
    tensor_type_list_.emplace_back(tensor->data_type());
    DeviceAddressPtr device_address = std::dynamic_pointer_cast<DeviceAddress>(tensor->device_address());
    MS_EXCEPTION_IF_NULL(device_address);
    auto origin_size = device_address->GetSize();
    auto align_size = MemoryManager::GetCommonAlignSize(origin_size);
    origin_size_list.emplace_back(origin_size);
    align_size_list_.emplace_back(align_size);
    total_size += align_size;
    memcpy_input_addrs_.emplace_back(std::make_shared<kernel::Address>(
      static_cast<uint8_t *>(device_address->GetMutablePtr()), device_address->GetSize()));

    auto tensor_address = std::dynamic_pointer_cast<device::DeviceAddress>(tensor->device_address());
    MS_EXCEPTION_IF_NULL(tensor_address);

    ar_input_address_list_.emplace_back(
      CreateDeviceAddress(origin_size, tensor_address->type_id(), tensor_address->format()));
    ar_output_address_list_.emplace_back(
      CreateDeviceAddress(origin_size, tensor_address->type_id(), tensor_address->format()));
  }

  total_size_ = total_size;

  AllocateContinousMemory(ar_input_address_list_, total_size, align_size_list_);
  AllocateContinousMemory(ar_output_address_list_, total_size, align_size_list_);

  // generate memecpy output addr
  if (origin_size_list.size() != ar_input_address_list_.size()) {
    MS_LOG(EXCEPTION) << "Invalid ar_input_address_list size:" << ar_input_address_list_.size()
                      << " origin_size_list size:" << origin_size_list.size();
  }
  size_t item_index = 0;
  for (const auto &ar_input_address_item : ar_input_address_list_) {
    MS_EXCEPTION_IF_NULL(ar_input_address_item);
    memcpy_output_addrs_.emplace_back(
      std::make_shared<kernel::Address>(ar_input_address_item->GetMutablePtr(), origin_size_list[item_index]));
    ++item_index;
  }
}

// Repoints every gradient tensor in the bucket to the AllReduce output address at the same index.
//
// Raises an exception when either the tensor list or the AllReduce output address list does not
// contain exactly bucket_size_ entries, or when a stored tensor is null.
void Bucket::UpdateTensorAddr() {
  const bool tensor_count_ok = (grad_tensor_list_.size() == bucket_size_);
  const bool output_count_ok = (ar_output_address_list_.size() == bucket_size_);
  if (!(tensor_count_ok && output_count_ok)) {
    MS_LOG(EXCEPTION) << "grad_tensor_list_ size:" << grad_tensor_list_.size()
                      << " ar_output_address_list_ size:" << ar_output_address_list_.size()
                      << " bucket size:" << bucket_size_;
  }

  // Both lists were verified to hold bucket_size_ entries, so indexing up to bucket_size_ is in range.
  for (size_t idx = 0; idx < bucket_size_; ++idx) {
    auto &tensor = grad_tensor_list_[idx];
    MS_EXCEPTION_IF_NULL(tensor);
    const auto &reduced_address = ar_output_address_list_[idx];
    tensor->set_device_address(reduced_address);
  }
}

// Resets the bucket to an empty, not-full state.
//
// Clears the stored gradient tensors and every per-tensor bookkeeping list, then calls
// FreeAllDeviceMem() and finally sets full_ back to false.
void Bucket::Release() {
  MS_LOG(INFO) << "Clear bucket:" << id_;
  // Drop the tensors and all per-tensor lists held by the bucket.
  grad_tensor_list_.clear();
  align_size_list_.clear();
  new_tensor_output_addrs_.clear();
  memcpy_input_addrs_.clear();
  memcpy_output_addrs_.clear();
  tensor_type_list_.clear();
  ar_input_address_list_.clear();
  ar_output_address_list_.clear();
  // NOTE(review): FreeAllDeviceMem() is defined outside this file section; it is invoked only after
  // the address lists above have been cleared - confirm that this ordering is required before changing it.
  FreeAllDeviceMem();
  // The bucket no longer holds bucket_size_ tensors.
  full_ = false;
}
}  // namespace mindspore::device