You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

collective_ops_impl.cc 10 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223
  1. /**
  2. * Copyright 2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "ps/server/collective_ops_impl.h"
  17. namespace mindspore {
  18. namespace ps {
  19. namespace server {
  20. void CollectiveOpsImpl::Initialize(const std::shared_ptr<core::ServerNode> &server_node) {
  21. MS_EXCEPTION_IF_NULL(server_node);
  22. server_node_ = server_node;
  23. local_rank_ = server_node_->rank_id();
  24. server_num_ = PSContext::instance()->initial_server_num();
  25. return;
  26. }
  27. template <typename T>
  28. bool CollectiveOpsImpl::RingAllReduce(const void *sendbuff, void *recvbuff, size_t count) {
  29. int ret = memcpy_s(recvbuff, count * sizeof(T), sendbuff, count * sizeof(T));
  30. if (ret != 0) {
  31. MS_LOG(ERROR) << "memcpy_s error, errorno(" << ret << ")";
  32. return false;
  33. }
  34. uint32_t rank_size = server_num_;
  35. uint32_t local_rank_ = server_node_->rank_id();
  36. size_t chunk_size = count / rank_size;
  37. size_t remainder_size = count % rank_size;
  38. std::vector<size_t> chunk_sizes(rank_size, chunk_size);
  39. // The rest of the data should be assigned to each chunk.
  40. for (size_t i = 0; i < remainder_size; i++) {
  41. chunk_sizes[i]++;
  42. }
  43. // Store offsets to get every data chunk's address.
  44. std::vector<size_t> chunk_offset;
  45. for (size_t i = 0; i < rank_size; i++) {
  46. size_t ofs =
  47. std::accumulate(chunk_sizes.begin(), chunk_sizes.begin() + i, static_cast<size_t>(0), std::plus<size_t>());
  48. chunk_offset.push_back(ofs);
  49. }
  50. T *output_buff = reinterpret_cast<T *>(recvbuff);
  51. uint32_t send_to_rank = (local_rank_ + 1) % rank_size;
  52. uint32_t recv_from_rank = (local_rank_ - 1 + rank_size) % rank_size;
  53. MS_LOG(DEBUG) << "AllReduce count:" << count << ", rank_size:" << rank_size << ", local_rank_:" << local_rank_
  54. << ", chunk_size:" << chunk_size << ", remainder_size:" << remainder_size
  55. << ", chunk_sizes:" << chunk_sizes << ", send_to_rank:" << send_to_rank
  56. << ", recv_from_rank:" << recv_from_rank;
  57. // Ring ReduceScatter.
  58. MS_LOG(DEBUG) << "Start Ring ReduceScatter.";
  59. std::unique_ptr<T[]> tmp_recv_chunk = std::make_unique<T[]>(chunk_sizes[0]);
  60. for (size_t i = 0; i < rank_size - 1; i++) {
  61. // Step 1: Async send data to next rank.
  62. size_t send_chunk_index = (local_rank_ - i + rank_size) % rank_size;
  63. T *send_chunk = output_buff + chunk_offset[send_chunk_index];
  64. auto send_req_id = server_node_->CollectiveSendAsync(core::NodeRole::SERVER, send_to_rank, send_chunk,
  65. chunk_sizes[send_chunk_index] * sizeof(T));
  66. // Step 2: Async receive data to next rank and wait until it's done.
  67. size_t recv_chunk_index = (local_rank_ - i - 1 + rank_size) % rank_size;
  68. T *recv_chunk = output_buff + chunk_offset[recv_chunk_index];
  69. MS_LOG(DEBUG) << "Ring ReduceScatter send_to_rank:" << send_to_rank << ", recv_from_rank:" << recv_from_rank
  70. << ", send count:" << chunk_sizes[send_chunk_index]
  71. << ", recv count:" << chunk_sizes[recv_chunk_index] << ", iteration:" << i;
  72. std::shared_ptr<std::vector<unsigned char>> recv_str;
  73. auto recv_req_id = server_node_->CollectiveReceiveAsync(core::NodeRole::SERVER, recv_from_rank, &recv_str);
  74. if (!server_node_->CollectiveWait(recv_req_id, 1)) {
  75. MS_LOG(ERROR) << "CollectiveWait " << recv_req_id << " failed.";
  76. return false;
  77. }
  78. memcpy_s(tmp_recv_chunk.get(), chunk_sizes[recv_chunk_index] * sizeof(T), recv_str->data(), recv_str->size());
  79. // Step 3: Reduce the data so we can overlap the time cost of send.
  80. for (size_t j = 0; j < chunk_sizes[recv_chunk_index]; j++) {
  81. recv_chunk[j] += tmp_recv_chunk[j];
  82. }
  83. // Step 4: Wait until send is done.
  84. if (!server_node_->Wait(send_req_id, 1)) {
  85. MS_LOG(ERROR) << "CollectiveWait " << send_req_id << " failed.";
  86. return false;
  87. }
  88. }
  89. MS_LOG(DEBUG) << "End Ring ReduceScatter.";
  90. // Ring AllGather.
  91. MS_LOG(DEBUG) << "Start Ring AllGather.";
  92. for (size_t i = 0; i < rank_size - 1; i++) {
  93. size_t send_chunk_index = (local_rank_ - i + 1 + rank_size) % rank_size;
  94. T *send_chunk = output_buff + chunk_offset[send_chunk_index];
  95. auto send_req_id = server_node_->CollectiveSendAsync(core::NodeRole::SERVER, send_to_rank, send_chunk,
  96. chunk_sizes[send_chunk_index] * sizeof(T));
  97. size_t recv_chunk_index = (local_rank_ - i + rank_size) % rank_size;
  98. T *recv_chunk = output_buff + chunk_offset[recv_chunk_index];
  99. MS_LOG(DEBUG) << "Ring AllGather send_to_rank:" << send_to_rank << ", recv_from_rank:" << recv_from_rank
  100. << ", send count:" << chunk_sizes[send_chunk_index]
  101. << ", recv count:" << chunk_sizes[recv_chunk_index] << ", iteration:" << i;
  102. std::shared_ptr<std::vector<unsigned char>> recv_str;
  103. auto recv_req_id = server_node_->CollectiveReceiveAsync(core::NodeRole::SERVER, recv_from_rank, &recv_str);
  104. if (!server_node_->CollectiveWait(recv_req_id, 1)) {
  105. MS_LOG(ERROR) << "CollectiveWait " << recv_req_id << " failed.";
  106. return false;
  107. }
  108. memcpy_s(recv_chunk, chunk_sizes[recv_chunk_index] * sizeof(T), recv_str->data(), recv_str->size());
  109. if (!server_node_->Wait(send_req_id, 1)) {
  110. MS_LOG(ERROR) << "CollectiveWait " << send_req_id << " failed.";
  111. return false;
  112. }
  113. }
  114. MS_LOG(DEBUG) << "End Ring AllGather.";
  115. return true;
  116. }
  117. template <typename T>
  118. bool CollectiveOpsImpl::ReduceBroadcastAllReduce(const void *sendbuff, void *recvbuff, size_t count) {
  119. uint32_t rank_size = server_num_;
  120. uint32_t local_rank_ = server_node_->rank_id();
  121. MS_LOG(DEBUG) << "Reduce Broadcast AllReduce rank_size:" << rank_size << ", local_rank_:" << local_rank_
  122. << ", count:" << count;
  123. int ret = memcpy_s(recvbuff, count * sizeof(T), sendbuff, count * sizeof(T));
  124. if (ret != 0) {
  125. MS_LOG(ERROR) << "memcpy_s error, errorno(" << ret << ")";
  126. return false;
  127. }
  128. T *output_buff = reinterpret_cast<T *>(recvbuff);
  129. // Reduce data to rank 0 process.
  130. MS_LOG(DEBUG) << "Start Reduce to rank 0 process.";
  131. if (local_rank_ == 0) {
  132. std::unique_ptr<T[]> tmp_recv_buff = std::make_unique<T[]>(count);
  133. for (uint32_t i = 1; i < rank_size; i++) {
  134. std::shared_ptr<std::vector<unsigned char>> recv_str;
  135. MS_LOG(DEBUG) << "Reduce rank 0 receive from rank " << i;
  136. auto recv_req_id = server_node_->CollectiveReceiveAsync(core::NodeRole::SERVER, i, &recv_str);
  137. if (!server_node_->CollectiveWait(recv_req_id, 1)) {
  138. MS_LOG(ERROR) << "CollectiveWait " << recv_req_id << " failed.";
  139. return false;
  140. }
  141. memcpy_s(tmp_recv_buff.get(), count * sizeof(T), recv_str->data(), recv_str->size());
  142. for (size_t j = 0; j < count; j++) {
  143. output_buff[j] += tmp_recv_buff[j];
  144. }
  145. }
  146. } else {
  147. MS_LOG(DEBUG) << "Reduce send data to rank 0 process.";
  148. auto send_req_id = server_node_->CollectiveSendAsync(core::NodeRole::SERVER, 0, sendbuff, count * sizeof(T));
  149. if (!server_node_->Wait(send_req_id, 1)) {
  150. MS_LOG(ERROR) << "CollectiveWait " << send_req_id << " failed.";
  151. return false;
  152. }
  153. }
  154. MS_LOG(DEBUG) << "End Reduce.";
  155. // Broadcast data to not 0 rank process.
  156. MS_LOG(DEBUG) << "Start broadcast from rank 0 to other processes.";
  157. if (local_rank_ == 0) {
  158. for (uint32_t i = 1; i < rank_size; i++) {
  159. MS_LOG(DEBUG) << "Broadcast data to process " << i;
  160. auto send_req_id = server_node_->CollectiveSendAsync(core::NodeRole::SERVER, i, output_buff, count * sizeof(T));
  161. if (!server_node_->Wait(send_req_id, 1)) {
  162. MS_LOG(ERROR) << "CollectiveWait " << send_req_id << " failed.";
  163. return false;
  164. }
  165. }
  166. } else {
  167. MS_LOG(DEBUG) << "Broadcast receive from rank 0.";
  168. std::shared_ptr<std::vector<unsigned char>> recv_str;
  169. auto recv_req_id = server_node_->CollectiveReceiveAsync(core::NodeRole::SERVER, 0, &recv_str);
  170. if (!server_node_->CollectiveWait(recv_req_id, 1)) {
  171. MS_LOG(ERROR) << "CollectiveWait " << recv_req_id << " failed.";
  172. return false;
  173. }
  174. memcpy_s(output_buff, count * sizeof(T), recv_str->data(), recv_str->size());
  175. }
  176. MS_LOG(DEBUG) << "End broadcast.";
  177. return true;
  178. }
  179. template <typename T>
  180. bool CollectiveOpsImpl::AllReduce(const void *sendbuff, void *recvbuff, size_t count) {
  181. // The collective communication API does not support calling Send and Recv concurrently with multiple threads;
  182. std::unique_lock<std::mutex> lock(mtx_);
  183. if (sendbuff == nullptr || recvbuff == nullptr) {
  184. MS_LOG(ERROR) << "AllReduce sendbuff or recvbuff is nullptr.";
  185. return false;
  186. }
  187. uint32_t rank_size = server_num_;
  188. if (count >= rank_size) {
  189. return RingAllReduce<T>(sendbuff, recvbuff, count);
  190. } else {
  191. return ReduceBroadcastAllReduce<T>(sendbuff, recvbuff, count);
  192. }
  193. }
// Explicit instantiations: the template definitions live in this .cc file, so
// every element type supported by the collective ops must be instantiated here.
template bool CollectiveOpsImpl::RingAllReduce<float>(const void *sendbuff, void *recvbuff, size_t count);
template bool CollectiveOpsImpl::RingAllReduce<size_t>(const void *sendbuff, void *recvbuff, size_t count);
template bool CollectiveOpsImpl::RingAllReduce<int>(const void *sendbuff, void *recvbuff, size_t count);
template bool CollectiveOpsImpl::ReduceBroadcastAllReduce<float>(const void *sendbuff, void *recvbuff, size_t count);
template bool CollectiveOpsImpl::ReduceBroadcastAllReduce<size_t>(const void *sendbuff, void *recvbuff, size_t count);
template bool CollectiveOpsImpl::ReduceBroadcastAllReduce<int>(const void *sendbuff, void *recvbuff, size_t count);
template bool CollectiveOpsImpl::AllReduce<float>(const void *sendbuff, void *recvbuff, size_t count);
template bool CollectiveOpsImpl::AllReduce<size_t>(const void *sendbuff, void *recvbuff, size_t count);
template bool CollectiveOpsImpl::AllReduce<int>(const void *sendbuff, void *recvbuff, size_t count);
  203. } // namespace server
  204. } // namespace ps
  205. } // namespace mindspore