Add NeighborExchange support for GPU

4 years ago · 8035bafd7c
--- a/mindspore/ccsrc/backend/kernel_compiler/CMakeLists.txt
+++ b/mindspore/ccsrc/backend/kernel_compiler/CMakeLists.txt
@@ -93,6 +93,7 @@ if(ENABLE_GPU)

    file(GLOB_RECURSE GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "gpu/*.cc")
    list(REMOVE_ITEM GPU_SRC_LIST "gpu/nccl/nccl_collective_gpu_kernel.cc")
    list(REMOVE_ITEM GPU_SRC_LIST "gpu/nccl/nccl_p2p_gpu_kernel.cc")
    list(REMOVE_ITEM GPU_SRC_LIST "gpu/nccl/nccl_send_gpu_kernel.cc")
    list(REMOVE_ITEM GPU_SRC_LIST "gpu/nccl/nccl_recv_gpu_kernel.cc")
    list(REMOVE_ITEM GPU_SRC_LIST "gpu/trt/trt_kernel.cc")
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_collective_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_collective_gpu_kernel.cc
@@ -58,15 +58,5 @@ MS_REG_GPU_KERNEL_ONE(Broadcast,
                      KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
                      NcclCollectiveGpuKernel, int)

 MS_REG_GPU_KERNEL_ONE(
  AllToAllv, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  NcclCollectiveGpuKernel, float)
 MS_REG_GPU_KERNEL_ONE(
  AllToAllv, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  NcclCollectiveGpuKernel, half)
 MS_REG_GPU_KERNEL_ONE(AllToAllv,
                      KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
                      NcclCollectiveGpuKernel, int)

 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_collective_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_collective_gpu_kernel.h
@@ -31,14 +31,12 @@ enum NcclKernelType {
  NCCL_ALL_GATHER,
  NCCL_REDUCE_SCATTER,
  NCCL_BROADCAST,
  NCCL_ALLTOALLV,
  NCCL_INVALID_TYPE = 255
 };
 const std::map<std::string, NcclKernelType> kNcclTypeMap = {{"AllReduce", NCCL_ALL_REDUCE},
                                                            {"AllGather", NCCL_ALL_GATHER},
                                                            {"ReduceScatter", NCCL_REDUCE_SCATTER},
                                                            {"Broadcast", NCCL_BROADCAST},
                                                            {"AllToAllv", NCCL_ALLTOALLV}};
                                                            {"Broadcast", NCCL_BROADCAST}};

 template <typename T>
 class NcclCollectiveGpuKernel : public NcclGpuKernel {
@@ -72,10 +70,6 @@ class NcclCollectiveGpuKernel : public NcclGpuKernel {
        LaunchBroadcast(inputs, outputs, stream_ptr);
        break;
      }
      case NCCL_ALLTOALLV: {
        LaunchAllToAllv(inputs, outputs, stream_ptr);
        break;
      }
      default: {
        MS_LOG(EXCEPTION) << "Kernel type " << nccl_kernel_type_ << " is not supported.";
      }
@@ -214,37 +208,6 @@ class NcclCollectiveGpuKernel : public NcclGpuKernel {
    }
  }

  void LaunchAllToAllv(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs,
                       void *stream_ptr) {
    T *input_addr = nullptr;
    T *output_addr = nullptr;
    cudaStream_t stream = comm_stream_ ? comm_stream_ : reinterpret_cast<cudaStream_t>(stream_ptr);
    auto nccl_recv_func = reinterpret_cast<Recv>(dlsym(const_cast<void *>(collective_handle_), "Recv"));
    auto nccl_send_func = reinterpret_cast<Send>(dlsym(const_cast<void *>(collective_handle_), "Send"));
    auto nccl_gstart_func = reinterpret_cast<GroupStart>(dlsym(const_cast<void *>(collective_handle_), "GroupStart"));
    auto nccl_gend_func = reinterpret_cast<GroupEnd>(dlsym(const_cast<void *>(collective_handle_), "GroupEnd"));
    MS_EXCEPTION_IF_NULL(nccl_recv_func);
    MS_EXCEPTION_IF_NULL(nccl_send_func);
    MS_EXCEPTION_IF_NULL(nccl_gstart_func);
    MS_EXCEPTION_IF_NULL(nccl_gend_func);

    // This implementation refers to NVIDIA NCCL 2.11 doc.
    CHECK_NCCL_RET_WITH_EXCEPT(kernel_node_, (*nccl_gstart_func)(), "AllToAllv: ncclGroupStart failed");
    for (int i = 0; i < SizeToInt(input_size_list_.size()); ++i) {
      input_addr = GetDeviceAddress<T>(inputs, i);
      output_addr = GetDeviceAddress<T>(outputs, i);
      CHECK_NCCL_RET_WITH_EXCEPT(
        kernel_node_,
        (*nccl_send_func)(input_addr, input_size_list_[i] / sizeof(T), nccl_data_type_, i, stream, group_name_),
        "AllToAllv: ncclSend failed");
      CHECK_NCCL_RET_WITH_EXCEPT(
        kernel_node_,
        (*nccl_recv_func)(output_addr, output_size_list_[i] / sizeof(T), nccl_data_type_, i, stream, group_name_),
        "AllToAllv: ncclRecv failed");
    }
    CHECK_NCCL_RET_WITH_EXCEPT(kernel_node_, (*nccl_gend_func)(), "AllToAllv: ncclGroupEnd failed");
  }

  void InferCommType(const CNodePtr &kernel_node) {
    std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    auto iter = kNcclTypeMap.find(kernel_name);
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_p2p_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_p2p_gpu_kernel.cc
@@ -0,0 +1,57 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "backend/kernel_compiler/gpu/nccl/nccl_p2p_gpu_kernel.h"

 namespace mindspore {
 namespace kernel {
 MS_REG_GPU_KERNEL_TWO(
  AllToAllv, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  NcclP2PGpuKernel, float, float)
 MS_REG_GPU_KERNEL_TWO(
  AllToAllv, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  NcclP2PGpuKernel, half, half)
 MS_REG_GPU_KERNEL_TWO(AllToAllv,
                      KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
                      NcclP2PGpuKernel, int, int)
 MS_REG_GPU_KERNEL_TWO(
  AllToAllv, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat16),
  NcclP2PGpuKernel, float, half)
 MS_REG_GPU_KERNEL_TWO(
  AllToAllv, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat32),
  NcclP2PGpuKernel, half, float)

 MS_REG_GPU_KERNEL_TWO(
  NeighborExchange,
  KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  NcclP2PGpuKernel, float, float)
 MS_REG_GPU_KERNEL_TWO(
  NeighborExchange,
  KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  NcclP2PGpuKernel, half, half)
 MS_REG_GPU_KERNEL_TWO(NeighborExchange,
                      KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
                      NcclP2PGpuKernel, int, int)
 MS_REG_GPU_KERNEL_TWO(
  NeighborExchange,
  KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat16),
  NcclP2PGpuKernel, float, half)
 MS_REG_GPU_KERNEL_TWO(
  NeighborExchange,
  KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat32),
  NcclP2PGpuKernel, half, float)
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_p2p_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_p2p_gpu_kernel.h
@@ -0,0 +1,224 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NCCL_P2P_GPU_KERNEL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NCCL_P2P_GPU_KERNEL_H_

 #include <dlfcn.h>
 #include <stdint.h>
 #include <vector>
 #include <string>
 #include <map>
 #include "backend/kernel_compiler/gpu/nccl/nccl_gpu_kernel.h"

 namespace mindspore {
 namespace kernel {
 enum NcclKernelType { NCCL_ALLTOALLV = 0, NCCL_NEIGHBOREXCHANGE = 1, NCCL_INVALID_TYPE = 255 };
 const std::map<std::string, NcclKernelType> kNcclTypeMap = {{"AllToAllv", NCCL_ALLTOALLV},
                                                            {"NeighborExchange", NCCL_NEIGHBOREXCHANGE}};

 template <typename T, typename I>
 class NcclP2PGpuKernel : public NcclGpuKernel {
 public:
  NcclP2PGpuKernel() { ResetResource(); }
  ~NcclP2PGpuKernel() override = default;

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    switch (nccl_kernel_type_) {
      case NCCL_ALLTOALLV: {
        LaunchAllToAllv(inputs, outputs, stream_ptr);
        break;
      }
      case NCCL_NEIGHBOREXCHANGE: {
        LaunchAllToAllv(inputs, outputs, stream_ptr);
        break;
      }
      default: {
        MS_LOG(EXCEPTION) << "Kernel type " << nccl_kernel_type_ << " is not supported.";
      }
    }
    return true;
  }

  bool Init(const CNodePtr &kernel_node) override {
    MS_EXCEPTION_IF_NULL(kernel_node);
    kernel_node_ = kernel_node;
    InferCommType(kernel_node);

    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (input_num > 0) {
      input_nccl_data_type_ = nccl_dtype(AnfAlgo::GetInputDeviceDataType(kernel_node, 0));
    }
    if (output_num > 0) {
      output_nccl_data_type_ = nccl_dtype(AnfAlgo::GetOutputDeviceDataType(kernel_node, 0));
    }
    for (size_t i = 0; i < input_num; ++i) {
      auto shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, i);
      is_null_input_ = CHECK_NULL_INPUT(shape);
      if (is_null_input_) {
        MS_LOG(WARNING) << "For 'NcclP2PGpuKernel', input shape is null ";
        InitSizeLists();
        return true;
      }
      size_t size = sizeof(T);
      for (size_t j = 0; j < shape.size(); j++) {
        size *= IntToSize(shape[j]);
      }
      input_size_list_.push_back(size);
      input_size_ += size;
    }
    for (size_t i = 0; i < output_num; ++i) {
      auto shape = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, i);
      is_null_input_ = CHECK_NULL_INPUT(shape);
      if (is_null_input_) {
        MS_LOG(WARNING) << "For 'NcclP2PGpuKernel', output shape is null";
        InitSizeLists();
        return true;
      }
      size_t size = sizeof(I);
      for (size_t j = 0; j < shape.size(); j++) {
        size *= IntToSize(shape[j]);
      }
      output_size_list_.push_back(size);
      output_size_ += size;
    }

    group_name_ = GetAttr<std::string>(kernel_node, kAttrGroup);
    MS_LOG(INFO) << AnfAlgo::GetCNodeName(kernel_node) << " for group " << group_name_;
    auto prim = AnfAlgo::GetCNodePrimitive(kernel_node);
    MS_EXCEPTION_IF_NULL(prim);
    auto comm_stream_attr = prim->GetAttr("stream_id");
    if (comm_stream_attr) {
      comm_stream_ = reinterpret_cast<cudaStream_t>(GetValue<uintptr_t>(comm_stream_attr));
      MS_EXCEPTION_IF_NULL(comm_stream_);
    }

    // Used by AlltoAllv
    auto send_rank_ids_attr = prim->GetAttr(kAttrSendRankIds);
    auto recv_rank_ids_attr = prim->GetAttr(kAttrRecvRankIds);
    if (send_rank_ids_attr && recv_rank_ids_attr) {
      send_rank_ids = GetValue<std::vector<int64_t>>(send_rank_ids_attr);
      recv_rank_ids = GetValue<std::vector<int64_t>>(recv_rank_ids_attr);
    }

    collective_handle_ = device::gpu::CollectiveInitializer::instance().collective_handle();
    MS_EXCEPTION_IF_NULL(collective_handle_);
    return true;
  }

  void ResetResource() noexcept override {
    nccl_kernel_type_ = NCCL_INVALID_TYPE;
    input_size_ = 0;
    output_size_ = 0;
    root_ = 0;
    is_null_input_ = false;
    collective_handle_ = nullptr;
    comm_stream_ = nullptr;
    input_size_list_.clear();
    output_size_list_.clear();
    workspace_size_list_.clear();
  }

 protected:
  void InitSizeLists() override { return; }

 private:
  void LaunchAllToAllv(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs,
                       void *stream_ptr) {
    T *input_addr = nullptr;
    I *output_addr = nullptr;
    cudaStream_t stream = comm_stream_ ? comm_stream_ : reinterpret_cast<cudaStream_t>(stream_ptr);

    // send_rank_id and recv rank_id size needs to be equal to input_list size
    if (send_rank_ids.size() != input_size_list_.size()) {
      MS_LOG(ERROR) << "Trying to use AlltoAllv, but send_rank_ids vector size not equals to input_list size.";
    }
    if (recv_rank_ids.size() != output_size_list_.size()) {
      MS_LOG(ERROR) << "Trying to use AlltoAllv, but recv_rank_ids vector size not equals to output_list size.";
    }

    auto nccl_recv_func = reinterpret_cast<Recv>(dlsym(const_cast<void *>(collective_handle_), "Recv"));
    auto nccl_send_func = reinterpret_cast<Send>(dlsym(const_cast<void *>(collective_handle_), "Send"));
    auto nccl_gstart_func = reinterpret_cast<GroupStart>(dlsym(const_cast<void *>(collective_handle_), "GroupStart"));
    auto nccl_gend_func = reinterpret_cast<GroupEnd>(dlsym(const_cast<void *>(collective_handle_), "GroupEnd"));
    MS_EXCEPTION_IF_NULL(nccl_recv_func);
    MS_EXCEPTION_IF_NULL(nccl_send_func);
    MS_EXCEPTION_IF_NULL(nccl_gstart_func);
    MS_EXCEPTION_IF_NULL(nccl_gend_func);

    // This implementation refers to NVIDIA NCCL 2.11 doc.
    CHECK_NCCL_RET_WITH_EXCEPT(kernel_node_, (*nccl_gstart_func)(), "AllToAllv: ncclGroupStart failed");
    for (int i = 0; i < SizeToInt(input_size_list_.size()); ++i) {
      input_addr = GetDeviceAddress<T>(inputs, i);
      CHECK_NCCL_RET_WITH_EXCEPT(kernel_node_,
                                 (*nccl_send_func)(input_addr, input_size_list_[i] / sizeof(T), input_nccl_data_type_,
                                                   send_rank_ids[i], stream, group_name_),
                                 "AllToAllv: ncclSend failed");
    }
    for (int i = 0; i < SizeToInt(output_size_list_.size()); ++i) {
      output_addr = GetDeviceAddress<I>(outputs, i);
      CHECK_NCCL_RET_WITH_EXCEPT(kernel_node_,
                                 (*nccl_recv_func)(output_addr, output_size_list_[i] / sizeof(I),
                                                   output_nccl_data_type_, recv_rank_ids[i], stream, group_name_),
                                 "AllToAllv: ncclRecv failed");
    }
    CHECK_NCCL_RET_WITH_EXCEPT(kernel_node_, (*nccl_gend_func)(), "AllToAllv: ncclGroupEnd failed");
  }

  void InferCommType(const CNodePtr &kernel_node) {
    std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    auto iter = kNcclTypeMap.find(kernel_name);
    if (iter == kNcclTypeMap.end()) {
      MS_LOG(EXCEPTION) << "Kernel " << kernel_name << " is not supported.";
    } else {
      nccl_kernel_type_ = iter->second;
    }

    auto prim = AnfAlgo::GetCNodePrimitive(kernel_node);
    MS_EXCEPTION_IF_NULL(prim);

    auto root_rank = prim->GetAttr(kAttrRootRank);
    if (root_rank) {
      root_ = static_cast<int>(GetValue<int64_t>(root_rank));
    }
    return;
  }

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  NcclKernelType nccl_kernel_type_;
  size_t input_size_;
  size_t output_size_;
  int root_;
  bool is_null_input_;
  const void *collective_handle_;
  cudaStream_t comm_stream_;
  ncclDataType_t output_nccl_data_type_;
  ncclDataType_t input_nccl_data_type_;
  std::vector<int64_t> send_rank_ids;
  std::vector<int64_t> recv_rank_ids;
 };
 }  // namespace kernel
 }  // namespace mindspore

 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NCCL_P2P_GPU_KERNEL_H_
--- a/mindspore/ccsrc/frontend/parallel/ops_info/conv2d_info.cc
+++ b/mindspore/ccsrc/frontend/parallel/ops_info/conv2d_info.cc
@@ -658,7 +658,8 @@ OperatorAttrs Conv2DInfo::CreateNeighborExchangeAttrs(const CNodePtr &cnode) {
  Attr send_shapes = {SEND_SHAPES, MakeTupleListValue(send_shapes_)};
  Attr recv_shapes = {RECV_SHAPES, MakeTupleListValue(recv_shapes_)};
  Attr recv_type = {RECV_TYPE, dtype};
  OperatorAttrs attrs = {send_ranks, recv_ranks, recv_shapes, send_shapes, recv_type};
  Attr group = {GROUP, MakeValue(g_device_manager->world_group())};
  OperatorAttrs attrs = {send_ranks, recv_ranks, recv_shapes, send_shapes, recv_type, group};
  return attrs;
 }

--- a/tests/st/nccl/test_nccl_all.py
+++ b/tests/st/nccl/test_nccl_all.py
@@ -67,4 +67,16 @@ def test_nccl_send_recv_op():
 def test_nccl_all_to_all_op():
    return_code = os.system("mpirun -n 8 pytest -s test_nccl_all_to_all_op.py")
    assert return_code == 0
    

@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_single
 def test_nccl_neighbor_exchange_op():
    """
    Feature: NeighborExchange GPU operator
    Description: see details in test_nccl_neighbor_exchange_op.py
    Expectation: success, return_code==0
    """
    return_code = os.system(
        "mpirun -n 8 pytest -s test_nccl_neighbor_exchange_op.py")
    assert return_code == 0
--- a/tests/st/nccl/test_nccl_neighbor_exchange_op.py
+++ b/tests/st/nccl/test_nccl_neighbor_exchange_op.py
@@ -0,0 +1,65 @@
 # Copyright 2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
 import numpy as np

 import mindspore.context as context
 import mindspore.nn as nn
 from mindspore import Tensor
 from mindspore.communication.management import init, get_rank, get_group_size
 from mindspore.ops import operations as P
 import mindspore as ms

 context.set_context(mode=context.GRAPH_MODE, device_target='GPU')

 init()
 rank = get_rank()
 size = get_group_size()

 x = np.asarray([1, 1, 1, 1, 1, 1, 1, 1]).astype(np.float32) * rank


 class Net(nn.Cell):
    def __init__(self):
        super(Net, self).__init__()
        self.neighborexchange = P.comm_ops.NeighborExchange(
            send_rank_ids=[(rank - 1) % 8],
            recv_rank_ids=[(rank + 1) % 8],
            recv_shapes=tuple([[8]]),
            send_shapes=tuple([[8]]),
            recv_type=ms.float32,
            group="nccl_world_group")

    def construct(self, inputs):
        return self.neighborexchange(inputs)


 def test_neighborexchange():
    """
    Feature: NeighborExchange operator on GPU
    Description: for each device, send to previous rank and receive from next rank.
                example: rank 0 send to rank 7 and receive from rank 1.
    Expectation: on rank i, result == [1 ,1 ,1, 1, 1, 1, 1, 1] * ((i + 1) % 8)
    """
    neighborexchange = Net()
    expect0 = np.asarray([1, 1, 1, 1, 1, 1, 1, 1]).astype(
        np.float32) * ((rank + 1) % 8)
    inputs = []
    inputs.append(Tensor(x))
    inputs = tuple(inputs)
    output0 = neighborexchange(inputs)[0].asnumpy()
    diff0 = output0 - expect0
    error0 = np.ones(shape=expect0.shape) * 1.0e-5
    assert np.all(diff0 < error0)
    assert output0.shape == expect0.shape