| @@ -93,6 +93,7 @@ if(ENABLE_GPU) | |||
| file(GLOB_RECURSE GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "gpu/*.cc") | |||
| list(REMOVE_ITEM GPU_SRC_LIST "gpu/nccl/nccl_collective_gpu_kernel.cc") | |||
| list(REMOVE_ITEM GPU_SRC_LIST "gpu/nccl/nccl_p2p_gpu_kernel.cc") | |||
| list(REMOVE_ITEM GPU_SRC_LIST "gpu/nccl/nccl_send_gpu_kernel.cc") | |||
| list(REMOVE_ITEM GPU_SRC_LIST "gpu/nccl/nccl_recv_gpu_kernel.cc") | |||
| list(REMOVE_ITEM GPU_SRC_LIST "gpu/trt/trt_kernel.cc") | |||
| @@ -58,15 +58,5 @@ MS_REG_GPU_KERNEL_ONE(Broadcast, | |||
| KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), | |||
| NcclCollectiveGpuKernel, int) | |||
| MS_REG_GPU_KERNEL_ONE( | |||
| AllToAllv, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| NcclCollectiveGpuKernel, float) | |||
| MS_REG_GPU_KERNEL_ONE( | |||
| AllToAllv, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), | |||
| NcclCollectiveGpuKernel, half) | |||
| MS_REG_GPU_KERNEL_ONE(AllToAllv, | |||
| KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), | |||
| NcclCollectiveGpuKernel, int) | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -31,14 +31,12 @@ enum NcclKernelType { | |||
| NCCL_ALL_GATHER, | |||
| NCCL_REDUCE_SCATTER, | |||
| NCCL_BROADCAST, | |||
| NCCL_ALLTOALLV, | |||
| NCCL_INVALID_TYPE = 255 | |||
| }; | |||
| const std::map<std::string, NcclKernelType> kNcclTypeMap = {{"AllReduce", NCCL_ALL_REDUCE}, | |||
| {"AllGather", NCCL_ALL_GATHER}, | |||
| {"ReduceScatter", NCCL_REDUCE_SCATTER}, | |||
| {"Broadcast", NCCL_BROADCAST}, | |||
| {"AllToAllv", NCCL_ALLTOALLV}}; | |||
| {"Broadcast", NCCL_BROADCAST}}; | |||
| template <typename T> | |||
| class NcclCollectiveGpuKernel : public NcclGpuKernel { | |||
| @@ -72,10 +70,6 @@ class NcclCollectiveGpuKernel : public NcclGpuKernel { | |||
| LaunchBroadcast(inputs, outputs, stream_ptr); | |||
| break; | |||
| } | |||
| case NCCL_ALLTOALLV: { | |||
| LaunchAllToAllv(inputs, outputs, stream_ptr); | |||
| break; | |||
| } | |||
| default: { | |||
| MS_LOG(EXCEPTION) << "Kernel type " << nccl_kernel_type_ << " is not supported."; | |||
| } | |||
| @@ -214,37 +208,6 @@ class NcclCollectiveGpuKernel : public NcclGpuKernel { | |||
| } | |||
| } | |||
| void LaunchAllToAllv(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs, | |||
| void *stream_ptr) { | |||
| T *input_addr = nullptr; | |||
| T *output_addr = nullptr; | |||
| cudaStream_t stream = comm_stream_ ? comm_stream_ : reinterpret_cast<cudaStream_t>(stream_ptr); | |||
| auto nccl_recv_func = reinterpret_cast<Recv>(dlsym(const_cast<void *>(collective_handle_), "Recv")); | |||
| auto nccl_send_func = reinterpret_cast<Send>(dlsym(const_cast<void *>(collective_handle_), "Send")); | |||
| auto nccl_gstart_func = reinterpret_cast<GroupStart>(dlsym(const_cast<void *>(collective_handle_), "GroupStart")); | |||
| auto nccl_gend_func = reinterpret_cast<GroupEnd>(dlsym(const_cast<void *>(collective_handle_), "GroupEnd")); | |||
| MS_EXCEPTION_IF_NULL(nccl_recv_func); | |||
| MS_EXCEPTION_IF_NULL(nccl_send_func); | |||
| MS_EXCEPTION_IF_NULL(nccl_gstart_func); | |||
| MS_EXCEPTION_IF_NULL(nccl_gend_func); | |||
| // This implementation refers to NVIDIA NCCL 2.11 doc. | |||
| CHECK_NCCL_RET_WITH_EXCEPT(kernel_node_, (*nccl_gstart_func)(), "AllToAllv: ncclGroupStart failed"); | |||
| for (int i = 0; i < SizeToInt(input_size_list_.size()); ++i) { | |||
| input_addr = GetDeviceAddress<T>(inputs, i); | |||
| output_addr = GetDeviceAddress<T>(outputs, i); | |||
| CHECK_NCCL_RET_WITH_EXCEPT( | |||
| kernel_node_, | |||
| (*nccl_send_func)(input_addr, input_size_list_[i] / sizeof(T), nccl_data_type_, i, stream, group_name_), | |||
| "AllToAllv: ncclSend failed"); | |||
| CHECK_NCCL_RET_WITH_EXCEPT( | |||
| kernel_node_, | |||
| (*nccl_recv_func)(output_addr, output_size_list_[i] / sizeof(T), nccl_data_type_, i, stream, group_name_), | |||
| "AllToAllv: ncclRecv failed"); | |||
| } | |||
| CHECK_NCCL_RET_WITH_EXCEPT(kernel_node_, (*nccl_gend_func)(), "AllToAllv: ncclGroupEnd failed"); | |||
| } | |||
| void InferCommType(const CNodePtr &kernel_node) { | |||
| std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node); | |||
| auto iter = kNcclTypeMap.find(kernel_name); | |||
| @@ -0,0 +1,57 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/gpu/nccl/nccl_p2p_gpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| MS_REG_GPU_KERNEL_TWO( | |||
| AllToAllv, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| NcclP2PGpuKernel, float, float) | |||
| MS_REG_GPU_KERNEL_TWO( | |||
| AllToAllv, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), | |||
| NcclP2PGpuKernel, half, half) | |||
| MS_REG_GPU_KERNEL_TWO(AllToAllv, | |||
| KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), | |||
| NcclP2PGpuKernel, int, int) | |||
| MS_REG_GPU_KERNEL_TWO( | |||
| AllToAllv, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat16), | |||
| NcclP2PGpuKernel, float, half) | |||
| MS_REG_GPU_KERNEL_TWO( | |||
| AllToAllv, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat32), | |||
| NcclP2PGpuKernel, half, float) | |||
| MS_REG_GPU_KERNEL_TWO( | |||
| NeighborExchange, | |||
| KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| NcclP2PGpuKernel, float, float) | |||
| MS_REG_GPU_KERNEL_TWO( | |||
| NeighborExchange, | |||
| KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), | |||
| NcclP2PGpuKernel, half, half) | |||
| MS_REG_GPU_KERNEL_TWO(NeighborExchange, | |||
| KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), | |||
| NcclP2PGpuKernel, int, int) | |||
| MS_REG_GPU_KERNEL_TWO( | |||
| NeighborExchange, | |||
| KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat16), | |||
| NcclP2PGpuKernel, float, half) | |||
| MS_REG_GPU_KERNEL_TWO( | |||
| NeighborExchange, | |||
| KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat32), | |||
| NcclP2PGpuKernel, half, float) | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,224 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NCCL_P2P_GPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NCCL_P2P_GPU_KERNEL_H_ | |||
| #include <dlfcn.h> | |||
| #include <stdint.h> | |||
| #include <vector> | |||
| #include <string> | |||
| #include <map> | |||
| #include "backend/kernel_compiler/gpu/nccl/nccl_gpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| enum NcclKernelType { NCCL_ALLTOALLV = 0, NCCL_NEIGHBOREXCHANGE = 1, NCCL_INVALID_TYPE = 255 }; | |||
| const std::map<std::string, NcclKernelType> kNcclTypeMap = {{"AllToAllv", NCCL_ALLTOALLV}, | |||
| {"NeighborExchange", NCCL_NEIGHBOREXCHANGE}}; | |||
| template <typename T, typename I> | |||
| class NcclP2PGpuKernel : public NcclGpuKernel { | |||
| public: | |||
| NcclP2PGpuKernel() { ResetResource(); } | |||
| ~NcclP2PGpuKernel() override = default; | |||
| const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } | |||
| const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; } | |||
| const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; } | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| switch (nccl_kernel_type_) { | |||
| case NCCL_ALLTOALLV: { | |||
| LaunchAllToAllv(inputs, outputs, stream_ptr); | |||
| break; | |||
| } | |||
| case NCCL_NEIGHBOREXCHANGE: { | |||
| LaunchAllToAllv(inputs, outputs, stream_ptr); | |||
| break; | |||
| } | |||
| default: { | |||
| MS_LOG(EXCEPTION) << "Kernel type " << nccl_kernel_type_ << " is not supported."; | |||
| } | |||
| } | |||
| return true; | |||
| } | |||
| bool Init(const CNodePtr &kernel_node) override { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_node_ = kernel_node; | |||
| InferCommType(kernel_node); | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (input_num > 0) { | |||
| input_nccl_data_type_ = nccl_dtype(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)); | |||
| } | |||
| if (output_num > 0) { | |||
| output_nccl_data_type_ = nccl_dtype(AnfAlgo::GetOutputDeviceDataType(kernel_node, 0)); | |||
| } | |||
| for (size_t i = 0; i < input_num; ++i) { | |||
| auto shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, i); | |||
| is_null_input_ = CHECK_NULL_INPUT(shape); | |||
| if (is_null_input_) { | |||
| MS_LOG(WARNING) << "For 'NcclP2PGpuKernel', input shape is null "; | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| size_t size = sizeof(T); | |||
| for (size_t j = 0; j < shape.size(); j++) { | |||
| size *= IntToSize(shape[j]); | |||
| } | |||
| input_size_list_.push_back(size); | |||
| input_size_ += size; | |||
| } | |||
| for (size_t i = 0; i < output_num; ++i) { | |||
| auto shape = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, i); | |||
| is_null_input_ = CHECK_NULL_INPUT(shape); | |||
| if (is_null_input_) { | |||
| MS_LOG(WARNING) << "For 'NcclP2PGpuKernel', output shape is null"; | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| size_t size = sizeof(I); | |||
| for (size_t j = 0; j < shape.size(); j++) { | |||
| size *= IntToSize(shape[j]); | |||
| } | |||
| output_size_list_.push_back(size); | |||
| output_size_ += size; | |||
| } | |||
| group_name_ = GetAttr<std::string>(kernel_node, kAttrGroup); | |||
| MS_LOG(INFO) << AnfAlgo::GetCNodeName(kernel_node) << " for group " << group_name_; | |||
| auto prim = AnfAlgo::GetCNodePrimitive(kernel_node); | |||
| MS_EXCEPTION_IF_NULL(prim); | |||
| auto comm_stream_attr = prim->GetAttr("stream_id"); | |||
| if (comm_stream_attr) { | |||
| comm_stream_ = reinterpret_cast<cudaStream_t>(GetValue<uintptr_t>(comm_stream_attr)); | |||
| MS_EXCEPTION_IF_NULL(comm_stream_); | |||
| } | |||
| // Used by AlltoAllv | |||
| auto send_rank_ids_attr = prim->GetAttr(kAttrSendRankIds); | |||
| auto recv_rank_ids_attr = prim->GetAttr(kAttrRecvRankIds); | |||
| if (send_rank_ids_attr && recv_rank_ids_attr) { | |||
| send_rank_ids = GetValue<std::vector<int64_t>>(send_rank_ids_attr); | |||
| recv_rank_ids = GetValue<std::vector<int64_t>>(recv_rank_ids_attr); | |||
| } | |||
| collective_handle_ = device::gpu::CollectiveInitializer::instance().collective_handle(); | |||
| MS_EXCEPTION_IF_NULL(collective_handle_); | |||
| return true; | |||
| } | |||
| void ResetResource() noexcept override { | |||
| nccl_kernel_type_ = NCCL_INVALID_TYPE; | |||
| input_size_ = 0; | |||
| output_size_ = 0; | |||
| root_ = 0; | |||
| is_null_input_ = false; | |||
| collective_handle_ = nullptr; | |||
| comm_stream_ = nullptr; | |||
| input_size_list_.clear(); | |||
| output_size_list_.clear(); | |||
| workspace_size_list_.clear(); | |||
| } | |||
| protected: | |||
| void InitSizeLists() override { return; } | |||
| private: | |||
| void LaunchAllToAllv(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs, | |||
| void *stream_ptr) { | |||
| T *input_addr = nullptr; | |||
| I *output_addr = nullptr; | |||
| cudaStream_t stream = comm_stream_ ? comm_stream_ : reinterpret_cast<cudaStream_t>(stream_ptr); | |||
| // send_rank_id and recv rank_id size needs to be equal to input_list size | |||
| if (send_rank_ids.size() != input_size_list_.size()) { | |||
| MS_LOG(ERROR) << "Trying to use AlltoAllv, but send_rank_ids vector size not equals to input_list size."; | |||
| } | |||
| if (recv_rank_ids.size() != output_size_list_.size()) { | |||
| MS_LOG(ERROR) << "Trying to use AlltoAllv, but recv_rank_ids vector size not equals to output_list size."; | |||
| } | |||
| auto nccl_recv_func = reinterpret_cast<Recv>(dlsym(const_cast<void *>(collective_handle_), "Recv")); | |||
| auto nccl_send_func = reinterpret_cast<Send>(dlsym(const_cast<void *>(collective_handle_), "Send")); | |||
| auto nccl_gstart_func = reinterpret_cast<GroupStart>(dlsym(const_cast<void *>(collective_handle_), "GroupStart")); | |||
| auto nccl_gend_func = reinterpret_cast<GroupEnd>(dlsym(const_cast<void *>(collective_handle_), "GroupEnd")); | |||
| MS_EXCEPTION_IF_NULL(nccl_recv_func); | |||
| MS_EXCEPTION_IF_NULL(nccl_send_func); | |||
| MS_EXCEPTION_IF_NULL(nccl_gstart_func); | |||
| MS_EXCEPTION_IF_NULL(nccl_gend_func); | |||
| // This implementation refers to NVIDIA NCCL 2.11 doc. | |||
| CHECK_NCCL_RET_WITH_EXCEPT(kernel_node_, (*nccl_gstart_func)(), "AllToAllv: ncclGroupStart failed"); | |||
| for (int i = 0; i < SizeToInt(input_size_list_.size()); ++i) { | |||
| input_addr = GetDeviceAddress<T>(inputs, i); | |||
| CHECK_NCCL_RET_WITH_EXCEPT(kernel_node_, | |||
| (*nccl_send_func)(input_addr, input_size_list_[i] / sizeof(T), input_nccl_data_type_, | |||
| send_rank_ids[i], stream, group_name_), | |||
| "AllToAllv: ncclSend failed"); | |||
| } | |||
| for (int i = 0; i < SizeToInt(output_size_list_.size()); ++i) { | |||
| output_addr = GetDeviceAddress<I>(outputs, i); | |||
| CHECK_NCCL_RET_WITH_EXCEPT(kernel_node_, | |||
| (*nccl_recv_func)(output_addr, output_size_list_[i] / sizeof(I), | |||
| output_nccl_data_type_, recv_rank_ids[i], stream, group_name_), | |||
| "AllToAllv: ncclRecv failed"); | |||
| } | |||
| CHECK_NCCL_RET_WITH_EXCEPT(kernel_node_, (*nccl_gend_func)(), "AllToAllv: ncclGroupEnd failed"); | |||
| } | |||
| void InferCommType(const CNodePtr &kernel_node) { | |||
| std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node); | |||
| auto iter = kNcclTypeMap.find(kernel_name); | |||
| if (iter == kNcclTypeMap.end()) { | |||
| MS_LOG(EXCEPTION) << "Kernel " << kernel_name << " is not supported."; | |||
| } else { | |||
| nccl_kernel_type_ = iter->second; | |||
| } | |||
| auto prim = AnfAlgo::GetCNodePrimitive(kernel_node); | |||
| MS_EXCEPTION_IF_NULL(prim); | |||
| auto root_rank = prim->GetAttr(kAttrRootRank); | |||
| if (root_rank) { | |||
| root_ = static_cast<int>(GetValue<int64_t>(root_rank)); | |||
| } | |||
| return; | |||
| } | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| std::vector<size_t> workspace_size_list_; | |||
| NcclKernelType nccl_kernel_type_; | |||
| size_t input_size_; | |||
| size_t output_size_; | |||
| int root_; | |||
| bool is_null_input_; | |||
| const void *collective_handle_; | |||
| cudaStream_t comm_stream_; | |||
| ncclDataType_t output_nccl_data_type_; | |||
| ncclDataType_t input_nccl_data_type_; | |||
| std::vector<int64_t> send_rank_ids; | |||
| std::vector<int64_t> recv_rank_ids; | |||
| }; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NCCL_P2P_GPU_KERNEL_H_ | |||
| @@ -658,7 +658,8 @@ OperatorAttrs Conv2DInfo::CreateNeighborExchangeAttrs(const CNodePtr &cnode) { | |||
| Attr send_shapes = {SEND_SHAPES, MakeTupleListValue(send_shapes_)}; | |||
| Attr recv_shapes = {RECV_SHAPES, MakeTupleListValue(recv_shapes_)}; | |||
| Attr recv_type = {RECV_TYPE, dtype}; | |||
| OperatorAttrs attrs = {send_ranks, recv_ranks, recv_shapes, send_shapes, recv_type}; | |||
| Attr group = {GROUP, MakeValue(g_device_manager->world_group())}; | |||
| OperatorAttrs attrs = {send_ranks, recv_ranks, recv_shapes, send_shapes, recv_type, group}; | |||
| return attrs; | |||
| } | |||
| @@ -67,4 +67,16 @@ def test_nccl_send_recv_op(): | |||
| def test_nccl_all_to_all_op(): | |||
| return_code = os.system("mpirun -n 8 pytest -s test_nccl_all_to_all_op.py") | |||
| assert return_code == 0 | |||
| @pytest.mark.level0 | |||
| @pytest.mark.platform_x86_gpu_training | |||
| @pytest.mark.env_single | |||
| def test_nccl_neighbor_exchange_op(): | |||
| """ | |||
| Feature: NeighborExchange GPU operator | |||
| Description: see details in test_nccl_neighbor_exchange_op.py | |||
| Expectation: success, return_code==0 | |||
| """ | |||
| return_code = os.system( | |||
| "mpirun -n 8 pytest -s test_nccl_neighbor_exchange_op.py") | |||
| assert return_code == 0 | |||
| @@ -0,0 +1,65 @@ | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| import numpy as np | |||
| import mindspore.context as context | |||
| import mindspore.nn as nn | |||
| from mindspore import Tensor | |||
| from mindspore.communication.management import init, get_rank, get_group_size | |||
| from mindspore.ops import operations as P | |||
| import mindspore as ms | |||
| context.set_context(mode=context.GRAPH_MODE, device_target='GPU') | |||
| init() | |||
| rank = get_rank() | |||
| size = get_group_size() | |||
| x = np.asarray([1, 1, 1, 1, 1, 1, 1, 1]).astype(np.float32) * rank | |||
| class Net(nn.Cell): | |||
| def __init__(self): | |||
| super(Net, self).__init__() | |||
| self.neighborexchange = P.comm_ops.NeighborExchange( | |||
| send_rank_ids=[(rank - 1) % 8], | |||
| recv_rank_ids=[(rank + 1) % 8], | |||
| recv_shapes=tuple([[8]]), | |||
| send_shapes=tuple([[8]]), | |||
| recv_type=ms.float32, | |||
| group="nccl_world_group") | |||
| def construct(self, inputs): | |||
| return self.neighborexchange(inputs) | |||
| def test_neighborexchange(): | |||
| """ | |||
| Feature: NeighborExchange operator on GPU | |||
| Description: for each device, send to previous rank and receive from next rank. | |||
| example: rank 0 send to rank 7 and receive from rank 1. | |||
| Expectation: on rank i, result == [1 ,1 ,1, 1, 1, 1, 1, 1] * ((i + 1) % 8) | |||
| """ | |||
| neighborexchange = Net() | |||
| expect0 = np.asarray([1, 1, 1, 1, 1, 1, 1, 1]).astype( | |||
| np.float32) * ((rank + 1) % 8) | |||
| inputs = [] | |||
| inputs.append(Tensor(x)) | |||
| inputs = tuple(inputs) | |||
| output0 = neighborexchange(inputs)[0].asnumpy() | |||
| diff0 = output0 - expect0 | |||
| error0 = np.ones(shape=expect0.shape) * 1.0e-5 | |||
| assert np.all(diff0 < error0) | |||
| assert output0.shape == expect0.shape | |||