@@ -389,31 +389,52 @@ def reduce_scatter_sum(
 def all_reduce_sum(
     inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None,
 ) -> Tensor:
| r"""Reduce tensors across the specified group by sum. | |||||
| r"""Reduce tensors with sum operation on each value across the specified group. | |||||
| Note: | |||||
| ``inp`` tensor must have identical shape in all processes across the group. | |||||
| Args: | Args: | ||||
| inp: Input tensor. | |||||
| group: The process group to work on. | |||||
| The default group is WORLD which means all processes available. | |||||
| You can use a list of process ranks to create new group to work on it, e.g. [1, 3, 5]. | |||||
| device: The specific device to execute this operator. | |||||
| None default device means the device of inp will be used. | |||||
| Specify "gpu0:1" to execute this operator on diffrent cuda stream, | |||||
| 1 is stream id, and default stream id is 0. | |||||
| inp (Tensor): tensor to be reduced. | |||||
| Keyword args: | |||||
| group (Group or sequence of ints): the process group to work on. Default: ``WORLD``. | |||||
| ``WORLD`` group selects all processes available. | |||||
| list of process rank as parameter will create a new group to work on. | |||||
| device (:attr:`.Tensor.device`): the specific device to execute this operator. Default: ``None`` | |||||
| ``None`` will select the device of ``inp`` to execute. | |||||
| Specially, ``GPU`` device can assign a different stream to execute | |||||
| by adding a number right after a colon following the device name while | |||||
| ``:0`` denotes default stream of GPU, otherwise will use default stream. | |||||
| Returns: | Returns: | ||||
| Result tensor. | |||||
| A tensor with sum operation on each value across the group. | |||||
| The shape of the output tensor must be the same as ``inp``, and the output | |||||
| tensor is going to be bitwise identical in all processes across the group. | |||||
| Examples: | Examples: | ||||
| .. code-block:: | |||||
| >>> # We execute all_reduce_sum on rank 0 and rank 1 | |||||
| >>> input = F.arange(2) + 1 + 2 * rank | |||||
| >>> input | |||||
| Tensor([1. 2.], device=xpux:0) # Rank 0 | |||||
| Tensor([3. 4.], device=xpux:0) # Rank 1 | |||||
| >>> F.distributed.all_reduce_sum(input, group=[0, 1]) | |||||
| Tensor([4. 6.], device=xpux:0) # Rank 0 | |||||
| Tensor([4. 6.], device=xpux:0) # Rank 1 | |||||
| >>> # We execute all_reduce_sum with on gpu0 with cuda stream 1 | |||||
| >>> megengine.set_default_device("gpu0") | |||||
| >>> input = F.arange(2) + 1 + 2 * rank | |||||
| >>> input | |||||
| Tensor([1. 2.], device=gpu0:0) # Rank 0 | |||||
| Tensor([3. 4.], device=gpu0:0) # Rank 1 | |||||
| >>> F.distributed.all_reduce_sum(input, device="gpu0:1") | |||||
| Tensor([4. 6.], device=gpu0:0) # Rank 0 | |||||
| Tensor([4. 6.], device=gpu0:0) # Rank 1 | |||||
| input = Tensor(rank) | |||||
| # Rank 0 # input: Tensor(0) | |||||
| # Rank 1 # input: Tensor(1) | |||||
| output = all_reduce_sum(input) | |||||
| # Rank 0 # output: Tensor(1) | |||||
| # Rank 1 # output: Tensor(1) | |||||
| """ | """ | ||||
| mode = CollectiveComm.Mode.ALL_REDUCE_SUM | mode = CollectiveComm.Mode.ALL_REDUCE_SUM | ||||
| return collective_comm(inp, mode, group, device) | return collective_comm(inp, mode, group, device) | ||||
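The doctest above assumes a two-process distributed context in which ``rank`` is already bound. A minimal driver for reproducing it is sketched below; it is not part of this patch, and it assumes ``megengine.distributed.launcher`` accepts ``n_gpus`` and binds one worker process per device.

```python
# Hypothetical driver for the all_reduce_sum doctest above (not part of the patch).
# Assumes a machine with at least two devices visible to megengine.distributed.launcher.
import megengine.distributed as dist
import megengine.functional as F

@dist.launcher(n_gpus=2)            # spawn one worker per device and set up the WORLD group
def run_all_reduce_sum():
    rank = dist.get_rank()
    inp = F.arange(2) + 1 + 2 * rank            # rank 0: [1. 2.], rank 1: [3. 4.]
    out = F.distributed.all_reduce_sum(inp)     # every rank receives [4. 6.]
    print(f"rank {rank}: {out.numpy()}")

if __name__ == "__main__":
    run_all_reduce_sum()
```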
@@ -422,32 +443,53 @@ def all_reduce_sum(
 def all_reduce_max(
     inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None,
 ) -> Tensor:
| r"""Reduce tensors across the specified group by max. | |||||
| r"""Reduce tensors with max operation on each value across the specified group. | |||||
| Note: | |||||
| ``inp`` tensor must have identical shape in all processes across the group. | |||||
| Args: | Args: | ||||
| inp: Input tensor. | |||||
| group: The process group to work on. | |||||
| The default group is WORLD which means all processes available. | |||||
| You can use a list of process ranks to create new group to work on it, e.g. [1, 3, 5]. | |||||
| device: The specific device to execute this operator. | |||||
| None default device means the device of inp will be used. | |||||
| Specify "gpu0:1" to execute this operator on diffrent cuda stream, | |||||
| 1 is stream id, and default stream id is 0. | |||||
| inp (Tensor): tensor to be reduced. | |||||
| Keyword args: | |||||
| group (Group or sequence of ints): the process group to work on. Default: ``WORLD``. | |||||
| ``WORLD`` group selects all processes available. | |||||
| list of process rank as parameter will create a new group to work on. | |||||
| device (:attr:`.Tensor.device`): the specific device to execute this operator. Default: ``None`` | |||||
| ``None`` will select the device of ``inp`` to execute. | |||||
| Specially, ``GPU`` device can assign a different stream to execute | |||||
| by adding a number right after a colon following the device name while | |||||
| ``:0`` denotes default stream of GPU, otherwise will use default stream. | |||||
| Returns: | Returns: | ||||
| Result tensor. | |||||
| A tensor with max operation on each value across the group. | |||||
| The shape of the output tensor must be the same as ``inp``, and the output | |||||
| tensor is going to be bitwise identical in all processes across the group. | |||||
| Examples: | Examples: | ||||
| .. code-block:: | |||||
| >>> # We execute all_reduce_max on rank 0 and rank 1 | |||||
| >>> input = F.arange(2) + 1 + 2 * rank | |||||
| >>> input | |||||
| Tensor([1. 2.], device=xpux:0) # Rank 0 | |||||
| Tensor([3. 4.], device=xpux:0) # Rank 1 | |||||
| >>> F.distributed.all_reduce_max(input, group=[0, 1]) | |||||
| Tensor([3. 4.], device=xpux:0) # Rank 0 | |||||
| Tensor([3. 4.], device=xpux:0) # Rank 1 | |||||
| >>> # We execute all_reduce_max with on gpu0 with cuda stream 1 | |||||
| >>> megengine.set_default_device("gpu0") | |||||
| >>> input = F.arange(2) + 1 + 2 * rank | |||||
| >>> input | |||||
| Tensor([1. 2.], device=gpu0:0) # Rank 0 | |||||
| Tensor([3. 4.], device=gpu0:0) # Rank 1 | |||||
| >>> F.distributed.all_reduce_max(input, device="gpu0:1") | |||||
| Tensor([3. 4.], device=xpux:0) # Rank 0 | |||||
| Tensor([3. 4.], device=xpux:0) # Rank 1 | |||||
| input = Tensor(rank) | |||||
| # Rank 0 # input: Tensor(0) | |||||
| # Rank 1 # input: Tensor(1) | |||||
| output = all_reduce_max(input) | |||||
| # Rank 0 # output: Tensor(1) | |||||
| # Rank 1 # output: Tensor(1) | |||||
| """ | """ | ||||
| mode = CollectiveComm.Mode.ALL_REDUCE_MAX | mode = CollectiveComm.Mode.ALL_REDUCE_MAX | ||||
| return collective_comm(inp, mode, group, device) | return collective_comm(inp, mode, group, device) | ||||
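The second doctest above exercises the stream-selection form of ``device``. The sketch below shows the same pattern as worker-side code; it assumes (as the doctest does) that both workers may use ``gpu0`` and that stream 1 is available on that device.

```python
# Hypothetical worker-side use of the "device:stream" form documented above
# (not part of the patch). Assumes it runs inside a 2-process job such as the
# launcher sketch earlier, and that both workers are allowed to use gpu0.
import megengine as mge
import megengine.distributed as dist
import megengine.functional as F

def reduce_max_on_stream_1():
    mge.set_default_device("gpu0")                  # tensors are created on gpu0
    rank = dist.get_rank()
    inp = F.arange(2) + 1 + 2 * rank                # rank 0: [1. 2.], rank 1: [3. 4.]
    # "gpu0:1" = device gpu0, cuda stream 1; "gpu0:0" (or omitting device)
    # would use the default stream instead.
    return F.distributed.all_reduce_max(inp, device="gpu0:1")   # both ranks: [3. 4.]
```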
@@ -455,31 +497,51 @@ def all_reduce_max(
 def all_reduce_min(
     inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None,
 ) -> Tensor:
| r"""Reduce tensors across the specified group by min. | |||||
| r"""Reduce tensors with min operation on each value across the specified group. | |||||
| Note: | |||||
| ``inp`` tensor must have identical shape in all processes across the group. | |||||
| Args: | Args: | ||||
| inp: Input tensor. | |||||
| group: The process group to work on. | |||||
| The default group is WORLD which means all processes available. | |||||
| You can use a list of process ranks to create new group to work on it, e.g. [1, 3, 5]. | |||||
| device: The specific device to execute this operator. | |||||
| None default device means the device of inp will be used. | |||||
| Specify "gpu0:1" to execute this operator on diffrent cuda stream, | |||||
| 1 is stream id, and default stream id is 0. | |||||
| inp (Tensor): tensor to be reduced. | |||||
| Keyword args: | |||||
| group (Group or sequence of ints): the process group to work on. Default: ``WORLD``. | |||||
| ``WORLD`` group selects all processes available. | |||||
| list of process rank as parameter will create a new group to work on. | |||||
| device (:attr:`.Tensor.device`): the specific device to execute this operator. Default: ``None`` | |||||
| ``None`` will select the device of ``inp`` to execute. | |||||
| Specially, ``GPU`` device can assign a different stream to execute | |||||
| by adding a number right after a colon following the device name while | |||||
| ``:0`` denotes default stream of GPU, otherwise will use default stream. | |||||
| Returns: | Returns: | ||||
| Result tensor. | |||||
| A tensor with min operation on each value across the group. | |||||
| The shape of the output tensor must be the same as ``inp``, and the output | |||||
| tensor is going to be bitwise identical in all processes across the group. | |||||
| Examples: | Examples: | ||||
| .. code-block:: | |||||
| >>> # We execute all_reduce_min on rank 0 and rank 1 | |||||
| >>> input = F.arange(2) + 1 + 2 * rank | |||||
| >>> input | |||||
| Tensor([1. 2.], device=xpux:0) # Rank 0 | |||||
| Tensor([3. 4.], device=xpux:0) # Rank 1 | |||||
| >>> F.distributed.all_reduce_min(input, group=[0, 1]) | |||||
| Tensor([1. 2.], device=xpux:0) # Rank 0 | |||||
| Tensor([1. 2.], device=xpux:0) # Rank 1 | |||||
| >>> # We execute all_reduce_min with on gpu0 with cuda stream 1 | |||||
| >>> megengine.set_default_device("gpu0") | |||||
| >>> input = F.arange(2) + 1 + 2 * rank | |||||
| >>> input | |||||
| Tensor([1. 2.], device=gpu0:0) # Rank 0 | |||||
| Tensor([3. 4.], device=gpu0:0) # Rank 1 | |||||
| >>> F.distributed.all_reduce_min(input, device="gpu0:1") | |||||
| Tensor([1. 2.], device=xpux:0) # Rank 0 | |||||
| Tensor([1. 2.], device=xpux:0) # Rank 1 | |||||
| input = Tensor(rank) | |||||
| # Rank 0 # input: Tensor(0) | |||||
| # Rank 1 # input: Tensor(1) | |||||
| output = all_reduce_min(input) | |||||
| # Rank 0 # output: Tensor(0) | |||||
| # Rank 1 # output: Tensor(0) | |||||
| """ | """ | ||||
| mode = CollectiveComm.Mode.ALL_REDUCE_MIN | mode = CollectiveComm.Mode.ALL_REDUCE_MIN | ||||
| return collective_comm(inp, mode, group, device) | return collective_comm(inp, mode, group, device) | ||||
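Both the old and the new docstrings mention passing a list of ranks to ``group`` to reduce over a subset of processes. A hedged sketch of that usage follows, assuming a four-process job and assuming that only the member ranks need to issue the collective call.

```python
# Hypothetical sub-group reduction (not part of the patch): in a 4-process job,
# only ranks 0 and 1 participate in all_reduce_min via group=[0, 1].
# Assumes non-member ranks can simply skip the call.
import megengine.distributed as dist
import megengine.functional as F
from megengine import Tensor

@dist.launcher(n_gpus=4)
def run_grouped_min():
    rank = dist.get_rank()
    if rank in (0, 1):
        inp = Tensor([float(rank + 1)])                         # rank 0: [1.], rank 1: [2.]
        out = F.distributed.all_reduce_min(inp, group=[0, 1])   # both members: [1.]
        print(f"rank {rank}: {out.numpy()}")

if __name__ == "__main__":
    run_grouped_min()
```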