@@ -16,18 +16,22 @@
 from mindspore.nn.cell import Cell
 from mindspore.communication.management import GlobalComm, get_group_size
 from mindspore.ops import functional as F, composite as C, operations as P
-from mindspore.ops.operations.comm_ops import AllReduce, ReduceOp
+from mindspore.ops.operations.comm_ops import AllReduce, ReduceOp, AllGather
 import mindspore.common.dtype as mstype
 
 reduce_opt = C.MultitypeFuncGraph("reduce_opt")
 
 _all_reduce = AllReduce()
+_all_gather = None
 
 
-def _init_optimizer_allreduce():
+def _init_optimizer_communication():
     global _all_reduce
+    global _all_gather
     _all_reduce = AllReduce(ReduceOp.SUM, GlobalComm.WORLD_COMM_GROUP)
     _all_reduce.add_prim_attr('fusion', 1)
+    _all_gather = AllGather(GlobalComm.WORLD_COMM_GROUP)
 
 
 @reduce_opt.register("Function", "Number", "Bool", "Tensor")
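For context on the new `_all_gather` primitive: a sparse gradient travels as an `(indices, values, dense_shape)` tuple, and each device touches a different set of rows, so the rows cannot be summed elementwise the way `AllReduce` sums dense tensors. `AllGather` instead leaves every device holding the concatenation of all devices' rows; duplicates are combined later. A minimal NumPy sketch (hypothetical two-rank data, no real communication):

```python
import numpy as np

# Per-rank sparse gradients over a 6-row embedding table: (indices, values).
rank0 = (np.array([0, 2]), np.array([[1.0, 1.0], [2.0, 2.0]]))
rank1 = (np.array([2, 5]), np.array([[3.0, 3.0], [4.0, 4.0]]))

# AllGather leaves every rank holding the concatenation along axis 0.
indices = np.concatenate([rank0[0], rank1[0]])  # [0, 2, 2, 5]
values = np.concatenate([rank0[1], rank1[1]])   # shape (4, 2)

# Index 2 appears twice; summing duplicate rows later reproduces the
# result AllReduce(SUM) would give on the densified gradient.
```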
@@ -72,8 +76,8 @@ def _tensors_allreduce_mean_with_sparse(mul, degree, allreduce_filter, grad):
         degree = F.scalar_cast(degree, F.dtype(grad[1]))
         dout = _all_gather(grad[1])
         cast_op = P.Cast()
-        dout = mul(dout, cast_op(F.scalar_to_array(1.0/degree), F.dtype(dout)))
-        grad = (indices, dout, dout[2])
+        dout = mul(dout, cast_op(F.scalar_to_array(1.0 / degree), F.dtype(dout)))
+        grad = (indices, dout, grad[2])
     return grad
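On the mean path above: the gathered values are scaled by `1.0 / degree` so that when duplicate rows are eventually summed, the total equals the cross-device mean. A sketch of the arithmetic, assuming `degree = 2` and hypothetical values:

```python
degree = 2
gathered = [6.0, 2.0]  # values gathered for the same row from both ranks
scaled = [v * (1.0 / degree) for v in gathered]
print(scaled)          # [3.0, 1.0]
# Summing the duplicates gives (6.0 + 2.0) / 2 = 4.0, the mean across
# ranks, matching AllReduce(SUM) followed by division on dense grads.
```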
@@ -110,7 +114,7 @@ def _tensors_allreduce_with_sparse(allreduce_filter, grad):
     if allreduce_filter:
         indices = _all_gather(grad[0])
         dout = _all_gather(grad[1])
-        grad = (indices, dout, dout[2])
+        grad = (indices, dout, grad[2])
     return grad
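Both hunks above apply the same fix: the third tuple element is the gradient's dense shape, which is identical on every rank, so the reduced tuple must carry `grad[2]`. Indexing `dout[2]` instead would slice row 2 of the freshly gathered values, since `dout` was just rebound to the `AllGather` output. A hypothetical illustration of the layout:

```python
# The sparse-gradient layout assumed throughout this file:
#   grad = (indices, values, dense_shape)
grad = ([0, 2], [[0.1, 0.2], [0.3, 0.4]], (6, 2))
dout = [[0.1, 0.2], [0.3, 0.4], [0.5, 0.6], [0.7, 0.8]]  # gathered values
wrong = dout[2]   # a row of values, not a shape: [0.5, 0.6]
right = grad[2]   # the dense shape carried through: (6, 2)
```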
@@ -131,6 +135,20 @@ def _tensors_get_datatype(grad):
     return F.dtype(grad)
 
 
+@_get_datatype.register("Tuple")
+def _tensors_get_datatype_with_sparse(grad):
+    """
+    Acquire gradient datatype.
+
+    Args:
+        grad (Tuple): The gradient tensor before operation.
+
+    Returns:
+        mstype, the datatype of gradient.
+    """
+    return F.dtype(grad[1])
+
+
 _cast_datatype = C.MultitypeFuncGraph("_cast_datatype")
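The new overload registers on `"Tuple"` so the same `_get_datatype` graph handles both dense tensors and sparse triples, reading the dtype from the values entry. A plain-Python analogy of that dispatch (hypothetical stand-in; the real `C.MultitypeFuncGraph` dispatches on MindSpore type tags, not `isinstance`):

```python
import numpy as np

def get_datatype(grad):
    # Sparse grads arrive as (indices, values, dense_shape); the dtype
    # that matters for the cast round-trip is the values entry's.
    if isinstance(grad, tuple):
        return grad[1].dtype
    return grad.dtype

dense = np.zeros(3, dtype=np.float16)
sparse = (np.array([1]), np.zeros((1, 4), dtype=np.float32), (6, 4))
print(get_datatype(dense), get_datatype(sparse))  # float16 float32
```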
@@ -149,6 +167,22 @@ def _tensors_cast_datatype(datatype, grad):
     return F.cast(grad, datatype)
 
 
+@_cast_datatype.register("TypeType", "Tuple")
+def _tensors_cast_datatype_with_sparse(datatype, grad):
+    """
+    Cast gradient to datatype.
+
+    Args:
+        datatype (mstype): the destination datatype of gradient.
+        grad (Tuple): The gradient tensor before operation.
+
+    Returns:
+        Tuple, the gradient tuple after operation.
+    """
+    dout = F.cast(grad[1], datatype)
+    return (grad[0], dout, grad[2])
+
+
 class DistributedGradReducer(Cell):
     """
     A distributed optimizer.
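The sparse cast overload above touches only the values entry: indices must stay integral and `dense_shape` is plain metadata, so neither is cast. A NumPy sketch of the float32 round-trip that `construct` performs (hypothetical stand-in for `F.cast`):

```python
import numpy as np

grad = (np.array([0, 2]), np.ones((2, 2), dtype=np.float16), (6, 2))
orig_dtype = grad[1].dtype                                # float16
grad = (grad[0], grad[1].astype(np.float32), grad[2])     # cast up for comm
grad = (grad[0], grad[1].astype(orig_dtype), grad[2])     # cast back after
```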
@@ -224,7 +258,7 @@ class DistributedGradReducer(Cell):
     def __init__(self, parameters, mean=True, degree=None):
         super(DistributedGradReducer, self).__init__(auto_prefix=False)
-        self.hyper_map = C.HyperMap()
+        self.map_ = C.Map()
         self.mul = P.Mul()
         if degree is None:
             self.degree = get_group_size()
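The `C.HyperMap()` to `C.Map()` switch appears to exist because `HyperMap` recurses into nested structures: it would descend into a sparse `(indices, values, dense_shape)` tuple and apply the function per element, while `Map` applies it once per top-level gradient, letting the new `"Tuple"` overloads receive the triple intact. A plain-Python analogy (hypothetical stand-ins, not the real composite ops):

```python
def map_(fn, seq):
    return tuple(fn(x) for x in seq)  # one call per gradient, tuples intact

def hyper_map(fn, seq):
    # Recurses into tuples, so a sparse triple would be torn apart.
    return tuple(hyper_map(fn, x) if isinstance(x, tuple) else fn(x)
                 for x in seq)
```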
@@ -234,19 +268,27 @@ class DistributedGradReducer(Cell):
             self.degree = degree
         self.mean = mean
         self.allreduce_filter = tuple(x.layerwise_parallel is False for x in parameters)
-        _init_optimizer_allreduce()
+        _init_optimizer_communication()
 
     def construct(self, grads):
-        # In some circumstances, the data precision of grads could be mixed with float16 and float32. Thus, the
-        # result of AllReduce is unreliable. To solve the problem, grads should be cast to float32 before AllReduce,
-        # and cast back after the operation.
-        datatypes = self.hyper_map(F.partial(_get_datatype), grads)
-        grads = self.hyper_map(F.partial(_cast_datatype, mstype.float32), grads)
+        """
+        In some circumstances, the data precision of grads could be mixed with float16 and float32. Thus, the
+        result of AllReduce is unreliable. To solve the problem, grads should be cast to float32 before AllReduce,
+        and cast back after the operation.
+
+        Args:
+            grads (Union[Tensor, tuple[Tensor]]): The gradient tensor or tuple before operation.
+
+        Returns:
+            new_grads (Union[Tensor, tuple[Tensor]]), the gradient tensor or tuple after operation.
+        """
+        datatypes = self.map_(F.partial(_get_datatype), grads)
+        grads = self.map_(F.partial(_cast_datatype, mstype.float32), grads)
         if self.mean:
-            new_grad = self.hyper_map(F.partial(reduce_opt, self.mul, self.degree), self.allreduce_filter, grads)
+            new_grad = self.map_(F.partial(reduce_opt, self.mul, self.degree), self.allreduce_filter, grads)
         else:
-            new_grad = self.hyper_map(F.partial(reduce_opt), self.allreduce_filter, grads)
+            new_grad = self.map_(F.partial(reduce_opt), self.allreduce_filter, grads)
-        new_grad = self.hyper_map(F.partial(_cast_datatype), datatypes, new_grad)
+        new_grad = self.map_(F.partial(_cast_datatype), datatypes, new_grad)
         return new_grad
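For reference, a hedged usage sketch of how `DistributedGradReducer` is typically wired into a training step; `TrainingWrapper` and its arguments are illustrative, and `network` is assumed to return the loss:

```python
from mindspore.nn.cell import Cell
from mindspore.ops import functional as F, composite as C
from mindspore.communication.management import get_group_size

class TrainingWrapper(Cell):
    def __init__(self, network, optimizer):
        super(TrainingWrapper, self).__init__(auto_prefix=False)
        self.network = network
        self.optimizer = optimizer
        self.weights = optimizer.parameters
        self.grad = C.GradOperation(get_by_list=True)
        # mean=True averages gradients across all devices in the group.
        self.grad_reducer = DistributedGradReducer(self.weights, mean=True,
                                                   degree=get_group_size())

    def construct(self, data, label):
        loss = self.network(data, label)
        grads = self.grad(self.network, self.weights)(data, label)
        grads = self.grad_reducer(grads)  # AllReduce / AllGather happens here
        return F.depend(loss, self.optimizer(grads))
```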