diff --git a/mindspore/nn/loss/loss.py b/mindspore/nn/loss/loss.py
index 589f9318fd..d9b39b8674 100644
--- a/mindspore/nn/loss/loss.py
+++ b/mindspore/nn/loss/loss.py
@@ -187,8 +187,8 @@ class RMSELoss(_Loss):
 
     Inputs:
-        - **logits** (Tensor) - Tensor of shape :math:`(x_1, x_2, ..., x_R)`.
-        - **label** (Tensor) - Tensor of shape :math:`(y_1, y_2, ..., y_S)`.
+        - **logits** (Tensor) - Tensor of shape :math:`(x_1, x_2, ..., x_M)`.
+        - **label** (Tensor) - Tensor of shape :math:`(y_1, y_2, ..., y_N)`.
 
     Outputs:
         Tensor, weighted loss float tensor.
 
@@ -219,19 +219,20 @@ class MAELoss(_Loss):
     MAELoss creates a standard to measure the average absolute error between :math:`x` and :math:`y`
     element-wise, where :math:`x` is the input and :math:`y` is the target.
 
-    For simplicity, let :math:`x` and :math:`y` be 1-dimensional Tensor with length :math:`N`,
+    For simplicity, let :math:`x` and :math:`y` be 1-dimensional Tensors of lengths :math:`M` and :math:`N` respectively,
     the unreduced loss (i.e. with argument reduction set to 'none') of :math:`x` and :math:`y` is given as:
 
     .. math::
-        \text{MAE} = \frac{1}{M}\sum_{m=1}^N\left| x_m - y_m \right|
+        MAE = \begin{cases} \frac{1}{M}\sum_{m=1,n=1}^{M,N}{|x_m-y_n|}, & \text{if } M > N \\\\
+        \frac{1}{N}\sum_{m=1,n=1}^{M,N}{|x_m-y_n|}, & \text{if } M \le N \end{cases}
 
     Args:
         reduction (str): Type of reduction to be applied to loss. The optional values are "mean",
            "sum", and "none". Default: "mean".
 
     Inputs:
-        - **logits** (Tensor) - Tensor of shape :math:`(x_1, x_2, ..., x_R)`.
-        - **label** (Tensor) - Tensor of shape :math:`(y_1, y_2, ..., y_S)`.
+        - **logits** (Tensor) - Tensor of shape :math:`(x_1, x_2, ..., x_M)`.
+        - **label** (Tensor) - Tensor of shape :math:`(y_1, y_2, ..., y_N)`.
 
     Outputs:
         Tensor, weighted loss float tensor.
@@ -488,9 +489,9 @@ class MultiClassDiceLoss(_Loss):
             Default: 'softmax'. Choose from: ['softmax', 'logsoftmax', 'relu', 'relu6', 'tanh','Sigmoid']
 
     Inputs:
-        - **y_pred** (Tensor) - Tensor of shape (N, C, ...). y_pred dimension should be greater than 1. The data type
-          must be float16 or float32.
-        - **y** (Tensor) - Tensor of shape (N, C, ...). y dimension should be greater than 1. The data type must be
+        - **y_pred** (Tensor) - Tensor of shape (N, C, ...). The y_pred dimension should be greater than 1. The data
+          type must be float16 or float32.
+        - **y** (Tensor) - Tensor of shape (N, C, ...). The y dimension should be greater than 1. The data type must be
           float16 or float32.
 
     Outputs:
diff --git a/mindspore/nn/optim/optimizer.py b/mindspore/nn/optim/optimizer.py
index e4bbf0a2fb..69722dad85 100755
--- a/mindspore/nn/optim/optimizer.py
+++ b/mindspore/nn/optim/optimizer.py
@@ -621,6 +621,17 @@ def _tensor_apply_decay(weight_decay, if_apply, weight, gradient):
     return gradient
 
 
+@_apply_grad_centralization.register("Bool", "RowTensor")
+def _tensor_apply_grad_centralization_with_sparse(if_apply, gradient):
+    """Apply gradient centralization to a sparse (RowTensor) gradient."""
+    if if_apply:
+        indices = gradient.indices
+        values = op_gc(gradient.values, -1)
+        shape = gradient.dense_shape
+        return RowTensor(indices, values, shape)
+    return gradient
+
+
 @_apply_grad_centralization.register("Bool", "Tensor")
 def _tensor_apply_grad_centralization(if_apply, gradient):
     """Get grad with grad_centralization."""
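Note on the new MAELoss docstring formula: the two cases simply describe the shorter input being broadcast against the longer one before the element-wise absolute error is averaged (the square root in the original patch was a copy-paste from RMSELoss and has been dropped above, since MAE is a plain mean of absolute errors). A minimal NumPy sketch of the M > N case; the array values here are illustrative, not taken from the patch:

```python
import numpy as np

# logits of length M = 3, label of length N = 1 (broadcast against logits)
x = np.array([1.0, 2.0, 3.0], dtype=np.float32)
y = np.array([2.0], dtype=np.float32)

# |x - y| is computed element-wise after broadcasting, then averaged
# over the larger length M, matching the docstring's M > N case.
mae = np.abs(x - y).mean()
print(mae)  # (1 + 0 + 1) / 3 = 0.6667
```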
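The new RowTensor overload in optimizer.py mirrors the dense `_tensor_apply_grad_centralization` registered below it: only `values` is centralized, while `indices` and `dense_shape` pass through unchanged. Assuming `op_gc` is a centralization primitive that subtracts the mean along the given axis (its binding is defined elsewhere in optimizer.py and is not shown in this patch), the behaviour can be sketched in plain NumPy:

```python
import numpy as np

def centralize(values: np.ndarray, axis: int = -1) -> np.ndarray:
    """NumPy stand-in for op_gc(values, axis): subtract the mean taken
    along `axis`, with keepdims so it broadcasts back to the input shape."""
    return values - values.mean(axis=axis, keepdims=True)

# A sparse gradient only carries the rows actually touched in this step;
# centralization is applied per row of `values`.
values = np.array([[1.0, 2.0, 3.0],
                   [4.0, 6.0, 8.0]], dtype=np.float32)
print(centralize(values))
# [[-1.  0.  1.]
#  [-2.  0.  2.]]
```

Each row of the result sums to zero along the centralized axis, which is the defining property of gradient centralization.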