diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/l2_loss.cu b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/l2_loss.cu
index 41103cc92b..b80db775ba 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/l2_loss.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/l2_loss.cu
@@ -22,15 +22,22 @@
 template <typename T>
 __global__ void L2LossKernel(const size_t input_size, const T *input , T *output) {
   T ret = 0;
   for (size_t id = blockIdx.x * blockDim.x + threadIdx.x; id < input_size; id += blockDim.x * gridDim.x) {
-    ret = (input[id] * input[id]);
+    ret = input[id] * input[id];
     ret /= static_cast<T>(2);
     MsAtomicAdd(output, ret);
   }
   return;
 }
 
+template <typename T>
+__global__ void ClearOutputMem(T *output) {
+  output[0] = static_cast<T>(0);
+  return;
+}
+
 template <typename T>
 void L2Loss(const size_t input_size, const T *input , T *output, cudaStream_t stream) {
+  ClearOutputMem<<<GET_BLOCKS(1), GET_THREADS, 0, stream>>>(output);
   L2LossKernel<<<GET_BLOCKS(input_size), GET_THREADS, 0, stream>>>(input_size, input, output);
 }
diff --git a/mindspore/ops/operations/math_ops.py b/mindspore/ops/operations/math_ops.py
index 5530f75997..bd3b433af9 100644
--- a/mindspore/ops/operations/math_ops.py
+++ b/mindspore/ops/operations/math_ops.py
@@ -445,7 +445,7 @@ class ReduceAll(_Reduce):
         the shape of output is :math:`(x_1, x_4, ..., x_R)`.
 
     Supported Platforms:
-        ``Ascend``
+        ``Ascend`` ``GPU``
 
     Examples:
         >>> input_x = Tensor(np.array([[True, False], [True, True]]))
@@ -487,7 +487,7 @@ class ReduceAny(_Reduce):
        the shape of output is :math:`(x_1, x_4, ..., x_R)`.
 
     Supported Platforms:
-        ``Ascend``
+        ``Ascend`` ``GPU``
 
     Examples:
         >>> input_x = Tensor(np.array([[True, False], [True, True]]))
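
Note: the ClearOutputMem kernel added above matters because L2LossKernel accumulates each thread's partial x^2/2 term into *output via MsAtomicAdd, so any stale value left in the output cell from an earlier launch would be folded into the new result. What follows is a minimal standalone sketch of that zero-then-accumulate pattern, not MindSpore code: it substitutes CUDA's plain atomicAdd for the MsAtomicAdd wrapper and hard-codes launch dimensions in place of the GET_BLOCKS/GET_THREADS macros.

    // zero_then_accumulate.cu -- illustrative sketch, compile with nvcc
    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void ClearOutputMem(float *output) {
      output[0] = 0.0f;  // reset the accumulator before each reduction
    }

    __global__ void L2LossKernel(size_t input_size, const float *input, float *output) {
      for (size_t id = blockIdx.x * blockDim.x + threadIdx.x; id < input_size;
           id += blockDim.x * gridDim.x) {
        // Each element contributes x^2 / 2; atomicAdd serializes concurrent updates.
        atomicAdd(output, input[id] * input[id] / 2.0f);
      }
    }

    int main() {
      const size_t n = 1024;
      float *input, *output;
      cudaMallocManaged(&input, n * sizeof(float));
      cudaMallocManaged(&output, sizeof(float));
      for (size_t i = 0; i < n; ++i) input[i] = 1.0f;  // expected loss: n / 2 = 512

      for (int run = 0; run < 2; ++run) {
        // Without this clear, the second run would report 1024, not 512:
        // the previous result would still be sitting in the accumulator.
        ClearOutputMem<<<1, 1>>>(output);
        L2LossKernel<<<4, 256>>>(n, input, output);
        cudaDeviceSynchronize();
        printf("run %d: l2_loss = %f\n", run, *output);
      }
      cudaFree(input);
      cudaFree(output);
      return 0;
    }

Running the loop twice shows exactly the failure mode the patch guards against: the first launch is correct, and every subsequent launch silently compounds the previous result unless the output cell is cleared first.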