Browse Source

Add distributed parallel operators for ReduceAll and ReduceProd

feature/build-system-rewrite
Bert0108 4 years ago
parent
commit
dfc92f1791
7 changed files with 406 additions and 47 deletions
  1. +1
    -0
      mindspore/ccsrc/frontend/parallel/auto_parallel/operator_costmodel.h
  2. +2
    -0
      mindspore/ccsrc/frontend/parallel/dynamic_creator.h
  3. +6
    -1
      mindspore/ccsrc/frontend/parallel/ops_info/ops_utils.h
  4. +68
    -19
      mindspore/ccsrc/frontend/parallel/ops_info/reduce_method_info.cc
  5. +24
    -1
      mindspore/ccsrc/frontend/parallel/ops_info/reduce_method_info.h
  6. +1
    -1
      mindspore/ccsrc/frontend/parallel/step_auto_parallel.cc
  7. +304
    -25
      tests/ut/python/parallel/test_reduce_method_info.py

+ 1
- 0
mindspore/ccsrc/frontend/parallel/auto_parallel/operator_costmodel.h View File

@@ -752,6 +752,7 @@ class ReduceSumCost : public OperatorCost {
bool cross_batch_ = false;
};
using ReduceMethodCost = ReduceSumCost;
using ReduceProdCost = ReduceSumCost;

class ReduceMeanCost : public ReduceSumCost {
public:


+ 2
- 0
mindspore/ccsrc/frontend/parallel/dynamic_creator.h View File

@@ -215,6 +215,8 @@ REGISTER(IOUInfo);
REGISTER(RandomChoiceWithMaskInfo);
REGISTER(CropAndResizeInfo);
REGISTER(ROIAlignInfo);
REGISTER(ReduceProdInfo);
REGISTER(ReduceAllInfo);
} // namespace parallel
} // namespace mindspore



+ 6
- 1
mindspore/ccsrc/frontend/parallel/ops_info/ops_utils.h View File

@@ -102,7 +102,9 @@ constexpr char REDUCE_OP_SUM[] = "sum";
constexpr char STRATEGY_GEN_MODE[] = "strategy_gen_mode";
constexpr char REDUCE_OP_MAX[] = "max";
constexpr char REDUCE_OP_MIN[] = "min";
constexpr char REDUCE_OP_ANY[] = "any";
constexpr char REDUCE_OP_ANY[] = "sum";
constexpr char REDUCE_OP_ALL[] = "prod";
constexpr char REDUCE_OP_PROD[] = "prod";
constexpr char OP_PATH[] = "mindspore.ops.operations";
constexpr char INNER_OP_PATH[] = "mindspore.ops.operations._inner_ops";
constexpr char FUNCTIONAL_OP_PATH[] = "mindspore.ops.functional";
@@ -328,6 +330,9 @@ constexpr char REDUCE_MAX[] = "ReduceMax";
constexpr char REDUCE_MIN[] = "ReduceMin";
constexpr char REDUCE_SUM[] = "ReduceSum";
constexpr char REDUCE_MEAN[] = "ReduceMean";
constexpr char REDUCE_PROD[] = "ReduceProd";
constexpr char REDUCE_ALL[] = "ReduceAll";
constexpr char REDUCE_ANY[] = "ReduceAny";
constexpr char ARGMAXWITHVALUE[] = "ArgMaxWithValue";
constexpr char ARGMINWITHVALUE[] = "ArgMinWithValue";
constexpr char CONV2D[] = "Conv2D";


+ 68
- 19
mindspore/ccsrc/frontend/parallel/ops_info/reduce_method_info.cc View File

@@ -280,6 +280,74 @@ Status ReduceMeanInfo::InferForwardCommunication() {
return SUCCESS;
}

// Builds the forward communication sequence for a boolean reduction.
// The tensor is cast to Int32, all-reduced across the forward group with
// reduce_method_ (presumably because AllReduce does not accept Bool tensors
// directly — confirm against the collective backend), then cast back to Bool.
ForwardOp ReduceAnyInfo::CreateForwardOp(const std::vector<Group> &forward_group) {
// Create Cast to Int32 op so the collective can operate on an integer tensor.
Operator op0 = CreateCastOp(kInt32);

// Create AllReduce op over the first forward group; reduce_method_ selects the
// collective reduction (set by this class or a subclass such as ReduceAllInfo).
Operator op1 = CreateAllReduceOp(reduce_method_, forward_group[0].name());
std::string group_name = forward_group[0].name();
MS_LOG(INFO) << "The group of forward all reduce is " << group_name << ", method is " << reduce_method_;

// Create Cast back to Bool op to restore the logical result type.
Operator op2 = CreateCastOp(kBool);

// The three ops run in order: cast -> allreduce -> cast.
ForwardOp forward_op = {op0, op1, op2};

return forward_op;
}

// Determines whether a forward AllReduce is required and, if so, creates the
// communication group and installs the cast/allreduce/cast forward op.
// A tensor-map (group_creat_map) is built that keeps only the dimensions NOT
// being reduced-and-split; CreateGroupByTensorMap then derives the device
// group from it. Returns SUCCESS, or FAILED if group creation fails.
Status ReduceAnyInfo::InferForwardCommunication() {
Dimensions stra = strategy_->GetInputDim().at(0);
// Pure data-parallel with cross_batch set: each device keeps its own partial
// result, so no forward communication is inserted.
if (cross_batch_ && IsDataParallelStrategy(stra, stage_id_)) {
MS_LOG(INFO) << name_ << ": cross_batch is True, don't need to InferForwardCommunication";
return SUCCESS;
}
forward_op_.clear();
std::vector<int64_t> dim_list = reduce_dim();
size_t size = stra.size();
// Tensor map used to create the communication group; built below by judging
// whether each reduce dim is partitioned.
Shape group_creat_map;

// If there is repeated calculation and repeated_calc_num_ was inserted at the
// FIRST dimension of the device matrix, the first dimension of the map must
// reference it explicitly.
if ((dev_matrix_shape_.size() > size) && !repeated_num_in_dev_matrix_right_) {
group_creat_map.push_back(SizeToInt(dev_matrix_shape_.size() - size_t(1)));
}

// Keep every dimension that is either not reduced or not split; skip
// dimensions that are both reduced and split (those need the AllReduce).
for (size_t index = 0; index < size; ++index) {
auto pos =
std::find_if(dim_list.begin(), dim_list.end(), [index](const int64_t &dim) { return SizeToLong(index) == dim; });
if (pos != dim_list.end() && stra[index] != 1) {
continue;
}
// Tensor-map convention: dimension index counts down from size-1.
group_creat_map.push_back(SizeToLong(size) - SizeToLong(index) - 1);
}

// If there is repeated calculation and repeated_calc_num_ was inserted at the
// LAST dimension of the device matrix, shift every mapped entry up by one and
// append 0 so the map stays aligned with the device matrix.
if (repeated_num_in_dev_matrix_right_ && (repeated_calc_num_ > 1)) {
for (auto &ele : group_creat_map) {
if (ele == MAP_NONE) {
continue;
}
ele += 1;
}
group_creat_map.push_back(0);
}

std::vector<Group> forward_group;
if (CreateGroupByTensorMap(group_creat_map, &forward_group) != SUCCESS) {
ReportError(name_ + ": Create group failed.");
return FAILED;
}
// An empty group means no cross-device reduction is needed for this strategy.
if (!forward_group.empty()) {
forward_op_ = CreateForwardOp(forward_group);
}

return SUCCESS;
}

Status ReduceMethod::InferMirrorOps() {
mirror_ops_.clear();
Shape input_tensor_map = inputs_tensor_map_.at(0);
@@ -520,24 +588,5 @@ std::vector<StrategyPtr> ArgMaxWithValueInfo::GenerateOpStrategies(int64_t stage

return sp_vector;
}

// NOTE(review): in this commit's diff this definition appears on the removed
// side of the hunk (the header drops the CheckStrategy override), meaning the
// restriction below was lifted in favor of handling split reduce dims via
// InferForwardCommunication — confirm against the full file.
// Validates the strategy after the base-class check and additionally rejects
// any strategy that splits a reduced dimension.
Status ReduceAnyInfo::CheckStrategy(const StrategyPtr &strategy) {
if (ReduceMethod::CheckStrategy(strategy) != SUCCESS) {
MS_LOG(ERROR) << name_ << ": checking strategy failed.";
return FAILED;
}
auto dim_list = ReduceMethod::reduce_dim();
Dimensions stra = strategy->GetInputDim().at(0);
// Fail if any dimension is both in the reduce-dim list and split (> 1).
for (size_t index = 0; index < stra.size(); ++index) {
auto pos =
std::find_if(dim_list.begin(), dim_list.end(), [index](const int64_t &dim) { return SizeToLong(index) == dim; });
if (pos != dim_list.end() && stra[index] != 1) {
MS_LOG(ERROR) << name_
<< ": checking strategy failed. ReduceAny operator does not support reduced dimension split.";
return FAILED;
}
}
return SUCCESS;
}
} // namespace parallel
} // namespace mindspore

+ 24
- 1
mindspore/ccsrc/frontend/parallel/ops_info/reduce_method_info.h View File

@@ -131,7 +131,8 @@ class ReduceAnyInfo : public ReduceMethod {
~ReduceAnyInfo() override = default;

protected:
Status CheckStrategy(const StrategyPtr &strategy) override;
Status InferForwardCommunication() override;
ForwardOp CreateForwardOp(const std::vector<Group> &forward_group);
};

class ReduceMinInfo : public ReduceMethod {
@@ -144,6 +145,28 @@ class ReduceMinInfo : public ReduceMethod {

~ReduceMinInfo() override = default;
};

// Operator-parallel info for ReduceProd. Reuses the generic ReduceMethod
// machinery; only the collective reduce method (REDUCE_OP_PROD) and the cost
// model (ReduceProdCost, an alias of ReduceSumCost) are specialized.
class ReduceProdInfo : public ReduceMethod {
public:
ReduceProdInfo(const std::string &name, const Shapes &inputs_shape, const Shapes &outputs_shape,
const PrimitiveAttrs &attrs)
: ReduceMethod(name, inputs_shape, outputs_shape, attrs, std::make_shared<ReduceProdCost>()) {
reduce_method_ = REDUCE_OP_PROD;
}

~ReduceProdInfo() override = default;
};

// Operator-parallel info for ReduceAll. Inherits ReduceAnyInfo's
// cast -> AllReduce -> cast forward communication and only swaps the
// collective method to REDUCE_OP_ALL (declared as "prod" in ops_utils.h:
// a product of 0/1 values is 1 iff all are true).
class ReduceAllInfo : public ReduceAnyInfo {
public:
ReduceAllInfo(const std::string &name, const Shapes &inputs_shape, const Shapes &outputs_shape,
const PrimitiveAttrs &attrs)
: ReduceAnyInfo(name, inputs_shape, outputs_shape, attrs) {
reduce_method_ = REDUCE_OP_ALL;
}

~ReduceAllInfo() override = default;
};
} // namespace parallel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_REDUCE_SUM_INFO_H_

+ 1
- 1
mindspore/ccsrc/frontend/parallel/step_auto_parallel.cc View File

@@ -174,7 +174,7 @@ bool IsSplittableOperator(const std::string &op_name) {
UNSORTED_SEGMENT_MIN, REPEAT_ELEMENTS, TENSOR_DOT, RANGE, UNIFORM_CANDIDATE_SAMPLER, SLICE, SELECT, GATHERD,
UNSORTED_SEGMENT_MAX, GATHER_ND, TOPK, SCATTER_UPDATE, VIRTUAL_OUTPUT, CONV2D_BACK_PROP_INPUT, CONV2D_TRANSPOSE,
MATMUL_DDS, DSD_MATMUL, UNIFORMREAL, RESIZE_BILINEAR, RESIZE_NEAREST_NEIGHBOR, CUMSUM, FAST_GELU, IOU,
BOUNDING_BOX_ENCODE, RANDOM_CHOICE_WITH_MASK, CROP_AND_RESIZE, ROI_ALIGN};
BOUNDING_BOX_ENCODE, RANDOM_CHOICE_WITH_MASK, CROP_AND_RESIZE, ROI_ALIGN, REDUCE_PROD, REDUCE_ANY, REDUCE_ALL};
// clang-format on

auto iter = splittable_op.find(op_name);


+ 304
- 25
tests/ut/python/parallel/test_reduce_method_info.py View File

@@ -11,9 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''Reduce method ut'''
import numpy as np
import pytest
import mindspore as ms
import mindspore.nn as nn
from mindspore import Tensor
@@ -81,9 +80,14 @@ def compile_net(net, x, y, b):

# model_parallel test
def test_sum_mul():
"""
Feature: test ReduceSum model parallel strategy
Description: partition the non-reduced axes, keep_dims is False
Expectation: compile success
"""
class Net(nn.Cell):
def __init__(self, strategy1, strategy2, strategy3):
super().__init__()
super(Net, self).__init__()
self.mul1 = P.Mul().shard(strategy1)
self.reduce_sum = P.ReduceSum(keep_dims=False).shard(strategy2)
self.mul2 = P.Mul().shard(strategy3)
@@ -108,9 +112,14 @@ def test_sum_mul():


def test_sum_mul2():
"""
Feature: test ReduceSum model parallel strategy
Description: partition the reduced axes, keep_dims is False
Expectation: compile success
"""
class Net(nn.Cell):
def __init__(self, strategy1, strategy2, strategy3):
super().__init__()
super(Net, self).__init__()
self.mul1 = P.Mul().shard(strategy1)
self.reduce_sum = P.ReduceSum(keep_dims=False).shard(strategy2)
self.mul2 = P.Mul().shard(strategy3)
@@ -135,9 +144,14 @@ def test_sum_mul2():


def test_sum_mul3():
"""
Feature: test ReduceSum model parallel strategy
Description: partition the non-reduced axes, keep_dims is False
Expectation: compile success
"""
class Net(nn.Cell):
def __init__(self, strategy1, strategy2, strategy3):
super().__init__()
super(Net, self).__init__()
self.mul1 = P.Mul().shard(strategy1)
self.reduce_sum = P.ReduceSum(keep_dims=False).shard(strategy2)
self.mul2 = P.Mul().shard(strategy3)
@@ -162,9 +176,14 @@ def test_sum_mul3():


def test_sum_mul4():
"""
Feature: test ReduceSum model parallel strategy
Description: partition the reduced axes, keep_dims is True
Expectation: compile success
"""
class Net(nn.Cell):
def __init__(self, strategy1, strategy2, strategy3):
super().__init__()
super(Net, self).__init__()
self.mul1 = P.Mul().shard(strategy1)
self.reduce_sum = P.ReduceSum(keep_dims=True).shard(strategy2)
self.mul2 = P.Mul().shard(strategy3)
@@ -189,9 +208,14 @@ def test_sum_mul4():


def test_sum_mul5():
"""
Feature: test ReduceSum model parallel strategy
Description: partition the reduced axes, keep_dims is True
Expectation: compile success
"""
class Net(nn.Cell):
def __init__(self, strategy1, strategy2):
super().__init__()
super(Net, self).__init__()
self.mul1 = P.Mul().shard(strategy1)
self.reduce_sum = P.ReduceSum(keep_dims=True).shard(strategy2)

@@ -212,9 +236,14 @@ def test_sum_mul5():


def test_sum_mul6():
"""
Feature: test ReduceSum model parallel strategy
Description: partition the non-reduced axes, keep_dims is True
Expectation: compile success
"""
class Net(nn.Cell):
def __init__(self, strategy1, strategy2):
super().__init__()
super(Net, self).__init__()
self.mul1 = P.Mul().shard(strategy1)
self.reduce_sum = P.ReduceSum(keep_dims=True).shard(strategy2)

@@ -235,9 +264,14 @@ def test_sum_mul6():


def test_sum_mul7():
"""
Feature: test ReduceSum model parallel strategy
Description: partition the reduced axes, keep_dims is True
Expectation: compile success
"""
class Net(nn.Cell):
def __init__(self, strategy1, strategy2):
super().__init__()
super(Net, self).__init__()
self.mul1 = P.Mul().shard(strategy1)
self.reduce_sum = P.ReduceSum(keep_dims=True).shard(strategy2)

@@ -258,9 +292,14 @@ def test_sum_mul7():


def test_max_mul():
"""
Feature: test ReduceMax model parallel strategy
Description: partition the reduced axes, keep_dims is False
Expectation: compile success
"""
class Net(nn.Cell):
def __init__(self, strategy1, strategy2, strategy3):
super().__init__()
super(Net, self).__init__()
self.mul1 = P.Mul().shard(strategy1)
self.reduce_max = P.ReduceMax(keep_dims=False).shard(strategy2)
self.mul2 = P.Mul().shard(strategy3)
@@ -285,9 +324,14 @@ def test_max_mul():


def test_min_mul():
"""
Feature: test ReduceMin model parallel strategy
Description: partition the reduced axes, keep_dims is False
Expectation: compile success
"""
class Net(nn.Cell):
def __init__(self, strategy1, strategy2, strategy3):
super().__init__()
super(Net, self).__init__()
self.mul1 = P.Mul().shard(strategy1)
self.reduce_min = P.ReduceMin(keep_dims=False).shard(strategy2)
self.mul2 = P.Mul().shard(strategy3)
@@ -312,9 +356,14 @@ def test_min_mul():


def test_reduce_mean_mul_float32():
"""
Feature: test ReduceMean model parallel strategy
Description: partition the reduced axes, keep_dims is False
Expectation: compile success
"""
class Net(nn.Cell):
def __init__(self, strategy1, strategy2, strategy3):
super().__init__()
super(Net, self).__init__()
self.mul1 = P.Mul().shard(strategy1)
self.reduce_mean = P.ReduceMean(keep_dims=False).shard(strategy2)
self.mul2 = P.Mul().shard(strategy3)
@@ -341,7 +390,7 @@ def test_reduce_mean_mul_float32():

class ArgMaxWithValueNet(nn.Cell):
def __init__(self, strategy1, strategy2, strategy3):
super().__init__()
super(ArgMaxWithValueNet, self).__init__()
self.mul1 = P.Mul().shard(strategy1)
self.arg_max_with_value = P.ArgMaxWithValue(keep_dims=False, axis=-1).shard(strategy2)
self.mul2 = P.Mul().shard(strategy3)
@@ -355,7 +404,7 @@ class ArgMaxWithValueNet(nn.Cell):

class ArgMinWithValueNet(nn.Cell):
def __init__(self, strategy1, strategy2, strategy3):
super().__init__()
super(ArgMinWithValueNet, self).__init__()
self.mul1 = P.Mul().shard(strategy1)
self.arg_min_with_value = P.ArgMinWithValue(keep_dims=False, axis=-1).shard(strategy2)
self.mul2 = P.Mul().shard(strategy3)
@@ -391,6 +440,11 @@ def tobefixed_test_arg_max_with_value_mul_semi_axis_parallel():


def test_arg_max_with_value_mul_semi():
"""
Feature: test ArgMaxWithValue semi parallel strategy
Description: partition the reduced axes, keep_dims is False
Expectation: compile success
"""
context.set_auto_parallel_context(device_num=8, global_rank=0)
strategy1 = ((1, 4, 2), (1, 4, 2))
strategy2 = ((4, 1, 1),)
@@ -401,6 +455,11 @@ def test_arg_max_with_value_mul_semi():


def test_arg_max_with_value_mul_auto():
"""
Feature: test ArgMaxWithValue auto parallel strategy
Description: don't set the strategy, keep_dims is False
Expectation: compile success
"""
context.set_auto_parallel_context(device_num=8, global_rank=0)
strategy1 = None
strategy2 = None
@@ -411,6 +470,11 @@ def test_arg_max_with_value_mul_auto():


def test_arg_min_with_value_mul_semi_axis_parallel():
"""
Feature: test ArgMinWithValue semi parallel strategy
Description: partition the reduced axes, keep_dims is False
Expectation: compile success
"""
context.set_auto_parallel_context(device_num=8, global_rank=0)
strategy1 = ((1, 4, 2), (1, 4, 2))
strategy2 = ((4, 1, 2),)
@@ -421,6 +485,11 @@ def test_arg_min_with_value_mul_semi_axis_parallel():


def test_arg_min_with_value_mul_semi():
"""
Feature: test ArgMinWithValue model parallel strategy
Description: partition the non-reduced axes, keep_dims is False
Expectation: compile success
"""
context.set_auto_parallel_context(device_num=8, global_rank=0)
strategy1 = ((1, 4, 2), (1, 4, 2))
strategy2 = ((4, 1, 1),)
@@ -431,6 +500,11 @@ def test_arg_min_with_value_mul_semi():


def test_arg_min_with_value_mul_auto():
"""
Feature: test ArgMinWithValue auto parallel strategy
Description: don't set the strategy, keep_dims is False
Expectation: compile success
"""
context.set_auto_parallel_context(device_num=8, global_rank=0)
strategy1 = None
strategy2 = None
@@ -442,7 +516,7 @@ def test_arg_min_with_value_mul_auto():

class ArgMinWithValueNet2(nn.Cell):
def __init__(self, strategy1, strategy2, strategy3):
super().__init__()
super(ArgMinWithValueNet2, self).__init__()
self.mul1 = P.Mul().shard(strategy1)
self.arg_min_with_value = P.ArgMinWithValue(keep_dims=True, axis=-1).shard(strategy2)
self.relu = P.ReLU().shard(strategy3)
@@ -465,6 +539,11 @@ def tobefixed_test_arg_min_with_value_mul_semi_axis_parallel2():


def test_arg_min_with_value_mul_semi2():
"""
Feature: test ArgMinWithValue semi parallel strategy
Description: partition the non-reduced axes, keep_dims is True
Expectation: compile success
"""
context.set_auto_parallel_context(device_num=8, global_rank=0)
strategy1 = ((1, 4, 2), (1, 4, 2))
strategy2 = ((4, 1, 1),)
@@ -475,6 +554,11 @@ def test_arg_min_with_value_mul_semi2():


def test_arg_min_with_value_mul_auto2():
"""
Feature: test ArgMinWithValue auto parallel strategy
Description: don't set the strategy, keep_dims is True
Expectation: compile success
"""
context.set_auto_parallel_context(device_num=8, global_rank=0)
strategy1 = None
strategy2 = None
@@ -485,12 +569,18 @@ def test_arg_min_with_value_mul_auto2():


def test_cross_batch():
"""
Feature: test ReduceMean semi parallel strategy with cross_batch
Description: partition the reduced axes, keep_dims is False
Expectation: compile success
"""
class Net(nn.Cell):
def __init__(self, strategy1, strategy2, strategy3):
super().__init__()
super(Net, self).__init__()
self.mul1 = P.Mul().shard(strategy1)
self.reduce_sum = P.ReduceSum(keep_dims=False).shard(strategy2)
self.reduce_mean = P.ReduceMean(keep_dims=False).shard(strategy3).add_prim_attr("cross_batch", True)
self.reduce_mean = P.ReduceMean(keep_dims=False).shard(strategy3) \
.add_prim_attr("cross_batch", True)

def construct(self, x, y):
out = self.mul1(x, y)
@@ -511,12 +601,18 @@ def test_cross_batch():


def test_cross_batch2():
"""
Feature: test ReduceSum semi parallel strategy with cross_batch
Description: partition the reduced axes, keep_dims is False
Expectation: compile success
"""
class Net(nn.Cell):
def __init__(self, strategy1, strategy2, strategy3):
super().__init__()
super(Net, self).__init__()
self.mul1 = P.Mul().shard(strategy1)
self.reduce_mean = P.ReduceMean(keep_dims=False).shard(strategy2)
self.reduce_sum = P.ReduceSum(keep_dims=False).shard(strategy3).add_prim_attr("cross_batch", True)
self.reduce_sum = P.ReduceSum(keep_dims=False).shard(strategy3) \
.add_prim_attr("cross_batch", True)

def construct(self, x, y):
out = self.mul1(x, y)
@@ -537,9 +633,14 @@ def test_cross_batch2():


def test_cross_batch_auto():
"""
Feature: test ReduceSum auto parallel strategy with cross_batch
Description: don't set the strategy, keep_dims is False
Expectation: compile success
"""
class Net(nn.Cell):
def __init__(self):
super().__init__()
super(Net, self).__init__()
self.mul1 = P.Mul()
self.reduce_mean = P.ReduceMean(keep_dims=False)
self.reduce_sum = P.ReduceSum(keep_dims=False).add_prim_attr("cross_batch", True)
@@ -560,9 +661,14 @@ def test_cross_batch_auto():


def test_max_empty_tuple():
"""
Feature: test ReduceMax semi parallel strategy
Description: partition the reduced axes, keep_dims is False
Expectation: compile success
"""
class Net(nn.Cell):
def __init__(self, strategy1, strategy2, strategy3):
super().__init__()
super(Net, self).__init__()
self.mul = P.Mul().shard(strategy1)
self.reduce_max = P.ReduceMax(keep_dims=False).shard(strategy2)
self.add = P.Add().shard(strategy3)
@@ -588,9 +694,14 @@ def test_max_empty_tuple():


def test_any_mul():
"""
Feature: test ReduceAny semi parallel strategy
Description: partition the reduced axes, keep_dims is False
Expectation: compile success
"""
class Net(nn.Cell):
def __init__(self, strategy1, strategy2):
super().__init__()
super(Net, self).__init__()
self.mul1 = P.Mul().shard(strategy1)
self.reduce_any = P.ReduceAny(keep_dims=False).shard(strategy2)
self.cast = P.Cast()
@@ -609,14 +720,18 @@ def test_any_mul():

x = Tensor(np.ones([128, 32, 64]), dtype=ms.float32)
y = Tensor(np.ones([128, 32, 64]), dtype=ms.float32)
with pytest.raises(RuntimeError):
compile_net_no_bias(net, x, y)
compile_net_no_bias(net, x, y)


def test_any_mul2():
"""
Feature: test ReduceAny semi parallel strategy
Description: partition the non-reduced axes, keep_dims is False
Expectation: compile success
"""
class Net(nn.Cell):
def __init__(self, strategy1, strategy2):
super().__init__()
super(Net, self).__init__()
self.mul1 = P.Mul().shard(strategy1)
self.reduce_any = P.ReduceAny(keep_dims=False).shard(strategy2)
self.cast = P.Cast()
@@ -636,3 +751,167 @@ def test_any_mul2():
x = Tensor(np.ones([128, 32, 64]), dtype=ms.float32)
y = Tensor(np.ones([128, 32, 64]), dtype=ms.float32)
compile_net_no_bias(net, x, y)

def test_all_mul():
    """
    Feature: ReduceAll operator under semi auto parallel
    Description: the reduced axis (axis 1) is sharded, keep_dims is False
    Expectation: compile success
    """
    class AllNet(nn.Cell):
        def __init__(self, mul_stra, all_stra):
            super(AllNet, self).__init__()
            self.mul1 = P.Mul().shard(mul_stra)
            self.reduce_all = P.ReduceAll(keep_dims=False).shard(all_stra)
            self.cast = P.Cast()

        def construct(self, x, y):
            prod = self.mul1(x, y)
            flags = self.cast(prod, ms.bool_)
            return self.reduce_all(flags, 1)

    context.set_auto_parallel_context(device_num=8, global_rank=0)
    mul_stra = ((1, 8, 1), (1, 8, 1))
    all_stra = ((1, 8, 1),)
    net = GradWrapNoBias(NetWithLossNoBias(AllNet(mul_stra, all_stra)))
    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")

    x = Tensor(np.ones((128, 32, 64)), dtype=ms.float32)
    y = Tensor(np.ones((128, 32, 64)), dtype=ms.float32)
    compile_net_no_bias(net, x, y)


def test_all_mul2():
    """
    Feature: ReduceAll operator under semi auto parallel
    Description: only non-reduced axes are sharded (reduce over axis -1), keep_dims is False
    Expectation: compile success
    """
    class AllNet(nn.Cell):
        def __init__(self, mul_stra, all_stra):
            super(AllNet, self).__init__()
            self.mul1 = P.Mul().shard(mul_stra)
            self.reduce_all = P.ReduceAll(keep_dims=False).shard(all_stra)
            self.cast = P.Cast()

        def construct(self, x, y):
            prod = self.mul1(x, y)
            flags = self.cast(prod, ms.bool_)
            return self.reduce_all(flags, -1)

    context.set_auto_parallel_context(device_num=8, global_rank=0)
    mul_stra = ((8, 1, 1), (8, 1, 1))
    all_stra = ((8, 1, 1),)
    net = GradWrapNoBias(NetWithLossNoBias(AllNet(mul_stra, all_stra)))
    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")

    x = Tensor(np.ones((128, 32, 64)), dtype=ms.float32)
    y = Tensor(np.ones((128, 32, 64)), dtype=ms.float32)
    compile_net_no_bias(net, x, y)

def test_prod_mul():
    """
    Feature: ReduceProd operator model parallel
    Description: the reduced axis (axis 0) is sharded, keep_dims is False
    Expectation: compile success
    """
    class ProdNet(nn.Cell):
        def __init__(self, mul_stra, prod_stra):
            super(ProdNet, self).__init__()
            self.mul1 = P.Mul().shard(mul_stra)
            self.reduce_prod = P.ReduceProd(keep_dims=False).shard(prod_stra)

        def construct(self, x, y):
            product = self.mul1(x, y)
            return self.reduce_prod(product, 0)

    context.set_auto_parallel_context(device_num=8, global_rank=0)
    mul_stra = ((1, 1, 8), (1, 1, 8))
    prod_stra = ((2, 4, 1),)
    net = GradWrapNoBias(NetWithLossNoBias(ProdNet(mul_stra, prod_stra)))
    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")

    x = Tensor(np.ones((128, 32, 64)), dtype=ms.float32)
    y = Tensor(np.ones((128, 32, 64)), dtype=ms.float32)
    compile_net_no_bias(net, x, y)

def test_prod_mul2():
    """
    Feature: ReduceProd operator model parallel
    Description: only non-reduced axes are sharded (reduce over axis -1), keep_dims is False
    Expectation: compile success
    """
    class ProdNet(nn.Cell):
        def __init__(self, mul_stra, prod_stra):
            super(ProdNet, self).__init__()
            self.mul1 = P.Mul().shard(mul_stra)
            self.reduce_prod = P.ReduceProd(keep_dims=False).shard(prod_stra)

        def construct(self, x, y):
            product = self.mul1(x, y)
            return self.reduce_prod(product, -1)

    context.set_auto_parallel_context(device_num=8, global_rank=0)
    mul_stra = ((1, 8, 1), (1, 8, 1))
    prod_stra = ((2, 4, 1),)
    net = GradWrapNoBias(NetWithLossNoBias(ProdNet(mul_stra, prod_stra)))
    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")

    x = Tensor(np.ones((128, 32, 64)), dtype=ms.float32)
    y = Tensor(np.ones((128, 32, 64)), dtype=ms.float32)
    compile_net_no_bias(net, x, y)

def test_prod_mul3():
    """
    Feature: ReduceProd operator model parallel
    Description: the reduced axis (axis 0) is sharded, keep_dims is True
    Expectation: compile success
    """
    class ProdNet(nn.Cell):
        def __init__(self, mul_stra, prod_stra):
            super(ProdNet, self).__init__()
            self.mul = P.Mul().shard(mul_stra)
            self.reduce_prod = P.ReduceProd(keep_dims=True).shard(prod_stra)

        def construct(self, x, y):
            product = self.mul(x, y)
            return self.reduce_prod(product, 0)

    context.set_auto_parallel_context(device_num=8, global_rank=0)
    mul_stra = ((1, 1, 8), (1, 1, 8))
    prod_stra = ((8, 1, 1),)
    net = GradWrapNoBias(NetWithLossNoBias(ProdNet(mul_stra, prod_stra)))
    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")

    x = Tensor(np.ones((128, 32, 64)), dtype=ms.float32)
    y = Tensor(np.ones((128, 32, 64)), dtype=ms.float32)
    compile_net_no_bias(net, x, y)

def test_prod_mul_auto():
    """
    Feature: ReduceProd operator under auto parallel
    Description: no strategy is set, keep_dims is True
    Expectation: compile success
    """
    class ProdNet(nn.Cell):
        def __init__(self, mul_stra, prod_stra):
            super(ProdNet, self).__init__()
            self.mul1 = P.Mul().shard(mul_stra)
            self.reduce_prod = P.ReduceProd(keep_dims=True).shard(prod_stra)

        def construct(self, x, y):
            product = self.mul1(x, y)
            return self.reduce_prod(product, 0)

    context.set_auto_parallel_context(device_num=8, global_rank=0)
    # None strategies let the auto-parallel searcher pick the sharding.
    net = GradWrapNoBias(NetWithLossNoBias(ProdNet(None, None)))
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    gen_inputs_and_compile_net_no_bias(net)

Loading…
Cancel
Save