From 9e5248497bb45b6f69c92d2047c3fa0d47f769c8 Mon Sep 17 00:00:00 2001 From: Ziyan Date: Thu, 17 Sep 2020 15:26:19 +0800 Subject: [PATCH] add batch parallel info black list --- .../ccsrc/frontend/parallel/node_check.cc | 8 ++ .../ccsrc/frontend/parallel/node_check.h | 1 + .../frontend/parallel/ops_info/ops_utils.h | 6 +- .../ccsrc/frontend/parallel/step_parallel.cc | 5 +- .../tensor_layout/tensor_redistribution.cc | 93 +++++++++++-------- .../tensor_layout/tensor_redistribution.h | 3 +- 6 files changed, 74 insertions(+), 42 deletions(-) diff --git a/mindspore/ccsrc/frontend/parallel/node_check.cc b/mindspore/ccsrc/frontend/parallel/node_check.cc index a6e8f7ea2b..48601e792c 100644 --- a/mindspore/ccsrc/frontend/parallel/node_check.cc +++ b/mindspore/ccsrc/frontend/parallel/node_check.cc @@ -80,9 +80,17 @@ const std::set BLACK_LIST = {TUPLE_GETITEM, REF_TO_EMBED, STOP_GRADIENT}; +const std::set BATCH_PARALLEL_BLACK_LIST = {PACK, TENSOR_SCATTER_UPDATE, MIN_MAX_UPDATE_PER_LAYER}; + bool IsInBlackList(const PrimitivePtr &prim) { MS_EXCEPTION_IF_NULL(prim); return (BLACK_LIST.find(prim->name()) != BLACK_LIST.end()); } + +bool IsInBatchParallelBlackList(const PrimitivePtr &prim) { + MS_EXCEPTION_IF_NULL(prim); + return (BATCH_PARALLEL_BLACK_LIST.find(prim->name()) != BATCH_PARALLEL_BLACK_LIST.end()); +} + } // namespace parallel } // namespace mindspore diff --git a/mindspore/ccsrc/frontend/parallel/node_check.h b/mindspore/ccsrc/frontend/parallel/node_check.h index ac8f79ee12..d89b067c1d 100644 --- a/mindspore/ccsrc/frontend/parallel/node_check.h +++ b/mindspore/ccsrc/frontend/parallel/node_check.h @@ -22,6 +22,7 @@ namespace mindspore { namespace parallel { bool IsInBlackList(const PrimitivePtr &prim); +bool IsInBatchParallelBlackList(const PrimitivePtr &prim); } // namespace parallel } // namespace mindspore diff --git a/mindspore/ccsrc/frontend/parallel/ops_info/ops_utils.h b/mindspore/ccsrc/frontend/parallel/ops_info/ops_utils.h index 6fa8761d64..6729dceb0a 100644 
--- a/mindspore/ccsrc/frontend/parallel/ops_info/ops_utils.h +++ b/mindspore/ccsrc/frontend/parallel/ops_info/ops_utils.h @@ -232,7 +232,6 @@ constexpr char SQUARE[] = "Square"; constexpr char BATCHMATMUL[] = "BatchMatMul"; constexpr char TOPK[] = "TopK"; constexpr char IN_TOPK[] = "InTopK"; -constexpr char PACK[] = "Pack"; constexpr char GATHER_ND[] = "GatherNd"; constexpr char UNSORTEF_SEGMENT_MIND[] = "UnsortedSegmentMinD"; constexpr char UNSORTEF_SEGMENT_PRODD[] = "UnsortedSegmentProdD"; @@ -298,6 +297,11 @@ constexpr char ZEROSLIKE[] = "ZerosLike"; constexpr char REF_TO_EMBED[] = "RefToEmbed"; constexpr char STOP_GRADIENT[] = "stop_gradient"; +// Batch parallel black list +constexpr char TENSOR_SCATTER_UPDATE[] = "TensorScatterUpdate"; +constexpr char MIN_MAX_UPDATE_PER_LAYER[] = "MinMaxUpdatePerLayer"; +constexpr char PACK[] = "Pack"; + constexpr size_t LAST_INDEX(size_t s) { return s - 1; } constexpr size_t SECOND_FROM_END(size_t s) { return s - 2; } constexpr size_t THIRD_FROM_END(size_t s) { return s - 3; } diff --git a/mindspore/ccsrc/frontend/parallel/step_parallel.cc b/mindspore/ccsrc/frontend/parallel/step_parallel.cc index e13b34f9d8..2defbfa642 100644 --- a/mindspore/ccsrc/frontend/parallel/step_parallel.cc +++ b/mindspore/ccsrc/frontend/parallel/step_parallel.cc @@ -1029,7 +1029,10 @@ OperatorInfoPtr OperatorInstance(const PrimitivePtr &prim, const PrimitiveAttrs const std::vector &shape_list) { MS_EXCEPTION_IF_NULL(prim); OperatorInfoPtr operator_ = OperatorInstanceByName(prim->name(), attrs, shape_list); - if ((operator_ == nullptr) && (prim->name() != MAKE_TUPLE)) { + if (operator_ == nullptr) { + if (IsInBatchParallelBlackList(prim)) { + MS_LOG(EXCEPTION) << "Operator " << prim->name() << " is not supported yet in auto parallel mode."; + } MS_LOG(INFO) << "Creat " << prim->name() << " failed, use batch parallel"; operator_ = OperatorInstanceByName(BATCH_PARALLEL, attrs, shape_list); MS_EXCEPTION_IF_NULL(operator_); diff --git 
a/mindspore/ccsrc/frontend/parallel/tensor_layout/tensor_redistribution.cc b/mindspore/ccsrc/frontend/parallel/tensor_layout/tensor_redistribution.cc index d03dceead6..1b59b09054 100644 --- a/mindspore/ccsrc/frontend/parallel/tensor_layout/tensor_redistribution.cc +++ b/mindspore/ccsrc/frontend/parallel/tensor_layout/tensor_redistribution.cc @@ -149,45 +149,10 @@ Status TensorRedistribution::ComputeCost() { double prod = std::accumulate(slice_shape.begin(), slice_shape.end(), static_cast(1.0), std::multiplies()); std::string str = op.first; - if (str == PERMUTE_BY_AXIS) { - // Since AlltoAll is a virtual operator, the expanded operators are used here to compute cost. - // communication cost = all_gather + reduce_scatter = before_slice_shape + after_slice_shape - forward_comm_cost_ += prod * ALLTOALL_SCALE_FACTOR; - backward_comm_cost_ += prod * ALLTOALL_SCALE_FACTOR; - comm_cost_ += 2.0 * prod * ALLTOALL_SCALE_FACTOR; - int32_t concat_dim = op.second[2]; - if (concat_dim == 0) { - // memory cost = all_gather - computation_cost_ += prod; - memory_cost_ += prod; - } else { - // memory cost = all_gather + split + concat - int32_t dev_num = op.second[4]; - computation_cost_ += (prod + prod * dev_num + prod * dev_num); - memory_cost_ += (prod * dev_num + prod * dev_num + prod); - } - } else if (str == CONCAT_BY_AXIS) { - // communication cost = all_gather + reduce_scatter = before_slice_shape + after_slice_shape - // computation cost = before_slice_shape - if (op.second.size() < 3) { - MS_LOG(ERROR) << "op.second size should not be less than 3!"; - return Status::FAILED; - } - double dev_num = op.second[2]; - // here, communication cost = all_gather + reduce_scatter - forward_comm_cost_ += prod * dev_num * ALLGATHER_REDUCESCATTER_SCALE_FACTOR; - backward_comm_cost_ += prod * ALLGATHER_REDUCESCATTER_SCALE_FACTOR; - comm_cost_ += prod * (dev_num + 1.0) * ALLGATHER_REDUCESCATTER_SCALE_FACTOR; - int32_t concat_dim = op.second[0]; - if (concat_dim == 0) { - // computation 
cost = all_gather - computation_cost_ += prod; - memory_cost_ += prod * dev_num; - } else { - // computation cost = all_gather + split + concat - computation_cost_ += (prod + prod * dev_num + prod * dev_num); - memory_cost_ += (prod * dev_num + prod * dev_num + prod); - } + if (str == PERMUTE_BY_AXIS && ComputePermuteCost(prod, op.second) != Status::SUCCESS) { + return Status::FAILED; + } else if (str == CONCAT_BY_AXIS && ComputeConcatCost(prod, op.second) != Status::SUCCESS) { + return Status::FAILED; } else { // There is only computation cost in SplitByAxis. // computation cost = before_slice_shape @@ -204,5 +169,55 @@ Status TensorRedistribution::ComputeCost() { } return Status::SUCCESS; } + +Status TensorRedistribution::ComputePermuteCost(double input_size, Shape attrs) { + // Since AlltoAll is a virtual operator, the expanded operators are used here to compute cost. + // communication cost = all_gather + reduce_scatter = before_slice_shape + after_slice_shape + if (attrs.size() < 5) { + MS_LOG(ERROR) << "attrs size should not be less than 5!"; + return Status::FAILED; + } + forward_comm_cost_ += input_size * ALLTOALL_SCALE_FACTOR; + backward_comm_cost_ += input_size * ALLTOALL_SCALE_FACTOR; + comm_cost_ += 2.0 * input_size * ALLTOALL_SCALE_FACTOR; + int32_t concat_dim = attrs[2]; + if (concat_dim == 0) { + // memory cost = all_gather + computation_cost_ += input_size; + memory_cost_ += input_size; + } else { + // memory cost = all_gather + split + concat + int32_t dev_num = attrs[4]; + computation_cost_ += (input_size + input_size * dev_num + input_size * dev_num); + memory_cost_ += (input_size * dev_num + input_size * dev_num + input_size); + } + return Status::SUCCESS; +} + +Status TensorRedistribution::ComputeConcatCost(double input_size, Shape attrs) { + // communication cost = all_gather + reduce_scatter = before_slice_shape + after_slice_shape + // computation cost = before_slice_shape + if (attrs.size() < 3) { + MS_LOG(ERROR) << "op.second size should 
not be less than 3!"; + return Status::FAILED; + } + double dev_num = attrs[2]; + // here, communication cost = all_gather + reduce_scatter + forward_comm_cost_ += input_size * dev_num * ALLGATHER_REDUCESCATTER_SCALE_FACTOR; + backward_comm_cost_ += input_size * ALLGATHER_REDUCESCATTER_SCALE_FACTOR; + comm_cost_ += input_size * (dev_num + 1.0) * ALLGATHER_REDUCESCATTER_SCALE_FACTOR; + int32_t concat_dim = attrs[0]; + if (concat_dim == 0) { + // computation cost = all_gather + computation_cost_ += input_size; + memory_cost_ += input_size * dev_num; + } else { + // computation cost = all_gather + split + concat + computation_cost_ += (input_size + input_size * dev_num + input_size * dev_num); + memory_cost_ += (input_size * dev_num + input_size * dev_num + input_size); + } + return Status::SUCCESS; +} + } // namespace parallel } // namespace mindspore diff --git a/mindspore/ccsrc/frontend/parallel/tensor_layout/tensor_redistribution.h b/mindspore/ccsrc/frontend/parallel/tensor_layout/tensor_redistribution.h index 196827d18a..2509e28553 100644 --- a/mindspore/ccsrc/frontend/parallel/tensor_layout/tensor_redistribution.h +++ b/mindspore/ccsrc/frontend/parallel/tensor_layout/tensor_redistribution.h @@ -61,7 +61,8 @@ class TensorRedistribution { private: Status InferReshape(const TensorLayout &from_layout, const TensorLayout &to_layout, OperatorVector *const operator_vector, OutPutInfoVector *const output_info_vector); - + Status ComputeConcatCost(double input_size, Shape attrs); + Status ComputePermuteCost(double input_size, Shape attrs); TensorLayout from_origin_; TensorLayout to_origin_; TensorLayout from_;