From 9e5248497bb45b6f69c92d2047c3fa0d47f769c8 Mon Sep 17 00:00:00 2001 From: Ziyan Date: Thu, 17 Sep 2020 15:26:19 +0800 Subject: [PATCH] add batch parallel info black list --- .../ccsrc/frontend/parallel/node_check.cc | 8 ++ .../ccsrc/frontend/parallel/node_check.h | 1 + .../frontend/parallel/ops_info/ops_utils.h | 6 +- .../ccsrc/frontend/parallel/step_parallel.cc | 5 +- .../tensor_layout/tensor_redistribution.cc | 93 +++++++++++-------- .../tensor_layout/tensor_redistribution.h | 3 +- 6 files changed, 74 insertions(+), 42 deletions(-) diff --git a/mindspore/ccsrc/frontend/parallel/node_check.cc b/mindspore/ccsrc/frontend/parallel/node_check.cc index a6e8f7ea2b..48601e792c 100644 --- a/mindspore/ccsrc/frontend/parallel/node_check.cc +++ b/mindspore/ccsrc/frontend/parallel/node_check.cc @@ -80,9 +80,17 @@ const std::set BLACK_LIST = {TUPLE_GETITEM, REF_TO_EMBED, STOP_GRADIENT}; +const std::set BATCH_PARALLEL_BLACK_LIST = {PACK, TENSOR_SCATTER_UPDATE, MIN_MAX_UPDATE_PER_LAYER}; + bool IsInBlackList(const PrimitivePtr &prim) { MS_EXCEPTION_IF_NULL(prim); return (BLACK_LIST.find(prim->name()) != BLACK_LIST.end()); } + +bool IsInBatchParallelBlackList(const PrimitivePtr &prim) { + MS_EXCEPTION_IF_NULL(prim); + return (BATCH_PARALLEL_BLACK_LIST.find(prim->name()) != BATCH_PARALLEL_BLACK_LIST.end()); +} + } // namespace parallel } // namespace mindspore diff --git a/mindspore/ccsrc/frontend/parallel/node_check.h b/mindspore/ccsrc/frontend/parallel/node_check.h index ac8f79ee12..d89b067c1d 100644 --- a/mindspore/ccsrc/frontend/parallel/node_check.h +++ b/mindspore/ccsrc/frontend/parallel/node_check.h @@ -22,6 +22,7 @@ namespace mindspore { namespace parallel { bool IsInBlackList(const PrimitivePtr &prim); +bool IsInBatchParallelBlackList(const PrimitivePtr &prim); } // namespace parallel } // namespace mindspore diff --git a/mindspore/ccsrc/frontend/parallel/ops_info/ops_utils.h b/mindspore/ccsrc/frontend/parallel/ops_info/ops_utils.h index 6fa8761d64..6729dceb0a 100644 
--- a/mindspore/ccsrc/frontend/parallel/ops_info/ops_utils.h +++ b/mindspore/ccsrc/frontend/parallel/ops_info/ops_utils.h @@ -232,7 +232,6 @@ constexpr char SQUARE[] = "Square"; constexpr char BATCHMATMUL[] = "BatchMatMul"; constexpr char TOPK[] = "TopK"; constexpr char IN_TOPK[] = "InTopK"; -constexpr char PACK[] = "Pack"; constexpr char GATHER_ND[] = "GatherNd"; constexpr char UNSORTEF_SEGMENT_MIND[] = "UnsortedSegmentMinD"; constexpr char UNSORTEF_SEGMENT_PRODD[] = "UnsortedSegmentProdD"; @@ -298,6 +297,11 @@ constexpr char ZEROSLIKE[] = "ZerosLike"; constexpr char REF_TO_EMBED[] = "RefToEmbed"; constexpr char STOP_GRADIENT[] = "stop_gradient"; +// Batch parallel black list +constexpr char TENSOR_SCATTER_UPDATE[] = "TensorScatterUpdate"; +constexpr char MIN_MAX_UPDATE_PER_LAYER[] = "MinMaxUpdatePerLayer"; +constexpr char PACK[] = "Pack"; + constexpr size_t LAST_INDEX(size_t s) { return s - 1; } constexpr size_t SECOND_FROM_END(size_t s) { return s - 2; } constexpr size_t THIRD_FROM_END(size_t s) { return s - 3; } diff --git a/mindspore/ccsrc/frontend/parallel/step_parallel.cc b/mindspore/ccsrc/frontend/parallel/step_parallel.cc index e13b34f9d8..2defbfa642 100644 --- a/mindspore/ccsrc/frontend/parallel/step_parallel.cc +++ b/mindspore/ccsrc/frontend/parallel/step_parallel.cc @@ -1029,7 +1029,10 @@ OperatorInfoPtr OperatorInstance(const PrimitivePtr &prim, const PrimitiveAttrs const std::vector &shape_list) { MS_EXCEPTION_IF_NULL(prim); OperatorInfoPtr operator_ = OperatorInstanceByName(prim->name(), attrs, shape_list); - if ((operator_ == nullptr) && (prim->name() != MAKE_TUPLE)) { + if (operator_ == nullptr) { + if (IsInBatchParallelBlackList(prim)) { + MS_LOG(EXCEPTION) << "Operator " << prim->name() << " is not supported yet in auto parallel mode."; + } MS_LOG(INFO) << "Creat " << prim->name() << " failed, use batch parallel"; operator_ = OperatorInstanceByName(BATCH_PARALLEL, attrs, shape_list); MS_EXCEPTION_IF_NULL(operator_); diff --git 
a/mindspore/ccsrc/frontend/parallel/tensor_layout/tensor_redistribution.cc b/mindspore/ccsrc/frontend/parallel/tensor_layout/tensor_redistribution.cc index d03dceead6..1b59b09054 100644 --- a/mindspore/ccsrc/frontend/parallel/tensor_layout/tensor_redistribution.cc +++ b/mindspore/ccsrc/frontend/parallel/tensor_layout/tensor_redistribution.cc @@ -149,45 +149,10 @@ Status TensorRedistribution::ComputeCost() { double prod = std::accumulate(slice_shape.begin(), slice_shape.end(), static_cast(1.0), std::multiplies()); std::string str = op.first; - if (str == PERMUTE_BY_AXIS) { - // Since AlltoAll is a virtual operator, the expanded operators are used here to compute cost. - // communication cost = all_gather + reduce_scatter = before_slice_shape + after_slice_shape - forward_comm_cost_ += prod * ALLTOALL_SCALE_FACTOR; - backward_comm_cost_ += prod * ALLTOALL_SCALE_FACTOR; - comm_cost_ += 2.0 * prod * ALLTOALL_SCALE_FACTOR; - int32_t concat_dim = op.second[2]; - if (concat_dim == 0) { - // memory cost = all_gather - computation_cost_ += prod; - memory_cost_ += prod; - } else { - // memory cost = all_gather + split + concat - int32_t dev_num = op.second[4]; - computation_cost_ += (prod + prod * dev_num + prod * dev_num); - memory_cost_ += (prod * dev_num + prod * dev_num + prod); - } - } else if (str == CONCAT_BY_AXIS) { - // communication cost = all_gather + reduce_scatter = before_slice_shape + after_slice_shape - // computation cost = before_slice_shape - if (op.second.size() < 3) { - MS_LOG(ERROR) << "op.second size should not be less than 3!"; - return Status::FAILED; - } - double dev_num = op.second[2]; - // here, communication cost = all_gather + reduce_scatter - forward_comm_cost_ += prod * dev_num * ALLGATHER_REDUCESCATTER_SCALE_FACTOR; - backward_comm_cost_ += prod * ALLGATHER_REDUCESCATTER_SCALE_FACTOR; - comm_cost_ += prod * (dev_num + 1.0) * ALLGATHER_REDUCESCATTER_SCALE_FACTOR; - int32_t concat_dim = op.second[0]; - if (concat_dim == 0) { - // computation 
cost = all_gather - computation_cost_ += prod; - memory_cost_ += prod * dev_num; - } else { - // computation cost = all_gather + split + concat - computation_cost_ += (prod + prod * dev_num + prod * dev_num); - memory_cost_ += (prod * dev_num + prod * dev_num + prod); - } + if (str == PERMUTE_BY_AXIS && ComputePermuteCost(prod, op.second) != Status::SUCCESS) { + return Status::FAILED; + } else if (str == CONCAT_BY_AXIS && ComputeConcatCost(prod, op.second) != Status::SUCCESS) { + return Status::FAILED; } else { // There is only computation cost in SplitByAxis. // computation cost = before_slice_shape @@ -204,5 +169,55 @@ Status TensorRedistribution::ComputeCost() { } return Status::SUCCESS; } + +Status TensorRedistribution::ComputePermuteCost(double input_size, Shape attrs) { + // Since AlltoAll is a virtual operator, the expanded operators are used here to compute cost. + // communication cost = all_gather + reduce_scatter = before_slice_shape + after_slice_shape + if (attrs.size() < 5) { + MS_LOG(ERROR) << "attrs size should not be less than 5!"; + return Status::FAILED; + } + forward_comm_cost_ += input_size * ALLTOALL_SCALE_FACTOR; + backward_comm_cost_ += input_size * ALLTOALL_SCALE_FACTOR; + comm_cost_ += 2.0 * input_size * ALLTOALL_SCALE_FACTOR; + int32_t concat_dim = attrs[2]; + if (concat_dim == 0) { + // memory cost = all_gather + computation_cost_ += input_size; + memory_cost_ += input_size; + } else { + // memory cost = all_gather + split + concat + int32_t dev_num = attrs[4]; + computation_cost_ += (input_size + input_size * dev_num + input_size * dev_num); + memory_cost_ += (input_size * dev_num + input_size * dev_num + input_size); + } + return Status::SUCCESS; +} + +Status TensorRedistribution::ComputeConcatCost(double input_size, Shape attrs) { + // communication cost = all_gather + reduce_scatter = before_slice_shape + after_slice_shape + // computation cost = before_slice_shape + if (attrs.size() < 3) { + MS_LOG(ERROR) << "op.second size should 
not be less than 3!"; + return Status::FAILED; + } + double dev_num = attrs[2]; + // here, communication cost = all_gather + reduce_scatter + forward_comm_cost_ += input_size * dev_num * ALLGATHER_REDUCESCATTER_SCALE_FACTOR; + backward_comm_cost_ += input_size * ALLGATHER_REDUCESCATTER_SCALE_FACTOR; + comm_cost_ += input_size * (dev_num + 1.0) * ALLGATHER_REDUCESCATTER_SCALE_FACTOR; + int32_t concat_dim = attrs[0]; + if (concat_dim == 0) { + // computation cost = all_gather + computation_cost_ += input_size; + memory_cost_ += input_size * dev_num; + } else { + // computation cost = all_gather + split + concat + computation_cost_ += (input_size + input_size * dev_num + input_size * dev_num); + memory_cost_ += (input_size * dev_num + input_size * dev_num + input_size); + } + return Status::SUCCESS; +} + } // namespace parallel } // namespace mindspore diff --git a/mindspore/ccsrc/frontend/parallel/tensor_layout/tensor_redistribution.h b/mindspore/ccsrc/frontend/parallel/tensor_layout/tensor_redistribution.h index 196827d18a..2509e28553 100644 --- a/mindspore/ccsrc/frontend/parallel/tensor_layout/tensor_redistribution.h +++ b/mindspore/ccsrc/frontend/parallel/tensor_layout/tensor_redistribution.h @@ -61,7 +61,8 @@ class TensorRedistribution { private: Status InferReshape(const TensorLayout &from_layout, const TensorLayout &to_layout, OperatorVector *const operator_vector, OutPutInfoVector *const output_info_vector); - + Status ComputeConcatCost(double input_size, Shape attrs); + Status ComputePermuteCost(double input_size, Shape attrs); TensorLayout from_origin_; TensorLayout to_origin_; TensorLayout from_;