Browse Source

add black list for batch parallel info

tags/v1.0.0
Ziyan 5 years ago
parent
commit
9e5248497b
6 changed files with 74 additions and 42 deletions
  1. +8
    -0
      mindspore/ccsrc/frontend/parallel/node_check.cc
  2. +1
    -0
      mindspore/ccsrc/frontend/parallel/node_check.h
  3. +5
    -1
      mindspore/ccsrc/frontend/parallel/ops_info/ops_utils.h
  4. +4
    -1
      mindspore/ccsrc/frontend/parallel/step_parallel.cc
  5. +54
    -39
      mindspore/ccsrc/frontend/parallel/tensor_layout/tensor_redistribution.cc
  6. +2
    -1
      mindspore/ccsrc/frontend/parallel/tensor_layout/tensor_redistribution.h

+ 8
- 0
mindspore/ccsrc/frontend/parallel/node_check.cc View File

@@ -80,9 +80,17 @@ const std::set<std::string> BLACK_LIST = {TUPLE_GETITEM,
REF_TO_EMBED,
STOP_GRADIENT};

const std::set<std::string> BATCH_PARALLEL_BLACK_LIST = {PACK, TENSOR_SCATTER_UPDATE, MIN_MAX_UPDATE_PER_LAYER};

bool IsInBlackList(const PrimitivePtr &prim) {
MS_EXCEPTION_IF_NULL(prim);
return (BLACK_LIST.find(prim->name()) != BLACK_LIST.end());
}

bool IsInBatchParallelBlackList(const PrimitivePtr &prim) {
MS_EXCEPTION_IF_NULL(prim);
return (BATCH_PARALLEL_BLACK_LIST.find(prim->name()) != BATCH_PARALLEL_BLACK_LIST.end());
}

} // namespace parallel
} // namespace mindspore

+ 1
- 0
mindspore/ccsrc/frontend/parallel/node_check.h View File

@@ -22,6 +22,7 @@
namespace mindspore {
namespace parallel {
bool IsInBlackList(const PrimitivePtr &prim);
bool IsInBatchParallelBlackList(const PrimitivePtr &prim);
} // namespace parallel
} // namespace mindspore



+ 5
- 1
mindspore/ccsrc/frontend/parallel/ops_info/ops_utils.h View File

@@ -232,7 +232,6 @@ constexpr char SQUARE[] = "Square";
constexpr char BATCHMATMUL[] = "BatchMatMul";
constexpr char TOPK[] = "TopK";
constexpr char IN_TOPK[] = "InTopK";
constexpr char PACK[] = "Pack";
constexpr char GATHER_ND[] = "GatherNd";
constexpr char UNSORTEF_SEGMENT_MIND[] = "UnsortedSegmentMinD";
constexpr char UNSORTEF_SEGMENT_PRODD[] = "UnsortedSegmentProdD";
@@ -298,6 +297,11 @@ constexpr char ZEROSLIKE[] = "ZerosLike";
constexpr char REF_TO_EMBED[] = "RefToEmbed";
constexpr char STOP_GRADIENT[] = "stop_gradient";

// Batch parallel black list
constexpr char TENSOR_SCATTER_UPDATE[] = "TensorScatterUpdate";
constexpr char MIN_MAX_UPDATE_PER_LAYER[] = "MinMaxUpdatePerLayer";
constexpr char PACK[] = "Pack";

constexpr size_t LAST_INDEX(size_t s) { return s - 1; }
constexpr size_t SECOND_FROM_END(size_t s) { return s - 2; }
constexpr size_t THIRD_FROM_END(size_t s) { return s - 3; }


+ 4
- 1
mindspore/ccsrc/frontend/parallel/step_parallel.cc View File

@@ -1029,7 +1029,10 @@ OperatorInfoPtr OperatorInstance(const PrimitivePtr &prim, const PrimitiveAttrs
const std::vector<Shapes> &shape_list) {
MS_EXCEPTION_IF_NULL(prim);
OperatorInfoPtr operator_ = OperatorInstanceByName(prim->name(), attrs, shape_list);
if ((operator_ == nullptr) && (prim->name() != MAKE_TUPLE)) {
if (operator_ == nullptr) {
if (IsInBatchParallelBlackList(prim)) {
MS_LOG(EXCEPTION) << "Operator " << prim->name() << " is not supported yet in auto parallel mode.";
}
MS_LOG(INFO) << "Creat " << prim->name() << " failed, use batch parallel";
operator_ = OperatorInstanceByName(BATCH_PARALLEL, attrs, shape_list);
MS_EXCEPTION_IF_NULL(operator_);


+ 54
- 39
mindspore/ccsrc/frontend/parallel/tensor_layout/tensor_redistribution.cc View File

@@ -149,45 +149,10 @@ Status TensorRedistribution::ComputeCost() {
double prod =
std::accumulate(slice_shape.begin(), slice_shape.end(), static_cast<double>(1.0), std::multiplies<double>());
std::string str = op.first;
if (str == PERMUTE_BY_AXIS) {
// Since AlltoAll is a virtual operator, the expanded operators are used here to compute cost.
// communication cost = all_gather + reduce_scatter = before_slice_shape + after_slice_shape
forward_comm_cost_ += prod * ALLTOALL_SCALE_FACTOR;
backward_comm_cost_ += prod * ALLTOALL_SCALE_FACTOR;
comm_cost_ += 2.0 * prod * ALLTOALL_SCALE_FACTOR;
int32_t concat_dim = op.second[2];
if (concat_dim == 0) {
// memory cost = all_gather
computation_cost_ += prod;
memory_cost_ += prod;
} else {
// memory cost = all_gather + split + concat
int32_t dev_num = op.second[4];
computation_cost_ += (prod + prod * dev_num + prod * dev_num);
memory_cost_ += (prod * dev_num + prod * dev_num + prod);
}
} else if (str == CONCAT_BY_AXIS) {
// communication cost = all_gather + reduce_scatter = before_slice_shape + after_slice_shape
// computation cost = before_slice_shape
if (op.second.size() < 3) {
MS_LOG(ERROR) << "op.second size should not be less than 3!";
return Status::FAILED;
}
double dev_num = op.second[2];
// here, communication cost = all_gather + reduce_scatter
forward_comm_cost_ += prod * dev_num * ALLGATHER_REDUCESCATTER_SCALE_FACTOR;
backward_comm_cost_ += prod * ALLGATHER_REDUCESCATTER_SCALE_FACTOR;
comm_cost_ += prod * (dev_num + 1.0) * ALLGATHER_REDUCESCATTER_SCALE_FACTOR;
int32_t concat_dim = op.second[0];
if (concat_dim == 0) {
// computation cost = all_gather
computation_cost_ += prod;
memory_cost_ += prod * dev_num;
} else {
// computation cost = all_gather + split + concat
computation_cost_ += (prod + prod * dev_num + prod * dev_num);
memory_cost_ += (prod * dev_num + prod * dev_num + prod);
}
if (str == PERMUTE_BY_AXIS && ComputePermuteCost(prod, op.second) != Status::SUCCESS) {
return Status::FAILED;
} else if (str == CONCAT_BY_AXIS && ComputeConcatCost(prod, op.second) != Status::SUCCESS) {
return Status::FAILED;
} else {
// There is only computation cost in SplitByAxis.
// computation cost = before_slice_shape
@@ -204,5 +169,55 @@ Status TensorRedistribution::ComputeCost() {
}
return Status::SUCCESS;
}

Status TensorRedistribution::ComputePermuteCost(double input_size, Shape attrs) {
// Since AlltoAll is a virtual operator, the expanded operators are used here to compute cost.
// communication cost = all_gather + reduce_scatter = before_slice_shape + after_slice_shape
if (attrs.size() < 4) {
MS_LOG(ERROR) << "attrs size should not be less than 4!";
return Status::FAILED;
}
forward_comm_cost_ += input_size * ALLTOALL_SCALE_FACTOR;
backward_comm_cost_ += input_size * ALLTOALL_SCALE_FACTOR;
comm_cost_ += 2.0 * input_size * ALLTOALL_SCALE_FACTOR;
int32_t concat_dim = attrs[2];
if (concat_dim == 0) {
// memory cost = all_gather
computation_cost_ += input_size;
memory_cost_ += input_size;
} else {
// memory cost = all_gather + split + concat
int32_t dev_num = attrs[4];
computation_cost_ += (input_size + input_size * dev_num + input_size * dev_num);
memory_cost_ += (input_size * dev_num + input_size * dev_num + input_size);
}
return Status::SUCCESS;
}

Status TensorRedistribution::ComputeConcatCost(double input_size, Shape attrs) {
// communication cost = all_gather + reduce_scatter = before_slice_shape + after_slice_shape
// computation cost = before_slice_shape
if (attrs.size() < 3) {
MS_LOG(ERROR) << "op.second size should not be less than 3!";
return Status::FAILED;
}
double dev_num = attrs[2];
// here, communication cost = all_gather + reduce_scatter
forward_comm_cost_ += input_size * dev_num * ALLGATHER_REDUCESCATTER_SCALE_FACTOR;
backward_comm_cost_ += input_size * ALLGATHER_REDUCESCATTER_SCALE_FACTOR;
comm_cost_ += input_size * (dev_num + 1.0) * ALLGATHER_REDUCESCATTER_SCALE_FACTOR;
int32_t concat_dim = attrs[0];
if (concat_dim == 0) {
// computation cost = all_gather
computation_cost_ += input_size;
memory_cost_ += input_size * dev_num;
} else {
// computation cost = all_gather + split + concat
computation_cost_ += (input_size + input_size * dev_num + input_size * dev_num);
memory_cost_ += (input_size * dev_num + input_size * dev_num + input_size);
}
return Status::SUCCESS;
}

} // namespace parallel
} // namespace mindspore

+ 2
- 1
mindspore/ccsrc/frontend/parallel/tensor_layout/tensor_redistribution.h View File

@@ -61,7 +61,8 @@ class TensorRedistribution {
private:
Status InferReshape(const TensorLayout &from_layout, const TensorLayout &to_layout,
OperatorVector *const operator_vector, OutPutInfoVector *const output_info_vector);

Status ComputeConcatCost(double input_size, Shape attrs);
Status ComputePermuteCost(double input_size, Shape attrs);
TensorLayout from_origin_;
TensorLayout to_origin_;
TensorLayout from_;


Loading…
Cancel
Save