Browse Source

!5837 [AutoParallel]Rectification allreduce fusion api

Merge pull request !5837 from lichen/rectification_allreduce_fusion_api
tags/v1.0.0
mindspore-ci-bot Gitee 5 years ago
parent
commit
4535faa2b0
11 changed files with 32 additions and 38 deletions
  1. +4
    -2
      mindspore/context.py
  2. +7
    -3
      mindspore/parallel/_auto_parallel_context.py
  3. +2
    -2
      model_zoo/official/cv/mobilenetv2/src/utils.py
  4. +3
    -4
      model_zoo/official/cv/resnet/train.py
  5. +1
    -3
      model_zoo/official/cv/resnet50_quant/train.py
  6. +2
    -5
      model_zoo/official/cv/resnet_thor/train.py
  7. +4
    -5
      model_zoo/official/nlp/bert/run_pretrain.py
  8. +4
    -5
      tests/st/networks/models/resnet50/test_resnet50_imagenet.py
  9. +2
    -3
      tests/st/tbe_networks/resnet_cifar.py
  10. +1
    -3
      tests/st/tbe_networks/test_resnet_cifar_8p.py
  11. +2
    -3
      tests/ut/python/parallel/test_parallel_optimizer.py

+ 4
- 2
mindspore/context.py View File

@@ -325,7 +325,8 @@ def _context():

@args_type_check(device_num=int, global_rank=int, gradients_mean=bool, gradient_fp32_sync=bool, parallel_mode=str,
auto_parallel_search_mode=str, parameter_broadcast=bool, strategy_ckpt_load_file=str,
strategy_ckpt_save_file=str, full_batch=bool, enable_parallel_optimizer=bool)
strategy_ckpt_save_file=str, full_batch=bool, enable_parallel_optimizer=bool,
all_reduce_fusion_config=list)
def set_auto_parallel_context(**kwargs):
"""
Set auto parallel context.
@@ -371,8 +372,9 @@ def set_auto_parallel_context(**kwargs):
strategy_ckpt_load_file (str): The path to load parallel strategy checkpoint. Default: ''
strategy_ckpt_save_file (str): The path to save parallel strategy checkpoint. Default: ''
full_batch (bool): Whether to load the whole batch on each device. Default: False.
enable_parallel_optimizer(bool): This is a developing feature, which shards the weight update computation in
enable_parallel_optimizer (bool): This is a developing feature, which shards the weight update computation in
data parallel training in the benefit of time and memory saving.
all_reduce_fusion_config (list): Set allreduce fusion strategy by parameters indices.

Raises:
ValueError: If input key is not attribute in auto parallel context.


+ 7
- 3
mindspore/parallel/_auto_parallel_context.py View File

@@ -462,7 +462,8 @@ _set_auto_parallel_context_func_map = {
"strategy_ckpt_load_file": auto_parallel_context().set_strategy_ckpt_load_file,
"strategy_ckpt_save_file": auto_parallel_context().set_strategy_ckpt_save_file,
"full_batch": auto_parallel_context().set_full_batch,
"enable_parallel_optimizer": auto_parallel_context().set_enable_parallel_optimizer}
"enable_parallel_optimizer": auto_parallel_context().set_enable_parallel_optimizer,
"all_reduce_fusion_config": auto_parallel_context().set_all_reduce_fusion_split_indices}


_get_auto_parallel_context_func_map = {
@@ -477,13 +478,15 @@ _get_auto_parallel_context_func_map = {
"strategy_ckpt_load_file": auto_parallel_context().get_strategy_ckpt_load_file,
"strategy_ckpt_save_file": auto_parallel_context().get_strategy_ckpt_save_file,
"full_batch": auto_parallel_context().get_full_batch,
"enable_parallel_optimizer": auto_parallel_context().get_enable_parallel_optimizer}
"enable_parallel_optimizer": auto_parallel_context().get_enable_parallel_optimizer,
"all_reduce_fusion_config": auto_parallel_context().get_all_reduce_fusion_split_indices}


@args_type_check(device_num=int, global_rank=int, gradients_mean=bool, gradient_fp32_sync=bool,
loss_repeated_mean=bool, parallel_mode=str, auto_parallel_search_mode=str,
parameter_broadcast=bool, strategy_ckpt_load_file=str,
strategy_ckpt_save_file=str, full_batch=bool, enable_parallel_optimizer=bool)
strategy_ckpt_save_file=str, full_batch=bool, enable_parallel_optimizer=bool,
all_reduce_fusion_config=list)

def _set_auto_parallel_context(**kwargs):
"""
@@ -526,6 +529,7 @@ def _set_auto_parallel_context(**kwargs):
strategy_ckpt_save_file (str): The path to save parallel strategy checkpoint. Default: ''
full_batch (bool): Whether to load the whole batch on each device. Default: False.
enable_parallel_optimizer (bool): Enable using optimizer segmentation or not. Default: False.
all_reduce_fusion_config (list): Set allreduce fusion strategy by parameters indices.

Raises:
ValueError: If input key is not attribute in auto parallel context.


+ 2
- 2
model_zoo/official/cv/mobilenetv2/src/utils.py View File

@@ -47,8 +47,8 @@ def context_device_init(config):
if config.run_distribute:
context.set_auto_parallel_context(device_num=config.rank_size,
parallel_mode=ParallelMode.DATA_PARALLEL,
parameter_broadcast=True, gradients_mean=True)
auto_parallel_context().set_all_reduce_fusion_split_indices([140])
parameter_broadcast=True, gradients_mean=True,
all_reduce_fusion_config=[140])
init()
else:
raise ValueError("Only support CPU, GPU and Ascend.")


+ 3
- 4
model_zoo/official/cv/resnet/train.py View File

@@ -18,7 +18,6 @@ import argparse
import ast
from mindspore import context
from mindspore import Tensor
from mindspore.parallel._auto_parallel_context import auto_parallel_context
from mindspore.nn.optim.momentum import Momentum
from mindspore.train.model import Model
from mindspore.context import ParallelMode
@@ -78,9 +77,9 @@ if __name__ == '__main__':
context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
gradients_mean=True)
if args_opt.net == "resnet50" or args_opt.net == "se-resnet50":
auto_parallel_context().set_all_reduce_fusion_split_indices([85, 160])
context.set_auto_parallel_context(all_reduce_fusion_config=[85, 150])
else:
auto_parallel_context().set_all_reduce_fusion_split_indices([180, 313])
context.set_auto_parallel_context(all_reduce_fusion_config=[180, 313])
init()
# GPU target
else:
@@ -88,7 +87,7 @@ if __name__ == '__main__':
context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL,
gradients_mean=True)
if args_opt.net == "resnet50":
auto_parallel_context().set_all_reduce_fusion_split_indices([85, 160])
context.set_auto_parallel_context(all_reduce_fusion_config=[85, 160])
ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/"

# create dataset


+ 1
- 3
model_zoo/official/cv/resnet50_quant/train.py View File

@@ -19,7 +19,6 @@ import argparse

from mindspore import context
from mindspore import Tensor
from mindspore.parallel._auto_parallel_context import auto_parallel_context
from mindspore.nn.optim.momentum import Momentum
from mindspore.train.model import Model
from mindspore.context import ParallelMode
@@ -80,8 +79,7 @@ if __name__ == '__main__':
init()
context.set_auto_parallel_context(device_num=args_opt.device_num,
parallel_mode=ParallelMode.DATA_PARALLEL,
gradients_mean=True)
auto_parallel_context().set_all_reduce_fusion_split_indices([107, 160])
gradients_mean=True, all_reduce_fusion_config=[107, 160])

# define network
net = resnet50_quant(class_num=config.class_num)


+ 2
- 5
model_zoo/official/cv/resnet_thor/train.py View File

@@ -20,7 +20,6 @@ import numpy as np
from mindspore import context
from mindspore import Tensor
from mindspore.common import set_seed
from mindspore.parallel._auto_parallel_context import auto_parallel_context
from mindspore.context import ParallelMode
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor, LossMonitor
from mindspore.train.loss_scale_manager import FixedLossScaleManager
@@ -94,15 +93,13 @@ if __name__ == '__main__':
device_id = int(os.getenv('DEVICE_ID'))
context.set_context(device_id=device_id, enable_auto_mixed_precision=True)
context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
gradients_mean=True)
auto_parallel_context().set_all_reduce_fusion_split_indices([107])
gradients_mean=True, all_reduce_fusion_config=[107])
init()
# GPU target
else:
init()
context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL,
gradients_mean=True)
auto_parallel_context().set_all_reduce_fusion_split_indices([107])
gradients_mean=True, all_reduce_fusion_config=[104])
ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/"

# create dataset


+ 4
- 5
model_zoo/official/nlp/bert/run_pretrain.py View File

@@ -87,17 +87,16 @@ def run_pretrain():
context.reset_auto_parallel_context()
context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
device_num=device_num)
from mindspore.parallel._auto_parallel_context import auto_parallel_context
if bert_net_cfg.num_hidden_layers == 12:
if bert_net_cfg.use_relative_positions:
auto_parallel_context().set_all_reduce_fusion_split_indices([29, 58, 87, 116, 145, 174, 203, 217])
context.set_auto_parallel_context(all_reduce_fusion_config=[29, 58, 87, 116, 145, 174, 203, 217])
else:
auto_parallel_context().set_all_reduce_fusion_split_indices([28, 55, 82, 109, 136, 163, 190, 205])
context.set_auto_parallel_context(all_reduce_fusion_config=[28, 55, 82, 109, 136, 163, 190, 205])
elif bert_net_cfg.num_hidden_layers == 24:
if bert_net_cfg.use_relative_positions:
auto_parallel_context().set_all_reduce_fusion_split_indices([30, 90, 150, 210, 270, 330, 390, 421])
context.set_auto_parallel_context(all_reduce_fusion_config=[30, 90, 150, 210, 270, 330, 390, 421])
else:
auto_parallel_context().set_all_reduce_fusion_split_indices([38, 93, 148, 203, 258, 313, 368, 397])
context.set_auto_parallel_context(all_reduce_fusion_config=[38, 93, 148, 203, 258, 313, 368, 397])
else:
rank = 0
device_num = 1


+ 4
- 5
tests/st/networks/models/resnet50/test_resnet50_imagenet.py View File

@@ -23,7 +23,6 @@ import numpy as np

from mindspore import context, Tensor
from mindspore.communication.management import init
from mindspore.parallel._auto_parallel_context import auto_parallel_context
from mindspore.train.model import Model
from mindspore.context import ParallelMode
from mindspore.train.callback import Callback
@@ -137,8 +136,8 @@ def train_process(q, device_id, epoch_size, device_num, enable_hccl):
os.environ['RANK_SIZE'] = str(device_num)
if enable_hccl:
context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
gradients_mean=True, parameter_broadcast=True)
auto_parallel_context().set_all_reduce_fusion_split_indices([107, 160])
gradients_mean=True, parameter_broadcast=True,
all_reduce_fusion_config=[107, 160])
init()

# network
@@ -240,8 +239,8 @@ def train_process_thor(q, device_id, epoch_size, device_num, enable_hccl):
os.environ['RANK_SIZE'] = str(device_num)
if enable_hccl:
context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
gradients_mean=True, parameter_broadcast=True)
auto_parallel_context().set_all_reduce_fusion_split_indices([107])
gradients_mean=True, parameter_broadcast=True,
all_reduce_fusion_config=[107])
init()

# network


+ 2
- 3
tests/st/tbe_networks/resnet_cifar.py View File

@@ -31,7 +31,6 @@ from mindspore import context
from mindspore.communication.management import init
from mindspore.nn.optim.momentum import Momentum
from mindspore.ops import operations as P
from mindspore.parallel._auto_parallel_context import auto_parallel_context
from mindspore.train.model import Model
from mindspore.context import ParallelMode

@@ -124,8 +123,8 @@ class CrossEntropyLoss(nn.Cell):

if __name__ == '__main__':
if not args_opt.do_eval and args_opt.run_distribute:
context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL)
auto_parallel_context().set_all_reduce_fusion_split_indices([140])
context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
all_reduce_fusion_config=[140])
init()

context.set_context(mode=context.GRAPH_MODE)


+ 1
- 3
tests/st/tbe_networks/test_resnet_cifar_8p.py View File

@@ -30,7 +30,6 @@ from mindspore import context
from mindspore.communication.management import init
from mindspore.nn.optim.momentum import Momentum
from mindspore.ops import operations as P
from mindspore.parallel._auto_parallel_context import auto_parallel_context
from mindspore.train.callback import Callback
from mindspore.train.model import Model
from mindspore.context import ParallelMode
@@ -154,8 +153,7 @@ def train_process(q, device_id, epoch_size, num_classes, device_num, batch_size,
os.environ['RANK_SIZE'] = str(device_num)
if enable_hccl:
context.set_auto_parallel_context(
device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL)
auto_parallel_context().set_all_reduce_fusion_split_indices([140])
device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, all_reduce_fusion_config=[140])
init()
context.set_context(mode=context.GRAPH_MODE)
net = resnet50(batch_size, num_classes)


+ 2
- 3
tests/ut/python/parallel/test_parallel_optimizer.py View File

@@ -23,7 +23,6 @@ from mindspore.nn import TrainOneStepCell, WithLossCell
from mindspore.nn.optim import Adam, AdamWeightDecay, Lamb
from mindspore.ops import operations as P
from mindspore import context
from mindspore.parallel._auto_parallel_context import auto_parallel_context

class Net(nn.Cell):
"""Net definition"""
@@ -85,8 +84,8 @@ def test_lamb_compile():

def test_lamb_split_fusion():
""" test_Lamb_split_fusion """
context.set_auto_parallel_context(parallel_mode="data_parallel", device_num=2, enable_parallel_optimizer=True)
auto_parallel_context().set_all_reduce_fusion_split_indices([2, 4, 6, 8])
context.set_auto_parallel_context(parallel_mode="data_parallel", device_num=2, enable_parallel_optimizer=True,
all_reduce_fusion_config=[2, 4, 6, 8])
inputs = Tensor(np.ones([32, 128]).astype(np.float32))
label = Tensor(np.zeros([32, 768]).astype(np.float32))
net = Net()


Loading…
Cancel
Save