Merge pull request !4801 from yihuaijie/dev (tag: v0.7.0-beta)
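Summary: `ParallelContext` gains a `has_initializer` flag (C++ member, pybind11 bindings, and Python accessors in `_auto_parallel_context` and `_utils`). `Initializer.__init__` sets the flag, `_Executor.compile` clears it, and `set_parallel_mode` raises a `RuntimeError` when the flag is set, so the parallel mode can no longer be set or changed after an `Initializer` exists. The tests are reordered to configure the parallel context before building networks. A minimal sketch of the ordering this enforces (API names are taken from the diff; the shapes are illustrative):

```python
import mindspore as ms
from mindspore import context
from mindspore.common.initializer import initializer

# Correct order: configure the parallel mode first ...
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8)
# ... then create Initializers (e.g. for lazily initialized weights).
weight = initializer("Normal", [64, 32], ms.float32)

# Doing it the other way around now raises:
#   RuntimeError: The parallel mode must be set or changed before any Initializer is created.
```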
@@ -81,6 +81,8 @@ void ParallelContext::set_mirror_mean(bool mirror_mean) { mirror_mean_ = mirror_mean; }
 void ParallelContext::set_full_batch(bool full_batch) { full_batch_ = full_batch; }
 
+void ParallelContext::set_has_initializer(bool has_initializer) { has_initializer_ = has_initializer; }
+
 void ParallelContext::set_cast_before_mirror(bool cast_before_mirror) { cast_before_mirror_ = cast_before_mirror; }
 
 void ParallelContext::set_loss_repeated_mean(bool loss_repeated_mean) { loss_repeated_mean_ = loss_repeated_mean; }
@@ -58,6 +58,9 @@ class ParallelContext {
   void set_full_batch(bool full_batch);
   bool full_batch() const { return full_batch_; }
 
+  void set_has_initializer(bool has_initializer);
+  bool has_initializer() const { return has_initializer_; }
+
   void set_cast_before_mirror(bool cast_before_mirror);
   bool cast_before_mirror() const { return cast_before_mirror_; }
@@ -112,6 +115,7 @@ class ParallelContext {
   static std::shared_ptr<ParallelContext> inst_context_;
   bool mirror_mean_;
   bool full_batch_;
+  bool has_initializer_ = false;
   bool cast_before_mirror_;
   bool loss_repeated_mean_;
   int32_t device_num_;
@@ -193,6 +193,8 @@ PYBIND11_MODULE(_c_expression, m) {
     .def("get_strategy_ckpt_save_file", &ParallelContext::strategy_ckpt_save_file, "Get strategy checkpoint save file.")
     .def("set_full_batch", &ParallelContext::set_full_batch, "Set whether load full batch on each device.")
     .def("get_full_batch", &ParallelContext::full_batch, "Get whether load full batch on each device.")
+    .def("set_has_initializer", &ParallelContext::set_has_initializer, "Set whether any Initializer has been created.")
+    .def("get_has_initializer", &ParallelContext::has_initializer, "Get whether any Initializer has been created.")
    .def("set_enable_parallel_optimizer", &ParallelContext::set_enable_parallel_optimizer,
          "Set enable/disable parallel optimizer.")
    .def("get_enable_parallel_optimizer", &ParallelContext::enable_parallel_optimizer,
@@ -24,7 +24,7 @@ from mindspore import log as logger
 from .._c_expression import generate_key, Executor_, Tensor, MetaTensor, PynativeExecutor_
 from .._c_expression import verify_inputs_signature, init_exec_dataset, _set_dataset_mode_config, init_backend
 from .tensor import Tensor as MsTensor
-from ..parallel._utils import _get_device_num, _get_global_rank, _need_to_full, _to_full_tensor
+from ..parallel._utils import _get_device_num, _get_global_rank, _need_to_full, _to_full_tensor, _set_has_initializer
 
 # store ms_function class compiled pipeline cache
 ms_compile_cache = {}
@@ -383,6 +383,7 @@ class _Executor:
             Str, the full phase of the cell.
             Bool, if the graph has been compiled before, return False, else return True.
         """
+        _set_has_initializer(False)
        obj.check_names()
        args_names, args_list = _generate_pip_args(obj, *args)
        dic = dict(zip(args_names, args_list))
@@ -24,6 +24,7 @@ from mindspore import log as logger
 from . import dtype as mstype
 from .tensor import Tensor
 from .._c_expression import random_normal
+from ..parallel._utils import _set_has_initializer
 
 _INITIALIZER_ALIAS = dict()
@@ -42,6 +43,7 @@ class Initializer:
         self._kwargs = kwargs
         self.shape = None
         self.dtype = None
+        _set_has_initializer(True)
 
     def _initialize(self, *kwargs):
         raise NotImplementedError('Must be overridden!')
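With this hook, constructing any `Initializer` subclass (including indirectly through the `initializer()` helper) records the fact globally. A minimal sketch of the flag's life cycle, assuming the `_utils` helpers added later in this diff:

```python
import mindspore as ms
from mindspore.common.initializer import initializer
from mindspore.parallel._utils import _get_has_initializer, _set_has_initializer

_set_has_initializer(False)                   # what _Executor.compile now does before compiling
assert not _get_has_initializer()
initializer("Normal", [64, 32], ms.float32)   # constructs an Initializer, which flips the flag
assert _get_has_initializer()
```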
@@ -437,6 +437,8 @@ def set_auto_parallel_context(**kwargs):
         If a program has tasks with different parallel modes, then before setting new parallel mode for
         next task, interface mindspore.context.reset_auto_parallel_context() needs to be called to reset
         the configuration.
+        Setting or changing parallel_mode must be done before any Initializer is created, or a RuntimeError
+        will be raised.
 
     Args:
         device_num (int): Available device number, the value must be in [1, 4096]. Default: 1.
@@ -477,6 +479,7 @@ def set_auto_parallel_context(**kwargs):
     Raises:
         ValueError: If input key is not attribute in auto parallel context.
+        RuntimeError: If an Initializer has been created before setting or changing parallel_mode.
 
     Examples:
         >>> context.set_auto_parallel_context(device_num=8)
@@ -176,8 +176,12 @@ class _AutoParallelContext:
         Raises:
             ValueError: If parallel mode is not supported.
+            RuntimeError: If an Initializer has been created before setting or changing parallel_mode.
         """
         self.check_context_handle()
+        if self.get_has_initializer():
+            self.set_has_initializer(False)
+            raise RuntimeError("The parallel mode must be set or changed before any Initializer is created.")
         ret = self._context_handle.set_parallel_mode(parallel_mode)
         if ret is False:
             raise ValueError("Parallel mode does not support {}".format(parallel_mode))
@@ -249,6 +253,21 @@ class _AutoParallelContext:
         self.check_context_handle()
         return self._context_handle.get_full_batch()
 
+    def set_has_initializer(self, has_initializer):
+        """
+        Set whether any Initializer has been created.
+
+        Args:
+            has_initializer (bool): True if an Initializer has been created.
+        """
+        self.check_context_handle()
+        self._context_handle.set_has_initializer(has_initializer)
+
+    def get_has_initializer(self):
+        """Get whether any Initializer has been created."""
+        self.check_context_handle()
+        return self._context_handle.get_has_initializer()
+
     def set_strategy_ckpt_save_file(self, strategy_ckpt_save_file):
         """
         Set strategy checkpoint save path.
@@ -543,6 +562,7 @@ def _set_auto_parallel_context(**kwargs):
     Raises:
         ValueError: If input key is not attribute in auto parallel context.
+        RuntimeError: If an Initializer has been created before setting or changing parallel_mode.
     """
     for key, value in kwargs.items():
         if key not in _set_auto_parallel_context_func_map:
@@ -32,6 +32,19 @@ def _get_full_batch():
     """Get whether to use full_batch."""
     return auto_parallel_context().get_full_batch()
 
 
+def _get_has_initializer():
+    """Get whether any Initializer has been created."""
+    return auto_parallel_context().get_has_initializer()
+
+
+def _set_has_initializer(has_initializer):
+    """
+    Set whether any Initializer has been created.
+
+    Args:
+        has_initializer (bool): True if an Initializer has been created.
+    """
+    auto_parallel_context().set_has_initializer(has_initializer)
+
+
 def _need_to_full():
     """Check whether to convert input to full shape or tensor."""
@@ -78,6 +78,7 @@ def multisteplr(total_steps, gap, base_lr=0.9, gamma=0.1, dtype=mstype.float32):
 def test_lenet_nccl():
+    context.set_auto_parallel_context(parallel_mode="data_parallel", mirror_mean=True, device_num=get_group_size())
     net = LeNet()
     net.set_train()
@@ -86,7 +87,6 @@ def test_lenet_nccl():
     mom_optimizer = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate, momentum)
     criterion = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
     net_with_criterion = WithLossCell(net, criterion)
-    context.set_auto_parallel_context(parallel_mode="data_parallel", mirror_mean=True, device_num=get_group_size())
     train_network = TrainOneStepCell(net_with_criterion, mom_optimizer)
     train_network.set_train()
     losses = []
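The remaining test changes all follow the same pattern: `set_auto_parallel_context(parallel_mode=...)` moves in front of network construction, because building a `Cell` typically creates `Initializer`s for its parameters. An illustrative stand-in (`TinyNet` is hypothetical, not from the tests):

```python
import mindspore as ms
import mindspore.nn as nn
from mindspore import context
from mindspore.common.initializer import initializer
from mindspore.common.parameter import Parameter

class TinyNet(nn.Cell):
    def __init__(self):
        super().__init__()
        # Parameter construction is where an Initializer gets created.
        self.w = Parameter(initializer("Normal", [4, 4], ms.float32), "w")

    def construct(self, x):
        return x

# New required order: configure the parallel context first, build the network second.
context.set_auto_parallel_context(parallel_mode="data_parallel", mirror_mean=True, device_num=8)
net = TinyNet()
```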
@@ -24,6 +24,7 @@ import mindspore.nn as nn
 from mindspore import Tensor, Model, ParallelMode
 from mindspore.nn.optim import Momentum
 from mindspore.ops import operations as P
+from mindspore.parallel._utils import _set_has_initializer
 
 _current_dir = os.path.dirname(os.path.realpath(__file__)) + "/../test_data"
@@ -89,3 +90,4 @@ def test_lenet5_train_step_training_pynative():
     Model(network=network, loss_fn=loss_fn, optimizer=optimizer)
     context.set_context(mode=context.GRAPH_MODE)
     context.reset_auto_parallel_context()
+    _set_has_initializer(False)
@@ -96,6 +96,8 @@ def test_on_momentum():
 def test_data_parallel_with_cast():
     """test_data_parallel_with_cast"""
+    context.reset_auto_parallel_context()
+    context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True, device_num=8)
     predict = Tensor(np.ones([1, 1, 32, 32]).astype(np.float32) * 0.01)
     label = Tensor(np.zeros([1, 10]).astype(np.float32))
     net = LeNet5()
@@ -107,8 +109,6 @@ def test_data_parallel_with_cast():
                              learning_rate=0.1,
                              momentum=0.9)
     net = WithLossCell(net, loss_fn)
-    context.reset_auto_parallel_context()
-    context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True, device_num=8)
     net = TrainOneStepCell(net, optimizer)
     _executor.compile(net, predict, label)
@@ -21,7 +21,7 @@ from mindspore import context, Tensor, Parameter, ParameterTuple
 from mindspore._checkparam import _check_str_by_regular
 from mindspore.common import dtype as mstype
 from mindspore.common.initializer import initializer
+from mindspore.parallel._utils import _set_has_initializer
 
 
 def test_parameter_init():
     dat = np.array([[1, 2, 3], [2, 3, 4]])
@@ -170,6 +170,7 @@ def test_scalar_parameter_update():
 def test_parameter_lazy_init():
+    _set_has_initializer(False)
     # support lazy init in SEMI_AUTO_PARALLEL mode
     context.reset_auto_parallel_context()
     context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8)
@@ -20,6 +20,7 @@ from mindspore import context
 from mindspore.common.api import _executor
 from mindspore.ops import composite as C
 from mindspore.ops import operations as P
+from mindspore.parallel._utils import _set_has_initializer
 from tests.ut.python.ops.test_math_ops import VirtualLoss
@@ -60,12 +61,13 @@ def compile_net(net, x, y):
 def test_add_relu_stride_slice():
+    _set_has_initializer(False)
     context.set_auto_parallel_context(device_num=8, global_rank=7)
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
 
     strategy0 = ((1, 1), (1, 1))
     strategy1 = ((8, 1),)
     net = Grad(NetWithLoss(AddRelu(strategy0, strategy1)))
-    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
 
     x = Tensor(np.ones([128, 32]), dtype=ms.float32)
     y = Tensor(np.ones([128, 32]), dtype=ms.float32)
@@ -73,12 +75,13 @@ def test_add_relu_stride_slice():
 def test_add_relu_all_gather():
+    _set_has_initializer(False)
     context.set_auto_parallel_context(device_num=8, global_rank=7)
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
 
     strategy0 = ((8, 1), (8, 1))
     strategy1 = ((1, 1),)
     net = Grad(NetWithLoss(AddRelu(strategy0, strategy1)))
-    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
 
     x = Tensor(np.ones([128, 32]), dtype=ms.float32)
     y = Tensor(np.ones([128, 32]), dtype=ms.float32)
@@ -23,6 +23,7 @@ from mindspore.nn.optim.momentum import Momentum
 from mindspore.parallel import _cost_model_context as cost_model_context
 from mindspore.parallel._auto_parallel_context import auto_parallel_context
 from mindspore.train import Model, ParallelMode
+from mindspore.parallel._utils import _set_has_initializer
 from tests.dataset_mock import MindData
@@ -105,10 +106,8 @@ def train_common(net):
     momentum = 0.9
     epoch_size = 2
     device_num = 4
-    context.reset_auto_parallel_context()
     auto_parallel_context().set_enable_all_reduce_fusion(enable_all_reduce_fusion=True)
-    context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=device_num,
-                                      parameter_broadcast=False)
+    context.set_auto_parallel_context(device_num=device_num, parameter_broadcast=False)
     context.set_context(mode=context.GRAPH_MODE)
 
     predict = Tensor(np.ones([batch_size, 128]), dtype=ms.float32)
@@ -183,9 +182,12 @@ def test_allreduce_fusion_parameters():
 def test_allreduce_fusion1():
+    _set_has_initializer(False)
     cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_algorithm=1)
     cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_times=2)
     cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_tail_percent=0.5)
+    context.reset_auto_parallel_context()
+    context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL)
     net = SimpleDMLNet(DenseNet1(has_bias=False, activation=None), DenseNet2(has_bias=False, activation=None))
     allreduce_fusion_dict = train_common(net)
     expect_dict = {'backbone2.fc8.weight': 2,
@@ -210,6 +212,8 @@ def test_allreduce_fusion2():
     cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_times=2)
     cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_tail_percent=0.5)
     cost_model_context.reset_cost_model_context()
+    context.reset_auto_parallel_context()
+    context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL)
     net = SimpleDMLNet(DenseNet1(has_bias=False, activation=None), DenseNet2(has_bias=False, activation=None))
     allreduce_fusion_dict = train_common(net)
     expect_dict = {}
@@ -221,6 +225,8 @@ def test_allreduce_fusion3():
     cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_algorithm=1)
     cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_times=3)
     cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_tail_percent=0.3333333)
+    context.reset_auto_parallel_context()
+    context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL)
     net = SimpleDMLNet(DenseNet1(has_bias=True, activation='relu'), DenseNet2(has_bias=False, activation='relu'))
     allreduce_fusion_dict = train_common(net)
     expect_dict = {'backbone2.fc8.weight': 3,
@@ -247,6 +253,8 @@ def test_allreduce_fusion4():
     cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_algorithm=1)
     cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_times=2)
     cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_tail_percent=0.5)
+    context.reset_auto_parallel_context()
+    context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL)
     net = SimpleDMLNet(DenseNet2(has_bias=False, activation=None), DenseNet2(has_bias=False, activation=None))
     allreduce_fusion_dict = train_common(net)
     expect_dict = {'backbone2.fc8.weight': 2,
@@ -276,6 +284,8 @@ def test_allreduce_fusion5():
     cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_allreduce_inherent_time=0.05)
     cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_allreduce_bandwidth=0.000001)
     cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_computation_time_parameter=0.0000015)
+    context.reset_auto_parallel_context()
+    context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL)
     net = SimpleDMLNet(DenseNet2(has_bias=False, activation=None), DenseNet2(has_bias=False, activation=None))
     allreduce_fusion_dict = train_common(net)
@@ -23,7 +23,7 @@ from mindspore.common.parameter import Parameter
 from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
 from mindspore.nn.optim.momentum import Momentum
 from mindspore.ops import operations as P
-from mindspore.parallel._utils import _reset_op_id
+from mindspore.parallel._utils import _reset_op_id, _set_has_initializer
 from mindspore.train import Model, ParallelMode
 from tests.dataset_mock import MindData
@@ -90,6 +90,7 @@ def all_to_all_common(strategy1):
 def test_all_to_all():
+    _set_has_initializer(False)
     strategy1 = ((8, 1),)
     context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
     _reset_op_id()
@@ -20,6 +20,7 @@ from mindspore import Parameter, Tensor, context
 from mindspore.common.api import _executor
 from mindspore.ops import composite as C
 from mindspore.ops import operations as P
+from mindspore.parallel._utils import _set_has_initializer
 from tests.ut.python.ops.test_math_ops import VirtualLoss
@@ -60,11 +61,12 @@ def test_matmul_sub():
             out = self.sub(out, b)
             return out
 
+    _set_has_initializer(False)
     context.set_auto_parallel_context(device_num=8, global_rank=0)
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     strategy1 = ((2, 2), (2, 2))
     strategy2 = ((4, 2), (4, 2))
     net = GradWrap(NetWithLoss(Net(strategy1, strategy2)))
-    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 64]), dtype=ms.float32)
@@ -85,10 +87,10 @@ def test_matmul_add():
             return out
 
     context.set_auto_parallel_context(device_num=8, global_rank=0)
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     strategy1 = ((2, 2), (2, 2))
     strategy2 = ((4, 2), (4, 2))
     net = GradWrap(NetWithLoss(Net(strategy1, strategy2)))
-    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 64]), dtype=ms.float32)
@@ -109,10 +111,10 @@ def test_matmul_mul():
             return out
 
     context.set_auto_parallel_context(device_num=8, global_rank=0)
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     strategy1 = ((2, 2), (2, 2))
     strategy2 = ((4, 2), (4, 2))
     net = GradWrap(NetWithLoss(Net(strategy1, strategy2)))
-    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 64]), dtype=ms.float32)
@@ -133,10 +135,10 @@ def test_matmul_div():
             return out
 
     context.set_auto_parallel_context(device_num=8, global_rank=0)
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     strategy1 = ((2, 2), (2, 2))
     strategy2 = ((4, 2), (4, 2))
     net = GradWrap(NetWithLoss(Net(strategy1, strategy2)))
-    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 64]), dtype=ms.float32)
@@ -157,10 +159,10 @@ def test_matmul_greater():
             return out
 
     context.set_auto_parallel_context(device_num=8, global_rank=0)
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     strategy1 = ((2, 2), (2, 2))
     strategy2 = ((4, 2), (4, 2))
     net = GradWrap(NetWithLoss(Net(strategy1, strategy2)))
-    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 64]), dtype=ms.float32)
@@ -181,10 +183,10 @@ def test_matmul_add_broadcast():
             return out
 
     context.set_auto_parallel_context(device_num=8, global_rank=0)
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     strategy1 = ((2, 2), (2, 2))
     strategy2 = ((4, 2), (2,))
     net = GradWrap(NetWithLoss(Net(strategy1, strategy2)))
-    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 64]), dtype=ms.float32)
@@ -205,10 +207,10 @@ def test_matmul_add_broadcast2():
             return out
 
     context.set_auto_parallel_context(device_num=8, global_rank=0)
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     strategy1 = ((2, 4), (4, 1))
     strategy2 = ((4, 1), (1, 2))
     net = GradWrap(NetWithLoss(Net(strategy1, strategy2)))
-    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 1]), dtype=ms.float32)
@@ -229,10 +231,10 @@ def test_matmul_sub_broadcast():
             return out
 
     context.set_auto_parallel_context(device_num=8, global_rank=0)
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     strategy1 = ((2, 2), (2, 2))
     strategy2 = ((4, 2), (2,))
     net = GradWrap(NetWithLoss(Net(strategy1, strategy2)))
-    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 64]), dtype=ms.float32)
@@ -253,10 +255,10 @@ def test_matmul_sub_broadcast2():
             return out
 
     context.set_auto_parallel_context(device_num=8, global_rank=0)
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     strategy1 = ((2, 4), (4, 1))
     strategy2 = ((4, 1), (1, 2))
     net = GradWrap(NetWithLoss(Net(strategy1, strategy2)))
-    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 1]), dtype=ms.float32)
@@ -277,10 +279,10 @@ def test_matmul_mul_broadcast():
             return out
 
     context.set_auto_parallel_context(device_num=8, global_rank=0)
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     strategy1 = ((2, 2), (2, 2))
     strategy2 = ((4, 2), (2,))
     net = GradWrap(NetWithLoss(Net(strategy1, strategy2)))
-    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 64]), dtype=ms.float32)
@@ -301,10 +303,10 @@ def test_matmul_mul_broadcast2():
             return out
 
     context.set_auto_parallel_context(device_num=8, global_rank=0)
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     strategy1 = ((2, 4), (4, 1))
     strategy2 = ((4, 1), (1, 2))
     net = GradWrap(NetWithLoss(Net(strategy1, strategy2)))
-    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 1]), dtype=ms.float32)
@@ -325,10 +327,10 @@ def test_matmul_div_broadcast():
             return out
 
     context.set_auto_parallel_context(device_num=8, global_rank=0)
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     strategy1 = ((2, 2), (2, 2))
     strategy2 = ((4, 2), (2,))
     net = GradWrap(NetWithLoss(Net(strategy1, strategy2)))
-    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 64]), dtype=ms.float32)
@@ -349,10 +351,10 @@ def test_matmul_div_broadcast2():
             return out
 
     context.set_auto_parallel_context(device_num=8, global_rank=0)
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     strategy1 = ((2, 4), (4, 1))
     strategy2 = ((4, 1), (1, 2))
     net = GradWrap(NetWithLoss(Net(strategy1, strategy2)))
-    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 1]), dtype=ms.float32)
@@ -373,10 +375,10 @@ def test_matmul_greater_broadcast():
             return out
 
     context.set_auto_parallel_context(device_num=8, global_rank=0)
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     strategy1 = ((2, 2), (2, 2))
     strategy2 = ((4, 2), (2,))
     net = GradWrap(NetWithLoss(Net(strategy1, strategy2)))
-    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 64]), dtype=ms.float32)
@@ -397,10 +399,10 @@ def test_matmul_greater_broadcast2():
             return out
 
     context.set_auto_parallel_context(device_num=8, global_rank=0)
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     strategy1 = ((2, 4), (4, 1))
     strategy2 = ((4, 1), (1, 2))
     net = GradWrap(NetWithLoss(Net(strategy1, strategy2)))
-    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 1]), dtype=ms.float32)
@@ -421,10 +423,10 @@ def test_matmul_floordiv():
             return out
 
     context.set_auto_parallel_context(device_num=8, global_rank=0)
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     strategy1 = ((2, 2), (2, 2))
     strategy2 = ((4, 2), (4, 2))
     net = GradWrap(NetWithLoss(Net(strategy1, strategy2)))
-    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 64]), dtype=ms.float32)
@@ -445,10 +447,10 @@ def test_matmul_floordiv_broadcast():
             return out
 
     context.set_auto_parallel_context(device_num=8, global_rank=0)
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     strategy1 = ((2, 2), (2, 2))
     strategy2 = ((4, 2), (2,))
     net = GradWrap(NetWithLoss(Net(strategy1, strategy2)))
-    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 64]), dtype=ms.float32)
@@ -469,10 +471,10 @@ def test_matmul_floordiv_broadcast2():
             return out
 
     context.set_auto_parallel_context(device_num=8, global_rank=0)
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     strategy1 = ((2, 4), (4, 1))
     strategy2 = ((4, 1), (1, 2))
     net = GradWrap(NetWithLoss(Net(strategy1, strategy2)))
-    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 1]), dtype=ms.float32)
@@ -64,10 +64,10 @@ def test_auto_parallel_bn_with_prelu():
     size = 8
     context.set_auto_parallel_context(device_num=size, global_rank=0)
+    context.set_auto_parallel_context(parallel_mode="auto_parallel")
 
     x = Tensor(np.random.rand(16, 16, 32, 64), dtype=ms.float32)
 
     net = GradWrap(NetWithLoss(Net()))
-    context.set_auto_parallel_context(parallel_mode="auto_parallel")
     net.set_auto_parallel()
     _executor.compile(net, x)
@@ -106,8 +106,8 @@ def test_double_subgraphs():
     cost_model_context.set_cost_model_context(multi_subgraphs=True)
     context.set_context(save_graphs=True)
     context.set_auto_parallel_context(device_num=8, global_rank=0)
-    net = TrainStepWarp(NetWithLoss(Net()))
     context.set_auto_parallel_context(parallel_mode="auto_parallel")
+    net = TrainStepWarp(NetWithLoss(Net()))
     net.set_auto_parallel()
 
     x = Tensor(np.ones([8, 8, 8, 8]), dtype=ms.float32)
@@ -68,9 +68,9 @@ def test_virtual_dataset_3_input():
             out = self.matmul2(out, b)
             return out
 
-    net = GradWrap(NetWithLoss(Net()))
     context.set_auto_parallel_context(parallel_mode="auto_parallel")
     context.set_auto_parallel_context(device_num=8, global_rank=0)
+    net = GradWrap(NetWithLoss(Net()))
     net.set_auto_parallel()
     x = Tensor(np.ones([128, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 64]), dtype=ms.float32)
@@ -68,11 +68,11 @@ def test_two_bn():
             out = self.block2(out)
             return out
 
-    net = NetWithLoss(Net())
-    x = Tensor(np.ones([64, 64]), dtype=ms.float32)
     context.set_context(save_graphs=True)
     context.set_auto_parallel_context(device_num=8, global_rank=0)
     context.set_auto_parallel_context(parallel_mode="auto_parallel")
+    net = NetWithLoss(Net())
+    x = Tensor(np.ones([64, 64]), dtype=ms.float32)
     net.set_auto_parallel()
 
     set_algo_parameters(elementwise_op_strategy_follow=True)
     reset_op_id()
@@ -94,12 +94,12 @@ def test_batch():
             return out4
 
     context.set_auto_parallel_context(device_num=8, global_rank=0)
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     strategy1 = ((8, 1, 1, 1), (1, 1, 1, 1))
     strategy2 = ((1, 1, 1, 8), (1, 1, 1, 8))
     strategy3 = ((4, 1, 1, 2), (4, 1, 1, 2))
     net = GradWrap(NetWithLoss(Net(strategy1, strategy2, strategy3)))
-    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     net.set_auto_parallel()
 
     x = Tensor(np.ones([128, 16, 34, 34]), dtype=ms.float32)
@@ -118,6 +118,9 @@ def batchnorm_net(num_classes):
 def test_batchnorm_batch_parallel():
+    context.reset_auto_parallel_context()
+    context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=dev_num)
+    context.set_context(mode=context.GRAPH_MODE)
     num_classes = 1001
     batch_size = 32
     learning_rate = 0.1
@@ -134,9 +137,6 @@ def test_batchnorm_batch_parallel():
     loss.softmax_cross_entropy.set_strategy(((dev_num, 1), (dev_num, 1)))
     opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate, momentum)
 
-    context.reset_auto_parallel_context()
-    context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=dev_num)
-    context.set_context(mode=context.GRAPH_MODE)
     model = Model(net, loss, opt)
     model.train(epoch_size, dataset, dataset_sink_mode=False)
@@ -198,6 +198,7 @@ def bn_net():
 def bn_common(parallel_mode, train_flag, strategy_loss=None):
     context.set_context(mode=context.GRAPH_MODE)
+    context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=8)
     learning_rate = 0.1
     momentum = 0.9
     epoch_size = 2
@@ -218,7 +219,6 @@ def bn_common(parallel_mode, train_flag, strategy_loss=None):
     if parallel_mode == ParallelMode.DATA_PARALLEL:
         context.set_auto_parallel_context(parameter_broadcast=True)
 
-    context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=8)
     model = Model(net, loss, opt)
 
     if train_flag:
         model.train(epoch_size, dataset, dataset_sink_mode=False)
@@ -88,13 +88,13 @@ def test_get_next_semi_auto_parallel():
             return x
 
     context.set_auto_parallel_context(device_num=4, global_rank=0)
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     network = Net(strategy1=((1, 4),), strategy2=((4, 1), (1,)))
     strategy3 = ((4, 1), (), ())
     strategy4 = ((4, 1), (4, 1))
     net_with_loss = NetWithLoss(network, [ms.float32, ms.int32], [[32, 64], [32]], 2, strategy3=strategy3,
                                 strategy4=strategy4)
     net = GradWrap(net_with_loss)
-    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     compile_net(net)
@@ -112,13 +112,13 @@ def test_get_next_semi_auto_parallel1():
             return x
 
     context.set_auto_parallel_context(device_num=4, global_rank=0)
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     network = Net(strategy1=((1, 4),), strategy2=((4, 1), (1,)))
     strategy3 = ((1, 4), (), ())
     strategy4 = ((4, 1), (4, 1))
     net_with_loss = NetWithLoss(network, [ms.float32, ms.int32], [[32, 64], [32]], 2, strategy3=strategy3,
                                 strategy4=strategy4)
     net = GradWrap(net_with_loss)
-    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     compile_net(net)
@@ -136,10 +136,10 @@ def test_get_next_auto_parallel():
             return x
 
     context.set_auto_parallel_context(device_num=4, global_rank=0)
+    context.set_auto_parallel_context(parallel_mode="auto_parallel")
     network = Net()
     net_with_loss = NetWithLoss(network, [ms.float32, ms.int32], [[32, 64], [32]], 2)
     net = GradWrap(net_with_loss)
-    context.set_auto_parallel_context(parallel_mode="auto_parallel")
     compile_net(net)
@@ -153,6 +153,6 @@ def test_only_one_get_next():
             return self.get_next()
 
     context.set_auto_parallel_context(device_num=4, global_rank=0)
-    net = Net()
     context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
+    net = Net()
     compile_net(net)
@@ -13,6 +13,7 @@
 # limitations under the License.
 import numpy as np
+import pytest
 from mindspore import context
 import mindspore.nn as nn
 from mindspore.ops import operations as P
@@ -22,20 +23,19 @@ import mindspore.common.api as me
 from mindspore.common.initializer import initializer
 from hccl_test.manage.api import Hccl
 
-def check_initializer_weight_slice(init_name="Uniform"):
-    class Net(nn.Cell):
-        def __init__(self, strategy1, strategy2, weight):
-            super().__init__()
-            self.weight = Parameter(weight, "w1")
-            self.matmul = P.MatMul(transpose_a=False, transpose_b=True).set_strategy(strategy1)
-            self.relu = P.ReLU().set_strategy(strategy2)
+class Net(nn.Cell):
+    def __init__(self, strategy1, strategy2, weight):
+        super().__init__()
+        self.weight = Parameter(weight, "w1")
+        self.matmul = P.MatMul(transpose_a=False, transpose_b=True).set_strategy(strategy1)
+        self.relu = P.ReLU().set_strategy(strategy2)
 
-        def construct(self, x):
-            out = self.matmul(x, self.weight)
-            out = self.relu(out)
-            return out
+    def construct(self, x):
+        out = self.matmul(x, self.weight)
+        out = self.relu(out)
+        return out
 
+def check_initializer_weight_slice(init_name="Uniform"):
     def get_slice(rank):
         hccl = Hccl()
         rank_save = hccl.rank_id
@@ -77,5 +77,28 @@ def test_initializer_weight_slice():
     for init_name in initializers:
         check_initializer_weight_slice(init_name)
 
+
+def test_wrong_order_set_parallel_mode_with_initializer():
+    weight = initializer("Normal", [64, 32], ms.float32)
+    strategy1 = ((2, 1), (4, 1))
+    strategy2 = ((2, 4),)
+    net = Net(strategy1, strategy2, weight)
+    exe = me._executor
+    x = Tensor(np.ones([32, 32]), dtype=ms.float32)
+    with pytest.raises(RuntimeError):
+        context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8, global_rank=0)
+        net.set_auto_parallel()
+        exe.compile(net, x, auto_parallel_mode=True, phase='train')
+
+
+def test_wrong_order_set_parallel_mode_without_initializer():
+    weight = Tensor(np.ones([64, 32]), ms.float32)
+    strategy1 = ((2, 1), (4, 1))
+    strategy2 = ((2, 4),)
+    net = Net(strategy1, strategy2, weight)
+    exe = me._executor
+    x = Tensor(np.ones([32, 32]), dtype=ms.float32)
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8, global_rank=0)
+    net.set_auto_parallel()
+    exe.compile(net, x, auto_parallel_mode=True, phase='train')
+
 if __name__ == '__main__':
     test_initializer_weight_slice()
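Taken together, the two new tests pin down the contract: with an `initializer(...)`-backed weight, the late `set_auto_parallel_context(parallel_mode=...)` call raises `RuntimeError`, while with a plain `Tensor` weight the same sequence still compiles.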
@@ -58,12 +58,12 @@ def test_linear():
             return out
 
     context.set_auto_parallel_context(device_num=16, global_rank=0)
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     strategy0 = ((2, 4), (2, 4))
     strategy1 = ((2, 4), (4,))
     strategy2 = ((2, 8),)
     strategy3 = ((16, 1), (16, 1))
     net = GradWrap(NetWithLoss(Net(strategy0, strategy1, strategy2), strategy3))
-    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     net.set_auto_parallel()
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
| @@ -54,6 +54,7 @@ def test_momentum(): | |||||
| return out | return out | ||||
| context.set_auto_parallel_context(device_num=4, global_rank=0) | context.set_auto_parallel_context(device_num=4, global_rank=0) | ||||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||||
| strategy1 = ((2, 1), (2, 1)) | strategy1 = ((2, 1), (2, 1)) | ||||
| strategy2 = ((4, 1),) | strategy2 = ((4, 1),) | ||||
| strategy3 = ((4, 1), (4, 1)) | strategy3 = ((4, 1), (4, 1)) | ||||
| @@ -69,7 +70,6 @@ def test_momentum(): | |||||
| net_with_loss = NetWithLoss(net, strategy3) | net_with_loss = NetWithLoss(net, strategy3) | ||||
| train_net = TrainOneStepCell(net_with_loss, optimizer) | train_net = TrainOneStepCell(net_with_loss, optimizer) | ||||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||||
| compile_net(train_net, x, b) | compile_net(train_net, x, b) | ||||
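This hunk and the ones that follow in the optimizer tests all apply the same mechanical fix: parallel_mode="semi_auto_parallel" is now selected right after device_num/global_rank, before any Cell that may create initializer-backed Parameters is constructed. Sketched with the test file's own fixtures (Net, NetWithLoss, compile_net; exact constructor signatures elided):

    # Old order (now unsafe): build net/optimizer first, set parallel mode last.
    # New order introduced by this PR:
    context.set_auto_parallel_context(device_num=4, global_rank=0)
    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
    # ... only then construct Net, wrap it in NetWithLoss / TrainOneStepCell,
    # and call compile_net(train_net, x, b).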
| @@ -88,6 +88,7 @@ def test_momentum_with_loss_scale(): | |||||
| return out | return out | ||||
| context.set_auto_parallel_context(device_num=4, global_rank=0) | context.set_auto_parallel_context(device_num=4, global_rank=0) | ||||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||||
| strategy1 = ((2, 1), (2, 1)) | strategy1 = ((2, 1), (2, 1)) | ||||
| strategy2 = ((4, 1),) | strategy2 = ((4, 1),) | ||||
| strategy3 = ((4, 1), (4, 1)) | strategy3 = ((4, 1), (4, 1)) | ||||
| @@ -103,7 +104,6 @@ def test_momentum_with_loss_scale(): | |||||
| net_with_loss = NetWithLoss(net, strategy3) | net_with_loss = NetWithLoss(net, strategy3) | ||||
| train_net = TrainOneStepCell(net_with_loss, optimizer) | train_net = TrainOneStepCell(net_with_loss, optimizer) | ||||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||||
| compile_net(train_net, x, b) | compile_net(train_net, x, b) | ||||
| @@ -122,6 +122,7 @@ def test_momentum_with_dynamic_lr(): | |||||
| return out | return out | ||||
| context.set_auto_parallel_context(device_num=4, global_rank=0) | context.set_auto_parallel_context(device_num=4, global_rank=0) | ||||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||||
| strategy1 = ((2, 1), (2, 1)) | strategy1 = ((2, 1), (2, 1)) | ||||
| strategy2 = ((4, 1),) | strategy2 = ((4, 1),) | ||||
| strategy3 = ((4, 1), (4, 1)) | strategy3 = ((4, 1), (4, 1)) | ||||
| @@ -138,7 +139,6 @@ def test_momentum_with_dynamic_lr(): | |||||
| net_with_loss = NetWithLoss(net, strategy3) | net_with_loss = NetWithLoss(net, strategy3) | ||||
| train_net = TrainOneStepCell(net_with_loss, optimizer) | train_net = TrainOneStepCell(net_with_loss, optimizer) | ||||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||||
| compile_net(train_net, x, b) | compile_net(train_net, x, b) | ||||
| @@ -157,6 +157,7 @@ def test_momentum_with_loss_scale_and_dynamic_lr(): | |||||
| return out | return out | ||||
| context.set_auto_parallel_context(device_num=4, global_rank=0) | context.set_auto_parallel_context(device_num=4, global_rank=0) | ||||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||||
| strategy1 = ((2, 1), (2, 1)) | strategy1 = ((2, 1), (2, 1)) | ||||
| strategy2 = ((4, 1),) | strategy2 = ((4, 1),) | ||||
| @@ -174,7 +175,6 @@ def test_momentum_with_loss_scale_and_dynamic_lr(): | |||||
| net_with_loss = NetWithLoss(net, strategy3) | net_with_loss = NetWithLoss(net, strategy3) | ||||
| train_net = TrainOneStepCell(net_with_loss, optimizer) | train_net = TrainOneStepCell(net_with_loss, optimizer) | ||||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||||
| compile_net(train_net, x, b) | compile_net(train_net, x, b) | ||||
| @@ -193,6 +193,7 @@ def test_lars(): | |||||
| return out | return out | ||||
| context.set_auto_parallel_context(device_num=4, global_rank=0) | context.set_auto_parallel_context(device_num=4, global_rank=0) | ||||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||||
| strategy1 = ((2, 1), (2, 1)) | strategy1 = ((2, 1), (2, 1)) | ||||
| strategy2 = ((4, 1),) | strategy2 = ((4, 1),) | ||||
| strategy3 = ((4, 1), (4, 1)) | strategy3 = ((4, 1), (4, 1)) | ||||
| @@ -209,6 +210,5 @@ def test_lars(): | |||||
| lars_filter=lambda x: 'bn' not in x.name) | lars_filter=lambda x: 'bn' not in x.name) | ||||
| net_with_loss = NetWithLoss(net, strategy3) | net_with_loss = NetWithLoss(net, strategy3) | ||||
| train_net = TrainOneStepCell(net_with_loss, optimizer) | train_net = TrainOneStepCell(net_with_loss, optimizer) | ||||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||||
| compile_net(train_net, x, b) | compile_net(train_net, x, b) | ||||
| @@ -266,11 +266,11 @@ class BNReshapeDenseBNNet(nn.Cell): | |||||
| def test_bn_reshape_dense_bn_train_loss(): | def test_bn_reshape_dense_bn_train_loss(): | ||||
| batch_size = 16 | batch_size = 16 | ||||
| context.set_auto_parallel_context(device_num=device_num, global_rank=0) | context.set_auto_parallel_context(device_num=device_num, global_rank=0) | ||||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||||
| input_ = Tensor(np.ones([batch_size, 2, 32, 32]).astype(np.float32) * 0.01) | input_ = Tensor(np.ones([batch_size, 2, 32, 32]).astype(np.float32) * 0.01) | ||||
| label = Tensor(np.ones([batch_size]), dtype=ms.int32) | label = Tensor(np.ones([batch_size]), dtype=ms.int32) | ||||
| net = GradWrap(NetWithLoss(BNReshapeDenseBNNet())) | net = GradWrap(NetWithLoss(BNReshapeDenseBNNet())) | ||||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||||
| net.set_auto_parallel() | net.set_auto_parallel() | ||||
| _executor.compile(net, input_, label) | _executor.compile(net, input_, label) | ||||
| @@ -279,12 +279,12 @@ def test_bn_reshape_dense_bn_train_loss(): | |||||
| def test_semi_one_hot_net_batch(): | def test_semi_one_hot_net_batch(): | ||||
| batch_size = 16 | batch_size = 16 | ||||
| context.set_auto_parallel_context(device_num=device_num, global_rank=0) | context.set_auto_parallel_context(device_num=device_num, global_rank=0) | ||||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||||
| input_ = Tensor(np.ones([batch_size * 1, 512]).astype(np.float32) * 0.01) | input_ = Tensor(np.ones([batch_size * 1, 512]).astype(np.float32) * 0.01) | ||||
| label = Tensor(np.ones([batch_size]), dtype=ms.int32) | label = Tensor(np.ones([batch_size]), dtype=ms.int32) | ||||
| net = SemiAutoOneHotNet(args=Args(), strategy=StrategyBatch()) | net = SemiAutoOneHotNet(args=Args(), strategy=StrategyBatch()) | ||||
| net = GradWrap(NetWithLoss(net)) | net = GradWrap(NetWithLoss(net)) | ||||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||||
| net.set_auto_parallel() | net.set_auto_parallel() | ||||
| _executor.compile(net, input_, label) | _executor.compile(net, input_, label) | ||||
| @@ -300,10 +300,10 @@ def test_semi_one_hot_net_model(): | |||||
| label = Tensor(np.ones([batch_size]), dtype=ms.int32) | label = Tensor(np.ones([batch_size]), dtype=ms.int32) | ||||
| dataset = Dataset(predict, label, 2, input_num=2) | dataset = Dataset(predict, label, 2, input_num=2) | ||||
| net = SemiAutoOneHotNet(args=Args(), strategy=StrategyModel()) | |||||
| opt = Momentum(net.trainable_params(), learning_rate, momentum) | |||||
| context.reset_auto_parallel_context() | context.reset_auto_parallel_context() | ||||
| context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=16) | context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=16) | ||||
| context.set_context(mode=context.GRAPH_MODE) | context.set_context(mode=context.GRAPH_MODE) | ||||
| net = SemiAutoOneHotNet(args=Args(), strategy=StrategyModel()) | |||||
| opt = Momentum(net.trainable_params(), learning_rate, momentum) | |||||
| model = Model(net, optimizer=opt) | model = Model(net, optimizer=opt) | ||||
| model.train(epoch_size, dataset, dataset_sink_mode=False) | model.train(epoch_size, dataset, dataset_sink_mode=False) | ||||
| @@ -353,6 +353,8 @@ def test_resnet_operator_batch_parallel(): | |||||
| context.reset_auto_parallel_context() | context.reset_auto_parallel_context() | ||||
| context.set_auto_parallel_context(device_num=dev_num, global_rank=0) | context.set_auto_parallel_context(device_num=dev_num, global_rank=0) | ||||
| context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=dev_num) | |||||
| context.set_context(mode=context.GRAPH_MODE) | |||||
| predict = Tensor(np.ones([batch_size, 3, 224, 224]), dtype=ms.float32) | predict = Tensor(np.ones([batch_size, 3, 224, 224]), dtype=ms.float32) | ||||
| label = Tensor(np.ones([batch_size]), dtype=ms.int32) | label = Tensor(np.ones([batch_size]), dtype=ms.int32) | ||||
| @@ -363,9 +365,6 @@ def test_resnet_operator_batch_parallel(): | |||||
| loss.softmax_cross_entropy.set_strategy(((dev_num, 1), (dev_num, 1))) | loss.softmax_cross_entropy.set_strategy(((dev_num, 1), (dev_num, 1))) | ||||
| opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate, momentum) | opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate, momentum) | ||||
| context.reset_auto_parallel_context() | |||||
| context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=dev_num) | |||||
| context.set_context(mode=context.GRAPH_MODE) | |||||
| model = Model(net, loss, opt) | model = Model(net, loss, opt) | ||||
| model.train(epoch_size, dataset, dataset_sink_mode=False) | model.train(epoch_size, dataset, dataset_sink_mode=False) | ||||
| @@ -379,6 +378,8 @@ def test_resnet_model_parallel(): | |||||
| context.reset_auto_parallel_context() | context.reset_auto_parallel_context() | ||||
| context.set_auto_parallel_context(device_num=dev_num, global_rank=0) | context.set_auto_parallel_context(device_num=dev_num, global_rank=0) | ||||
| context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=dev_num) | |||||
| context.set_context(mode=context.GRAPH_MODE) | |||||
| predict = Tensor(np.ones([batch_size, 64, 112, 112]), dtype=ms.float32) | predict = Tensor(np.ones([batch_size, 64, 112, 112]), dtype=ms.float32) | ||||
| label = Tensor(np.ones([batch_size]), dtype=ms.int32) | label = Tensor(np.ones([batch_size]), dtype=ms.int32) | ||||
| @@ -389,9 +390,6 @@ def test_resnet_model_parallel(): | |||||
| loss.softmax_cross_entropy.set_strategy(((dev_num, 1), (dev_num, 1))) | loss.softmax_cross_entropy.set_strategy(((dev_num, 1), (dev_num, 1))) | ||||
| opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate, momentum) | opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate, momentum) | ||||
| context.reset_auto_parallel_context() | |||||
| context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=dev_num) | |||||
| context.set_context(mode=context.GRAPH_MODE) | |||||
| model = Model(net, loss, opt) | model = Model(net, loss, opt) | ||||
| model.train(epoch_size, dataset, dataset_sink_mode=False) | model.train(epoch_size, dataset, dataset_sink_mode=False) | ||||
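The Model-driven resnet tests get the same treatment with one extra step: reset_auto_parallel_context() runs first, clearing any settings left over from earlier tests, and only then are the semi-auto-parallel context and graph mode fixed, before the network, loss, and Momentum optimizer are instantiated. Schematically:

    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(device_num=dev_num, global_rank=0)
    context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL,
                                      device_num=dev_num)
    context.set_context(mode=context.GRAPH_MODE)
    # only now: build net, loss, opt, and Model, then model.train(...)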
| @@ -45,6 +45,8 @@ class Net(nn.Cell): | |||||
| def test_dense_gen_graph(): | def test_dense_gen_graph(): | ||||
| context.set_context(mode=context.GRAPH_MODE) | context.set_context(mode=context.GRAPH_MODE) | ||||
| context.reset_auto_parallel_context() | |||||
| context.set_auto_parallel_context(parallel_mode=ParallelMode.HYBRID_PARALLEL, mirror_mean=True, device_num=8) | |||||
| init() | init() | ||||
| network = Net(512, 128) | network = Net(512, 128) | ||||
| @@ -53,8 +55,6 @@ def test_dense_gen_graph(): | |||||
| learning_rate=0.1, | learning_rate=0.1, | ||||
| momentum=0.9) | momentum=0.9) | ||||
| network = WithLossCell(network, loss_fn) | network = WithLossCell(network, loss_fn) | ||||
| context.reset_auto_parallel_context() | |||||
| context.set_auto_parallel_context(parallel_mode=ParallelMode.HYBRID_PARALLEL, mirror_mean=True, device_num=8) | |||||
| network = TrainOneStepCell(network, optimizer) | network = TrainOneStepCell(network, optimizer) | ||||
| predict = Tensor(np.ones([64, 512]).astype(np.float32) * 0.01) | predict = Tensor(np.ones([64, 512]).astype(np.float32) * 0.01) | ||||
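The hybrid-parallel variant follows suit: the HYBRID_PARALLEL context (mirror_mean=True, device_num=8) is now established before init() and before Net(512, 128) builds its layers, whose weights are precisely the initializer-backed Parameters the new flag tracks.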
| @@ -54,6 +54,7 @@ def test_optimizer_clone_weight(): | |||||
| return out | return out | ||||
| context.set_auto_parallel_context(device_num=4, global_rank=0) | context.set_auto_parallel_context(device_num=4, global_rank=0) | ||||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||||
| strategy1 = ((2, 1), (2, 1)) | strategy1 = ((2, 1), (2, 1)) | ||||
| strategy2 = ((4, 1),) | strategy2 = ((4, 1),) | ||||
| @@ -70,7 +71,6 @@ def test_optimizer_clone_weight(): | |||||
| net_with_loss = NetWithLoss(net, strategy3) | net_with_loss = NetWithLoss(net, strategy3) | ||||
| train_net = TrainOneStepCell(net_with_loss, optimizer) | train_net = TrainOneStepCell(net_with_loss, optimizer) | ||||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||||
| compile_net(train_net, x, b) | compile_net(train_net, x, b) | ||||
| @@ -89,6 +89,7 @@ def test_optimizer_clone_weight2(): | |||||
| return out | return out | ||||
| context.set_auto_parallel_context(device_num=4, global_rank=0) | context.set_auto_parallel_context(device_num=4, global_rank=0) | ||||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||||
| strategy1 = ((2, 1), (2, 1)) | strategy1 = ((2, 1), (2, 1)) | ||||
| strategy2 = ((4, 1),) | strategy2 = ((4, 1),) | ||||
| @@ -105,6 +106,5 @@ def test_optimizer_clone_weight2(): | |||||
| net_with_loss = NetWithLoss(net, strategy3) | net_with_loss = NetWithLoss(net, strategy3) | ||||
| train_net = TrainOneStepCell(net_with_loss, optimizer) | train_net = TrainOneStepCell(net_with_loss, optimizer) | ||||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||||
| compile_net(train_net, x, b) | compile_net(train_net, x, b) | ||||
| @@ -320,10 +320,10 @@ def reshape_net2(backbone): | |||||
| batch_size = 16 | batch_size = 16 | ||||
| device_num = 16 | device_num = 16 | ||||
| context.set_auto_parallel_context(device_num=device_num, global_rank=0) | context.set_auto_parallel_context(device_num=device_num, global_rank=0) | ||||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||||
| input_ = Tensor(np.ones([batch_size * device_num, 512, 7, 7]).astype(np.float32) * 0.01) | input_ = Tensor(np.ones([batch_size * device_num, 512, 7, 7]).astype(np.float32) * 0.01) | ||||
| net = GradWrap(NetWithLoss(backbone)) | net = GradWrap(NetWithLoss(backbone)) | ||||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||||
| compile_net(net, input_) | compile_net(net, input_) | ||||
| @@ -530,10 +530,10 @@ def test_bn_reshape_dense_bn_train(): | |||||
| batch_size = 16 | batch_size = 16 | ||||
| device_num = 16 | device_num = 16 | ||||
| context.set_auto_parallel_context(device_num=device_num, global_rank=0) | context.set_auto_parallel_context(device_num=device_num, global_rank=0) | ||||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||||
| input_ = Tensor(np.ones([batch_size, 2, 32, 32]).astype(np.float32) * 0.01) | input_ = Tensor(np.ones([batch_size, 2, 32, 32]).astype(np.float32) * 0.01) | ||||
| net = GradWrap(NetWithLoss(BNReshapeDenseBNNet())) | net = GradWrap(NetWithLoss(BNReshapeDenseBNNet())) | ||||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||||
| compile_net(net, input_) | compile_net(net, input_) | ||||
| @@ -18,6 +18,7 @@ from numpy import allclose | |||||
| import mindspore.common.initializer as init | import mindspore.common.initializer as init | ||||
| import mindspore.nn as nn | import mindspore.nn as nn | ||||
| from mindspore import Parameter | from mindspore import Parameter | ||||
| from mindspore.parallel._utils import _set_has_initializer | |||||
| parameter_shape = [16, 4] | parameter_shape = [16, 4] | ||||
| @@ -46,6 +47,7 @@ def test_using_same_seed_for_initializer(): | |||||
| np.random.seed(0) | np.random.seed(0) | ||||
| net2 = ParameterNet() | net2 = ParameterNet() | ||||
| net2.init_parameters_data() | net2.init_parameters_data() | ||||
| _set_has_initializer(False) | |||||
| for key in net1.parameters_dict(): | for key in net1.parameters_dict(): | ||||
| if key not in net2.parameters_dict(): | if key not in net2.parameters_dict(): | ||||
| assert False | assert False | ||||
| @@ -60,6 +62,7 @@ def test_using_diffserent_seed_for_initializer(): | |||||
| np.random.seed(1) | np.random.seed(1) | ||||
| net2 = ParameterNet() | net2 = ParameterNet() | ||||
| net2.init_parameters_data() | net2.init_parameters_data() | ||||
| _set_has_initializer(False) | |||||
| for key in net1.parameters_dict(): | for key in net1.parameters_dict(): | ||||
| if key not in net2.parameters_dict(): | if key not in net2.parameters_dict(): | ||||
| assert False | assert False | ||||
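Both seed tests build two ParameterNets in the same process and call init_parameters_data(), which materializes initializer-backed Parameters and thereby sets the new process-wide flag. The added _set_has_initializer(False) calls reset that flag so the state does not leak into later tests. A sketch of the assumed helper semantics:

    from mindspore.parallel._utils import _set_has_initializer

    net = ParameterNet()
    net.init_parameters_data()      # creates Initializer-backed data -> flag set
    _set_has_initializer(False)     # clear it so later context changes stay legal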
| @@ -62,13 +62,13 @@ def test_virtual_dataset_3_input(): | |||||
| out = self.matmul2(out, b) | out = self.matmul2(out, b) | ||||
| return out | return out | ||||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||||
| context.set_auto_parallel_context(device_num=8, global_rank=0) | |||||
| strategy0 = ((2, 1), (2, 1), (2, 1)) | strategy0 = ((2, 1), (2, 1), (2, 1)) | ||||
| strategy1 = ((2, 2), (2, 2)) | strategy1 = ((2, 2), (2, 2)) | ||||
| strategy2 = ((2, 2), (2, 2)) | strategy2 = ((2, 2), (2, 2)) | ||||
| strategy3 = ((2, 4),) | strategy3 = ((2, 4),) | ||||
| net = GradWrap(NetWithLoss(Net(strategy0, strategy1, strategy2, strategy3))) | net = GradWrap(NetWithLoss(Net(strategy0, strategy1, strategy2, strategy3))) | ||||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||||
| context.set_auto_parallel_context(device_num=8, global_rank=0) | |||||
| x = Tensor(np.ones([128, 32]), dtype=ms.float32) | x = Tensor(np.ones([128, 32]), dtype=ms.float32) | ||||
| y = Tensor(np.ones([32, 64]), dtype=ms.float32) | y = Tensor(np.ones([32, 64]), dtype=ms.float32) | ||||
| b = Tensor(np.ones([64, 2048]), dtype=ms.float32) | b = Tensor(np.ones([64, 2048]), dtype=ms.float32) | ||||
| @@ -89,10 +89,10 @@ def test_virtualdataset_cell_3_inputs(): | |||||
| out = self.matmul2(out, b) | out = self.matmul2(out, b) | ||||
| return out | return out | ||||
| net = GradWrap(VirtualDatasetCellTriple(NetWithLoss(Net(None, None, None)))) | |||||
| context.set_context(save_graphs=True) | context.set_context(save_graphs=True) | ||||
| context.set_auto_parallel_context(parallel_mode="auto_parallel") | context.set_auto_parallel_context(parallel_mode="auto_parallel") | ||||
| context.set_auto_parallel_context(device_num=8, global_rank=0) | context.set_auto_parallel_context(device_num=8, global_rank=0) | ||||
| net = GradWrap(VirtualDatasetCellTriple(NetWithLoss(Net(None, None, None)))) | |||||
| x = Tensor(np.ones([128, 32]), dtype=ms.float32) | x = Tensor(np.ones([128, 32]), dtype=ms.float32) | ||||
| y = Tensor(np.ones([32, 64]), dtype=ms.float32) | y = Tensor(np.ones([32, 64]), dtype=ms.float32) | ||||
| b = Tensor(np.ones([64, 2048]), dtype=ms.float32) | b = Tensor(np.ones([64, 2048]), dtype=ms.float32) | ||||
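The virtual-dataset tests show the same move for both modes: the semi_auto_parallel and auto_parallel contexts (plus device_num/global_rank) are configured before the Net is constructed and wrapped in GradWrap/NetWithLoss, since that wrapping is what instantiates the Cells whose Parameters may come from an Initializer.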
| @@ -146,6 +146,10 @@ def test_compile_model_train_O2(): | |||||
| def test_compile_model_train_O2_parallel(): | def test_compile_model_train_O2_parallel(): | ||||
| dataset_types = (np.float32, np.float32) | dataset_types = (np.float32, np.float32) | ||||
| dataset_shapes = ((16, 16), (16, 16)) | dataset_shapes = ((16, 16), (16, 16)) | ||||
| context.set_auto_parallel_context( | |||||
| global_rank=0, device_num=8, | |||||
| mirror_mean=True, parameter_broadcast=True, | |||||
| parallel_mode=ParallelMode.DATA_PARALLEL) | |||||
| dataset = MindDataSet(dataset_types, dataset_shapes) | dataset = MindDataSet(dataset_types, dataset_shapes) | ||||
| @@ -153,10 +157,6 @@ def test_compile_model_train_O2_parallel(): | |||||
| loss = nn.MSELoss() | loss = nn.MSELoss() | ||||
| optimizer = nn.Momentum(net.trainable_params(), 0.1, 0.9, 0.00004, 1024.0) | optimizer = nn.Momentum(net.trainable_params(), 0.1, 0.9, 0.00004, 1024.0) | ||||
| context.set_auto_parallel_context( | |||||
| global_rank=0, device_num=8, | |||||
| mirror_mean=True, parameter_broadcast=True, | |||||
| parallel_mode=ParallelMode.DATA_PARALLEL) | |||||
| init() | init() | ||||
| model = Model(net, loss_fn=loss, optimizer=optimizer, metrics={"acc"}, amp_level="O2") | model = Model(net, loss_fn=loss, optimizer=optimizer, metrics={"acc"}, amp_level="O2") | ||||
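The final hunk applies the same rule under DATA_PARALLEL: the context block (mirror_mean, parameter_broadcast, DATA_PARALLEL) now precedes MindDataSet and network construction. Every test file touched by this PR thus ends up with the one invariant the new has_initializer flag enforces: configure the parallel context first, create Parameters second.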