@@ -72,7 +72,7 @@ class DataGenerator():
         i = 0
         for stra in strategy:
             temp = []
-            while len(blocks) > 0:
+            while blocks:
                 block = blocks.pop(0)
                 temp.extend(np.split(block, stra, axis=i))
             blocks.extend(temp)
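This change (repeated in several of these test files) addresses pylint's len-as-condition check (C1801): an empty Python sequence is falsy, so `while blocks:` behaves exactly like `while len(blocks) > 0:` and is the idiomatic spelling. A minimal, self-contained sketch of the same splitting loop, with hypothetical data standing in for the test's real blocks:

    import numpy as np

    # Split one 4x4 block in half along axis 0, then each piece along axis 1,
    # mirroring the loop in DataGenerator above.
    blocks = [np.arange(16).reshape(4, 4)]
    for axis, stra in enumerate([2, 2]):
        temp = []
        while blocks:                      # truthiness check: loop ends when the list is empty
            temp.extend(np.split(blocks.pop(0), stra, axis=axis))
        blocks.extend(temp)
    print(len(blocks), blocks[0].shape)    # 4 (2, 2)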
@@ -63,7 +63,7 @@ class DataGenerator():
         i = 0
         for stra in strategy:
             temp = []
-            while len(blocks) > 0:
+            while blocks:
                 block = blocks.pop(0)
                 temp.extend(np.split(block, stra, axis=i))
             blocks.extend(temp)
@@ -172,10 +172,12 @@ class ResNet(nn.Cell):
                  layer_nums,
                  in_channels,
                  out_channels,
-                 strides=[1, 2, 2, 2],
+                 strides=None,
                  num_classes=100):
         super(ResNet, self).__init__()
+        if strides is None:
+            strides = [1, 2, 2, 2]
         if not len(layer_nums) == len(in_channels) == len(out_channels) == 4:
             raise ValueError("the length of "
                              "layer_num, inchannel, outchannel list must be 4!")
@@ -300,7 +302,7 @@ class DataGenerator():
         i = 0
         for stra in strategy:
             temp = []
-            while len(blocks) > 0:
+            while blocks:
                 block = blocks.pop(0)
                 temp.extend(np.split(block, stra, axis=i))
             blocks.extend(temp)
@@ -38,17 +38,6 @@ class NetWithLoss(nn.Cell):
         return self.loss(predict)


-class GradWrap(nn.Cell):
-    def __init__(self, network):
-        super(GradWrap, self).__init__()
-        self.network = network
-
-    def construct(self, x, y, z, w):
-        return C.grad_all(self.network)(x, y, z, w)
-
-
 # model_parallel test
 def test_common_parameter():
     class Net(nn.Cell):
         def __init__(self):
@@ -174,9 +174,9 @@ def test_reshape_auto_4():

 def test_reshape_auto_5():
-    class NetWithLoss(nn.Cell):
+    class NetWithLoss5(nn.Cell):
         def __init__(self, network):
-            super(NetWithLoss, self).__init__()
+            super(NetWithLoss5, self).__init__()
             self.loss = VirtualLoss()
             self.network = network
@@ -184,9 +184,9 @@ def test_reshape_auto_5():
             predict = self.network(x, y)
             return self.loss(predict)

-    class GradWrap(nn.Cell):
+    class GradWrap5(nn.Cell):
         def __init__(self, network):
-            super(GradWrap, self).__init__()
+            super(GradWrap5, self).__init__()
             self.network = network

         def construct(self, x, y):
@@ -217,16 +217,16 @@ def test_reshape_auto_5():
     x = Tensor(np.ones([4, 1024 * size, 1]), dtype=ms.float32)
     y = Tensor(np.ones([4, 1024 * size,]), dtype=ms.float32)
-    net = GradWrap(NetWithLoss(Net()))
+    net = GradWrap5(NetWithLoss5(Net()))
     context.set_auto_parallel_context(parallel_mode="auto_parallel")
     net.set_auto_parallel()
     _executor.compile(net, x, y)


 def test_reshape_auto_6():
-    class NetWithLoss(nn.Cell):
+    class NetWithLoss6(nn.Cell):
         def __init__(self, network):
-            super(NetWithLoss, self).__init__()
+            super(NetWithLoss6, self).__init__()
             self.loss = VirtualLoss()
             self.network = network
@@ -234,9 +234,9 @@ def test_reshape_auto_6():
             predict = self.network(x, y)
             return self.loss(predict)

-    class GradWrap(nn.Cell):
+    class GradWrap6(nn.Cell):
         def __init__(self, network):
-            super(GradWrap, self).__init__()
+            super(GradWrap6, self).__init__()
             self.network = network

         def construct(self, x, y):
@@ -265,7 +265,7 @@ def test_reshape_auto_6():
     x = Tensor(np.ones([4, 1024, 1]), dtype=ms.float32)
     y = Tensor(np.ones([4, 1024,]), dtype=ms.float32)
-    net = GradWrap(NetWithLoss(Net()))
+    net = GradWrap6(NetWithLoss6(Net()))
     context.set_auto_parallel_context(parallel_mode="auto_parallel")
     net.set_auto_parallel()
     _executor.compile(net, x, y)
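The numbered names here (`NetWithLoss5`, `GradWrap5`, `NetWithLoss6`, `GradWrap6`) exist because each test function defined nested classes with the same names as module-level helpers, which pylint reports as redefining a name from an outer scope (typically W0621). A stripped-down sketch of the situation, with hypothetical names:

    class GradWrap:              # module-level helper
        pass

    def test_something():
        class GradWrap:          # W0621: shadows the outer GradWrap
            pass
        return GradWrap()

    def test_something_fixed():
        class GradWrap7:         # distinct name, no shadowing
            pass
        return GradWrap7()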
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import numpy as np
 import re
+import numpy as np

 import mindspore.common.dtype as mstype
 import mindspore.nn as nn
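The swap restores PEP 8's import grouping, which pylint enforces as wrong-import-order (C0411): standard-library modules first, third-party packages second, local imports last, with a blank line between groups. The target layout, sketched:

    # 1) standard library
    import re

    # 2) third-party packages
    import numpy as np

    import mindspore.common.dtype as mstype
    import mindspore.nn as nn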
@@ -36,35 +36,33 @@ context.set_context(device_id=0)
 init()


-def weight_variable(shape, factor=0.1):
+def weight_variable():
     return TruncatedNormal(0.02)


 def _conv3x3(in_channels, out_channels, stride=1, padding=0, pad_mode='same'):
     """Get a conv2d layer with 3x3 kernel size."""
-    init_value = weight_variable((out_channels, in_channels, 3, 3))
+    init_value = weight_variable()
     return nn.Conv2d(in_channels, out_channels,
                      kernel_size=3, stride=stride, padding=padding, pad_mode=pad_mode, weight_init=init_value)


 def _conv1x1(in_channels, out_channels, stride=1, padding=0, pad_mode='same'):
     """Get a conv2d layer with 1x1 kernel size."""
-    init_value = weight_variable((out_channels, in_channels, 1, 1))
+    init_value = weight_variable()
     return nn.Conv2d(in_channels, out_channels,
                      kernel_size=1, stride=stride, padding=padding, pad_mode=pad_mode, weight_init=init_value)


 def _conv7x7(in_channels, out_channels, stride=1, padding=0, pad_mode='same'):
     """Get a conv2d layer with 7x7 kernel size."""
-    init_value = weight_variable((out_channels, in_channels, 7, 7))
+    init_value = weight_variable()
     return nn.Conv2d(in_channels, out_channels,
                      kernel_size=7, stride=stride, padding=padding, pad_mode=pad_mode, weight_init=init_value)


 def _fused_bn(channels, momentum=0.9):
     """Get a fused batchnorm"""
-    init_weight = weight_variable((channels,))
-    init_bias = weight_variable((channels,))
     return nn.BatchNorm2d(channels, momentum=momentum)
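The old `weight_variable(shape, factor=0.1)` ignored both of its parameters and always returned `TruncatedNormal(0.02)`, so the arguments were dead weight (pylint: unused-argument, W0613), as were the two unused initializers in `_fused_bn`. Dropping them works because MindSpore layers accept an `Initializer` object for `weight_init`/`bias_init` and derive the parameter shape themselves. A minimal sketch, assuming the same imports this file already uses:

    import mindspore.nn as nn
    from mindspore.common.initializer import TruncatedNormal

    def weight_variable():
        return TruncatedNormal(0.02)     # no shape needed; the layer infers it

    # nn.Conv2d builds its own (out_channels, in_channels, 3, 3) weight:
    conv = nn.Conv2d(16, 32, kernel_size=3, weight_init=weight_variable())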
@@ -132,10 +130,11 @@ class ResNet(nn.Cell):
                  layer_nums,
                  in_channels,
                  out_channels,
-                 strides=[1, 2, 2, 2],
+                 strides=None,
                  num_classes=100):
         super(ResNet, self).__init__()
+        if strides is None:
+            strides = [1, 2, 2, 2]
         if not len(layer_nums) == len(in_channels) == len(out_channels) == 4:
             raise ValueError("the length of "
                              "layer_num, inchannel, outchannel list must be 4!")
@@ -168,16 +167,13 @@ class ResNet(nn.Cell):
         self.mean = P.ReduceMean(keep_dims=True)
         self.end_point = nn.Dense(2048, num_classes, has_bias=True,
-                                  weight_init=weight_variable((num_classes, 2048)),
-                                  bias_init=weight_variable((num_classes,))).add_flags_recursive(fp16=True)
+                                  weight_init=weight_variable(),
+                                  bias_init=weight_variable()).add_flags_recursive(fp16=True)
         self.squeeze = P.Squeeze()
         self.cast = P.Cast()

     def _make_layer(self, block, layer_num, in_channel, out_channel, stride):
         layers = []

-        down_sample = False
-        if stride != 1 or in_channel != out_channel:
-            down_sample = True
         resblk = block(in_channel, out_channel, stride=1)
         layers.append(resblk)
@@ -279,7 +275,7 @@ class DatasetLenet():
         return 1


-def test_train_32k_8p(epoch_size=3, batch_size=32, num_classes=32768):
+def test_train_32k_8p(batch_size=32, num_classes=32768):
     dev_num = 8
     context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL, device_num=dev_num)
     set_algo_parameters(elementwise_op_strategy_follow=True)
@@ -309,12 +305,12 @@ def test_train_32k_8p(epoch_size=3, batch_size=32, num_classes=32768):
     return allreduce_fusion_dict


-def train_32k_8p_fusion1(epoch_size=3, batch_size=32, num_classes=32768): # 1048576 #131072 #32768 #8192
+def train_32k_8p_fusion1(batch_size=32, num_classes=32768): # 1048576 #131072 #32768 #8192
     cost_model_context.set_cost_model_context(costmodel_gamma=0.001, costmodel_beta=400.0)
     cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_algorithm=1)
     cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_times=2)
     cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_tail_percent=0.5)
-    allreduce_fusion_dict = test_train_32k_8p(epoch_size, batch_size, num_classes)
+    allreduce_fusion_dict = test_train_32k_8p(batch_size, num_classes)
     expect_dict = {'end_point.bias': 2,
                    'end_point.weight': 2,
                    'layer4.2.bn3.beta': 2,
@@ -477,17 +473,17 @@ def train_32k_8p_fusion1(epoch_size=3, batch_size=32, num_classes=32768): # 104
                    'bn1.gamma': 1,
                    'conv1.weight': 1}
-    assert (allreduce_fusion_dict == expect_dict)
+    assert allreduce_fusion_dict == expect_dict
     cost_model_context.reset_cost_model_context()


-def train_32k_8p_fusion2(epoch_size=3, batch_size=32, num_classes=32768): # 1048576 #131072 #32768 #8192
+def train_32k_8p_fusion2(batch_size=32, num_classes=32768): # 1048576 #131072 #32768 #8192
     cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_algorithm=2)
     cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_tail_time=0.1)
     cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_allreduce_inherent_time=0.05)
     cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_allreduce_bandwidth=0.000001)
     cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_computation_time_parameter=0.0000015)
-    allreduce_fusion_dict = test_train_32k_8p(epoch_size, batch_size, num_classes)
+    allreduce_fusion_dict = test_train_32k_8p(batch_size, num_classes)
     expect_dict = {'end_point.bias': 2,
                    'end_point.weight': 2,
                    'layer4.2.bn3.beta': 2,
@@ -650,11 +646,11 @@ def train_32k_8p_fusion2(epoch_size=3, batch_size=32, num_classes=32768): # 104
                    'bn1.gamma': 1,
                    'conv1.weight': 1}
-    assert (allreduce_fusion_dict == expect_dict)
+    assert allreduce_fusion_dict == expect_dict
     cost_model_context.reset_cost_model_context()


-def test_train_64k_8p(epoch_size=3, batch_size=32, num_classes=65536): # 1048576 #131072 #32768 #8192
+def test_train_64k_8p(batch_size=32, num_classes=65536): # 1048576 #131072 #32768 #8192
     dev_num = 8
     context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL, device_num=dev_num)
     cost_model_context.set_cost_model_context(costmodel_gamma=0.001, costmodel_beta=400.0)
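Two unrelated cleanups repeat through these hunks: the unused `epoch_size` parameter is dropped from the training helpers, and `assert (a == b)` loses its redundant parentheses (pylint: superfluous-parens, C0325). The parenthesized assert is also one edit away from a classic trap, shown here for illustration:

    expect = {'end_point.bias': 2}
    result = {'end_point.bias': 2}

    assert (result == expect)               # works, but the parens add nothing
    assert result == expect                 # preferred form
    # assert (result == expect, "mismatch") # BUG: asserts a 2-tuple, which is always truthy
    assert result == expect, "mismatch"     # correct way to attach a message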
@@ -58,7 +58,7 @@ def test_zig_zag_graph():
         def construct(self, x, y, z, w, a):
             m1_result = self.matmul1(x, y)
             m2_result = self.matmul2(z, w)
-            m3_result = self.matmul3(m2_result, m1_result)
+            _ = self.matmul3(m2_result, m1_result)
             out = self.matmul4(m2_result, a)

             return out
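Binding the otherwise-unused matmul output to `_` signals, to readers and to pylint's unused-variable check (W0612), that the result is deliberately discarded while the call itself is kept. The convention in plain Python:

    def shapes():
        return (4, 1024), 8

    shape, _ = shapes()    # `_` marks the value we intentionally ignore
    print(shape)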
@@ -101,7 +101,7 @@ def fixme_test_dataset_interface_sens_scalar():

     class TrainOneStepCell(nn.Cell):
-        def __init__(self, network, optimizer, sens=1.0):
+        def __init__(self, network, optimizer):
             super(TrainOneStepCell, self).__init__(auto_prefix=False)
             self.network = network
             self.network.add_flags(defer_inline=True)
@@ -135,7 +135,7 @@ def test_dataset_interface_sens_shape_not_equal_loss():
     sens = Tensor(np.ones([256, 1024]), dtype=ms.float32)
     try:
         loss_scale_manager_sens(strategy1, sens)
-    except:
+    except BaseException:
         pass
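A bare `except:` catches everything, including `SystemExit` and `KeyboardInterrupt`; pylint flags it as bare-except (W0702), and `except BaseException:` is the minimal change that keeps the old behaviour while making that breadth explicit. For a test that simply expects a failure, `pytest.raises` would be a tighter pattern; a hypothetical alternative, not what this patch does:

    import pytest

    def divide(a, b):
        return a / b

    def test_divide_by_zero():
        with pytest.raises(ZeroDivisionError):  # test fails if no exception is raised
            divide(1, 0)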
@@ -45,8 +45,10 @@ class GradWrap(nn.Cell):

 class Net(nn.Cell):
-    def __init__(self, axis=0, strategy1=None, strategy2=None, shape=[64, 64]):
+    def __init__(self, axis=0, strategy1=None, strategy2=None, shape=None):
         super().__init__()
+        if shape is None:
+            shape = [64, 64]
         self.gatherv2 = P.GatherV2().set_strategy(strategy1)
         self.mul = P.Mul().set_strategy(strategy2)
         self.index = Tensor(np.ones(shape), dtype=ms.int32)
@@ -221,14 +221,14 @@ def test_axis1_auto_batch_parallel():

 def test_axis1_batch_parallel():
-    gather_v2_strategy = ((device_number, 1), (1, ))
+    gather_v2_strategy = ((device_number, 1), (1,))
     criterion = GatherV2Axis1(1, strategy=gather_v2_strategy, index_size=512)
     rank = 2
     net_trains(criterion, rank)


 def test_axis1_strategy1():
-    gather_v2_strategy = ((16, 2), (1, ))
+    gather_v2_strategy = ((16, 2), (1,))
     rank = 17
     criterion = GatherV2Axis1(1, strategy=gather_v2_strategy, index_size=512)
     net_trains(criterion, rank)
@@ -265,7 +265,6 @@ class BNReshapeDenseBNNet(nn.Cell):

 def test_bn_reshape_dense_bn_train_loss():
     batch_size = 16
-    device_num = 16
     context.set_auto_parallel_context(device_num=device_num, global_rank=0)
     input_ = Tensor(np.ones([batch_size, 2, 32, 32]).astype(np.float32) * 0.01)
     label = Tensor(np.ones([batch_size]), dtype=ms.int32)
@@ -104,7 +104,7 @@ def test_onehot_batch_parallel_invalid_strategy():
     strategy4 = ((16, 1), (16, 1))
     try:
         compile_graph(strategy1, strategy2, strategy3, strategy4)
-    except:
+    except BaseException:
         pass
@@ -144,7 +144,7 @@ def test_onehot_batch_parallel_invalid_strategy_axis0():
     strategy4 = ((16, 1), (16, 1))
     try:
         compile_graph(strategy1, strategy2, strategy3, strategy4, onthot_axis=0)
-    except:
+    except BaseException:
         pass
@@ -124,9 +124,9 @@ def test_prelu_parallel_success2():

 def test_prelu_parallel_success3():
-    class NetWithLoss(nn.Cell):
+    class NetWithLoss3(nn.Cell):
         def __init__(self, network):
-            super(NetWithLoss, self).__init__()
+            super(NetWithLoss3, self).__init__()
             self.loss = VirtualLoss()
             self.network = network
@@ -134,9 +134,9 @@ def test_prelu_parallel_success3():
             predict = self.network(x, y, w)
             return self.loss(predict)

-    class GradWrap(nn.Cell):
+    class GradWrap3(nn.Cell):
         def __init__(self, network):
-            super(GradWrap, self).__init__()
+            super(GradWrap3, self).__init__()
             self.network = network

         def construct(self, x, y, w):
@@ -161,7 +161,7 @@ def test_prelu_parallel_success3():
     x = Tensor(np.random.rand(128, 64), dtype=ms.float32)
     y = Tensor(np.random.rand(64, 16), dtype=ms.float32)
     w = Tensor(np.random.rand(16), dtype=ms.float32)
-    net = GradWrap(NetWithLoss(Net(strategy1, strategy2)))
+    net = GradWrap3(NetWithLoss3(Net(strategy1, strategy2)))
     net.set_auto_parallel()
     _executor.compile(net, x, y, w)
@@ -114,7 +114,7 @@ def test_reshape1_strategy_1():
     strategy_loss = ((8, 1), (8, 1))
     try:
         reshape_common(ParallelMode.SEMI_AUTO_PARALLEL, strategy0, strategy1, strategy2, strategy_loss)
-    except:
+    except BaseException:
         pass
@@ -125,7 +125,7 @@ def test_reshape1_strategy_2():
     strategy_loss = ((8, 1), (8, 1))
     try:
         reshape_common(ParallelMode.AUTO_PARALLEL, strategy0, strategy1, strategy2, strategy_loss)
-    except:
+    except BaseException:
         pass
@@ -347,14 +347,14 @@ def test_reshape_net3_2():

 def test_reshape_net4_1():
     try:
         reshape_net2(ReshapeNet4(((1, 8), (8, 1))))
-    except:
+    except BaseException:
         pass


 def test_reshape_net4_2():
     try:
         reshape_net2(ReshapeNet4(((1, 8), (8, 2))))
-    except:
+    except BaseException:
         pass