
!1348 fix pylint warnings of parallel test cases

Merge pull request !1348 from yihuaijie/master
tags/v0.3.0-alpha
mindspore-ci-bot (Gitee), 5 years ago
parent commit 3f23aa1d79
90 changed files with 4030 additions and 4081 deletions
  1. +154 -154 tests/st/auto_parallel/onehot_model_parallel.py
  2. +276 -275 tests/st/auto_parallel/soft_entropy_loss_expand_parallel.py
  3. +26 -26 tests/st/auto_parallel/test_expand_loss.py
  4. +21 -22 tests/st/auto_parallel/test_model_parallel_onehot.py
  5. +16 -19 tests/st/auto_parallel/test_resnet50_expand_loss_2p.py
  6. +17 -17 tests/ut/python/communication/__init__.py
  7. +0 -1 tests/ut/python/communication/test_comm.py
  8. +1 -2 tests/ut/python/communication/test_data_parallel_lenet.py
  9. +1 -1 tests/ut/python/parallel/__init__.py
  10. +178 -178 tests/ut/python/parallel/parallel_end_to_end/add_relu/_test_add_relu_parallel_4p.py
  11. +356 -356 tests/ut/python/parallel/parallel_end_to_end/batch_parallel/_test_conv2d_parallel_4p.py
  12. +120 -120 tests/ut/python/parallel/parallel_end_to_end/dropout/_test_dropout_parallel_4p.py
  13. +154 -154 tests/ut/python/parallel/parallel_end_to_end/hcom/_test_allgather_4p.py
  14. +175 -175 tests/ut/python/parallel/parallel_end_to_end/hcom/_test_allreduce_4p.py
  15. +206 -207 tests/ut/python/parallel/parallel_end_to_end/l2normalize/_test_l2normalize_parallel_4p.py
  16. +195 -196 tests/ut/python/parallel/parallel_end_to_end/loss/_test_loss_parallel_4p.py
  17. +329 -329 tests/ut/python/parallel/parallel_end_to_end/matmul/_test_matmul_parallel_4p.py
  18. +213 -214 tests/ut/python/parallel/parallel_end_to_end/max/_test_max_parallel_4p.py
  19. +200 -201 tests/ut/python/parallel/parallel_end_to_end/mul_softmax/need_fix_test_mul_softmax_parallel_4p.py
  20. +147 -149 tests/ut/python/parallel/parallel_end_to_end/onehot/_test_onehot_parallel_4p.py
  21. +206 -206 tests/ut/python/parallel/parallel_end_to_end/prelu/_test_prelu_parallel_4p.py
  22. +252 -253 tests/ut/python/parallel/parallel_end_to_end/reducemean/_test_reducemean_parallel_4p.py
  23. +206 -206 tests/ut/python/parallel/parallel_end_to_end/reshape/_test_reshape_parallel_4p.py
  24. +235 -236 tests/ut/python/parallel/parallel_end_to_end/transpose/_test_transpose_parallel_4p.py
  25. +3 -3 tests/ut/python/parallel/test_add_relu_redistribution.py
  26. +20 -21 tests/ut/python/parallel/test_allreduce_fusion.py
  27. +1 -2 tests/ut/python/parallel/test_alltoall.py
  28. +20 -20 tests/ut/python/parallel/test_arithmetic.py
  29. +0 -1 tests/ut/python/parallel/test_auto_parallel_BN_PReLU.py
  30. +5 -6 tests/ut/python/parallel/test_auto_parallel_arithmetic.py
  31. +1 -2 tests/ut/python/parallel/test_auto_parallel_assign_sub_with_ref_key.py
  32. +0 -1 tests/ut/python/parallel/test_auto_parallel_double_subgraphs.py
  33. +4 -4 tests/ut/python/parallel/test_auto_parallel_four_matmul.py
  34. +1 -1 tests/ut/python/parallel/test_auto_parallel_inference.py
  35. +1 -1 tests/ut/python/parallel/test_auto_parallel_matmul_prelu.py
  36. +0 -1 tests/ut/python/parallel/test_auto_parallel_parameter_cast.py
  37. +4 -4 tests/ut/python/parallel/test_auto_parallel_reduce_method.py
  38. +2 -2 tests/ut/python/parallel/test_auto_parallel_reshape.py
  39. +4 -4 tests/ut/python/parallel/test_auto_parallel_rhombus.py
  40. +0 -1 tests/ut/python/parallel/test_auto_parallel_softmax_loss.py
  41. +2 -2 tests/ut/python/parallel/test_auto_parallel_transformer.py
  42. +18 -4 tests/ut/python/parallel/test_auto_parallel_two_bn.py
  43. +6 -6 tests/ut/python/parallel/test_auto_parallel_two_matmul.py
  44. +1 -4 tests/ut/python/parallel/test_auto_star_elimination.py
  45. +7 -7 tests/ut/python/parallel/test_batch_matmul.py
  46. +2 -4 tests/ut/python/parallel/test_batchnorm_batch_parallel.py
  47. +6 -7 tests/ut/python/parallel/test_bn_prelu_cell.py
  48. +1 -1 tests/ut/python/parallel/test_bool_grad.py
  49. +2 -2 tests/ut/python/parallel/test_broadcast_dict.py
  50. +11 -11 tests/ut/python/parallel/test_comparison_function_info.py
  51. +3 -5 tests/ut/python/parallel/test_dataset_util.py
  52. +2 -2 tests/ut/python/parallel/test_dense_matmul.py
  53. +4 -4 tests/ut/python/parallel/test_different_type_for_div_op.py
  54. +6 -6 tests/ut/python/parallel/test_dropout_do_mask.py
  55. +11 -11 tests/ut/python/parallel/test_element_wise_function.py
  56. +7 -7 tests/ut/python/parallel/test_expand_dims.py
  57. +6 -6 tests/ut/python/parallel/test_forward_graph.py
  58. +0 -1 tests/ut/python/parallel/test_gather_v2.py
  59. +10 -10 tests/ut/python/parallel/test_gather_v2_primitive.py
  60. +9 -13 tests/ut/python/parallel/test_get_next.py
  61. +2 -2 tests/ut/python/parallel/test_get_parameter_layout.py
  62. +8 -8 tests/ut/python/parallel/test_hybird_parallel_activation.py
  63. +7 -7 tests/ut/python/parallel/test_layer_norm.py
  64. +0 -1 tests/ut/python/parallel/test_linear.py
  65. +9 -10 tests/ut/python/parallel/test_loss_and_optimizer.py
  66. +5 -5 tests/ut/python/parallel/test_matmul_tensor.py
  67. +6 -6 tests/ut/python/parallel/test_neg.py
  68. +1 -1 tests/ut/python/parallel/test_one_dev.py
  69. +8 -9 tests/ut/python/parallel/test_one_hot_net.py
  70. +0 -1 tests/ut/python/parallel/test_one_weight_parameter.py
  71. +0 -9 tests/ut/python/parallel/test_onehot.py
  72. +21 -27 tests/ut/python/parallel/test_operator_model_parallel.py
  73. +3 -3 tests/ut/python/parallel/test_optimizer_clone_weight.py
  74. +1 -1 tests/ut/python/parallel/test_parameter_init.py
  75. +7 -7 tests/ut/python/parallel/test_prelu.py
  76. +3 -5 tests/ut/python/parallel/test_prelu_cell.py
  77. +29 -29 tests/ut/python/parallel/test_reduce_method_info.py
  78. +10 -13 tests/ut/python/parallel/test_reshape.py
  79. +3 -3 tests/ut/python/parallel/test_reshape_parameter.py
  80. +0 -1 tests/ut/python/parallel/test_scalar_loss.py
  81. +12 -12 tests/ut/python/parallel/test_set_auto_parallel_context.py
  82. +6 -6 tests/ut/python/parallel/test_sigmoid_cross_entropy_with_logits.py
  83. +4 -5 tests/ut/python/parallel/test_softmax_cross_entropy_loss.py
  84. +4 -5 tests/ut/python/parallel/test_split_grad_sens.py
  85. +8 -8 tests/ut/python/parallel/test_squeeze_info.py
  86. +3 -4 tests/ut/python/parallel/test_sum_as_loss.py
  87. +0 -2 tests/ut/python/parallel/test_transpose.py
  88. +4 -4 tests/ut/python/parallel/test_two_matmul.py
  89. +0 -1 tests/ut/python/parallel/test_two_weights_parameter.py
  90. +2 -2 tests/ut/python/parallel/test_virtual_dataset_3_input.py
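
The diffs below repeat a small set of pylint cleanups across the test suite: standard-library imports moved ahead of third-party ones, redundant parentheses dropped from assert statements, unused function arguments and locals removed or bound to _, mutable default arguments replaced with None, and parameter names that shadow builtins (such as input) renamed with a trailing underscore. As a hedged, illustrative sketch of the before/after pattern (none of the names below come from the repository):

# Illustrative sketch of the pylint fixes applied in this PR; run_case and its
# arguments are hypothetical, not taken from the repository.
#
# --- before: triggers wrong-import-order, redefined-builtin,
# --- dangerous-default-value, unused-variable and superfluous-parens warnings.
#
# import numpy as np
# import os
#
# def run_case(input, strategies=[]):
#     size = 3                      # assigned but never used
#     ret = os.system("sh run_case.sh")
#     assert (ret == 0)
#
# --- after: the shape of the cleaned-up code.
import os                           # standard library first

import numpy as np                  # third-party afterwards


def run_case(input_, strategies=None):   # no builtin shadowing, no mutable default
    if strategies is None:
        strategies = []
    _ = np.asarray(input_)          # intentionally unused value made explicit
    ret = os.system("sh run_case.sh")
    assert ret == 0                 # no redundant parentheses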

+154 -154 tests/st/auto_parallel/onehot_model_parallel.py

@@ -1,154 +1,154 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import os
import pytest
import mindspore as ms
import mindspore.communication.management as distributedTool
import mindspore.context as context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
device_num = 2
device_id = int(os.getenv('DEVICE_ID'))
rank_id = 0
def setup_module():
global device_num
global rank_id
np.random.seed(0)
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
context.set_context(device_id=device_id)
distributedTool.init()
device_num = distributedTool.get_group_size()
rank_id = distributedTool.get_rank()
context.set_auto_parallel_context(device_num=device_num,
global_rank=rank_id)
def teardown_module():
distributedTool.release()
class Onehot(Cell):
def __init__(self, axis=-1, depth=1, on_value=1.0, off_value=0.0, strategy=None):
super(Onehot, self).__init__()
trans_stra = None
if strategy:
trans_stra = (strategy[0],)
self.onehot = P.OneHot().set_strategy(strategy=strategy)
self.depth = depth
self.on_value = Tensor(on_value, ms.float32)
self.off_value = Tensor(off_value, ms.float32)
self.transpose = P.Transpose().set_strategy(strategy=trans_stra)
self.sub = P.Sub().set_strategy(strategy=((1, 1), (1, 1)))
def construct(self, input, indices):
x = self.onehot(indices, self.depth, self.on_value, self.off_value)
x = self.transpose(x, (1, 0))
x = self.sub(input, x)
return x
class DataGenerator():
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def generate_data(self, shape):
data = np.random.rand(*shape)
return data
def input_data(self, shape):
data = (self.generate_data(shape) * 2).astype(np.float32)
stra = [1] * len(shape)
stra[0] = device_num
datas = self.get_parallel_blocks(data, stra)
return Tensor(data), Tensor(datas[rank_id])
def label_data(self, shape, classes):
data = (self.generate_data(shape) * (classes - 1)).astype(np.int32)
stra = [1] * len(shape)
stra[0] = device_num
datas = self.get_parallel_blocks(data, stra)
return Tensor(data), Tensor(datas[rank_id])
class OneHotFactory:
def __init__(self, batch_size, classes, on_value=1.0, off_value=0.0, axis=None, strategy=None):
dataGen = DataGenerator()
self.input_full, self.input_part = dataGen.input_data((classes, batch_size))
self.label_full, self.label_part = dataGen.label_data((batch_size,), classes)
self.depth = classes
self.on_value = on_value
self.off_value = off_value
self.axis = axis
self.strategy = strategy
def forward_mindspore_single_impl(self):
net = Onehot(axis=self.axis,
depth=self.depth,
on_value=self.on_value,
off_value=self.off_value)
out = net(self.input_full, self.label_full)
return out
def forward_mindspore_parallel_impl(self):
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net = Onehot(axis=self.axis,
depth=self.depth,
on_value=self.on_value,
off_value=self.off_value, strategy=self.strategy)
out = net.compile_and_run(self.input_full, self.label_full)
return out
def forward_cmp(self):
out_mindspore_single = self.forward_mindspore_single_impl().asnumpy()
context.reset_auto_parallel_context()
out_mindspore_parallel = self.forward_mindspore_parallel_impl().asnumpy()
context.reset_auto_parallel_context()
assert np.allclose(out_mindspore_single, out_mindspore_parallel, 0.0001, 0.0001)
def test_reid_onehot_forward_int32_128_depth1024_model_parallel():
fact = OneHotFactory(batch_size=128,
classes=1024,
on_value=1.000000,
off_value=0.000000,
axis=-1,
strategy=((1, device_num), (), ()))
fact.forward_cmp()
def test_reid_onehot_forward_int32_1024_depth128_model_parallel():
fact = OneHotFactory(batch_size=1024,
classes=128,
on_value=1.000000,
off_value=0.000000,
axis=-1,
strategy=((1, device_num), (), ()))
fact.forward_cmp()
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import os
import numpy as np
import mindspore as ms
import mindspore.communication.management as distributedTool
import mindspore.context as context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
device_num = 2
device_id = int(os.getenv('DEVICE_ID'))
rank_id = 0
def setup_module():
global device_num
global rank_id
np.random.seed(0)
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
context.set_context(device_id=device_id)
distributedTool.init()
device_num = distributedTool.get_group_size()
rank_id = distributedTool.get_rank()
context.set_auto_parallel_context(device_num=device_num,
global_rank=rank_id)
def teardown_module():
distributedTool.release()
class Onehot(Cell):
def __init__(self, axis=-1, depth=1, on_value=1.0, off_value=0.0, strategy=None):
super(Onehot, self).__init__()
trans_stra = None
if strategy:
trans_stra = (strategy[0],)
self.onehot = P.OneHot().set_strategy(strategy=strategy)
self.depth = depth
self.on_value = Tensor(on_value, ms.float32)
self.off_value = Tensor(off_value, ms.float32)
self.transpose = P.Transpose().set_strategy(strategy=trans_stra)
self.sub = P.Sub().set_strategy(strategy=((1, 1), (1, 1)))
self.axis = axis
def construct(self, input_, indices):
x = self.onehot(indices, self.depth, self.on_value, self.off_value)
x = self.transpose(x, (1, 0))
x = self.sub(input_, x)
return x
class DataGenerator():
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def generate_data(self, shape):
data = np.random.rand(*shape)
return data
def input_data(self, shape):
data = (self.generate_data(shape) * 2).astype(np.float32)
stra = [1] * len(shape)
stra[0] = device_num
datas = self.get_parallel_blocks(data, stra)
return Tensor(data), Tensor(datas[rank_id])
def label_data(self, shape, classes):
data = (self.generate_data(shape) * (classes - 1)).astype(np.int32)
stra = [1] * len(shape)
stra[0] = device_num
datas = self.get_parallel_blocks(data, stra)
return Tensor(data), Tensor(datas[rank_id])
class OneHotFactory:
def __init__(self, batch_size, classes, on_value=1.0, off_value=0.0, axis=None, strategy=None):
data_gen = DataGenerator()
self.input_full, self.input_part = data_gen.input_data((classes, batch_size))
self.label_full, self.label_part = data_gen.label_data((batch_size,), classes)
self.depth = classes
self.on_value = on_value
self.off_value = off_value
self.axis = axis
self.strategy = strategy
def forward_mindspore_single_impl(self):
net = Onehot(axis=self.axis,
depth=self.depth,
on_value=self.on_value,
off_value=self.off_value)
out = net(self.input_full, self.label_full)
return out
def forward_mindspore_parallel_impl(self):
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net = Onehot(axis=self.axis,
depth=self.depth,
on_value=self.on_value,
off_value=self.off_value, strategy=self.strategy)
out = net.compile_and_run(self.input_full, self.label_full)
return out
def forward_cmp(self):
out_mindspore_single = self.forward_mindspore_single_impl().asnumpy()
context.reset_auto_parallel_context()
out_mindspore_parallel = self.forward_mindspore_parallel_impl().asnumpy()
context.reset_auto_parallel_context()
assert np.allclose(out_mindspore_single, out_mindspore_parallel, 0.0001, 0.0001)
def test_reid_onehot_forward_int32_128_depth1024_model_parallel():
fact = OneHotFactory(batch_size=128,
classes=1024,
on_value=1.000000,
off_value=0.000000,
axis=-1,
strategy=((1, device_num), (), ()))
fact.forward_cmp()
def test_reid_onehot_forward_int32_1024_depth128_model_parallel():
fact = OneHotFactory(batch_size=1024,
classes=128,
on_value=1.000000,
off_value=0.000000,
axis=-1,
strategy=((1, device_num), (), ()))
fact.forward_cmp()
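
In this file the notable pylint changes are the reordered imports, the removal of the unused pytest import, and renaming the construct parameter from input to input_, since input shadows the Python builtin (W0622, redefined-builtin). A minimal standalone sketch of that rename, using hypothetical functions rather than the Cell above:

# Hedged sketch of the redefined-builtin fix; scale/scale_fixed are stand-ins.
def scale(input, factor):
    # The parameter name hides the builtin input() inside this function,
    # which is what pylint's W0622 check flags.
    return [x * factor for x in input]


def scale_fixed(input_, factor):
    # Renaming to input_ keeps the call sites readable and leaves the
    # builtin input() untouched.
    return [x * factor for x in input_]


if __name__ == "__main__":
    print(scale([1, 2, 3], 2))        # [2, 4, 6]
    print(scale_fixed([1, 2, 3], 2))  # [2, 4, 6]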

+276 -275 tests/st/auto_parallel/soft_entropy_loss_expand_parallel.py

@@ -1,275 +1,276 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import os
import pytest
from numpy import allclose
import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common import dtype as mstype
from mindspore.common.parameter import ParameterTuple, Parameter
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.nn.optim.momentum import Momentum
from mindspore.ops import composite as C
from mindspore.ops import functional as F
from mindspore.ops import operations as P
from mindspore.train import Model, ParallelMode
from mindspore.train.callback import Callback
np.set_printoptions(threshold=np.inf)
device_num = 2
device_id = int(os.getenv('DEVICE_ID'))
rank_id = 0
embed = 128
classes = 32
batch_size = 32 * 2
MatmulParamShape = (classes, embed)
def setup_module():
global device_num
global rank_id
np.random.seed(0)
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
context.set_context(device_id=device_id)
distributedTool.init()
rank_id = distributedTool.get_rank()
device_num = distributedTool.get_group_size()
context.set_auto_parallel_context(device_num=device_num,
global_rank=device_id)
def teardown_module():
distributedTool.release()
class DataGenerator():
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def generate_data(self, shape):
size = np.cumprod(shape)[-1]
num_range = min(size, 1000)
data = (np.arange(0, size) % num_range) / num_range
data = np.reshape(data, shape)
return data
def input_data(self, shape):
data = (self.generate_data(shape) * 0.1).astype(np.float32)
stra = [1] * len(shape)
stra[0] = device_num
datas = self.get_parallel_blocks(data, stra)
return Tensor(data), Tensor(datas[rank_id])
def label_data(self, shape, embed):
data = (self.generate_data(shape) * (embed - 1)).astype(np.int32)
stra = [1] * len(shape)
stra[0] = device_num
datas = self.get_parallel_blocks(data, stra)
return Tensor(data), Tensor(datas[rank_id])
class Dataset():
def __init__(self, predict, label, length=1, input_num=2):
self.predict = predict
self.label = label
self.index = 0
self.length = length
self.input_num = input_num
def __iter__(self):
return self
def __next__(self):
if self.index >= self.length:
raise StopIteration
self.index += 1
if self.input_num == 2:
return self.predict, self.label
else:
return self.predict,
def reset(self):
self.index = 0
def get_dataset_size(self):
return self.length
def get_repeat_count(self):
return self.length
class ModelCallback(Callback):
def __init__(self):
super(ModelCallback, self).__init__()
self.loss_list = []
def epoch_end(self, run_context, *args):
cb_params = run_context.original_args()
result = cb_params.net_outputs
self.loss_list.append(result.asnumpy().mean())
class SoftmaxCrossEntropyExpand(Cell):
def __init__(self, sparse=False, stra_list=[]):
super(SoftmaxCrossEntropyExpand, self).__init__()
if len(stra_list) < 11:
stra_list = [None] * 11
self.exp = P.Exp()
self.reduce_sum = P.ReduceSum(keep_dims=True).set_strategy(strategy=stra_list[1])
self.onehot = P.OneHot().set_strategy(strategy=stra_list[2])
self.on_value = Tensor(1.0, mstype.float32)
self.off_value = Tensor(0.0, mstype.float32)
self.div = P.Div().set_strategy(strategy=stra_list[3])
self.log = P.Log().set_strategy(strategy=stra_list[4])
self.sum_cross_entropy = P.ReduceSum(keep_dims=False).set_strategy(strategy=stra_list[5])
self.mul = P.Mul().set_strategy(strategy=stra_list[6])
self.mul2 = P.Mul().set_strategy(strategy=stra_list[7])
self.cast = P.Cast()
self.reduce_mean = P.ReduceMean(keep_dims=False).set_strategy(strategy=stra_list[8])
self.sparse = sparse
self.reduce_max = P.ReduceMax(keep_dims=True).set_strategy(strategy=stra_list[9])
self.sub = P.Sub().set_strategy(strategy=stra_list[10])
def construct(self, logit, label):
logit_max = self.reduce_max(logit, -1)
exp = self.exp(self.sub(logit, logit_max))
exp_sum = self.reduce_sum(exp, -1)
softmax_result = self.div(exp, exp_sum)
if self.sparse:
label = self.onehot(label, F.shape(logit)[1], self.on_value, self.off_value)
softmax_result_log = self.log(softmax_result)
loss = self.sum_cross_entropy((self.mul(softmax_result_log, label)), -1)
loss = self.mul2(F.scalar_to_array(-1.0), loss)
loss = self.reduce_mean(loss, -1)
return loss
class MatmulNet(Cell):
def __init__(self, matmul_stra=None, loss_stra_list=[]):
super(MatmulNet, self).__init__()
self.matmul = P.MatMul(transpose_b=True).set_strategy(strategy=matmul_stra)
self.loss = SoftmaxCrossEntropyExpand(sparse=True, stra_list=loss_stra_list)
self.weight = Parameter(Tensor(np.ones(MatmulParamShape), dtype=ms.float32), name="weight")
def construct(self, x, label):
loss_input = self.matmul(x, self.weight)
out = self.loss(loss_input, label)
return out
class LossFactory():
def __init__(self):
dataGen = DataGenerator()
self.input_full, self.input_part = dataGen.input_data((batch_size, embed))
self.label_full, self.label_part = dataGen.label_data((batch_size,), embed)
def single_matmul_trains(self):
single_callback = ModelCallback()
net = MatmulNet()
optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
model = Model(net, optimizer=optimizer)
epoch_size = 6
dataset = Dataset(self.input_full, self.label_full)
model.train(epoch_size, dataset, callbacks=single_callback, dataset_sink_mode=False)
loss_value = np.array(single_callback.loss_list)
return loss_value
def data_parallel_matmul_trains(self):
parallel_callback = ModelCallback()
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net = MatmulNet()
optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
model = Model(net, optimizer=optimizer)
epoch_size = 6
dataset = Dataset(self.input_part, self.label_part)
model.train(epoch_size, dataset, callbacks=parallel_callback, dataset_sink_mode=False)
loss_value = np.array(parallel_callback.loss_list)
return loss_value
def model_parallel_matmul_trains(self):
parallel_callback = ModelCallback()
matmul_stra = ((1, 1), (device_num, 1))
reduce_max_stra = ((1, device_num),)
sub_stra = ((1, device_num), (1, 1))
exp_stra = ((1, device_num),)
reduce_sum_stra = ((1, device_num),)
div_stra = ((1, device_num), (1, 1))
log_stra = ((1, device_num),)
mul_stra = ((1, device_num), (1, device_num))
sum_cross_entropy_stra = ((1, device_num),)
mul2_stra = ((), (device_num,))
reduce_mean_stra = ((device_num,),)
onehot_stra = ((1, device_num), (), ())
loss_stra_list = [exp_stra, reduce_sum_stra, onehot_stra, div_stra, log_stra,
sum_cross_entropy_stra, mul_stra, mul2_stra, reduce_mean_stra, reduce_max_stra, sub_stra]
context.set_auto_parallel_context(parallel_mode="auto_parallel")
net = MatmulNet(matmul_stra=matmul_stra, loss_stra_list=loss_stra_list)
optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
model = Model(net, optimizer=optimizer)
epoch_size = 6
dataset = Dataset(self.input_part, self.label_part)
model.train(epoch_size, dataset, callbacks=parallel_callback, dataset_sink_mode=False)
loss_value = np.array(parallel_callback.loss_list)
return loss_value
def mix_parallel_matmul_trains(self):
parallel_callback = ModelCallback()
matmul_stra = ((device_num, 1), (1, 1))
reduce_max_stra = ((1, device_num),)
sub_stra = ((device_num, 1), (device_num, 1))
exp_stra = ((1, device_num),)
reduce_sum_stra = ((1, device_num),)
div_stra = ((1, device_num), (1, 1))
log_stra = ((1, device_num),)
mul_stra = ((1, device_num), (1, device_num))
sum_cross_entropy_stra = ((1, device_num),)
mul2_stra = ((), (device_num,))
reduce_mean_stra = ((device_num,),)
onehot_stra = ((1, device_num), (), ())
loss_stra_list = [exp_stra, reduce_sum_stra, onehot_stra, div_stra, log_stra,
sum_cross_entropy_stra, mul_stra, mul2_stra, reduce_mean_stra, reduce_max_stra, sub_stra]
context.set_auto_parallel_context(parallel_mode="auto_parallel")
net = MatmulNet(matmul_stra=matmul_stra, loss_stra_list=loss_stra_list)
optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
model = Model(net, optimizer=optimizer)
epoch_size = 6
dataset = Dataset(self.input_part, self.label_part)
model.train(epoch_size, dataset, callbacks=parallel_callback, dataset_sink_mode=False)
loss_value = np.array(parallel_callback.loss_list)
return loss_value
def test_all_trains():
loss_factory = LossFactory()
context.reset_auto_parallel_context()
single_loss = loss_factory.single_matmul_trains()
model_parallel_loss = loss_factory.model_parallel_matmul_trains()
mix_parallel_loss = loss_factory.mix_parallel_matmul_trains()
assert allclose(single_loss, model_parallel_loss)
assert allclose(single_loss, mix_parallel_loss)
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

import os
import numpy as np
from numpy import allclose

import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common import dtype as mstype
from mindspore.common.parameter import Parameter
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.nn.optim.momentum import Momentum
from mindspore.ops import functional as F
from mindspore.ops import operations as P
from mindspore.train import Model
from mindspore.train.callback import Callback

np.set_printoptions(threshold=np.inf)
device_num = 2
device_id = int(os.getenv('DEVICE_ID'))
rank_id = 0
embed = 128
classes = 32
batch_size = 32 * 2
MatmulParamShape = (classes, embed)


def setup_module():
global device_num
global rank_id
np.random.seed(0)
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
context.set_context(device_id=device_id)
distributedTool.init()
rank_id = distributedTool.get_rank()
device_num = distributedTool.get_group_size()
context.set_auto_parallel_context(device_num=device_num,
global_rank=device_id)


def teardown_module():
distributedTool.release()


class DataGenerator():
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks

def generate_data(self, shape):
size = np.cumprod(shape)[-1]
num_range = min(size, 1000)
data = (np.arange(0, size) % num_range) / num_range
data = np.reshape(data, shape)
return data

def input_data(self, shape):
data = (self.generate_data(shape) * 0.1).astype(np.float32)
stra = [1] * len(shape)
stra[0] = device_num
datas = self.get_parallel_blocks(data, stra)
return Tensor(data), Tensor(datas[rank_id])

def label_data(self, shape, embed_):
data = (self.generate_data(shape) * (embed_ - 1)).astype(np.int32)
stra = [1] * len(shape)
stra[0] = device_num
datas = self.get_parallel_blocks(data, stra)
return Tensor(data), Tensor(datas[rank_id])


class Dataset():
def __init__(self, predict, label, length=1, input_num=2):
self.predict = predict
self.label = label
self.index = 0
self.length = length
self.input_num = input_num

def __iter__(self):
return self

def __next__(self):
if self.index >= self.length:
raise StopIteration
self.index += 1
if self.input_num == 2:
return (self.predict, self.label)
return (self.predict,)

def reset(self):
self.index = 0

def get_dataset_size(self):
return self.length

def get_repeat_count(self):
return self.length


class ModelCallback(Callback):
def __init__(self):
super(ModelCallback, self).__init__()
self.loss_list = []

def epoch_end(self, run_context):
cb_params = run_context.original_args()
result = cb_params.net_outputs
self.loss_list.append(result.asnumpy().mean())


class SoftmaxCrossEntropyExpand(Cell):
def __init__(self, sparse=False, stra_list=None):
super(SoftmaxCrossEntropyExpand, self).__init__()
if stra_list is None:
stra_list = []
if len(stra_list) < 11:
stra_list = [None] * 11
self.exp = P.Exp()
self.reduce_sum = P.ReduceSum(keep_dims=True).set_strategy(strategy=stra_list[1])
self.onehot = P.OneHot().set_strategy(strategy=stra_list[2])
self.on_value = Tensor(1.0, mstype.float32)
self.off_value = Tensor(0.0, mstype.float32)
self.div = P.Div().set_strategy(strategy=stra_list[3])
self.log = P.Log().set_strategy(strategy=stra_list[4])
self.sum_cross_entropy = P.ReduceSum(keep_dims=False).set_strategy(strategy=stra_list[5])
self.mul = P.Mul().set_strategy(strategy=stra_list[6])
self.mul2 = P.Mul().set_strategy(strategy=stra_list[7])
self.cast = P.Cast()
self.reduce_mean = P.ReduceMean(keep_dims=False).set_strategy(strategy=stra_list[8])
self.sparse = sparse
self.reduce_max = P.ReduceMax(keep_dims=True).set_strategy(strategy=stra_list[9])
self.sub = P.Sub().set_strategy(strategy=stra_list[10])

def construct(self, logit, label):
logit_max = self.reduce_max(logit, -1)
exp = self.exp(self.sub(logit, logit_max))
exp_sum = self.reduce_sum(exp, -1)
softmax_result = self.div(exp, exp_sum)
if self.sparse:
label = self.onehot(label, F.shape(logit)[1], self.on_value, self.off_value)
softmax_result_log = self.log(softmax_result)
loss = self.sum_cross_entropy((self.mul(softmax_result_log, label)), -1)
loss = self.mul2(F.scalar_to_array(-1.0), loss)
loss = self.reduce_mean(loss, -1)
return loss


class MatmulNet(Cell):
def __init__(self, matmul_stra=None, loss_stra_list=None):
super(MatmulNet, self).__init__()
if loss_stra_list is None:
loss_stra_list = []
self.matmul = P.MatMul(transpose_b=True).set_strategy(strategy=matmul_stra)
self.loss = SoftmaxCrossEntropyExpand(sparse=True, stra_list=loss_stra_list)
self.weight = Parameter(Tensor(np.ones(MatmulParamShape), dtype=ms.float32), name="weight")

def construct(self, x, label):
loss_input = self.matmul(x, self.weight)
out = self.loss(loss_input, label)
return out


class LossFactory():
def __init__(self):
data_gen = DataGenerator()
self.input_full, self.input_part = data_gen.input_data((batch_size, embed))
self.label_full, self.label_part = data_gen.label_data((batch_size,), embed)

def single_matmul_trains(self):
single_callback = ModelCallback()
net = MatmulNet()
optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
model = Model(net, optimizer=optimizer)
epoch_size = 6
dataset = Dataset(self.input_full, self.label_full)
model.train(epoch_size, dataset, callbacks=single_callback, dataset_sink_mode=False)
loss_value = np.array(single_callback.loss_list)
return loss_value

def data_parallel_matmul_trains(self):
parallel_callback = ModelCallback()
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net = MatmulNet()
optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
model = Model(net, optimizer=optimizer)
epoch_size = 6
dataset = Dataset(self.input_part, self.label_part)
model.train(epoch_size, dataset, callbacks=parallel_callback, dataset_sink_mode=False)
loss_value = np.array(parallel_callback.loss_list)
return loss_value

def model_parallel_matmul_trains(self):
parallel_callback = ModelCallback()
matmul_stra = ((1, 1), (device_num, 1))
reduce_max_stra = ((1, device_num),)
sub_stra = ((1, device_num), (1, 1))
exp_stra = ((1, device_num),)
reduce_sum_stra = ((1, device_num),)
div_stra = ((1, device_num), (1, 1))
log_stra = ((1, device_num),)
mul_stra = ((1, device_num), (1, device_num))
sum_cross_entropy_stra = ((1, device_num),)
mul2_stra = ((), (device_num,))
reduce_mean_stra = ((device_num,),)
onehot_stra = ((1, device_num), (), ())
loss_stra_list = [exp_stra, reduce_sum_stra, onehot_stra, div_stra, log_stra,
sum_cross_entropy_stra, mul_stra, mul2_stra, reduce_mean_stra, reduce_max_stra, sub_stra]
context.set_auto_parallel_context(parallel_mode="auto_parallel")
net = MatmulNet(matmul_stra=matmul_stra, loss_stra_list=loss_stra_list)
optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
model = Model(net, optimizer=optimizer)
epoch_size = 6
dataset = Dataset(self.input_part, self.label_part)
model.train(epoch_size, dataset, callbacks=parallel_callback, dataset_sink_mode=False)
loss_value = np.array(parallel_callback.loss_list)
return loss_value

def mix_parallel_matmul_trains(self):
parallel_callback = ModelCallback()
matmul_stra = ((device_num, 1), (1, 1))
reduce_max_stra = ((1, device_num),)
sub_stra = ((device_num, 1), (device_num, 1))
exp_stra = ((1, device_num),)
reduce_sum_stra = ((1, device_num),)
div_stra = ((1, device_num), (1, 1))
log_stra = ((1, device_num),)
mul_stra = ((1, device_num), (1, device_num))
sum_cross_entropy_stra = ((1, device_num),)
mul2_stra = ((), (device_num,))
reduce_mean_stra = ((device_num,),)
onehot_stra = ((1, device_num), (), ())
loss_stra_list = [exp_stra, reduce_sum_stra, onehot_stra, div_stra, log_stra,
sum_cross_entropy_stra, mul_stra, mul2_stra, reduce_mean_stra, reduce_max_stra, sub_stra]
context.set_auto_parallel_context(parallel_mode="auto_parallel")
net = MatmulNet(matmul_stra=matmul_stra, loss_stra_list=loss_stra_list)
optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
model = Model(net, optimizer=optimizer)
epoch_size = 6
dataset = Dataset(self.input_part, self.label_part)
model.train(epoch_size, dataset, callbacks=parallel_callback, dataset_sink_mode=False)
loss_value = np.array(parallel_callback.loss_list)
return loss_value


def test_all_trains():
loss_factory = LossFactory()
context.reset_auto_parallel_context()
single_loss = loss_factory.single_matmul_trains()
model_parallel_loss = loss_factory.model_parallel_matmul_trains()
mix_parallel_loss = loss_factory.mix_parallel_matmul_trains()
assert allclose(single_loss, model_parallel_loss)
assert allclose(single_loss, mix_parallel_loss)
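
The SoftmaxCrossEntropyExpand and MatmulNet constructors above replace stra_list=[] and loss_stra_list=[] with None defaults plus an in-body fallback, which is the standard fix for pylint's dangerous-default-value (W0102) warning. A minimal sketch of why the mutable default is a problem, with hypothetical function names:

# Hedged sketch of the W0102 fix; append_item_* are illustrative only.
def append_item_buggy(item, bucket=[]):
    # The same list object is reused for every call that relies on the
    # default, so results leak between calls.
    bucket.append(item)
    return bucket


def append_item_fixed(item, bucket=None):
    # A fresh list is created per call when no bucket is supplied.
    if bucket is None:
        bucket = []
    bucket.append(item)
    return bucket


if __name__ == "__main__":
    print(append_item_buggy(1))   # [1]
    print(append_item_buggy(2))   # [1, 2]  <- shared state across calls
    print(append_item_fixed(1))   # [1]
    print(append_item_fixed(2))   # [2]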

+26 -26 tests/st/auto_parallel/test_expand_loss.py

@@ -1,26 +1,26 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import os
import pytest
@pytest.mark.level0
@pytest.mark.platform_x86_ascend_training
@pytest.mark.platform_arm_ascend_training
@pytest.mark.env_single
def test_expand_loss():
    sh_path = os.path.split(os.path.realpath(__file__))[0]
    ret = os.system(f"sh {sh_path}/run_auto_parallel_loss_expand.sh")
    assert (ret == 0)
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import os
import pytest
@pytest.mark.level0
@pytest.mark.platform_x86_ascend_training
@pytest.mark.platform_arm_ascend_training
@pytest.mark.env_single
def test_expand_loss():
    sh_path = os.path.split(os.path.realpath(__file__))[0]
    ret = os.system(f"sh {sh_path}/run_auto_parallel_loss_expand.sh")
    assert ret == 0

+21 -22 tests/st/auto_parallel/test_model_parallel_onehot.py

@@ -1,22 +1,21 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import os
import pytest
def test_expand_loss():
    ret = os.system("sh run_onehot_model_parallel.sh")
    assert (ret == 0)
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

import os


def test_expand_loss():
    ret = os.system("sh run_onehot_model_parallel.sh")
    assert ret == 0

+16 -19 tests/st/auto_parallel/test_resnet50_expand_loss_2p.py

@@ -13,8 +13,8 @@
# limitations under the License.
# ============================================================================

import numpy as np
import os
import numpy as np
import pytest

import mindspore.common.dtype as mstype
@@ -37,31 +37,29 @@ init()
context.set_auto_parallel_context(mirror_mean=True, parallel_mode=ParallelMode.AUTO_PARALLEL)


def weight_variable(shape, factor=0.1):
def weight_variable():
    return One()


def _conv3x3(in_channels, out_channels, stride=1, padding=0, pad_mode='same'):
    init_value = weight_variable((out_channels, in_channels, 3, 3))
    init_value = weight_variable()
    return nn.Conv2d(in_channels, out_channels,
                     kernel_size=3, stride=stride, padding=padding, pad_mode=pad_mode, weight_init=init_value)


def _conv1x1(in_channels, out_channels, stride=1, padding=0, pad_mode='same'):
    init_value = weight_variable((out_channels, in_channels, 1, 1))
    init_value = weight_variable()
    return nn.Conv2d(in_channels, out_channels,
                     kernel_size=1, stride=stride, padding=padding, pad_mode=pad_mode, weight_init=init_value)


def _conv7x7(in_channels, out_channels, stride=1, padding=0, pad_mode='same'):
    init_value = weight_variable((out_channels, in_channels, 7, 7))
    init_value = weight_variable()
    return nn.Conv2d(in_channels, out_channels,
                     kernel_size=7, stride=stride, padding=padding, pad_mode=pad_mode, weight_init=init_value)


def _fused_bn(channels, momentum=0.9):
    init_weight = weight_variable((channels,))
    init_bias = weight_variable((channels,))
    return nn.BatchNorm2d(channels, momentum=momentum)


@@ -210,8 +208,8 @@ class ResNet(nn.Cell):

        self.mean = P.ReduceMean(keep_dims=True)
        self.end_point = nn.Dense(2048, num_classes, has_bias=True,
                                  weight_init=weight_variable((num_classes, 2048)),
                                  bias_init=weight_variable((num_classes,)))
                                  weight_init=weight_variable(),
                                  bias_init=weight_variable())
        self.squeeze = P.Squeeze()
        self.cast = P.Cast()

@@ -345,9 +343,8 @@ class Dataset():
            raise StopIteration
        self.index += 1
        if self.input_num == 2:
            return self.predict, self.label
        else:
            return self.predict,
            return (self.predict, self.label)
        return (self.predict,)

    def reset(self):
        self.index = 0
@@ -364,7 +361,7 @@ class ModelCallback(Callback):
        super(ModelCallback, self).__init__()
        self.loss_list = []

    def epoch_end(self, run_context, *args):
    def epoch_end(self, run_context):
        cb_params = run_context.original_args()
        result = cb_params.net_outputs
        self.loss_list.append(result.asnumpy().mean())
@@ -376,9 +373,9 @@ class ModelCallback(Callback):
def test_train_feed(num_classes=8192):
    set_algo_parameters(elementwise_op_strategy_follow=True)
    parallel_callback = ModelCallback()
    dataGen = DataGenerator()
    input_full, input_part = dataGen.input_data((32 * 2, 3, 224, 224))
    label_full, label_part = dataGen.label_data((32 * 2,))
    data_gen = DataGenerator()
    _, input_part = data_gen.input_data((32 * 2, 3, 224, 224))
    _, label_part = data_gen.label_data((32 * 2,))
    dataset = Dataset(input_part, label_part)
    net = resnet50(num_classes)
    loss = SoftmaxCrossEntropyExpand(sparse=True)
@@ -396,9 +393,9 @@ def test_train_feed(num_classes=8192):
def test_train_feed2(num_classes=1001):
    set_algo_parameters(elementwise_op_strategy_follow=True)
    parallel_callback = ModelCallback()
    dataGen = DataGenerator()
    input_full, input_part = dataGen.input_data((32 * 2, 3, 224, 224))
    label_full, label_part = dataGen.label_data((32 * 2,))
    data_gen = DataGenerator()
    _, input_part = data_gen.input_data((32 * 2, 3, 224, 224))
    _, label_part = data_gen.label_data((32 * 2,))
    dataset = Dataset(input_part, label_part)
    net = resnet50(num_classes)
    loss = SoftmaxCrossEntropyExpand(sparse=True)


+17 -17 tests/ut/python/communication/__init__.py

@@ -1,17 +1,17 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
sys.path.append("../../..")
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
sys.path.append("../../..")

+0 -1 tests/ut/python/communication/test_comm.py

@@ -25,7 +25,6 @@ from mindspore.nn import Dense
from mindspore.nn import Momentum
from mindspore.nn import ReLU
from mindspore.nn import TrainOneStepCell, WithLossCell
from mindspore.ops.operations import Split
from mindspore.ops.operations.comm_ops import AllReduce, AllGather, _AlltoAll, ReduceOp, ReduceScatter
from mindspore.ops.operations.comm_ops import Broadcast



+1 -2 tests/ut/python/communication/test_data_parallel_lenet.py

@@ -16,8 +16,8 @@
@File : test_data_parallel_lenet.py
@Desc : test data parallel lenet
"""
import numpy as np
import os
import numpy as np

import mindspore.context as context
import mindspore.nn as nn
@@ -80,7 +80,6 @@ def test_lenet5_train_step_training_pynative():
    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                      device_num=8, mirror_mean=True)
    size = 3
    predict = Tensor(np.ones([1, 1, 32, 32]).astype(np.float32) * 0.01)
    label = Tensor(np.zeros([1, 10]).astype(np.float32))
    DatasetLenet(predict, label, 2)


+1 -1 tests/ut/python/parallel/__init__.py

@@ -19,7 +19,7 @@ from mindspore.parallel._utils import _reset_op_id
from mindspore.parallel.algo_parameter_config import reset_algo_parameters


def setup_module(module):
def setup_module():
    auto_parallel_context().set_enable_all_reduce_fusion(enable_all_reduce_fusion=True)
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False)
    reset_cost_model_context()
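
The change above drops the unused module parameter from setup_module, which silences pylint's unused-argument (W0613) warning; pytest invokes xunit-style module setup hooks whether or not they declare that argument. A hedged, standalone sketch (the test contents are hypothetical):

# Hypothetical pytest module illustrating the setup_module signature change.
_STATE = {}


def setup_module():
    # The old signature took a `module` argument that was never used, which is
    # exactly what W0613 flags; pytest accepts the zero-argument form as well.
    _STATE["ready"] = True


def teardown_module():
    _STATE.clear()


def test_setup_ran():
    # Passes when run under pytest, which calls setup_module() first.
    assert _STATE.get("ready") is True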


+178 -178 tests/ut/python/parallel/parallel_end_to_end/add_relu/_test_add_relu_parallel_4p.py

@@ -1,178 +1,178 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import os
import pytest
import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class AddRelu(Cell):
def __init__(self, strategy0=None, strategy1=None):
super(AddRelu, self).__init__()
self.add = P.TensorAdd(strategy=strategy0)
self.relu = P.ReLU(strategy=strategy1)
def construct(self, x, z):
out = self.add(x, z)
return self.relu(out)
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, x, y, output_grad):
return grad_all_with_sens(self.network)(x, y, output_grad)
class AddReluFactory:
def __init__(self, input_shape, strategy0, strategy1):
prefix = ""
size = 1
for s in input_shape:
prefix = prefix + str(s)
size = size * s
self.prefix = prefix
number_range = min(1000, size)
self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype(
np.float32)
self.input_np2 = 1.0
self.output_grad_np = np.reshape((np.arange(0, size) % (number_range - 10) - number_range / 2) * 0.1,
input_shape).astype(np.float32)
self.strategy0 = strategy0
self.strategy1 = strategy1
need_dev_num = 1
need_dev_num_ = 1
for s in strategy0[1]:
need_dev_num = need_dev_num * s
for s in strategy1[1]:
need_dev_num_ = need_dev_num_ * s
self.x_id = device_id % need_dev_num
self.y_id = device_id % need_dev_num
self.out_id = device_id % need_dev_num_
def forward_mindspore_impl(self):
net = AddRelu()
x = Tensor(self.input_np1)
y = Tensor(self.input_np2, ms.float32)
out = net(x, y)
return out.asnumpy()
def forward_mindspore_parallel_impl(self):
net = AddRelu(strategy0=self.strategy0, strategy1=self.strategy1)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
x = Tensor(self.input_np1)
y = Tensor(self.input_np2, ms.float32)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(self.input_np2, ms.float32)
out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
return out.asnumpy()
def grad_mindspore_impl(self):
output_grad = Tensor(self.output_grad_np)
x = Tensor(self.input_np1)
y = Tensor(self.input_np2, ms.float32)
net = AddRelu()
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, output_grad)
return input_grad
def grad_mindspore_parallel_impl(self):
output_grads = self.get_parallel_blocks(self.output_grad_np, self.strategy1[1])
output_grad = Tensor(output_grads[self.out_id])
x = Tensor(self.input_np1)
y = Tensor(self.input_np2, ms.float32)
net = AddRelu(strategy0=self.strategy0, strategy1=self.strategy1)
grad_net = Grad(net)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
grad_net.set_auto_parallel()
grad_net.set_train()
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(self.input_np2, ms.float32)
input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad],
parallel_inputs_run=[x1, y1, output_grad])
return input_grad
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def forward_cmp(self):
out_mindspore = self.forward_mindspore_impl()
out_mindspore_parallel = self.forward_mindspore_parallel_impl()
out_blocks = self.get_parallel_blocks(out_mindspore, self.strategy1[1])
assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.0001, 0.001)
def grad_cmp(self):
input_grad_mindspore = self.grad_mindspore_impl()
input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
assert np.allclose(input_grad_mindspore1, input_grad_mindspore_parallel1, 0.0001, 0.0001)
@pytest.mark.reid_forward
def test_reid_add_relu_input_256_64():
stra0 = (0, (2, 2), ())
stra1 = (0, (2, 2))
fact = AddReluFactory(input_shape=(256, 64), strategy0=stra0, strategy1=stra1)
fact.forward_cmp()
@pytest.mark.reid_grad
def test_reid_grad_add_relu_input_256_64():
stra0 = (0, (2, 2), ())
stra1 = (0, (2, 2))
fact = AddReluFactory(input_shape=(256, 64), strategy0=stra0, strategy1=stra1)
fact.grad_cmp()
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import pytest
import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class AddRelu(Cell):
def __init__(self, strategy0=None, strategy1=None):
super(AddRelu, self).__init__()
self.add = P.TensorAdd(strategy=strategy0)
self.relu = P.ReLU(strategy=strategy1)
def construct(self, x, z):
out = self.add(x, z)
return self.relu(out)
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, x, y, output_grad):
return grad_all_with_sens(self.network)(x, y, output_grad)
class AddReluFactory:
def __init__(self, input_shape, strategy0, strategy1):
prefix = ""
size = 1
for s in input_shape:
prefix = prefix + str(s)
size = size * s
self.prefix = prefix
number_range = min(1000, size)
self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype(
np.float32)
self.input_np2 = 1.0
self.output_grad_np = np.reshape((np.arange(0, size) % (number_range - 10) - number_range / 2) * 0.1,
input_shape).astype(np.float32)
self.strategy0 = strategy0
self.strategy1 = strategy1
need_dev_num = 1
need_dev_num_ = 1
for s in strategy0[1]:
need_dev_num = need_dev_num * s
for s in strategy1[1]:
need_dev_num_ = need_dev_num_ * s
self.x_id = device_id % need_dev_num
self.y_id = device_id % need_dev_num
self.out_id = device_id % need_dev_num_
def forward_mindspore_impl(self):
net = AddRelu()
x = Tensor(self.input_np1)
y = Tensor(self.input_np2, ms.float32)
out = net(x, y)
return out.asnumpy()
def forward_mindspore_parallel_impl(self):
net = AddRelu(strategy0=self.strategy0, strategy1=self.strategy1)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
x = Tensor(self.input_np1)
y = Tensor(self.input_np2, ms.float32)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(self.input_np2, ms.float32)
out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
return out.asnumpy()
def grad_mindspore_impl(self):
output_grad = Tensor(self.output_grad_np)
x = Tensor(self.input_np1)
y = Tensor(self.input_np2, ms.float32)
net = AddRelu()
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, output_grad)
return input_grad
def grad_mindspore_parallel_impl(self):
output_grads = self.get_parallel_blocks(self.output_grad_np, self.strategy1[1])
output_grad = Tensor(output_grads[self.out_id])
x = Tensor(self.input_np1)
y = Tensor(self.input_np2, ms.float32)
net = AddRelu(strategy0=self.strategy0, strategy1=self.strategy1)
grad_net = Grad(net)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
grad_net.set_auto_parallel()
grad_net.set_train()
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(self.input_np2, ms.float32)
input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad],
parallel_inputs_run=[x1, y1, output_grad])
return input_grad
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def forward_cmp(self):
out_mindspore = self.forward_mindspore_impl()
out_mindspore_parallel = self.forward_mindspore_parallel_impl()
out_blocks = self.get_parallel_blocks(out_mindspore, self.strategy1[1])
assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.0001, 0.001)
def grad_cmp(self):
input_grad_mindspore = self.grad_mindspore_impl()
input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
_ = input_grad_mindspore[0].asnumpy()
input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
_ = input_grad_mindspore_parallel[0].asnumpy()
input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
assert np.allclose(input_grad_mindspore1, input_grad_mindspore_parallel1, 0.0001, 0.0001)
@pytest.mark.reid_forward
def test_reid_add_relu_input_256_64():
stra0 = (0, (2, 2), ())
stra1 = (0, (2, 2))
fact = AddReluFactory(input_shape=(256, 64), strategy0=stra0, strategy1=stra1)
fact.forward_cmp()
@pytest.mark.reid_grad
def test_reid_grad_add_relu_input_256_64():
stra0 = (0, (2, 2), ())
stra1 = (0, (2, 2))
fact = AddReluFactory(input_shape=(256, 64), strategy0=stra0, strategy1=stra1)
fact.grad_cmp()
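
A note on the sharding these AddRelu cases rely on: the factory compares a single-device run against the semi_auto_parallel run by slicing every host array with get_parallel_blocks and selecting the local slice via device_id % need_dev_num. Below is a standalone NumPy reproduction of that slicing, included only as an illustration, assuming the (256, 64) input and (2, 2) strategy from test_reid_add_relu_input_256_64 and an illustrative RANK_ID of 3.

# Standalone NumPy sketch of the slicing used by AddReluFactory (assumed example values).
import numpy as np

def get_parallel_blocks(input_, strategy):
    # Split along axis 0 into strategy[0] pieces, then each piece along axis 1, and so on;
    # the resulting shard order is row-major over the device layout.
    blocks = [input_]
    for axis, cuts in enumerate(strategy):
        temp = []
        while blocks:
            temp.extend(np.split(blocks.pop(0), cuts, axis=axis))
        blocks = temp
    return blocks

full = np.arange(256 * 64, dtype=np.float32).reshape(256, 64)
shards = get_parallel_blocks(full, (2, 2))     # strategy0[1] in the tests above
assert len(shards) == 4 and shards[0].shape == (128, 32)
device_id = 3                                  # illustrative RANK_ID
local = shards[device_id % len(shards)]        # mirrors x_id = device_id % need_dev_num
print(local.shape)                             # (128, 32)

Because axis 0 is cut before axis 1, the shards come out in row-major order over the 2 x 2 device layout, which is why x_id and out_id can both be plain modulo indices.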

+356  -356   tests/ut/python/parallel/parallel_end_to_end/batch_parallel/_test_conv2d_parallel_4p.py

@@ -1,356 +1,356 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import os
from numpy import allclose
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore._checkparam import check_bool, twice
from mindspore.common.initializer import initializer
from mindspore.common.parameter import Parameter
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class _Conv(Cell):
r"""Applies a N-D convolution over an input signal composed of several input
planes.
"""
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
pad_mode,
padding,
dilation,
group,
has_bias,
weight_init,
bias_init):
super(_Conv, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = kernel_size
self.stride = stride
self.pad_mode = pad_mode
self.padding = padding
self.dilation = dilation
self.group = group
self.has_bias = has_bias
if not (isinstance(in_channels, int) and in_channels > 0):
raise ValueError('Attr \'in_channels\' of \'Conv2D\' Op passed '
+ str(in_channels) + ', should be a int and greater than 0.')
if (not isinstance(kernel_size, tuple)) or len(kernel_size) != 2 or \
(not isinstance(kernel_size[0], int)) or (not isinstance(kernel_size[1], int)) or \
kernel_size[0] < 1 or kernel_size[1] < 1:
raise ValueError('Attr \'kernel_size\' of \'Conv2D\' Op passed '
+ str(self.kernel_size) + ', should be a int or tuple and equal to or greater than 1.')
if in_channels % group != 0:
raise ValueError('Attr \'in_channels\' of \'Conv2D\' Op must be divisible by '
'attr \'group\' of \'Conv2D\' Op.')
if out_channels % group != 0:
raise ValueError('Attr \'out_channels\' of \'Conv2D\' Op must be divisible by '
'attr \'group\' of \'Conv2D\' Op.')
self.weight = Parameter(initializer(
weight_init, [out_channels, in_channels // group, *kernel_size]), name='weight')
if check_bool(has_bias):
self.bias = Parameter(initializer(
bias_init, [out_channels]), name='bias')
else:
if bias_init != 'zeros':
print("Value of 'has_bias' is False, value of 'bias_init' will be ignored.")
self.bias = None
def construct(self, *inputs):
raise NotImplementedError
class Conv2d(_Conv):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
pad_mode='same',
padding=0,
dilation=1,
group=1,
has_bias=False,
weight_init='normal',
bias_init='zeros',
strategy=None):
kernel_size = twice(kernel_size)
super(Conv2d, self).__init__(
in_channels,
out_channels,
kernel_size,
stride,
pad_mode,
padding,
dilation,
group,
has_bias,
weight_init,
bias_init)
self.add = P.TensorAdd(strategy)
self.conv2d = P.Conv2D(out_channel=self.out_channels,
kernel_size=self.kernel_size,
mode=1,
pad_mode=self.pad_mode,
pad=self.padding,
stride=self.stride,
dilation=self.dilation,
group=self.group,
strategy=None)
self.bias_add = P.BiasAdd()
def construct(self, input1, input2):
x = self.add(input1, input2)
if self.has_bias:
return self.bias_add(self.conv2d(x, self.weight),
self.bias)
return self.conv2d(x, self.weight)
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, input1, input2, output_grad):
return grad_all_with_sens(self.network)(input1, input2, output_grad)
class Conv2dFactory:
def __init__(self, input_shape, filter_shape, stride, pad_mode, padding, dilation, group, has_bias):
self.in_n, self.in_c, self.in_h, self.in_w = input_shape
self.out_c, self.kernel_c, self.kernel_h, self.kernel_w = filter_shape
self.stride = stride
self.pad_mode = pad_mode
self.padding = padding
self.dilation = dilation
self.group = group
self.strategy0 = (0, (4, 1, 1, 1), (1, 1, 1, 1))
prefix = ""
input_size = 1
filter_size = 1
for s in input_shape:
prefix = prefix + str(s) + "_"
input_size = input_size * s
self.prefix = prefix
for s in filter_shape:
filter_size = filter_size * s
number_range1 = min(10, input_size)
number_range2 = min(10, filter_size)
self.input_np1 = np.reshape(np.arange(0, input_size) % number_range1 - number_range1 / 2, input_shape).astype(
np.float16)
self.input_np2 = np.reshape(np.arange(0, input_size) % number_range1 - number_range1 / 4, input_shape).astype(
np.float16)
self.weight_np = np.reshape(np.arange(0, filter_size) % number_range2 - number_range2 / 2, filter_shape).astype(
np.float16)
self.has_bias = has_bias
if self.has_bias is True:
self.bias_np = np.arange(0, self.out_c).astype(np.float16)
self.out_shape = (128, 64, 56, 56)
out_size = 1
for s in self.out_shape:
out_size = out_size * s
number_range3 = min(10, out_size)
self.output_grad_np = np.reshape(np.arange(0, out_size) % number_range3 - number_range3 / 2,
self.out_shape).astype(np.float16)
self.x_id = device_id % 4
self.y_id = device_id % 4
self.out_strategy = self.strategy0[1]
self.out_id = device_id % 4
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def forward_conv2d_mindspore_impl(self):
input1 = Tensor(self.input_np1)
input2 = Tensor(self.input_np2)
weight = Tensor(self.weight_np)
if self.has_bias:
bias = Tensor(self.bias_np)
net = Conv2d(in_channels=self.in_c, out_channels=self.out_c,
kernel_size=(self.kernel_h, self.kernel_w),
stride=self.stride, pad_mode=self.pad_mode,
padding=self.padding, dilation=self.dilation,
group=self.group, has_bias=True, weight_init=weight,
bias_init=bias)
else:
net = Conv2d(in_channels=self.in_c, out_channels=self.out_c,
kernel_size=(self.kernel_h, self.kernel_w),
stride=self.stride, pad_mode=self.pad_mode,
padding=self.padding, dilation=self.dilation,
group=self.group, has_bias=False, weight_init=weight)
out = net(input1, input2)
return out.asnumpy()
def forward_conv2d_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
weight = Tensor(self.weight_np)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(inputs_y[self.y_id])
if self.has_bias:
bias = Tensor(self.bias_np)
net = Conv2d(in_channels=self.in_c, out_channels=self.out_c,
kernel_size=(self.kernel_h, self.kernel_w),
stride=self.stride, pad_mode=self.pad_mode,
padding=self.padding, dilation=self.dilation,
group=self.group, has_bias=True, weight_init=weight,
bias_init=bias, strategy=(self.strategy0[0], self.strategy0[1], self.strategy0[1]))
else:
net = Conv2d(in_channels=self.in_c, out_channels=self.out_c,
kernel_size=(self.kernel_h, self.kernel_w),
stride=self.stride, pad_mode=self.pad_mode,
padding=self.padding, dilation=self.dilation,
group=self.group, has_bias=False, weight_init=weight,
strategy=(self.strategy0[0], self.strategy0[1], self.strategy0[1]))
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
return out.asnumpy()
def grad_conv2d_mindspore_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
weight = Tensor(self.weight_np)
output_grad = Tensor(self.output_grad_np)
if self.has_bias:
bias = Tensor(self.bias_np)
net = Conv2d(in_channels=self.in_c, out_channels=self.out_c,
kernel_size=(self.kernel_h, self.kernel_w),
stride=self.stride, pad_mode=self.pad_mode,
padding=self.padding, dilation=self.dilation,
group=self.group, has_bias=True, weight_init=weight,
bias_init=bias, )
else:
net = Conv2d(in_channels=self.in_c, out_channels=self.out_c,
kernel_size=(self.kernel_h, self.kernel_w),
stride=self.stride, pad_mode=self.pad_mode,
padding=self.padding, dilation=self.dilation,
group=self.group, has_bias=False, weight_init=weight)
grad_net = Grad(net)
grad_net.set_train()
out_grad = grad_net(x, y, output_grad)
return out_grad
def grad_conv2d_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
weight = Tensor(self.weight_np)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(inputs_y[self.y_id])
output_grad = Tensor(self.output_grad_np)
output_grads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy)
output_grad1 = Tensor(output_grads[self.out_id])
if self.has_bias:
bias = Tensor(self.bias_np)
net = Conv2d(in_channels=self.in_c, out_channels=self.out_c,
kernel_size=(self.kernel_h, self.kernel_w),
stride=self.stride, pad_mode=self.pad_mode,
padding=self.padding, dilation=self.dilation,
group=self.group, has_bias=True, weight_init=weight,
bias_init=bias, strategy=(self.strategy0[0], self.strategy0[1], self.strategy0[1]))
else:
net = Conv2d(in_channels=self.in_c, out_channels=self.out_c,
kernel_size=(self.kernel_h, self.kernel_w),
stride=self.stride, pad_mode=self.pad_mode,
padding=self.padding, dilation=self.dilation,
group=self.group, has_bias=False, weight_init=weight,
strategy=(self.strategy0[0], self.strategy0[1], self.strategy0[1]))
grad_net = Grad(net)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
grad_net.set_train()
grad_net.set_auto_parallel()
out_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad1],
parallel_inputs_run=[x1, y1, output_grad1])
return out_grad
def forward_conv2d_cmp(self):
out_mindspore = self.forward_conv2d_mindspore_impl()
out_mindspore_parallel = self.forward_conv2d_mindspore_parallel_impl()
out_blocks = self.get_parallel_blocks(out_mindspore, self.out_strategy)
assert allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.001, 0.001)
def grad_conv2d_cmp(self):
input_grad_mindspore = self.grad_conv2d_mindspore_impl()
input_grad_mindspore_parallel = self.grad_conv2d_mindspore_parallel_impl()
input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1])
input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[1])
assert allclose(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.001, 0.001)
assert allclose(input_grad_blocks_1[self.x_id], input_grad_mindspore_parallel1, 0.001, 0.001)
def test_reid_conv2d_input_128_64_112_112_kernel_64_64_1_1_stride_2_padding_0_bias_true():
fact = Conv2dFactory(input_shape=(128, 64, 112, 112),
filter_shape=(64, 64, 1, 1),
stride=2, pad_mode='valid', padding=0,
dilation=1, group=1, has_bias=False)
fact.forward_conv2d_cmp()
def test_reid_conv2d_grad_input_128_64_112_112_kernel_64_64_1_1_stride_2_padding_0_bias_true():
fact = Conv2dFactory(input_shape=(128, 64, 112, 112),
filter_shape=(64, 64, 1, 1),
stride=2, pad_mode='valid', padding=0,
dilation=1, group=1, has_bias=False)
fact.grad_conv2d_cmp()
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
from numpy import allclose
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore._checkparam import check_bool, twice
from mindspore.common.initializer import initializer
from mindspore.common.parameter import Parameter
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class _Conv(Cell):
r"""Applies a N-D convolution over an input signal composed of several input
planes.
"""
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
pad_mode,
padding,
dilation,
group,
has_bias,
weight_init,
bias_init):
super(_Conv, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = kernel_size
self.stride = stride
self.pad_mode = pad_mode
self.padding = padding
self.dilation = dilation
self.group = group
self.has_bias = has_bias
if not (isinstance(in_channels, int) and in_channels > 0):
raise ValueError('Attr \'in_channels\' of \'Conv2D\' Op passed '
+ str(in_channels) + ', should be a int and greater than 0.')
if (not isinstance(kernel_size, tuple)) or len(kernel_size) != 2 or \
(not isinstance(kernel_size[0], int)) or (not isinstance(kernel_size[1], int)) or \
kernel_size[0] < 1 or kernel_size[1] < 1:
raise ValueError('Attr \'kernel_size\' of \'Conv2D\' Op passed '
+ str(self.kernel_size) + ', should be a int or tuple and equal to or greater than 1.')
if in_channels % group != 0:
raise ValueError('Attr \'in_channels\' of \'Conv2D\' Op must be divisible by '
'attr \'group\' of \'Conv2D\' Op.')
if out_channels % group != 0:
raise ValueError('Attr \'out_channels\' of \'Conv2D\' Op must be divisible by '
'attr \'group\' of \'Conv2D\' Op.')
self.weight = Parameter(initializer(
weight_init, [out_channels, in_channels // group, *kernel_size]), name='weight')
if check_bool(has_bias):
self.bias = Parameter(initializer(
bias_init, [out_channels]), name='bias')
else:
if bias_init != 'zeros':
print("Value of 'has_bias' is False, value of 'bias_init' will be ignored.")
self.bias = None
def construct(self, *inputs):
raise NotImplementedError
class Conv2d(_Conv):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
pad_mode='same',
padding=0,
dilation=1,
group=1,
has_bias=False,
weight_init='normal',
bias_init='zeros',
strategy=None):
kernel_size = twice(kernel_size)
super(Conv2d, self).__init__(
in_channels,
out_channels,
kernel_size,
stride,
pad_mode,
padding,
dilation,
group,
has_bias,
weight_init,
bias_init)
self.add = P.TensorAdd(strategy)
self.conv2d = P.Conv2D(out_channel=self.out_channels,
kernel_size=self.kernel_size,
mode=1,
pad_mode=self.pad_mode,
pad=self.padding,
stride=self.stride,
dilation=self.dilation,
group=self.group,
strategy=None)
self.bias_add = P.BiasAdd()
def construct(self, input1, input2):
x = self.add(input1, input2)
if self.has_bias:
return self.bias_add(self.conv2d(x, self.weight),
self.bias)
return self.conv2d(x, self.weight)
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, input1, input2, output_grad):
return grad_all_with_sens(self.network)(input1, input2, output_grad)
class Conv2dFactory:
def __init__(self, input_shape, filter_shape, stride, pad_mode, padding, dilation, group, has_bias):
self.in_n, self.in_c, self.in_h, self.in_w = input_shape
self.out_c, self.kernel_c, self.kernel_h, self.kernel_w = filter_shape
self.stride = stride
self.pad_mode = pad_mode
self.padding = padding
self.dilation = dilation
self.group = group
self.strategy0 = (0, (4, 1, 1, 1), (1, 1, 1, 1))
prefix = ""
input_size = 1
filter_size = 1
for s in input_shape:
prefix = prefix + str(s) + "_"
input_size = input_size * s
self.prefix = prefix
for s in filter_shape:
filter_size = filter_size * s
number_range1 = min(10, input_size)
number_range2 = min(10, filter_size)
self.input_np1 = np.reshape(np.arange(0, input_size) % number_range1 - number_range1 / 2, input_shape).astype(
np.float16)
self.input_np2 = np.reshape(np.arange(0, input_size) % number_range1 - number_range1 / 4, input_shape).astype(
np.float16)
self.weight_np = np.reshape(np.arange(0, filter_size) % number_range2 - number_range2 / 2, filter_shape).astype(
np.float16)
self.has_bias = has_bias
if self.has_bias is True:
self.bias_np = np.arange(0, self.out_c).astype(np.float16)
self.out_shape = (128, 64, 56, 56)
out_size = 1
for s in self.out_shape:
out_size = out_size * s
number_range3 = min(10, out_size)
self.output_grad_np = np.reshape(np.arange(0, out_size) % number_range3 - number_range3 / 2,
self.out_shape).astype(np.float16)
self.x_id = device_id % 4
self.y_id = device_id % 4
self.out_strategy = self.strategy0[1]
self.out_id = device_id % 4
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def forward_conv2d_mindspore_impl(self):
input1 = Tensor(self.input_np1)
input2 = Tensor(self.input_np2)
weight = Tensor(self.weight_np)
if self.has_bias:
bias = Tensor(self.bias_np)
net = Conv2d(in_channels=self.in_c, out_channels=self.out_c,
kernel_size=(self.kernel_h, self.kernel_w),
stride=self.stride, pad_mode=self.pad_mode,
padding=self.padding, dilation=self.dilation,
group=self.group, has_bias=True, weight_init=weight,
bias_init=bias)
else:
net = Conv2d(in_channels=self.in_c, out_channels=self.out_c,
kernel_size=(self.kernel_h, self.kernel_w),
stride=self.stride, pad_mode=self.pad_mode,
padding=self.padding, dilation=self.dilation,
group=self.group, has_bias=False, weight_init=weight)
out = net(input1, input2)
return out.asnumpy()
def forward_conv2d_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
weight = Tensor(self.weight_np)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(inputs_y[self.y_id])
if self.has_bias:
bias = Tensor(self.bias_np)
net = Conv2d(in_channels=self.in_c, out_channels=self.out_c,
kernel_size=(self.kernel_h, self.kernel_w),
stride=self.stride, pad_mode=self.pad_mode,
padding=self.padding, dilation=self.dilation,
group=self.group, has_bias=True, weight_init=weight,
bias_init=bias, strategy=(self.strategy0[0], self.strategy0[1], self.strategy0[1]))
else:
net = Conv2d(in_channels=self.in_c, out_channels=self.out_c,
kernel_size=(self.kernel_h, self.kernel_w),
stride=self.stride, pad_mode=self.pad_mode,
padding=self.padding, dilation=self.dilation,
group=self.group, has_bias=False, weight_init=weight,
strategy=(self.strategy0[0], self.strategy0[1], self.strategy0[1]))
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
return out.asnumpy()
def grad_conv2d_mindspore_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
weight = Tensor(self.weight_np)
output_grad = Tensor(self.output_grad_np)
if self.has_bias:
bias = Tensor(self.bias_np)
net = Conv2d(in_channels=self.in_c, out_channels=self.out_c,
kernel_size=(self.kernel_h, self.kernel_w),
stride=self.stride, pad_mode=self.pad_mode,
padding=self.padding, dilation=self.dilation,
group=self.group, has_bias=True, weight_init=weight,
bias_init=bias,)
else:
net = Conv2d(in_channels=self.in_c, out_channels=self.out_c,
kernel_size=(self.kernel_h, self.kernel_w),
stride=self.stride, pad_mode=self.pad_mode,
padding=self.padding, dilation=self.dilation,
group=self.group, has_bias=False, weight_init=weight)
grad_net = Grad(net)
grad_net.set_train()
out_grad = grad_net(x, y, output_grad)
return out_grad
def grad_conv2d_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
weight = Tensor(self.weight_np)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(inputs_y[self.y_id])
output_grad = Tensor(self.output_grad_np)
output_grads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy)
output_grad1 = Tensor(output_grads[self.out_id])
if self.has_bias:
bias = Tensor(self.bias_np)
net = Conv2d(in_channels=self.in_c, out_channels=self.out_c,
kernel_size=(self.kernel_h, self.kernel_w),
stride=self.stride, pad_mode=self.pad_mode,
padding=self.padding, dilation=self.dilation,
group=self.group, has_bias=True, weight_init=weight,
bias_init=bias, strategy=(self.strategy0[0], self.strategy0[1], self.strategy0[1]))
else:
net = Conv2d(in_channels=self.in_c, out_channels=self.out_c,
kernel_size=(self.kernel_h, self.kernel_w),
stride=self.stride, pad_mode=self.pad_mode,
padding=self.padding, dilation=self.dilation,
group=self.group, has_bias=False, weight_init=weight,
strategy=(self.strategy0[0], self.strategy0[1], self.strategy0[1]))
grad_net = Grad(net)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
grad_net.set_train()
grad_net.set_auto_parallel()
out_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad1],
parallel_inputs_run=[x1, y1, output_grad1])
return out_grad
def forward_conv2d_cmp(self):
out_mindspore = self.forward_conv2d_mindspore_impl()
out_mindspore_parallel = self.forward_conv2d_mindspore_parallel_impl()
out_blocks = self.get_parallel_blocks(out_mindspore, self.out_strategy)
assert allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.001, 0.001)
def grad_conv2d_cmp(self):
input_grad_mindspore = self.grad_conv2d_mindspore_impl()
input_grad_mindspore_parallel = self.grad_conv2d_mindspore_parallel_impl()
input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1])
input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[1])
assert allclose(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.001, 0.001)
assert allclose(input_grad_blocks_1[self.x_id], input_grad_mindspore_parallel1, 0.001, 0.001)
def test_reid_conv2d_input_128_64_112_112_kernel_64_64_1_1_stride_2_padding_0_bias_true():
fact = Conv2dFactory(input_shape=(128, 64, 112, 112),
filter_shape=(64, 64, 1, 1),
stride=2, pad_mode='valid', padding=0,
dilation=1, group=1, has_bias=False)
fact.forward_conv2d_cmp()
def test_reid_conv2d_grad_input_128_64_112_112_kernel_64_64_1_1_stride_2_padding_0_bias_true():
fact = Conv2dFactory(input_shape=(128, 64, 112, 112),
filter_shape=(64, 64, 1, 1),
stride=2, pad_mode='valid', padding=0,
dilation=1, group=1, has_bias=False)
fact.grad_conv2d_cmp()
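
These conv2d cases exercise plain batch parallelism: each device holds the same weight, the inputs are cut four ways along the N axis only (the (4, 1, 1, 1) strategy), and the per-device outputs, concatenated back along N, are expected to match the single-device result within the 0.001 tolerances. Below is a NumPy-only sketch of that property under assumed, scaled-down shapes (the tests use (128, 64, 112, 112)); the hand-rolled 1x1 stride-2 convolution is a stand-in for P.Conv2D, not the test's code.

# NumPy-only sketch (assumed shapes) of the batch-parallel property these cases rely on.
import numpy as np

def conv1x1_stride2(x, w):
    # 1x1 kernel, stride 2, 'valid' padding: subsample H and W, then mix channels.
    return np.einsum('oc,nchw->nohw', w, x[:, :, ::2, ::2])

rng = np.random.default_rng(0)
x = rng.standard_normal((8, 4, 8, 8)).astype(np.float32)   # (N, C, H, W), scaled down
w = rng.standard_normal((6, 4)).astype(np.float32)         # (out_c, in_c) for a 1x1 kernel

full = conv1x1_stride2(x, w)
shards = np.split(x, 4, axis=0)                            # the (4, 1, 1, 1) split along N
parallel = np.concatenate([conv1x1_stride2(s, w) for s in shards], axis=0)
assert np.allclose(full, parallel, 0.0001, 0.0001)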

+120  -120   tests/ut/python/parallel/parallel_end_to_end/dropout/_test_dropout_parallel_4p.py

@@ -1,120 +1,120 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import os
import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.nn import Dropout
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class Net(Cell):
def __init__(self, keep_prob, seed0, seed1, strategy=None):
super(Net, self).__init__()
self.drop = Dropout(keep_prob, seed0, seed1, dtype=ms.float32, strategy=strategy)
def construct(self, input):
x = self.drop(input)
return x
# pylint: disable=comparison-with-itself
class DropoutFactory:
def __init__(self, input_shape, keep_prob, seed0, seed1, strategy0=None):
size = 1
prefix = ""
for s in input_shape:
prefix = prefix + str(s)
size = size * s
self.prefix = prefix
number_range = min(10, size)
self.input_np = np.reshape(np.arange(0, size) % number_range, input_shape).astype(np.float32)
self.keep_prob = keep_prob
self.seed0 = seed0
self.seed1 = seed1
self.strategy0 = strategy0
need_dev_num = 1
for s in strategy0[1]:
need_dev_num = need_dev_num * s
self.x_id = device_id % need_dev_num
self.out_id = device_id % need_dev_num
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def d4_tensor_compare(self, input, out_me):
[a, b, c, d] = input.shape
for i in range(a):
for j in range(b):
for k in range(c):
for e in range(d):
if out_me[i, j, k, e] == 0:
assert True == True
else:
assert np.allclose(out_me[i, j, k, e], input[i, j, k, e] * (1 / 0.4), 0.0001, 0.0001)
def forward_mindspore_parallel_impl(self):
x = Tensor(self.input_np)
inputs_x = self.get_parallel_blocks(self.input_np, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
net = Net(0.4, 0, 0, strategy=self.strategy0)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
out = net(x, parallel_inputs_compile=[x], parallel_inputs_run=[x1])
return out.asnumpy()
def forward_cmp(self):
out_mindspore_parallel = self.forward_mindspore_parallel_impl()
input_blocks = self.get_parallel_blocks(self.input_np, self.strategy0[1])
self.d4_tensor_compare(input_blocks[self.out_id], out_mindspore_parallel)
def test_reid_dropout_forward_seed_F32_64_512_8_8():
fact = DropoutFactory(input_shape=(64, 512, 8, 8), keep_prob=0.4, seed0=0, seed1=0, strategy0=(0, (4, 1, 1, 1)))
fact.forward_cmp()
def test_reid_dropout_forward_seed_F32_64_512_8_8_repeat():
fact = DropoutFactory(input_shape=(64, 512, 8, 8), keep_prob=0.4, seed0=0, seed1=0, strategy0=(0, (2, 1, 1, 1)))
fact.forward_cmp()
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.nn import Dropout
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class Net(Cell):
def __init__(self, keep_prob, seed0, seed1, strategy=None):
super(Net, self).__init__()
self.drop = Dropout(keep_prob, seed0, seed1, dtype=ms.float32, strategy=strategy)
def construct(self, input_):
x = self.drop(input_)
return x
# pylint: disable=comparison-with-itself
class DropoutFactory:
def __init__(self, input_shape, keep_prob, seed0, seed1, strategy0=None):
size = 1
prefix = ""
for s in input_shape:
prefix = prefix + str(s)
size = size * s
self.prefix = prefix
number_range = min(10, size)
self.input_np = np.reshape(np.arange(0, size) % number_range, input_shape).astype(np.float32)
self.keep_prob = keep_prob
self.seed0 = seed0
self.seed1 = seed1
self.strategy0 = strategy0
need_dev_num = 1
for s in strategy0[1]:
need_dev_num = need_dev_num * s
self.x_id = device_id % need_dev_num
self.out_id = device_id % need_dev_num
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def d4_tensor_compare(self, input_, out_me):
[a, b, c, d] = input_.shape
for i in range(a):
for j in range(b):
for k in range(c):
for e in range(d):
if out_me[i, j, k, e] == 0:
assert True
else:
assert np.allclose(out_me[i, j, k, e], input_[i, j, k, e] * (1 / 0.4), 0.0001, 0.0001)
def forward_mindspore_parallel_impl(self):
x = Tensor(self.input_np)
inputs_x = self.get_parallel_blocks(self.input_np, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
net = Net(0.4, 0, 0, strategy=self.strategy0)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
out = net(x, parallel_inputs_compile=[x], parallel_inputs_run=[x1])
return out.asnumpy()
def forward_cmp(self):
out_mindspore_parallel = self.forward_mindspore_parallel_impl()
input_blocks = self.get_parallel_blocks(self.input_np, self.strategy0[1])
self.d4_tensor_compare(input_blocks[self.out_id], out_mindspore_parallel)
def test_reid_dropout_forward_seed_F32_64_512_8_8():
fact = DropoutFactory(input_shape=(64, 512, 8, 8), keep_prob=0.4, seed0=0, seed1=0, strategy0=(0, (4, 1, 1, 1)))
fact.forward_cmp()
def test_reid_dropout_forward_seed_F32_64_512_8_8_repeat():
fact = DropoutFactory(input_shape=(64, 512, 8, 8), keep_prob=0.4, seed0=0, seed1=0, strategy0=(0, (2, 1, 1, 1)))
fact.forward_cmp()
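
d4_tensor_compare above checks the only invariant a dropout forward exposes without fixing the mask: every non-zero output element must equal the matching input element scaled by 1 / keep_prob (here 1 / 0.4), and every dropped element must be exactly zero. A vectorized NumPy sketch of the same check follows; the random mask is a stand-in for the Dropout op's own seeded RNG, so this is an illustration, not the test's code path.

# Vectorized NumPy sketch of the d4_tensor_compare invariant (mask is a stand-in).
import numpy as np

keep_prob = 0.4
rng = np.random.default_rng(0)
x = rng.standard_normal((2, 3, 4, 4)).astype(np.float32)
mask = rng.random(x.shape) < keep_prob                  # pretend dropout mask
out = np.where(mask, x / keep_prob, 0.0).astype(np.float32)

kept = out != 0
assert np.allclose(out[kept], x[kept] * (1 / keep_prob), 0.0001, 0.0001)
assert np.all(out[~kept] == 0)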

+154  -154   tests/ut/python/parallel/parallel_end_to_end/hcom/_test_allgather_4p.py

@@ -1,154 +1,154 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import os
import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class MatmulSingle(Cell):
def __init__(self, transpose_a=False, transpose_b=False):
super(MatmulSingle, self).__init__()
self.matmul = P.MatMul(transpose_a, transpose_b)
self.pow = P.Pow()
self.reduce_sum = P.ReduceSum()
def construct(self, x, y):
out = self.matmul(x, y)
out = self.pow(out, 2.0)
out = self.reduce_sum(out, None)
return out
class MatmulAllgather(Cell):
def __init__(self, group, transpose_a=False, transpose_b=False):
super(MatmulAllgather, self).__init__()
self.allgather = P.AllGather(group=group)
self.matmul = P.MatMul(transpose_a, transpose_b)
self.pow = P.Pow()
self.reduce_sum = P.ReduceSum()
self.allreduce = P.AllReduce(group=group)
def construct(self, x, y):
x = self.allgather(x)
out = self.matmul(x, y)
out = self.pow(out, 2.0)
out = self.reduce_sum(out, None)
out = self.allreduce(out)
return out
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, x, y, sens):
return grad_all_with_sens(self.network)(x, y, sens)
class MatmulAllgatherFactory:
def __init__(self, inputx_shape, inputy_shape, x_stra, y_stra):
self.inputx = self.GenValue(inputx_shape, 10)
self.inputy = self.GenValue(inputy_shape, 20)
self.x_stra = x_stra
self.y_stra = y_stra
stra_size = 1
for s in x_stra:
stra_size = stra_size * s
self.stra_size = stra_size
def GenValue(self, input_shape, delta):
size = 1
for s in input_shape:
size = size * s
number_range = min(100, size)
input_np = np.reshape(np.arange(0, size) % number_range - delta, input_shape).astype(np.float32)
return input_np
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def grad_mindspore_impl_single(self):
x = Tensor(self.inputx)
y = Tensor(self.inputy)
sens = Tensor(1.0, dtype=ms.float32)
net = MatmulSingle()
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, sens)
return input_grad
def grad_mindspore_impl_reduce(self):
inputxs = self.get_parallel_blocks(self.inputx, self.x_stra)
inputys = self.get_parallel_blocks(self.inputy, self.y_stra)
x = Tensor(inputxs[device_id % self.stra_size])
y = Tensor(inputys[device_id % self.stra_size])
repeat_num = device_num / self.stra_size
v = self.stra_size * repeat_num * repeat_num * repeat_num
sens = Tensor(1.0 / v, dtype=ms.float32)
net = MatmulAllgather("hccl_world_group")
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, sens)
return input_grad
def grad_cmp(self):
single_results = self.grad_mindspore_impl_single()
reduce_results = self.grad_mindspore_impl_reduce()
single_result0 = self.get_parallel_blocks(single_results[0].asnumpy(), self.x_stra)[device_id % self.stra_size]
reduce_result0 = reduce_results[0].asnumpy()
single_result1 = self.get_parallel_blocks(single_results[1].asnumpy(), self.y_stra)[device_id % self.stra_size]
reduce_result1 = reduce_results[1].asnumpy()
assert np.allclose(single_result0, reduce_result0, 0.0001, 0.0001)
assert np.allclose(single_result1, reduce_result1, 0.0001, 0.0001)
def test_reduce_grad():
inputx_shape = (64, 32)
inputy_shape = (32, 64)
fact = MatmulAllgatherFactory(inputx_shape, inputy_shape, (4, 1), (1, 4))
fact.grad_cmp()
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class MatmulSingle(Cell):
def __init__(self, transpose_a=False, transpose_b=False):
super(MatmulSingle, self).__init__()
self.matmul = P.MatMul(transpose_a, transpose_b)
self.pow = P.Pow()
self.reduce_sum = P.ReduceSum()
def construct(self, x, y):
out = self.matmul(x, y)
out = self.pow(out, 2.0)
out = self.reduce_sum(out, None)
return out
class MatmulAllgather(Cell):
def __init__(self, group, transpose_a=False, transpose_b=False):
super(MatmulAllgather, self).__init__()
self.allgather = P.AllGather(group=group)
self.matmul = P.MatMul(transpose_a, transpose_b)
self.pow = P.Pow()
self.reduce_sum = P.ReduceSum()
self.allreduce = P.AllReduce(group=group)
def construct(self, x, y):
x = self.allgather(x)
out = self.matmul(x, y)
out = self.pow(out, 2.0)
out = self.reduce_sum(out, None)
out = self.allreduce(out)
return out
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, x, y, sens):
return grad_all_with_sens(self.network)(x, y, sens)
class MatmulAllgatherFactory:
def __init__(self, inputx_shape, inputy_shape, x_stra, y_stra):
self.inputx = self.gen_value(inputx_shape, 10)
self.inputy = self.gen_value(inputy_shape, 20)
self.x_stra = x_stra
self.y_stra = y_stra
stra_size = 1
for s in x_stra:
stra_size = stra_size * s
self.stra_size = stra_size
def gen_value(self, input_shape, delta):
size = 1
for s in input_shape:
size = size * s
number_range = min(100, size)
input_np = np.reshape(np.arange(0, size) % number_range - delta, input_shape).astype(np.float32)
return input_np
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def grad_mindspore_impl_single(self):
x = Tensor(self.inputx)
y = Tensor(self.inputy)
sens = Tensor(1.0, dtype=ms.float32)
net = MatmulSingle()
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, sens)
return input_grad
def grad_mindspore_impl_reduce(self):
inputxs = self.get_parallel_blocks(self.inputx, self.x_stra)
inputys = self.get_parallel_blocks(self.inputy, self.y_stra)
x = Tensor(inputxs[device_id % self.stra_size])
y = Tensor(inputys[device_id % self.stra_size])
repeat_num = device_num / self.stra_size
v = self.stra_size * repeat_num * repeat_num * repeat_num
sens = Tensor(1.0 / v, dtype=ms.float32)
net = MatmulAllgather("hccl_world_group")
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, sens)
return input_grad
def grad_cmp(self):
single_results = self.grad_mindspore_impl_single()
reduce_results = self.grad_mindspore_impl_reduce()
single_result0 = self.get_parallel_blocks(single_results[0].asnumpy(), self.x_stra)[device_id % self.stra_size]
reduce_result0 = reduce_results[0].asnumpy()
single_result1 = self.get_parallel_blocks(single_results[1].asnumpy(), self.y_stra)[device_id % self.stra_size]
reduce_result1 = reduce_results[1].asnumpy()
assert np.allclose(single_result0, reduce_result0, 0.0001, 0.0001)
assert np.allclose(single_result1, reduce_result1, 0.0001, 0.0001)
def test_reduce_grad():
inputx_shape = (64, 32)
inputy_shape = (32, 64)
fact = MatmulAllgatherFactory(inputx_shape, inputy_shape, (4, 1), (1, 4))
fact.grad_cmp()
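
The allgather case splits x by rows ((4, 1)) and y by columns ((1, 4)), rebuilds the full x with AllGather, and lets each device compute one column block of the full x @ y; grad_cmp then slices the single-device gradients with get_parallel_blocks and compares them shard for shard. The following NumPy-only sketch illustrates that forward identity under the test_reduce_grad shapes; it is an illustration, not MindSpore code.

# NumPy sketch of the AllGather-then-MatMul identity behind MatmulAllgather.
import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((64, 32)).astype(np.float32)
y = rng.standard_normal((32, 64)).astype(np.float32)

x_shards = np.split(x, 4, axis=0)              # x strategy (4, 1): a (16, 32) row block per device
y_shards = np.split(y, 4, axis=1)              # y strategy (1, 4): a (32, 16) column block per device

x_full = np.concatenate(x_shards, axis=0)      # what AllGather reassembles
per_device = [x_full @ ys for ys in y_shards]  # each device's local MatMul
assert np.allclose(np.concatenate(per_device, axis=1), x @ y, 0.0001, 0.0001)

grad_mindspore_impl_reduce then feeds sens = 1.0 / v with v = stra_size * repeat_num ** 3; for this case stra_size = 4 and repeat_num = 1, so sens = 0.25.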

+175  -175   tests/ut/python/parallel/parallel_end_to_end/hcom/_test_allreduce_4p.py

@@ -1,175 +1,175 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import os
import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class MatmulSingle(Cell):
def __init__(self, transpose_a=False, transpose_b=False):
super(MatmulSingle, self).__init__()
self.matmul1 = P.MatMul(transpose_a, transpose_b)
self.matmul2 = P.MatMul(transpose_a, transpose_b)
self.pow = P.Pow()
self.reduce_sum = P.ReduceSum()
def construct(self, x, y, z):
out = self.matmul1(x, y)
out = self.matmul2(out, z)
out = self.pow(out, 2.0)
out = self.reduce_sum(out, None)
return out
class MatmulReduce(Cell):
def __init__(self, group, transpose_a=False, transpose_b=False):
super(MatmulReduce, self).__init__()
self.matmul1 = P.MatMul(transpose_a, transpose_b)
self.allreduce1 = P.AllReduce(group=group)
self.matmul2 = P.MatMul(transpose_a, transpose_b)
self.pow = P.Pow()
self.reduce_sum = P.ReduceSum()
self.allreduce2 = P.AllReduce(group=group)
def construct(self, x, y, z):
out = self.matmul1(x, y)
out = self.allreduce1(out)
out = self.matmul2(out, z)
out = self.pow(out, 2.0)
out = self.reduce_sum(out, None)
out = self.allreduce2(out)
return out
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, x, y, z, sens):
return grad_all_with_sens(self.network)(x, y, z, sens)
class MatmulReduceFactory:
def __init__(self, inputx_shape, inputy_shape, inputz_shape, x_stra, y_stra, z_stra):
self.inputx = self.GenValue(inputx_shape, 10)
self.inputy = self.GenValue(inputy_shape, 20)
self.inputz = self.GenValue(inputz_shape, 30)
self.x_stra = x_stra
self.y_stra = y_stra
self.z_stra = z_stra
stra_size = 1
for s in x_stra:
stra_size = stra_size * s
self.stra_size = stra_size
def GenValue(self, input_shape, delta):
size = 1
for s in input_shape:
size = size * s
number_range = min(100, size)
input_np = np.reshape(np.arange(0, size) % number_range - delta, input_shape).astype(np.float32)
return input_np
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def grad_mindspore_impl_single(self):
x = Tensor(self.inputx)
y = Tensor(self.inputy)
z = Tensor(self.inputz)
sens = Tensor(1.0, dtype=ms.float32)
net = MatmulSingle()
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, z, sens)
return input_grad
def grad_mindspore_impl_reduce(self):
inputxs = self.get_parallel_blocks(self.inputx, self.x_stra)
inputys = self.get_parallel_blocks(self.inputy, self.y_stra)
inputzs = self.get_parallel_blocks(self.inputz, self.z_stra)
x = Tensor(inputxs[device_id % self.stra_size])
y = Tensor(inputys[device_id % self.stra_size])
z = Tensor(inputzs[device_id % self.stra_size])
repeat_num = device_num / self.stra_size
v = self.stra_size * repeat_num * repeat_num * repeat_num
sens = Tensor(1.0 / v, dtype=ms.float32)
net = MatmulReduce("hccl_world_group")
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, z, sens)
return input_grad
def grad_cmp(self):
single_results = self.grad_mindspore_impl_single()
reduce_results = self.grad_mindspore_impl_reduce()
single_result0 = self.get_parallel_blocks(single_results[0].asnumpy(), self.x_stra)[device_id % self.stra_size]
reduce_result0 = reduce_results[0].asnumpy()
single_result1 = self.get_parallel_blocks(single_results[1].asnumpy(), self.y_stra)[device_id % self.stra_size]
reduce_result1 = reduce_results[1].asnumpy()
single_result2 = self.get_parallel_blocks(single_results[2].asnumpy(), self.z_stra)[device_id % self.stra_size]
reduce_result2 = reduce_results[2].asnumpy()
assert np.allclose(single_result0, reduce_result0, 0.0001, 0.0001)
assert np.allclose(single_result1, reduce_result1, 0.0001, 0.0001)
assert np.allclose(single_result2, reduce_result2, 0.0001, 0.0001)
def test_reduce_grad():
inputx_shape = (32, 64)
inputy_shape = (64, 64)
inputz_shape = (64, 32)
fact = MatmulReduceFactory(inputx_shape, inputy_shape, inputz_shape, (1, 4), (4, 1), (1, 4))
fact.grad_cmp()
def test_reduce_grad_repeat():
inputx_shape = (32, 64)
inputy_shape = (64, 64)
inputz_shape = (64, 32)
fact = MatmulReduceFactory(inputx_shape, inputy_shape, inputz_shape, (1, 2), (2, 1), (1, 2))
fact.grad_cmp()
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class MatmulSingle(Cell):
def __init__(self, transpose_a=False, transpose_b=False):
super(MatmulSingle, self).__init__()
self.matmul1 = P.MatMul(transpose_a, transpose_b)
self.matmul2 = P.MatMul(transpose_a, transpose_b)
self.pow = P.Pow()
self.reduce_sum = P.ReduceSum()
def construct(self, x, y, z):
out = self.matmul1(x, y)
out = self.matmul2(out, z)
out = self.pow(out, 2.0)
out = self.reduce_sum(out, None)
return out
class MatmulReduce(Cell):
def __init__(self, group, transpose_a=False, transpose_b=False):
super(MatmulReduce, self).__init__()
self.matmul1 = P.MatMul(transpose_a, transpose_b)
self.allreduce1 = P.AllReduce(group=group)
self.matmul2 = P.MatMul(transpose_a, transpose_b)
self.pow = P.Pow()
self.reduce_sum = P.ReduceSum()
self.allreduce2 = P.AllReduce(group=group)
def construct(self, x, y, z):
out = self.matmul1(x, y)
out = self.allreduce1(out)
out = self.matmul2(out, z)
out = self.pow(out, 2.0)
out = self.reduce_sum(out, None)
out = self.allreduce2(out)
return out
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, x, y, z, sens):
return grad_all_with_sens(self.network)(x, y, z, sens)
class MatmulReduceFactory:
def __init__(self, inputx_shape, inputy_shape, inputz_shape, x_stra, y_stra, z_stra):
self.inputx = self.gen_value(inputx_shape, 10)
self.inputy = self.gen_value(inputy_shape, 20)
self.inputz = self.gen_value(inputz_shape, 30)
self.x_stra = x_stra
self.y_stra = y_stra
self.z_stra = z_stra
stra_size = 1
for s in x_stra:
stra_size = stra_size * s
self.stra_size = stra_size
def gen_value(self, input_shape, delta):
size = 1
for s in input_shape:
size = size * s
number_range = min(100, size)
input_np = np.reshape(np.arange(0, size) % number_range - delta, input_shape).astype(np.float32)
return input_np
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def grad_mindspore_impl_single(self):
x = Tensor(self.inputx)
y = Tensor(self.inputy)
z = Tensor(self.inputz)
sens = Tensor(1.0, dtype=ms.float32)
net = MatmulSingle()
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, z, sens)
return input_grad
def grad_mindspore_impl_reduce(self):
inputxs = self.get_parallel_blocks(self.inputx, self.x_stra)
inputys = self.get_parallel_blocks(self.inputy, self.y_stra)
inputzs = self.get_parallel_blocks(self.inputz, self.z_stra)
x = Tensor(inputxs[device_id % self.stra_size])
y = Tensor(inputys[device_id % self.stra_size])
z = Tensor(inputzs[device_id % self.stra_size])
repeat_num = device_num / self.stra_size
v = self.stra_size * repeat_num * repeat_num * repeat_num
sens = Tensor(1.0 / v, dtype=ms.float32)
net = MatmulReduce("hccl_world_group")
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, z, sens)
return input_grad
def grad_cmp(self):
single_results = self.grad_mindspore_impl_single()
reduce_results = self.grad_mindspore_impl_reduce()
single_result0 = self.get_parallel_blocks(single_results[0].asnumpy(), self.x_stra)[device_id % self.stra_size]
reduce_result0 = reduce_results[0].asnumpy()
single_result1 = self.get_parallel_blocks(single_results[1].asnumpy(), self.y_stra)[device_id % self.stra_size]
reduce_result1 = reduce_results[1].asnumpy()
single_result2 = self.get_parallel_blocks(single_results[2].asnumpy(), self.z_stra)[device_id % self.stra_size]
reduce_result2 = reduce_results[2].asnumpy()
assert np.allclose(single_result0, reduce_result0, 0.0001, 0.0001)
assert np.allclose(single_result1, reduce_result1, 0.0001, 0.0001)
assert np.allclose(single_result2, reduce_result2, 0.0001, 0.0001)
def test_reduce_grad():
inputx_shape = (32, 64)
inputy_shape = (64, 64)
inputz_shape = (64, 32)
fact = MatmulReduceFactory(inputx_shape, inputy_shape, inputz_shape, (1, 4), (4, 1), (1, 4))
fact.grad_cmp()
def test_reduce_grad_repeat():
inputx_shape = (32, 64)
inputy_shape = (64, 64)
inputz_shape = (64, 32)
fact = MatmulReduceFactory(inputx_shape, inputy_shape, inputz_shape, (1, 2), (2, 1), (1, 2))
fact.grad_cmp()
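
The allreduce variant cuts x along columns and y along rows with the same cut count, so the per-device partial products summed by the first AllReduce reproduce the full x @ y before the second MatMul; with repeat_num = device_num / stra_size and v = stra_size * repeat_num ** 3, the sens fed to grad_all_with_sens works out to 1/4 for test_reduce_grad and 1/16 for test_reduce_grad_repeat. Below is a NumPy-only sketch of the partial-product identity for the (1, 4) / (4, 1) strategies, given as an illustration rather than the test's code.

# NumPy sketch of the identity behind MatmulReduce's first AllReduce.
import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((32, 64)).astype(np.float32)
y = rng.standard_normal((64, 64)).astype(np.float32)

x_shards = np.split(x, 4, axis=1)              # x strategy (1, 4): a (32, 16) column block per device
y_shards = np.split(y, 4, axis=0)              # y strategy (4, 1): a (16, 64) row block per device
partials = [xs @ ys for xs, ys in zip(x_shards, y_shards)]
reduced = np.sum(partials, axis=0)             # what AllReduce computes across devices
assert np.allclose(reduced, x @ y, 0.0001, 0.0001)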

+206  -207   tests/ut/python/parallel/parallel_end_to_end/l2normalize/_test_l2normalize_parallel_4p.py

@@ -1,207 +1,206 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import os
import pytest
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class L2normalize(Cell):
def __init__(self, axis=0, epsilon=1e-4, strategy0=None, strategy1=None):
super(L2normalize, self).__init__()
self.add = P.TensorAdd(strategy=strategy0)
self.l2norm = P.L2Normalize(axis, epsilon, strategy1)
def construct(self, x, y):
out = self.add(x, y)
out = self.l2norm(out)
return out
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, x, y, output_grad):
return grad_all_with_sens(self.network)(x, y, output_grad)
class L2normalizeFactory:
def __init__(self, input_shape, axis, strategy0, strategy1):
prefix = ""
size = 1
for s in input_shape:
prefix = prefix + str(s)
size = size * s
self.prefix = prefix
number_range = min(1000, size)
self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype(
np.float32)
self.input_np2 = np.reshape(np.arange(0, size) % number_range - number_range / 4, input_shape).astype(
np.float32)
target_shape = input_shape
self.target_shape = target_shape
target_size = 1
for s in target_shape:
target_size = target_size * s
number_range = min(1000, target_size)
self.output_grad_np = np.reshape(np.arange(0, target_size) % number_range - number_range / 2,
target_shape).astype(np.float32)
self.axis = axis
self.epsilon = 1e-4
self.strategy0 = strategy0
self.strategy1 = strategy1
out_strategy = strategy1[1]
self.out_strategy = out_strategy
need_dev_num0 = 1
need_dev_num1 = 1
for s in strategy0[1]:
need_dev_num0 = need_dev_num0 * s
for s in out_strategy:
need_dev_num1 = need_dev_num1 * s
self.x_id = device_id % need_dev_num0
self.y_id = device_id % need_dev_num0
self.out_id = device_id % need_dev_num1
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def forward_mindspore_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
net = L2normalize(self.axis, self.epsilon)
out = net(x, y)
return out.asnumpy()
def forward_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(inputs_y[self.y_id])
net = L2normalize(self.axis, self.epsilon, strategy0=self.strategy0, strategy1=self.strategy1)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
return out.asnumpy()
def grad_mindspore_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grad = Tensor(self.output_grad_np)
net = L2normalize(self.axis, self.epsilon)
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, output_grad)
return input_grad
def grad_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grad = Tensor(self.output_grad_np)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
outgrads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy)
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(inputs_y[self.y_id])
output_grad1 = Tensor(outgrads[self.out_id])
net = L2normalize(self.axis, self.epsilon, strategy0=self.strategy0, strategy1=self.strategy1)
grad_net = Grad(net)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
grad_net.set_auto_parallel()
grad_net.set_train()
input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad1],
parallel_inputs_run=[x1, y1, output_grad1])
return input_grad
def forward_cmp(self):
out_mindspore = self.forward_mindspore_impl()
out_mindspore_parallel = self.forward_mindspore_parallel_impl()
out_blocks = self.get_parallel_blocks(out_mindspore, self.out_strategy)
assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.001, 0.001)
def grad_cmp(self):
input_grad_mindspore = self.grad_mindspore_impl()
input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1])
input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2])
assert np.allclose(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
assert np.allclose(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001)
def test_reid_l2normalize_input_128_512():
input_shape = (128, 512)
axis = 0
fact = L2normalizeFactory(input_shape, axis, strategy0=(0, (4, 1), (4, 1)), strategy1=(0, (1, 4)))
fact.forward_cmp()
def test_reid_l2normalize_grad_input_128_512():
input_shape = (128, 512)
axis = 0
fact = L2normalizeFactory(input_shape, axis, (0, (4, 1), (4, 1)), strategy1=(0, (1, 4)))
fact.grad_cmp()
def test_reid_l2normalize_input_128_512_repeat():
input_shape = (128, 512)
axis = 0
fact = L2normalizeFactory(input_shape, axis, strategy0=(0, (1, 2), (1, 2)), strategy1=(0, (1, 2)))
fact.forward_cmp()
def test_reid_l2normalize_grad_input_128_512_repeat():
input_shape = (128, 512)
axis = 0
fact = L2normalizeFactory(input_shape, axis, strategy0=(0, (1, 2), (1, 2)), strategy1=(0, (1, 2)))
fact.grad_cmp()
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import numpy as np

import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens

device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"


def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")


def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")


class L2normalize(Cell):
def __init__(self, axis=0, epsilon=1e-4, strategy0=None, strategy1=None):
super(L2normalize, self).__init__()
self.add = P.TensorAdd(strategy=strategy0)
self.l2norm = P.L2Normalize(axis, epsilon, strategy1)

def construct(self, x, y):
out = self.add(x, y)
out = self.l2norm(out)
return out


class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network

def construct(self, x, y, output_grad):
return grad_all_with_sens(self.network)(x, y, output_grad)


class L2normalizeFactory:
def __init__(self, input_shape, axis, strategy0, strategy1):
prefix = ""
size = 1
for s in input_shape:
prefix = prefix + str(s)
size = size * s
self.prefix = prefix
number_range = min(1000, size)
self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype(
np.float32)
self.input_np2 = np.reshape(np.arange(0, size) % number_range - number_range / 4, input_shape).astype(
np.float32)
target_shape = input_shape
self.target_shape = target_shape
target_size = 1
for s in target_shape:
target_size = target_size * s
number_range = min(1000, target_size)
self.output_grad_np = np.reshape(np.arange(0, target_size) % number_range - number_range / 2,
target_shape).astype(np.float32)
self.axis = axis
self.epsilon = 1e-4
self.strategy0 = strategy0
self.strategy1 = strategy1
out_strategy = strategy1[1]
self.out_strategy = out_strategy
need_dev_num0 = 1
need_dev_num1 = 1
for s in strategy0[1]:
need_dev_num0 = need_dev_num0 * s
for s in out_strategy:
need_dev_num1 = need_dev_num1 * s
self.x_id = device_id % need_dev_num0
self.y_id = device_id % need_dev_num0
self.out_id = device_id % need_dev_num1

def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks

def forward_mindspore_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
net = L2normalize(self.axis, self.epsilon)
out = net(x, y)
return out.asnumpy()

def forward_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(inputs_y[self.y_id])
net = L2normalize(self.axis, self.epsilon, strategy0=self.strategy0, strategy1=self.strategy1)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
return out.asnumpy()

def grad_mindspore_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grad = Tensor(self.output_grad_np)
net = L2normalize(self.axis, self.epsilon)
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, output_grad)
return input_grad

def grad_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grad = Tensor(self.output_grad_np)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
outgrads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy)
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(inputs_y[self.y_id])
output_grad1 = Tensor(outgrads[self.out_id])
net = L2normalize(self.axis, self.epsilon, strategy0=self.strategy0, strategy1=self.strategy1)
grad_net = Grad(net)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
grad_net.set_auto_parallel()
grad_net.set_train()
input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad1],
parallel_inputs_run=[x1, y1, output_grad1])
return input_grad

def forward_cmp(self):
out_mindspore = self.forward_mindspore_impl()
out_mindspore_parallel = self.forward_mindspore_parallel_impl()
out_blocks = self.get_parallel_blocks(out_mindspore, self.out_strategy)
assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.001, 0.001)

def grad_cmp(self):
input_grad_mindspore = self.grad_mindspore_impl()
input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1])
input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2])
assert np.allclose(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
assert np.allclose(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001)


def test_reid_l2normalize_input_128_512():
input_shape = (128, 512)
axis = 0
fact = L2normalizeFactory(input_shape, axis, strategy0=(0, (4, 1), (4, 1)), strategy1=(0, (1, 4)))
fact.forward_cmp()


def test_reid_l2normalize_grad_input_128_512():
input_shape = (128, 512)
axis = 0
fact = L2normalizeFactory(input_shape, axis, (0, (4, 1), (4, 1)), strategy1=(0, (1, 4)))
fact.grad_cmp()


def test_reid_l2normalize_input_128_512_repeat():
input_shape = (128, 512)
axis = 0
fact = L2normalizeFactory(input_shape, axis, strategy0=(0, (1, 2), (1, 2)), strategy1=(0, (1, 2)))
fact.forward_cmp()


def test_reid_l2normalize_grad_input_128_512_repeat():
input_shape = (128, 512)
axis = 0
fact = L2normalizeFactory(input_shape, axis, strategy0=(0, (1, 2), (1, 2)), strategy1=(0, (1, 2)))
fact.grad_cmp()
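
The block-partitioning helper repeated in every factory above is easiest to see in isolation: below is a minimal, standalone NumPy sketch (hypothetical shapes, not part of the test file) of how get_parallel_blocks cuts a tensor according to a partition strategy and how device_id % need_dev_num selects each rank's local shard.

import numpy as np

def get_parallel_blocks(input_, strategy):
    # Split along axis 0 into strategy[0] pieces, then each piece along axis 1, and so on.
    blocks = [input_]
    for i, stra in enumerate(strategy):
        temp = []
        while blocks:
            temp.extend(np.split(blocks.pop(0), stra, axis=i))
        blocks.extend(temp)
    return blocks

full = np.arange(8 * 4).reshape(8, 4).astype(np.float32)
strategy = (4, 1)                       # rows split across 4 devices, columns kept whole
blocks = get_parallel_blocks(full, strategy)
need_dev_num = int(np.prod(strategy))   # 4 shards in total
for device_id in range(4):
    local = blocks[device_id % need_dev_num]
    assert local.shape == (2, 4)        # each rank sees an (8/4) x 4 slice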

+195 -196 tests/ut/python/parallel/parallel_end_to_end/loss/_test_loss_parallel_4p.py

@@ -1,196 +1,195 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import os
import pytest
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class AddRelu(Cell):
def __init__(self, strategy0=None, strategy1=None):
super(AddRelu, self).__init__()
self.add = P.TensorAdd(strategy=strategy0)
self.relu = P.ReLU(strategy=strategy1)
def construct(self, x, y):
out = self.add(x, y)
out = self.relu(out)
return out
class NetWithLoss(Cell):
def __init__(self, network, strategy2=None):
super(NetWithLoss, self).__init__()
self.loss = P.SoftmaxCrossEntropyWithLogits(strategy=strategy2)
self.network = network
def construct(self, x, y, b):
predict = self.network(x, y)
return self.loss(predict, b)[0]
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, x, y, b):
return grad_all(self.network)(x, y, b)
class AddReluFactory:
def __init__(self, input_shape, strategy0, strategy1, strategy2):
prefix = ""
size = 1
for s in input_shape:
prefix = prefix + str(s)
size = size * s
self.prefix = prefix
number_range = min(1000, size)
self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype(
np.float32)
self.input_np2 = np.reshape(np.arange(0, size) % number_range - number_range / 4, input_shape).astype(
np.float32)
target_shape = input_shape
self.target_shape = target_shape
target_size = 1
for s in target_shape:
target_size = target_size * s
number_range = min(10, target_size)
self.output_grad_np = np.reshape((np.arange(0, target_size) % number_range) * 0.1, target_shape).astype(
np.float32)
self.strategy0 = strategy0
self.strategy1 = strategy1
self.strategy2 = strategy2
out_strategy = strategy1[1]
self.out_strategy = out_strategy
need_dev_num0 = 1
need_dev_num1 = 1
for s in strategy0[1]:
need_dev_num0 = need_dev_num0 * s
for s in out_strategy:
need_dev_num1 = need_dev_num1 * s
self.x_id = device_id % need_dev_num0
self.y_id = device_id % need_dev_num0
self.out_id = device_id % need_dev_num1
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def grad_mindspore_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grad = Tensor(self.output_grad_np)
net = AddRelu()
net_with_loss = NetWithLoss(net)
grad_net = Grad(net_with_loss)
grad_net.set_train()
input_grads = []
for i in range(0, 3):
input_grad = grad_net(x, y, output_grad)
input_grads.append(input_grad)
return input_grads
def grad_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grad = Tensor(self.output_grad_np)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
outgrads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy)
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(inputs_y[self.y_id])
output_grad1 = Tensor(outgrads[self.out_id])
net = AddRelu(strategy0=self.strategy0, strategy1=self.strategy1)
net_with_loss = NetWithLoss(net, strategy2=self.strategy2)
grad_net = Grad(net_with_loss)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
grad_net.set_auto_parallel()
grad_net.set_train()
input_grads = []
for i in range(0, 3):
input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad],
parallel_inputs_run=[x1, y1, output_grad1])
input_grads.append(input_grad)
return input_grads
def grad_cmp(self):
input_grad_mindspores = self.grad_mindspore_impl()
input_grad_mindspore_parallels = self.grad_mindspore_parallel_impl()
for i in range(0, len(input_grad_mindspores)):
input_grad_mindspore = input_grad_mindspores[i]
input_grad_mindspore_parallel = input_grad_mindspore_parallels[i]
input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1])
input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2])
np.save(path + str(i) + "_" + str(device_id) + "_" + self.prefix + "_grad_single0.npy",
input_grad_blocks_0[self.x_id])
np.save(path + str(i) + "_" + str(device_id) + "_" + self.prefix + "_grad_single1.npy",
input_grad_blocks_1[self.y_id])
np.save(path + str(i) + "_" + str(device_id) + "_" + self.prefix + "_grad_parallel0.npy",
input_grad_mindspore_parallel0)
np.save(path + str(i) + "_" + str(device_id) + "_" + self.prefix + "_grad_parallel1.npy",
input_grad_mindspore_parallel1)
assert np.allclose(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
assert np.allclose(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001)
def test_reid_l2normalize_grad_input_128_512():
input_shape = (128, 512)
fact = AddReluFactory(input_shape, strategy0=(0, (4, 1), (4, 1)), strategy1=(0, (4, 1)),
strategy2=(0, (4, 1), (4, 1)))
fact.grad_cmp()
def test_reid_l2normalize_grad_input_128_512_stridesplit():
input_shape = (128, 512)
fact = AddReluFactory(input_shape, strategy0=(0, (1, 1), (1, 1)), strategy1=(0, (4, 1)),
strategy2=(0, (4, 1), (4, 1)))
fact.grad_cmp()
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import numpy as np

import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all

device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"


def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")


def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")


class AddRelu(Cell):
def __init__(self, strategy0=None, strategy1=None):
super(AddRelu, self).__init__()
self.add = P.TensorAdd(strategy=strategy0)
self.relu = P.ReLU(strategy=strategy1)

def construct(self, x, y):
out = self.add(x, y)
out = self.relu(out)
return out


class NetWithLoss(Cell):
def __init__(self, network, strategy2=None):
super(NetWithLoss, self).__init__()
self.loss = P.SoftmaxCrossEntropyWithLogits(strategy=strategy2)
self.network = network

def construct(self, x, y, b):
predict = self.network(x, y)
return self.loss(predict, b)[0]


class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network

def construct(self, x, y, b):
return grad_all(self.network)(x, y, b)


class AddReluFactory:
def __init__(self, input_shape, strategy0, strategy1, strategy2):
prefix = ""
size = 1
for s in input_shape:
prefix = prefix + str(s)
size = size * s
self.prefix = prefix
number_range = min(1000, size)
self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype(
np.float32)
self.input_np2 = np.reshape(np.arange(0, size) % number_range - number_range / 4, input_shape).astype(
np.float32)
target_shape = input_shape
self.target_shape = target_shape
target_size = 1
for s in target_shape:
target_size = target_size * s
number_range = min(10, target_size)
self.output_grad_np = np.reshape((np.arange(0, target_size) % number_range) * 0.1, target_shape).astype(
np.float32)
self.strategy0 = strategy0
self.strategy1 = strategy1
self.strategy2 = strategy2
out_strategy = strategy1[1]
self.out_strategy = out_strategy
need_dev_num0 = 1
need_dev_num1 = 1
for s in strategy0[1]:
need_dev_num0 = need_dev_num0 * s
for s in out_strategy:
need_dev_num1 = need_dev_num1 * s
self.x_id = device_id % need_dev_num0
self.y_id = device_id % need_dev_num0
self.out_id = device_id % need_dev_num1

def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks

def grad_mindspore_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grad = Tensor(self.output_grad_np)
net = AddRelu()
net_with_loss = NetWithLoss(net)
grad_net = Grad(net_with_loss)
grad_net.set_train()
input_grads = []
for i in range(0, 3):
input_grad = grad_net(x, y, output_grad)
input_grads.append(input_grad)
return input_grads

def grad_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grad = Tensor(self.output_grad_np)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
outgrads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy)
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(inputs_y[self.y_id])
output_grad1 = Tensor(outgrads[self.out_id])
net = AddRelu(strategy0=self.strategy0, strategy1=self.strategy1)
net_with_loss = NetWithLoss(net, strategy2=self.strategy2)
grad_net = Grad(net_with_loss)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
grad_net.set_auto_parallel()
grad_net.set_train()
input_grads = []
for i in range(0, 3):
input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad],
parallel_inputs_run=[x1, y1, output_grad1])
input_grads.append(input_grad)
return input_grads

def grad_cmp(self):
input_grad_mindspores = self.grad_mindspore_impl()
input_grad_mindspore_parallels = self.grad_mindspore_parallel_impl()
for i in range(0, len(input_grad_mindspores)):
input_grad_mindspore = input_grad_mindspores[i]
input_grad_mindspore_parallel = input_grad_mindspore_parallels[i]
input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1])
input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2])
np.save(path + str(i) + "_" + str(device_id) + "_" + self.prefix + "_grad_single0.npy",
input_grad_blocks_0[self.x_id])
np.save(path + str(i) + "_" + str(device_id) + "_" + self.prefix + "_grad_single1.npy",
input_grad_blocks_1[self.y_id])
np.save(path + str(i) + "_" + str(device_id) + "_" + self.prefix + "_grad_parallel0.npy",
input_grad_mindspore_parallel0)
np.save(path + str(i) + "_" + str(device_id) + "_" + self.prefix + "_grad_parallel1.npy",
input_grad_mindspore_parallel1)
assert np.allclose(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
assert np.allclose(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001)


def test_reid_l2normalize_grad_input_128_512():
input_shape = (128, 512)
fact = AddReluFactory(input_shape, strategy0=(0, (4, 1), (4, 1)), strategy1=(0, (4, 1)),
strategy2=(0, (4, 1), (4, 1)))
fact.grad_cmp()


def test_reid_l2normalize_grad_input_128_512_stridesplit():
input_shape = (128, 512)
fact = AddReluFactory(input_shape, strategy0=(0, (1, 1), (1, 1)), strategy1=(0, (4, 1)),
strategy2=(0, (4, 1), (4, 1)))
fact.grad_cmp()
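
Each strategy in these tests is a tuple whose trailing elements are per-input partition tuples; the factories only ever index strategy[1], strategy[2], and so on, the shard count is the product of a partition tuple, and a rank's block index is device_id modulo that count. A short, self-contained illustration of that bookkeeping follows (the RANK_ID default of "0" is only so the snippet runs outside a 4-device job; the tests themselves require RANK_ID to be set):

import os
from functools import reduce

# Layout used throughout: (flag, partition_for_input0, partition_for_input1, ...)
strategy0 = (0, (4, 1), (4, 1))   # TensorAdd: both inputs split 4-way along axis 0
strategy1 = (0, (4, 1))           # ReLU input partition, reused as the output partition

def prod(dims):
    return reduce(lambda a, b: a * b, dims, 1)

device_id = int(os.environ.get("RANK_ID", "0"))  # the tests read RANK_ID unconditionally
need_dev_num0 = prod(strategy0[1])               # 4 shards for x and y
need_dev_num1 = prod(strategy1[1])               # 4 shards for the output gradient
x_id = device_id % need_dev_num0
out_id = device_id % need_dev_num1
print(x_id, out_id)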

+329 -329 tests/ut/python/parallel/parallel_end_to_end/matmul/_test_matmul_parallel_4p.py

@@ -1,329 +1,329 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import os
from numpy import allclose
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class Matmul(Cell):
def __init__(self, transpose_a=False, transpose_b=False, strategy0=None, strategy1=None):
super(Matmul, self).__init__()
self.add = P.TensorAdd(strategy=strategy1)
self.matmul = P.MatMul(transpose_a, transpose_b, strategy=strategy0)
def construct(self, x, w, z):
out = self.add(x, z)
return self.matmul(out, w)
class BatchMatMul(Cell):
def __init__(self, transpose_a=False, transpose_b=False, strategy0=None, strategy1=None):
super(BatchMatMul, self).__init__()
self.add = P.TensorAdd(strategy=strategy1)
self.batchmatmul = P.BatchMatMul(transpose_a, transpose_b, strategy=strategy0)
def construct(self, x, w, z):
out = self.add(x, z)
return self.batchmatmul(out, w)
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, inputa, inputb, inputz, output_grad):
gout = grad_all_with_sens(self.network)(inputa, inputb, inputz, output_grad)
return gout
class BatchmatmulFactory:
def __init__(self, inputa_shape, inputb_shape, transpose_a, transpose_b, strategy, strategy_):
self.strategy = strategy
self.strategy_ = strategy_
inputa_size = 1
inputb_size = 1
prefix = ""
for s in inputa_shape:
prefix = prefix + str(s) + "_"
inputa_size = inputa_size * s
prefix = prefix + "and"
for s in inputb_shape:
prefix = prefix + str(s) + "_"
inputb_size = inputb_size * s
number_rangea = min(1000, inputa_size)
number_rangeb = min(1000, inputb_size)
self.inputa = np.reshape(np.arange(0, inputa_size) % number_rangea - number_rangea / 2, inputa_shape).astype(
np.float32)
self.inputb = np.reshape(np.arange(0, inputb_size) % number_rangeb - number_rangeb / 2, inputb_shape).astype(
np.float32)
self.inputz = np.zeros(self.inputa.shape).astype(np.float32)
self.transpose_a = transpose_a
self.transpose_b = transpose_b
out_shape = []
device_matrix = []
out_strategy = []
if transpose_a:
temp = inputa_shape[-1]
inputa_shape[-1] = inputa_shape[-2]
inputa_shape[-2] = temp
if transpose_b:
temp = inputb_shape[-1]
inputb_shape[-1] = inputb_shape[-2]
inputb_shape[-2] = temp
if (len(inputa_shape) >= len(inputb_shape)):
out_shape = list(inputa_shape)
out_shape[-1] = inputb_shape[-1]
else:
out_shape = list(inputb_shape)
out_shape[-2] = inputa_shape[-2]
strategy1 = list(self.strategy[1])
strategy2 = list(self.strategy[2])
if transpose_a:
temp = strategy1[-1]
strategy1[-1] = strategy1[-2]
strategy1[-2] = temp
if transpose_b:
temp = strategy2[-1]
strategy2[-1] = strategy2[-2]
strategy2[-2] = temp
if (len(strategy1) >= len(strategy2)):
out_strategy = strategy1.copy()
out_strategy[-1] = strategy2[-1]
else:
out_strategy = strategy2.copy()
out_strategy[-2] = strategy1[-2]
device_matrix = out_strategy.copy()
device_matrix.insert(-1, strategy1[-1])
self.out_strategy = out_strategy
need_dev_num = 1
for s in device_matrix:
need_dev_num = need_dev_num * s
self.need_dev_num = need_dev_num
self.device_matrix = device_matrix
out_size = 1
for s in out_shape:
out_size = out_size * s
number_range = min(1000, out_size)
self.output_grad_np = np.reshape(np.arange(0, out_size) % number_range - number_range / 2, out_shape).astype(
np.float32)
device_index = self.id_to_list(device_id % need_dev_num, self.device_matrix)
x_index = device_index[:-1].copy()
if transpose_a:
temp = x_index[-1]
x_index[-1] = x_index[-2]
x_index[-2] = temp
y_index = device_index[:-3].copy()
y_index.append(device_index[-2])
y_index.append(device_index[-1])
if transpose_b:
temp = y_index[-1]
y_index[-1] = y_index[-2]
y_index[-2] = temp
out_index = device_index[:-2].copy()
out_index.append(device_index[-1])
print(device_matrix)
print(device_index)
need_dev_num_ = 1
for s in strategy_[1]:
need_dev_num_ = need_dev_num_ * s
self.x_id = device_id % need_dev_num_
self.y_id = self.list_to_id(y_index, self.strategy[2])
self.out_id = self.list_to_id(out_index, self.out_strategy)
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
"""
shape: the upper bound of each dimension, e.g. (2, 4, 8)
"""
def id_to_list(self, id, shape):
result = []
r = id
for i in range(0, len(shape)):
v = 1
for j in range(i + 1, len(shape)):
v = v * shape[j]
result.append(r // v)
r = r % v
return result
def list_to_id(self, id_list, shape):
result = 0
for i in range(0, len(id_list)):
v = 1
for j in range(i + 1, len(id_list)):
v = v * shape[j]
result = result + id_list[i] * v
return result
def forward_mindspore_impl(self):
if len(self.inputa.shape) > 2:
matmul = BatchMatMul(self.transpose_a, self.transpose_b)
else:
matmul = Matmul(self.transpose_a, self.transpose_b)
matmul.set_train()
out_me = matmul(Tensor(self.inputa), Tensor(self.inputb), Tensor(self.inputz))
return out_me.asnumpy()
def forward_mindspore_parallel_impl(self):
if len(self.inputa.shape) > 2:
matmul = BatchMatMul(self.transpose_a, self.transpose_b, strategy0=self.strategy, strategy1=self.strategy_)
else:
matmul = Matmul(self.transpose_a, self.transpose_b, strategy0=self.strategy, strategy1=self.strategy_)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
x = Tensor(self.inputa)
y = Tensor(self.inputb)
z = Tensor(self.inputz)
xs = self.get_parallel_blocks(self.inputa, self.strategy_[1])
ys = self.get_parallel_blocks(self.inputb, self.strategy[2])
zs = self.get_parallel_blocks(self.inputz, self.strategy_[1])
x1 = Tensor(xs[self.x_id])
y1 = Tensor(ys[self.y_id])  # derived from the device matrix
z1 = Tensor(zs[self.x_id])
matmul.set_train()
matmul.set_auto_parallel()
out_me = matmul(x, y, z, parallel_inputs_compile=[x, y, z], parallel_inputs_run=[x1, y1, z1])
return out_me.asnumpy()
def grad_mindspore_impl(self):
x = Tensor(self.inputa)
y = Tensor(self.inputb)
z = Tensor(self.inputz)
if len(self.inputa.shape) > 2:
matmul = BatchMatMul(self.transpose_a, self.transpose_b)
else:
matmul = Matmul(self.transpose_a, self.transpose_b)
net_me = Grad(matmul)
net_me.set_train()
out_grad_me = Tensor(self.output_grad_np)
out_grad = net_me(x, y, z, out_grad_me)
return out_grad
def grad_mindspore_parallel_impl(self):
if len(self.inputa.shape) > 2:
matmul = BatchMatMul(self.transpose_a, self.transpose_b, strategy0=self.strategy, strategy1=self.strategy_)
else:
matmul = Matmul(self.transpose_a, self.transpose_b, strategy0=self.strategy, strategy1=self.strategy_)
x = Tensor(self.inputa)
y = Tensor(self.inputb)
z = Tensor(self.inputz)
out_grad_me = Tensor(self.output_grad_np)
xs = self.get_parallel_blocks(self.inputa, self.strategy_[1])
ys = self.get_parallel_blocks(self.inputb, self.strategy[2])
zs = self.get_parallel_blocks(self.inputz, self.strategy_[1])
out_grads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy)
x1 = Tensor(xs[self.x_id])  # derived from the device matrix
y1 = Tensor(ys[self.y_id])
z1 = Tensor(zs[self.x_id])
out_grad1 = Tensor(out_grads[self.out_id])
net_me = Grad(matmul)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net_me.set_auto_parallel()
net_me.set_train()
out_grad = net_me(x, y, z, out_grad_me, parallel_inputs_compile=[x, y, z, out_grad1],
parallel_inputs_run=[x1, y1, z1, out_grad1])
return out_grad
def forward_cmp(self):
out_mindspore = self.forward_mindspore_impl()
out_mindspores = self.get_parallel_blocks(out_mindspore, self.out_strategy)
out_mindspore_parallel = self.forward_mindspore_parallel_impl()
assert allclose(out_mindspores[self.out_id], out_mindspore_parallel, 0.0001, 0.0001)
def grad_cmp(self):
input_grad_mindspore = self.grad_mindspore_impl()
input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
input_grad_mindspores0 = self.get_parallel_blocks(input_grad_mindspore[0].asnumpy(), self.strategy_[1])
input_grad_mindspores1 = self.get_parallel_blocks(input_grad_mindspore[1].asnumpy(), self.strategy[2])
input_grad_mindspores2 = self.get_parallel_blocks(input_grad_mindspore[2].asnumpy(), self.strategy_[1])
assert allclose(input_grad_mindspores0[self.x_id], input_grad_mindspore_parallel[0].asnumpy(), 0.0001, 0.0001)
assert allclose(input_grad_mindspores1[self.y_id], input_grad_mindspore_parallel[1].asnumpy(), 0.0001, 0.0001)
assert allclose(input_grad_mindspores2[self.x_id], input_grad_mindspore_parallel[2].asnumpy(), 0.0001, 0.0001)
def test_reid_batchmatmul_inputa_128_512_inputb_2000_512():
inputa = [128, 512]
inputb = [2000, 512]
fact = BatchmatmulFactory(inputa, inputb, False, True, (0, (2, 2), (1, 2)), (0, (2, 2), (2, 2)))
fact.forward_cmp()
def test_reid_batchmatmul_grad_inputa_128_512_inputb_2000_512():
inputa = [128, 512]
inputb = [2000, 512]
fact = BatchmatmulFactory(inputa, inputb, False, True, (0, (2, 2), (1, 2)), (0, (2, 2), (2, 2)))
fact.grad_cmp()
def test_reid_batchmatmul_inputa_128_512_inputb_2000_512_redistribution():
inputa = [128, 512]
inputb = [2000, 512]
fact = BatchmatmulFactory(inputa, inputb, False, True, (0, (1, 2), (1, 2)), (0, (2, 2), (2, 2)))
fact.forward_cmp()
def test_reid_batchmatmul_grad_inputa_128_512_inputb_2000_512_redistribution():
inputa = [128, 512]
inputb = [2000, 512]
fact = BatchmatmulFactory(inputa, inputb, False, True, (0, (1, 2), (1, 2)), (0, (2, 2), (2, 2)))
fact.grad_cmp()
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
from numpy import allclose
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class Matmul(Cell):
def __init__(self, transpose_a=False, transpose_b=False, strategy0=None, strategy1=None):
super(Matmul, self).__init__()
self.add = P.TensorAdd(strategy=strategy1)
self.matmul = P.MatMul(transpose_a, transpose_b, strategy=strategy0)
def construct(self, x, w, z):
out = self.add(x, z)
return self.matmul(out, w)
class BatchMatMul(Cell):
def __init__(self, transpose_a=False, transpose_b=False, strategy0=None, strategy1=None):
super(BatchMatMul, self).__init__()
self.add = P.TensorAdd(strategy=strategy1)
self.batchmatmul = P.BatchMatMul(transpose_a, transpose_b, strategy=strategy0)
def construct(self, x, w, z):
out = self.add(x, z)
return self.batchmatmul(out, w)
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, inputa, inputb, inputz, output_grad):
gout = grad_all_with_sens(self.network)(inputa, inputb, inputz, output_grad)
return gout
class BatchmatmulFactory:
def __init__(self, inputa_shape, inputb_shape, transpose_a, transpose_b, strategy, strategy_):
self.strategy = strategy
self.strategy_ = strategy_
inputa_size = 1
inputb_size = 1
prefix = ""
for s in inputa_shape:
prefix = prefix + str(s) + "_"
inputa_size = inputa_size * s
prefix = prefix + "and"
for s in inputb_shape:
prefix = prefix + str(s) + "_"
inputb_size = inputb_size * s
number_rangea = min(1000, inputa_size)
number_rangeb = min(1000, inputb_size)
self.inputa = np.reshape(np.arange(0, inputa_size) % number_rangea - number_rangea / 2, inputa_shape).astype(
np.float32)
self.inputb = np.reshape(np.arange(0, inputb_size) % number_rangeb - number_rangeb / 2, inputb_shape).astype(
np.float32)
self.inputz = np.zeros(self.inputa.shape).astype(np.float32)
self.transpose_a = transpose_a
self.transpose_b = transpose_b
out_shape = []
device_matrix = []
out_strategy = []
if transpose_a:
temp = inputa_shape[-1]
inputa_shape[-1] = inputa_shape[-2]
inputa_shape[-2] = temp
if transpose_b:
temp = inputb_shape[-1]
inputb_shape[-1] = inputb_shape[-2]
inputb_shape[-2] = temp
if len(inputa_shape) >= len(inputb_shape):
out_shape = list(inputa_shape)
out_shape[-1] = inputb_shape[-1]
else:
out_shape = list(inputb_shape)
out_shape[-2] = inputa_shape[-2]
strategy1 = list(self.strategy[1])
strategy2 = list(self.strategy[2])
if transpose_a:
temp = strategy1[-1]
strategy1[-1] = strategy1[-2]
strategy1[-2] = temp
if transpose_b:
temp = strategy2[-1]
strategy2[-1] = strategy2[-2]
strategy2[-2] = temp
if len(strategy1) >= len(strategy2):
out_strategy = strategy1.copy()
out_strategy[-1] = strategy2[-1]
else:
out_strategy = strategy2.copy()
out_strategy[-2] = strategy1[-2]
device_matrix = out_strategy.copy()
device_matrix.insert(-1, strategy1[-1])
self.out_strategy = out_strategy
need_dev_num = 1
for s in device_matrix:
need_dev_num = need_dev_num * s
self.need_dev_num = need_dev_num
self.device_matrix = device_matrix
out_size = 1
for s in out_shape:
out_size = out_size * s
number_range = min(1000, out_size)
self.output_grad_np = np.reshape(np.arange(0, out_size) % number_range - number_range / 2, out_shape).astype(
np.float32)
device_index = self.id_to_list(device_id % need_dev_num, self.device_matrix)
x_index = device_index[:-1].copy()
if transpose_a:
temp = x_index[-1]
x_index[-1] = x_index[-2]
x_index[-2] = temp
y_index = device_index[:-3].copy()
y_index.append(device_index[-2])
y_index.append(device_index[-1])
if transpose_b:
temp = y_index[-1]
y_index[-1] = y_index[-2]
y_index[-2] = temp
out_index = device_index[:-2].copy()
out_index.append(device_index[-1])
print(device_matrix)
print(device_index)
need_dev_num_ = 1
for s in strategy_[1]:
need_dev_num_ = need_dev_num_ * s
self.x_id = device_id % need_dev_num_
self.y_id = self.list_to_id(y_index, self.strategy[2])
self.out_id = self.list_to_id(out_index, self.out_strategy)
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def id_to_list(self, id_, shape):
"""
shape: the upper bound of each dimension, e.g. (2, 4, 8)
"""
result = []
r = id_
for i in range(0, len(shape)):
v = 1
for j in range(i + 1, len(shape)):
v = v * shape[j]
result.append(r // v)
r = r % v
return result
def list_to_id(self, id_list, shape):
result = 0
for i in range(0, len(id_list)):
v = 1
for j in range(i + 1, len(id_list)):
v = v * shape[j]
result = result + id_list[i] * v
return result
def forward_mindspore_impl(self):
if len(self.inputa.shape) > 2:
matmul = BatchMatMul(self.transpose_a, self.transpose_b)
else:
matmul = Matmul(self.transpose_a, self.transpose_b)
matmul.set_train()
out_me = matmul(Tensor(self.inputa), Tensor(self.inputb), Tensor(self.inputz))
return out_me.asnumpy()
def forward_mindspore_parallel_impl(self):
if len(self.inputa.shape) > 2:
matmul = BatchMatMul(self.transpose_a, self.transpose_b, strategy0=self.strategy, strategy1=self.strategy_)
else:
matmul = Matmul(self.transpose_a, self.transpose_b, strategy0=self.strategy, strategy1=self.strategy_)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
x = Tensor(self.inputa)
y = Tensor(self.inputb)
z = Tensor(self.inputz)
xs = self.get_parallel_blocks(self.inputa, self.strategy_[1])
ys = self.get_parallel_blocks(self.inputb, self.strategy[2])
zs = self.get_parallel_blocks(self.inputz, self.strategy_[1])
x1 = Tensor(xs[self.x_id])
y1 = Tensor(ys[self.y_id])  # derived from the device matrix
z1 = Tensor(zs[self.x_id])
matmul.set_train()
matmul.set_auto_parallel()
out_me = matmul(x, y, z, parallel_inputs_compile=[x, y, z], parallel_inputs_run=[x1, y1, z1])
return out_me.asnumpy()
def grad_mindspore_impl(self):
x = Tensor(self.inputa)
y = Tensor(self.inputb)
z = Tensor(self.inputz)
if len(self.inputa.shape) > 2:
matmul = BatchMatMul(self.transpose_a, self.transpose_b)
else:
matmul = Matmul(self.transpose_a, self.transpose_b)
net_me = Grad(matmul)
net_me.set_train()
out_grad_me = Tensor(self.output_grad_np)
out_grad = net_me(x, y, z, out_grad_me)
return out_grad
def grad_mindspore_parallel_impl(self):
if len(self.inputa.shape) > 2:
matmul = BatchMatMul(self.transpose_a, self.transpose_b, strategy0=self.strategy, strategy1=self.strategy_)
else:
matmul = Matmul(self.transpose_a, self.transpose_b, strategy0=self.strategy, strategy1=self.strategy_)
x = Tensor(self.inputa)
y = Tensor(self.inputb)
z = Tensor(self.inputz)
out_grad_me = Tensor(self.output_grad_np)
xs = self.get_parallel_blocks(self.inputa, self.strategy_[1])
ys = self.get_parallel_blocks(self.inputb, self.strategy[2])
zs = self.get_parallel_blocks(self.inputz, self.strategy_[1])
out_grads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy)
x1 = Tensor(xs[self.x_id])  # derived from the device matrix
y1 = Tensor(ys[self.y_id])
z1 = Tensor(zs[self.x_id])
out_grad1 = Tensor(out_grads[self.out_id])
net_me = Grad(matmul)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net_me.set_auto_parallel()
net_me.set_train()
out_grad = net_me(x, y, z, out_grad_me, parallel_inputs_compile=[x, y, z, out_grad1],
parallel_inputs_run=[x1, y1, z1, out_grad1])
return out_grad
def forward_cmp(self):
out_mindspore = self.forward_mindspore_impl()
out_mindspores = self.get_parallel_blocks(out_mindspore, self.out_strategy)
out_mindspore_parallel = self.forward_mindspore_parallel_impl()
assert allclose(out_mindspores[self.out_id], out_mindspore_parallel, 0.0001, 0.0001)
def grad_cmp(self):
input_grad_mindspore = self.grad_mindspore_impl()
input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
input_grad_mindspores0 = self.get_parallel_blocks(input_grad_mindspore[0].asnumpy(), self.strategy_[1])
input_grad_mindspores1 = self.get_parallel_blocks(input_grad_mindspore[1].asnumpy(), self.strategy[2])
input_grad_mindspores2 = self.get_parallel_blocks(input_grad_mindspore[2].asnumpy(), self.strategy_[1])
assert allclose(input_grad_mindspores0[self.x_id], input_grad_mindspore_parallel[0].asnumpy(), 0.0001, 0.0001)
assert allclose(input_grad_mindspores1[self.y_id], input_grad_mindspore_parallel[1].asnumpy(), 0.0001, 0.0001)
assert allclose(input_grad_mindspores2[self.x_id], input_grad_mindspore_parallel[2].asnumpy(), 0.0001, 0.0001)
def test_reid_batchmatmul_inputa_128_512_inputb_2000_512():
inputa = [128, 512]
inputb = [2000, 512]
fact = BatchmatmulFactory(inputa, inputb, False, True, (0, (2, 2), (1, 2)), (0, (2, 2), (2, 2)))
fact.forward_cmp()
def test_reid_batchmatmul_grad_inputa_128_512_inputb_2000_512():
inputa = [128, 512]
inputb = [2000, 512]
fact = BatchmatmulFactory(inputa, inputb, False, True, (0, (2, 2), (1, 2)), (0, (2, 2), (2, 2)))
fact.grad_cmp()
def test_reid_batchmatmul_inputa_128_512_inputb_2000_512_redistribution():
inputa = [128, 512]
inputb = [2000, 512]
fact = BatchmatmulFactory(inputa, inputb, False, True, (0, (1, 2), (1, 2)), (0, (2, 2), (2, 2)))
fact.forward_cmp()
def test_reid_batchmatmul_grad_inputa_128_512_inputb_2000_512_redistribution():
inputa = [128, 512]
inputb = [2000, 512]
fact = BatchmatmulFactory(inputa, inputb, False, True, (0, (1, 2), (1, 2)), (0, (2, 2), (2, 2)))
fact.grad_cmp()
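
id_to_list and list_to_id above are mixed-radix conversions between a flat device id and its coordinates in the device matrix (the docstring's "upper bound of each dimension"). A standalone round-trip check under a hypothetical 2 x 2 x 2 device matrix:

def id_to_list(id_, shape):
    # Flat id -> coordinates, most-significant dimension first.
    result = []
    r = id_
    for i in range(len(shape)):
        v = 1
        for j in range(i + 1, len(shape)):
            v = v * shape[j]
        result.append(r // v)
        r = r % v
    return result

def list_to_id(id_list, shape):
    # Coordinates -> flat id, the inverse of id_to_list.
    result = 0
    for i in range(len(id_list)):
        v = 1
        for j in range(i + 1, len(id_list)):
            v = v * shape[j]
        result = result + id_list[i] * v
    return result

device_matrix = [2, 2, 2]            # e.g. an out_strategy of (2, 2) with strategy1[-1] == 2 inserted
for device_id in range(8):
    coords = id_to_list(device_id, device_matrix)
    assert list_to_id(coords, device_matrix) == device_id
print(id_to_list(6, device_matrix))  # -> [1, 1, 0]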

+213 -214 tests/ut/python/parallel/parallel_end_to_end/max/_test_max_parallel_4p.py

@@ -1,214 +1,213 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import os
import pytest
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, input1, input2, output_grad):
return grad_all_with_sens(self.network)(input1, input2, output_grad)
class Max(Cell):
def __init__(self, axis, keep_dims, strategy0=None, strategy1=None):
super(Max, self).__init__()
self.add = P.TensorAdd(strategy=strategy0)
self.reduce_max = P.ReduceMax(keep_dims=keep_dims).set_strategy(strategy=strategy1)
self.axis = axis
def construct(self, input1, input2):
out = self.add(input1, input2)
return self.reduce_max(out, self.axis)
class MaxFactory:
def __init__(self, input_shape, axis, keep_dims, strategy0, strategy1):
self.strategy0 = strategy0
self.strategy1 = strategy1
self.axis = axis
self.keep_dims = keep_dims
input_size = 1
prefix = ""
for s in input_shape:
prefix = prefix + str(s) + "_"
input_size = input_size * s
number_range = min(1000, input_size)
self.input_np1 = np.reshape(np.arange(0, input_size) % number_range - number_range / 2, input_shape).astype(
np.float32)
self.input_np2 = self.input_np1.copy()
self.out_grad_np = None
out_shape = list(input_shape)
out_shape.pop(axis)
out_size = input_size / input_shape[axis]
number_range_ = min(1000, out_size)
self.out_grad_np = np.reshape(np.arange(0, out_size) % number_range_ - number_range_ / 2, out_shape).astype(
np.float32)
out_strategy = list(strategy1[1])
out_strategy.pop(axis)
self.out_strategy = out_strategy
need_dev_num = 1
need_dev_num_ = 1
for s in strategy0[1]:
need_dev_num = need_dev_num * s
for s in out_strategy:
need_dev_num_ = need_dev_num_ * s
self.x_id = device_id % need_dev_num
self.y_id = device_id % need_dev_num
self.out_id = device_id % need_dev_num_
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def forward_mindspore_impl(self):
input1 = Tensor(self.input_np1)
input2 = Tensor(self.input_np2)
net = Max(axis=self.axis, keep_dims=self.keep_dims)
out = net(input1, input2)
return out.asnumpy()
def forward_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
xs = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
ys = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
x1 = Tensor(xs[self.x_id])
y1 = Tensor(ys[self.y_id])
net = Max(axis=self.axis, keep_dims=self.keep_dims, strategy0=self.strategy0, strategy1=self.strategy1)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
return out.asnumpy()
def grad_mindspore_impl(self):
input1 = Tensor(self.input_np1)
input2 = Tensor(self.input_np2)
out_grad = Tensor(self.out_grad_np)
net = Max(axis=self.axis, keep_dims=self.keep_dims)
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(input1, input2, out_grad)
return input_grad
def grad_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grads = self.get_parallel_blocks(self.out_grad_np, self.out_strategy)
out_grad = Tensor(output_grads[self.out_id])
xs = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
ys = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
x1 = Tensor(xs[self.x_id])
y1 = Tensor(ys[self.y_id])
net = Max(axis=self.axis, keep_dims=self.keep_dims, strategy0=self.strategy0, strategy1=self.strategy1)
grad_net = Grad(net)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
grad_net.set_auto_parallel()
grad_net.set_train()
input_grad = grad_net(x, y, out_grad, parallel_inputs_compile=[x, y, out_grad],
parallel_inputs_run=[x1, y1, out_grad])
return input_grad
def forward_cmp(self):
out_mindspore = self.forward_mindspore_impl()
out_mindspore_parallel = self.forward_mindspore_parallel_impl()
print(out_mindspore)
print(out_mindspore_parallel)
out_blocks = self.get_parallel_blocks(out_mindspore, self.out_strategy)
assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.001, 0.001)
def grad_cmp(self):
input_grad_mindspore = self.grad_mindspore_impl()
input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1])
input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2])
assert np.allclose(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
assert np.allclose(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001)
def test_reid_max_forward_input_256_64():
fact = MaxFactory(input_shape=(256, 64), axis=1, keep_dims=False, strategy0=(0, (4, 1), (4, 1)),
strategy1=(0, (4, 1)))
fact.forward_cmp()
def test_reid_max_grad_input_256_64():
fact = MaxFactory(input_shape=(256, 64), axis=1, keep_dims=False, strategy0=(0, (4, 1), (4, 1)),
strategy1=(0, (4, 1)))
fact.grad_cmp()
def test_reid_max_forward_input_128_64_32_32():
fact = MaxFactory(input_shape=(128, 64, 32, 32), axis=3, keep_dims=False, strategy0=(0, (2, 1, 2, 1), (2, 1, 2, 1)),
strategy1=(0, (2, 1, 2, 1)))
fact.forward_cmp()
def test_reid_max_grad_input_128_64_32_32():
fact = MaxFactory(input_shape=(128, 64, 32, 32), axis=3, keep_dims=False, strategy0=(0, (2, 1, 2, 1), (2, 1, 2, 1)),
strategy1=(0, (2, 1, 2, 1)))
fact.grad_cmp()
def test_reid_max_forward_input_256_64_repeat():
fact = MaxFactory(input_shape=(256, 64), axis=1, keep_dims=False, strategy0=(0, (2, 1), (2, 1)),
strategy1=(0, (2, 1)))
fact.forward_cmp()
def test_reid_max_grad_input_256_64_repeat():
fact = MaxFactory(input_shape=(256, 64), axis=1, keep_dims=False, strategy0=(0, (2, 1), (2, 1)),
strategy1=(0, (2, 1)))
fact.grad_cmp()
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import numpy as np

import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens

device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"


def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")


def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")


class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network

def construct(self, input1, input2, output_grad):
return grad_all_with_sens(self.network)(input1, input2, output_grad)


class Max(Cell):
def __init__(self, axis, keep_dims, strategy0=None, strategy1=None):
super(Max, self).__init__()
self.add = P.TensorAdd(strategy=strategy0)
self.reduce_max = P.ReduceMax(keep_dims=keep_dims).set_strategy(strategy=strategy1)
self.axis = axis

def construct(self, input1, input2):
out = self.add(input1, input2)
return self.reduce_max(out, self.axis)


class MaxFactory:
def __init__(self, input_shape, axis, keep_dims, strategy0, strategy1):
self.strategy0 = strategy0
self.strategy1 = strategy1
self.axis = axis
self.keep_dims = keep_dims
input_size = 1
prefix = ""
for s in input_shape:
prefix = prefix + str(s) + "_"
input_size = input_size * s
number_range = min(1000, input_size)
self.input_np1 = np.reshape(np.arange(0, input_size) % number_range - number_range / 2, input_shape).astype(
np.float32)
self.input_np2 = self.input_np1.copy()
self.out_grad_np = None
out_shape = list(input_shape)
out_shape.pop(axis)
out_size = input_size / input_shape[axis]
number_range_ = min(1000, out_size)
self.out_grad_np = np.reshape(np.arange(0, out_size) % number_range_ - number_range_ / 2, out_shape).astype(
np.float32)
out_strategy = list(strategy1[1])
out_strategy.pop(axis)
self.out_strategy = out_strategy
need_dev_num = 1
need_dev_num_ = 1
for s in strategy0[1]:
need_dev_num = need_dev_num * s
for s in out_strategy:
need_dev_num_ = need_dev_num_ * s
self.x_id = device_id % need_dev_num
self.y_id = device_id % need_dev_num
self.out_id = device_id % need_dev_num_

def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks

def forward_mindspore_impl(self):
input1 = Tensor(self.input_np1)
input2 = Tensor(self.input_np2)
net = Max(axis=self.axis, keep_dims=self.keep_dims)
out = net(input1, input2)
return out.asnumpy()

def forward_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
xs = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
ys = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
x1 = Tensor(xs[self.x_id])
y1 = Tensor(ys[self.y_id])
net = Max(axis=self.axis, keep_dims=self.keep_dims, strategy0=self.strategy0, strategy1=self.strategy1)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
return out.asnumpy()

def grad_mindspore_impl(self):
input1 = Tensor(self.input_np1)
input2 = Tensor(self.input_np2)
out_grad = Tensor(self.out_grad_np)
net = Max(axis=self.axis, keep_dims=self.keep_dims)
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(input1, input2, out_grad)
return input_grad

def grad_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grads = self.get_parallel_blocks(self.out_grad_np, self.out_strategy)
out_grad = Tensor(output_grads[self.out_id])
xs = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
ys = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
x1 = Tensor(xs[self.x_id])
y1 = Tensor(ys[self.y_id])
net = Max(axis=self.axis, keep_dims=self.keep_dims, strategy0=self.strategy0, strategy1=self.strategy1)
grad_net = Grad(net)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
grad_net.set_auto_parallel()
grad_net.set_train()
input_grad = grad_net(x, y, out_grad, parallel_inputs_compile=[x, y, out_grad],
parallel_inputs_run=[x1, y1, out_grad])
return input_grad

def forward_cmp(self):
out_mindspore = self.forward_mindspore_impl()
out_mindspore_parallel = self.forward_mindspore_parallel_impl()
print(out_mindspore)
print(out_mindspore_parallel)
out_blocks = self.get_parallel_blocks(out_mindspore, self.out_strategy)
assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.001, 0.001)

def grad_cmp(self):
input_grad_mindspore = self.grad_mindspore_impl()
input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1])
input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2])
assert np.allclose(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
assert np.allclose(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001)


def test_reid_max_forward_input_256_64():
fact = MaxFactory(input_shape=(256, 64), axis=1, keep_dims=False, strategy0=(0, (4, 1), (4, 1)),
strategy1=(0, (4, 1)))
fact.forward_cmp()


def test_reid_max_grad_input_256_64():
fact = MaxFactory(input_shape=(256, 64), axis=1, keep_dims=False, strategy0=(0, (4, 1), (4, 1)),
strategy1=(0, (4, 1)))
fact.grad_cmp()


def test_reid_max_forward_input_128_64_32_32():
fact = MaxFactory(input_shape=(128, 64, 32, 32), axis=3, keep_dims=False, strategy0=(0, (2, 1, 2, 1), (2, 1, 2, 1)),
strategy1=(0, (2, 1, 2, 1)))
fact.forward_cmp()


def test_reid_max_grad_input_128_64_32_32():
fact = MaxFactory(input_shape=(128, 64, 32, 32), axis=3, keep_dims=False, strategy0=(0, (2, 1, 2, 1), (2, 1, 2, 1)),
strategy1=(0, (2, 1, 2, 1)))
fact.grad_cmp()


def test_reid_max_forward_input_256_64_repeat():
fact = MaxFactory(input_shape=(256, 64), axis=1, keep_dims=False, strategy0=(0, (2, 1), (2, 1)),
strategy1=(0, (2, 1)))
fact.forward_cmp()


def test_reid_max_grad_input_256_64_repeat():
fact = MaxFactory(input_shape=(256, 64), axis=1, keep_dims=False, strategy0=(0, (2, 1), (2, 1)),
strategy1=(0, (2, 1)))
fact.grad_cmp()
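
A minimal NumPy-only sketch of the block layout these Max tests rely on: it mirrors get_parallel_blocks and the x_id arithmetic for the (256, 64) input with strategy (4, 1); the device_id value here is a hypothetical rank (the tests read it from RANK_ID).

import numpy as np

def get_parallel_blocks(input_, strategy):
    # Split along axis 0, 1, ... into strategy[i] equal pieces, one block per
    # logical device in row-major device order (same ordering as the tests use).
    blocks = [input_]
    for i, stra in enumerate(strategy):
        temp = []
        while blocks:
            temp.extend(np.split(blocks.pop(0), stra, axis=i))
        blocks = temp
    return blocks

full = np.arange(256 * 64, dtype=np.float32).reshape(256, 64)
blocks = get_parallel_blocks(full, (4, 1))   # shard dim 0 across 4 devices, keep dim 1 whole
device_id = 3                                # hypothetical rank
x_id = device_id % len(blocks)               # need_dev_num = 4 * 1 = 4
assert blocks[x_id].shape == (64, 64)        # each device holds one (64, 64) slice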

+200 -201  tests/ut/python/parallel/parallel_end_to_end/mul_softmax/need_fix_test_mul_softmax_parallel_4p.py

@@ -1,201 +1,200 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import os
import pytest
from numpy import allclose
import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class MulSoftmax(Cell):
def __init__(self, strategy0=None, strategy1=None, axis=0):
super(MulSoftmax, self).__init__()
self.mul = P.Mul(strategy=strategy0)
self.softmax = P.Softmax(axis=axis, strategy=strategy1)
def construct(self, x, z):
out = self.mul(x, z)
return self.softmax(out)
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, x, y, output_grad):
return grad_all_with_sens(self.network)(x, y, output_grad)
class MulSoftmaxFactory:
def __init__(self, input_shape, strategy0, strategy1):
prefix = ""
size = 1
for s in input_shape:
prefix = prefix + str(s)
size = size * s
self.prefix = prefix
number_range = min(1000, size)
self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype(
np.float32)
self.input_np2 = 1.0
self.output_grad_np = np.reshape((np.arange(0, size) % (number_range - 10) - number_range / 2) * 0.1,
input_shape).astype(np.float32)
self.strategy0 = strategy0
self.strategy1 = strategy1
need_dev_num = 1
need_dev_num_ = 1
for s in strategy0[1]:
need_dev_num = need_dev_num * s
for s in strategy1[1]:
need_dev_num_ = need_dev_num_ * s
self.x_id = device_id % need_dev_num
self.y_id = device_id % need_dev_num
self.out_id = device_id % need_dev_num_
def forward_mindspore_impl(self):
net = MulSoftmax()
x = Tensor(self.input_np1)
y = Tensor(self.input_np2, ms.float32)
out = net(x, y)
return out.asnumpy()
def forward_mindspore_parallel_impl(self):
net = MulSoftmax(strategy0=self.strategy0, strategy1=self.strategy1)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
x = Tensor(self.input_np1)
y = Tensor(self.input_np2, ms.float32)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(self.input_np2, ms.float32)
out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
return out.asnumpy()
def grad_mindspore_impl(self):
output_grad = Tensor(self.output_grad_np)
x = Tensor(self.input_np1)
y = Tensor(self.input_np2, ms.float32)
net = MulSoftmax()
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, output_grad)
return input_grad
def grad_mindspore_parallel_impl(self):
output_grads = self.get_parallel_blocks(self.output_grad_np, self.strategy1[1])
output_grad = Tensor(output_grads[self.out_id])
x = Tensor(self.input_np1)
y = Tensor(self.input_np2, ms.float32)
net = MulSoftmax(strategy0=self.strategy0, strategy1=self.strategy1)
grad_net = Grad(net)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
grad_net.set_train()
grad_net.set_auto_parallel()
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(self.input_np2, ms.float32)
input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad],
parallel_inputs_run=[x1, y1, output_grad])
return input_grad
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def forward_cmp(self):
out_mindspore = self.forward_mindspore_impl()
out_mindspore_parallel = self.forward_mindspore_parallel_impl()
np.save(path + str(device_id) + "_" + self.prefix + "_forward_parallel.npy", out_mindspore_parallel)
out_blocks = self.get_parallel_blocks(out_mindspore, self.strategy1[1])
assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.0001, 0.001)
def grad_cmp(self):
input_grad_mindspore = self.grad_mindspore_impl()
input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
np.save(path + str(device_id) + "_" + self.prefix + "_grad_parallel0.npy", input_grad_mindspore_parallel0)
np.save(path + str(device_id) + "_" + self.prefix + "_grad_parallel1.npy", input_grad_mindspore_parallel1)
input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0,
self.strategy0[1])  # for TensorMul, input X1 is not broadcast while X2 (the scalar) is broadcast
assert np.allclose(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
assert np.allclose(input_grad_mindspore1, input_grad_mindspore_parallel1, 0.0001, 0.0001)
@pytest.mark.reid_forward
def test_reid_mul_softmax_input_128x64():
stra0 = (0, (1, 4), ())
stra1 = (0, (1, 4))
fact = MulSoftmaxFactory(input_shape=(128, 64), strategy0=stra0, strategy1=stra1)
fact.forward_cmp()
@pytest.mark.reid_grad
def test_reid_grad_mul_softmax_input_128x64():
stra0 = (0, (1, 4), ())
stra1 = (0, (1, 4))
fact = MulSoftmaxFactory(input_shape=(128, 64), strategy0=stra0, strategy1=stra1)
fact.grad_cmp()
@pytest.mark.reid_forward
def test_reid_mul_softmax_input_128x64_all_to_all():
stra0 = (0, (4, 1), ())
stra1 = (0, (1, 4))
fact = MulSoftmaxFactory(input_shape=(128, 64), strategy0=stra0, strategy1=stra1)
fact.forward_cmp()
@pytest.mark.reid_grad
def test_reid_grad_mul_softmax_input_128x64_all_to_all():
stra0 = (0, (4, 1), ())
stra1 = (0, (1, 4))
fact = MulSoftmaxFactory(input_shape=(128, 64), strategy0=stra0, strategy1=stra1)
fact.grad_cmp()
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import numpy as np
import pytest

import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens

device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"


def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")


def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")


class MulSoftmax(Cell):
def __init__(self, strategy0=None, strategy1=None, axis=0):
super(MulSoftmax, self).__init__()
self.mul = P.Mul(strategy=strategy0)
self.softmax = P.Softmax(axis=axis, strategy=strategy1)

def construct(self, x, z):
out = self.mul(x, z)
return self.softmax(out)


class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network

def construct(self, x, y, output_grad):
return grad_all_with_sens(self.network)(x, y, output_grad)


class MulSoftmaxFactory:
def __init__(self, input_shape, strategy0, strategy1):
prefix = ""
size = 1
for s in input_shape:
prefix = prefix + str(s)
size = size * s
self.prefix = prefix
number_range = min(1000, size)
self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype(
np.float32)
self.input_np2 = 1.0
self.output_grad_np = np.reshape((np.arange(0, size) % (number_range - 10) - number_range / 2) * 0.1,
input_shape).astype(np.float32)
self.strategy0 = strategy0
self.strategy1 = strategy1
need_dev_num = 1
need_dev_num_ = 1
for s in strategy0[1]:
need_dev_num = need_dev_num * s
for s in strategy1[1]:
need_dev_num_ = need_dev_num_ * s
self.x_id = device_id % need_dev_num
self.y_id = device_id % need_dev_num
self.out_id = device_id % need_dev_num_

def forward_mindspore_impl(self):
net = MulSoftmax()
x = Tensor(self.input_np1)
y = Tensor(self.input_np2, ms.float32)
out = net(x, y)
return out.asnumpy()

def forward_mindspore_parallel_impl(self):
net = MulSoftmax(strategy0=self.strategy0, strategy1=self.strategy1)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
x = Tensor(self.input_np1)
y = Tensor(self.input_np2, ms.float32)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(self.input_np2, ms.float32)
out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
return out.asnumpy()

def grad_mindspore_impl(self):
output_grad = Tensor(self.output_grad_np)
x = Tensor(self.input_np1)
y = Tensor(self.input_np2, ms.float32)
net = MulSoftmax()
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, output_grad)
return input_grad

def grad_mindspore_parallel_impl(self):
output_grads = self.get_parallel_blocks(self.output_grad_np, self.strategy1[1])
output_grad = Tensor(output_grads[self.out_id])
x = Tensor(self.input_np1)
y = Tensor(self.input_np2, ms.float32)
net = MulSoftmax(strategy0=self.strategy0, strategy1=self.strategy1)
grad_net = Grad(net)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
grad_net.set_train()
grad_net.set_auto_parallel()
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(self.input_np2, ms.float32)
input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad],
parallel_inputs_run=[x1, y1, output_grad])
return input_grad

def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks

def forward_cmp(self):
out_mindspore = self.forward_mindspore_impl()
out_mindspore_parallel = self.forward_mindspore_parallel_impl()
np.save(path + str(device_id) + "_" + self.prefix + "_forward_parallel.npy", out_mindspore_parallel)
out_blocks = self.get_parallel_blocks(out_mindspore, self.strategy1[1])
assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.0001, 0.001)

def grad_cmp(self):
input_grad_mindspore = self.grad_mindspore_impl()
input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
np.save(path + str(device_id) + "_" + self.prefix + "_grad_parallel0.npy", input_grad_mindspore_parallel0)
np.save(path + str(device_id) + "_" + self.prefix + "_grad_parallel1.npy", input_grad_mindspore_parallel1)
input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0,
self.strategy0[1])  # for TensorMul, input X1 is not broadcast while X2 (the scalar) is broadcast
assert np.allclose(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
assert np.allclose(input_grad_mindspore1, input_grad_mindspore_parallel1, 0.0001, 0.0001)


@pytest.mark.reid_forward
def test_reid_mul_softmax_input_128x64():
stra0 = (0, (1, 4), ())
stra1 = (0, (1, 4))
fact = MulSoftmaxFactory(input_shape=(128, 64), strategy0=stra0, strategy1=stra1)
fact.forward_cmp()


@pytest.mark.reid_grad
def test_reid_grad_mul_softmax_input_128x64():
stra0 = (0, (1, 4), ())
stra1 = (0, (1, 4))
fact = MulSoftmaxFactory(input_shape=(128, 64), strategy0=stra0, strategy1=stra1)
fact.grad_cmp()


@pytest.mark.reid_forward
def test_reid_mul_softmax_input_128x64_all_to_all():
stra0 = (0, (4, 1), ())
stra1 = (0, (1, 4))
fact = MulSoftmaxFactory(input_shape=(128, 64), strategy0=stra0, strategy1=stra1)
fact.forward_cmp()


@pytest.mark.reid_grad
def test_reid_grad_mul_softmax_input_128x64_all_to_all():
stra0 = (0, (4, 1), ())
stra1 = (0, (1, 4))
fact = MulSoftmaxFactory(input_shape=(128, 64), strategy0=stra0, strategy1=stra1)
fact.grad_cmp()
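
A NumPy-only sketch of the forward reference these MulSoftmax tests check, assuming the default axis=0 softmax and the scalar second input 1.0; the column slicing follows strategy1[1] = (1, 4), and device_id is a hypothetical rank.

import numpy as np

def softmax(x, axis=0):
    e = np.exp(x - x.max(axis=axis, keepdims=True))  # numerically stabilised softmax
    return e / e.sum(axis=axis, keepdims=True)

x = (np.arange(128 * 64) % 1000 - 500).reshape(128, 64).astype(np.float32)
out = softmax(x * 1.0, axis=0)            # multiply by the scalar y, then softmax over dim 0
column_blocks = np.split(out, 4, axis=1)  # strategy1[1] = (1, 4): dim 0 whole, dim 1 in 4 parts
device_id = 2                             # hypothetical rank
local = column_blocks[device_id % 4]      # what out_blocks[self.out_id] selects in forward_cmp
assert local.shape == (128, 16)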

+147 -149  tests/ut/python/parallel/parallel_end_to_end/onehot/_test_onehot_parallel_4p.py

@@ -1,149 +1,147 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import os
import pytest
from numpy import allclose
import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class Onehot(Cell):
def __init__(self, axis=-1, depth=1, on_value=1.0, off_value=0.0, strategy=None):
super(Onehot, self).__init__()
self.onehot = P.OneHot(axis, strategy=strategy)
self.depth = depth
self.on_value = Tensor(on_value, ms.float32)
self.off_value = Tensor(off_value, ms.float32)
def construct(self, indices):
return self.onehot(indices, self.depth, self.on_value, self.off_value)
class OneHotFactory:
def __init__(self, input_shape, depth, on_value=1.0, off_value=0.0, axis=None, dtype=None, strategy0=None):
size = 1
prefix = ""
for s in input_shape:
prefix = prefix + str(s)
size = size * s
self.prefix = prefix
number_range = min(10, size)
self.input_np = np.reshape(np.arange(0, size) % number_range, input_shape).astype(np.int32)
self.depth = depth
self.on_value = on_value
self.off_value = off_value
self.axis = axis
self.dtype = dtype
self.strategy0 = strategy0
need_dev_num = 1
for s in strategy0[1]:
need_dev_num = need_dev_num * s
self.x_id = device_id % need_dev_num
self.out_id = device_id % need_dev_num
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def grad_mindspore_impl(self):
output_grad = Tensor(self.output_grad_np)
x = Tensor(self.input_np1)
y = Tensor(self.input_np2, ms.float32)
net = AddRelu()
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, output_grad)
return input_grad
def forward_mindspore_impl(self):
indices = Tensor(self.input_np)
net = Onehot(axis=self.axis,
depth=self.depth,
on_value=self.on_value,
off_value=self.off_value)
out = net(indices)
return out.asnumpy()
def forward_mindspore_parallel_impl(self):
x = Tensor(self.input_np)
inputs_x = self.get_parallel_blocks(self.input_np, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
net = Onehot(axis=self.axis,
depth=self.depth,
on_value=self.on_value,
off_value=self.off_value, strategy=self.strategy0)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
out = net(x, parallel_inputs_compile=[x], parallel_inputs_run=[x1])
return out.asnumpy()
def forward_cmp(self):
out_mindspore = self.forward_mindspore_impl()
out_mindspore_parallel = self.forward_mindspore_parallel_impl()
out_blocks = self.get_parallel_blocks(out_mindspore, self.strategy0[1])
assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.0001, 0.0001)
def test_reid_onehot_forward_int32_128_depth13000():
fact = OneHotFactory(input_shape=(128,),
depth=131072,
on_value=1.000000,
off_value=0.000000,
axis=-1,
dtype="float32",
strategy0=(0, (2,)))
fact.forward_cmp()
def test_reid_onehot_forward_int32_131072_depth127():
fact = OneHotFactory(input_shape=(131072,),
depth=127,
on_value=1.000000,
off_value=0.000000,
axis=-1,
dtype="float32",
strategy0=(0, (4,)))
fact.forward_cmp()
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import numpy as np

import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P

device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"


def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")


def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")


class Onehot(Cell):
def __init__(self, axis=-1, depth=1, on_value=1.0, off_value=0.0, strategy=None):
super(Onehot, self).__init__()
self.onehot = P.OneHot(axis, strategy=strategy)
self.depth = depth
self.on_value = Tensor(on_value, ms.float32)
self.off_value = Tensor(off_value, ms.float32)

def construct(self, indices):
return self.onehot(indices, self.depth, self.on_value, self.off_value)


class OneHotFactory:
def __init__(self, input_shape, depth, on_value=1.0, off_value=0.0, axis=None, dtype=None, strategy0=None):
size = 1
prefix = ""
for s in input_shape:
prefix = prefix + str(s)
size = size * s
self.prefix = prefix
number_range = min(10, size)
self.input_np = np.reshape(np.arange(0, size) % number_range, input_shape).astype(np.int32)
self.depth = depth
self.on_value = on_value
self.off_value = off_value
self.axis = axis
self.dtype = dtype
self.strategy0 = strategy0
need_dev_num = 1
for s in strategy0[1]:
need_dev_num = need_dev_num * s
self.x_id = device_id % need_dev_num
self.out_id = device_id % need_dev_num

def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks

# NOTE: this method looks like an unused copy-paste leftover from the AddRelu test;
# AddRelu and the input_np1/input_np2/output_grad_np attributes it uses are not defined in this file.
def grad_mindspore_impl(self):
output_grad = Tensor(self.output_grad_np)
x = Tensor(self.input_np1)
y = Tensor(self.input_np2, ms.float32)
net = AddRelu()
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, output_grad)
return input_grad

def forward_mindspore_impl(self):
indices = Tensor(self.input_np)
net = Onehot(axis=self.axis,
depth=self.depth,
on_value=self.on_value,
off_value=self.off_value)
out = net(indices)
return out.asnumpy()

def forward_mindspore_parallel_impl(self):
x = Tensor(self.input_np)
inputs_x = self.get_parallel_blocks(self.input_np, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
net = Onehot(axis=self.axis,
depth=self.depth,
on_value=self.on_value,
off_value=self.off_value, strategy=self.strategy0)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
out = net(x, parallel_inputs_compile=[x], parallel_inputs_run=[x1])
return out.asnumpy()

def forward_cmp(self):
out_mindspore = self.forward_mindspore_impl()
out_mindspore_parallel = self.forward_mindspore_parallel_impl()
out_blocks = self.get_parallel_blocks(out_mindspore, self.strategy0[1])
assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.0001, 0.0001)


def test_reid_onehot_forward_int32_128_depth13000():
fact = OneHotFactory(input_shape=(128,),
depth=131072,
on_value=1.000000,
off_value=0.000000,
axis=-1,
dtype="float32",
strategy0=(0, (2,)))
fact.forward_cmp()


def test_reid_onehot_forward_int32_131072_depth127():
fact = OneHotFactory(input_shape=(131072,),
depth=127,
on_value=1.000000,
off_value=0.000000,
axis=-1,
dtype="float32",
strategy0=(0, (4,)))
fact.forward_cmp()
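
A NumPy-only sketch of the one-hot check above, with a deliberately small depth (the tests use 131072 and 127); the one_hot helper and the device_id value are illustrative assumptions, not MindSpore APIs.

import numpy as np

def one_hot(indices, depth, on_value=1.0, off_value=0.0):
    out = np.full(indices.shape + (depth,), off_value, dtype=np.float32)
    out[np.arange(indices.size), indices] = on_value
    return out

indices = (np.arange(128) % 10).astype(np.int32)   # same value pattern as self.input_np
depth = 16                                         # shrunk for the sketch
full = one_hot(indices, depth)
index_blocks = np.split(indices, 4)                # strategy (4,): shard the only dim 4 ways
out_blocks = np.split(full, 4, axis=0)             # the output keeps the same split on dim 0
device_id = 1                                      # hypothetical rank
assert np.array_equal(out_blocks[device_id % 4],
                      one_hot(index_blocks[device_id % 4], depth))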

+206 -206  tests/ut/python/parallel/parallel_end_to_end/prelu/_test_prelu_parallel_4p.py

@@ -1,206 +1,206 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import os
import pytest
from numpy import allclose
import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class PReLU(Cell):
def __init__(self, channel=1, w=0.25, strategy_=None, strategy1_=None):
super(PReLU, self).__init__()
self.add = P.TensorAdd(strategy=strategy1_)
self.prelu = P.PReLU(strategy=strategy_)
def construct(self, x, z, w):
out = self.add(x, z)
return self.prelu(out, w)
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, input, z, w, output_grad):
return grad_all_with_sens(self.network)(input, z, w, output_grad)
class PReLUFactory:
def __init__(self, input_shape, strategy):
n, c = input_shape[:2]
prefix = ""
size = 1
for s in input_shape:
prefix = prefix + str(s)
size = size * s
self.prefix = prefix
number_range = min(1000, size)
self.input_np = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype(np.float32)
self.output_grad_np = np.reshape((np.arange(0, size) % (number_range - 10) - number_range / 2) * 0.1,
input_shape).astype(np.float32)
self.channel = c
self.weight = np.array([np.float32(0.25)] * c)
self.strategy = strategy
def forward_mindspore_impl(self):
net = PReLU(channel=self.channel, w=self.weight)
x = Tensor(self.input_np)
z = Tensor(np.zeros(self.input_np.shape), ms.float32)
w = Tensor(self.weight)
out = net(x, z, w)
return out.asnumpy()
def forward_mindspore_parallel_impl(self):
net = PReLU(channel=self.channel, w=self.weight, strategy_=self.strategy,
strategy1_=(self.strategy[0], self.strategy[1], self.strategy[1]))
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
x = Tensor(self.input_np)
z = Tensor(np.zeros(self.input_np.shape), ms.float32)
w = Tensor(self.weight)
inputs = self.get_parallel_blocks(self.input_np, self.strategy[1])
block_id = device_id % len(inputs)
x1 = Tensor(inputs[block_id])
z1 = Tensor(np.zeros(inputs[block_id].shape), ms.float32)
w1 = Tensor(self.weight)
out = net(x, z, w, parallel_inputs_compile=[x, z, w], parallel_inputs_run=[x1, z1, w1])
return out.asnumpy()
def grad_mindspore_impl(self):
output_grad = Tensor(self.output_grad_np)
x = Tensor(self.input_np)
z = Tensor(np.zeros(self.input_np.shape), ms.float32)
w = Tensor(self.weight)
net = PReLU(channel=self.channel, w=self.weight)
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, z, w, output_grad)
return input_grad
def grad_mindspore_parallel_impl(self):
output_grads = self.get_parallel_blocks(self.output_grad_np, self.strategy[1])
block_id = device_id % len(output_grads)
output_grad = Tensor(output_grads[block_id])
x = Tensor(self.input_np)
z = Tensor(np.zeros(self.input_np.shape), ms.float32)
w = Tensor(self.weight)
net = PReLU(channel=self.channel, w=self.weight, strategy_=self.strategy,
strategy1_=(self.strategy[0], self.strategy[1], self.strategy[1]))
grad_net = Grad(net)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
grad_net.set_auto_parallel()
grad_net.set_train()
inputs = self.get_parallel_blocks(self.input_np, self.strategy[1])
x1 = Tensor(inputs[block_id])
z1 = Tensor(np.zeros(inputs[block_id].shape), ms.float32)
w1 = Tensor(self.weight)
input_grad = grad_net(x, z, w, output_grad, parallel_inputs_compile=[x, z, w, output_grad],
parallel_inputs_run=[x1, z1, w1, output_grad])
return input_grad
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def forward_cmp(self):
out_mindspore = self.forward_mindspore_impl()
out_mindspore_parallel = self.forward_mindspore_parallel_impl()
out_blocks = self.get_parallel_blocks(out_mindspore, self.strategy[1])
block_id = device_id % len(out_blocks)
assert np.allclose(out_blocks[block_id], out_mindspore_parallel, 0.0001, 0.001)
def grad_cmp(self):
input_grad_mindspore = self.grad_mindspore_impl()
input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
input_grad_mindspore2 = input_grad_mindspore[2].asnumpy()
input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
input_grad_mindspore_parallel2 = input_grad_mindspore_parallel[2].asnumpy()
input_grad_blocks = self.get_parallel_blocks(input_grad_mindspore0, self.strategy[1])
input1_grad_blocks = self.get_parallel_blocks(input_grad_mindspore1, self.strategy[1])
block_id = device_id % len(input_grad_blocks)
assert np.allclose(input_grad_blocks[block_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
assert np.allclose(input_grad_mindspore2, input_grad_mindspore_parallel2, 0.0001, 0.0001)
assert np.allclose(input1_grad_blocks[block_id], input_grad_mindspore_parallel1, 0.0001, 0.0001)
@pytest.mark.reid_grad
def test_reid_prelu_input_128x64x112x112_repeat():
stra = (0, (1, 1, 2, 1), (1))
fact = PReLUFactory(input_shape=(128, 64, 112, 112), strategy=stra)
fact.forward_cmp()
@pytest.mark.reid_grad
def test_reid_grad_prelu_input_128x64x112x112_repeat():
stra = (0, (1, 1, 2, 1), (1))
fact = PReLUFactory(input_shape=(128, 64, 112, 112), strategy=stra)
fact.grad_cmp()
@pytest.mark.reid_grad
def test_reid_prelu_input_128x64x112x112_mix():
stra = (0, (2, 1, 1, 2), (1))
fact = PReLUFactory(input_shape=(128, 64, 112, 112), strategy=stra)
fact.forward_cmp()
@pytest.mark.reid_grad
def test_reid_grad_prelu_input_128x64x112x112_mix():
stra = (0, (2, 1, 1, 2), (1))
fact = PReLUFactory(input_shape=(128, 64, 112, 112), strategy=stra)
fact.grad_cmp()
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import pytest
import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class PReLU(Cell):
def __init__(self, channel=1, w=0.25, strategy_=None, strategy1_=None):
super(PReLU, self).__init__()
self.add = P.TensorAdd(strategy=strategy1_)
self.prelu = P.PReLU(strategy=strategy_)
self.channel = channel
def construct(self, x, z, w):
out = self.add(x, z)
return self.prelu(out, w)
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, input_, z, w, output_grad):
return grad_all_with_sens(self.network)(input_, z, w, output_grad)
class PReLUFactory:
def __init__(self, input_shape, strategy):
n, c = input_shape[:2]
prefix = ""
size = 1
for s in input_shape:
prefix = prefix + str(s)
size = size * s
self.prefix = prefix
number_range = min(1000, size)
self.input_np = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype(np.float32)
self.output_grad_np = np.reshape((np.arange(0, size) % (number_range - 10) - number_range / 2) * 0.1,
input_shape).astype(np.float32)
self.channel = c
self.weight = np.array([np.float32(0.25)] * c)
self.strategy = strategy
def forward_mindspore_impl(self):
net = PReLU(channel=self.channel, w=self.weight)
x = Tensor(self.input_np)
z = Tensor(np.zeros(self.input_np.shape), ms.float32)
w = Tensor(self.weight)
out = net(x, z, w)
return out.asnumpy()
def forward_mindspore_parallel_impl(self):
net = PReLU(channel=self.channel, w=self.weight, strategy_=self.strategy,
strategy1_=(self.strategy[0], self.strategy[1], self.strategy[1]))
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
x = Tensor(self.input_np)
z = Tensor(np.zeros(self.input_np.shape), ms.float32)
w = Tensor(self.weight)
inputs = self.get_parallel_blocks(self.input_np, self.strategy[1])
block_id = device_id % len(inputs)
x1 = Tensor(inputs[block_id])
z1 = Tensor(np.zeros(inputs[block_id].shape), ms.float32)
w1 = Tensor(self.weight)
out = net(x, z, w, parallel_inputs_compile=[x, z, w], parallel_inputs_run=[x1, z1, w1])
return out.asnumpy()
def grad_mindspore_impl(self):
output_grad = Tensor(self.output_grad_np)
x = Tensor(self.input_np)
z = Tensor(np.zeros(self.input_np.shape), ms.float32)
w = Tensor(self.weight)
net = PReLU(channel=self.channel, w=self.weight)
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, z, w, output_grad)
return input_grad
def grad_mindspore_parallel_impl(self):
output_grads = self.get_parallel_blocks(self.output_grad_np, self.strategy[1])
block_id = device_id % len(output_grads)
output_grad = Tensor(output_grads[block_id])
x = Tensor(self.input_np)
z = Tensor(np.zeros(self.input_np.shape), ms.float32)
w = Tensor(self.weight)
net = PReLU(channel=self.channel, w=self.weight, strategy_=self.strategy,
strategy1_=(self.strategy[0], self.strategy[1], self.strategy[1]))
grad_net = Grad(net)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
grad_net.set_auto_parallel()
grad_net.set_train()
inputs = self.get_parallel_blocks(self.input_np, self.strategy[1])
x1 = Tensor(inputs[block_id])
z1 = Tensor(np.zeros(inputs[block_id].shape), ms.float32)
w1 = Tensor(self.weight)
input_grad = grad_net(x, z, w, output_grad, parallel_inputs_compile=[x, z, w, output_grad],
parallel_inputs_run=[x1, z1, w1, output_grad])
return input_grad
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def forward_cmp(self):
out_mindspore = self.forward_mindspore_impl()
out_mindspore_parallel = self.forward_mindspore_parallel_impl()
out_blocks = self.get_parallel_blocks(out_mindspore, self.strategy[1])
block_id = device_id % len(out_blocks)
assert np.allclose(out_blocks[block_id], out_mindspore_parallel, 0.0001, 0.001)
def grad_cmp(self):
input_grad_mindspore = self.grad_mindspore_impl()
input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
input_grad_mindspore2 = input_grad_mindspore[2].asnumpy()
input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
input_grad_mindspore_parallel2 = input_grad_mindspore_parallel[2].asnumpy()
input_grad_blocks = self.get_parallel_blocks(input_grad_mindspore0, self.strategy[1])
input1_grad_blocks = self.get_parallel_blocks(input_grad_mindspore1, self.strategy[1])
block_id = device_id % len(input_grad_blocks)
assert np.allclose(input_grad_blocks[block_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
assert np.allclose(input_grad_mindspore2, input_grad_mindspore_parallel2, 0.0001, 0.0001)
assert np.allclose(input1_grad_blocks[block_id], input_grad_mindspore_parallel1, 0.0001, 0.0001)
@pytest.mark.reid_grad
def test_reid_prelu_input_128x64x112x112_repeat():
stra = (0, (1, 1, 2, 1), (1))
fact = PReLUFactory(input_shape=(128, 64, 112, 112), strategy=stra)
fact.forward_cmp()
@pytest.mark.reid_grad
def test_reid_grad_prelu_input_128x64x112x112_repeat():
stra = (0, (1, 1, 2, 1), (1))
fact = PReLUFactory(input_shape=(128, 64, 112, 112), strategy=stra)
fact.grad_cmp()
@pytest.mark.reid_grad
def test_reid_prelu_input_128x64x112x112_mix():
stra = (0, (2, 1, 1, 2), (1))
fact = PReLUFactory(input_shape=(128, 64, 112, 112), strategy=stra)
fact.forward_cmp()
@pytest.mark.reid_grad
def test_reid_grad_prelu_input_128x64x112x112_mix():
stra = (0, (2, 1, 1, 2), (1))
fact = PReLUFactory(input_shape=(128, 64, 112, 112), strategy=stra)
fact.grad_cmp()
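
A NumPy-only sketch of what these PReLU tests compare, assuming the usual channel-wise formula max(0, x) + w * min(0, x) and a shrunken input shape; the split order matches get_parallel_blocks, and device_id is a hypothetical rank.

import numpy as np

def prelu(x, w):
    w = w.reshape(1, -1, 1, 1)                        # one slope per channel, broadcast over (N, C, H, W)
    return np.maximum(x, 0.0) + w * np.minimum(x, 0.0)

x = np.random.randn(8, 4, 6, 6).astype(np.float32)   # small stand-in for (128, 64, 112, 112)
w = np.full(4, 0.25, dtype=np.float32)
out = prelu(x + np.zeros_like(x), w)                  # the add with a zero tensor, then PReLU

blocks = [out]
for axis, parts in enumerate((2, 1, 1, 2)):           # strategy (2, 1, 1, 2) -> 2 * 1 * 1 * 2 = 4 blocks
    blocks = [piece for b in blocks for piece in np.split(b, parts, axis=axis)]
device_id = 3                                         # hypothetical rank
assert blocks[device_id % len(blocks)].shape == (4, 4, 6, 3)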

+252 -253  tests/ut/python/parallel/parallel_end_to_end/reducemean/_test_reducemean_parallel_4p.py

@@ -1,253 +1,252 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import os
import pytest
from numpy import allclose as allclose_nparray
import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, x, y, output_grad):
return grad_all_with_sens(self.network)(x, y, output_grad)
class GradScalar(Cell):
def __init__(self, network):
super(GradScalar, self).__init__()
self.network = network
self.sens = Tensor([1.0], dtype=ms.float32)
def construct(self, x, y):
return grad_all_with_sens(self.network)(x, y, self.sens)
class ReduceMean(Cell):
def __init__(self, keep_dims, axis, strategy0=None, strategy1=None):
super(ReduceMean, self).__init__()
self.add = P.TensorAdd(strategy=strategy0)
self.reduce_mean = P.ReduceMean(keep_dims=keep_dims).set_strategy(strategy=strategy1)
self.axis = axis
def construct(self, x, y):
out = self.add(x, y)
return self.reduce_mean(out, self.axis)
class ReduceMeanFactory:
def __init__(self, input_shape, keep_dims, axis, strategy0=None, strategy1=None):
prefix = ""
size = 1
for s in input_shape:
prefix = prefix + str(s)
size = size * s
self.prefix = prefix
number_range = min(1000, size)
self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype(
np.float32)
self.input_np2 = np.reshape(np.arange(0, size) % number_range - number_range / 4, input_shape).astype(
np.float32)
self.keep_dims = keep_dims
self.axis = axis
target_shape = self.input_np1.mean(axis=axis, keepdims=keep_dims).shape
target_size = 1
for s in target_shape:
target_size = target_size * s
number_range = min(1000, target_size)
self.output_grad_np = np.array([1.0], dtype=np.float32)
if len(target_shape) > 0:
self.output_grad_np = np.reshape(np.arange(0, target_size) % number_range, target_shape).astype(
np.float32) + 1.0
self.shape = target_shape
self.strategy0 = strategy0
self.strategy1 = strategy1
out_strategy = []
axis_ = list(axis)
if axis_[0] == -1:
axis_[0] = len(input_shape) - 1
for i in range(0, len(input_shape)):
if i in axis_:
if keep_dims:
out_strategy.append(1)
else:
out_strategy.append(strategy1[1][i])
self.out_strategy = out_strategy
need_dev_num0 = 1
need_dev_num1 = 1
for s in strategy0[1]:
need_dev_num0 = need_dev_num0 * s
for s in out_strategy:
need_dev_num1 = need_dev_num1 * s
self.x_id = device_id % need_dev_num0
self.y_id = device_id % need_dev_num0
block_id = device_id % need_dev_num0
device_index = self.id_to_list(block_id, self.strategy1[1])
print(device_index)
for i in axis:
device_index[i] = 0
print(device_index)
self.out_id = self.list_to_id(device_index, self.out_strategy)
print(self.out_id)
def id_to_list(self, id, shape):
result = []
r = id
for i in range(0, len(shape)):
v = 1
for j in range(i + 1, len(shape)):
v = v * shape[j]
result.append(r // v)
r = r % v
return result
def list_to_id(self, id_list, shape):
result = 0
for i in range(0, len(id_list)):
v = 1
for j in range(i + 1, len(id_list)):
v = v * shape[j]
result = result + id_list[i] * v
return result
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def forward_mindspore_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
net = ReduceMean(keep_dims=self.keep_dims, axis=self.axis)
out = net(x, y)
return out.asnumpy()
def forward_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(inputs_y[self.y_id])
net = ReduceMean(keep_dims=self.keep_dims, axis=self.axis, strategy0=self.strategy0, strategy1=self.strategy1)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
return out.asnumpy()
def grad_mindspore_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
out_grad = Tensor(self.output_grad_np)
net = ReduceMean(keep_dims=self.keep_dims, axis=self.axis)
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, out_grad)
return input_grad
def grad_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grad = Tensor(self.output_grad_np)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
outgrads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy)
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(inputs_y[self.y_id])
output_grad1 = Tensor(outgrads[self.out_id])
net = ReduceMean(keep_dims=self.keep_dims, axis=self.axis, strategy0=self.strategy0, strategy1=self.strategy1)
grad_net = Grad(net)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
grad_net.set_auto_parallel()
grad_net.set_train()
input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad1],
parallel_inputs_run=[x1, y1, output_grad1])
return input_grad
def forward_cmp(self):
out_mindspore = self.forward_mindspore_impl()
out_mindspore_parallel = self.forward_mindspore_parallel_impl()
out_blocks = self.get_parallel_blocks(out_mindspore, self.out_strategy)
assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.0001, 0.001)
def grad_cmp(self):
input_grad_mindspore = self.grad_mindspore_impl()
input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1])
input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2])
assert allclose_nparray(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
assert allclose_nparray(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001)
def test_reid_reducemean_input_64x16():
fact = ReduceMeanFactory(input_shape=(64 * 16,), keep_dims=False, axis=(-1,), strategy0=(0, (4,), (4,)),
strategy1=(0, (4,)))
fact.forward_cmp()
def test_grad_reid_reducemean_input_64x16():
fact = ReduceMeanFactory(input_shape=(64 * 16,), keep_dims=False, axis=(-1,), strategy0=(0, (4,), (4,)),
strategy1=(0, (4,)))
fact.grad_cmp()
def test_reid_reducemean_input_64x128x28x28():
fact = ReduceMeanFactory(input_shape=(64, 128, 32, 32), keep_dims=True, axis=(2, 3),
strategy0=(0, (2, 1, 2, 1), (2, 1, 2, 1)), strategy1=(0, (2, 1, 2, 1)))
fact.forward_cmp()
def test_grad_reid_reducemean_input_64x128x28x28():
fact = ReduceMeanFactory(input_shape=(64, 128, 32, 32), keep_dims=True, axis=(2, 3),
strategy0=(0, (2, 1, 2, 1), (2, 1, 2, 1)), strategy1=(0, (2, 1, 2, 1)))
fact.grad_cmp()
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import numpy as np
from numpy import allclose as allclose_nparray

import mindspore as ms
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens

device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"


def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")


def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")


class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network

def construct(self, x, y, output_grad):
return grad_all_with_sens(self.network)(x, y, output_grad)


class GradScalar(Cell):
def __init__(self, network):
super(GradScalar, self).__init__()
self.network = network
self.sens = Tensor([1.0], dtype=ms.float32)

def construct(self, x, y):
return grad_all_with_sens(self.network)(x, y, self.sens)


class ReduceMean(Cell):
def __init__(self, keep_dims, axis, strategy0=None, strategy1=None):
super(ReduceMean, self).__init__()
self.add = P.TensorAdd(strategy=strategy0)
self.reduce_mean = P.ReduceMean(keep_dims=keep_dims).set_strategy(strategy=strategy1)
self.axis = axis

def construct(self, x, y):
out = self.add(x, y)
return self.reduce_mean(out, self.axis)


class ReduceMeanFactory:
def __init__(self, input_shape, keep_dims, axis, strategy0=None, strategy1=None):
prefix = ""
size = 1
for s in input_shape:
prefix = prefix + str(s)
size = size * s
self.prefix = prefix
number_range = min(1000, size)
self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype(
np.float32)
self.input_np2 = np.reshape(np.arange(0, size) % number_range - number_range / 4, input_shape).astype(
np.float32)
self.keep_dims = keep_dims
self.axis = axis
target_shape = self.input_np1.mean(axis=axis, keepdims=keep_dims).shape
target_size = 1
for s in target_shape:
target_size = target_size * s
number_range = min(1000, target_size)
self.output_grad_np = np.array([1.0], dtype=np.float32)
if len(target_shape) > 0:
self.output_grad_np = np.reshape(np.arange(0, target_size) % number_range, target_shape).astype(
np.float32) + 1.0
self.shape = target_shape
self.strategy0 = strategy0
self.strategy1 = strategy1
out_strategy = []
axis_ = list(axis)
if axis_[0] == -1:
axis_[0] = len(input_shape) - 1
for i in range(0, len(input_shape)):
if i in axis_:
if keep_dims:
out_strategy.append(1)
else:
out_strategy.append(strategy1[1][i])
self.out_strategy = out_strategy
need_dev_num0 = 1
need_dev_num1 = 1
for s in strategy0[1]:
need_dev_num0 = need_dev_num0 * s
for s in out_strategy:
need_dev_num1 = need_dev_num1 * s
self.x_id = device_id % need_dev_num0
self.y_id = device_id % need_dev_num0
block_id = device_id % need_dev_num0
device_index = self.id_to_list(block_id, self.strategy1[1])
print(device_index)
for i in axis:
device_index[i] = 0
print(device_index)
self.out_id = self.list_to_id(device_index, self.out_strategy)
print(self.out_id)

def id_to_list(self, id_, shape):
result = []
r = id_
for i in range(0, len(shape)):
v = 1
for j in range(i + 1, len(shape)):
v = v * shape[j]
result.append(r // v)
r = r % v
return result

def list_to_id(self, id_list, shape):
result = 0
for i in range(0, len(id_list)):
v = 1
for j in range(i + 1, len(id_list)):
v = v * shape[j]
result = result + id_list[i] * v
return result

def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks

def forward_mindspore_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
net = ReduceMean(keep_dims=self.keep_dims, axis=self.axis)
out = net(x, y)
return out.asnumpy()

def forward_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(inputs_y[self.y_id])
net = ReduceMean(keep_dims=self.keep_dims, axis=self.axis, strategy0=self.strategy0, strategy1=self.strategy1)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
return out.asnumpy()

def grad_mindspore_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
out_grad = Tensor(self.output_grad_np)
net = ReduceMean(keep_dims=self.keep_dims, axis=self.axis)
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, out_grad)
return input_grad

def grad_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grad = Tensor(self.output_grad_np)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
outgrads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy)
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(inputs_y[self.y_id])
output_grad1 = Tensor(outgrads[self.out_id])
net = ReduceMean(keep_dims=self.keep_dims, axis=self.axis, strategy0=self.strategy0, strategy1=self.strategy1)
grad_net = Grad(net)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
grad_net.set_auto_parallel()
grad_net.set_train()
input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad1],
parallel_inputs_run=[x1, y1, output_grad1])
return input_grad

def forward_cmp(self):
out_mindspore = self.forward_mindspore_impl()
out_mindspore_parallel = self.forward_mindspore_parallel_impl()
out_blocks = self.get_parallel_blocks(out_mindspore, self.out_strategy)
assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.0001, 0.001)

def grad_cmp(self):
input_grad_mindspore = self.grad_mindspore_impl()
input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1])
input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2])
assert allclose_nparray(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
assert allclose_nparray(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001)


def test_reid_reducemean_input_64x16():
fact = ReduceMeanFactory(input_shape=(64 * 16,), keep_dims=False, axis=(-1,), strategy0=(0, (4,), (4,)),
strategy1=(0, (4,)))
fact.forward_cmp()


def test_grad_reid_reducemean_input_64x16():
fact = ReduceMeanFactory(input_shape=(64 * 16,), keep_dims=False, axis=(-1,), strategy0=(0, (4,), (4,)),
strategy1=(0, (4,)))
fact.grad_cmp()


def test_reid_reducemean_input_64x128x28x28():
fact = ReduceMeanFactory(input_shape=(64, 128, 32, 32), keep_dims=True, axis=(2, 3),
strategy0=(0, (2, 1, 2, 1), (2, 1, 2, 1)), strategy1=(0, (2, 1, 2, 1)))
fact.forward_cmp()


def test_grad_reid_reducemean_input_64x128x28x28():
fact = ReduceMeanFactory(input_shape=(64, 128, 32, 32), keep_dims=True, axis=(2, 3),
strategy0=(0, (2, 1, 2, 1), (2, 1, 2, 1)), strategy1=(0, (2, 1, 2, 1)))
fact.grad_cmp()
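
A standalone sketch of the rank bookkeeping in ReduceMeanFactory: id_to_list turns a flat device id into per-dimension block coordinates of the input layout, the reduced axes are zeroed, and list_to_id maps the coordinates onto the output layout; the numbers follow the (2, 1, 2, 1), keep_dims=True case with a hypothetical device_id of 3.

def id_to_list(flat_id, shape):
    # Row-major decomposition of a flat index into per-dimension coordinates.
    coords, r = [], flat_id
    for i in range(len(shape)):
        v = 1
        for s in shape[i + 1:]:
            v *= s
        coords.append(r // v)
        r %= v
    return coords

def list_to_id(coords, shape):
    # Inverse mapping for a (possibly different) layout.
    flat = 0
    for i in range(len(coords)):
        v = 1
        for s in shape[i + 1:]:
            v *= s
        flat += coords[i] * v
    return flat

in_strategy = (2, 1, 2, 1)        # strategy1[1] for the (64, 128, 32, 32) case
out_strategy = (2, 1, 1, 1)       # axes (2, 3) reduced with keep_dims=True -> 1
device_id = 3                     # hypothetical rank in the 4-device group
coords = id_to_list(device_id % 4, in_strategy)   # -> [1, 0, 1, 0]
for axis in (2, 3):
    coords[axis] = 0              # every shard along a reduced axis maps to block 0
assert list_to_id(coords, out_strategy) == 1      # this rank checks output block 1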

+206 -206  tests/ut/python/parallel/parallel_end_to_end/reshape/_test_reshape_parallel_4p.py

@@ -1,206 +1,206 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import os
import pytest
from numpy import allclose as allclose_nparray
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, x, y, output_grad):
return grad_all_with_sens(self.network)(x, y, output_grad)
class Reshape(Cell):
def __init__(self, target_shape, strategy0=None, strategy1=None):
super(Reshape, self).__init__()
self.add = P.TensorAdd(strategy=strategy0)
self.reshape = P.Reshape(strategy=strategy1)
self.shape = tuple(target_shape)
def construct(self, input1, input2):
x = self.add(input1, input2)
return self.reshape(x, self.shape)
class ReshapeFactory:
def __init__(self, input_shape, target_shape, strategy0, strategy1):
prefix = ""
size = 1
for s in input_shape:
prefix = prefix + str(s)
size = size * s
self.prefix = prefix
number_range = min(1000, size)
self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype(
np.float32)
self.input_np2 = np.reshape(np.arange(0, size) % number_range - number_range / 4, input_shape).astype(
np.float32)
target_size = 1
for s in target_shape:
target_size = target_size * s
number_range = min(1000, target_size)
self.output_grad_np = np.reshape(np.arange(0, target_size) % number_range - number_range / 2,
target_shape).astype(np.float32)
self.target_shape = target_shape
self.strategy0 = strategy0
self.strategy1 = strategy1
out_strategy = [1] * len(target_shape)
out_strategy[0] = strategy1[1][0]
self.out_strategy = out_strategy
need_dev_num0 = 1
need_dev_num1 = 1
for s in strategy0[1]:
need_dev_num0 = need_dev_num0 * s
for s in out_strategy:
need_dev_num1 = need_dev_num1 * s
self.x_id = device_id % need_dev_num0
self.y_id = device_id % need_dev_num0
self.out_id = device_id % need_dev_num1
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def forward_reshape_mindspore_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
net = Reshape(self.target_shape)
out = net(x, y)
return out.asnumpy()
def forward_reshape_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(inputs_y[self.y_id])
net = Reshape(self.target_shape, strategy0=self.strategy0, strategy1=self.strategy1)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
return out.asnumpy()
def grad_reshape_mindspore_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grad = Tensor(self.output_grad_np)
net = Reshape(self.target_shape)
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, output_grad)
return input_grad
def grad_reshape_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grad = Tensor(self.output_grad_np)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
outgrads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy)
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(inputs_y[self.y_id])
output_grad1 = Tensor(outgrads[self.out_id])
net = Reshape(self.target_shape, strategy0=self.strategy0, strategy1=self.strategy1)
grad_net = Grad(net)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
grad_net.set_auto_parallel()
grad_net.set_train()
input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad1],
parallel_inputs_run=[x1, y1, output_grad1])
return input_grad
def forward_reshape_cmp(self):
out_mindspore = self.forward_reshape_mindspore_impl()
out_mindspore_parallel = self.forward_reshape_mindspore_parallel_impl()
out_blocks = self.get_parallel_blocks(out_mindspore, self.out_strategy)
assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.0001, 0.001)
def grad_reshape_cmp(self):
input_grad_mindspore = self.grad_reshape_mindspore_impl()
input_grad_mindspore_parallel = self.grad_reshape_mindspore_parallel_impl()
input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1])
input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2])
assert allclose_nparray(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
assert allclose_nparray(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001)
@pytest.mark.reid_forward
def test_reid_reshape_input_128x512x7x7_target_128x25088():
fact = ReshapeFactory(input_shape=(128, 512, 7, 7), target_shape=(128, 25088),
strategy0=(0, (4, 1, 1, 1), (4, 1, 1, 1)), strategy1=(0, (4, 1, 1, 1)))
fact.forward_reshape_cmp()
def test_reid_reshape_grad_input_128x512x7x7_target_128x25088():
fact = ReshapeFactory(input_shape=(128, 512, 7, 7), target_shape=(128, 25088),
strategy0=(0, (4, 1, 1, 1), (4, 1, 1, 1)), strategy1=(0, (4, 1, 1, 1)))
fact.grad_reshape_cmp()
@pytest.mark.reid_forward
def test_reid_reshape_input_128x64_target_128x64x1x1():
fact = ReshapeFactory(input_shape=(128, 64), target_shape=(128, 64, 1, 1), strategy0=(0, (2, 1), (2, 1)),
strategy1=(0, (2, 1)))
fact.forward_reshape_cmp()
@pytest.mark.reid_grad
def test_reid_reshape_grad_input_128x64_target_128x64x1x1():
fact = ReshapeFactory(input_shape=(128, 64), target_shape=(128, 64, 1, 1), strategy0=(0, (2, 1), (2, 1)),
strategy1=(0, (2, 1)))
fact.grad_reshape_cmp()
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import pytest
from numpy import allclose as allclose_nparray
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, x, y, output_grad):
return grad_all_with_sens(self.network)(x, y, output_grad)
class Reshape(Cell):
def __init__(self, target_shape, strategy0=None, strategy1=None):
super(Reshape, self).__init__()
self.add = P.TensorAdd(strategy=strategy0)
self.reshape = P.Reshape(strategy=strategy1)
self.shape = tuple(target_shape)
def construct(self, input1, input2):
x = self.add(input1, input2)
return self.reshape(x, self.shape)
class ReshapeFactory:
def __init__(self, input_shape, target_shape, strategy0, strategy1):
prefix = ""
size = 1
for s in input_shape:
prefix = prefix + str(s)
size = size * s
self.prefix = prefix
number_range = min(1000, size)
self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype(
np.float32)
self.input_np2 = np.reshape(np.arange(0, size) % number_range - number_range / 4, input_shape).astype(
np.float32)
target_size = 1
for s in target_shape:
target_size = target_size * s
number_range = min(1000, target_size)
self.output_grad_np = np.reshape(np.arange(0, target_size) % number_range - number_range / 2,
target_shape).astype(np.float32)
self.target_shape = target_shape
self.strategy0 = strategy0
self.strategy1 = strategy1
out_strategy = [1] * len(target_shape)
out_strategy[0] = strategy1[1][0]
self.out_strategy = out_strategy
need_dev_num0 = 1
need_dev_num1 = 1
for s in strategy0[1]:
need_dev_num0 = need_dev_num0 * s
for s in out_strategy:
need_dev_num1 = need_dev_num1 * s
self.x_id = device_id % need_dev_num0
self.y_id = device_id % need_dev_num0
self.out_id = device_id % need_dev_num1
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def forward_reshape_mindspore_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
net = Reshape(self.target_shape)
out = net(x, y)
return out.asnumpy()
def forward_reshape_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(inputs_y[self.y_id])
net = Reshape(self.target_shape, strategy0=self.strategy0, strategy1=self.strategy1)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
return out.asnumpy()
def grad_reshape_mindspore_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grad = Tensor(self.output_grad_np)
net = Reshape(self.target_shape)
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, output_grad)
return input_grad
def grad_reshape_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grad = Tensor(self.output_grad_np)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
outgrads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy)
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(inputs_y[self.y_id])
output_grad1 = Tensor(outgrads[self.out_id])
net = Reshape(self.target_shape, strategy0=self.strategy0, strategy1=self.strategy1)
grad_net = Grad(net)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
grad_net.set_auto_parallel()
grad_net.set_train()
input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad1],
parallel_inputs_run=[x1, y1, output_grad1])
return input_grad
def forward_reshape_cmp(self):
out_mindspore = self.forward_reshape_mindspore_impl()
out_mindspore_parallel = self.forward_reshape_mindspore_parallel_impl()
out_blocks = self.get_parallel_blocks(out_mindspore, self.out_strategy)
assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.0001, 0.001)
def grad_reshape_cmp(self):
input_grad_mindspore = self.grad_reshape_mindspore_impl()
input_grad_mindspore_parallel = self.grad_reshape_mindspore_parallel_impl()
input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1])
input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2])
assert allclose_nparray(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
assert allclose_nparray(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001)
@pytest.mark.reid_forward
def test_reid_reshape_input_128x512x7x7_target_128x25088():
fact = ReshapeFactory(input_shape=(128, 512, 7, 7), target_shape=(128, 25088),
strategy0=(0, (4, 1, 1, 1), (4, 1, 1, 1)), strategy1=(0, (4, 1, 1, 1)))
fact.forward_reshape_cmp()
def test_reid_reshape_grad_input_128x512x7x7_target_128x25088():
fact = ReshapeFactory(input_shape=(128, 512, 7, 7), target_shape=(128, 25088),
strategy0=(0, (4, 1, 1, 1), (4, 1, 1, 1)), strategy1=(0, (4, 1, 1, 1)))
fact.grad_reshape_cmp()
@pytest.mark.reid_forward
def test_reid_reshape_input_128x64_target_128x64x1x1():
fact = ReshapeFactory(input_shape=(128, 64), target_shape=(128, 64, 1, 1), strategy0=(0, (2, 1), (2, 1)),
strategy1=(0, (2, 1)))
fact.forward_reshape_cmp()
@pytest.mark.reid_grad
def test_reid_reshape_grad_input_128x64_target_128x64x1x1():
fact = ReshapeFactory(input_shape=(128, 64), target_shape=(128, 64, 1, 1), strategy0=(0, (2, 1), (2, 1)),
strategy1=(0, (2, 1)))
fact.grad_reshape_cmp()
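
A note on the reshape factory's output indexing (my reading of the code above, illustrated with the numbers of the first test case): the output is only ever split along axis 0, using the same factor as the first axis of strategy1, so out_id is simply device_id modulo that factor.

# Illustrative arithmetic only, mirroring ReshapeFactory for target_shape (128, 25088), strategy1 (0, (4, 1, 1, 1)).
target_shape = (128, 25088)
strategy1 = (0, (4, 1, 1, 1))
out_strategy = [1] * len(target_shape)
out_strategy[0] = strategy1[1][0]      # -> [4, 1]: split 4 ways along axis 0 only
need_dev_num1 = 1
for s in out_strategy:
    need_dev_num1 *= s                 # 4 blocks, each a (32, 25088) slice
device_id = 3                          # hypothetical rank; in the tests it comes from RANK_ID
out_id = device_id % need_dev_num1     # rank 3 compares against block 3
assert out_id == 3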

+ 235
- 236
tests/ut/python/parallel/parallel_end_to_end/transpose/_test_transpose_parallel_4p.py

@@ -1,236 +1,235 @@
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import os
import pytest
from numpy import allclose as allclose_nparray
import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens
device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"
def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")
def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")
class Net(Cell):
def __init__(self, perm_in, strategy0=None, strategy1=None):
super(Net, self).__init__()
self.add = P.TensorAdd(strategy=strategy0)
self.transpose = P.Transpose(strategy=strategy1)
self.perm_in = perm_in
def construct(self, x, y):
out = self.add(x, y)
return self.transpose(out, self.perm_in)
class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
def construct(self, x, y, output_grad):
return grad_all_with_sens(self.network)(x, y, output_grad)
class TransposeFactory:
def __init__(self, input_shape, perm_in, strategy0, strategy1):
prefix = ""
size = 1
for s in input_shape:
prefix = prefix + str(s)
size = size * s
self.prefix = prefix
number_range = min(1000, size)
self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype(
np.float32)
self.input_np2 = np.reshape(np.arange(0, size) % number_range - number_range / 4, input_shape).astype(
np.float32)
target_shape = self.input_np1.transpose(perm_in).shape
target_size = 1
for s in target_shape:
target_size = target_size * s
number_range = min(1000, target_size)
self.target_shape = target_shape
self.output_grad_np = np.reshape(np.arange(0, target_size) % number_range - number_range / 2,
target_shape).astype(np.float32)
self.perm_in = perm_in
self.strategy0 = strategy0
self.strategy1 = strategy1
out_strategy = []
for i in perm_in:
out_strategy.append(strategy1[1][i])
self.out_strategy = out_strategy
need_dev_num0 = 1
need_dev_num1 = 1
for s in strategy0[1]:
need_dev_num0 = need_dev_num0 * s
for s in out_strategy:
need_dev_num1 = need_dev_num1 * s
self.x_id = device_id % need_dev_num0
self.y_id = device_id % need_dev_num0
device_index = self.id_to_list(device_id % need_dev_num1,
self.strategy1[1]) # encoding to get the index before transpose
device_index_transpose = []
for i in perm_in:
device_index_transpose.append(device_index[i])
self.out_id = self.list_to_id(device_index_transpose, self.out_strategy)
def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks
def id_to_list(self, id, shape):
result = []
r = id
for i in range(0, len(shape)):
v = 1
for j in range(i + 1, len(shape)):
v = v * shape[j]
result.append(r // v)
r = r % v
return result
def list_to_id(self, id_list, shape):
result = 0
for i in range(0, len(id_list)):
v = 1
for j in range(i + 1, len(id_list)):
v = v * shape[j]
result = result + id_list[i] * v
return result
def forward_mindspore_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
net = Net(self.perm_in)
out = net(x, y)
return out.asnumpy()
def forward_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(inputs_y[self.y_id])
net = Net(self.perm_in, strategy0=self.strategy0, strategy1=self.strategy1)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
return out.asnumpy()
def grad_mindspore_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grad = Tensor(self.output_grad_np)
net = Net(self.perm_in)
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, output_grad)
return input_grad
def grad_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grad = Tensor(self.output_grad_np)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
outgrads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy)
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(inputs_y[self.y_id])
output_grad1 = Tensor(outgrads[self.out_id])
net = Net(self.perm_in, strategy0=self.strategy0, strategy1=self.strategy1)
grad_net = Grad(net)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
grad_net.set_auto_parallel()
grad_net.set_train()
input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad1],
parallel_inputs_run=[x1, y1, output_grad1])
return input_grad
def forward_transpose_cmp(self):
out_mindspore = self.forward_mindspore_impl()
out_mindspore_parallel = self.forward_mindspore_parallel_impl()
out_blocks = self.get_parallel_blocks(out_mindspore, self.out_strategy)
assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.0001, 0.001)
def grad_transpose_cmp(self):
input_grad_mindspore = self.grad_mindspore_impl()
input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1])
input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2])
assert allclose_nparray(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
assert allclose_nparray(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001)
def test_reid_transpose_input_256x512_output_512x256_perm_1x0():
fact = TransposeFactory((256, 512), (1, 0), strategy0=(0, (2, 2), (2, 2)), strategy1=(0, (2, 2)))
fact.forward_transpose_cmp()
def test_reid_grad_transpose_input_256x512_output_512x256_perm_1x0():
fact = TransposeFactory((256, 512), (1, 0), strategy0=(0, (2, 2), (2, 2)), strategy1=(0, (2, 2)))
fact.grad_transpose_cmp()
def test_reid_transpose_input_512x256_output_256x512_perm_1x0():
fact = TransposeFactory((512, 256), (1, 0), strategy0=(0, (4, 1), (4, 1)), strategy1=(0, (1, 4)))
fact.forward_transpose_cmp()
def test_reid_grad_transpose_input_512x256_output_256x512_perm_1x0():
fact = TransposeFactory((512, 256), (1, 0), strategy0=(0, (4, 1), (4, 1)), strategy1=(0, (1, 4)))
fact.grad_transpose_cmp()
def test_reid_transpose_input_512x256_output_256x512_perm_1x0_repeat():
fact = TransposeFactory((512, 256), (1, 0), strategy0=(0, (2, 1), (2, 1)), strategy1=(0, (2, 1)))
fact.forward_transpose_cmp()
def test_reid_grad_transpose_input_512x256_output_256x512_perm_1x0_repeat():
fact = TransposeFactory((512, 256), (1, 0), strategy0=(0, (2, 1), (2, 1)), strategy1=(0, (2, 1)))
fact.grad_transpose_cmp()
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import numpy as np
from numpy import allclose as allclose_nparray

import mindspore.communication.management as distributedTool
from mindspore import context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
from mindspore.ops.composite import grad_all_with_sens

device_num = 4
device_id = int(os.environ["RANK_ID"])
path = "./output/"


def setup_module():
print("~~~~~~~~~~~set up~~~~~~~~~~~~~")
context.set_context(mode=context.GRAPH_MODE)
context.set_auto_parallel_context(device_num=device_num, global_rank=device_id)
distributedTool.init()
distributedTool.create_group("0-3", [0, 1, 2, 3])
print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~")


def teardown_module():
print("~~~~~~~~~~~~tear down~~~~~~~~~~")


class Net(Cell):
def __init__(self, perm_in, strategy0=None, strategy1=None):
super(Net, self).__init__()
self.add = P.TensorAdd(strategy=strategy0)
self.transpose = P.Transpose(strategy=strategy1)
self.perm_in = perm_in

def construct(self, x, y):
out = self.add(x, y)
return self.transpose(out, self.perm_in)


class Grad(Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network

def construct(self, x, y, output_grad):
return grad_all_with_sens(self.network)(x, y, output_grad)


class TransposeFactory:
def __init__(self, input_shape, perm_in, strategy0, strategy1):
prefix = ""
size = 1
for s in input_shape:
prefix = prefix + str(s)
size = size * s
self.prefix = prefix
number_range = min(1000, size)
self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype(
np.float32)
self.input_np2 = np.reshape(np.arange(0, size) % number_range - number_range / 4, input_shape).astype(
np.float32)
target_shape = self.input_np1.transpose(perm_in).shape
target_size = 1
for s in target_shape:
target_size = target_size * s
number_range = min(1000, target_size)
self.target_shape = target_shape
self.output_grad_np = np.reshape(np.arange(0, target_size) % number_range - number_range / 2,
target_shape).astype(np.float32)
self.perm_in = perm_in
self.strategy0 = strategy0
self.strategy1 = strategy1
out_strategy = []
for i in perm_in:
out_strategy.append(strategy1[1][i])
self.out_strategy = out_strategy
need_dev_num0 = 1
need_dev_num1 = 1
for s in strategy0[1]:
need_dev_num0 = need_dev_num0 * s
for s in out_strategy:
need_dev_num1 = need_dev_num1 * s
self.x_id = device_id % need_dev_num0
self.y_id = device_id % need_dev_num0
device_index = self.id_to_list(device_id % need_dev_num1,
self.strategy1[1]) # encoding to get the index before transpose
device_index_transpose = []
for i in perm_in:
device_index_transpose.append(device_index[i])
self.out_id = self.list_to_id(device_index_transpose, self.out_strategy)

def get_parallel_blocks(self, input_, strategy):
blocks = [input_]
i = 0
for stra in strategy:
temp = []
while len(blocks) > 0:
block = blocks.pop(0)
temp.extend(np.split(block, stra, axis=i))
blocks.extend(temp)
i += 1
return blocks

def id_to_list(self, id_, shape):
result = []
r = id_
for i in range(0, len(shape)):
v = 1
for j in range(i + 1, len(shape)):
v = v * shape[j]
result.append(r // v)
r = r % v
return result

def list_to_id(self, id_list, shape):
result = 0
for i in range(0, len(id_list)):
v = 1
for j in range(i + 1, len(id_list)):
v = v * shape[j]
result = result + id_list[i] * v
return result

def forward_mindspore_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
net = Net(self.perm_in)
out = net(x, y)
return out.asnumpy()

def forward_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(inputs_y[self.y_id])
net = Net(self.perm_in, strategy0=self.strategy0, strategy1=self.strategy1)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()
out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1])
return out.asnumpy()

def grad_mindspore_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grad = Tensor(self.output_grad_np)
net = Net(self.perm_in)
grad_net = Grad(net)
grad_net.set_train()
input_grad = grad_net(x, y, output_grad)
return input_grad

def grad_mindspore_parallel_impl(self):
x = Tensor(self.input_np1)
y = Tensor(self.input_np2)
output_grad = Tensor(self.output_grad_np)
inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1])
inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1])
outgrads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy)
x1 = Tensor(inputs_x[self.x_id])
y1 = Tensor(inputs_y[self.y_id])
output_grad1 = Tensor(outgrads[self.out_id])
net = Net(self.perm_in, strategy0=self.strategy0, strategy1=self.strategy1)
grad_net = Grad(net)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
grad_net.set_auto_parallel()
grad_net.set_train()
input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad1],
parallel_inputs_run=[x1, y1, output_grad1])
return input_grad

def forward_transpose_cmp(self):
out_mindspore = self.forward_mindspore_impl()
out_mindspore_parallel = self.forward_mindspore_parallel_impl()
out_blocks = self.get_parallel_blocks(out_mindspore, self.out_strategy)
assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.0001, 0.001)

def grad_transpose_cmp(self):
input_grad_mindspore = self.grad_mindspore_impl()
input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl()
input_grad_mindspore0 = input_grad_mindspore[0].asnumpy()
input_grad_mindspore1 = input_grad_mindspore[1].asnumpy()
input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy()
input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy()
input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1])
input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2])
assert allclose_nparray(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001)
assert allclose_nparray(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001)


def test_reid_transpose_input_256x512_output_512x256_perm_1x0():
fact = TransposeFactory((256, 512), (1, 0), strategy0=(0, (2, 2), (2, 2)), strategy1=(0, (2, 2)))
fact.forward_transpose_cmp()


def test_reid_grad_transpose_input_256x512_output_512x256_perm_1x0():
fact = TransposeFactory((256, 512), (1, 0), strategy0=(0, (2, 2), (2, 2)), strategy1=(0, (2, 2)))
fact.grad_transpose_cmp()


def test_reid_transpose_input_512x256_output_256x512_perm_1x0():
fact = TransposeFactory((512, 256), (1, 0), strategy0=(0, (4, 1), (4, 1)), strategy1=(0, (1, 4)))
fact.forward_transpose_cmp()


def test_reid_grad_transpose_input_512x256_output_256x512_perm_1x0():
fact = TransposeFactory((512, 256), (1, 0), strategy0=(0, (4, 1), (4, 1)), strategy1=(0, (1, 4)))
fact.grad_transpose_cmp()


def test_reid_transpose_input_512x256_output_256x512_perm_1x0_repeat():
fact = TransposeFactory((512, 256), (1, 0), strategy0=(0, (2, 1), (2, 1)), strategy1=(0, (2, 1)))
fact.forward_transpose_cmp()


def test_reid_grad_transpose_input_512x256_output_256x512_perm_1x0_repeat():
fact = TransposeFactory((512, 256), (1, 0), strategy0=(0, (2, 1), (2, 1)), strategy1=(0, (2, 1)))
fact.grad_transpose_cmp()
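
The least obvious part of TransposeFactory is how it locates the output block a rank should hold after the permutation. As I read the code above: id_to_list decodes the flat device id into a per-axis block index under strategy1, that index is permuted with perm_in, and list_to_id re-encodes it under the permuted (output) strategy. The standalone check below mirrors those two helpers with the numbers of the first transpose test; it is illustrative only and not part of this change set.

def id_to_list(id_, shape):
    # Decode a flat rank id into a mixed-radix per-axis index, most significant axis first.
    result, r = [], id_
    for i in range(len(shape)):
        v = 1
        for s in shape[i + 1:]:
            v *= s
        result.append(r // v)
        r %= v
    return result

def list_to_id(id_list, shape):
    # Inverse of id_to_list: re-encode a per-axis index as a flat id.
    result = 0
    for i, idx in enumerate(id_list):
        v = 1
        for s in shape[i + 1:]:
            v *= s
        result += idx * v
    return result

# strategy1 = (2, 2), perm_in = (1, 0): rank 1 holds input block (0, 1);
# after the transpose that data sits at output block (1, 0), i.e. out_id 2.
perm_in, in_strategy = (1, 0), (2, 2)
out_strategy = [in_strategy[i] for i in perm_in]
index = id_to_list(1, in_strategy)                             # [0, 1]
out_id = list_to_id([index[i] for i in perm_in], out_strategy)
assert out_id == 2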

+ 3
- 3
tests/ut/python/parallel/test_add_relu_redistribution.py

@@ -54,7 +54,7 @@ class Grad(nn.Cell):
return C.grad_all(self.network)(x, y)


def compile(net, x, y):
def compile_net(net, x, y):
net.set_auto_parallel()
_executor.compile(net, x, y)

@@ -69,7 +69,7 @@ def test_add_relu_stride_slice():

x = Tensor(np.ones([128, 32]), dtype=ms.float32)
y = Tensor(np.ones([128, 32]), dtype=ms.float32)
compile(net, x, y)
compile_net(net, x, y)


def test_add_relu_all_gather():
@@ -82,4 +82,4 @@ def test_add_relu_all_gather():

x = Tensor(np.ones([128, 32]), dtype=ms.float32)
y = Tensor(np.ones([128, 32]), dtype=ms.float32)
compile(net, x, y)
compile_net(net, x, y)

+ 20
- 21
tests/ut/python/parallel/test_allreduce_fusion.py

@@ -17,7 +17,6 @@ import numpy as np
import mindspore as ms
import mindspore.nn as nn
from mindspore import Tensor, context
from mindspore import context
from mindspore.common.api import _executor
from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
from mindspore.nn.optim.momentum import Momentum
@@ -131,56 +130,56 @@ def test_allreduce_fusion_parameters():
cost_model_context.reset_cost_model_context()
cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_algorithm=2)
algorithm = cost_model_context.get_cost_model_context('costmodel_allreduce_fusion_algorithm')
assert (algorithm == 2)
assert algorithm == 2
cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_algorithm=1)
algorithm = cost_model_context.get_cost_model_context('costmodel_allreduce_fusion_algorithm')
assert (algorithm == 1)
assert algorithm == 1
cost_model_context.reset_cost_model_context()
algorithm = cost_model_context.get_cost_model_context('costmodel_allreduce_fusion_algorithm')
assert (algorithm == 0)
assert algorithm == 0

cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_times=2)
fusion_times = cost_model_context.get_cost_model_context('costmodel_allreduce_fusion_times')
assert (fusion_times == 2)
assert fusion_times == 2

cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_tail_percent=0.2)
tail_percent = cost_model_context.get_cost_model_context('costmodel_allreduce_fusion_tail_percent')
assert (tail_percent == 0.2)
assert tail_percent == 0.2
cost_model_context.reset_cost_model_context()
tail_percent = cost_model_context.get_cost_model_context('costmodel_allreduce_fusion_tail_percent')
assert (tail_percent == 0.1)
assert tail_percent == 0.1

cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_tail_time=0.2)
tail_time = cost_model_context.get_cost_model_context('costmodel_allreduce_fusion_tail_time')
assert (tail_time == 0.2)
assert tail_time == 0.2
cost_model_context.reset_cost_model_context()
tail_time = cost_model_context.get_cost_model_context('costmodel_allreduce_fusion_tail_time')
assert (tail_time == 0.1)
assert tail_time == 0.1

cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_allreduce_inherent_time=0.2)
allreduce_inherent_time = cost_model_context.get_cost_model_context(
'costmodel_allreduce_fusion_allreduce_inherent_time')
assert (allreduce_inherent_time == 0.2)
assert allreduce_inherent_time == 0.2
cost_model_context.reset_cost_model_context()
allreduce_inherent_time = cost_model_context.get_cost_model_context(
'costmodel_allreduce_fusion_allreduce_inherent_time')
assert (allreduce_inherent_time == 0.1)
assert allreduce_inherent_time == 0.1

cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_allreduce_bandwidth=0.2)
allreduce_bandwidth = cost_model_context.get_cost_model_context('costmodel_allreduce_fusion_allreduce_bandwidth')
assert (allreduce_bandwidth == 0.2)
assert allreduce_bandwidth == 0.2
cost_model_context.reset_cost_model_context()
allreduce_bandwidth = cost_model_context.get_cost_model_context('costmodel_allreduce_fusion_allreduce_bandwidth')
assert (allreduce_bandwidth == 0.1)
assert allreduce_bandwidth == 0.1

cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_computation_time_parameter=0.2)
computation_time_parameter = cost_model_context.get_cost_model_context(
'costmodel_allreduce_fusion_computation_time_parameter')
assert (computation_time_parameter == 0.2)
assert computation_time_parameter == 0.2
cost_model_context.reset_cost_model_context()
computation_time_parameter = cost_model_context.get_cost_model_context(
'costmodel_allreduce_fusion_computation_time_parameter')
assert (computation_time_parameter == 0.1)
assert computation_time_parameter == 0.1


def test_allreduce_fusion1():
@@ -201,7 +200,7 @@ def test_allreduce_fusion1():
'backbone2.fc2.weight': 1,
'backbone2.fc1.weight': 1,
'backbone1.fc1.weight': 1}
assert (allreduce_fusion_dict == expect_dict)
assert allreduce_fusion_dict == expect_dict
cost_model_context.reset_cost_model_context()


@@ -214,7 +213,7 @@ def test_allreduce_fusion2():
net = SimpleDMLNet(DenseNet1(has_bias=False, activation=None), DenseNet2(has_bias=False, activation=None))
allreduce_fusion_dict = train_common(net)
expect_dict = {}
assert (allreduce_fusion_dict == expect_dict)
assert allreduce_fusion_dict == expect_dict
cost_model_context.reset_cost_model_context()


@@ -240,7 +239,7 @@ def test_allreduce_fusion3():
'backbone1.fc2.weight': 2,
'backbone1.fc1.bias': 2,
'backbone1.fc1.weight': 2}
assert (allreduce_fusion_dict == expect_dict)
assert allreduce_fusion_dict == expect_dict
cost_model_context.reset_cost_model_context()


@@ -267,7 +266,7 @@ def test_allreduce_fusion4():
'backbone1.fc2.weight': 1,
'backbone1.fc1.weight': 1}

assert (allreduce_fusion_dict == expect_dict)
assert allreduce_fusion_dict == expect_dict
cost_model_context.reset_cost_model_context()


@@ -295,7 +294,7 @@ def test_allreduce_fusion5():
'backbone1.fc4.weight': 2,
'backbone1.fc3.weight': 2,
'backbone1.fc2.weight': 1,
'backbone1.fc1.weight': 1, }
'backbone1.fc1.weight': 1,}

assert (allreduce_fusion_dict == expect_dict)
assert allreduce_fusion_dict == expect_dict
cost_model_context.reset_cost_model_context()

+ 1
- 2
tests/ut/python/parallel/test_alltoall.py

@@ -67,7 +67,6 @@ def all_to_all_net(strategy1):


def all_to_all_common(strategy1):
batch_size = 32
learning_rate = 0.1
momentum = 0.9
epoch_size = 2
@@ -104,7 +103,7 @@ def test_all_to_all():
[8, 1]],
'Default/network-_VirtualDatasetCell/_backbone-WithLossCell/_backbone-AllToAllNet/MatMul-op0': [
[1, 1], [1, 8]]}
assert (strategys == expect_dict)
assert strategys == expect_dict
context.set_context(save_graphs=False)




+ 20
- 20
tests/ut/python/parallel/test_arithmetic.py

@@ -43,7 +43,7 @@ class GradWrap(nn.Cell):
return C.grad_all(self.network)(x, y, b)


def compile(net, x, y, b):
def compile_net(net, x, y, b):
net.set_auto_parallel()
_executor.compile(net, x, y, b)

@@ -69,7 +69,7 @@ def test_matmul_sub():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_add():
@@ -93,7 +93,7 @@ def test_matmul_add():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_mul():
@@ -117,7 +117,7 @@ def test_matmul_mul():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_div():
@@ -141,7 +141,7 @@ def test_matmul_div():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_greater():
@@ -165,7 +165,7 @@ def test_matmul_greater():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_add_broadcast():
@@ -189,7 +189,7 @@ def test_matmul_add_broadcast():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_add_broadcast2():
@@ -213,7 +213,7 @@ def test_matmul_add_broadcast2():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 1]), dtype=ms.float32)
b = Tensor(np.ones([1, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_sub_broadcast():
@@ -237,7 +237,7 @@ def test_matmul_sub_broadcast():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_sub_broadcast2():
@@ -261,7 +261,7 @@ def test_matmul_sub_broadcast2():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 1]), dtype=ms.float32)
b = Tensor(np.ones([1, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_mul_broadcast():
@@ -285,7 +285,7 @@ def test_matmul_mul_broadcast():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_mul_broadcast2():
@@ -309,7 +309,7 @@ def test_matmul_mul_broadcast2():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 1]), dtype=ms.float32)
b = Tensor(np.ones([1, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_div_broadcast():
@@ -333,7 +333,7 @@ def test_matmul_div_broadcast():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_div_broadcast2():
@@ -357,7 +357,7 @@ def test_matmul_div_broadcast2():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 1]), dtype=ms.float32)
b = Tensor(np.ones([1, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_greater_broadcast():
@@ -381,7 +381,7 @@ def test_matmul_greater_broadcast():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_greater_broadcast2():
@@ -405,7 +405,7 @@ def test_matmul_greater_broadcast2():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 1]), dtype=ms.float32)
b = Tensor(np.ones([1, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_floordiv():
@@ -429,7 +429,7 @@ def test_matmul_floordiv():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_floordiv_broadcast():
@@ -453,7 +453,7 @@ def test_matmul_floordiv_broadcast():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_floordiv_broadcast2():
@@ -477,7 +477,7 @@ def test_matmul_floordiv_broadcast2():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 1]), dtype=ms.float32)
b = Tensor(np.ones([1, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_assign_sub():
@@ -504,4 +504,4 @@ def test_assign_sub():
x = Tensor(np.ones([128, 32]), dtype=ms.float32)
y = Tensor(np.ones([128, 32]), dtype=ms.float32)
z = Tensor(np.ones([128, 32]), dtype=ms.float32)
compile(net, x, y, z)
compile_net(net, x, y, z)

+ 0
- 1
tests/ut/python/parallel/test_auto_parallel_BN_PReLU.py

@@ -20,7 +20,6 @@ from mindspore import Tensor
from mindspore import context
from mindspore.common.api import _executor
from mindspore.ops import composite as C
from mindspore.ops import operations as P
from tests.ut.python.ops.test_math_ops import VirtualLoss




+ 5
- 6
tests/ut/python/parallel/test_auto_parallel_arithmetic.py

@@ -18,7 +18,6 @@ import mindspore as ms
import mindspore.nn as nn
from mindspore import Tensor
from mindspore import context
from mindspore import context
from mindspore.common.api import _executor
from mindspore.ops import composite as C
from mindspore.ops import operations as P
@@ -48,7 +47,7 @@ class GradWrap(nn.Cell):
return C.grad_all(self.network)(x, y, b)


def compile(net, x, y, b, phase):
def compile_net(net, x, y, b, phase):
net.set_auto_parallel()
_executor.compile(net, x, y, b, phase=phase)

@@ -73,7 +72,7 @@ def test_auto_parallel_arithmetic():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 128]), dtype=ms.float32)
b = Tensor(np.ones([64, 128]), dtype=ms.float32)
compile(net, x, y, b, phase='train')
compile_net(net, x, y, b, phase='train')
strategies = _executor._get_strategy(net)
expected_strategies = {'Default/network-Net/FloorDiv-op0': [[2, 4], [2, 4]],
'Default/network-Net/MatMul-op1': [[2, 1], [1, 4]]}
@@ -100,7 +99,7 @@ def test_auto_parallel_arithmetic_broadcast_both():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 1]), dtype=ms.float32)
b = Tensor(np.ones([1, 64]), dtype=ms.float32)
compile(net, x, y, b, phase='train')
compile_net(net, x, y, b, phase='train')
strategies = _executor._get_strategy(net)
expected_strategies = {'Default/network-Net/FloorDiv-op0': [[8, 1], [1, 1]],
'Default/network-Net/MatMul-op1': [[8, 1], [1, 1]]}
@@ -127,7 +126,7 @@ def test_auto_parallel_arithmetic_broadcast_right():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 32]), dtype=ms.float32)
b = Tensor(np.ones([32]), dtype=ms.float32)
compile(net, x, y, b, phase='train')
compile_net(net, x, y, b, phase='train')
strategies = _executor._get_strategy(net)
expected_strategies = {'Default/network-Net/FloorDiv-op0': [[4, 2], [2]],
'Default/network-Net/MatMul-op1': [[4, 1], [1, 2]]}
@@ -154,7 +153,7 @@ def test_auto_parallel_arithmetic_broadcast_left():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 32]), dtype=ms.float32)
b = Tensor(np.ones([128, 64, 32]), dtype=ms.float32)
compile(net, x, y, b, phase="train")
compile_net(net, x, y, b, phase="train")
strategies = _executor._get_strategy(net)
expected_strategies = {'Default/network-Net/FloorDiv-op0': [[4, 2], [1, 4, 2]],
'Default/network-Net/MatMul-op1': [[4, 1], [1, 2]]}


+ 1
- 2
tests/ut/python/parallel/test_auto_parallel_assign_sub_with_ref_key.py

@@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import re
import numpy as np

import mindspore as ms
import mindspore.nn as nn
@@ -21,7 +21,6 @@ from mindspore import Tensor
from mindspore import context
from mindspore.common.api import _executor
from mindspore.ops import composite as C
from mindspore.ops import operations as P
from mindspore.parallel._utils import _reset_op_id as reset_op_id
from tests.ut.python.ops.test_math_ops import VirtualLoss



+ 0
- 1
tests/ut/python/parallel/test_auto_parallel_double_subgraphs.py

@@ -10,7 +10,6 @@ from mindspore.ops import composite as C
from mindspore.ops import functional as F
from mindspore.ops import operations as P
from mindspore.parallel import _cost_model_context as cost_model_context
from mindspore.parallel import set_algo_parameters, get_algo_parameters, reset_algo_parameters
from mindspore.parallel._utils import _reset_op_id as reset_op_id




+ 4
- 4
tests/ut/python/parallel/test_auto_parallel_four_matmul.py

@@ -44,7 +44,7 @@ class GradWrap(nn.Cell):
return C.grad_all(self.network)(x, y, z, w, b)


def compile(net, x, y, z, w, b):
def compile_net(net, x, y, z, w, b):
net.set_auto_parallel()
_executor.compile(net, x, y, z, w, b)

@@ -77,7 +77,7 @@ def test_four_matmul_linear():

net = GradWrap(NetWithLoss(Net()))
context.set_auto_parallel_context(parallel_mode="auto_parallel")
compile(net, x, y, z, w, b)
compile_net(net, x, y, z, w, b)


def test_four_matmul1():
@@ -103,7 +103,7 @@ def test_four_matmul1():

net = GradWrap(NetWithLoss(Net()))
context.set_auto_parallel_context(parallel_mode="auto_parallel")
compile(net, x, y, z, w, b)
compile_net(net, x, y, z, w, b)


def test_four_matmul2():
@@ -130,4 +130,4 @@ def test_four_matmul2():

net = GradWrap(NetWithLoss(Net()))
context.set_auto_parallel_context(parallel_mode="auto_parallel")
compile(net, x, y, z, w, b)
compile_net(net, x, y, z, w, b)

+ 1
- 1
tests/ut/python/parallel/test_auto_parallel_inference.py

@@ -36,4 +36,4 @@ def test_inference_phase():
train_network.set_train()
train_network.set_auto_parallel()

output = train_network(predict, label)
_ = train_network(predict, label)

+ 1
- 1
tests/ut/python/parallel/test_auto_parallel_matmul_prelu.py

@@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import re
import numpy as np

import mindspore as ms
import mindspore.nn as nn


+ 0
- 1
tests/ut/python/parallel/test_auto_parallel_parameter_cast.py

@@ -16,7 +16,6 @@ import numpy as np

import mindspore as ms
import mindspore.nn as nn
from mindspore import Tensor
from mindspore import Tensor, Parameter
from mindspore import context
from mindspore.common import dtype as mstype


+ 4
- 4
tests/ut/python/parallel/test_auto_parallel_reduce_method.py

@@ -44,7 +44,7 @@ class GradWrap(nn.Cell):
return C.grad_all(self.network)(x, y, b)


def compile(net, x, y, b):
def compile_net(net, x, y, b):
net.set_auto_parallel()
_executor.compile(net, x, y, b)

@@ -71,7 +71,7 @@ def test_sum_mul():
x = Tensor(np.ones([128, 32, 64]), dtype=ms.float32)
y = Tensor(np.ones([128, 32, 64]), dtype=ms.float32)
b = Tensor(np.ones([32, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_sum_mul2():
@@ -95,7 +95,7 @@ def test_sum_mul2():
x = Tensor(np.ones([128, 128, 64, 64]), dtype=ms.float32)
y = Tensor(np.ones([128, 128, 64, 64]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_sum_mul3():
@@ -119,4 +119,4 @@ def test_sum_mul3():
x = Tensor(np.ones([128, 32, 64]), dtype=ms.float32)
y = Tensor(np.ones([128, 32, 64]), dtype=ms.float32)
b = Tensor(np.ones([128, 32]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)

+ 2
- 2
tests/ut/python/parallel/test_auto_parallel_reshape.py

@@ -215,7 +215,7 @@ def test_reshape_auto_5():
size = 8
context.set_auto_parallel_context(device_num=size, global_rank=0)
x = Tensor(np.ones([4, 1024 * size, 1]), dtype=ms.float32)
y = Tensor(np.ones([4, 1024 * size, ]), dtype=ms.float32)
y = Tensor(np.ones([4, 1024 * size,]), dtype=ms.float32)

net = GradWrap(NetWithLoss(Net()))
context.set_auto_parallel_context(parallel_mode="auto_parallel")
@@ -263,7 +263,7 @@ def test_reshape_auto_6():
size = 8
context.set_auto_parallel_context(device_num=size, global_rank=0)
x = Tensor(np.ones([4, 1024, 1]), dtype=ms.float32)
y = Tensor(np.ones([4, 1024, ]), dtype=ms.float32)
y = Tensor(np.ones([4, 1024,]), dtype=ms.float32)

net = GradWrap(NetWithLoss(Net()))
context.set_auto_parallel_context(parallel_mode="auto_parallel")


+ 4
- 4
tests/ut/python/parallel/test_auto_parallel_rhombus.py

@@ -44,7 +44,7 @@ class GradWrap(nn.Cell):
return C.grad_all(self.network)(x, y, b)


def compile(net, x, y, b):
def compile_net(net, x, y, b):
net.set_auto_parallel()
_executor.compile(net, x, y, b)

@@ -72,7 +72,7 @@ def test_rhombus1():

net = GradWrap(NetWithLoss(Net()))
context.set_auto_parallel_context(parallel_mode="auto_parallel")
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_rhombus2():
@@ -103,7 +103,7 @@ def test_rhombus2():

net = GradWrap(NetWithLoss(Net()))
context.set_auto_parallel_context(parallel_mode="auto_parallel")
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_rhombus3():
@@ -134,4 +134,4 @@ def test_rhombus3():

net = GradWrap(NetWithLoss(Net()))
context.set_auto_parallel_context(parallel_mode="auto_parallel")
compile(net, x, y, z)
compile_net(net, x, y, z)

+ 0
- 1
tests/ut/python/parallel/test_auto_parallel_softmax_loss.py

@@ -21,7 +21,6 @@ from mindspore import context
from mindspore.common.api import _executor
from mindspore.ops import composite as C
from mindspore.ops import operations as P
from tests.ut.python.ops.test_math_ops import VirtualLoss


class NetWithLoss(nn.Cell):


+ 2
- 2
tests/ut/python/parallel/test_auto_parallel_transformer.py

@@ -105,8 +105,8 @@ def test_dmnet_train_step():
size = 8
context.set_auto_parallel_context(device_num=size, global_rank=0)

input = Tensor(np.ones([4096, 4096]).astype(np.float32) * 0.01)
input_ = Tensor(np.ones([4096, 4096]).astype(np.float32) * 0.01)
net = GradWrap(NetWithLoss(MultiTransformer()))
context.set_auto_parallel_context(parallel_mode="auto_parallel")
net.set_auto_parallel()
_executor.compile(net, input)
_executor.compile(net, input_)

+ 18
- 4
tests/ut/python/parallel/test_auto_parallel_two_bn.py

@@ -1,5 +1,19 @@
import numpy as np
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
import numpy as np

import mindspore as ms
import mindspore.nn as nn
@@ -33,7 +47,7 @@ class Blockcell(nn.Cell):
return out


def getBlock():
def get_block():
return Blockcell()


@@ -41,8 +55,8 @@ def test_two_bn():
class Net(nn.Cell):
def __init__(self):
super().__init__()
self.block1 = getBlock()
self.block2 = getBlock()
self.block1 = get_block()
self.block2 = get_block()
self.relu = P.ReLU()
self.add = P.TensorAdd()
self.bias = Tensor(np.ones([64, 64]), dtype=ms.float32)


+ 6
- 6
tests/ut/python/parallel/test_auto_parallel_two_matmul.py

@@ -104,23 +104,23 @@ def test_two_matmul():
set_algo_parameters(tensor_slice_align_enable=False, tensor_slice_align_size=32,
fully_use_devices=False, elementwise_op_strategy_follow=False)
para_slice_align_enable = get_algo_parameters("tensor_slice_align_enable")
assert para_slice_align_enable == False
assert not para_slice_align_enable
para_slice_align_size = get_algo_parameters("tensor_slice_align_size")
assert para_slice_align_size == 32
fully_use_devices = get_algo_parameters("fully_use_devices")
assert fully_use_devices == False
assert not fully_use_devices
elementwise_op_strategy_follow = get_algo_parameters("elementwise_op_strategy_follow")
assert elementwise_op_strategy_follow == False
assert not elementwise_op_strategy_follow

reset_algo_parameters()
para_slice_align_enable = get_algo_parameters("tensor_slice_align_enable")
assert para_slice_align_enable == False
assert not para_slice_align_enable
para_slice_align_size = get_algo_parameters("tensor_slice_align_size")
assert para_slice_align_size == 16
fully_use_devices = get_algo_parameters("fully_use_devices")
assert fully_use_devices == True
assert fully_use_devices
elementwise_op_strategy_follow = get_algo_parameters("elementwise_op_strategy_follow")
assert elementwise_op_strategy_follow == False
assert not elementwise_op_strategy_follow

x = Tensor(np.ones([128, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)


+ 1
- 4
tests/ut/python/parallel/test_auto_star_elimination.py

@@ -11,9 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
import os

import mindspore as ms
import mindspore.nn as nn
@@ -21,10 +20,8 @@ from mindspore import Tensor, Parameter
from mindspore import context
from mindspore.common import dtype as mstype
from mindspore.common.api import _executor
from mindspore.common.initializer import initializer
from mindspore.nn.loss.loss import _Loss
from mindspore.ops import composite as C
from mindspore.ops import functional as F
from mindspore.ops import operations as P
from tests.ut.python.ops.test_math_ops import VirtualLoss



+ 7
- 7
tests/ut/python/parallel/test_batch_matmul.py

@@ -41,7 +41,7 @@ _w2 = Tensor(np.ones([128, 32, 32]), dtype=ms.float32)
_b = Tensor(np.ones([128, 64, 16]), dtype=ms.float32)


def compile(net):
def compile_net(net):
optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
train_net = TrainOneStepCell(net, optimizer)
train_net.set_auto_parallel()
@@ -54,7 +54,7 @@ def test_batch_matmul_data_parallel():
strategy1 = ((16, 1, 1), (16, 1, 1))
strategy2 = ((16, 1, 1), (16, 1, 1))
net = Net(_w1, _w2, False, strategy1, strategy2)
compile(net)
compile_net(net)


def test_batch_matmul_model_parallel():
@@ -62,7 +62,7 @@ def test_batch_matmul_model_parallel():
strategy1 = ((1, 1, 1), (1, 1, 1))
strategy2 = ((1, 1, 1), (1, 1, 16))
net = Net(_w1, _w2, False, strategy1, strategy2)
compile(net)
compile_net(net)


def test_batch_matmul_hybrid_parallel():
@@ -70,13 +70,13 @@ def test_batch_matmul_hybrid_parallel():
strategy1 = ((2, 2, 2), (2, 2, 2))
strategy2 = ((2, 2, 2), (2, 2, 2))
net = Net(_w1, _w2, False, strategy1, strategy2)
compile(net)
compile_net(net)


def test_batch_matmul_auto_parallel():
context.set_auto_parallel_context(parallel_mode="auto_parallel", device_num=16, global_rank=0)
net = Net(_w1, _w2, False)
compile(net)
compile_net(net)


def test_batch_matmul_repeat_calc():
@@ -84,7 +84,7 @@ def test_batch_matmul_repeat_calc():
strategy1 = ((2, 2, 4), (2, 2, 4))
strategy2 = ((1, 2, 2), (1, 2, 2))
net = Net(_w1, _w2, False, strategy1, strategy2)
compile(net)
compile_net(net)


def test_batch_matmul_transpose_b():
@@ -92,4 +92,4 @@ def test_batch_matmul_transpose_b():
strategy1 = ((2, 2, 4), (2, 2, 4))
strategy2 = ((1, 2, 2), (1, 2, 2))
net = Net(_w1, _w2, True, strategy1, strategy2)
compile(net)
compile_net(net)

+ 2
- 4
tests/ut/python/parallel/test_batchnorm_batch_parallel.py

@@ -30,7 +30,6 @@ from mindspore.train import Model, ParallelMode
from tests.dataset_mock import MindData

dev_num = 8
strategy_no_weight = ((dev_num, 1, 1, 1),)
strategy_weight = ((dev_num, 1, 1, 1), (1, 1, 1, 1))
strategy_bn = ((dev_num, 1, 1, 1), (1,), (1,))
strategy_fc_weight_bias = ((dev_num, 1), (1, 1), (1,))
@@ -62,7 +61,7 @@ def conv7x7(in_channels, out_channels, stride=1, padding=0):
weight_shape = (out_channels, in_channels, 7, 7)
weight = Tensor(np.ones(weight_shape).astype(np.float32))
conv = Conv2d(in_channels, out_channels,
kernel_size=7, stride=stride, padding=0, weight_init=weight, has_bias=False,
kernel_size=7, stride=stride, padding=padding, weight_init=weight, has_bias=False,
pad_mode="same")
conv.conv2d.set_strategy(strategy_weight)
return conv
@@ -95,7 +94,7 @@ class ResNet(Cell):
def __init__(self, num_classes=100):
super(ResNet, self).__init__()
strategy_no_weight = ((dev_num, 1, 1, 1),)
self.conv1 = conv7x7(3, 64, stride=2, padding=3)
self.conv1 = conv7x7(3, 64, stride=2, padding=0)
self.bn1 = bn_with_initialize(64)
self.relu = ReLU()
self.relu.relu.set_strategy(strategy_no_weight)
@@ -124,7 +123,6 @@ def test_batchnorm_batch_parallel():
learning_rate = 0.1
momentum = 0.9
epoch_size = 2
rank_size = 0

predict = Tensor(np.ones([batch_size, 3, 224, 224]), dtype=ms.float32)
label = Tensor(np.ones([batch_size]), dtype=ms.int32)


+ 6
- 7
tests/ut/python/parallel/test_bn_prelu_cell.py View File

@@ -171,7 +171,7 @@ class PReLU(nn.Cell):

if not isinstance(w, Tensor):
w = Tensor(w)
self.w = Parameter(initializer(w, [channel, ]), name='a')
self.w = Parameter(initializer(w, [channel,]), name='a')
self.prelu = P.PReLU()
self.relu = P.ReLU().set_strategy(((1)))

@@ -181,7 +181,7 @@ class PReLU(nn.Cell):


class BNNet(nn.Cell):
def __init__(self, strategy0, strategy1, strategy2):
def __init__(self):
super(BNNet, self).__init__()
self.bn = FusedBatchNorm(512)
self.prelu = PReLU(512)
@@ -192,13 +192,12 @@ class BNNet(nn.Cell):
return x


def bn_net(strategy0, strategy1, strategy2):
return BNNet(strategy0=strategy0, strategy1=strategy1, strategy2=strategy2)
def bn_net():
return BNNet()


def bn_common(parallel_mode, train_flag, strategy0=None, strategy1=None, strategy2=None, strategy_loss=None):
def bn_common(parallel_mode, train_flag, strategy_loss=None):
context.set_context(mode=context.GRAPH_MODE)
batch_size = 32
learning_rate = 0.1
momentum = 0.9
epoch_size = 2
@@ -207,7 +206,7 @@ def bn_common(parallel_mode, train_flag, strategy0=None, strategy1=None, strateg
predict = Tensor(np.ones([32, 512]), dtype=ms.float32)
label = Tensor(np.ones([32]), dtype=ms.int32)
dataset = Dataset(predict, label, 2)
net = bn_net(strategy0, strategy1, strategy2)
net = bn_net()

loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
loss.softmax_cross_entropy.set_strategy(strategy_loss)


+ 1
- 1
tests/ut/python/parallel/test_bool_grad.py View File

@@ -21,7 +21,7 @@ from mindspore import context
from mindspore.common.parameter import Parameter
from mindspore.nn.optim import Momentum
from mindspore.ops import operations as P
from mindspore.train import Model, ParallelMode
from mindspore.train import Model
from tests.dataset_mock import MindData

context.set_context(mode=context.GRAPH_MODE)


+ 2
- 2
tests/ut/python/parallel/test_broadcast_dict.py View File

@@ -54,7 +54,7 @@ def test_param_broadcast():
network.set_train()

predict = Tensor(np.ones([64, 512]).astype(np.float32) * 0.01)
out = network(predict)
_ = network(predict)
context.reset_auto_parallel_context()


@@ -67,5 +67,5 @@ def test_param_not_broadcast():
network.set_train()

predict = Tensor(np.ones([64, 512]).astype(np.float32) * 0.01)
out = network(predict)
_ = network(predict)
context.reset_auto_parallel_context()
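
Replacing out = network(predict) with _ = network(predict) above presumably silences pylint's unused-variable check (W0612): the call is made only for its side effect, so the result needs no name. A hypothetical minimal version (exercise is a made-up helper):

def exercise(network, data):
    _ = network(data)    # result discarded on purpose; naming it 'out' without using it raises W0612
    return True

exercise(len, [1, 2, 3])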

+ 11
- 11
tests/ut/python/parallel/test_comparison_function_info.py View File

@@ -44,7 +44,7 @@ class GradWrap(nn.Cell):
return C.grad_all(self.network)(x, y, b)


def compile(net, x, y, b):
def compile_net(net, x, y, b):
net.set_auto_parallel()
_executor.compile(net, x, y, b)

@@ -69,7 +69,7 @@ def test_matmul_equal():
x = Tensor(np.ones([128, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([128, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_not_equal():
@@ -92,7 +92,7 @@ def test_matmul_not_equal():
x = Tensor(np.ones([128, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([128, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_not_equal_repeated_calculation():
@@ -115,7 +115,7 @@ def test_matmul_not_equal_repeated_calculation():
x = Tensor(np.ones([128, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([128, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_maximum():
@@ -138,7 +138,7 @@ def test_matmul_maximum():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_maximum_broadcast():
@@ -161,7 +161,7 @@ def test_matmul_maximum_broadcast():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_maximum_broadcast2():
@@ -184,7 +184,7 @@ def test_matmul_maximum_broadcast2():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 1]), dtype=ms.float32)
b = Tensor(np.ones([1, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_minimum():
@@ -207,7 +207,7 @@ def test_matmul_minimum():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_minimum_broadcast():
@@ -230,7 +230,7 @@ def test_matmul_minimum_broadcast():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_minimum_broadcast2():
@@ -253,7 +253,7 @@ def test_matmul_minimum_broadcast2():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 1]), dtype=ms.float32)
b = Tensor(np.ones([1, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_minimum_auto_parallel():
@@ -274,4 +274,4 @@ def test_matmul_minimum_auto_parallel():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 1]), dtype=ms.float32)
b = Tensor(np.ones([1, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)

+ 3
- 5
tests/ut/python/parallel/test_dataset_util.py View File

@@ -12,8 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np

import mindspore as ms
from mindspore import Tensor
from mindspore.train._utils import _to_full_shapes, _to_full_tensor
@@ -35,7 +33,7 @@ def test_to_full_tensor_1():
expect = ([[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [1, 2, 3], [4, 5, 6], [0, 0, 0], [0, 0, 0]])
expect_tensor = Tensor(expect, dtype=ms.float32)

assert (full_tensor[0] == expect_tensor)
assert full_tensor[0] == expect_tensor


def test_to_full_tensor_2():
@@ -52,7 +50,7 @@ def test_to_full_tensor_2():
expect_tensor1 = Tensor(expect1, dtype=ms.int32)
expect_tensors = (expect_tensor0, expect_tensor1)

assert (full_tensor == expect_tensors)
assert full_tensor == expect_tensors


def test_to_full_tensor_sens_2():
@@ -70,4 +68,4 @@ def test_to_full_tensor_sens_2():
expect_tensor_sens = Tensor(0.1, dtype=ms.float32)
expect_tensors = (expect_tensor0, expect_tensor1, expect_tensor_sens)

assert (full_tensor == expect_tensors)
assert full_tensor == expect_tensors
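
Dropping the parentheses around the assert expressions above clears pylint's superfluous-parens check (C0325); both spellings perform the same comparison, only the formatting changes. For example:

a, b = 3, 3
assert (a == b)   # C0325: superfluous parentheses after the 'assert' keyword
assert a == b     # same check, no warning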

+ 2
- 2
tests/ut/python/parallel/test_dense_matmul.py View File

@@ -47,8 +47,8 @@ class DenseMutMulNet(nn.Cell):

def test_dmnet_train_step():
context.reset_auto_parallel_context()
input = Tensor(np.ones([32, 128]).astype(np.float32) * 0.01)
input_ = Tensor(np.ones([32, 128]).astype(np.float32) * 0.01)
label = Tensor(np.zeros([32, 768]).astype(np.float32))
net = DenseMutMulNet()
net = train_step_with_loss_warp(DenseMutMulNet())
_executor.compile(net, input, label)
_executor.compile(net, input_, label)
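
As with compile/compile_net, renaming the local input to input_ avoids shadowing a Python built-in (here input()), which pylint also reports as redefined-builtin (W0622). A stand-alone sketch with a made-up build_batch helper:

import numpy as np

def build_batch():
    input_ = np.ones([32, 128]).astype(np.float32) * 0.01   # naming this 'input' would shadow input()
    return input_.shape

print(build_batch())   # (32, 128)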

+ 4
- 4
tests/ut/python/parallel/test_different_type_for_div_op.py View File

@@ -32,7 +32,7 @@ class GradWrap(nn.Cell):
return C.grad_all(self.network)(x, y, bias)


def compile(net, x, y, bias):
def compile_net(net, x, y, bias):
net.set_auto_parallel()
_executor.compile(net, x, y, bias)

@@ -58,7 +58,7 @@ def test_sum_as_loss_float16():
x = Tensor(np.ones([64, 32]), dtype=ms.float16)
y = Tensor(np.ones([64, 32]), dtype=ms.float16)
bias = Tensor(np.ones([64]), dtype=ms.float16)
compile(net, x, y, bias)
compile_net(net, x, y, bias)


def test_sum_as_loss_float32():
@@ -82,7 +82,7 @@ def test_sum_as_loss_float32():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([64, 32]), dtype=ms.float32)
bias = Tensor(np.ones([64]), dtype=ms.float32)
compile(net, x, y, bias)
compile_net(net, x, y, bias)


def test_sum_as_loss_int32():
@@ -106,4 +106,4 @@ def test_sum_as_loss_int32():
x = Tensor(np.ones([64, 32]), dtype=ms.int32)
y = Tensor(np.ones([64, 32]), dtype=ms.int32)
bias = Tensor(np.ones([64]), dtype=ms.int32)
compile(net, x, y, bias)
compile_net(net, x, y, bias)

+ 6
- 6
tests/ut/python/parallel/test_dropout_do_mask.py View File

@@ -50,7 +50,7 @@ _w1 = Tensor(np.ones([128, 64]), dtype=ms.float32)
_b = Tensor(np.ones([128, 64]), dtype=ms.float32)


def compile(net):
def compile_net(net):
optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
train_net = TrainOneStepCell(net, optimizer)
train_net.set_auto_parallel()
@@ -63,7 +63,7 @@ def test_dropout_do_mask_data_parallel():
strategy1 = ((16, 1), (16, 1))
strategy2 = ((16, 1),)
net = Net(_w1, strategy1, strategy2)
compile(net)
compile_net(net)


def test_dropout_do_mask_model_parallel():
@@ -71,7 +71,7 @@ def test_dropout_do_mask_model_parallel():
strategy1 = ((1, 16), (1, 16))
strategy2 = ((1, 16),)
net = Net(_w1, strategy1, strategy2)
compile(net)
compile_net(net)


def test_dropout_do_mask_hybrid_parallel():
@@ -79,13 +79,13 @@ def test_dropout_do_mask_hybrid_parallel():
strategy1 = ((4, 4), (4, 4))
strategy2 = ((4, 4),)
net = Net(_w1, strategy1, strategy2)
compile(net)
compile_net(net)


def test_dropout_do_mask_auto_parallel():
context.set_auto_parallel_context(parallel_mode="auto_parallel", device_num=16, global_rank=0)
net = Net(_w1)
compile(net)
compile_net(net)


def test_dropout_do_mask_repeat_calc():
@@ -93,4 +93,4 @@ def test_dropout_do_mask_repeat_calc():
strategy1 = ((4, 4), (4, 4))
strategy2 = ((2, 4),)
net = Net(_w1, strategy1, strategy2)
compile(net)
compile_net(net)

+ 11
- 11
tests/ut/python/parallel/test_element_wise_function.py View File

@@ -44,7 +44,7 @@ class GradWrap(nn.Cell):
return C.grad_all(self.network)(x, y, b)


def compile(net, x, y, b):
def compile_net(net, x, y, b):
net.set_auto_parallel()
_executor.compile(net, x, y, b)

@@ -72,7 +72,7 @@ def test_matmul_pow():
x = Tensor(np.ones([128, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_exp():
@@ -98,7 +98,7 @@ def test_matmul_exp():
x = Tensor(np.ones([128, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_log():
@@ -124,7 +124,7 @@ def test_matmul_log():
x = Tensor(np.ones([128, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_logical_not():
@@ -151,7 +151,7 @@ def test_matmul_logical_not():
x = Tensor(np.ones([128, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([128, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_cast():
@@ -178,7 +178,7 @@ def test_matmul_cast():
x = Tensor(np.ones([128, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.int32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_cast_before_mirror():
@@ -202,7 +202,7 @@ def test_cast_before_mirror():
x = Tensor(np.ones([128, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.float16)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_cast_before_mirror1():
@@ -226,7 +226,7 @@ def test_cast_before_mirror1():
x = Tensor(np.ones([128, 32]), dtype=ms.float16)
y = Tensor(np.ones([32, 64]), dtype=ms.float16)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_cast_before_mirror2():
@@ -250,7 +250,7 @@ def test_cast_before_mirror2():
x = Tensor(np.ones([128, 32]), dtype=ms.float16)
y = Tensor(np.ones([32, 64]), dtype=ms.float16)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_cast_before_mirror3():
@@ -274,7 +274,7 @@ def test_cast_before_mirror3():
x = Tensor(np.ones([128, 32]), dtype=ms.float16)
y = Tensor(np.ones([32, 64]), dtype=ms.float16)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_mul_two_cast():
@@ -303,4 +303,4 @@ def test_mul_two_cast():
x = Tensor(np.ones([128, 32]), dtype=ms.float32)
y = Tensor(np.ones([128, 32]), dtype=ms.float32)
b = Tensor(np.ones([128, 32]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)

+ 7
- 7
tests/ut/python/parallel/test_expand_dims.py View File

@@ -54,7 +54,7 @@ _w1 = Tensor(np.ones([128, 64, 32]), dtype=ms.float32)
_b = Tensor(np.ones([128, 64, 32, 1]), dtype=ms.float32)


def compile(net):
def compile_net(net):
optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
train_net = TrainOneStepCell(net, optimizer)
train_net.set_auto_parallel()
@@ -68,7 +68,7 @@ def test_expand_dims_data_parallel():
strategy2 = ((16, 1, 1),)
strategy3 = ((16, 1, 1, 1), (16, 1, 1, 1))
net = Net(_w1, strategy1, strategy2, strategy3)
compile(net)
compile_net(net)


def test_expand_dims_model_parallel():
@@ -77,7 +77,7 @@ def test_expand_dims_model_parallel():
strategy2 = ((1, 1, 16),)
strategy3 = ((1, 1, 16, 1), (1, 1, 16, 1))
net = Net(_w1, strategy1, strategy2, strategy3)
compile(net)
compile_net(net)


def test_expand_dims_hybrid_parallel():
@@ -86,13 +86,13 @@ def test_expand_dims_hybrid_parallel():
strategy2 = ((2, 2, 4),)
strategy3 = ((2, 2, 4, 1), (2, 2, 4, 1))
net = Net(_w1, strategy1, strategy2, strategy3)
compile(net)
compile_net(net)


def test_expand_dims_auto_parallel():
context.set_auto_parallel_context(parallel_mode="auto_parallel", device_num=16, global_rank=0)
net = Net(_w1)
compile(net)
compile_net(net)


def test_expand_dims_repeat_calc():
@@ -101,7 +101,7 @@ def test_expand_dims_repeat_calc():
strategy2 = ((1, 2, 2),)
strategy3 = ((2, 2, 4, 1), (2, 2, 4, 1))
net = Net(_w1, strategy1, strategy2, strategy3)
compile(net)
compile_net(net)


def test_expand_dims_parameter():
@@ -109,4 +109,4 @@ def test_expand_dims_parameter():
strategy1 = ((1, 2, 2),)
strategy2 = ((2, 2, 4, 1), (2, 2, 4, 1))
net = Net2(_w1, strategy1, strategy2)
compile(net)
compile_net(net)

+ 6
- 6
tests/ut/python/parallel/test_forward_graph.py View File

@@ -39,7 +39,7 @@ _w1 = Tensor(np.ones([128, 64, 32]), dtype=ms.float32)
_b = Tensor(np.ones([128, 64, 32]), dtype=ms.float32)


def compile(net):
def compile_net(net):
net.set_auto_parallel()
_executor.compile(net, _x, _b)
context.reset_auto_parallel_context()
@@ -50,7 +50,7 @@ def test_forward_graph_data_parallel():
strategy1 = ((16, 1, 1), (16, 1, 1))
strategy2 = ((16, 1, 1),)
net = Net(_w1, strategy1, strategy2)
compile(net)
compile_net(net)


def test_forward_graph_model_parallel():
@@ -58,7 +58,7 @@ def test_forward_graph_model_parallel():
strategy1 = ((1, 1, 16), (1, 1, 16))
strategy2 = ((1, 1, 16),)
net = Net(_w1, strategy1, strategy2)
compile(net)
compile_net(net)


def test_forward_graph_hybrid_parallel():
@@ -66,13 +66,13 @@ def test_forward_graph_hybrid_parallel():
strategy1 = ((2, 2, 4), (2, 2, 4))
strategy2 = ((2, 2, 4),)
net = Net(_w1, strategy1, strategy2)
compile(net)
compile_net(net)


def test_forward_graph_auto_parallel():
context.set_auto_parallel_context(parallel_mode="auto_parallel", device_num=16, global_rank=0)
net = Net(_w1)
compile(net)
compile_net(net)


def test_forward_graph_repeat_calc():
@@ -80,4 +80,4 @@ def test_forward_graph_repeat_calc():
strategy1 = ((2, 2, 4), (2, 2, 4))
strategy2 = ((1, 2, 2),)
net = Net(_w1, strategy1, strategy2)
compile(net)
compile_net(net)

+ 0
- 1
tests/ut/python/parallel/test_gather_v2.py View File

@@ -18,7 +18,6 @@ import mindspore as ms
import mindspore.nn as nn
from mindspore import Tensor
from mindspore import context
from mindspore.common import dtype as mstype
from mindspore.common.api import _executor
from mindspore.ops import composite as C
from mindspore.ops import operations as P
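
The single-line removal above, like the similar import deletions in several other files here, presumably clears pylint's unused-import warning (W0611): a name is imported but never referenced in the module. A generic illustration with standard-library modules:

import os            # used below, so it stays
# import math        # a line like this, never referenced again, is what W0611 flags

print(os.getcwd())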


+ 10
- 10
tests/ut/python/parallel/test_gather_v2_primitive.py View File

@@ -120,7 +120,7 @@ class TrainOneStepCell(Cell):
return F.depend(loss, self.optimizer(grads))


def net_trains(gather_v2_strategy, criterion, rank):
def net_trains(criterion, rank):
init()
lr = 0.1
momentum = 0.9
@@ -151,42 +151,42 @@ def test_auto_batch_parallel():
gather_v2_strategy = None
criterion = GatherV2(1, strategy=gather_v2_strategy, index_size=batch_size_per_device * device_number)
rank = 2
net_trains(gather_v2_strategy, criterion, rank)
net_trains(criterion, rank)


def test_2d_index_auto_batch_parallel():
gather_v2_strategy = None
criterion = GatherV2(2, strategy=gather_v2_strategy, index_size=batch_size_per_device * device_number)
rank = 2
net_trains(gather_v2_strategy, criterion, rank)
net_trains(criterion, rank)


def test_batch_parallel():
gather_v2_strategy = ((device_number, 1),)
criterion = GatherV2(1, strategy=gather_v2_strategy, index_size=batch_size_per_device * device_number)
rank = 2
net_trains(gather_v2_strategy, criterion, rank)
net_trains(criterion, rank)


def test_strategy1():
gather_v2_strategy = ((16, 2),)
rank = 2
criterion = GatherV2(1, strategy=gather_v2_strategy, index_size=batch_size_per_device * device_number)
net_trains(gather_v2_strategy, criterion, rank)
net_trains(criterion, rank)


def test_strategy2():
gather_v2_strategy = ((1, device_number),)
rank = 2
criterion = GatherV2(1, strategy=gather_v2_strategy, index_size=batch_size_per_device * device_number)
net_trains(gather_v2_strategy, criterion, rank)
net_trains(criterion, rank)


def test_strategy3():
gather_v2_strategy = ((8, 1),)
rank = 2
criterion = GatherV2(1, strategy=gather_v2_strategy, index_size=batch_size_per_device * device_number)
net_trains(gather_v2_strategy, criterion, rank)
net_trains(criterion, rank)


class GatherV2Axis1(_Loss):
@@ -217,18 +217,18 @@ def test_axis1_auto_batch_parallel():
gather_v2_strategy = None
criterion = GatherV2Axis1(1, strategy=gather_v2_strategy, index_size=512)
rank = 2
net_trains(gather_v2_strategy, criterion, rank)
net_trains(criterion, rank)


def test_axis1_batch_parallel():
gather_v2_strategy = ((device_number, 1),)
criterion = GatherV2Axis1(1, strategy=gather_v2_strategy, index_size=512)
rank = 2
net_trains(gather_v2_strategy, criterion, rank)
net_trains(criterion, rank)


def test_axis1_strategy1():
gather_v2_strategy = ((16, 2),)
rank = 17
criterion = GatherV2Axis1(1, strategy=gather_v2_strategy, index_size=512)
net_trains(gather_v2_strategy, criterion, rank)
net_trains(criterion, rank)

+ 9
- 13
tests/ut/python/parallel/test_get_next.py View File

@@ -12,8 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np

import mindspore as ms
import mindspore.nn as nn
from mindspore import Tensor
@@ -23,8 +21,6 @@ from mindspore.common.initializer import initializer
from mindspore.common.parameter import Parameter, ParameterTuple
from mindspore.ops import composite as C
from mindspore.ops import operations as P
from mindspore.ops.operations.comm_ops import _VirtualDataset
from tests.ut.python.ops.test_math_ops import VirtualLoss

context.set_context(mode=context.GRAPH_MODE)

@@ -56,7 +52,7 @@ class GradWrap(nn.Cell):
return C.grad_by_list(self.network, self.weights)()


def compile(net):
def compile_net(net):
net.set_auto_parallel()
_executor.compile(net)

@@ -67,7 +63,7 @@ def test_get_next_single():
super().__init__()
self.norm = P.L2Normalize(axis=1)
self.prelu = P.PReLU()
self.w = Parameter(initializer(w, [channel, ]), name='w')
self.w = Parameter(initializer(w, [channel,]), name='w')

def construct(self, data):
x = self.norm(data)
@@ -84,7 +80,7 @@ def test_get_next_semi_auto_parallel():
super().__init__()
self.norm = P.L2Normalize().set_strategy(strategy1)
self.prelu = P.PReLU().set_strategy(strategy2)
self.w = Parameter(initializer(w, [channel, ]), name='w')
self.w = Parameter(initializer(w, [channel,]), name='w')

def construct(self, data):
x = self.norm(data)
@@ -99,7 +95,7 @@ def test_get_next_semi_auto_parallel():
strategy4=strategy4)
net = GradWrap(net_with_loss)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
compile(net)
compile_net(net)


def test_get_next_semi_auto_parallel1():
@@ -108,7 +104,7 @@ def test_get_next_semi_auto_parallel1():
super().__init__()
self.norm = P.L2Normalize().set_strategy(strategy1)
self.prelu = P.PReLU().set_strategy(strategy2)
self.w = Parameter(initializer(w, [channel, ]), name='w')
self.w = Parameter(initializer(w, [channel,]), name='w')

def construct(self, data):
x = self.norm(data)
@@ -123,7 +119,7 @@ def test_get_next_semi_auto_parallel1():
strategy4=strategy4)
net = GradWrap(net_with_loss)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
compile(net)
compile_net(net)


def test_get_next_auto_parallel():
@@ -132,7 +128,7 @@ def test_get_next_auto_parallel():
super().__init__()
self.norm = P.L2Normalize().set_strategy(strategy1)
self.prelu = P.PReLU().set_strategy(strategy2)
self.w = Parameter(initializer(w, [channel, ]), name='w')
self.w = Parameter(initializer(w, [channel,]), name='w')

def construct(self, data):
x = self.norm(data)
@@ -144,7 +140,7 @@ def test_get_next_auto_parallel():
net_with_loss = NetWithLoss(network, [ms.float32, ms.int32], [[32, 64], [32]], 2)
net = GradWrap(net_with_loss)
context.set_auto_parallel_context(parallel_mode="auto_parallel")
compile(net)
compile_net(net)


def test_only_one_get_next():
@@ -159,4 +155,4 @@ def test_only_one_get_next():
context.set_auto_parallel_context(device_num=4, global_rank=0)
net = Net()
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
compile(net)
compile_net(net)

+ 2
- 2
tests/ut/python/parallel/test_get_parameter_layout.py View File

@@ -52,8 +52,8 @@ def test_get_parameter_layout():
x_layout = [[2, 4], [1, -1], [16, 32]] # device_arrangement = [2, 4], tensor_map = [1, -1]
weight_layout = [[2, 4], [0, -1], [16, 32]] # device_arrangement = [2, 4], tensor_map = [0, -1]
expect_dict = {'x': x_layout, 'w1': weight_layout}
# to be resolved: static local variable count_p is used in step_parallel.cc, it needs to be reset between each ut

assert (net.parameter_layout_dict == expect_dict)
# to be resolved: static local variable count_p is used in step_parallel.cc, it needs to be reset between each ut
assert net.parameter_layout_dict == expect_dict


if __name__ == '__main__':


+ 8
- 8
tests/ut/python/parallel/test_hybird_parallel_activation.py View File

@@ -44,7 +44,7 @@ class GradWrap(nn.Cell):
return C.grad_all(self.network)(x, y, b)


def compile(net, x, y, b):
def compile_net(net, x, y, b):
net.set_auto_parallel()
_executor.compile(net, x, y, b)

@@ -72,7 +72,7 @@ def test_matmul_tanh():
x = Tensor(np.ones([128, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_activation():
@@ -98,7 +98,7 @@ def test_matmul_activation():
x = Tensor(np.ones([128, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_softmax():
@@ -124,7 +124,7 @@ def test_matmul_softmax():
x = Tensor(np.ones([128, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_matmul_logsoftmax():
@@ -150,7 +150,7 @@ def test_matmul_logsoftmax():
x = Tensor(np.ones([128, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_activations():
@@ -179,7 +179,7 @@ def test_activations():
x = Tensor(np.ones([128, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_activations_repeated_calculation():
@@ -211,7 +211,7 @@ def test_activations_repeated_calculation():
x = Tensor(np.ones([128, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_activations_axis_tuple():
@@ -243,4 +243,4 @@ def test_activations_axis_tuple():
x = Tensor(np.ones([128, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)

+ 7
- 7
tests/ut/python/parallel/test_layer_norm.py View File

@@ -48,7 +48,7 @@ _w = Tensor(np.ones([128, 64, 32, 16]), dtype=ms.float32)
_b = Tensor(np.ones([128, 64, 32, 16]), dtype=ms.float32)


def compile(net):
def compile_net(net):
optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
train_net = TrainOneStepCell(net, optimizer)
train_net.set_auto_parallel()
@@ -62,7 +62,7 @@ def test_layer_norm_data_parallel():
strategy2 = ((16, 1, 1, 1), (1, 1, 1), (1, 1, 1))
strategy3 = ((16, 1, 1, 1), (16, 1, 1, 1))
net = Net(_w, strategy1, strategy2, strategy3)
compile(net)
compile_net(net)


def test_layer_norm_model_parallel():
@@ -71,7 +71,7 @@ def test_layer_norm_model_parallel():
strategy2 = ((1, 16, 1, 1), (16, 1, 1), (16, 1, 1))
strategy3 = ((1, 16, 1, 1), (1, 16, 1, 1))
net = Net(_w, strategy1, strategy2, strategy3)
compile(net)
compile_net(net)


def test_layer_norm_hybrid_parallel():
@@ -80,13 +80,13 @@ def test_layer_norm_hybrid_parallel():
strategy2 = ((2, 8, 1, 1), (8, 1, 1), (8, 1, 1))
strategy3 = ((2, 8, 1, 1), (2, 8, 1, 1))
net = Net(_w, strategy1, strategy2, strategy3)
compile(net)
compile_net(net)


def test_layer_norm_auto_parallel():
context.set_auto_parallel_context(parallel_mode="auto_parallel", device_num=16, global_rank=0)
net = Net(_w)
compile(net)
compile_net(net)


def test_layer_norm_repeat_calc():
@@ -95,7 +95,7 @@ def test_layer_norm_repeat_calc():
strategy2 = ((2, 2, 1, 1), (2, 1, 1), (2, 1, 1))
strategy3 = ((2, 2, 4, 1), (2, 2, 4, 1))
net = Net(_w, strategy1, strategy2, strategy3)
compile(net)
compile_net(net)


def test_layer_norm_wrong_strategy():
@@ -105,4 +105,4 @@ def test_layer_norm_wrong_strategy():
strategy3 = ((2, 2, 4, 1), (2, 2, 4, 1))
net = Net(_w, strategy1, strategy2, strategy3)
with pytest.raises(RuntimeError):
compile(net)
compile_net(net)

+ 0
- 1
tests/ut/python/parallel/test_linear.py View File

@@ -21,7 +21,6 @@ from mindspore import context
from mindspore.common.api import _executor
from mindspore.ops import composite as C
from mindspore.ops import operations as P
from tests.ut.python.ops.test_math_ops import VirtualLoss


class NetWithLoss(nn.Cell):


+ 9
- 10
tests/ut/python/parallel/test_loss_and_optimizer.py View File

@@ -19,9 +19,8 @@ import mindspore.nn as nn
from mindspore import Tensor, Parameter
from mindspore import context
from mindspore.common.api import _executor
from mindspore.nn import TrainOneStepCell, WithLossCell
from mindspore.nn import TrainOneStepCell
from mindspore.nn.optim import Momentum, LARS
from mindspore.ops import composite as C
from mindspore.ops import operations as P


@@ -36,7 +35,7 @@ class NetWithLoss(nn.Cell):
return self.loss(predict, b)[0]


def compile(net, x, b):
def compile_net(net, x, b):
net.set_auto_parallel()
_executor.compile(net, x, b)

@@ -72,7 +71,7 @@ def test_momentum():
train_net = TrainOneStepCell(net_with_loss, optimizer)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")

compile(train_net, x, b)
compile_net(train_net, x, b)


def test_momentum_with_loss_scale():
@@ -106,7 +105,7 @@ def test_momentum_with_loss_scale():
train_net = TrainOneStepCell(net_with_loss, optimizer)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")

compile(train_net, x, b)
compile_net(train_net, x, b)


def test_momentum_with_dynamic_lr():
@@ -141,7 +140,7 @@ def test_momentum_with_dynamic_lr():
train_net = TrainOneStepCell(net_with_loss, optimizer)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")

compile(train_net, x, b)
compile_net(train_net, x, b)


def test_momentum_with_loss_scale_and_dynamic_lr():
@@ -177,7 +176,7 @@ def test_momentum_with_loss_scale_and_dynamic_lr():
train_net = TrainOneStepCell(net_with_loss, optimizer)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")

compile(train_net, x, b)
compile_net(train_net, x, b)


def test_lars():
@@ -205,11 +204,11 @@ def test_lars():
net = Net(strategy1, strategy2, weight)

lr = Tensor(np.ones([6]), dtype=ms.float32)
SGD = Momentum(net.trainable_params(), lr, 0.9)
optimizer = LARS(SGD, epsilon=1e-08, hyperpara=0.02, decay_filter=lambda x: 'bn' not in x.name,
sgd = Momentum(net.trainable_params(), lr, 0.9)
optimizer = LARS(sgd, epsilon=1e-08, hyperpara=0.02, decay_filter=lambda x: 'bn' not in x.name,
lars_filter=lambda x: 'bn' not in x.name)
net_with_loss = NetWithLoss(net, strategy3)
train_net = TrainOneStepCell(net_with_loss, optimizer)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")

compile(train_net, x, b)
compile_net(train_net, x, b)
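
The SGD -> sgd rename above follows pylint's naming-convention check (invalid-name, C0103): a plain local variable is expected to be lower_case, while ALL_CAPS is reserved for module-level constants. A rough sketch with hypothetical names:

LEARNING_RATE = 0.1     # module-level constant: UPPER_CASE is expected here

def build_optimizer(params):
    sgd = {"params": params, "lr": LEARNING_RATE}   # naming this 'SGD' would trigger C0103
    return sgd

print(build_optimizer(["w1", "b1"]))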

+ 5
- 5
tests/ut/python/parallel/test_matmul_tensor.py View File

@@ -46,7 +46,7 @@ class GradWrap(nn.Cell):
return C.grad_all(self.network)(x, y)


def compile(net, x, y):
def compile_net(net, x, y):
net.set_auto_parallel()
_executor.compile(net, x, y)

@@ -79,7 +79,7 @@ def test_two_matmul():
x = Tensor(np.ones([128, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 128]), dtype=ms.float32)

compile(net, x, y)
compile_net(net, x, y)


def test_matmul_mul_broadcast2():
@@ -103,7 +103,7 @@ def test_matmul_mul_broadcast2():

x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 1]), dtype=ms.float32)
compile(net, x, y)
compile_net(net, x, y)


def test_two_matmul1():
@@ -133,7 +133,7 @@ def test_two_matmul1():
x = Tensor(np.ones([128, 128]), dtype=ms.float32)
y = Tensor(np.ones([128, 128]), dtype=ms.float32)

compile(net, x, y)
compile_net(net, x, y)


def test_matmul_add_tensor():
@@ -158,4 +158,4 @@ def test_matmul_add_tensor():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)

compile(net, x, y)
compile_net(net, x, y)

+ 6
- 6
tests/ut/python/parallel/test_neg.py View File

@@ -39,7 +39,7 @@ _w1 = Tensor(np.ones([128, 64, 32]), dtype=ms.float32)
_b = Tensor(np.ones([128, 64, 32]), dtype=ms.float32)


def compile(net):
def compile_net(net):
optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
train_net = TrainOneStepCell(net, optimizer)
train_net.set_auto_parallel()
@@ -52,7 +52,7 @@ def test_neg_data_parallel():
strategy1 = ((16, 1, 1), (16, 1, 1))
strategy2 = ((16, 1, 1),)
net = Net(_w1, strategy1, strategy2)
compile(net)
compile_net(net)


def test_neg_model_parallel():
@@ -60,7 +60,7 @@ def test_neg_model_parallel():
strategy1 = ((1, 1, 16), (1, 1, 16))
strategy2 = ((1, 1, 16),)
net = Net(_w1, strategy1, strategy2)
compile(net)
compile_net(net)


def test_neg_hybrid_parallel():
@@ -68,13 +68,13 @@ def test_neg_hybrid_parallel():
strategy1 = ((2, 2, 4), (2, 2, 4))
strategy2 = ((2, 2, 4),)
net = Net(_w1, strategy1, strategy2)
compile(net)
compile_net(net)


def test_neg_auto_parallel():
context.set_auto_parallel_context(parallel_mode="auto_parallel", device_num=16, global_rank=0)
net = Net(_w1)
compile(net)
compile_net(net)


def test_neg_repeat_calc():
@@ -82,4 +82,4 @@ def test_neg_repeat_calc():
strategy1 = ((2, 2, 4), (2, 2, 4))
strategy2 = ((1, 2, 2),)
net = Net(_w1, strategy1, strategy2)
compile(net)
compile_net(net)

+ 1
- 1
tests/ut/python/parallel/test_one_dev.py View File

@@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import re
import numpy as np

import mindspore as ms
import mindspore.nn as nn
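
Moving import re above import numpy as np matches pylint's wrong-import-order check (C0411): standard-library imports come first, third-party imports after. The target layout is simply:

import os            # standard library first
import re

import numpy as np   # third-party second

print(re.escape(os.sep), np.__version__)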


+ 8
- 9
tests/ut/python/parallel/test_one_hot_net.py View File

@@ -159,8 +159,8 @@ class SemiAutoOneHotNet(Cell):
weight_np = np.zeros(weight_shape, np.float32)
self.weight = Parameter(Tensor(weight_np), name='model_parallel_weight')

def construct(self, input, label):
input_n = self.normalize(input)
def construct(self, input_, label):
input_n = self.normalize(input_)
w = self.normalize2(self.weight)
fc_o = self.fc(input_n, w)
fc_o_shape = F.shape(fc_o)
@@ -209,9 +209,8 @@ class Dataset(MindData):
raise StopIteration
self.index += 1
if self.input_num == 2:
return self.predict, self.label
else:
return self.predict,
return (self.predict, self.label)
return (self.predict,)

def reset(self):
self.index = 0
@@ -268,20 +267,20 @@ def test_bn_reshape_dense_bn_train_loss():
batch_size = 16
device_num = 16
context.set_auto_parallel_context(device_num=device_num, global_rank=0)
input = Tensor(np.ones([batch_size, 2, 32, 32]).astype(np.float32) * 0.01)
input_ = Tensor(np.ones([batch_size, 2, 32, 32]).astype(np.float32) * 0.01)
label = Tensor(np.ones([batch_size]), dtype=ms.int32)

net = GradWrap(NetWithLoss(BNReshapeDenseBNNet()))
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()

_executor.compile(net, input, label)
_executor.compile(net, input_, label)


def test_semi_one_hot_net_batch():
batch_size = 16
context.set_auto_parallel_context(device_num=device_num, global_rank=0)
input = Tensor(np.ones([batch_size * 1, 512]).astype(np.float32) * 0.01)
input_ = Tensor(np.ones([batch_size * 1, 512]).astype(np.float32) * 0.01)
label = Tensor(np.ones([batch_size]), dtype=ms.int32)

net = SemiAutoOneHotNet(args=Args(), strategy=StrategyBatch())
@@ -289,7 +288,7 @@ def test_semi_one_hot_net_batch():
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
net.set_auto_parallel()

_executor.compile(net, input, label)
_executor.compile(net, input_, label)


def test_semi_one_hot_net_model():
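
The Dataset iterator rewrite above (also applied in test_prelu_cell.py and test_reshape.py) is pylint's no-else-return refactor (R1705): an else that follows a return is redundant, and the explicit parentheses make the one-element tuple easier to read. A stand-alone sketch of the same shape:

def next_item(predict, label, input_num):
    if input_num == 2:
        return (predict, label)
    return (predict,)             # no 'else' needed after the early return

print(next_item("p", "l", 2))     # ('p', 'l')
print(next_item("p", "l", 1))     # ('p',)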


+ 0
- 1
tests/ut/python/parallel/test_one_weight_parameter.py View File

@@ -20,7 +20,6 @@ from mindspore import Tensor, Parameter, ParameterTuple
from mindspore import context
from mindspore.common.api import _executor
from mindspore.ops import composite as C
from mindspore.ops import functional as F
from mindspore.ops import operations as P




+ 0
- 9
tests/ut/python/parallel/test_onehot.py View File

@@ -126,15 +126,6 @@ def test_onehot_auto():
compile_graph(strategy1, strategy2, strategy3, strategy4, auto=True)


def test_onehot_model_parallel():
context.set_auto_parallel_context(device_num=16, global_rank=0)
strategy1 = ((2, 4), (4, 2))
strategy2 = ((2, 8),)
strategy3 = ((1, 16), (), ())
strategy4 = ((16, 1), (16, 1))
compile_graph(strategy1, strategy2, strategy3, strategy4)


def test_onehot_batch_parallel_axis0():
context.set_auto_parallel_context(device_num=16, global_rank=0)
strategy1 = ((2, 4), (4, 2))


+ 21
- 27
tests/ut/python/parallel/test_operator_model_parallel.py View File

@@ -21,8 +21,6 @@ from mindspore.common.initializer import initializer
from mindspore.common.parameter import Parameter
from mindspore.common.tensor import Tensor
from mindspore.nn.cell import Cell
from mindspore.nn.layer.activation import ReLU
from mindspore.nn.layer.basic import Dense
from mindspore.nn.layer.basic import Flatten
from mindspore.nn.layer.conv import Conv2d
from mindspore.nn.layer.normalization import BatchNorm2d
@@ -61,8 +59,7 @@ class DenseWrap(Cell):
self.has_bias = has_bias

self.weight = Parameter(initializer(
weight_init, [output_channels, input_channels]),
name="weight")
weight_init, [output_channels, input_channels]), name="weight")

if self.has_bias:
self.bias = Parameter(initializer(
@@ -103,7 +100,7 @@ class DatasetLenet(MindData):
self.index = 0


def conv3x3(in_channels, out_channels, stride=1, padding=1):
def conv3x3(in_channels, out_channels, stride=1):
"""3x3 convolution """
weight_shape = (out_channels, in_channels, 3, 3)
weight = Tensor(np.ones(weight_shape).astype(np.float32))
@@ -114,7 +111,7 @@ def conv3x3(in_channels, out_channels, stride=1, padding=1):
return conv


def conv1x1(in_channels, out_channels, stride=1, padding=0):
def conv1x1(in_channels, out_channels, stride=1):
"""1x1 convolution"""
weight_shape = (out_channels, in_channels, 1, 1)
weight = Tensor(np.ones(weight_shape).astype(np.float32))
@@ -125,7 +122,7 @@ def conv1x1(in_channels, out_channels, stride=1, padding=0):
return conv


def conv7x7(in_channels, out_channels, stride=1, padding=0):
def conv7x7(in_channels, out_channels, stride=1):
"""1x1 convolution"""
weight_shape = (out_channels, in_channels, 7, 7)
weight = Tensor(np.ones(weight_shape).astype(np.float32))
@@ -186,18 +183,17 @@ class ResidualBlock(Cell):
def __init__(self,
in_channels,
out_channels,
stride=1,
down_sample=False):
stride=1):
super(ResidualBlock, self).__init__()

out_chls = out_channels // self.expansion
self.conv1 = conv1x1(in_channels, out_chls, stride=1, padding=0)
self.conv1 = conv1x1(in_channels, out_chls, stride=1)
self.bn1 = bn_with_initialize(out_chls)

self.conv2 = conv3x3(out_chls, out_chls, stride=stride, padding=0)
self.conv2 = conv3x3(out_chls, out_chls, stride=stride)
self.bn2 = bn_with_initialize(out_chls)

self.conv3 = conv1x1(out_chls, out_channels, stride=1, padding=0)
self.conv3 = conv1x1(out_chls, out_channels, stride=1)
self.bn3 = bn_with_initialize_last(out_channels)

self.relu1 = P.ReLU().set_strategy(strategy_no_weight)
@@ -236,21 +232,21 @@ class ResidualBlockWithDown(Cell):
super(ResidualBlockWithDown, self).__init__()

out_chls = out_channels // self.expansion
self.conv1 = conv1x1(in_channels, out_chls, stride=1, padding=0)
self.conv1 = conv1x1(in_channels, out_chls, stride=1)
self.bn1 = bn_with_initialize(out_chls)

self.conv2 = conv3x3(out_chls, out_chls, stride=stride, padding=0)
self.conv2 = conv3x3(out_chls, out_chls, stride=stride)
self.bn2 = bn_with_initialize(out_chls)

self.conv3 = conv1x1(out_chls, out_channels, stride=1, padding=0)
self.conv3 = conv1x1(out_chls, out_channels, stride=1)
self.bn3 = bn_with_initialize_last(out_channels)

self.relu1 = P.ReLU().set_strategy(strategy_no_weight)
self.relu2 = P.ReLU().set_strategy(strategy_no_weight)
self.relu3 = P.ReLU().set_strategy(strategy_no_weight)
self.downSample = down_sample
self.down_sample = down_sample

self.conv_down_sample = conv1x1(in_channels, out_channels, stride=stride, padding=0)
self.conv_down_sample = conv1x1(in_channels, out_channels, stride=stride)
self.bn_down_sample = bn_with_initialize(out_channels)
self.add = TensorAdd().set_strategy(strategy_add)

@@ -279,7 +275,7 @@ class ResidualBlockWithDown(Cell):

class MakeLayer0(Cell):

def __init__(self, block, layer_num, in_channels, out_channels, stride):
def __init__(self, block, in_channels, out_channels, stride):
super(MakeLayer0, self).__init__()
self.a = ResidualBlockWithDown(in_channels, out_channels, stride=1, down_sample=True)
self.b = block(out_channels, out_channels, stride=stride)
@@ -295,14 +291,14 @@ class MakeLayer0(Cell):

class ResNet(Cell):

def __init__(self, block, layer_num, num_classes=100):
def __init__(self, block, num_classes=100):
super(ResNet, self).__init__()
self.conv1 = conv7x7(3, 64, stride=2, padding=3)
self.conv1 = conv7x7(3, 64, stride=2)
self.bn1 = bn_with_initialize(64)
self.relu = P.ReLU().set_strategy(strategy_no_weight)
self.maxpool = MaxPool2d(kernel_size=3, stride=2, pad_mode="same")
self.layer1 = MakeLayer0(
block, layer_num[0], in_channels=64, out_channels=256, stride=1)
block, in_channels=64, out_channels=256, stride=1)
self.pool = M.ReduceMean(keep_dims=True).set_strategy(strategy_no_weight)
self.fc = fc_with_initialize(64 * block.expansion, num_classes)
self.flatten = Flatten()
@@ -320,12 +316,12 @@ class ResNet(Cell):


class ResNetModelParallel(Cell):
def __init__(self, block, layer_num, num_classes=100):
def __init__(self, block, num_classes=100):
super(ResNetModelParallel, self).__init__()
self.relu = P.ReLU().set_strategy(((1, dev_num, 1, 1),))
self.maxpool = MaxPool2d(kernel_size=3, stride=2, pad_mode="same")
self.layer1 = MakeLayer0(
block, layer_num[0], in_channels=64, out_channels=256, stride=1)
block, in_channels=64, out_channels=256, stride=1)
self.pool = M.ReduceMean(keep_dims=True).set_strategy(strategy_no_weight)
self.fc = fc_with_initialize(64 * block.expansion, num_classes)
self.flatten = Flatten()
@@ -341,11 +337,11 @@ class ResNetModelParallel(Cell):


def resnet_operator_net(num_classes):
return ResNet(ResidualBlock, [3, 4, 6, 3], num_classes)
return ResNet(ResidualBlock, num_classes)


def resnet_model_parallel_net(num_classes):
return ResNetModelParallel(ResidualBlock, [3, 4, 6, 3], num_classes)
return ResNetModelParallel(ResidualBlock, num_classes)


def test_resnet_operator_batch_parallel():
@@ -354,7 +350,6 @@ def test_resnet_operator_batch_parallel():
learning_rate = 0.1
momentum = 0.9
epoch_size = 2
rank_size = dev_num

context.reset_auto_parallel_context()
context.set_auto_parallel_context(device_num=dev_num, global_rank=0)
@@ -381,7 +376,6 @@ def test_resnet_model_parallel():
learning_rate = 0.1
momentum = 0.9
epoch_size = 2
rank_size = dev_num

context.reset_auto_parallel_context()
context.set_auto_parallel_context(device_num=dev_num, global_rank=0)
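
The parameter removals in this file (for example padding in the conv helpers and down_sample in ResidualBlock) look like the standard fix for pylint's unused-argument warning (W0613): the functions accepted arguments their bodies never read. A minimal before/after sketch with a made-up conv helper:

def conv3x3_old(in_channels, out_channels, stride=1, padding=1):   # W0613: 'padding' is never read
    return (out_channels, in_channels, 3, 3, stride)

def conv3x3_new(in_channels, out_channels, stride=1):              # unused parameter removed
    return (out_channels, in_channels, 3, 3, stride)

print(conv3x3_new(64, 64))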


+ 3
- 3
tests/ut/python/parallel/test_optimizer_clone_weight.py View File

@@ -35,7 +35,7 @@ class NetWithLoss(nn.Cell):
return self.loss(predict, b)[0]


def compile(net, x, b):
def compile_net(net, x, b):
net.set_auto_parallel()
_Executor().compile(net, x, b)

@@ -72,7 +72,7 @@ def test_optimizer_clone_weight():
train_net = TrainOneStepCell(net_with_loss, optimizer)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")

compile(train_net, x, b)
compile_net(train_net, x, b)


def test_optimizer_clone_weight2():
@@ -107,4 +107,4 @@ def test_optimizer_clone_weight2():
train_net = TrainOneStepCell(net_with_loss, optimizer)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")

compile(train_net, x, b)
compile_net(train_net, x, b)

+ 1
- 1
tests/ut/python/parallel/test_parameter_init.py View File

@@ -52,7 +52,7 @@ def test_parameter_init():
weight = Tensor(np.ones([64, 32]), dtype=ms.float32)

net = Net(strategy1, weight)
net(x, )
net(x,)


if __name__ == '__main__':


+ 7
- 7
tests/ut/python/parallel/test_prelu.py View File

@@ -44,7 +44,7 @@ class GradWrap(nn.Cell):
return C.grad_all(self.network)(x, y)


def compile(net, x, y):
def compile_net(net, x, y):
net.set_auto_parallel()
_executor.compile(net, x, y)

@@ -63,7 +63,7 @@ def test_prelu_single_success1():
net = GradWrap(NetWithLoss(Net()))
x = Tensor(np.random.rand(1, 33, 4, 4), ms.float32)
w = Tensor(np.random.rand(33), ms.float32)
compile(net, x, w)
compile_net(net, x, w)


def test_prelu_single_success2():
@@ -80,7 +80,7 @@ def test_prelu_single_success2():
net = GradWrap(NetWithLoss(Net()))
x = Tensor(np.random.rand(1, 33, 4, 4), ms.float32)
w = Tensor([0.1], ms.float32)
compile(net, x, w)
compile_net(net, x, w)


def test_prelu_parallel_success1():
@@ -100,7 +100,7 @@ def test_prelu_parallel_success1():
x = Tensor(np.random.rand(4, 4, 32, 64), dtype=ms.float32)
w = Tensor(np.random.rand(4), dtype=ms.float32)
net = GradWrap(NetWithLoss(Net(strategy)))
compile(net, x, w)
compile_net(net, x, w)


def test_prelu_parallel_success2():
@@ -120,7 +120,7 @@ def test_prelu_parallel_success2():
x = Tensor(np.random.rand(4, 4, 32, 64), dtype=ms.float32)
w = Tensor(np.random.rand(4), dtype=ms.float32)
net = GradWrap(NetWithLoss(Net(strategy)))
compile(net, x, w)
compile_net(net, x, w)


def test_prelu_parallel_success3():
@@ -183,7 +183,7 @@ def test_prelu_parallel_success4():
x = Tensor(np.random.rand(4, 16, 32, 64), dtype=ms.float32)
w = Tensor(np.random.rand(16), dtype=ms.float32)
net = GradWrap(NetWithLoss(Net(strategy)))
compile(net, x, w)
compile_net(net, x, w)


def test_prelu_parallel_success5():
@@ -203,4 +203,4 @@ def test_prelu_parallel_success5():
x = Tensor(np.random.rand(4, 16, 32, 64), dtype=ms.float32)
w = Tensor(np.random.rand(1), dtype=ms.float32)
net = GradWrap(NetWithLoss(Net(strategy)))
compile(net, x, w)
compile_net(net, x, w)

+ 3
- 5
tests/ut/python/parallel/test_prelu_cell.py View File

@@ -47,9 +47,8 @@ class Dataset(MindData):
raise StopIteration
self.index += 1
if self.input_num == 2:
return self.predict, self.label
else:
return self.predict,
return (self.predict, self.label)
return (self.predict,)

def reset(self):
self.index = 0
@@ -68,7 +67,7 @@ class PReLU(nn.Cell):
if not isinstance(w, Tensor):
raise TypeError("w only support np.float32, float or Tensor type.")

self.w = Parameter(initializer(w, [channel, ]), name='a')
self.w = Parameter(initializer(w, [channel,]), name='a')
self.prelu = P.PReLU()
self.relu = P.ReLU().set_strategy(((1,),))
self.sub = P.Sub().set_strategy(((1,), (1,)))
@@ -97,7 +96,6 @@ def prelu_net():


def reshape_common(parallel_mode):
batch_size = 32
learning_rate = 0.1
momentum = 0.9
epoch_size = 2


+ 29
- 29
tests/ut/python/parallel/test_reduce_method_info.py View File

@@ -44,7 +44,7 @@ class GradWrap(nn.Cell):
return C.grad_all(self.network)(x, y, b)


def compile(net, x, y, b):
def compile_net(net, x, y, b):
net.set_auto_parallel()
_executor.compile(net, x, y, b)

@@ -74,7 +74,7 @@ def test_sum_mul():
x = Tensor(np.ones([128, 32, 64]), dtype=ms.float32)
y = Tensor(np.ones([128, 32, 64]), dtype=ms.float32)
b = Tensor(np.ones([128, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_sum_mul2():
@@ -101,7 +101,7 @@ def test_sum_mul2():
x = Tensor(np.ones([128, 128, 64, 64]), dtype=ms.float32)
y = Tensor(np.ones([128, 128, 64, 64]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_sum_mul3():
@@ -128,7 +128,7 @@ def test_sum_mul3():
x = Tensor(np.ones([128, 32, 64]), dtype=ms.float32)
y = Tensor(np.ones([128, 32, 64]), dtype=ms.float32)
b = Tensor(np.ones([128, 32]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_sum_mul4():
@@ -155,7 +155,7 @@ def test_sum_mul4():
x = Tensor(np.ones([128, 32, 64]), dtype=ms.float32)
y = Tensor(np.ones([128, 32, 64]), dtype=ms.float32)
b = Tensor(np.ones([128, 32, 1]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_sum_mul5():
@@ -179,7 +179,7 @@ def test_sum_mul5():
x = Tensor(np.ones([128, 32, 64]), dtype=ms.float32)
y = Tensor(np.ones([128, 32, 64]), dtype=ms.float32)
b = Tensor(np.ones([1, 32, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_sum_mul6():
@@ -203,7 +203,7 @@ def test_sum_mul6():
x = Tensor(np.ones([128, 32, 64]), dtype=ms.float32)
y = Tensor(np.ones([128, 32, 64]), dtype=ms.float32)
b = Tensor(np.ones([128, 1, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_sum_mul7():
@@ -227,7 +227,7 @@ def test_sum_mul7():
x = Tensor(np.ones([128, 32, 64]), dtype=ms.float32)
y = Tensor(np.ones([128, 32, 64]), dtype=ms.float32)
b = Tensor(np.ones([1, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_max_mul():
@@ -254,7 +254,7 @@ def test_max_mul():
x = Tensor(np.ones([128, 32, 64]), dtype=ms.float32)
y = Tensor(np.ones([128, 32, 64]), dtype=ms.float32)
b = Tensor(np.ones([128, 32]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_min_mul():
@@ -281,7 +281,7 @@ def test_min_mul():
x = Tensor(np.ones([128, 32, 64]), dtype=ms.float32)
y = Tensor(np.ones([128, 32, 64]), dtype=ms.float32)
b = Tensor(np.ones([32, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_reduce_mean_mul_float32():
@@ -309,7 +309,7 @@ def test_reduce_mean_mul_float32():
y = Tensor(np.ones([128, 32, 64]), dtype=ms.float32)
b = Tensor(np.ones([32, 64]), dtype=ms.float32)

compile(net, x, y, b)
compile_net(net, x, y, b)


class ArgMaxWithValueNet(nn.Cell):
@@ -321,7 +321,7 @@ class ArgMaxWithValueNet(nn.Cell):

def construct(self, x, y, b):
out = self.mul1(x, y)
index, out = self.arg_max_with_value(out)
_, out = self.arg_max_with_value(out)
out = self.mul2(out, b)
return out

@@ -335,16 +335,16 @@ class ArgMinWithValueNet(nn.Cell):

def construct(self, x, y, b):
out = self.mul1(x, y)
index, out = self.arg_min_with_value(out)
_, out = self.arg_min_with_value(out)
out = self.mul2(out, b)
return out


def gen_inputs_and_compile(net):
def gen_inputs_and_compile_net(net):
x = Tensor(np.ones([128, 64, 64]), dtype=ms.float32)
y = Tensor(np.ones([128, 64, 64]), dtype=ms.float32)
b = Tensor(np.ones([128, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def tobefixed_test_arg_max_with_value_mul_semi_axis_parallel():
@@ -354,7 +354,7 @@ def tobefixed_test_arg_max_with_value_mul_semi_axis_parallel():
strategy3 = ((2, 4), (2, 4))
net = GradWrap(NetWithLoss(ArgMaxWithValueNet(strategy1, strategy2, strategy3)))
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
gen_inputs_and_compile(net)
gen_inputs_and_compile_net(net)


def test_arg_max_with_value_mul_semi():
@@ -364,7 +364,7 @@ def test_arg_max_with_value_mul_semi():
strategy3 = ((2, 4), (2, 4))
net = GradWrap(NetWithLoss(ArgMaxWithValueNet(strategy1, strategy2, strategy3)))
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
gen_inputs_and_compile(net)
gen_inputs_and_compile_net(net)


def test_arg_max_with_value_mul_auto():
@@ -374,7 +374,7 @@ def test_arg_max_with_value_mul_auto():
strategy3 = None
net = GradWrap(NetWithLoss(ArgMaxWithValueNet(strategy1, strategy2, strategy3)))
context.set_auto_parallel_context(parallel_mode="auto_parallel")
gen_inputs_and_compile(net)
gen_inputs_and_compile_net(net)


def test_arg_min_with_value_mul_semi_axis_parallel():
@@ -384,7 +384,7 @@ def test_arg_min_with_value_mul_semi_axis_parallel():
strategy3 = ((2, 4), (2, 4))
net = GradWrap(NetWithLoss(ArgMinWithValueNet(strategy1, strategy2, strategy3)))
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
gen_inputs_and_compile(net)
gen_inputs_and_compile_net(net)


def test_arg_min_with_value_mul_semi():
@@ -394,7 +394,7 @@ def test_arg_min_with_value_mul_semi():
strategy3 = ((2, 4), (2, 4))
net = GradWrap(NetWithLoss(ArgMinWithValueNet(strategy1, strategy2, strategy3)))
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
gen_inputs_and_compile(net)
gen_inputs_and_compile_net(net)


def test_arg_min_with_value_mul_auto():
@@ -404,7 +404,7 @@ def test_arg_min_with_value_mul_auto():
strategy3 = None
net = GradWrap(NetWithLoss(ArgMinWithValueNet(strategy1, strategy2, strategy3)))
context.set_auto_parallel_context(parallel_mode="auto_parallel")
gen_inputs_and_compile(net)
gen_inputs_and_compile_net(net)


class ArgMinWithValueNet2(nn.Cell):
@@ -416,7 +416,7 @@ class ArgMinWithValueNet2(nn.Cell):

def construct(self, x, y, b):
out = self.mul1(x, y)
index, out = self.arg_min_with_value(out)
_, out = self.arg_min_with_value(out)
out = self.relu(out)
return out

@@ -428,7 +428,7 @@ def tobefixed_test_arg_min_with_value_mul_semi_axis_parallel2():
strategy3 = ((2, 4, 1),)
net = GradWrap(NetWithLoss(ArgMinWithValueNet2(strategy1, strategy2, strategy3)))
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
gen_inputs_and_compile(net)
gen_inputs_and_compile_net(net)


def test_arg_min_with_value_mul_semi2():
@@ -438,7 +438,7 @@ def test_arg_min_with_value_mul_semi2():
strategy3 = ((2, 4, 1),)
net = GradWrap(NetWithLoss(ArgMinWithValueNet2(strategy1, strategy2, strategy3)))
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
gen_inputs_and_compile(net)
gen_inputs_and_compile_net(net)


def test_arg_min_with_value_mul_auto2():
@@ -448,7 +448,7 @@ def test_arg_min_with_value_mul_auto2():
strategy3 = None
net = GradWrap(NetWithLoss(ArgMinWithValueNet2(strategy1, strategy2, strategy3)))
context.set_auto_parallel_context(parallel_mode="auto_parallel")
gen_inputs_and_compile(net)
gen_inputs_and_compile_net(net)


def test_cross_batch():
@@ -475,7 +475,7 @@ def test_cross_batch():
x = Tensor(np.ones([32, 64]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([32, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_cross_batch2():
@@ -502,7 +502,7 @@ def test_cross_batch2():
x = Tensor(np.ones([32, 64]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([32, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_cross_batch_auto():
@@ -526,7 +526,7 @@ def test_cross_batch_auto():
x = Tensor(np.ones([32, 64]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([32, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_max_empty_tuple():
@@ -554,4 +554,4 @@ def test_max_empty_tuple():
y = Tensor(np.ones([128, 32, 64]), dtype=ms.float32)
b = Tensor(np.ones([128, 32]), dtype=ms.float32)

compile(net, x, y, b)
compile_net(net, x, y, b)

+ 10
- 13
tests/ut/python/parallel/test_reshape.py View File

@@ -18,7 +18,6 @@ import mindspore as ms
import mindspore.nn as nn
from mindspore import Tensor
from mindspore import context
from mindspore.common import dtype as mstype
from mindspore.common.api import _executor
from mindspore.common.parameter import Parameter
from mindspore.common.parameter import ParameterTuple
@@ -54,9 +53,8 @@ class Dataset(MindData):
raise StopIteration
self.index += 1
if self.input_num == 2:
return self.predict, self.label
else:
return self.predict,
return (self.predict, self.label)
return (self.predict,)

def reset(self):
self.index = 0
@@ -82,7 +80,6 @@ def reshape_net(strategy0, strategy1, strategy2):


def reshape_common(parallel_mode, strategy0, strategy1, strategy2, strategy_loss):
batch_size = 32
learning_rate = 0.1
momentum = 0.9
epoch_size = 2
@@ -306,21 +303,21 @@ class ReshapeNet6(nn.Cell):
return matmul2_o


def compile(net, input):
def compile_net(net, input_):
net.set_auto_parallel()
_executor.compile(net, input)
_executor.compile(net, input_)


def reshape_net2(backbone):
batch_size = 16
device_num = 16
context.set_auto_parallel_context(device_num=device_num, global_rank=0)
input = Tensor(np.ones([batch_size * device_num, 512, 7, 7]).astype(np.float32) * 0.01)
input_ = Tensor(np.ones([batch_size * device_num, 512, 7, 7]).astype(np.float32) * 0.01)

net = GradWrap(NetWithLoss(backbone))
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")

compile(net, input)
compile_net(net, input_)


def test_reshape_net1_1():
@@ -480,11 +477,11 @@ def test_batchnorm_reshape_train():
device_num = 16
context.set_auto_parallel_context(device_num=device_num, global_rank=0)
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
input = Tensor(np.ones([batch_size * device_num, 512]).astype(np.float32) * 0.01)
input_ = Tensor(np.ones([batch_size * device_num, 512]).astype(np.float32) * 0.01)

net = GradWrap(NetWithLoss(BatchNormReshapeNet()))

compile(net, input)
compile_net(net, input_)


def bn_with_initialize(out_channels):
@@ -517,12 +514,12 @@ def test_bn_reshape_dense_bn_train():
batch_size = 16
device_num = 16
context.set_auto_parallel_context(device_num=device_num, global_rank=0)
input = Tensor(np.ones([batch_size, 2, 32, 32]).astype(np.float32) * 0.01)
input_ = Tensor(np.ones([batch_size, 2, 32, 32]).astype(np.float32) * 0.01)

net = GradWrap(NetWithLoss(BNReshapeDenseBNNet()))
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")

compile(net, input)
compile_net(net, input_)


class ParallelReduceMeanNet(nn.Cell):


+ 3
- 3
tests/ut/python/parallel/test_reshape_parameter.py View File

@@ -58,7 +58,7 @@ class Net(nn.Cell):
return out


def compile(net, x, y):
def compile_net(net, x, y):
net.set_auto_parallel()
_executor.compile(net, x, y)

@@ -69,7 +69,7 @@ def test_reshape_parameter_data_parallel():
net = GradWrap(NetWithLoss(Net(strategy)))
x = Tensor(np.ones([10000, 36]), dtype=ms.float32)
y = Tensor(np.ones([10000, 36, 1]), dtype=ms.float32)
compile(net, x, y)
compile_net(net, x, y)


def test_reshape_parameter_model_parallel():
@@ -78,4 +78,4 @@ def test_reshape_parameter_model_parallel():
net = GradWrap(NetWithLoss(Net(strategy)))
x = Tensor(np.ones([10000, 36]), dtype=ms.float32)
y = Tensor(np.ones([10000, 36, 1]), dtype=ms.float32)
compile(net, x, y)
compile_net(net, x, y)

+ 0
- 1
tests/ut/python/parallel/test_scalar_loss.py View File

@@ -22,7 +22,6 @@ from mindspore.common.api import _executor
from mindspore.ops import composite as C
from mindspore.ops import functional as F
from mindspore.ops import operations as P
from tests.ut.python.ops.test_math_ops import VirtualLoss


class GradWrap(nn.Cell):


+ 12
- 12
tests/ut/python/parallel/test_set_auto_parallel_context.py View File

@@ -30,10 +30,10 @@ def test_set_auto_parallel_context():
parameter_broadcast = context.get_auto_parallel_context("parameter_broadcast")
assert device_num == 4
assert global_rank == 3
assert mirror_mean == True
assert cast_before_mirror == False
assert mirror_mean
assert not cast_before_mirror
assert parallel_mode == "auto_parallel"
assert parameter_broadcast == False
assert not parameter_broadcast

auto_parallel_context().set_communication_backend("hccl")
backend = auto_parallel_context().get_communication_backend()
@@ -43,7 +43,7 @@ def test_set_auto_parallel_context():
device_num = auto_parallel_context().get_device_num()
device_num_is_set = auto_parallel_context().get_device_num_is_set()
assert device_num == 4
assert device_num_is_set == True
assert device_num_is_set

auto_parallel_context().set_global_rank(4)
global_rank = auto_parallel_context().get_global_rank()
@@ -51,14 +51,14 @@ def test_set_auto_parallel_context():

auto_parallel_context().set_mirror_mean(True)
mirror_mean = auto_parallel_context().get_mirror_mean()
assert mirror_mean == True
assert mirror_mean

auto_parallel_context().set_cast_before_mirror(False)
cast_before_mirror = auto_parallel_context().get_cast_before_mirror()
assert cast_before_mirror == False
assert not cast_before_mirror

parameter_broadcast_is_set = auto_parallel_context().get_parameter_broadcast_is_set()
assert parameter_broadcast_is_set == True
assert parameter_broadcast_is_set

with pytest.raises(ValueError):
context.set_auto_parallel_context(device_num=0)
@@ -94,9 +94,9 @@ def test_reset_auto_parallel_context():
parameter_broadcast_is_set = auto_parallel_context().get_parameter_broadcast_is_set()
assert device_num == 1
assert global_rank == 0
assert mirror_mean == False
assert cast_before_mirror == True
assert not mirror_mean
assert cast_before_mirror
assert parallel_mode == "stand_alone"
assert parameter_broadcast == False
assert device_num_is_set == False
assert parameter_broadcast_is_set == False
assert not parameter_broadcast
assert not device_num_is_set
assert not parameter_broadcast_is_set
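
The assertion rewrites above address pylint's singleton-comparison check (C0121): comparing a boolean to True or False with == is redundant, so the flags are asserted directly. A minimal sketch with made-up flag values standing in for the auto_parallel_context() getters:

# Illustrative flag values standing in for the auto_parallel_context() getters.
mirror_mean = True
cast_before_mirror = False
parameter_broadcast = False

# Before (flagged by pylint as singleton-comparison, C0121):
#     assert mirror_mean == True
#     assert cast_before_mirror == False
# After: assert the boolean directly.
assert mirror_mean
assert not cast_before_mirror
assert not parameter_broadcast
print("all flags checked")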

+ 6
- 6
tests/ut/python/parallel/test_sigmoid_cross_entropy_with_logits.py View File

@@ -39,7 +39,7 @@ _w1 = Tensor(np.ones([128, 64]), dtype=ms.float32)
_b = Tensor(np.ones([128, 64]), dtype=ms.float32)


def compile(net):
def compile_net(net):
optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
train_net = TrainOneStepCell(net, optimizer)
train_net.set_auto_parallel()
@@ -52,7 +52,7 @@ def test_sigmoid_cross_entropy_with_logits_data_parallel():
strategy1 = ((16, 1), (16, 1))
strategy2 = ((16, 1), (16, 1))
net = Net(_w1, strategy1, strategy2)
compile(net)
compile_net(net)


def test_sigmoid_cross_entropy_with_logits_model_parallel():
@@ -60,7 +60,7 @@ def test_sigmoid_cross_entropy_with_logits_model_parallel():
strategy1 = ((1, 16), (1, 16))
strategy2 = ((1, 16), (1, 16))
net = Net(_w1, strategy1, strategy2)
compile(net)
compile_net(net)


def test_sigmoid_cross_entropy_with_logits_hybrid_parallel():
@@ -68,13 +68,13 @@ def test_sigmoid_cross_entropy_with_logits_hybrid_parallel():
strategy1 = ((2, 8), (2, 8))
strategy2 = ((2, 8), (2, 8))
net = Net(_w1, strategy1, strategy2)
compile(net)
compile_net(net)


def test_sigmoid_cross_entropy_with_logits_auto_parallel():
context.set_auto_parallel_context(parallel_mode="auto_parallel", device_num=16, global_rank=0)
net = Net(_w1)
compile(net)
compile_net(net)


def test_sigmoid_cross_entropy_with_logits_repeat_calc():
@@ -82,4 +82,4 @@ def test_sigmoid_cross_entropy_with_logits_repeat_calc():
strategy1 = ((2, 8), (2, 8))
strategy2 = ((2, 2), (2, 2))
net = Net(_w1, strategy1, strategy2)
compile(net)
compile_net(net)

+ 4
- 5
tests/ut/python/parallel/test_softmax_cross_entropy_loss.py View File

@@ -21,7 +21,6 @@ from mindspore import context
from mindspore.common.api import _executor
from mindspore.ops import composite as C
from mindspore.ops import operations as P
from tests.ut.python.ops.test_math_ops import VirtualLoss


class NetWithLoss(nn.Cell):
@@ -44,7 +43,7 @@ class GradWrap(nn.Cell):
return C.grad_all(self.network)(x, y, b)


def compile(net, x, y, b):
def compile_net(net, x, y, b):
net.set_auto_parallel()
_executor.compile(net, x, y, b)

@@ -71,7 +70,7 @@ def test_softmax_cross_entropy_loss():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([64, 32]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_softmax_cross_entropy_loss_repeated_calculation():
@@ -96,7 +95,7 @@ def test_softmax_cross_entropy_loss_repeated_calculation():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([64, 32]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_softmax_cross_entropy_loss_auto_batch_parallel():
@@ -118,4 +117,4 @@ def test_softmax_cross_entropy_loss_auto_batch_parallel():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([64, 32]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)

+ 4
- 5
tests/ut/python/parallel/test_split_grad_sens.py View File

@@ -22,7 +22,6 @@ from mindspore import context
from mindspore.common.api import _executor
from mindspore.ops import composite as C
from mindspore.ops import operations as P
from tests.ut.python.ops.test_math_ops import VirtualLoss


class GradWrap(nn.Cell):
@@ -54,7 +53,7 @@ class GradWrap3(nn.Cell):
return C.grad_all(self.network)(x, y, bias)


def compile(net, x, y, b):
def compile_net(net, x, y, b):
net.set_auto_parallel()
_executor.compile(net, x, y, b)

@@ -81,7 +80,7 @@ def test_no_grad():
x = Tensor(np.ones([128, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_grad_sens_parameter_type():
@@ -135,7 +134,7 @@ def test_grad_sens_tensor_type():
x = Tensor(np.ones([128, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_grad_sens_scalar_broadcast():
@@ -159,4 +158,4 @@ def test_grad_sens_scalar_broadcast():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([64, 32]), dtype=ms.float32)
bias = Tensor(np.ones([64]), dtype=ms.float32)
compile(net, x, y, bias)
compile_net(net, x, y, bias)

+ 8
- 8
tests/ut/python/parallel/test_squeeze_info.py View File

@@ -15,9 +15,9 @@
import numpy as np

import mindspore as ms
from mindspore import context, Tensor, Parameter
from mindspore import context, Tensor
from mindspore.common.api import _executor
from mindspore.nn import Cell, TrainOneStepCell, Momentum
from mindspore.nn import Cell
from mindspore.ops import operations as P


@@ -37,7 +37,7 @@ _x = Tensor(np.ones([64, 1, 32, 1]), dtype=ms.float32)
_b = Tensor(np.ones([64, 32]), dtype=ms.float32)


def compile(net):
def compile_net(net):
net.set_auto_parallel()
_executor.compile(net, _x, _b)
context.reset_auto_parallel_context()
@@ -48,7 +48,7 @@ def test_squeeze_data_parallel():
strategy1 = ((16, 1, 1, 1),)
strategy2 = ((16, 1), (16, 1))
net = Net(strategy1, strategy2)
compile(net)
compile_net(net)


def test_squeeze_model_parallel():
@@ -56,7 +56,7 @@ def test_squeeze_model_parallel():
strategy1 = ((1, 1, 16, 1),)
strategy2 = ((1, 16), (1, 16))
net = Net(strategy1, strategy2)
compile(net)
compile_net(net)


def test_squeeze_specified_axis():
@@ -64,13 +64,13 @@ def test_squeeze_specified_axis():
strategy1 = ((4, 1, 4, 1),)
strategy2 = ((8, 2), (8, 2))
net = Net(strategy1, strategy2, (1, 3))
compile(net)
compile_net(net)


def test_squeeze_auto_parallel():
context.set_auto_parallel_context(parallel_mode="auto_parallel", device_num=16, global_rank=0)
net = Net()
compile(net)
compile_net(net)


def test_squeeze_repeat_calc():
@@ -78,4 +78,4 @@ def test_squeeze_repeat_calc():
strategy1 = ((1, 1, 8, 1),)
strategy2 = ((2, 8), (2, 8))
net = Net(strategy1, strategy2)
compile(net)
compile_net(net)
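
Dropping Parameter, TrainOneStepCell, and Momentum from the import lines above is the usual fix for pylint's unused-import warning (W0611): only the names the module actually uses are kept. A generic, self-contained illustration using standard-library modules rather than the MindSpore symbols:

# Generic illustration with standard-library modules instead of the
# MindSpore symbols; `json` and `sys` play the role of the unused imports.
# Before:
#     import json
#     import os
#     import sys
# After: keep only what the module actually references.
import os

print(os.path.basename("tests/ut/python/parallel/test_squeeze_info.py"))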

+ 3
- 4
tests/ut/python/parallel/test_sum_as_loss.py View File

@@ -21,7 +21,6 @@ from mindspore import context
from mindspore.common.api import _executor
from mindspore.ops import composite as C
from mindspore.ops import operations as P
from tests.ut.python.ops.test_math_ops import VirtualLoss


class GradWrap(nn.Cell):
@@ -33,7 +32,7 @@ class GradWrap(nn.Cell):
return C.grad_all(self.network)(x, y, bias)


def compile(net, x, y, bias):
def compile_net(net, x, y, bias):
net.set_auto_parallel()
_executor.compile(net, x, y, bias)

@@ -59,7 +58,7 @@ def test_sum_as_loss():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([64, 32]), dtype=ms.float32)
bias = Tensor(np.ones([64]), dtype=ms.float32)
compile(net, x, y, bias)
compile_net(net, x, y, bias)


def test_sum_as_loss2():
@@ -83,4 +82,4 @@ def test_sum_as_loss2():
x = Tensor(np.ones([64, 32]), dtype=ms.float32)
y = Tensor(np.ones([64, 32]), dtype=ms.float32)
bias = Tensor(np.ones([64]), dtype=ms.float32)
compile(net, x, y, bias)
compile_net(net, x, y, bias)

+ 0
- 2
tests/ut/python/parallel/test_transpose.py View File

@@ -17,7 +17,6 @@ import numpy as np
import mindspore as ms
import mindspore.nn as nn
from mindspore import Tensor, context
from mindspore import context
from mindspore.common.parameter import Parameter
from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
from mindspore.nn.optim.momentum import Momentum
@@ -67,7 +66,6 @@ def transpose_net(strategy1, strategy2):


def transpose_common(strategy1, strategy2):
batch_size = 32
learning_rate = 0.1
momentum = 0.9
epoch_size = 2
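
The two removals in this file, the duplicated `from mindspore import context` line and the `batch_size = 32` local in transpose_common, target pylint's reimported and unused-variable warnings (W0404/W0612): the value was assigned but never read. A small runnable sketch of the same cleanup, with a hypothetical transpose_common_sketch helper rather than the real function:

# Toy version of the cleanup: the unused local is simply deleted.
def transpose_common_sketch(strategy1, strategy2):
    # an unused `batch_size = 32` used to sit here and triggered W0612
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2
    return strategy1, strategy2, learning_rate, momentum, epoch_size

print(transpose_common_sketch((8, 1), (1, 8)))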


+ 4
- 4
tests/ut/python/parallel/test_two_matmul.py View File

@@ -44,7 +44,7 @@ class GradWrap(nn.Cell):
return C.grad_all(self.network)(x, y, b)


def compile(net, x, y, b):
def compile_net(net, x, y, b):
net.set_auto_parallel()
_executor.compile(net, x, y, b)

@@ -72,7 +72,7 @@ def test_two_matmul():
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)

compile(net, x, y, b)
compile_net(net, x, y, b)


def test_two_matmul_repeated_calculation1():
@@ -96,7 +96,7 @@ def test_two_matmul_repeated_calculation1():
x = Tensor(np.ones([128, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)


def test_two_matmul_repeated_calculation2():
@@ -120,4 +120,4 @@ def test_two_matmul_repeated_calculation2():
x = Tensor(np.ones([128, 32]), dtype=ms.float32)
y = Tensor(np.ones([32, 64]), dtype=ms.float32)
b = Tensor(np.ones([64, 64]), dtype=ms.float32)
compile(net, x, y, b)
compile_net(net, x, y, b)

+ 0
- 1
tests/ut/python/parallel/test_two_weights_parameter.py View File

@@ -20,7 +20,6 @@ from mindspore import Tensor, Parameter, ParameterTuple
from mindspore import context
from mindspore.common.api import _executor
from mindspore.ops import composite as C
from mindspore.ops import functional as F
from mindspore.ops import operations as P




+ 2
- 2
tests/ut/python/parallel/test_virtual_dataset_3_input.py View File

@@ -78,7 +78,7 @@ def test_virtual_dataset_3_input():

def test_virtualdataset_cell_3_inputs():
class Net(nn.Cell):
def __init__(self, strategy0, strategy1, strategy2, strategy3):
def __init__(self, strategy1, strategy2, strategy3):
super().__init__()
self.matmul1 = P.MatMul().set_strategy(strategy1)
self.matmul2 = P.MatMul().set_strategy(strategy2)
@@ -89,7 +89,7 @@ def test_virtualdataset_cell_3_inputs():
out = self.matmul2(out, b)
return out

net = GradWrap(VirtualDatasetCellTriple(NetWithLoss(Net(None, None, None, None))))
net = GradWrap(VirtualDatasetCellTriple(NetWithLoss(Net(None, None, None))))
context.set_context(save_graphs=True)
context.set_auto_parallel_context(parallel_mode="auto_parallel")
context.set_auto_parallel_context(device_num=8, global_rank=0)
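
The change above removes strategy0, an __init__ parameter that was accepted but never used (pylint's unused-argument, W0613), and updates the call site to pass three strategies instead of four. A self-contained sketch of the pattern; NetSketch and its describe method are illustrative, not the test's actual Net:

# Illustrative stand-in for the test's Net: the unused `strategy0`
# parameter is dropped from __init__ and from the instantiation.
class NetSketch:
    def __init__(self, strategy1, strategy2, strategy3):
        self.strategies = (strategy1, strategy2, strategy3)

    def describe(self):
        return self.strategies

net = NetSketch(None, None, None)  # previously took a fourth, unused argument
print(net.describe())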

