| @@ -1,178 +0,0 @@ | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| import os | |||
| import numpy as np | |||
| import pytest | |||
| import mindspore as ms | |||
| import mindspore.communication.management as distributedTool | |||
| from mindspore import context | |||
| from mindspore.common.tensor import Tensor | |||
| from mindspore.nn import Cell | |||
| from mindspore.ops import operations as P | |||
| from mindspore.ops.composite import grad_all_with_sens | |||
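# Each test process drives one of four devices; its rank comes from the RANK_ID
# environment variable exported by the launch scripts (dist_env_4p.sh / run_*_4p.sh).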
| device_num = 4 | |||
| device_id = int(os.environ["RANK_ID"]) | |||
| path = "./output/" | |||
| def setup_module(): | |||
| print("~~~~~~~~~~~set up~~~~~~~~~~~~~") | |||
| context.set_context(mode=context.GRAPH_MODE) | |||
| context.set_auto_parallel_context(device_num=device_num, global_rank=device_id) | |||
| distributedTool.init() | |||
| distributedTool.create_group("0-3", [0, 1, 2, 3]) | |||
| print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~") | |||
| def teardown_module(): | |||
| print("~~~~~~~~~~~~tear down~~~~~~~~~~") | |||
| class AddRelu(Cell): | |||
| def __init__(self, strategy0=None, strategy1=None): | |||
| super(AddRelu, self).__init__() | |||
| self.add = P.TensorAdd(strategy=strategy0) | |||
| self.relu = P.ReLU(strategy=strategy1) | |||
| def construct(self, x, z): | |||
| out = self.add(x, z) | |||
| return self.relu(out) | |||
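# Wraps a network and returns the gradients of all its inputs, given an explicit
# sens (gradient of the output) as the last argument.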
| class Grad(Cell): | |||
| def __init__(self, network): | |||
| super(Grad, self).__init__() | |||
| self.network = network | |||
| def construct(self, x, y, output_grad): | |||
| return grad_all_with_sens(self.network)(x, y, output_grad) | |||
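# Test-factory pattern used throughout these cases: build full-size inputs and the
# per-device blocks implied by the parallel strategy, run the net once on a single
# device and once under semi-auto parallel, then compare the matching blocks.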
| class AddReluFactory: | |||
| def __init__(self, input_shape, strategy0, strategy1): | |||
| prefix = "" | |||
| size = 1 | |||
| for s in input_shape: | |||
| prefix = prefix + str(s) | |||
| size = size * s | |||
| self.prefix = prefix | |||
| number_range = min(1000, size) | |||
| self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype( | |||
| np.float32) | |||
| self.input_np2 = 1.0 | |||
| self.output_grad_np = np.reshape((np.arange(0, size) % (number_range - 10) - number_range / 2) * 0.1, | |||
| input_shape).astype(np.float32) | |||
| self.strategy0 = strategy0 | |||
| self.strategy1 = strategy1 | |||
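        # The split counts in strategy[1] determine how many devices the tensor is
        # sharded across; each rank selects its block by device_id modulo that count.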
| need_dev_num = 1 | |||
| need_dev_num_ = 1 | |||
| for s in strategy0[1]: | |||
| need_dev_num = need_dev_num * s | |||
| for s in strategy1[1]: | |||
| need_dev_num_ = need_dev_num_ * s | |||
| self.x_id = device_id % need_dev_num | |||
| self.y_id = device_id % need_dev_num | |||
| self.out_id = device_id % need_dev_num_ | |||
| def forward_mindspore_impl(self): | |||
| net = AddRelu() | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2, ms.float32) | |||
| out = net(x, y) | |||
| return out.asnumpy() | |||
| def forward_mindspore_parallel_impl(self): | |||
| net = AddRelu(strategy0=self.strategy0, strategy1=self.strategy1) | |||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||
| net.set_auto_parallel() | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2, ms.float32) | |||
| inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1]) | |||
| x1 = Tensor(inputs_x[self.x_id]) | |||
| y1 = Tensor(self.input_np2, ms.float32) | |||
| out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1]) | |||
| return out.asnumpy() | |||
| def grad_mindspore_impl(self): | |||
| output_grad = Tensor(self.output_grad_np) | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2, ms.float32) | |||
| net = AddRelu() | |||
| grad_net = Grad(net) | |||
| grad_net.set_train() | |||
| input_grad = grad_net(x, y, output_grad) | |||
| return input_grad | |||
| def grad_mindspore_parallel_impl(self): | |||
| output_grads = self.get_parallel_blocks(self.output_grad_np, self.strategy1[1]) | |||
| output_grad = Tensor(output_grads[self.out_id]) | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2, ms.float32) | |||
| net = AddRelu(strategy0=self.strategy0, strategy1=self.strategy1) | |||
| grad_net = Grad(net) | |||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||
| grad_net.set_auto_parallel() | |||
| grad_net.set_train() | |||
| inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1]) | |||
| x1 = Tensor(inputs_x[self.x_id]) | |||
| y1 = Tensor(self.input_np2, ms.float32) | |||
| input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad], | |||
| parallel_inputs_run=[x1, y1, output_grad]) | |||
| return input_grad | |||
| def get_parallel_blocks(self, input_, strategy): | |||
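        """Split input_ along each axis by the per-dimension counts in strategy and
        return the flat list of blocks; rank i uses blocks[i % len(blocks)]."""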
| blocks = [input_] | |||
| i = 0 | |||
| for stra in strategy: | |||
| temp = [] | |||
| while len(blocks) > 0: | |||
| block = blocks.pop(0) | |||
| temp.extend(np.split(block, stra, axis=i)) | |||
| blocks.extend(temp) | |||
| i += 1 | |||
| return blocks | |||
| def forward_cmp(self): | |||
| out_mindspore = self.forward_mindspore_impl() | |||
| out_mindspore_parallel = self.forward_mindspore_parallel_impl() | |||
| out_blocks = self.get_parallel_blocks(out_mindspore, self.strategy1[1]) | |||
| assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.0001, 0.001) | |||
| def grad_cmp(self): | |||
| input_grad_mindspore = self.grad_mindspore_impl() | |||
| input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl() | |||
| _ = input_grad_mindspore[0].asnumpy() | |||
| input_grad_mindspore1 = input_grad_mindspore[1].asnumpy() | |||
| _ = input_grad_mindspore_parallel[0].asnumpy() | |||
| input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy() | |||
| assert np.allclose(input_grad_mindspore1, input_grad_mindspore_parallel1, 0.0001, 0.0001) | |||
| @pytest.mark.reid_forward | |||
| def test_reid_add_relu_input_256_64(): | |||
| stra0 = (0, (2, 2), ()) | |||
| stra1 = (0, (2, 2)) | |||
| fact = AddReluFactory(input_shape=(256, 64), strategy0=stra0, strategy1=stra1) | |||
| fact.forward_cmp() | |||
| @pytest.mark.reid_grad | |||
| def test_reid_grad_add_relu_input_256_64(): | |||
| stra0 = (0, (2, 2), ()) | |||
| stra1 = (0, (2, 2)) | |||
| fact = AddReluFactory(input_shape=(256, 64), strategy0=stra0, strategy1=stra1) | |||
| fact.grad_cmp() | |||
| @@ -1,27 +0,0 @@ | |||
| #!/bin/bash | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
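# Launch one pytest process per device: each rank gets its own working directory
# (device$i) with an output dir, sources the 4-process distributed environment with
# its rank, and runs the test in the background, logging to ../../log.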
| for((i=0;i<4;i++)); | |||
| do | |||
| rm -rf device$i | |||
| mkdir device$i | |||
| cd device$i | |||
| mkdir output | |||
| source ../../dist_env_4p.sh $i | |||
| env >log$i.log | |||
| pytest -s ../test_add_relu_parallel_4p.py>../../log/test_add_relu_parallel_4p_log$i.log 2>&1 & | |||
| cd .. | |||
| done | |||
| @@ -1,356 +0,0 @@ | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| import os | |||
| import numpy as np | |||
| from numpy import allclose | |||
| import mindspore.communication.management as distributedTool | |||
| from mindspore import context | |||
| from mindspore._checkparam import check_bool, twice | |||
| from mindspore.common.initializer import initializer | |||
| from mindspore.common.parameter import Parameter | |||
| from mindspore.common.tensor import Tensor | |||
| from mindspore.nn import Cell | |||
| from mindspore.ops import operations as P | |||
| from mindspore.ops.composite import grad_all_with_sens | |||
| device_num = 4 | |||
| device_id = int(os.environ["RANK_ID"]) | |||
| path = "./output/" | |||
| def setup_module(): | |||
| print("~~~~~~~~~~~set up~~~~~~~~~~~~~") | |||
| context.set_context(mode=context.GRAPH_MODE) | |||
| context.set_auto_parallel_context(device_num=device_num, global_rank=device_id) | |||
| distributedTool.init() | |||
| distributedTool.create_group("0-3", [0, 1, 2, 3]) | |||
| print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~") | |||
| def teardown_module(): | |||
| print("~~~~~~~~~~~~tear down~~~~~~~~~~") | |||
| class _Conv(Cell): | |||
| r"""Applies a N-D convolution over an input signal composed of several input | |||
| planes. | |||
| """ | |||
| def __init__(self, | |||
| in_channels, | |||
| out_channels, | |||
| kernel_size, | |||
| stride, | |||
| pad_mode, | |||
| padding, | |||
| dilation, | |||
| group, | |||
| has_bias, | |||
| weight_init, | |||
| bias_init): | |||
| super(_Conv, self).__init__() | |||
| self.in_channels = in_channels | |||
| self.out_channels = out_channels | |||
| self.kernel_size = kernel_size | |||
| self.stride = stride | |||
| self.pad_mode = pad_mode | |||
| self.padding = padding | |||
| self.dilation = dilation | |||
| self.group = group | |||
| self.has_bias = has_bias | |||
| if not (isinstance(in_channels, int) and in_channels > 0): | |||
            raise ValueError('Attr \'in_channels\' of \'Conv2D\' Op passed '
                             + str(in_channels) + ', should be an int greater than 0.')
| if (not isinstance(kernel_size, tuple)) or len(kernel_size) != 2 or \ | |||
| (not isinstance(kernel_size[0], int)) or (not isinstance(kernel_size[1], int)) or \ | |||
| kernel_size[0] < 1 or kernel_size[1] < 1: | |||
            raise ValueError('Attr \'kernel_size\' of \'Conv2D\' Op passed '
                             + str(self.kernel_size) + ', should be a tuple of two ints, each no less than 1.')
| if in_channels % group != 0: | |||
| raise ValueError('Attr \'in_channels\' of \'Conv2D\' Op must be divisible by ' | |||
| 'attr \'group\' of \'Conv2D\' Op.') | |||
| if out_channels % group != 0: | |||
| raise ValueError('Attr \'out_channels\' of \'Conv2D\' Op must be divisible by ' | |||
| 'attr \'group\' of \'Conv2D\' Op.') | |||
| self.weight = Parameter(initializer( | |||
| weight_init, [out_channels, in_channels // group, *kernel_size]), name='weight') | |||
| if check_bool(has_bias): | |||
| self.bias = Parameter(initializer( | |||
| bias_init, [out_channels]), name='bias') | |||
| else: | |||
| if bias_init != 'zeros': | |||
| print("Value of 'has_bias' is False, value of 'bias_init' will be ignored.") | |||
| self.bias = None | |||
| def construct(self, *inputs): | |||
| raise NotImplementedError | |||
| class Conv2d(_Conv): | |||
| def __init__(self, | |||
| in_channels, | |||
| out_channels, | |||
| kernel_size, | |||
| stride=1, | |||
| pad_mode='same', | |||
| padding=0, | |||
| dilation=1, | |||
| group=1, | |||
| has_bias=False, | |||
| weight_init='normal', | |||
| bias_init='zeros', | |||
| strategy=None): | |||
| kernel_size = twice(kernel_size) | |||
| super(Conv2d, self).__init__( | |||
| in_channels, | |||
| out_channels, | |||
| kernel_size, | |||
| stride, | |||
| pad_mode, | |||
| padding, | |||
| dilation, | |||
| group, | |||
| has_bias, | |||
| weight_init, | |||
| bias_init) | |||
| self.add = P.TensorAdd(strategy) | |||
| self.conv2d = P.Conv2D(out_channel=self.out_channels, | |||
| kernel_size=self.kernel_size, | |||
| mode=1, | |||
| pad_mode=self.pad_mode, | |||
| pad=self.padding, | |||
| stride=self.stride, | |||
| dilation=self.dilation, | |||
| group=self.group, | |||
| strategy=None) | |||
| self.bias_add = P.BiasAdd() | |||
| def construct(self, input1, input2): | |||
| x = self.add(input1, input2) | |||
| if self.has_bias: | |||
| return self.bias_add(self.conv2d(x, self.weight), | |||
| self.bias) | |||
| return self.conv2d(x, self.weight) | |||
| class Grad(Cell): | |||
| def __init__(self, network): | |||
| super(Grad, self).__init__() | |||
| self.network = network | |||
| def construct(self, input1, input2, output_grad): | |||
| return grad_all_with_sens(self.network)(input1, input2, output_grad) | |||
| class Conv2dFactory: | |||
| def __init__(self, input_shape, filter_shape, stride, pad_mode, padding, dilation, group, has_bias): | |||
| self.in_n, self.in_c, self.in_h, self.in_w = input_shape | |||
| self.out_c, self.kernel_c, self.kernel_h, self.kernel_w = filter_shape | |||
| self.stride = stride | |||
| self.pad_mode = pad_mode | |||
| self.padding = padding | |||
| self.dilation = dilation | |||
| self.group = group | |||
| self.strategy0 = (0, (4, 1, 1, 1), (1, 1, 1, 1)) | |||
| prefix = "" | |||
| input_size = 1 | |||
| filter_size = 1 | |||
| for s in input_shape: | |||
| prefix = prefix + str(s) + "_" | |||
| input_size = input_size * s | |||
| self.prefix = prefix | |||
| for s in filter_shape: | |||
| filter_size = filter_size * s | |||
| number_range1 = min(10, input_size) | |||
| number_range2 = min(10, filter_size) | |||
| self.input_np1 = np.reshape(np.arange(0, input_size) % number_range1 - number_range1 / 2, input_shape).astype( | |||
| np.float16) | |||
| self.input_np2 = np.reshape(np.arange(0, input_size) % number_range1 - number_range1 / 4, input_shape).astype( | |||
| np.float16) | |||
| self.weight_np = np.reshape(np.arange(0, filter_size) % number_range2 - number_range2 / 2, filter_shape).astype( | |||
| np.float16) | |||
| self.has_bias = has_bias | |||
        if self.has_bias:
| self.bias_np = np.arange(0, self.out_c).astype(np.float16) | |||
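        # Expected output shape for the single case exercised below: a 128x64x112x112
        # input with a 64x64x1x1 kernel and stride 2 gives 128x64x56x56.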
| self.out_shape = (128, 64, 56, 56) | |||
| out_size = 1 | |||
| for s in self.out_shape: | |||
| out_size = out_size * s | |||
| number_range3 = min(10, out_size) | |||
| self.output_grad_np = np.reshape(np.arange(0, out_size) % number_range3 - number_range3 / 2, | |||
| self.out_shape).astype(np.float16) | |||
| self.x_id = device_id % 4 | |||
| self.y_id = device_id % 4 | |||
| self.out_strategy = self.strategy0[1] | |||
| self.out_id = device_id % 4 | |||
| def get_parallel_blocks(self, input_, strategy): | |||
| blocks = [input_] | |||
| i = 0 | |||
| for stra in strategy: | |||
| temp = [] | |||
| while len(blocks) > 0: | |||
| block = blocks.pop(0) | |||
| temp.extend(np.split(block, stra, axis=i)) | |||
| blocks.extend(temp) | |||
| i += 1 | |||
| return blocks | |||
| def forward_conv2d_mindspore_impl(self): | |||
| input1 = Tensor(self.input_np1) | |||
| input2 = Tensor(self.input_np2) | |||
| weight = Tensor(self.weight_np) | |||
| if self.has_bias: | |||
| bias = Tensor(self.bias_np) | |||
| net = Conv2d(in_channels=self.in_c, out_channels=self.out_c, | |||
| kernel_size=(self.kernel_h, self.kernel_w), | |||
| stride=self.stride, pad_mode=self.pad_mode, | |||
| padding=self.padding, dilation=self.dilation, | |||
| group=self.group, has_bias=True, weight_init=weight, | |||
| bias_init=bias) | |||
| else: | |||
| net = Conv2d(in_channels=self.in_c, out_channels=self.out_c, | |||
| kernel_size=(self.kernel_h, self.kernel_w), | |||
| stride=self.stride, pad_mode=self.pad_mode, | |||
| padding=self.padding, dilation=self.dilation, | |||
| group=self.group, has_bias=False, weight_init=weight) | |||
| out = net(input1, input2) | |||
| return out.asnumpy() | |||
| def forward_conv2d_mindspore_parallel_impl(self): | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2) | |||
| weight = Tensor(self.weight_np) | |||
| inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1]) | |||
| inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1]) | |||
| x1 = Tensor(inputs_x[self.x_id]) | |||
| y1 = Tensor(inputs_y[self.y_id]) | |||
| if self.has_bias: | |||
| bias = Tensor(self.bias_np) | |||
| net = Conv2d(in_channels=self.in_c, out_channels=self.out_c, | |||
| kernel_size=(self.kernel_h, self.kernel_w), | |||
| stride=self.stride, pad_mode=self.pad_mode, | |||
| padding=self.padding, dilation=self.dilation, | |||
| group=self.group, has_bias=True, weight_init=weight, | |||
| bias_init=bias, strategy=(self.strategy0[0], self.strategy0[1], self.strategy0[1])) | |||
| else: | |||
| net = Conv2d(in_channels=self.in_c, out_channels=self.out_c, | |||
| kernel_size=(self.kernel_h, self.kernel_w), | |||
| stride=self.stride, pad_mode=self.pad_mode, | |||
| padding=self.padding, dilation=self.dilation, | |||
| group=self.group, has_bias=False, weight_init=weight, | |||
| strategy=(self.strategy0[0], self.strategy0[1], self.strategy0[1])) | |||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||
| net.set_auto_parallel() | |||
| out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1]) | |||
| return out.asnumpy() | |||
| def grad_conv2d_mindspore_impl(self): | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2) | |||
| weight = Tensor(self.weight_np) | |||
| output_grad = Tensor(self.output_grad_np) | |||
| if self.has_bias: | |||
| bias = Tensor(self.bias_np) | |||
| net = Conv2d(in_channels=self.in_c, out_channels=self.out_c, | |||
| kernel_size=(self.kernel_h, self.kernel_w), | |||
| stride=self.stride, pad_mode=self.pad_mode, | |||
| padding=self.padding, dilation=self.dilation, | |||
| group=self.group, has_bias=True, weight_init=weight, | |||
| bias_init=bias,) | |||
| else: | |||
| net = Conv2d(in_channels=self.in_c, out_channels=self.out_c, | |||
| kernel_size=(self.kernel_h, self.kernel_w), | |||
| stride=self.stride, pad_mode=self.pad_mode, | |||
| padding=self.padding, dilation=self.dilation, | |||
| group=self.group, has_bias=False, weight_init=weight) | |||
| grad_net = Grad(net) | |||
| grad_net.set_train() | |||
| out_grad = grad_net(x, y, output_grad) | |||
| return out_grad | |||
| def grad_conv2d_mindspore_parallel_impl(self): | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2) | |||
| weight = Tensor(self.weight_np) | |||
| inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1]) | |||
| inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1]) | |||
| x1 = Tensor(inputs_x[self.x_id]) | |||
| y1 = Tensor(inputs_y[self.y_id]) | |||
| output_grad = Tensor(self.output_grad_np) | |||
| output_grads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy) | |||
| output_grad1 = Tensor(output_grads[self.out_id]) | |||
| if self.has_bias: | |||
| bias = Tensor(self.bias_np) | |||
| net = Conv2d(in_channels=self.in_c, out_channels=self.out_c, | |||
| kernel_size=(self.kernel_h, self.kernel_w), | |||
| stride=self.stride, pad_mode=self.pad_mode, | |||
| padding=self.padding, dilation=self.dilation, | |||
| group=self.group, has_bias=True, weight_init=weight, | |||
| bias_init=bias, strategy=(self.strategy0[0], self.strategy0[1], self.strategy0[1])) | |||
| else: | |||
| net = Conv2d(in_channels=self.in_c, out_channels=self.out_c, | |||
| kernel_size=(self.kernel_h, self.kernel_w), | |||
| stride=self.stride, pad_mode=self.pad_mode, | |||
| padding=self.padding, dilation=self.dilation, | |||
| group=self.group, has_bias=False, weight_init=weight, | |||
| strategy=(self.strategy0[0], self.strategy0[1], self.strategy0[1])) | |||
| grad_net = Grad(net) | |||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||
| grad_net.set_train() | |||
| grad_net.set_auto_parallel() | |||
| out_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad1], | |||
| parallel_inputs_run=[x1, y1, output_grad1]) | |||
| return out_grad | |||
| def forward_conv2d_cmp(self): | |||
| out_mindspore = self.forward_conv2d_mindspore_impl() | |||
| out_mindspore_parallel = self.forward_conv2d_mindspore_parallel_impl() | |||
| out_blocks = self.get_parallel_blocks(out_mindspore, self.out_strategy) | |||
| assert allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.001, 0.001) | |||
| def grad_conv2d_cmp(self): | |||
| input_grad_mindspore = self.grad_conv2d_mindspore_impl() | |||
| input_grad_mindspore_parallel = self.grad_conv2d_mindspore_parallel_impl() | |||
| input_grad_mindspore0 = input_grad_mindspore[0].asnumpy() | |||
| input_grad_mindspore1 = input_grad_mindspore[1].asnumpy() | |||
| input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy() | |||
| input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy() | |||
| input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1]) | |||
| input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[1]) | |||
| assert allclose(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.001, 0.001) | |||
        assert allclose(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.001, 0.001)
def test_reid_conv2d_input_128_64_112_112_kernel_64_64_1_1_stride_2_padding_0_bias_false():
| fact = Conv2dFactory(input_shape=(128, 64, 112, 112), | |||
| filter_shape=(64, 64, 1, 1), | |||
| stride=2, pad_mode='valid', padding=0, | |||
| dilation=1, group=1, has_bias=False) | |||
| fact.forward_conv2d_cmp() | |||
def test_reid_conv2d_grad_input_128_64_112_112_kernel_64_64_1_1_stride_2_padding_0_bias_false():
| fact = Conv2dFactory(input_shape=(128, 64, 112, 112), | |||
| filter_shape=(64, 64, 1, 1), | |||
| stride=2, pad_mode='valid', padding=0, | |||
| dilation=1, group=1, has_bias=False) | |||
| fact.grad_conv2d_cmp() | |||
| @@ -1,27 +0,0 @@ | |||
| #!/bin/bash | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| for((i=0;i<4;i++)); | |||
| do | |||
| rm -rf device$i | |||
| mkdir device$i | |||
| cd device$i | |||
| mkdir output | |||
| source ../../dist_env_4p.sh $i | |||
| env >log$i.log | |||
| pytest -s ../test_conv2d_parallel_4p.py>../../log/test_conv2d_parallel_4p_log$i.log 2>&1 & | |||
| cd .. | |||
| done | |||
| @@ -1,36 +0,0 @@ | |||
| #!/bin/bash | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
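# Distributed environment for one of four ranks: takes the rank as $1 and exports the
# RANK/DEVICE ids plus the HCCL and runtime settings needed by the parallel tests.
# Rank 0 additionally enables graph dumping for debugging.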
| export SLOG_PRINT_TO_STDOUT=1 | |||
| source /root/miniconda3/bin/activate ci3.6 | |||
| export RANK_SIZE=4 | |||
| export RANK_TABLE_FILE=../../rank_table_4p.json | |||
| export RANK_ID=$1 | |||
| export DEVICE_ID=$1 | |||
| export HCCL_FLAG=1 | |||
| export DEPLOY_MODE=0 | |||
| export AICPU_FLAG=1 | |||
| export DUMP_OP=1 | |||
| export PYTHONPATH=../../../../../../../../mindspore:/usr/local/HiAI/runtime/python3.6/site-packages/topi.egg/:/usr/local/HiAI/runtime/python3.6/site-packages/te.egg/:/usr/local/HiAI/runtime/ops/op_impl/built-in/ai_core/tbe/ | |||
| export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/HiAI/runtime/lib64/libhccl.so:/usr/local/HiAI/runtime/lib64/plugin/opskernel/libfe.so:/usr/local/HiAI/runtime/lib64/plugin/opskernel/libaicpu_plugin.so:/usr/local/HiAI/runtime/lib64/plugin/opskernel/libge_local_engine.so:/usr/local/HiAI/runtime/lib64/plugin/opskernel/librts_engine.so | |||
| export LD_LIBRARY_PATH=/usr/local/HiAI/runtime/lib64 | |||
| export FE_FLAG=1 | |||
| export PATH=/usr/local/HiAI/runtime/ccec_compiler/bin:$PATH | |||
| if [ $1 -eq 0 ]; | |||
| then | |||
| export DUMP_GE_GRAPH=true | |||
| export ME_DRAW_GRAPH=1 | |||
| fi | |||
| @@ -1,120 +0,0 @@ | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| import os | |||
| import numpy as np | |||
| import mindspore as ms | |||
| import mindspore.communication.management as distributedTool | |||
| from mindspore import context | |||
| from mindspore.common.tensor import Tensor | |||
| from mindspore.nn import Cell | |||
| from mindspore.nn import Dropout | |||
| device_num = 4 | |||
| device_id = int(os.environ["RANK_ID"]) | |||
| path = "./output/" | |||
| def setup_module(): | |||
| print("~~~~~~~~~~~set up~~~~~~~~~~~~~") | |||
| context.set_context(mode=context.GRAPH_MODE) | |||
| context.set_auto_parallel_context(device_num=device_num, global_rank=device_id) | |||
| distributedTool.init() | |||
| distributedTool.create_group("0-3", [0, 1, 2, 3]) | |||
| print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~") | |||
| def teardown_module(): | |||
| print("~~~~~~~~~~~~tear down~~~~~~~~~~") | |||
| class Net(Cell): | |||
| def __init__(self, keep_prob, seed0, seed1, strategy=None): | |||
| super(Net, self).__init__() | |||
| self.drop = Dropout(keep_prob, seed0, seed1, dtype=ms.float32, strategy=strategy) | |||
| def construct(self, input_): | |||
| x = self.drop(input_) | |||
| return x | |||
| # pylint: disable=comparison-with-itself | |||
| class DropoutFactory: | |||
| def __init__(self, input_shape, keep_prob, seed0, seed1, strategy0=None): | |||
| size = 1 | |||
| prefix = "" | |||
| for s in input_shape: | |||
| prefix = prefix + str(s) | |||
| size = size * s | |||
| self.prefix = prefix | |||
| number_range = min(10, size) | |||
| self.input_np = np.reshape(np.arange(0, size) % number_range, input_shape).astype(np.float32) | |||
| self.keep_prob = keep_prob | |||
| self.seed0 = seed0 | |||
| self.seed1 = seed1 | |||
| self.strategy0 = strategy0 | |||
| need_dev_num = 1 | |||
| for s in strategy0[1]: | |||
| need_dev_num = need_dev_num * s | |||
| self.x_id = device_id % need_dev_num | |||
| self.out_id = device_id % need_dev_num | |||
| def get_parallel_blocks(self, input_, strategy): | |||
| blocks = [input_] | |||
| i = 0 | |||
| for stra in strategy: | |||
| temp = [] | |||
| while len(blocks) > 0: | |||
| block = blocks.pop(0) | |||
| temp.extend(np.split(block, stra, axis=i)) | |||
| blocks.extend(temp) | |||
| i += 1 | |||
| return blocks | |||
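    # Element-wise check for inverted dropout: every kept element must equal the
    # input scaled by 1/keep_prob, and zeros are accepted as dropped elements.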
| def d4_tensor_compare(self, input_, out_me): | |||
| [a, b, c, d] = input_.shape | |||
| for i in range(a): | |||
| for j in range(b): | |||
| for k in range(c): | |||
| for e in range(d): | |||
                        if out_me[i, j, k, e] != 0:
                            assert np.allclose(out_me[i, j, k, e], input_[i, j, k, e] * (1 / self.keep_prob),
                                               0.0001, 0.0001)
| def forward_mindspore_parallel_impl(self): | |||
| x = Tensor(self.input_np) | |||
| inputs_x = self.get_parallel_blocks(self.input_np, self.strategy0[1]) | |||
| x1 = Tensor(inputs_x[self.x_id]) | |||
        net = Net(self.keep_prob, self.seed0, self.seed1, strategy=self.strategy0)
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||
| net.set_auto_parallel() | |||
| out = net(x, parallel_inputs_compile=[x], parallel_inputs_run=[x1]) | |||
| return out.asnumpy() | |||
| def forward_cmp(self): | |||
| out_mindspore_parallel = self.forward_mindspore_parallel_impl() | |||
| input_blocks = self.get_parallel_blocks(self.input_np, self.strategy0[1]) | |||
| self.d4_tensor_compare(input_blocks[self.out_id], out_mindspore_parallel) | |||
| def test_reid_dropout_forward_seed_F32_64_512_8_8(): | |||
| fact = DropoutFactory(input_shape=(64, 512, 8, 8), keep_prob=0.4, seed0=0, seed1=0, strategy0=(0, (4, 1, 1, 1))) | |||
| fact.forward_cmp() | |||
| def test_reid_dropout_forward_seed_F32_64_512_8_8_repeat(): | |||
| fact = DropoutFactory(input_shape=(64, 512, 8, 8), keep_prob=0.4, seed0=0, seed1=0, strategy0=(0, (2, 1, 1, 1))) | |||
| fact.forward_cmp() | |||
| @@ -1,27 +0,0 @@ | |||
| #!/bin/bash | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| for((i=0;i<4;i++)); | |||
| do | |||
| rm -rf device$i | |||
| mkdir device$i | |||
| cd device$i | |||
| mkdir output | |||
| source ../../dist_env_4p.sh $i | |||
| env >log$i.log | |||
| pytest -s ../test_dropout_parallel_4p.py>../../log/test_dropout_parallel_4p_log$i.log 2>&1 & | |||
| cd .. | |||
| done | |||
| @@ -1,154 +0,0 @@ | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| import os | |||
| import numpy as np | |||
| import mindspore as ms | |||
| import mindspore.communication.management as distributedTool | |||
| from mindspore import context | |||
| from mindspore.common.tensor import Tensor | |||
| from mindspore.nn import Cell | |||
| from mindspore.ops import operations as P | |||
| from mindspore.ops.composite import grad_all_with_sens | |||
| device_num = 4 | |||
| device_id = int(os.environ["RANK_ID"]) | |||
| path = "./output/" | |||
| def setup_module(): | |||
| print("~~~~~~~~~~~set up~~~~~~~~~~~~~") | |||
| context.set_context(mode=context.GRAPH_MODE) | |||
| context.set_auto_parallel_context(device_num=device_num, global_rank=device_id) | |||
| distributedTool.init() | |||
| print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~") | |||
| def teardown_module(): | |||
| print("~~~~~~~~~~~~tear down~~~~~~~~~~") | |||
| class MatmulSingle(Cell): | |||
| def __init__(self, transpose_a=False, transpose_b=False): | |||
| super(MatmulSingle, self).__init__() | |||
| self.matmul = P.MatMul(transpose_a, transpose_b) | |||
| self.pow = P.Pow() | |||
| self.reduce_sum = P.ReduceSum() | |||
| def construct(self, x, y): | |||
| out = self.matmul(x, y) | |||
| out = self.pow(out, 2.0) | |||
| out = self.reduce_sum(out, None) | |||
| return out | |||
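# Parallel counterpart of MatmulSingle: each rank holds a row shard of x, AllGather
# reassembles the full x before the matmul, and the resulting loss is AllReduce-summed.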
| class MatmulAllgather(Cell): | |||
| def __init__(self, group, transpose_a=False, transpose_b=False): | |||
| super(MatmulAllgather, self).__init__() | |||
| self.allgather = P.AllGather(group=group) | |||
| self.matmul = P.MatMul(transpose_a, transpose_b) | |||
| self.pow = P.Pow() | |||
| self.reduce_sum = P.ReduceSum() | |||
| self.allreduce = P.AllReduce(group=group) | |||
| def construct(self, x, y): | |||
| x = self.allgather(x) | |||
| out = self.matmul(x, y) | |||
| out = self.pow(out, 2.0) | |||
| out = self.reduce_sum(out, None) | |||
| out = self.allreduce(out) | |||
| return out | |||
| class Grad(Cell): | |||
| def __init__(self, network): | |||
| super(Grad, self).__init__() | |||
| self.network = network | |||
| def construct(self, x, y, sens): | |||
| return grad_all_with_sens(self.network)(x, y, sens) | |||
| class MatmulAllgatherFactory: | |||
| def __init__(self, inputx_shape, inputy_shape, x_stra, y_stra): | |||
| self.inputx = self.gen_value(inputx_shape, 10) | |||
| self.inputy = self.gen_value(inputy_shape, 20) | |||
| self.x_stra = x_stra | |||
| self.y_stra = y_stra | |||
| stra_size = 1 | |||
| for s in x_stra: | |||
| stra_size = stra_size * s | |||
| self.stra_size = stra_size | |||
| def gen_value(self, input_shape, delta): | |||
| size = 1 | |||
| for s in input_shape: | |||
| size = size * s | |||
| number_range = min(100, size) | |||
| input_np = np.reshape(np.arange(0, size) % number_range - delta, input_shape).astype(np.float32) | |||
| return input_np | |||
| def get_parallel_blocks(self, input_, strategy): | |||
| blocks = [input_] | |||
| i = 0 | |||
| for stra in strategy: | |||
| temp = [] | |||
| while len(blocks) > 0: | |||
| block = blocks.pop(0) | |||
| temp.extend(np.split(block, stra, axis=i)) | |||
| blocks.extend(temp) | |||
| i += 1 | |||
| return blocks | |||
| def grad_mindspore_impl_single(self): | |||
| x = Tensor(self.inputx) | |||
| y = Tensor(self.inputy) | |||
| sens = Tensor(1.0, dtype=ms.float32) | |||
| net = MatmulSingle() | |||
| grad_net = Grad(net) | |||
| grad_net.set_train() | |||
| input_grad = grad_net(x, y, sens) | |||
| return input_grad | |||
| def grad_mindspore_impl_reduce(self): | |||
| inputxs = self.get_parallel_blocks(self.inputx, self.x_stra) | |||
| inputys = self.get_parallel_blocks(self.inputy, self.y_stra) | |||
| x = Tensor(inputxs[device_id % self.stra_size]) | |||
| y = Tensor(inputys[device_id % self.stra_size]) | |||
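        # Scale the sens to compensate for the gradient accumulation introduced by the
        # collectives, so the distributed gradients match the single-device reference.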
| repeat_num = device_num / self.stra_size | |||
| v = self.stra_size * repeat_num * repeat_num * repeat_num | |||
| sens = Tensor(1.0 / v, dtype=ms.float32) | |||
| net = MatmulAllgather("hccl_world_group") | |||
| grad_net = Grad(net) | |||
| grad_net.set_train() | |||
| input_grad = grad_net(x, y, sens) | |||
| return input_grad | |||
| def grad_cmp(self): | |||
| single_results = self.grad_mindspore_impl_single() | |||
| reduce_results = self.grad_mindspore_impl_reduce() | |||
| single_result0 = self.get_parallel_blocks(single_results[0].asnumpy(), self.x_stra)[device_id % self.stra_size] | |||
| reduce_result0 = reduce_results[0].asnumpy() | |||
| single_result1 = self.get_parallel_blocks(single_results[1].asnumpy(), self.y_stra)[device_id % self.stra_size] | |||
| reduce_result1 = reduce_results[1].asnumpy() | |||
| assert np.allclose(single_result0, reduce_result0, 0.0001, 0.0001) | |||
| assert np.allclose(single_result1, reduce_result1, 0.0001, 0.0001) | |||
| def test_reduce_grad(): | |||
| inputx_shape = (64, 32) | |||
| inputy_shape = (32, 64) | |||
| fact = MatmulAllgatherFactory(inputx_shape, inputy_shape, (4, 1), (1, 4)) | |||
| fact.grad_cmp() | |||
| @@ -1,175 +0,0 @@ | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| import os | |||
| import numpy as np | |||
| import mindspore as ms | |||
| import mindspore.communication.management as distributedTool | |||
| from mindspore import context | |||
| from mindspore.common.tensor import Tensor | |||
| from mindspore.nn import Cell | |||
| from mindspore.ops import operations as P | |||
| from mindspore.ops.composite import grad_all_with_sens | |||
| device_num = 4 | |||
| device_id = int(os.environ["RANK_ID"]) | |||
| path = "./output/" | |||
| def setup_module(): | |||
| print("~~~~~~~~~~~set up~~~~~~~~~~~~~") | |||
| context.set_context(mode=context.GRAPH_MODE) | |||
| context.set_auto_parallel_context(device_num=device_num, global_rank=device_id) | |||
| distributedTool.init() | |||
| print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~") | |||
| def teardown_module(): | |||
| print("~~~~~~~~~~~~tear down~~~~~~~~~~") | |||
| class MatmulSingle(Cell): | |||
| def __init__(self, transpose_a=False, transpose_b=False): | |||
| super(MatmulSingle, self).__init__() | |||
| self.matmul1 = P.MatMul(transpose_a, transpose_b) | |||
| self.matmul2 = P.MatMul(transpose_a, transpose_b) | |||
| self.pow = P.Pow() | |||
| self.reduce_sum = P.ReduceSum() | |||
| def construct(self, x, y, z): | |||
| out = self.matmul1(x, y) | |||
| out = self.matmul2(out, z) | |||
| out = self.pow(out, 2.0) | |||
| out = self.reduce_sum(out, None) | |||
| return out | |||
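# Parallel counterpart of MatmulSingle: x and y are sharded along the contracted
# dimension, so each rank computes a partial product that AllReduce sums before the
# second matmul; the final loss is AllReduce-summed as well.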
| class MatmulReduce(Cell): | |||
| def __init__(self, group, transpose_a=False, transpose_b=False): | |||
| super(MatmulReduce, self).__init__() | |||
| self.matmul1 = P.MatMul(transpose_a, transpose_b) | |||
| self.allreduce1 = P.AllReduce(group=group) | |||
| self.matmul2 = P.MatMul(transpose_a, transpose_b) | |||
| self.pow = P.Pow() | |||
| self.reduce_sum = P.ReduceSum() | |||
| self.allreduce2 = P.AllReduce(group=group) | |||
| def construct(self, x, y, z): | |||
| out = self.matmul1(x, y) | |||
| out = self.allreduce1(out) | |||
| out = self.matmul2(out, z) | |||
| out = self.pow(out, 2.0) | |||
| out = self.reduce_sum(out, None) | |||
| out = self.allreduce2(out) | |||
| return out | |||
| class Grad(Cell): | |||
| def __init__(self, network): | |||
| super(Grad, self).__init__() | |||
| self.network = network | |||
| def construct(self, x, y, z, sens): | |||
| return grad_all_with_sens(self.network)(x, y, z, sens) | |||
| class MatmulReduceFactory: | |||
| def __init__(self, inputx_shape, inputy_shape, inputz_shape, x_stra, y_stra, z_stra): | |||
| self.inputx = self.gen_value(inputx_shape, 10) | |||
| self.inputy = self.gen_value(inputy_shape, 20) | |||
| self.inputz = self.gen_value(inputz_shape, 30) | |||
| self.x_stra = x_stra | |||
| self.y_stra = y_stra | |||
| self.z_stra = z_stra | |||
| stra_size = 1 | |||
| for s in x_stra: | |||
| stra_size = stra_size * s | |||
| self.stra_size = stra_size | |||
| def gen_value(self, input_shape, delta): | |||
| size = 1 | |||
| for s in input_shape: | |||
| size = size * s | |||
| number_range = min(100, size) | |||
| input_np = np.reshape(np.arange(0, size) % number_range - delta, input_shape).astype(np.float32) | |||
| return input_np | |||
| def get_parallel_blocks(self, input_, strategy): | |||
| blocks = [input_] | |||
| i = 0 | |||
| for stra in strategy: | |||
| temp = [] | |||
| while len(blocks) > 0: | |||
| block = blocks.pop(0) | |||
| temp.extend(np.split(block, stra, axis=i)) | |||
| blocks.extend(temp) | |||
| i += 1 | |||
| return blocks | |||
| def grad_mindspore_impl_single(self): | |||
| x = Tensor(self.inputx) | |||
| y = Tensor(self.inputy) | |||
| z = Tensor(self.inputz) | |||
| sens = Tensor(1.0, dtype=ms.float32) | |||
| net = MatmulSingle() | |||
| grad_net = Grad(net) | |||
| grad_net.set_train() | |||
| input_grad = grad_net(x, y, z, sens) | |||
| return input_grad | |||
| def grad_mindspore_impl_reduce(self): | |||
| inputxs = self.get_parallel_blocks(self.inputx, self.x_stra) | |||
| inputys = self.get_parallel_blocks(self.inputy, self.y_stra) | |||
| inputzs = self.get_parallel_blocks(self.inputz, self.z_stra) | |||
| x = Tensor(inputxs[device_id % self.stra_size]) | |||
| y = Tensor(inputys[device_id % self.stra_size]) | |||
| z = Tensor(inputzs[device_id % self.stra_size]) | |||
| repeat_num = device_num / self.stra_size | |||
| v = self.stra_size * repeat_num * repeat_num * repeat_num | |||
| sens = Tensor(1.0 / v, dtype=ms.float32) | |||
| net = MatmulReduce("hccl_world_group") | |||
| grad_net = Grad(net) | |||
| grad_net.set_train() | |||
| input_grad = grad_net(x, y, z, sens) | |||
| return input_grad | |||
| def grad_cmp(self): | |||
| single_results = self.grad_mindspore_impl_single() | |||
| reduce_results = self.grad_mindspore_impl_reduce() | |||
| single_result0 = self.get_parallel_blocks(single_results[0].asnumpy(), self.x_stra)[device_id % self.stra_size] | |||
| reduce_result0 = reduce_results[0].asnumpy() | |||
| single_result1 = self.get_parallel_blocks(single_results[1].asnumpy(), self.y_stra)[device_id % self.stra_size] | |||
| reduce_result1 = reduce_results[1].asnumpy() | |||
| single_result2 = self.get_parallel_blocks(single_results[2].asnumpy(), self.z_stra)[device_id % self.stra_size] | |||
| reduce_result2 = reduce_results[2].asnumpy() | |||
| assert np.allclose(single_result0, reduce_result0, 0.0001, 0.0001) | |||
| assert np.allclose(single_result1, reduce_result1, 0.0001, 0.0001) | |||
| assert np.allclose(single_result2, reduce_result2, 0.0001, 0.0001) | |||
| def test_reduce_grad(): | |||
| inputx_shape = (32, 64) | |||
| inputy_shape = (64, 64) | |||
| inputz_shape = (64, 32) | |||
| fact = MatmulReduceFactory(inputx_shape, inputy_shape, inputz_shape, (1, 4), (4, 1), (1, 4)) | |||
| fact.grad_cmp() | |||
| def test_reduce_grad_repeat(): | |||
| inputx_shape = (32, 64) | |||
| inputy_shape = (64, 64) | |||
| inputz_shape = (64, 32) | |||
| fact = MatmulReduceFactory(inputx_shape, inputy_shape, inputz_shape, (1, 2), (2, 1), (1, 2)) | |||
| fact.grad_cmp() | |||
| @@ -1,27 +0,0 @@ | |||
| #!/bin/bash | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| for((i=0;i<4;i++)); | |||
| do | |||
| rm -rf device$i | |||
| mkdir device$i | |||
| cd device$i | |||
| mkdir output | |||
| source ../../dist_env_4p.sh $i | |||
| env >log$i.log | |||
| pytest -s ../test_allgather_4p.py>../../log/test_allgather_4p_log$i.log 2>&1 & | |||
| cd .. | |||
| done | |||
| @@ -1,27 +0,0 @@ | |||
| #!/bin/bash | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| for((i=0;i<4;i++)); | |||
| do | |||
| rm -rf device$i | |||
| mkdir device$i | |||
| cd device$i | |||
| mkdir output | |||
| source ../../dist_env_4p.sh $i | |||
| env >log$i.log | |||
| pytest -s ../test_allreduce_4p.py>../../log/test_allreduce_4p_log$i.log 2>&1 & | |||
| cd .. | |||
| done | |||
| @@ -1,206 +0,0 @@ | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| import os | |||
| import numpy as np | |||
| import mindspore.communication.management as distributedTool | |||
| from mindspore import context | |||
| from mindspore.common.tensor import Tensor | |||
| from mindspore.nn import Cell | |||
| from mindspore.ops import operations as P | |||
| from mindspore.ops.composite import grad_all_with_sens | |||
| device_num = 4 | |||
| device_id = int(os.environ["RANK_ID"]) | |||
| path = "./output/" | |||
| def setup_module(): | |||
| print("~~~~~~~~~~~set up~~~~~~~~~~~~~") | |||
| context.set_context(mode=context.GRAPH_MODE) | |||
| context.set_auto_parallel_context(device_num=device_num, global_rank=device_id) | |||
| distributedTool.init() | |||
| distributedTool.create_group("0-3", [0, 1, 2, 3]) | |||
| print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~") | |||
| def teardown_module(): | |||
| print("~~~~~~~~~~~~tear down~~~~~~~~~~") | |||
| class L2normalize(Cell): | |||
| def __init__(self, axis=0, epsilon=1e-4, strategy0=None, strategy1=None): | |||
| super(L2normalize, self).__init__() | |||
| self.add = P.TensorAdd(strategy=strategy0) | |||
| self.l2norm = P.L2Normalize(axis, epsilon, strategy1) | |||
| def construct(self, x, y): | |||
| out = self.add(x, y) | |||
| out = self.l2norm(out) | |||
| return out | |||
| class Grad(Cell): | |||
| def __init__(self, network): | |||
| super(Grad, self).__init__() | |||
| self.network = network | |||
| def construct(self, x, y, output_grad): | |||
| return grad_all_with_sens(self.network)(x, y, output_grad) | |||
| class L2normalizeFactory: | |||
| def __init__(self, input_shape, axis, strategy0, strategy1): | |||
| prefix = "" | |||
| size = 1 | |||
| for s in input_shape: | |||
| prefix = prefix + str(s) | |||
| size = size * s | |||
| self.prefix = prefix | |||
| number_range = min(1000, size) | |||
| self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype( | |||
| np.float32) | |||
| self.input_np2 = np.reshape(np.arange(0, size) % number_range - number_range / 4, input_shape).astype( | |||
| np.float32) | |||
| target_shape = input_shape | |||
| self.target_shape = target_shape | |||
| target_size = 1 | |||
| for s in target_shape: | |||
| target_size = target_size * s | |||
| number_range = min(1000, target_size) | |||
| self.output_grad_np = np.reshape(np.arange(0, target_size) % number_range - number_range / 2, | |||
| target_shape).astype(np.float32) | |||
| self.axis = axis | |||
| self.epsilon = 1e-4 | |||
| self.strategy0 = strategy0 | |||
| self.strategy1 = strategy1 | |||
| out_strategy = strategy1[1] | |||
| self.out_strategy = out_strategy | |||
| need_dev_num0 = 1 | |||
| need_dev_num1 = 1 | |||
| for s in strategy0[1]: | |||
| need_dev_num0 = need_dev_num0 * s | |||
| for s in out_strategy: | |||
| need_dev_num1 = need_dev_num1 * s | |||
| self.x_id = device_id % need_dev_num0 | |||
| self.y_id = device_id % need_dev_num0 | |||
| self.out_id = device_id % need_dev_num1 | |||
| def get_parallel_blocks(self, input_, strategy): | |||
| blocks = [input_] | |||
| i = 0 | |||
| for stra in strategy: | |||
| temp = [] | |||
| while len(blocks) > 0: | |||
| block = blocks.pop(0) | |||
| temp.extend(np.split(block, stra, axis=i)) | |||
| blocks.extend(temp) | |||
| i += 1 | |||
| return blocks | |||
| def forward_mindspore_impl(self): | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2) | |||
| net = L2normalize(self.axis, self.epsilon) | |||
| out = net(x, y) | |||
| return out.asnumpy() | |||
| def forward_mindspore_parallel_impl(self): | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2) | |||
| inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1]) | |||
| inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1]) | |||
| x1 = Tensor(inputs_x[self.x_id]) | |||
| y1 = Tensor(inputs_y[self.y_id]) | |||
| net = L2normalize(self.axis, self.epsilon, strategy0=self.strategy0, strategy1=self.strategy1) | |||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||
| net.set_auto_parallel() | |||
| out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1]) | |||
| return out.asnumpy() | |||
| def grad_mindspore_impl(self): | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2) | |||
| output_grad = Tensor(self.output_grad_np) | |||
| net = L2normalize(self.axis, self.epsilon) | |||
| grad_net = Grad(net) | |||
| grad_net.set_train() | |||
| input_grad = grad_net(x, y, output_grad) | |||
| return input_grad | |||
| def grad_mindspore_parallel_impl(self): | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2) | |||
| output_grad = Tensor(self.output_grad_np) | |||
| inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1]) | |||
| inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1]) | |||
| outgrads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy) | |||
| x1 = Tensor(inputs_x[self.x_id]) | |||
| y1 = Tensor(inputs_y[self.y_id]) | |||
| output_grad1 = Tensor(outgrads[self.out_id]) | |||
| net = L2normalize(self.axis, self.epsilon, strategy0=self.strategy0, strategy1=self.strategy1) | |||
| grad_net = Grad(net) | |||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||
| grad_net.set_auto_parallel() | |||
| grad_net.set_train() | |||
| input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad1], | |||
| parallel_inputs_run=[x1, y1, output_grad1]) | |||
| return input_grad | |||
| def forward_cmp(self): | |||
| out_mindspore = self.forward_mindspore_impl() | |||
| out_mindspore_parallel = self.forward_mindspore_parallel_impl() | |||
| out_blocks = self.get_parallel_blocks(out_mindspore, self.out_strategy) | |||
| assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.001, 0.001) | |||
| def grad_cmp(self): | |||
| input_grad_mindspore = self.grad_mindspore_impl() | |||
| input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl() | |||
| input_grad_mindspore0 = input_grad_mindspore[0].asnumpy() | |||
| input_grad_mindspore1 = input_grad_mindspore[1].asnumpy() | |||
| input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy() | |||
| input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy() | |||
| input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1]) | |||
| input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2]) | |||
| assert np.allclose(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001) | |||
| assert np.allclose(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001) | |||
| def test_reid_l2normalize_input_128_512(): | |||
| input_shape = (128, 512) | |||
| axis = 0 | |||
| fact = L2normalizeFactory(input_shape, axis, strategy0=(0, (4, 1), (4, 1)), strategy1=(0, (1, 4))) | |||
| fact.forward_cmp() | |||
| def test_reid_l2normalize_grad_input_128_512(): | |||
| input_shape = (128, 512) | |||
| axis = 0 | |||
    fact = L2normalizeFactory(input_shape, axis, strategy0=(0, (4, 1), (4, 1)), strategy1=(0, (1, 4)))
| fact.grad_cmp() | |||
| def test_reid_l2normalize_input_128_512_repeat(): | |||
| input_shape = (128, 512) | |||
| axis = 0 | |||
| fact = L2normalizeFactory(input_shape, axis, strategy0=(0, (1, 2), (1, 2)), strategy1=(0, (1, 2))) | |||
| fact.forward_cmp() | |||
| def test_reid_l2normalize_grad_input_128_512_repeat(): | |||
| input_shape = (128, 512) | |||
| axis = 0 | |||
| fact = L2normalizeFactory(input_shape, axis, strategy0=(0, (1, 2), (1, 2)), strategy1=(0, (1, 2))) | |||
| fact.grad_cmp() | |||
| @@ -1,27 +0,0 @@ | |||
| #!/bin/bash | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| for((i=0;i<4;i++)); | |||
| do | |||
| rm -rf device$i | |||
| mkdir device$i | |||
| cd device$i | |||
| mkdir output | |||
| source ../../dist_env_4p.sh $i | |||
| env >log$i.log | |||
| pytest -s ../test_l2normalize_parallel_4p.py>../../log/test_l2normalize_parallel_4p_log$i.log 2>&1 & | |||
| cd .. | |||
| done | |||
| @@ -1 +0,0 @@ | |||
Log files for auto parallel end-to-end test cases.
| @@ -1,195 +0,0 @@ | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| import os | |||
| import numpy as np | |||
| import mindspore.communication.management as distributedTool | |||
| from mindspore import context | |||
| from mindspore.common.tensor import Tensor | |||
| from mindspore.nn import Cell | |||
| from mindspore.ops import operations as P | |||
| from mindspore.ops.composite import grad_all | |||
| device_num = 4 | |||
| device_id = int(os.environ["RANK_ID"]) | |||
| path = "./output/" | |||
| def setup_module(): | |||
| print("~~~~~~~~~~~set up~~~~~~~~~~~~~") | |||
| context.set_context(mode=context.GRAPH_MODE) | |||
| context.set_auto_parallel_context(device_num=device_num, global_rank=device_id) | |||
| distributedTool.init() | |||
| distributedTool.create_group("0-3", [0, 1, 2, 3]) | |||
| print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~") | |||
| def teardown_module(): | |||
| print("~~~~~~~~~~~~tear down~~~~~~~~~~") | |||
| class AddRelu(Cell): | |||
| def __init__(self, strategy0=None, strategy1=None): | |||
| super(AddRelu, self).__init__() | |||
| self.add = P.TensorAdd(strategy=strategy0) | |||
| self.relu = P.ReLU(strategy=strategy1) | |||
| def construct(self, x, y): | |||
| out = self.add(x, y) | |||
| out = self.relu(out) | |||
| return out | |||
| class NetWithLoss(Cell): | |||
| def __init__(self, network, strategy2=None): | |||
| super(NetWithLoss, self).__init__() | |||
| self.loss = P.SoftmaxCrossEntropyWithLogits(strategy=strategy2) | |||
| self.network = network | |||
| def construct(self, x, y, b): | |||
| predict = self.network(x, y) | |||
| return self.loss(predict, b)[0] | |||
| class Grad(Cell): | |||
| def __init__(self, network): | |||
| super(Grad, self).__init__() | |||
| self.network = network | |||
| def construct(self, x, y, b): | |||
| return grad_all(self.network)(x, y, b) | |||
| class AddReluFactory: | |||
| def __init__(self, input_shape, strategy0, strategy1, strategy2): | |||
| prefix = "" | |||
| size = 1 | |||
| for s in input_shape: | |||
| prefix = prefix + str(s) | |||
| size = size * s | |||
| self.prefix = prefix | |||
| number_range = min(1000, size) | |||
| self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype( | |||
| np.float32) | |||
| self.input_np2 = np.reshape(np.arange(0, size) % number_range - number_range / 4, input_shape).astype( | |||
| np.float32) | |||
| target_shape = input_shape | |||
| self.target_shape = target_shape | |||
| target_size = 1 | |||
| for s in target_shape: | |||
| target_size = target_size * s | |||
| number_range = min(10, target_size) | |||
| self.output_grad_np = np.reshape((np.arange(0, target_size) % number_range) * 0.1, target_shape).astype( | |||
| np.float32) | |||
| self.strategy0 = strategy0 | |||
| self.strategy1 = strategy1 | |||
| self.strategy2 = strategy2 | |||
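        # output_grad_np is consumed as the label b of the loss, so it is split with
        # the ReLU output layout (strategy1[1]); out_id selects this device's block.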
| out_strategy = strategy1[1] | |||
| self.out_strategy = out_strategy | |||
| need_dev_num0 = 1 | |||
| need_dev_num1 = 1 | |||
| for s in strategy0[1]: | |||
| need_dev_num0 = need_dev_num0 * s | |||
| for s in out_strategy: | |||
| need_dev_num1 = need_dev_num1 * s | |||
| self.x_id = device_id % need_dev_num0 | |||
| self.y_id = device_id % need_dev_num0 | |||
| self.out_id = device_id % need_dev_num1 | |||
| def get_parallel_blocks(self, input_, strategy): | |||
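        """Split input_ into per-device blocks according to a layout strategy.

        Each entry of strategy is the number of slices taken along the matching axis;
        blocks are returned in row-major order of their slice coordinates, e.g. a
        (4, 4) array with strategy (2, 2) yields four (2, 2) blocks.
        """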
| blocks = [input_] | |||
| i = 0 | |||
| for stra in strategy: | |||
| temp = [] | |||
| while len(blocks) > 0: | |||
| block = blocks.pop(0) | |||
| temp.extend(np.split(block, stra, axis=i)) | |||
| blocks.extend(temp) | |||
| i += 1 | |||
| return blocks | |||
| def grad_mindspore_impl(self): | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2) | |||
| output_grad = Tensor(self.output_grad_np) | |||
| net = AddRelu() | |||
| net_with_loss = NetWithLoss(net) | |||
| grad_net = Grad(net_with_loss) | |||
| grad_net.set_train() | |||
| input_grads = [] | |||
| for i in range(0, 3): | |||
| input_grad = grad_net(x, y, output_grad) | |||
| input_grads.append(input_grad) | |||
| return input_grads | |||
| def grad_mindspore_parallel_impl(self): | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2) | |||
| output_grad = Tensor(self.output_grad_np) | |||
| inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1]) | |||
| inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1]) | |||
| outgrads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy) | |||
| x1 = Tensor(inputs_x[self.x_id]) | |||
| y1 = Tensor(inputs_y[self.y_id]) | |||
| output_grad1 = Tensor(outgrads[self.out_id]) | |||
| net = AddRelu(strategy0=self.strategy0, strategy1=self.strategy1) | |||
| net_with_loss = NetWithLoss(net, strategy2=self.strategy2) | |||
| grad_net = Grad(net_with_loss) | |||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||
| grad_net.set_auto_parallel() | |||
| grad_net.set_train() | |||
| input_grads = [] | |||
| for i in range(0, 3): | |||
| input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad], | |||
| parallel_inputs_run=[x1, y1, output_grad1]) | |||
| input_grads.append(input_grad) | |||
| return input_grads | |||
| def grad_cmp(self): | |||
| input_grad_mindspores = self.grad_mindspore_impl() | |||
| input_grad_mindspore_parallels = self.grad_mindspore_parallel_impl() | |||
| for i in range(0, len(input_grad_mindspores)): | |||
| input_grad_mindspore = input_grad_mindspores[i] | |||
| input_grad_mindspore_parallel = input_grad_mindspore_parallels[i] | |||
| input_grad_mindspore0 = input_grad_mindspore[0].asnumpy() | |||
| input_grad_mindspore1 = input_grad_mindspore[1].asnumpy() | |||
| input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy() | |||
| input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy() | |||
| input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1]) | |||
| input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2]) | |||
| np.save(path + str(i) + "_" + str(device_id) + "_" + self.prefix + "_grad_single0.npy", | |||
| input_grad_blocks_0[self.x_id]) | |||
| np.save(path + str(i) + "_" + str(device_id) + "_" + self.prefix + "_grad_single1.npy", | |||
| input_grad_blocks_1[self.y_id]) | |||
| np.save(path + str(i) + "_" + str(device_id) + "_" + self.prefix + "_grad_parallel0.npy", | |||
| input_grad_mindspore_parallel0) | |||
| np.save(path + str(i) + "_" + str(device_id) + "_" + self.prefix + "_grad_parallel1.npy", | |||
| input_grad_mindspore_parallel1) | |||
| assert np.allclose(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001) | |||
| assert np.allclose(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001) | |||
def test_reid_loss_grad_input_128_512():
    input_shape = (128, 512)
    fact = AddReluFactory(input_shape, strategy0=(0, (4, 1), (4, 1)), strategy1=(0, (4, 1)),
                          strategy2=(0, (4, 1), (4, 1)))
    fact.grad_cmp()
def test_reid_loss_grad_input_128_512_stridesplit():
    input_shape = (128, 512)
    fact = AddReluFactory(input_shape, strategy0=(0, (1, 1), (1, 1)), strategy1=(0, (4, 1)),
                          strategy2=(0, (4, 1), (4, 1)))
    fact.grad_cmp()
| @@ -1,27 +0,0 @@ | |||
| #!/bin/bash | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| for((i=0;i<4;i++)); | |||
| do | |||
| rm -rf device$i | |||
| mkdir device$i | |||
| cd device$i | |||
| mkdir output | |||
| source ../../dist_env_4p.sh $i | |||
| env >log$i.log | |||
| pytest -s ../test_loss_parallel_4p.py>../../log/test_loss_parallel_4p_log$i.log 2>&1 & | |||
| cd .. | |||
| done | |||
| @@ -1,329 +0,0 @@ | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| import os | |||
| import numpy as np | |||
| from numpy import allclose | |||
| import mindspore.communication.management as distributedTool | |||
| from mindspore import context | |||
| from mindspore.common.tensor import Tensor | |||
| from mindspore.nn import Cell | |||
| from mindspore.ops import operations as P | |||
| from mindspore.ops.composite import grad_all_with_sens | |||
| device_num = 4 | |||
| device_id = int(os.environ["RANK_ID"]) | |||
| path = "./output/" | |||
| def setup_module(): | |||
| print("~~~~~~~~~~~set up~~~~~~~~~~~~~") | |||
| context.set_context(mode=context.GRAPH_MODE) | |||
| context.set_auto_parallel_context(device_num=device_num, global_rank=device_id) | |||
| distributedTool.init() | |||
| distributedTool.create_group("0-3", [0, 1, 2, 3]) | |||
| print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~") | |||
| def teardown_module(): | |||
| print("~~~~~~~~~~~~tear down~~~~~~~~~~") | |||
| class Matmul(Cell): | |||
| def __init__(self, transpose_a=False, transpose_b=False, strategy0=None, strategy1=None): | |||
| super(Matmul, self).__init__() | |||
| self.add = P.TensorAdd(strategy=strategy1) | |||
| self.matmul = P.MatMul(transpose_a, transpose_b, strategy=strategy0) | |||
| def construct(self, x, w, z): | |||
| out = self.add(x, z) | |||
| return self.matmul(out, w) | |||
| class BatchMatMul(Cell): | |||
| def __init__(self, transpose_a=False, transpose_b=False, strategy0=None, strategy1=None): | |||
| super(BatchMatMul, self).__init__() | |||
| self.add = P.TensorAdd(strategy=strategy1) | |||
| self.batchmatmul = P.BatchMatMul(transpose_a, transpose_b, strategy=strategy0) | |||
| def construct(self, x, w, z): | |||
| out = self.add(x, z) | |||
| return self.batchmatmul(out, w) | |||
| class Grad(Cell): | |||
| def __init__(self, network): | |||
| super(Grad, self).__init__() | |||
| self.network = network | |||
| def construct(self, inputa, inputb, inputz, output_grad): | |||
| gout = grad_all_with_sens(self.network)(inputa, inputb, inputz, output_grad) | |||
| return gout | |||
| class BatchmatmulFactory: | |||
| def __init__(self, inputa_shape, inputb_shape, transpose_a, transpose_b, strategy, strategy_): | |||
| self.strategy = strategy | |||
| self.strategy_ = strategy_ | |||
| inputa_size = 1 | |||
| inputb_size = 1 | |||
| prefix = "" | |||
| for s in inputa_shape: | |||
| prefix = prefix + str(s) + "_" | |||
| inputa_size = inputa_size * s | |||
| prefix = prefix + "and" | |||
| for s in inputb_shape: | |||
| prefix = prefix + str(s) + "_" | |||
| inputb_size = inputb_size * s | |||
| number_rangea = min(1000, inputa_size) | |||
| number_rangeb = min(1000, inputb_size) | |||
| self.inputa = np.reshape(np.arange(0, inputa_size) % number_rangea - number_rangea / 2, inputa_shape).astype( | |||
| np.float32) | |||
| self.inputb = np.reshape(np.arange(0, inputb_size) % number_rangeb - number_rangeb / 2, inputb_shape).astype( | |||
| np.float32) | |||
| self.inputz = np.zeros(self.inputa.shape).astype(np.float32) | |||
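        # inputz is all zeros, so the TensorAdd placed before the (Batch)MatMul leaves
        # the values unchanged; it mainly serves to exercise the add strategy (strategy_).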
| self.transpose_a = transpose_a | |||
| self.transpose_b = transpose_b | |||
| out_shape = [] | |||
| device_matrix = [] | |||
| out_strategy = [] | |||
| if transpose_a: | |||
| temp = inputa_shape[-1] | |||
| inputa_shape[-1] = inputa_shape[-2] | |||
| inputa_shape[-2] = temp | |||
| if transpose_b: | |||
| temp = inputb_shape[-1] | |||
| inputb_shape[-1] = inputb_shape[-2] | |||
| inputb_shape[-2] = temp | |||
| if len(inputa_shape) >= len(inputb_shape): | |||
| out_shape = list(inputa_shape) | |||
| out_shape[-1] = inputb_shape[-1] | |||
| else: | |||
| out_shape = list(inputb_shape) | |||
| out_shape[-2] = inputa_shape[-2] | |||
| strategy1 = list(self.strategy[1]) | |||
| strategy2 = list(self.strategy[2]) | |||
| if transpose_a: | |||
| temp = strategy1[-1] | |||
| strategy1[-1] = strategy1[-2] | |||
| strategy1[-2] = temp | |||
| if transpose_b: | |||
| temp = strategy2[-1] | |||
| strategy2[-1] = strategy2[-2] | |||
| strategy2[-2] = temp | |||
| if len(strategy1) >= len(strategy2): | |||
| out_strategy = strategy1.copy() | |||
| out_strategy[-1] = strategy2[-1] | |||
| else: | |||
| out_strategy = strategy2.copy() | |||
| out_strategy[-2] = strategy1[-2] | |||
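        # The device matrix is the output strategy with the split count of the contracted
        # dimension (strategy1[-1]) inserted before the last axis; the product of its
        # entries is the number of devices this matmul strategy requires.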
| device_matrix = out_strategy.copy() | |||
| device_matrix.insert(-1, strategy1[-1]) | |||
| self.out_strategy = out_strategy | |||
| need_dev_num = 1 | |||
| for s in device_matrix: | |||
| need_dev_num = need_dev_num * s | |||
| self.need_dev_num = need_dev_num | |||
| self.device_matrix = device_matrix | |||
| out_size = 1 | |||
| for s in out_shape: | |||
| out_size = out_size * s | |||
| number_range = min(1000, out_size) | |||
| self.output_grad_np = np.reshape(np.arange(0, out_size) % number_range - number_range / 2, out_shape).astype( | |||
| np.float32) | |||
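        # Map this device's rank to a coordinate in the device matrix, then keep the axes
        # relevant to each operand: all but the last axis for input A, the leading batch
        # axes plus the last two for input B, and all but the contracted axis for the
        # output-gradient block (axes are swapped back when a transpose flag is set).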
| device_index = self.id_to_list(device_id % need_dev_num, self.device_matrix) | |||
| x_index = device_index[:-1].copy() | |||
| if transpose_a: | |||
| temp = x_index[-1] | |||
| x_index[-1] = x_index[-2] | |||
| x_index[-2] = temp | |||
| y_index = device_index[:-3].copy() | |||
| y_index.append(device_index[-2]) | |||
| y_index.append(device_index[-1]) | |||
| if transpose_b: | |||
| temp = y_index[-1] | |||
| y_index[-1] = y_index[-2] | |||
| y_index[-2] = temp | |||
| out_index = device_index[:-2].copy() | |||
| out_index.append(device_index[-1]) | |||
| print(device_matrix) | |||
| print(device_index) | |||
| need_dev_num_ = 1 | |||
| for s in strategy_[1]: | |||
| need_dev_num_ = need_dev_num_ * s | |||
| self.x_id = device_id % need_dev_num_ | |||
| self.y_id = self.list_to_id(y_index, self.strategy[2]) | |||
| self.out_id = self.list_to_id(out_index, self.out_strategy) | |||
| def get_parallel_blocks(self, input_, strategy): | |||
| blocks = [input_] | |||
| i = 0 | |||
| for stra in strategy: | |||
| temp = [] | |||
| while len(blocks) > 0: | |||
| block = blocks.pop(0) | |||
| temp.extend(np.split(block, stra, axis=i)) | |||
| blocks.extend(temp) | |||
| i += 1 | |||
| return blocks | |||
| def id_to_list(self, id_, shape): | |||
| """ | |||
| shape:每一维的上限,如(2,4,8) | |||
| """ | |||
| result = [] | |||
| r = id_ | |||
| for i in range(0, len(shape)): | |||
| v = 1 | |||
| for j in range(i + 1, len(shape)): | |||
| v = v * shape[j] | |||
| result.append(r // v) | |||
| r = r % v | |||
| return result | |||
| def list_to_id(self, id_list, shape): | |||
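        """Inverse of id_to_list: fold per-dimension coordinates back into a flat id.

        For example, [1, 1] with shape (2, 4) gives 1 * 4 + 1 = 5.
        """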
| result = 0 | |||
| for i in range(0, len(id_list)): | |||
| v = 1 | |||
| for j in range(i + 1, len(id_list)): | |||
| v = v * shape[j] | |||
| result = result + id_list[i] * v | |||
| return result | |||
| def forward_mindspore_impl(self): | |||
| if len(self.inputa.shape) > 2: | |||
| matmul = BatchMatMul(self.transpose_a, self.transpose_b) | |||
| else: | |||
| matmul = Matmul(self.transpose_a, self.transpose_b) | |||
| matmul.set_train() | |||
| out_me = matmul(Tensor(self.inputa), Tensor(self.inputb), Tensor(self.inputz)) | |||
| return out_me.asnumpy() | |||
| def forward_mindspore_parallel_impl(self): | |||
| if len(self.inputa.shape) > 2: | |||
| matmul = BatchMatMul(self.transpose_a, self.transpose_b, strategy0=self.strategy, strategy1=self.strategy_) | |||
| else: | |||
| matmul = Matmul(self.transpose_a, self.transpose_b, strategy0=self.strategy, strategy1=self.strategy_) | |||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||
| x = Tensor(self.inputa) | |||
| y = Tensor(self.inputb) | |||
| z = Tensor(self.inputz) | |||
| xs = self.get_parallel_blocks(self.inputa, self.strategy_[1]) | |||
| ys = self.get_parallel_blocks(self.inputb, self.strategy[2]) | |||
| zs = self.get_parallel_blocks(self.inputz, self.strategy_[1]) | |||
        x1 = Tensor(xs[self.x_id])
        y1 = Tensor(ys[self.y_id])  # block index derived from the device matrix
| z1 = Tensor(zs[self.x_id]) | |||
| matmul.set_train() | |||
| matmul.set_auto_parallel() | |||
| out_me = matmul(x, y, z, parallel_inputs_compile=[x, y, z], parallel_inputs_run=[x1, y1, z1]) | |||
| return out_me.asnumpy() | |||
| def grad_mindspore_impl(self): | |||
| x = Tensor(self.inputa) | |||
| y = Tensor(self.inputb) | |||
| z = Tensor(self.inputz) | |||
| if len(self.inputa.shape) > 2: | |||
| matmul = BatchMatMul(self.transpose_a, self.transpose_b) | |||
| else: | |||
| matmul = Matmul(self.transpose_a, self.transpose_b) | |||
| net_me = Grad(matmul) | |||
| net_me.set_train() | |||
| out_grad_me = Tensor(self.output_grad_np) | |||
| out_grad = net_me(x, y, z, out_grad_me) | |||
| return out_grad | |||
| def grad_mindspore_parallel_impl(self): | |||
| if len(self.inputa.shape) > 2: | |||
| matmul = BatchMatMul(self.transpose_a, self.transpose_b, strategy0=self.strategy, strategy1=self.strategy_) | |||
| else: | |||
| matmul = Matmul(self.transpose_a, self.transpose_b, strategy0=self.strategy, strategy1=self.strategy_) | |||
| x = Tensor(self.inputa) | |||
| y = Tensor(self.inputb) | |||
| z = Tensor(self.inputz) | |||
| out_grad_me = Tensor(self.output_grad_np) | |||
| xs = self.get_parallel_blocks(self.inputa, self.strategy_[1]) | |||
| ys = self.get_parallel_blocks(self.inputb, self.strategy[2]) | |||
| zs = self.get_parallel_blocks(self.inputz, self.strategy_[1]) | |||
| out_grads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy) | |||
        x1 = Tensor(xs[self.x_id])  # block index derived from the device matrix
        y1 = Tensor(ys[self.y_id])
| z1 = Tensor(zs[self.x_id]) | |||
| out_grad1 = Tensor(out_grads[self.out_id]) | |||
| net_me = Grad(matmul) | |||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||
| net_me.set_auto_parallel() | |||
| net_me.set_train() | |||
| out_grad = net_me(x, y, z, out_grad_me, parallel_inputs_compile=[x, y, z, out_grad1], | |||
| parallel_inputs_run=[x1, y1, z1, out_grad1]) | |||
| return out_grad | |||
| def forward_cmp(self): | |||
| out_mindspore = self.forward_mindspore_impl() | |||
| out_mindspores = self.get_parallel_blocks(out_mindspore, self.out_strategy) | |||
| out_mindspore_parallel = self.forward_mindspore_parallel_impl() | |||
| assert allclose(out_mindspores[self.out_id], out_mindspore_parallel, 0.0001, 0.0001) | |||
| def grad_cmp(self): | |||
| input_grad_mindspore = self.grad_mindspore_impl() | |||
| input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl() | |||
| input_grad_mindspores0 = self.get_parallel_blocks(input_grad_mindspore[0].asnumpy(), self.strategy_[1]) | |||
| input_grad_mindspores1 = self.get_parallel_blocks(input_grad_mindspore[1].asnumpy(), self.strategy[2]) | |||
| input_grad_mindspores2 = self.get_parallel_blocks(input_grad_mindspore[2].asnumpy(), self.strategy_[1]) | |||
| assert allclose(input_grad_mindspores0[self.x_id], input_grad_mindspore_parallel[0].asnumpy(), 0.0001, 0.0001) | |||
| assert allclose(input_grad_mindspores1[self.y_id], input_grad_mindspore_parallel[1].asnumpy(), 0.0001, 0.0001) | |||
| assert allclose(input_grad_mindspores2[self.x_id], input_grad_mindspore_parallel[2].asnumpy(), 0.0001, 0.0001) | |||
| def test_reid_batchmatmul_inputa_128_512_inputb_2000_512(): | |||
| inputa = [128, 512] | |||
| inputb = [2000, 512] | |||
| fact = BatchmatmulFactory(inputa, inputb, False, True, (0, (2, 2), (1, 2)), (0, (2, 2), (2, 2))) | |||
| fact.forward_cmp() | |||
| def test_reid_batchmatmul_grad_inputa_128_512_inputb_2000_512(): | |||
| inputa = [128, 512] | |||
| inputb = [2000, 512] | |||
| fact = BatchmatmulFactory(inputa, inputb, False, True, (0, (2, 2), (1, 2)), (0, (2, 2), (2, 2))) | |||
| fact.grad_cmp() | |||
| def test_reid_batchmatmul_inputa_128_512_inputb_2000_512_redistribution(): | |||
| inputa = [128, 512] | |||
| inputb = [2000, 512] | |||
| fact = BatchmatmulFactory(inputa, inputb, False, True, (0, (1, 2), (1, 2)), (0, (2, 2), (2, 2))) | |||
| fact.forward_cmp() | |||
| def test_reid_batchmatmul_grad_inputa_128_512_inputb_2000_512_redistribution(): | |||
| inputa = [128, 512] | |||
| inputb = [2000, 512] | |||
| fact = BatchmatmulFactory(inputa, inputb, False, True, (0, (1, 2), (1, 2)), (0, (2, 2), (2, 2))) | |||
| fact.grad_cmp() | |||
| @@ -1,27 +0,0 @@ | |||
| #!/bin/bash | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| for((i=0;i<4;i++)); | |||
| do | |||
| rm -rf device$i | |||
| mkdir device$i | |||
| cd device$i | |||
| mkdir output | |||
| source ../../dist_env_4p.sh $i | |||
| env >log$i.log | |||
| pytest -s ../test_matmul_parallel_4p.py >../../log/test_matmul_parallel_4p_log$i.log 2>&1 & | |||
| cd .. | |||
| done | |||
| @@ -1,213 +0,0 @@ | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| import os | |||
| import numpy as np | |||
| import mindspore.communication.management as distributedTool | |||
| from mindspore import context | |||
| from mindspore.common.tensor import Tensor | |||
| from mindspore.nn import Cell | |||
| from mindspore.ops import operations as P | |||
| from mindspore.ops.composite import grad_all_with_sens | |||
| device_num = 4 | |||
| device_id = int(os.environ["RANK_ID"]) | |||
| path = "./output/" | |||
| def setup_module(): | |||
| print("~~~~~~~~~~~set up~~~~~~~~~~~~~") | |||
| context.set_context(mode=context.GRAPH_MODE) | |||
| context.set_auto_parallel_context(device_num=device_num, global_rank=device_id) | |||
| distributedTool.init() | |||
| distributedTool.create_group("0-3", [0, 1, 2, 3]) | |||
| print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~") | |||
| def teardown_module(): | |||
| print("~~~~~~~~~~~~tear down~~~~~~~~~~") | |||
| class Grad(Cell): | |||
| def __init__(self, network): | |||
| super(Grad, self).__init__() | |||
| self.network = network | |||
| def construct(self, input1, input2, output_grad): | |||
| return grad_all_with_sens(self.network)(input1, input2, output_grad) | |||
| class Max(Cell): | |||
| def __init__(self, axis, keep_dims, strategy0=None, strategy1=None): | |||
| super(Max, self).__init__() | |||
| self.add = P.TensorAdd(strategy=strategy0) | |||
| self.reduce_max = P.ReduceMax(keep_dims=keep_dims).set_strategy(strategy=strategy1) | |||
| self.axis = axis | |||
| def construct(self, input1, input2): | |||
| out = self.add(input1, input2) | |||
| return self.reduce_max(out, self.axis) | |||
| class MaxFactory: | |||
| def __init__(self, input_shape, axis, keep_dims, strategy0, strategy1): | |||
| self.strategy0 = strategy0 | |||
| self.strategy1 = strategy1 | |||
| self.axis = axis | |||
| self.keep_dims = keep_dims | |||
| input_size = 1 | |||
| prefix = "" | |||
| for s in input_shape: | |||
| prefix = prefix + str(s) + "_" | |||
| input_size = input_size * s | |||
| number_range = min(1000, input_size) | |||
| self.input_np1 = np.reshape(np.arange(0, input_size) % number_range - number_range / 2, input_shape).astype( | |||
| np.float32) | |||
| self.input_np2 = self.input_np1.copy() | |||
| self.out_grad_np = None | |||
| out_shape = list(input_shape) | |||
| out_shape.pop(axis) | |||
        out_size = input_size // input_shape[axis]
| number_range_ = min(1000, out_size) | |||
| self.out_grad_np = np.reshape(np.arange(0, out_size) % number_range_ - number_range_ / 2, out_shape).astype( | |||
| np.float32) | |||
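        # The reduced axis disappears from the output (keep_dims is False in these tests),
        # so the output strategy is strategy1[1] with that axis's split count removed.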
| out_strategy = list(strategy1[1]) | |||
| out_strategy.pop(axis) | |||
| self.out_strategy = out_strategy | |||
| need_dev_num = 1 | |||
| need_dev_num_ = 1 | |||
| for s in strategy0[1]: | |||
| need_dev_num = need_dev_num * s | |||
| for s in out_strategy: | |||
| need_dev_num_ = need_dev_num_ * s | |||
| self.x_id = device_id % need_dev_num | |||
| self.y_id = device_id % need_dev_num | |||
| self.out_id = device_id % need_dev_num_ | |||
| def get_parallel_blocks(self, input_, strategy): | |||
| blocks = [input_] | |||
| i = 0 | |||
| for stra in strategy: | |||
| temp = [] | |||
| while len(blocks) > 0: | |||
| block = blocks.pop(0) | |||
| temp.extend(np.split(block, stra, axis=i)) | |||
| blocks.extend(temp) | |||
| i += 1 | |||
| return blocks | |||
| def forward_mindspore_impl(self): | |||
| input1 = Tensor(self.input_np1) | |||
| input2 = Tensor(self.input_np2) | |||
| net = Max(axis=self.axis, keep_dims=self.keep_dims) | |||
| out = net(input1, input2) | |||
| return out.asnumpy() | |||
| def forward_mindspore_parallel_impl(self): | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2) | |||
| xs = self.get_parallel_blocks(self.input_np1, self.strategy0[1]) | |||
| ys = self.get_parallel_blocks(self.input_np2, self.strategy0[1]) | |||
| x1 = Tensor(xs[self.x_id]) | |||
| y1 = Tensor(ys[self.y_id]) | |||
| net = Max(axis=self.axis, keep_dims=self.keep_dims, strategy0=self.strategy0, strategy1=self.strategy1) | |||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||
| net.set_auto_parallel() | |||
| out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1]) | |||
| return out.asnumpy() | |||
| def grad_mindspore_impl(self): | |||
| input1 = Tensor(self.input_np1) | |||
| input2 = Tensor(self.input_np2) | |||
| out_grad = Tensor(self.out_grad_np) | |||
| net = Max(axis=self.axis, keep_dims=self.keep_dims) | |||
| grad_net = Grad(net) | |||
| grad_net.set_train() | |||
| input_grad = grad_net(input1, input2, out_grad) | |||
| return input_grad | |||
| def grad_mindspore_parallel_impl(self): | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2) | |||
| output_grads = self.get_parallel_blocks(self.out_grad_np, self.out_strategy) | |||
| out_grad = Tensor(output_grads[self.out_id]) | |||
| xs = self.get_parallel_blocks(self.input_np1, self.strategy0[1]) | |||
| ys = self.get_parallel_blocks(self.input_np2, self.strategy0[1]) | |||
| x1 = Tensor(xs[self.x_id]) | |||
| y1 = Tensor(ys[self.y_id]) | |||
| net = Max(axis=self.axis, keep_dims=self.keep_dims, strategy0=self.strategy0, strategy1=self.strategy1) | |||
| grad_net = Grad(net) | |||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||
| grad_net.set_auto_parallel() | |||
| grad_net.set_train() | |||
| input_grad = grad_net(x, y, out_grad, parallel_inputs_compile=[x, y, out_grad], | |||
| parallel_inputs_run=[x1, y1, out_grad]) | |||
| return input_grad | |||
| def forward_cmp(self): | |||
| out_mindspore = self.forward_mindspore_impl() | |||
| out_mindspore_parallel = self.forward_mindspore_parallel_impl() | |||
| print(out_mindspore) | |||
| print(out_mindspore_parallel) | |||
| out_blocks = self.get_parallel_blocks(out_mindspore, self.out_strategy) | |||
| assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.001, 0.001) | |||
| def grad_cmp(self): | |||
| input_grad_mindspore = self.grad_mindspore_impl() | |||
| input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl() | |||
| input_grad_mindspore0 = input_grad_mindspore[0].asnumpy() | |||
| input_grad_mindspore1 = input_grad_mindspore[1].asnumpy() | |||
| input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy() | |||
| input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy() | |||
| input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1]) | |||
| input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2]) | |||
| assert np.allclose(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001) | |||
| assert np.allclose(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001) | |||
| def test_reid_max_forward_input_256_64(): | |||
| fact = MaxFactory(input_shape=(256, 64), axis=1, keep_dims=False, strategy0=(0, (4, 1), (4, 1)), | |||
| strategy1=(0, (4, 1))) | |||
| fact.forward_cmp() | |||
| def test_reid_max_grad_input_256_64(): | |||
| fact = MaxFactory(input_shape=(256, 64), axis=1, keep_dims=False, strategy0=(0, (4, 1), (4, 1)), | |||
| strategy1=(0, (4, 1))) | |||
| fact.grad_cmp() | |||
| def test_reid_max_forward_input_128_64_32_32(): | |||
| fact = MaxFactory(input_shape=(128, 64, 32, 32), axis=3, keep_dims=False, strategy0=(0, (2, 1, 2, 1), (2, 1, 2, 1)), | |||
| strategy1=(0, (2, 1, 2, 1))) | |||
| fact.forward_cmp() | |||
| def test_reid_max_grad_input_128_64_32_32(): | |||
| fact = MaxFactory(input_shape=(128, 64, 32, 32), axis=3, keep_dims=False, strategy0=(0, (2, 1, 2, 1), (2, 1, 2, 1)), | |||
| strategy1=(0, (2, 1, 2, 1))) | |||
| fact.grad_cmp() | |||
| def test_reid_max_forward_input_256_64_repeat(): | |||
| fact = MaxFactory(input_shape=(256, 64), axis=1, keep_dims=False, strategy0=(0, (2, 1), (2, 1)), | |||
| strategy1=(0, (2, 1))) | |||
| fact.forward_cmp() | |||
| def test_reid_max_grad_input_256_64_repeat(): | |||
| fact = MaxFactory(input_shape=(256, 64), axis=1, keep_dims=False, strategy0=(0, (2, 1), (2, 1)), | |||
| strategy1=(0, (2, 1))) | |||
| fact.grad_cmp() | |||
| @@ -1,27 +0,0 @@ | |||
| #!/bin/bash | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| for((i=0;i<4;i++)); | |||
| do | |||
| rm -rf device$i | |||
| mkdir device$i | |||
| cd device$i | |||
| mkdir output | |||
| source ../../dist_env_4p.sh $i | |||
| env >log$i.log | |||
| pytest -s ../test_max_parallel_4p.py>../../log/test_max_parallel_4p_log$i.log 2>&1 & | |||
| cd .. | |||
| done | |||
| @@ -1,27 +0,0 @@ | |||
| #!/bin/bash | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| for((i=0;i<4;i++)); | |||
| do | |||
| rm -rf device$i | |||
| mkdir device$i | |||
| cd device$i | |||
| mkdir output | |||
| source ../../dist_env_4p.sh $i | |||
| env >log$i.log | |||
| pytest -s ../test_mul_softmax_parallel_4p.py>../../log/test_mul_softmax_parallel_4p_log$i.log 2>&1 & | |||
| cd .. | |||
| done | |||
| @@ -1,200 +0,0 @@ | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| import os | |||
| import numpy as np | |||
| import pytest | |||
| import mindspore as ms | |||
| import mindspore.communication.management as distributedTool | |||
| from mindspore import context | |||
| from mindspore.common.tensor import Tensor | |||
| from mindspore.nn import Cell | |||
| from mindspore.ops import operations as P | |||
| from mindspore.ops.composite import grad_all_with_sens | |||
| device_num = 4 | |||
| device_id = int(os.environ["RANK_ID"]) | |||
| path = "./output/" | |||
| def setup_module(): | |||
| print("~~~~~~~~~~~set up~~~~~~~~~~~~~") | |||
| context.set_context(mode=context.GRAPH_MODE) | |||
| context.set_auto_parallel_context(device_num=device_num, global_rank=device_id) | |||
| distributedTool.init() | |||
| distributedTool.create_group("0-3", [0, 1, 2, 3]) | |||
| print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~") | |||
| def teardown_module(): | |||
| print("~~~~~~~~~~~~tear down~~~~~~~~~~") | |||
| class MulSoftmax(Cell): | |||
| def __init__(self, strategy0=None, strategy1=None, axis=0): | |||
| super(MulSoftmax, self).__init__() | |||
| self.mul = P.Mul(strategy=strategy0) | |||
| self.softmax = P.Softmax(axis=axis, strategy=strategy1) | |||
| def construct(self, x, z): | |||
| out = self.mul(x, z) | |||
| return self.softmax(out) | |||
| class Grad(Cell): | |||
| def __init__(self, network): | |||
| super(Grad, self).__init__() | |||
| self.network = network | |||
| def construct(self, x, y, output_grad): | |||
| return grad_all_with_sens(self.network)(x, y, output_grad) | |||
| class MulSoftmaxFactory: | |||
| def __init__(self, input_shape, strategy0, strategy1): | |||
| prefix = "" | |||
| size = 1 | |||
| for s in input_shape: | |||
| prefix = prefix + str(s) | |||
| size = size * s | |||
| self.prefix = prefix | |||
| number_range = min(1000, size) | |||
| self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype( | |||
| np.float32) | |||
| self.input_np2 = 1.0 | |||
| self.output_grad_np = np.reshape((np.arange(0, size) % (number_range - 10) - number_range / 2) * 0.1, | |||
| input_shape).astype(np.float32) | |||
| self.strategy0 = strategy0 | |||
| self.strategy1 = strategy1 | |||
| need_dev_num = 1 | |||
| need_dev_num_ = 1 | |||
| for s in strategy0[1]: | |||
| need_dev_num = need_dev_num * s | |||
| for s in strategy1[1]: | |||
| need_dev_num_ = need_dev_num_ * s | |||
| self.x_id = device_id % need_dev_num | |||
| self.y_id = device_id % need_dev_num | |||
| self.out_id = device_id % need_dev_num_ | |||
| def forward_mindspore_impl(self): | |||
| net = MulSoftmax() | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2, ms.float32) | |||
| out = net(x, y) | |||
| return out.asnumpy() | |||
| def forward_mindspore_parallel_impl(self): | |||
| net = MulSoftmax(strategy0=self.strategy0, strategy1=self.strategy1) | |||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||
| net.set_auto_parallel() | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2, ms.float32) | |||
| inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1]) | |||
| x1 = Tensor(inputs_x[self.x_id]) | |||
| y1 = Tensor(self.input_np2, ms.float32) | |||
| out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1]) | |||
| return out.asnumpy() | |||
| def grad_mindspore_impl(self): | |||
| output_grad = Tensor(self.output_grad_np) | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2, ms.float32) | |||
| net = MulSoftmax() | |||
| grad_net = Grad(net) | |||
| grad_net.set_train() | |||
| input_grad = grad_net(x, y, output_grad) | |||
| return input_grad | |||
| def grad_mindspore_parallel_impl(self): | |||
| output_grads = self.get_parallel_blocks(self.output_grad_np, self.strategy1[1]) | |||
| output_grad = Tensor(output_grads[self.out_id]) | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2, ms.float32) | |||
| net = MulSoftmax(strategy0=self.strategy0, strategy1=self.strategy1) | |||
| grad_net = Grad(net) | |||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||
| grad_net.set_train() | |||
| grad_net.set_auto_parallel() | |||
| inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1]) | |||
| x1 = Tensor(inputs_x[self.x_id]) | |||
| y1 = Tensor(self.input_np2, ms.float32) | |||
| input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad], | |||
| parallel_inputs_run=[x1, y1, output_grad]) | |||
| return input_grad | |||
| def get_parallel_blocks(self, input_, strategy): | |||
| blocks = [input_] | |||
| i = 0 | |||
| for stra in strategy: | |||
| temp = [] | |||
| while len(blocks) > 0: | |||
| block = blocks.pop(0) | |||
| temp.extend(np.split(block, stra, axis=i)) | |||
| blocks.extend(temp) | |||
| i += 1 | |||
| return blocks | |||
| def forward_cmp(self): | |||
| out_mindspore = self.forward_mindspore_impl() | |||
| out_mindspore_parallel = self.forward_mindspore_parallel_impl() | |||
| np.save(path + str(device_id) + "_" + self.prefix + "_forward_parallel.npy", out_mindspore_parallel) | |||
| out_blocks = self.get_parallel_blocks(out_mindspore, self.strategy1[1]) | |||
| assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.0001, 0.001) | |||
| def grad_cmp(self): | |||
| input_grad_mindspore = self.grad_mindspore_impl() | |||
| input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl() | |||
| input_grad_mindspore0 = input_grad_mindspore[0].asnumpy() | |||
| input_grad_mindspore1 = input_grad_mindspore[1].asnumpy() | |||
| input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy() | |||
| input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy() | |||
| np.save(path + str(device_id) + "_" + self.prefix + "_grad_parallel0.npy", input_grad_mindspore_parallel0) | |||
| np.save(path + str(device_id) + "_" + self.prefix + "_grad_parallel1.npy", input_grad_mindspore_parallel1) | |||
        # Only the first Mul input (x1) is sliced; the scalar second input (x2) is
        # broadcast, so its gradient is compared without splitting.
        input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1])
| assert np.allclose(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001) | |||
| assert np.allclose(input_grad_mindspore1, input_grad_mindspore_parallel1, 0.0001, 0.0001) | |||
| @pytest.mark.reid_forward | |||
| def test_reid_mul_softmax_input_128x64(): | |||
| stra0 = (0, (1, 4), ()) | |||
| stra1 = (0, (1, 4)) | |||
| fact = MulSoftmaxFactory(input_shape=(128, 64), strategy0=stra0, strategy1=stra1) | |||
| fact.forward_cmp() | |||
| @pytest.mark.reid_grad | |||
| def test_reid_grad_mul_softmax_input_128x64(): | |||
| stra0 = (0, (1, 4), ()) | |||
| stra1 = (0, (1, 4)) | |||
| fact = MulSoftmaxFactory(input_shape=(128, 64), strategy0=stra0, strategy1=stra1) | |||
| fact.grad_cmp() | |||
| @pytest.mark.reid_forward | |||
| def test_reid_mul_softmax_input_128x64_all_to_all(): | |||
| stra0 = (0, (4, 1), ()) | |||
| stra1 = (0, (1, 4)) | |||
| fact = MulSoftmaxFactory(input_shape=(128, 64), strategy0=stra0, strategy1=stra1) | |||
| fact.forward_cmp() | |||
| @pytest.mark.reid_grad | |||
| def test_reid_grad_mul_softmax_input_128x64_all_to_all(): | |||
| stra0 = (0, (4, 1), ()) | |||
| stra1 = (0, (1, 4)) | |||
| fact = MulSoftmaxFactory(input_shape=(128, 64), strategy0=stra0, strategy1=stra1) | |||
| fact.grad_cmp() | |||
| @@ -1,147 +0,0 @@ | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| import os | |||
| import numpy as np | |||
| import mindspore as ms | |||
| import mindspore.communication.management as distributedTool | |||
| from mindspore import context | |||
| from mindspore.common.tensor import Tensor | |||
| from mindspore.nn import Cell | |||
| from mindspore.ops import operations as P | |||
| device_num = 4 | |||
| device_id = int(os.environ["RANK_ID"]) | |||
| path = "./output/" | |||
| def setup_module(): | |||
| print("~~~~~~~~~~~set up~~~~~~~~~~~~~") | |||
| context.set_context(mode=context.GRAPH_MODE) | |||
| context.set_auto_parallel_context(device_num=device_num, global_rank=device_id) | |||
| distributedTool.init() | |||
| distributedTool.create_group("0-3", [0, 1, 2, 3]) | |||
| print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~") | |||
| def teardown_module(): | |||
| print("~~~~~~~~~~~~tear down~~~~~~~~~~") | |||
| class Onehot(Cell): | |||
| def __init__(self, axis=-1, depth=1, on_value=1.0, off_value=0.0, strategy=None): | |||
| super(Onehot, self).__init__() | |||
| self.onehot = P.OneHot(axis, strategy=strategy) | |||
| self.depth = depth | |||
| self.on_value = Tensor(on_value, ms.float32) | |||
| self.off_value = Tensor(off_value, ms.float32) | |||
| def construct(self, indices): | |||
| return self.onehot(indices, self.depth, self.on_value, self.off_value) | |||
| class OneHotFactory: | |||
| def __init__(self, input_shape, depth, on_value=1.0, off_value=0.0, axis=None, dtype=None, strategy0=None): | |||
| size = 1 | |||
| prefix = "" | |||
| for s in input_shape: | |||
| prefix = prefix + str(s) | |||
| size = size * s | |||
| self.prefix = prefix | |||
| number_range = min(10, size) | |||
| self.input_np = np.reshape(np.arange(0, size) % number_range, input_shape).astype(np.int32) | |||
| self.depth = depth | |||
| self.on_value = on_value | |||
| self.off_value = off_value | |||
| self.axis = axis | |||
| self.dtype = dtype | |||
| self.strategy0 = strategy0 | |||
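        # OneHot is split only along the indices dimension here, so the input block index
        # and the forward-output block index coincide on every device.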
| need_dev_num = 1 | |||
| for s in strategy0[1]: | |||
| need_dev_num = need_dev_num * s | |||
| self.x_id = device_id % need_dev_num | |||
| self.out_id = device_id % need_dev_num | |||
| def get_parallel_blocks(self, input_, strategy): | |||
| blocks = [input_] | |||
| i = 0 | |||
| for stra in strategy: | |||
| temp = [] | |||
| while len(blocks) > 0: | |||
| block = blocks.pop(0) | |||
| temp.extend(np.split(block, stra, axis=i)) | |||
| blocks.extend(temp) | |||
| i += 1 | |||
| return blocks | |||
| def forward_mindspore_impl(self): | |||
| indices = Tensor(self.input_np) | |||
| net = Onehot(axis=self.axis, | |||
| depth=self.depth, | |||
| on_value=self.on_value, | |||
| off_value=self.off_value) | |||
| out = net(indices) | |||
| return out.asnumpy() | |||
| def forward_mindspore_parallel_impl(self): | |||
| x = Tensor(self.input_np) | |||
| inputs_x = self.get_parallel_blocks(self.input_np, self.strategy0[1]) | |||
| x1 = Tensor(inputs_x[self.x_id]) | |||
| net = Onehot(axis=self.axis, | |||
| depth=self.depth, | |||
| on_value=self.on_value, | |||
| off_value=self.off_value, strategy=self.strategy0) | |||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||
| net.set_auto_parallel() | |||
| out = net(x, parallel_inputs_compile=[x], parallel_inputs_run=[x1]) | |||
| return out.asnumpy() | |||
| def forward_cmp(self): | |||
| out_mindspore = self.forward_mindspore_impl() | |||
| out_mindspore_parallel = self.forward_mindspore_parallel_impl() | |||
| out_blocks = self.get_parallel_blocks(out_mindspore, self.strategy0[1]) | |||
| assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.0001, 0.0001) | |||
def test_reid_onehot_forward_int32_128_depth131072():
| fact = OneHotFactory(input_shape=(128,), | |||
| depth=131072, | |||
| on_value=1.000000, | |||
| off_value=0.000000, | |||
| axis=-1, | |||
| dtype="float32", | |||
| strategy0=(0, (2,))) | |||
| fact.forward_cmp() | |||
| def test_reid_onehot_forward_int32_131072_depth127(): | |||
| fact = OneHotFactory(input_shape=(131072,), | |||
| depth=127, | |||
| on_value=1.000000, | |||
| off_value=0.000000, | |||
| axis=-1, | |||
| dtype="float32", | |||
| strategy0=(0, (4,))) | |||
| fact.forward_cmp() | |||
| @@ -1,27 +0,0 @@ | |||
| #!/bin/bash | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| for((i=0;i<4;i++)); | |||
| do | |||
| rm -rf device$i | |||
| mkdir device$i | |||
| cd device$i | |||
| mkdir output | |||
| source ../../dist_env_4p.sh $i | |||
| env >log$i.log | |||
| pytest -s ../test_onehot_parallel_4p.py>../../log/test_onehot_parallel_4p_log$i.log 2>&1 & | |||
| cd .. | |||
| done | |||
| @@ -1,206 +0,0 @@ | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| import os | |||
| import numpy as np | |||
| import pytest | |||
| import mindspore as ms | |||
| import mindspore.communication.management as distributedTool | |||
| from mindspore import context | |||
| from mindspore.common.tensor import Tensor | |||
| from mindspore.nn import Cell | |||
| from mindspore.ops import operations as P | |||
| from mindspore.ops.composite import grad_all_with_sens | |||
| device_num = 4 | |||
| device_id = int(os.environ["RANK_ID"]) | |||
| path = "./output/" | |||
| def setup_module(): | |||
| print("~~~~~~~~~~~set up~~~~~~~~~~~~~") | |||
| context.set_context(mode=context.GRAPH_MODE) | |||
| context.set_auto_parallel_context(device_num=device_num, global_rank=device_id) | |||
| distributedTool.init() | |||
| distributedTool.create_group("0-3", [0, 1, 2, 3]) | |||
| print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~") | |||
| def teardown_module(): | |||
| print("~~~~~~~~~~~~tear down~~~~~~~~~~") | |||
| class PReLU(Cell): | |||
| def __init__(self, channel=1, w=0.25, strategy_=None, strategy1_=None): | |||
| super(PReLU, self).__init__() | |||
| self.add = P.TensorAdd(strategy=strategy1_) | |||
| self.prelu = P.PReLU(strategy=strategy_) | |||
| self.channel = channel | |||
| def construct(self, x, z, w): | |||
| out = self.add(x, z) | |||
| return self.prelu(out, w) | |||
| class Grad(Cell): | |||
| def __init__(self, network): | |||
| super(Grad, self).__init__() | |||
| self.network = network | |||
| def construct(self, input_, z, w, output_grad): | |||
| return grad_all_with_sens(self.network)(input_, z, w, output_grad) | |||
| class PReLUFactory: | |||
| def __init__(self, input_shape, strategy): | |||
| n, c = input_shape[:2] | |||
| prefix = "" | |||
| size = 1 | |||
| for s in input_shape: | |||
| prefix = prefix + str(s) | |||
| size = size * s | |||
| self.prefix = prefix | |||
| number_range = min(1000, size) | |||
| self.input_np = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype(np.float32) | |||
| self.output_grad_np = np.reshape((np.arange(0, size) % (number_range - 10) - number_range / 2) * 0.1, | |||
| input_shape).astype(np.float32) | |||
| self.channel = c | |||
| self.weight = np.array([np.float32(0.25)] * c) | |||
| self.strategy = strategy | |||
| def forward_mindspore_impl(self): | |||
| net = PReLU(channel=self.channel, w=self.weight) | |||
| x = Tensor(self.input_np) | |||
| z = Tensor(np.zeros(self.input_np.shape), ms.float32) | |||
| w = Tensor(self.weight) | |||
| out = net(x, z, w) | |||
| return out.asnumpy() | |||
| def forward_mindspore_parallel_impl(self): | |||
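        # The TensorAdd feeding PReLU reuses the PReLU input layout for both of its operands
        # (strategy1_ repeats strategy[1]), so x and the all-zero z are sliced into
        # identically shaped blocks.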
| net = PReLU(channel=self.channel, w=self.weight, strategy_=self.strategy, | |||
| strategy1_=(self.strategy[0], self.strategy[1], self.strategy[1])) | |||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||
| net.set_auto_parallel() | |||
| x = Tensor(self.input_np) | |||
| z = Tensor(np.zeros(self.input_np.shape), ms.float32) | |||
| w = Tensor(self.weight) | |||
| inputs = self.get_parallel_blocks(self.input_np, self.strategy[1]) | |||
| block_id = device_id % len(inputs) | |||
| x1 = Tensor(inputs[block_id]) | |||
| z1 = Tensor(np.zeros(inputs[block_id].shape), ms.float32) | |||
| w1 = Tensor(self.weight) | |||
| out = net(x, z, w, parallel_inputs_compile=[x, z, w], parallel_inputs_run=[x1, z1, w1]) | |||
| return out.asnumpy() | |||
| def grad_mindspore_impl(self): | |||
| output_grad = Tensor(self.output_grad_np) | |||
| x = Tensor(self.input_np) | |||
| z = Tensor(np.zeros(self.input_np.shape), ms.float32) | |||
| w = Tensor(self.weight) | |||
| net = PReLU(channel=self.channel, w=self.weight) | |||
| grad_net = Grad(net) | |||
| grad_net.set_train() | |||
| input_grad = grad_net(x, z, w, output_grad) | |||
| return input_grad | |||
| def grad_mindspore_parallel_impl(self): | |||
| output_grads = self.get_parallel_blocks(self.output_grad_np, self.strategy[1]) | |||
| block_id = device_id % len(output_grads) | |||
| output_grad = Tensor(output_grads[block_id]) | |||
| x = Tensor(self.input_np) | |||
| z = Tensor(np.zeros(self.input_np.shape), ms.float32) | |||
| w = Tensor(self.weight) | |||
| net = PReLU(channel=self.channel, w=self.weight, strategy_=self.strategy, | |||
| strategy1_=(self.strategy[0], self.strategy[1], self.strategy[1])) | |||
| grad_net = Grad(net) | |||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||
| grad_net.set_auto_parallel() | |||
| grad_net.set_train() | |||
| inputs = self.get_parallel_blocks(self.input_np, self.strategy[1]) | |||
| x1 = Tensor(inputs[block_id]) | |||
| z1 = Tensor(np.zeros(inputs[block_id].shape), ms.float32) | |||
| w1 = Tensor(self.weight) | |||
| input_grad = grad_net(x, z, w, output_grad, parallel_inputs_compile=[x, z, w, output_grad], | |||
| parallel_inputs_run=[x1, z1, w1, output_grad]) | |||
| return input_grad | |||
| def get_parallel_blocks(self, input_, strategy): | |||
| blocks = [input_] | |||
| i = 0 | |||
| for stra in strategy: | |||
| temp = [] | |||
| while len(blocks) > 0: | |||
| block = blocks.pop(0) | |||
| temp.extend(np.split(block, stra, axis=i)) | |||
| blocks.extend(temp) | |||
| i += 1 | |||
| return blocks | |||
| def forward_cmp(self): | |||
| out_mindspore = self.forward_mindspore_impl() | |||
| out_mindspore_parallel = self.forward_mindspore_parallel_impl() | |||
| out_blocks = self.get_parallel_blocks(out_mindspore, self.strategy[1]) | |||
| block_id = device_id % len(out_blocks) | |||
| assert np.allclose(out_blocks[block_id], out_mindspore_parallel, 0.0001, 0.001) | |||
| def grad_cmp(self): | |||
| input_grad_mindspore = self.grad_mindspore_impl() | |||
| input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl() | |||
| input_grad_mindspore0 = input_grad_mindspore[0].asnumpy() | |||
| input_grad_mindspore1 = input_grad_mindspore[1].asnumpy() | |||
| input_grad_mindspore2 = input_grad_mindspore[2].asnumpy() | |||
| input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy() | |||
| input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy() | |||
| input_grad_mindspore_parallel2 = input_grad_mindspore_parallel[2].asnumpy() | |||
| input_grad_blocks = self.get_parallel_blocks(input_grad_mindspore0, self.strategy[1]) | |||
| input1_grad_blocks = self.get_parallel_blocks(input_grad_mindspore1, self.strategy[1]) | |||
| block_id = device_id % len(input_grad_blocks) | |||
| assert np.allclose(input_grad_blocks[block_id], input_grad_mindspore_parallel0, 0.0001, 0.0001) | |||
| assert np.allclose(input_grad_mindspore2, input_grad_mindspore_parallel2, 0.0001, 0.0001) | |||
| assert np.allclose(input1_grad_blocks[block_id], input_grad_mindspore_parallel1, 0.0001, 0.0001) | |||
| @pytest.mark.reid_grad | |||
| def test_reid_prelu_input_128x64x112x112_repeat(): | |||
    stra = (0, (1, 1, 2, 1), (1,))
| fact = PReLUFactory(input_shape=(128, 64, 112, 112), strategy=stra) | |||
| fact.forward_cmp() | |||
| @pytest.mark.reid_grad | |||
| def test_reid_grad_prelu_input_128x64x112x112_repeat(): | |||
    stra = (0, (1, 1, 2, 1), (1,))
| fact = PReLUFactory(input_shape=(128, 64, 112, 112), strategy=stra) | |||
| fact.grad_cmp() | |||
| @pytest.mark.reid_grad | |||
| def test_reid_prelu_input_128x64x112x112_mix(): | |||
    stra = (0, (2, 1, 1, 2), (1,))
| fact = PReLUFactory(input_shape=(128, 64, 112, 112), strategy=stra) | |||
| fact.forward_cmp() | |||
| @pytest.mark.reid_grad | |||
| def test_reid_grad_prelu_input_128x64x112x112_mix(): | |||
    stra = (0, (2, 1, 1, 2), (1,))
| fact = PReLUFactory(input_shape=(128, 64, 112, 112), strategy=stra) | |||
| fact.grad_cmp() | |||
| @@ -1,27 +0,0 @@ | |||
| #!/bin/bash | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| for((i=0;i<4;i++)); | |||
| do | |||
| rm -rf device$i | |||
| mkdir device$i | |||
| cd device$i | |||
| mkdir output | |||
| source ../../dist_env_4p.sh $i | |||
| env >log$i.log | |||
| pytest -s ../test_prelu_parallel_4p.py >../../log/test_prelu_parallel_4p_log$i.log 2>&1 & | |||
| cd .. | |||
| done | |||
| @@ -1,252 +0,0 @@ | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| import os | |||
| import numpy as np | |||
| from numpy import allclose as allclose_nparray | |||
| import mindspore as ms | |||
| import mindspore.communication.management as distributedTool | |||
| from mindspore import context | |||
| from mindspore.common.tensor import Tensor | |||
| from mindspore.nn import Cell | |||
| from mindspore.ops import operations as P | |||
| from mindspore.ops.composite import grad_all_with_sens | |||
| device_num = 4 | |||
| device_id = int(os.environ["RANK_ID"]) | |||
| path = "./output/" | |||
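| # These tests assume a 4-device group; RANK_ID selects which input/output blocks this process computes and compares. | |||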
| def setup_module(): | |||
| print("~~~~~~~~~~~set up~~~~~~~~~~~~~") | |||
| context.set_context(mode=context.GRAPH_MODE) | |||
| context.set_auto_parallel_context(device_num=device_num, global_rank=device_id) | |||
| distributedTool.init() | |||
| distributedTool.create_group("0-3", [0, 1, 2, 3]) | |||
| print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~") | |||
| def teardown_module(): | |||
| print("~~~~~~~~~~~~tear down~~~~~~~~~~") | |||
| class Grad(Cell): | |||
| def __init__(self, network): | |||
| super(Grad, self).__init__() | |||
| self.network = network | |||
| def construct(self, x, y, output_grad): | |||
| return grad_all_with_sens(self.network)(x, y, output_grad) | |||
| class GradScalar(Cell): | |||
| def __init__(self, network): | |||
| super(GradScalar, self).__init__() | |||
| self.network = network | |||
| self.sens = Tensor([1.0], dtype=ms.float32) | |||
| def construct(self, x, y): | |||
| return grad_all_with_sens(self.network)(x, y, self.sens) | |||
| class ReduceMean(Cell): | |||
| def __init__(self, keep_dims, axis, strategy0=None, strategy1=None): | |||
| super(ReduceMean, self).__init__() | |||
| self.add = P.TensorAdd(strategy=strategy0) | |||
| self.reduce_mean = P.ReduceMean(keep_dims=keep_dims).set_strategy(strategy=strategy1) | |||
| self.axis = axis | |||
| def construct(self, x, y): | |||
| out = self.add(x, y) | |||
| return self.reduce_mean(out, self.axis) | |||
| class ReduceMeanFactory: | |||
| def __init__(self, input_shape, keep_dims, axis, strategy0=None, strategy1=None): | |||
| prefix = "" | |||
| size = 1 | |||
| for s in input_shape: | |||
| prefix = prefix + str(s) | |||
| size = size * s | |||
| self.prefix = prefix | |||
| number_range = min(1000, size) | |||
| self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype( | |||
| np.float32) | |||
| self.input_np2 = np.reshape(np.arange(0, size) % number_range - number_range / 4, input_shape).astype( | |||
| np.float32) | |||
| self.keep_dims = keep_dims | |||
| self.axis = axis | |||
| target_shape = self.input_np1.mean(axis=axis, keepdims=keep_dims).shape | |||
| target_size = 1 | |||
| for s in target_shape: | |||
| target_size = target_size * s | |||
| number_range = min(1000, target_size) | |||
| self.output_grad_np = np.array([1.0], dtype=np.float32) | |||
| if len(target_shape) > 0: | |||
| self.output_grad_np = np.reshape(np.arange(0, target_size) % number_range, target_shape).astype( | |||
| np.float32) + 1.0 | |||
| self.shape = target_shape | |||
| self.strategy0 = strategy0 | |||
| self.strategy1 = strategy1 | |||
| out_strategy = [] | |||
| axis_ = list(axis) | |||
| if axis_[0] == -1: | |||
| axis_[0] = len(input_shape) - 1 | |||
| for i in range(0, len(input_shape)): | |||
| if i in axis_: | |||
| if keep_dims: | |||
| out_strategy.append(1) | |||
| else: | |||
| out_strategy.append(strategy1[1][i]) | |||
| self.out_strategy = out_strategy | |||
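| # Example: input (64, 128, 32, 32), axis=(2, 3), keep_dims=True, strategy1=(0, (2, 1, 2, 1)) gives out_strategy [2, 1, 1, 1]. | |||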
| need_dev_num0 = 1 | |||
| need_dev_num1 = 1 | |||
| for s in strategy0[1]: | |||
| need_dev_num0 = need_dev_num0 * s | |||
| for s in out_strategy: | |||
| need_dev_num1 = need_dev_num1 * s | |||
| self.x_id = device_id % need_dev_num0 | |||
| self.y_id = device_id % need_dev_num0 | |||
| block_id = device_id % need_dev_num0 | |||
| device_index = self.id_to_list(block_id, self.strategy1[1]) | |||
| print(device_index) | |||
| for i in axis: | |||
| device_index[i] = 0 | |||
| print(device_index) | |||
| self.out_id = self.list_to_id(device_index, self.out_strategy) | |||
| print(self.out_id) | |||
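| # Decode a flat block id into per-dimension indices under the given partition shape (mixed-radix decode). | |||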
| def id_to_list(self, id_, shape): | |||
| result = [] | |||
| r = id_ | |||
| for i in range(0, len(shape)): | |||
| v = 1 | |||
| for j in range(i + 1, len(shape)): | |||
| v = v * shape[j] | |||
| result.append(r // v) | |||
| r = r % v | |||
| return result | |||
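| # Inverse of id_to_list: encode per-dimension indices back into a flat block id. | |||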
| def list_to_id(self, id_list, shape): | |||
| result = 0 | |||
| for i in range(0, len(id_list)): | |||
| v = 1 | |||
| for j in range(i + 1, len(id_list)): | |||
| v = v * shape[j] | |||
| result = result + id_list[i] * v | |||
| return result | |||
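| # Split the full array into per-device blocks along each partitioned dimension, in the same order as the flat block ids. | |||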
| def get_parallel_blocks(self, input_, strategy): | |||
| blocks = [input_] | |||
| i = 0 | |||
| for stra in strategy: | |||
| temp = [] | |||
| while len(blocks) > 0: | |||
| block = blocks.pop(0) | |||
| temp.extend(np.split(block, stra, axis=i)) | |||
| blocks.extend(temp) | |||
| i += 1 | |||
| return blocks | |||
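| # Single-device reference: add + reduce_mean on the full inputs. | |||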
| def forward_mindspore_impl(self): | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2) | |||
| net = ReduceMean(keep_dims=self.keep_dims, axis=self.axis) | |||
| out = net(x, y) | |||
| return out.asnumpy() | |||
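| # Semi-auto-parallel run: compile with the full tensors but feed this rank's input slices at run time. | |||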
| def forward_mindspore_parallel_impl(self): | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2) | |||
| inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1]) | |||
| inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1]) | |||
| x1 = Tensor(inputs_x[self.x_id]) | |||
| y1 = Tensor(inputs_y[self.y_id]) | |||
| net = ReduceMean(keep_dims=self.keep_dims, axis=self.axis, strategy0=self.strategy0, strategy1=self.strategy1) | |||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||
| net.set_auto_parallel() | |||
| out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1]) | |||
| return out.asnumpy() | |||
| def grad_mindspore_impl(self): | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2) | |||
| out_grad = Tensor(self.output_grad_np) | |||
| net = ReduceMean(keep_dims=self.keep_dims, axis=self.axis) | |||
| grad_net = Grad(net) | |||
| grad_net.set_train() | |||
| input_grad = grad_net(x, y, out_grad) | |||
| return input_grad | |||
| def grad_mindspore_parallel_impl(self): | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2) | |||
| output_grad = Tensor(self.output_grad_np) | |||
| inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1]) | |||
| inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1]) | |||
| outgrads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy) | |||
| x1 = Tensor(inputs_x[self.x_id]) | |||
| y1 = Tensor(inputs_y[self.y_id]) | |||
| output_grad1 = Tensor(outgrads[self.out_id]) | |||
| net = ReduceMean(keep_dims=self.keep_dims, axis=self.axis, strategy0=self.strategy0, strategy1=self.strategy1) | |||
| grad_net = Grad(net) | |||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||
| grad_net.set_auto_parallel() | |||
| grad_net.set_train() | |||
| input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad1], | |||
| parallel_inputs_run=[x1, y1, output_grad1]) | |||
| return input_grad | |||
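| # Slice the single-device output with out_strategy and check this rank's block against the parallel output. | |||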
| def forward_cmp(self): | |||
| out_mindspore = self.forward_mindspore_impl() | |||
| out_mindspore_parallel = self.forward_mindspore_parallel_impl() | |||
| out_blocks = self.get_parallel_blocks(out_mindspore, self.out_strategy) | |||
| assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.0001, 0.001) | |||
| def grad_cmp(self): | |||
| input_grad_mindspore = self.grad_mindspore_impl() | |||
| input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl() | |||
| input_grad_mindspore0 = input_grad_mindspore[0].asnumpy() | |||
| input_grad_mindspore1 = input_grad_mindspore[1].asnumpy() | |||
| input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy() | |||
| input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy() | |||
| input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1]) | |||
| input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2]) | |||
| assert allclose_nparray(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001) | |||
| assert allclose_nparray(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001) | |||
| def test_reid_reducemean_input_64x16(): | |||
| fact = ReduceMeanFactory(input_shape=(64 * 16,), keep_dims=False, axis=(-1,), strategy0=(0, (4,), (4,)), | |||
| strategy1=(0, (4,))) | |||
| fact.forward_cmp() | |||
| def test_grad_reid_reducemean_input_64x16(): | |||
| fact = ReduceMeanFactory(input_shape=(64 * 16,), keep_dims=False, axis=(-1,), strategy0=(0, (4,), (4,)), | |||
| strategy1=(0, (4,))) | |||
| fact.grad_cmp() | |||
| def test_reid_reducemean_input_64x128x32x32(): | |||
| fact = ReduceMeanFactory(input_shape=(64, 128, 32, 32), keep_dims=True, axis=(2, 3), | |||
| strategy0=(0, (2, 1, 2, 1), (2, 1, 2, 1)), strategy1=(0, (2, 1, 2, 1))) | |||
| fact.forward_cmp() | |||
| def test_grad_reid_reducemean_input_64x128x32x32(): | |||
| fact = ReduceMeanFactory(input_shape=(64, 128, 32, 32), keep_dims=True, axis=(2, 3), | |||
| strategy0=(0, (2, 1, 2, 1), (2, 1, 2, 1)), strategy1=(0, (2, 1, 2, 1))) | |||
| fact.grad_cmp() | |||
| @@ -1,27 +0,0 @@ | |||
| #!/bin/bash | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| for((i=0;i<4;i++)); | |||
| do | |||
| rm -rf device$i | |||
| mkdir device$i | |||
| cd device$i | |||
| mkdir output | |||
| source ../../dist_env_4p.sh $i | |||
| env >log$i.log | |||
| pytest -s ../test_reducemean_parallel_4p.py>../../log/test_reducemean_parallel_4p_log$i.log 2>&1 & | |||
| cd .. | |||
| done | |||
| @@ -1,206 +0,0 @@ | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| import os | |||
| import numpy as np | |||
| import pytest | |||
| from numpy import allclose as allclose_nparray | |||
| import mindspore.communication.management as distributedTool | |||
| from mindspore import context | |||
| from mindspore.common.tensor import Tensor | |||
| from mindspore.nn import Cell | |||
| from mindspore.ops import operations as P | |||
| from mindspore.ops.composite import grad_all_with_sens | |||
| device_num = 4 | |||
| device_id = int(os.environ["RANK_ID"]) | |||
| path = "./output/" | |||
| def setup_module(): | |||
| print("~~~~~~~~~~~set up~~~~~~~~~~~~~") | |||
| context.set_context(mode=context.GRAPH_MODE) | |||
| context.set_auto_parallel_context(device_num=device_num, global_rank=device_id) | |||
| distributedTool.init() | |||
| distributedTool.create_group("0-3", [0, 1, 2, 3]) | |||
| print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~") | |||
| def teardown_module(): | |||
| print("~~~~~~~~~~~~tear down~~~~~~~~~~") | |||
| class Grad(Cell): | |||
| def __init__(self, network): | |||
| super(Grad, self).__init__() | |||
| self.network = network | |||
| def construct(self, x, y, output_grad): | |||
| return grad_all_with_sens(self.network)(x, y, output_grad) | |||
| class Reshape(Cell): | |||
| def __init__(self, target_shape, strategy0=None, strategy1=None): | |||
| super(Reshape, self).__init__() | |||
| self.add = P.TensorAdd(strategy=strategy0) | |||
| self.reshape = P.Reshape(strategy=strategy1) | |||
| self.shape = tuple(target_shape) | |||
| def construct(self, input1, input2): | |||
| x = self.add(input1, input2) | |||
| return self.reshape(x, self.shape) | |||
| class ReshapeFactory: | |||
| def __init__(self, input_shape, target_shape, strategy0, strategy1): | |||
| prefix = "" | |||
| size = 1 | |||
| for s in input_shape: | |||
| prefix = prefix + str(s) | |||
| size = size * s | |||
| self.prefix = prefix | |||
| number_range = min(1000, size) | |||
| self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype( | |||
| np.float32) | |||
| self.input_np2 = np.reshape(np.arange(0, size) % number_range - number_range / 4, input_shape).astype( | |||
| np.float32) | |||
| target_size = 1 | |||
| for s in target_shape: | |||
| target_size = target_size * s | |||
| number_range = min(1000, target_size) | |||
| self.output_grad_np = np.reshape(np.arange(0, target_size) % number_range - number_range / 2, | |||
| target_shape).astype(np.float32) | |||
| self.target_shape = target_shape | |||
| self.strategy0 = strategy0 | |||
| self.strategy1 = strategy1 | |||
| out_strategy = [1] * len(target_shape) | |||
| out_strategy[0] = strategy1[1][0] | |||
| self.out_strategy = out_strategy | |||
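| # Only the first output dimension stays partitioned; e.g. target (128, 25088) with strategy1=(0, (4, 1, 1, 1)) gives out_strategy [4, 1]. | |||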
| need_dev_num0 = 1 | |||
| need_dev_num1 = 1 | |||
| for s in strategy0[1]: | |||
| need_dev_num0 = need_dev_num0 * s | |||
| for s in out_strategy: | |||
| need_dev_num1 = need_dev_num1 * s | |||
| self.x_id = device_id % need_dev_num0 | |||
| self.y_id = device_id % need_dev_num0 | |||
| self.out_id = device_id % need_dev_num1 | |||
| def get_parallel_blocks(self, input_, strategy): | |||
| blocks = [input_] | |||
| i = 0 | |||
| for stra in strategy: | |||
| temp = [] | |||
| while len(blocks) > 0: | |||
| block = blocks.pop(0) | |||
| temp.extend(np.split(block, stra, axis=i)) | |||
| blocks.extend(temp) | |||
| i += 1 | |||
| return blocks | |||
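| # Single-device reference: add the full inputs, then reshape to target_shape. | |||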
| def forward_reshape_mindspore_impl(self): | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2) | |||
| net = Reshape(self.target_shape) | |||
| out = net(x, y) | |||
| return out.asnumpy() | |||
| def forward_reshape_mindspore_parallel_impl(self): | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2) | |||
| inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1]) | |||
| inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1]) | |||
| x1 = Tensor(inputs_x[self.x_id]) | |||
| y1 = Tensor(inputs_y[self.y_id]) | |||
| net = Reshape(self.target_shape, strategy0=self.strategy0, strategy1=self.strategy1) | |||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||
| net.set_auto_parallel() | |||
| out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1]) | |||
| return out.asnumpy() | |||
| def grad_reshape_mindspore_impl(self): | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2) | |||
| output_grad = Tensor(self.output_grad_np) | |||
| net = Reshape(self.target_shape) | |||
| grad_net = Grad(net) | |||
| grad_net.set_train() | |||
| input_grad = grad_net(x, y, output_grad) | |||
| return input_grad | |||
| def grad_reshape_mindspore_parallel_impl(self): | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2) | |||
| output_grad = Tensor(self.output_grad_np) | |||
| inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1]) | |||
| inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1]) | |||
| outgrads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy) | |||
| x1 = Tensor(inputs_x[self.x_id]) | |||
| y1 = Tensor(inputs_y[self.y_id]) | |||
| output_grad1 = Tensor(outgrads[self.out_id]) | |||
| net = Reshape(self.target_shape, strategy0=self.strategy0, strategy1=self.strategy1) | |||
| grad_net = Grad(net) | |||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||
| grad_net.set_auto_parallel() | |||
| grad_net.set_train() | |||
| input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad1], | |||
| parallel_inputs_run=[x1, y1, output_grad1]) | |||
| return input_grad | |||
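| # Slice the single-device reshape output with out_strategy and compare this rank's block to the parallel output. | |||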
| def forward_reshape_cmp(self): | |||
| out_mindspore = self.forward_reshape_mindspore_impl() | |||
| out_mindspore_parallel = self.forward_reshape_mindspore_parallel_impl() | |||
| out_blocks = self.get_parallel_blocks(out_mindspore, self.out_strategy) | |||
| assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.0001, 0.001) | |||
| def grad_reshape_cmp(self): | |||
| input_grad_mindspore = self.grad_reshape_mindspore_impl() | |||
| input_grad_mindspore_parallel = self.grad_reshape_mindspore_parallel_impl() | |||
| input_grad_mindspore0 = input_grad_mindspore[0].asnumpy() | |||
| input_grad_mindspore1 = input_grad_mindspore[1].asnumpy() | |||
| input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy() | |||
| input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy() | |||
| input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1]) | |||
| input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2]) | |||
| assert allclose_nparray(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001) | |||
| assert allclose_nparray(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001) | |||
| @pytest.mark.reid_forward | |||
| def test_reid_reshape_input_128x512x7x7_target_128x25088(): | |||
| fact = ReshapeFactory(input_shape=(128, 512, 7, 7), target_shape=(128, 25088), | |||
| strategy0=(0, (4, 1, 1, 1), (4, 1, 1, 1)), strategy1=(0, (4, 1, 1, 1))) | |||
| fact.forward_reshape_cmp() | |||
| @pytest.mark.reid_grad | |||
| def test_reid_reshape_grad_input_128x512x7x7_target_128x25088(): | |||
| fact = ReshapeFactory(input_shape=(128, 512, 7, 7), target_shape=(128, 25088), | |||
| strategy0=(0, (4, 1, 1, 1), (4, 1, 1, 1)), strategy1=(0, (4, 1, 1, 1))) | |||
| fact.grad_reshape_cmp() | |||
| @pytest.mark.reid_forward | |||
| def test_reid_reshape_input_128x64_target_128x64x1x1(): | |||
| fact = ReshapeFactory(input_shape=(128, 64), target_shape=(128, 64, 1, 1), strategy0=(0, (2, 1), (2, 1)), | |||
| strategy1=(0, (2, 1))) | |||
| fact.forward_reshape_cmp() | |||
| @pytest.mark.reid_grad | |||
| def test_reid_reshape_grad_input_128x64_target_128x64x1x1(): | |||
| fact = ReshapeFactory(input_shape=(128, 64), target_shape=(128, 64, 1, 1), strategy0=(0, (2, 1), (2, 1)), | |||
| strategy1=(0, (2, 1))) | |||
| fact.grad_reshape_cmp() | |||
| @@ -1,27 +0,0 @@ | |||
| #!/bin/bash | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| for((i=0;i<4;i++)); | |||
| do | |||
| rm -rf device$i | |||
| mkdir device$i | |||
| cd device$i | |||
| mkdir output | |||
| source ../../dist_env_4p.sh $i | |||
| env >log$i.log | |||
| pytest -s ../test_reshape_parallel_4p.py>../../log/test_reshape_parallel_4p_log$i.log 2>&1 & | |||
| cd .. | |||
| done | |||
| @@ -1,235 +0,0 @@ | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| import os | |||
| import numpy as np | |||
| from numpy import allclose as allclose_nparray | |||
| import mindspore.communication.management as distributedTool | |||
| from mindspore import context | |||
| from mindspore.common.tensor import Tensor | |||
| from mindspore.nn import Cell | |||
| from mindspore.ops import operations as P | |||
| from mindspore.ops.composite import grad_all_with_sens | |||
| device_num = 4 | |||
| device_id = int(os.environ["RANK_ID"]) | |||
| path = "./output/" | |||
| def setup_module(): | |||
| print("~~~~~~~~~~~set up~~~~~~~~~~~~~") | |||
| context.set_context(mode=context.GRAPH_MODE) | |||
| context.set_auto_parallel_context(device_num=device_num, global_rank=device_id) | |||
| distributedTool.init() | |||
| distributedTool.create_group("0-3", [0, 1, 2, 3]) | |||
| print("~~~~~~~~~~~set up finished~~~~~~~~~~~~~") | |||
| def teardown_module(): | |||
| print("~~~~~~~~~~~~tear down~~~~~~~~~~") | |||
| class Net(Cell): | |||
| def __init__(self, perm_in, strategy0=None, strategy1=None): | |||
| super(Net, self).__init__() | |||
| self.add = P.TensorAdd(strategy=strategy0) | |||
| self.transpose = P.Transpose(strategy=strategy1) | |||
| self.perm_in = perm_in | |||
| def construct(self, x, y): | |||
| out = self.add(x, y) | |||
| return self.transpose(out, self.perm_in) | |||
| class Grad(Cell): | |||
| def __init__(self, network): | |||
| super(Grad, self).__init__() | |||
| self.network = network | |||
| def construct(self, x, y, output_grad): | |||
| return grad_all_with_sens(self.network)(x, y, output_grad) | |||
| class TransposeFactory: | |||
| def __init__(self, input_shape, perm_in, strategy0, strategy1): | |||
| prefix = "" | |||
| size = 1 | |||
| for s in input_shape: | |||
| prefix = prefix + str(s) | |||
| size = size * s | |||
| self.prefix = prefix | |||
| number_range = min(1000, size) | |||
| self.input_np1 = np.reshape(np.arange(0, size) % number_range - number_range / 2, input_shape).astype( | |||
| np.float32) | |||
| self.input_np2 = np.reshape(np.arange(0, size) % number_range - number_range / 4, input_shape).astype( | |||
| np.float32) | |||
| target_shape = self.input_np1.transpose(perm_in).shape | |||
| target_size = 1 | |||
| for s in target_shape: | |||
| target_size = target_size * s | |||
| number_range = min(1000, target_size) | |||
| self.target_shape = target_shape | |||
| self.output_grad_np = np.reshape(np.arange(0, target_size) % number_range - number_range / 2, | |||
| target_shape).astype(np.float32) | |||
| self.perm_in = perm_in | |||
| self.strategy0 = strategy0 | |||
| self.strategy1 = strategy1 | |||
| out_strategy = [] | |||
| for i in perm_in: | |||
| out_strategy.append(strategy1[1][i]) | |||
| self.out_strategy = out_strategy | |||
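| # The output strategy is strategy1 permuted by perm_in; e.g. strategy1=(0, (1, 4)) with perm_in=(1, 0) gives out_strategy [4, 1]. | |||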
| need_dev_num0 = 1 | |||
| need_dev_num1 = 1 | |||
| for s in strategy0[1]: | |||
| need_dev_num0 = need_dev_num0 * s | |||
| for s in out_strategy: | |||
| need_dev_num1 = need_dev_num1 * s | |||
| self.x_id = device_id % need_dev_num0 | |||
| self.y_id = device_id % need_dev_num0 | |||
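| # Map this rank to its output block: decode the block id under strategy1, permute the indices by perm_in, then re-encode with out_strategy. | |||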
| device_index = self.id_to_list(device_id % need_dev_num1, | |||
| self.strategy1[1])  # per-dimension index of this rank's block before the transpose | |||
| device_index_transpose = [] | |||
| for i in perm_in: | |||
| device_index_transpose.append(device_index[i]) | |||
| self.out_id = self.list_to_id(device_index_transpose, self.out_strategy) | |||
| def get_parallel_blocks(self, input_, strategy): | |||
| blocks = [input_] | |||
| i = 0 | |||
| for stra in strategy: | |||
| temp = [] | |||
| while len(blocks) > 0: | |||
| block = blocks.pop(0) | |||
| temp.extend(np.split(block, stra, axis=i)) | |||
| blocks.extend(temp) | |||
| i += 1 | |||
| return blocks | |||
| def id_to_list(self, id_, shape): | |||
| result = [] | |||
| r = id_ | |||
| for i in range(0, len(shape)): | |||
| v = 1 | |||
| for j in range(i + 1, len(shape)): | |||
| v = v * shape[j] | |||
| result.append(r // v) | |||
| r = r % v | |||
| return result | |||
| def list_to_id(self, id_list, shape): | |||
| result = 0 | |||
| for i in range(0, len(id_list)): | |||
| v = 1 | |||
| for j in range(i + 1, len(id_list)): | |||
| v = v * shape[j] | |||
| result = result + id_list[i] * v | |||
| return result | |||
| def forward_mindspore_impl(self): | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2) | |||
| net = Net(self.perm_in) | |||
| out = net(x, y) | |||
| return out.asnumpy() | |||
| def forward_mindspore_parallel_impl(self): | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2) | |||
| inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1]) | |||
| inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1]) | |||
| x1 = Tensor(inputs_x[self.x_id]) | |||
| y1 = Tensor(inputs_y[self.y_id]) | |||
| net = Net(self.perm_in, strategy0=self.strategy0, strategy1=self.strategy1) | |||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||
| net.set_auto_parallel() | |||
| out = net(x, y, parallel_inputs_compile=[x, y], parallel_inputs_run=[x1, y1]) | |||
| return out.asnumpy() | |||
| def grad_mindspore_impl(self): | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2) | |||
| output_grad = Tensor(self.output_grad_np) | |||
| net = Net(self.perm_in) | |||
| grad_net = Grad(net) | |||
| grad_net.set_train() | |||
| input_grad = grad_net(x, y, output_grad) | |||
| return input_grad | |||
| def grad_mindspore_parallel_impl(self): | |||
| x = Tensor(self.input_np1) | |||
| y = Tensor(self.input_np2) | |||
| output_grad = Tensor(self.output_grad_np) | |||
| inputs_x = self.get_parallel_blocks(self.input_np1, self.strategy0[1]) | |||
| inputs_y = self.get_parallel_blocks(self.input_np2, self.strategy0[1]) | |||
| outgrads = self.get_parallel_blocks(self.output_grad_np, self.out_strategy) | |||
| x1 = Tensor(inputs_x[self.x_id]) | |||
| y1 = Tensor(inputs_y[self.y_id]) | |||
| output_grad1 = Tensor(outgrads[self.out_id]) | |||
| net = Net(self.perm_in, strategy0=self.strategy0, strategy1=self.strategy1) | |||
| grad_net = Grad(net) | |||
| context.set_auto_parallel_context(parallel_mode="semi_auto_parallel") | |||
| grad_net.set_auto_parallel() | |||
| grad_net.set_train() | |||
| input_grad = grad_net(x, y, output_grad, parallel_inputs_compile=[x, y, output_grad1], | |||
| parallel_inputs_run=[x1, y1, output_grad1]) | |||
| return input_grad | |||
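| # Slice the single-device transpose output with out_strategy and compare this rank's block to the parallel output. | |||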
| def forward_transpose_cmp(self): | |||
| out_mindspore = self.forward_mindspore_impl() | |||
| out_mindspore_parallel = self.forward_mindspore_parallel_impl() | |||
| out_blocks = self.get_parallel_blocks(out_mindspore, self.out_strategy) | |||
| assert np.allclose(out_blocks[self.out_id], out_mindspore_parallel, 0.0001, 0.001) | |||
| def grad_transpose_cmp(self): | |||
| input_grad_mindspore = self.grad_mindspore_impl() | |||
| input_grad_mindspore_parallel = self.grad_mindspore_parallel_impl() | |||
| input_grad_mindspore0 = input_grad_mindspore[0].asnumpy() | |||
| input_grad_mindspore1 = input_grad_mindspore[1].asnumpy() | |||
| input_grad_mindspore_parallel0 = input_grad_mindspore_parallel[0].asnumpy() | |||
| input_grad_mindspore_parallel1 = input_grad_mindspore_parallel[1].asnumpy() | |||
| input_grad_blocks_0 = self.get_parallel_blocks(input_grad_mindspore0, self.strategy0[1]) | |||
| input_grad_blocks_1 = self.get_parallel_blocks(input_grad_mindspore1, self.strategy0[2]) | |||
| assert allclose_nparray(input_grad_blocks_0[self.x_id], input_grad_mindspore_parallel0, 0.0001, 0.0001) | |||
| assert allclose_nparray(input_grad_blocks_1[self.y_id], input_grad_mindspore_parallel1, 0.0001, 0.0001) | |||
| def test_reid_transpose_input_256x512_output_512x256_perm_1x0(): | |||
| fact = TransposeFactory((256, 512), (1, 0), strategy0=(0, (2, 2), (2, 2)), strategy1=(0, (2, 2))) | |||
| fact.forward_transpose_cmp() | |||
| def test_reid_grad_transpose_input_256x512_output_512x256_perm_1x0(): | |||
| fact = TransposeFactory((256, 512), (1, 0), strategy0=(0, (2, 2), (2, 2)), strategy1=(0, (2, 2))) | |||
| fact.grad_transpose_cmp() | |||
| def test_reid_transpose_input_512x256_output_256x512_perm_1x0(): | |||
| fact = TransposeFactory((512, 256), (1, 0), strategy0=(0, (4, 1), (4, 1)), strategy1=(0, (1, 4))) | |||
| fact.forward_transpose_cmp() | |||
| def test_reid_grad_transpose_input_512x256_output_256x512_perm_1x0(): | |||
| fact = TransposeFactory((512, 256), (1, 0), strategy0=(0, (4, 1), (4, 1)), strategy1=(0, (1, 4))) | |||
| fact.grad_transpose_cmp() | |||
| def test_reid_transpose_input_512x256_output_256x512_perm_1x0_repeat(): | |||
| fact = TransposeFactory((512, 256), (1, 0), strategy0=(0, (2, 1), (2, 1)), strategy1=(0, (2, 1))) | |||
| fact.forward_transpose_cmp() | |||
| def test_reid_grad_transpose_input_512x256_output_256x512_perm_1x0_repeat(): | |||
| fact = TransposeFactory((512, 256), (1, 0), strategy0=(0, (2, 1), (2, 1)), strategy1=(0, (2, 1))) | |||
| fact.grad_transpose_cmp() | |||
| @@ -1,27 +0,0 @@ | |||
| #!/bin/bash | |||
| # Copyright 2019 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| for((i=0;i<4;i++)); | |||
| do | |||
| rm -rf device$i | |||
| mkdir device$i | |||
| cd device$i | |||
| mkdir output | |||
| source ../../dist_env_4p.sh $i | |||
| env >log$i.log | |||
| pytest -s ../test_transpose_parallel_4p.py>../../log/test_transpose_parallel_4p_log$i.log 2>&1 & | |||
| cd .. | |||
| done | |||