@@ -14,14 +14,3 @@
# ============================================================================
"""custom ops"""
from .batch_matmul_impl import CusBatchMatMul
from .cholesky_trsm_impl import CusCholeskyTrsm
from .fused_abs_max1_impl import CusFusedAbsMax1
from .img2col_impl import CusImg2Col
from .matmul_cube_dense_left_impl import CusMatMulCubeDenseLeft
from .matmul_cube_dense_right_impl import CusMatMulCubeDenseRight
from .matmul_cube_fracz_left_cast_impl import CusMatMulCubeFraczLeftCast
from .matmul_cube_fracz_right_mul_impl import CusMatMulCubeFraczRightMul
from .matmul_cube_impl import CusMatMulCube
from .matrix_combine_impl import CusMatrixCombine
from .transpose02314_impl import CusTranspose02314
@@ -14,29 +14,31 @@
# ============================================================================
"""batch_matmul_impl"""
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
from te import tik
from topi.cce import util

cus_batchmatmul_op_info = TBERegOp("CusBatchMatMul") \
    .fusion_type("OPAQUE") \
    .async_flag(False) \
    .binfile_name("batchmatmul.so") \
    .compute_cost(10) \
    .kernel_name("CusBatchMatMul") \
    .partial_flag(True) \
    .input(0, "x1", False, "required", "all") \
    .input(1, "x2", False, "required", "all") \
    .output(0, "y", False, "required", "all") \
    .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \
    .get_op_info()


def _get_flattern_shape(shape):
    flattern_shape = 1
    for dim in shape:
        flattern_shape *= dim
    return (flattern_shape,)
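
# Illustrative only: _get_flattern_shape ("flattern" as spelled in this file)
# collapses a shape into its total element count, e.g. a hypothetical call
# _get_flattern_shape((8, 128, 128)) returns (131072,). The 1-D result is used
# below to declare the global-memory tensors, so all kernel indexing works on
# flat element offsets.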


def _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index, res, res_index):
    input_1_local_UB = tik_instance.Tensor(dtype, [128], name="input_1_local_UB", scope=tik.scope_ubuf)
    t_1_0_local_UB = tik_instance.Tensor(dtype, [64 * 128], name="t_1_0_local_UB", scope=tik.scope_ubuf)
@@ -66,12 +68,13 @@ def _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_
                          matmul_hybrid_f_t_local_UB, 1, 1, 1, 1, 8, 8, 8)
        tik_instance.data_move(res[res_index + thread_idx2 * 64],
                               matmul_hybrid_f_t_local_UB, 0, 1, 8, 0, 0)


def _inner_matmul_new_1_64_32_64(tik_instance, dtype, input1, input1_index, input2, input2_index, res, res_index):
    input_1_local_UB = tik_instance.Tensor(dtype, [64], name="input_1_local_UB", scope=tik.scope_ubuf)
    tik_instance.data_move(input_1_local_UB, input1[input1_index], 0, 1, 8, 0, 0)
    with tik_instance.for_range(0, 2, thread_num=2) as thread_idx2:
        input_2_local_UB = tik_instance.Tensor(dtype, [32 * 64], name="input_2_local_UB",
                                               scope=tik.scope_ubuf)
        t_1_local_UB = input_2_local_UB
        matmul_hybrid_f_t_local_UB = tik_instance.Tensor(dtype, [32], name="matmul_hybrid_f_t_local_UB",
@@ -83,6 +86,8 @@ def _inner_matmul_new_1_64_32_64(tik_instance, dtype, input1, input1_index, inpu
                           1, 1, 1, 8)
        tik_instance.data_move(res[res_index + thread_idx2 * 32],
                               matmul_hybrid_f_t_local_UB, 0, 1, 4, 0, 0)


@op_info_register(cus_batchmatmul_op_info)
def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=True, kernel_name="batchmatmul"):
    if util.get_product_version() == util.VERSION_MINI:
@@ -97,51 +102,54 @@ def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=Tr
                                              dtype, input_x2.get("dtype").lower()))
    input_shape = (tuple(x1_shape), tuple(x2_shape), dtype, transpose_a, transpose_b)
    support_shape = [((8, 128, 128), (8, 128, 128), "float32", False, True),
                     ((36, 128, 128), (36, 128, 128), "float32", False, True),
                     ((5, 128, 128), (5, 128, 128), "float32", False, True),
                     ((18, 128, 128), (18, 128, 128), "float32", False, True),
                     ((16, 128, 128), (16, 128, 128), "float32", False, True),
                     ((9, 128, 128), (9, 128, 128), "float32", False, True),
                     ((1, 64, 64), (1, 64, 64), "float32", False, True),
                     ((1, 128, 128), (1, 128, 128), "float32", False, True),
                     ((4, 128, 128), (4, 128, 128), "float32", False, True),
                     ((2, 128, 128), (2, 128, 128), "float32", False, True)]
    if input_shape not in support_shape:
        raise RuntimeError("input_shape %s is not supported" % str(input_shape))
    # if not transpose_a and transpose_b:
    batch, m, k = x1_shape
    _, n, _ = x2_shape
    input1_shape = _get_flattern_shape(x1_shape)
    input1 = tik_instance.Tensor(dtype, input1_shape, name="input1", scope=tik.scope_gm)
    input2_shape = _get_flattern_shape(x2_shape)
    input2 = tik_instance.Tensor(dtype, input2_shape, name="input2", scope=tik.scope_gm)
    output_shape = x1_shape
    res_shape = _get_flattern_shape(output_shape)
    res = tik_instance.Tensor(dtype, res_shape, name="res", scope=tik.scope_gm)
    if input_shape == ((36, 128, 128), (36, 128, 128), "float32", False, True):
        with tik_instance.for_range(0, 18, block_num=18) as block_idx:
            with tik_instance.for_range(0, 2) as cc0:
                with tik_instance.for_range(0, 128, thread_num=2) as cc1:
                    input1_index = block_idx * 32768 + cc0 * 16384 + cc1 * 128
                    input2_index = block_idx * 32768 + cc0 * 16384
                    res_index = block_idx * 32768 + cc0 * 16384 + cc1 * 128
                    _inner_matmul_new(tik_instance, dtype,
                                      input1, input1_index,
                                      input2, input2_index,
                                      res, res_index)
    if input_shape == ((5, 128, 128), (5, 128, 128), "float32", False, True):
        with tik_instance.for_range(0, 30, block_num=30) as block_idx:
            with tik_instance.for_range(0, 11) as cc1_db:
                with tik_instance.for_range(0, 2, thread_num=2) as thread_idx:
                    with tik_instance.if_scope(((((block_idx % 6) * 22) + (cc1_db * 2) + thread_idx) < 128)):
                        input_1_local_UB = tik_instance.Tensor(dtype, [128], name="input_1_local_UB",
                                                               scope=tik.scope_ubuf)
                        t_1_0_local_UB = tik_instance.Tensor(dtype, [64 * 128], name="t_1_0_local_UB",
                                                             scope=tik.scope_ubuf)
                        tik_instance.data_move(input_1_local_UB, input1[
                            (block_idx // 6) * 16384 + (block_idx % 6) * 2816 + cc1_db * 256 + thread_idx * 128], 0, 1,
                                               16, 0, 0)
                        with tik_instance.for_range(0, 2) as vec_i:
                            tik_instance.vadds(64, t_1_0_local_UB[vec_i * 64], input_1_local_UB[vec_i * 64], 0,
                                               64, 1, 1, 16, 0)
@@ -150,58 +158,61 @@ def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=Tr
                                                                   scope=tik.scope_ubuf)
                            t_1_local_UB = input_2_local_UB
                            bisec_last_axis_local_UB = input_2_local_UB
                            matmul_hybrid_f_t_local_UB = tik_instance.Tensor(dtype, [64],
                                                                             name="matmul_hybrid_f_t_local_UB",
                                                                             scope=tik.scope_ubuf)
                            matmul_hybrid_f_t_local_UB_dst_tmp = tik_instance.Tensor(dtype, [64],
                                                                                     name="matmul_hybrid_f_t_local_UB_dst_tmp",
                                                                                     scope=tik.scope_ubuf)
                            tik_instance.vector_dup(64, matmul_hybrid_f_t_local_UB, 0, 1, 1, 8)
                            tik_instance.data_move(input_2_local_UB,
                                                   input2[(block_idx // 6) * 16384 + thread_idx2 * 8192], 0, 1,
                                                   1024, 0, 0)
                            tik_instance.vmul(64, t_1_local_UB, t_1_0_local_UB, input_2_local_UB, 128, 1, 1, 1, 8, 8, 8)
                            tik_instance.vadd(64, bisec_last_axis_local_UB, t_1_local_UB, t_1_local_UB[64], 64, 1, 1, 1,
                                              16, 16, 16)
                            tik_instance.vector_dup(64, matmul_hybrid_f_t_local_UB_dst_tmp, 0, 1, 1, 8)
                            with tik_instance.for_range(0, 64) as cc6:
                                tik_instance.vcadd(64, matmul_hybrid_f_t_local_UB_dst_tmp[cc6],
                                                   bisec_last_axis_local_UB[cc6 * 128],
                                                   1, 1, 1, 8)
                            tik_instance.vadd(64, matmul_hybrid_f_t_local_UB, matmul_hybrid_f_t_local_UB_dst_tmp,
                                              matmul_hybrid_f_t_local_UB, 1, 1, 1, 1, 8, 8, 8)
                            tik_instance.data_move(
                                res[(block_idx // 6) * 16384 + (block_idx % 6) * 2816 + cc1_db * 256 +
                                    thread_idx * 128 + thread_idx2 * 64],
                                matmul_hybrid_f_t_local_UB, 0, 1, 8, 0, 0)
    if input_shape == ((18, 128, 128), (18, 128, 128), "float32", False, True):
        with tik_instance.for_range(0, 18, block_num=18) as block_idx:
            with tik_instance.for_range(0, 128, thread_num=2) as cc0:
                input1_index = block_idx * 16384 + cc0 * 128
                input2_index = block_idx * 16384
                res_index = block_idx * 16384 + cc0 * 128
                _inner_matmul_new(tik_instance, dtype,
                                  input1, input1_index,
                                  input2, input2_index,
                                  res, res_index)
    if input_shape == ((9, 128, 128), (9, 128, 128), "float32", False, True):
        with tik_instance.for_range(0, 27, block_num=27) as block_idx:
            with tik_instance.for_range(0, 42, thread_num=2) as cc0:
                input1_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + cc0 * 128
                input2_index = (block_idx // 3) * 16384
                res_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + cc0 * 128
                _inner_matmul_new(tik_instance, dtype,
                                  input1, input1_index,
                                  input2, input2_index,
                                  res, res_index)
            with tik_instance.if_scope((block_idx % 3) < 2):
                input1_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + 42 * 128
                input2_index = (block_idx // 3) * 16384
                res_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + 42 * 128
                _inner_matmul_new(tik_instance, dtype,
                                  input1, input1_index,
                                  input2, input2_index,
                                  res, res_index)
    if input_shape == ((1, 64, 64), (1, 64, 64), "float32", False, True):
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            with tik_instance.for_range(0, 2, thread_num=2) as cc0:
@@ -209,35 +220,35 @@ def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=Tr
                input2_index = 0
                res_index = block_idx * 128 + cc0 * 64
                _inner_matmul_new_1_64_32_64(tik_instance, dtype,
                                             input1, input1_index,
                                             input2, input2_index,
                                             res, res_index)
    input_shape_list = [((1, 128, 128), (1, 128, 128), "float32", False, True),
                        ((2, 128, 128), (2, 128, 128), "float32", False, True),
                        ((4, 128, 128), (4, 128, 128), "float32", False, True),
                        ((8, 128, 128), (8, 128, 128), "float32", False, True),
                        ((16, 128, 128), (16, 128, 128), "float32", False, True)
                        ]
    if input_shape in input_shape_list:
        block_num = 32
        input1_unit_size = 128
        input2_unint_size = 128 * 128
        with tik_instance.for_range(0, block_num, block_num=block_num) as block_idx:
            block_process_ele_num = (batch * m * k) // block_num
            loop_time = (batch * m * k) // block_num // input1_unit_size
            thread_num = 2
            with tik_instance.for_range(0, loop_time, thread_num=thread_num) as cc0:
                input1_index = block_idx * block_process_ele_num + cc0 * input1_unit_size
                if batch > 1:
                    input2_index = block_idx // (block_num // batch) * input2_unint_size
                else:
                    input2_index = 0
                res_index = block_idx * block_process_ele_num + cc0 * input1_unit_size
                _inner_matmul_new(tik_instance, dtype,
                                  input1, input1_index,
                                  input2, input2_index,
                                  res, res_index)
    tik_instance.BuildCCE(kernel_name, inputs=[input1, input2], outputs=[res])
    return tik_instance
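
# Illustrative only: a pure-Python sketch (not part of the kernel, name is
# hypothetical) of the computation CusBatchMatMul implements for its supported
# shapes, i.e. C[b] = A[b] @ B[b]^T with transpose_a=False, transpose_b=True.
def _batch_matmul_reference_sketch(a, b):
    """Reference semantics for float32 inputs shaped (batch, m, k) and (batch, n, k)."""
    batch, m, k = len(a), len(a[0]), len(a[0][0])
    n = len(b[0])
    # out[i][r][c] accumulates the dot product of row r of A[i] and row c of B[i].
    return [[[sum(a[i][r][t] * b[i][c][t] for t in range(k)) for c in range(n)]
             for r in range(m)] for i in range(batch)]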

@@ -13,24 +13,25 @@
# limitations under the License.
# ============================================================================
"""CusCholeskyTrsm"""
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
from te import tik
from topi.cce import util

cus_cholesky_trsm_op_info = TBERegOp("CusCholeskyTrsm") \
    .fusion_type("OPAQUE") \
    .async_flag(False) \
    .binfile_name("choleskytrsm.so") \
    .compute_cost(10) \
    .kernel_name("CusCholeskyTrsm") \
    .partial_flag(True) \
    .input(0, "x1", False, "required", "all") \
    .output(0, "y", False, "required", "all") \
    .dtype_format(DataType.F32_Default, DataType.F32_Default) \
    .get_op_info()


@op_info_register(cus_cholesky_trsm_op_info)
def CusCholeskyTrsm(input_x, output, kernel_name):
    input_x_shape = input_x.get("shape")
    output_shape = output.get("shape")
    split_dim = 128
@@ -47,34 +48,36 @@ def CusCholeskyTrsm(input_x,output, kernel_name):
    input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm)
    res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm)
    with tik_instance.for_range(0, blocks, block_num=blocks) as block_index:
        input_x_ub = tik_instance.Tensor("float32", (split_dim, split_dim), name="input_x_ub", scope=tik.scope_ubuf)
        temp_ub = tik_instance.Tensor("float32", (split_dim, split_dim), name="temp_ub", scope=tik.scope_ubuf)
        assist_1_ub = tik_instance.Tensor("float32", (split_dim,), name="assist_1_ub", scope=tik.scope_ubuf)
        assist_2_ub = tik_instance.Tensor("float32", (split_dim,), name="assist_2_ub", scope=tik.scope_ubuf)
        with tik_instance.for_range(0, split_dim) as i:
            tik_instance.data_move(input_x_ub[i, 0], input_x[block_index * split_dim + i, block_index * split_dim], 0,
                                   1, vector_repeat_times * 8, 0, 0)
        scalar1 = tik_instance.Scalar("float32", init_value=-0.5)
        with tik_instance.for_range(0, split_dim) as i:
            scalar2 = tik_instance.Scalar("float32")
            tik_instance.vln(64, assist_1_ub[0], input_x_ub[i, 0], vector_repeat_times, 1, 1, 8, 8)
            tik_instance.vmuls(64, assist_2_ub[0], assist_1_ub[0], scalar1, vector_repeat_times, 1, 1, 8, 8)
            tik_instance.vexp(64, assist_1_ub[0], assist_2_ub[0], vector_repeat_times, 1, 1, 8, 8)
            scalar2.set_as(assist_1_ub[i])
            tik_instance.vmuls(64, input_x_ub[i, 0], input_x_ub[i, 0], scalar2, vector_repeat_times, 1, 1, 8, 8)
            with tik_instance.for_range(i + 1, split_dim) as j:
                scalar3 = tik_instance.Scalar("float32")
                scalar3.set_as(input_x_ub[i, j])
                tik_instance.vmuls(64, temp_ub[j, 0], input_x_ub[i, 0], scalar3, vector_repeat_times, 1, 1, 8, 8)
            tik_instance.vsub(64, input_x_ub[i + 1, 0], input_x_ub[i + 1, 0], temp_ub[i + 1, 0],
                              (split_dim - 1 - i) * vector_repeat_times, 1, 1, 1, 8, 8, 8)
        zero = tik_instance.Scalar("float32")
        zero.set_as(0.0)
        one = tik_instance.Scalar("float32")
        one.set_as(1.0)
        with tik_instance.for_range(0, split_dim) as i:
            tik_instance.vector_dup(64, temp_ub[i, 0], zero, vector_repeat_times, 1, 8)
            temp_ub.__setitem__(i * split_dim + i, one)
        chol_diag_element_final = tik_instance.Scalar("float32")
@@ -89,16 +92,19 @@ def CusCholeskyTrsm(input_x,output, kernel_name):
            with tik_instance.for_range(0, i) as j:
                chol_diag_element_loop = tik_instance.Scalar("float32")
                chol_diag_element_loop.set_as(input_x_ub[index, index + 1 + j])
                tik_instance.vmuls(64, assist_2_ub, temp_ub[j + index + 1, 0], chol_diag_element_loop,
                                   vector_repeat_times, 1, 1, 8, 8)
                tik_instance.vadd(64, assist_1_ub, assist_2_ub, assist_1_ub, vector_repeat_times, 1, 1, 1, 8, 8, 8)
            temp_scalar = tik_instance.Scalar("float32")
            temp_scalar.set_as(input_x_ub[index, index])
            chol_diag_element = tik_instance.Scalar("float32")
            chol_diag_element.set_as(1.0 / temp_scalar)
            tik_instance.vsub(64, temp_ub[index, 0], temp_ub[index, 0], assist_1_ub, vector_repeat_times, 1, 1, 1, 8,
                              8, 8)
            tik_instance.vmuls(64, temp_ub[index, 0], temp_ub[index, 0], chol_diag_element, vector_repeat_times, 1, 1,
                               8, 8)
        tik_instance.data_move(res[block_index, 0, 0], temp_ub, 0, 1, 8 * vector_repeat_times * split_dim, 0, 0)
    tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x], outputs=[res])
    return tik_instance
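
# Illustrative only: a NumPy sketch (hypothetical helper, not used by the
# kernel) of what each 128x128 diagonal block appears to compute above: an
# in-place Cholesky factorization of the block followed by inversion of the
# triangular factor, so the op returns the inverse Cholesky factor per block.
def _cholesky_trsm_reference_sketch(block):
    """Reference semantics for one float32 (128, 128) diagonal block."""
    import numpy as np  # local import; illustration only
    l_factor = np.linalg.cholesky(block)
    # Triangular solve L @ X = I, i.e. X = inv(L).
    return np.linalg.solve(l_factor, np.eye(block.shape[0], dtype=block.dtype))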

@@ -17,17 +17,15 @@ limitations under the License.
matmul
"""
from __future__ import absolute_import

import te.lang.cce
import te.platform.cce_params as cce
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
from te import tik
from te import tvm
from topi import generic
from topi.cce import util

# General limitation of the size for input shape: 2**31
SHAPE_SIZE_LIMIT = 2147483648
NoneType = type(None)
@@ -46,6 +44,7 @@ matmul_cube_dense_left_op_info = TBERegOp("CusMatMulCubeDenseLeft") \
    .dtype_format(DataType.F16_Default, DataType.F16_FracNZ, DataType.F16_Default, DataType.F16_FracNZ) \
    .get_op_info()


# pylint: disable=locally-disabled,too-many-arguments,too-many-branches, too-many-statements, too-many-locals,
def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
    """
@@ -115,16 +114,16 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
    if m_shape != 1:
        if n_shape == 1:
            if km_shape % (cce.BLOCK_IN * cce.BLOCK_IN) != 0:
                raise RuntimeError("input shape K1 should be multiple of %d"
                                   % (cce.BLOCK_IN * cce.BLOCK_IN))
        elif km_shape % k_block_size != 0:
            raise RuntimeError(
                "input shape K1 should be multiple of %d" % cce.BLOCK_IN)
    else:
        if km_shape % (cce.BLOCK_IN * cce.BLOCK_IN) != 0:
            raise RuntimeError("input shape K1 should be multiple of %d"
                               % (cce.BLOCK_IN * cce.BLOCK_IN))

    if n_shape % cce.BLOCK_IN != 0 and n_shape != 1:
        raise RuntimeError("input shape N should be 1 or multiple of %d" % cce.BLOCK_IN)
@@ -132,7 +131,7 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
    if len(shape_bias):
        if len(shape_bias) == 1:
            if is_gevm or is_gemv:
                if shape_bias[0] != m_shape * n_shape:
                    raise RuntimeError("broadcast case shape bias for gemv must be equal m*n")
            else:
                if shape_bias[0] != n_shape:
@@ -143,33 +142,36 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
        else:
            raise RuntimeError("unsupport input shape now for batch bias case")


def _get_bias(shape_bias):
    bias_length = shape_bias[0]
    if bias_length % 16 == 0:
        return shape_bias
    else:
        bias_length = (bias_length // 16) * 16 + 16
        shape_bias = []
        shape_bias.append(bias_length)
        return shape_bias
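
# Illustrative only: _get_bias pads a 1-D bias length up to the next multiple
# of 16, e.g. a hypothetical call _get_bias([20]) returns [32], while
# _get_bias([32]) returns [32] unchanged.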


def _get_input_shape(shape_x):
    dim_a = shape_x[0]
    dim_b = shape_x[1]
    res = []
    if dim_a % 16 != 0:
        dim_a = (dim_a // 16) * 16 + 16
        res.append(dim_a)
    else:
        res.append(dim_a)
    if dim_b % 16 != 0:
        dim_b = (dim_b // 16) * 16 + 16
        res.append(dim_b)
    else:
        res.append(dim_b)
    return res
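
# Illustrative only: both dimensions are rounded up to multiples of 16 (the
# cube block size), e.g. a hypothetical _get_input_shape([100, 200]) returns
# [112, 208].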


def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"):
    shape_a = input_x1.get("shape")
    shape_b = input_x2.get("shape")
@@ -184,7 +186,7 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t
    if bias is not None and bool(bias):
        shape_bias = bias.get("shape")
    try:
        trans_a_f = bool(1 - trans_a)
        if src_dtype == "float32" or src_dtype == "int32":
            if len(shape_a) != 2 and len(shape_b) != 2:
                return False
@@ -205,44 +207,46 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t
                return False
            elif shape_a[1] != shape_b[0]:
                return False
            if trans_a_f and trans_b and shape_b[1] == 1:
                return False
        if src_dtype == "float16":
            if len(shape_a) != 2 and len(shape_b) != 2:
                return False
            if trans_a:
                m_shape = shape_a[1]
                k_shape = shape_a[0]
            else:
                m_shape = shape_a[0]
                k_shape = shape_a[1]
            if trans_b:
                n_shape = shape_b[0]
                k_b_shape = shape_b[1]
            else:
                n_shape = shape_b[1]
                k_b_shape = shape_b[0]
            if k_shape != k_b_shape:
                return False
            if m_shape == 1 or n_shape == 1:
                if k_shape % 256 != 0:
                    return False
    except RuntimeError as e:
        return False
    return True
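
# Illustrative only: under the float16 branch above, a hypothetical GEMV case
# with shape_a = (1, 256) and shape_b = (256, 16) passes (k divisible by 256),
# while shape_a = (1, 128) with shape_b = (128, 16) is rejected.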


# pylint: disable=locally-disabled,too-many-arguments, too-many-locals, too-many-statements
# @util.check_input_type(dict, dict, (dict, NoneType), dict, bool, bool, str)
@op_info_register(matmul_cube_dense_left_op_info)
def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False,
                           kernel_name="matmulcube"):
    """
    calculating matrix multiplication with bias, C = A*B + bias, support input
    data with fractal format.
@@ -279,87 +283,87 @@ def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=F
    print(shape_a, shape_b)
    print("============")
    if input_x2.get("format") == "FRACTAL_Z":
        n, c, h, w = shape_b
        c0 = 16
        c1 = c // c0
        if c1 == 0:
            c1 = 1
        shape_b = [n, c1 * h * w * c0]
        shape_a = [n, n]
    if input_x1.get("format") == "FRACTAL_Z":
        n, c, h, w = shape_a
        c0 = 16
        c1 = c // c0
        if c1 == 0:
            c1 = 1
        shape_a = [n, c1 * h * w * c0]
        shape_b = [c1 * h * w * c0, c1 * h * w * c0]
    if input_x2.get("format") == "FRACTAL_NZ":
        shape_a = [shape_b[0], shape_b[0]]
        shape_b = shape_b
    if input_x1.get("format") == "FRACTAL_NZ":
        shape_a = shape_a
        shape_b = [shape_a[1], shape_a[1]]
    shape_a = list(shape_a)
    shape_b = list(shape_b)
    shape_a = _get_input_shape(shape_a)
    shape_b = _get_input_shape(shape_b)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_a)
    util.check_shape_rule(shape_b)
    util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT)
    shape_a = [shape_a[1], shape_a[0]]
    trans_a = bool(1 - trans_a)
    shape_b = [shape_b[1], shape_b[0]]
    trans_b = bool(1 - trans_b)
    shape_bias = ()
    if bias is not None and bool(bias):
        shape_bias = bias.get("shape")
        shape_bias = list(shape_bias)
        shape_bias = _get_bias(shape_bias)
    src_dtype = input_x1.get("dtype").lower()
    dst_dtype = output_y.get("dtype").lower()
    _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b)
    m_shape = shape_a[len(shape_a) - 2]
    km_shape = shape_a[len(shape_a) - 1]
    kn_shape = shape_b[len(shape_a) - 2]
    n_shape = shape_b[len(shape_a) - 1]
    if src_dtype == "float16":
        block_reduce = cce.BLOCK_REDUCE
    block_in = cce.BLOCK_IN
    block_out = cce.BLOCK_OUT
    if trans_a and km_shape == 1:
        block_in = cce.BLOCK_VECTOR
    if not trans_a and m_shape == 1:
        block_in = cce.BLOCK_VECTOR
    if trans_b and kn_shape == 1:
        block_out = cce.BLOCK_VECTOR
    if not trans_b and n_shape == 1:
        block_out = cce.BLOCK_VECTOR
    if trans_a:
        shape_a_temp = (m_shape // block_reduce, km_shape // block_in, block_reduce, block_in)
    else:
        shape_a_temp = (m_shape // block_in, km_shape // block_reduce, block_in, block_reduce)
    if trans_b:
        shape_b_temp = (kn_shape // block_out, n_shape // block_reduce, block_reduce, block_out)
    else:
@@ -368,7 +372,7 @@ def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=F
    format_a = "FRACTAL_NZ"
    shape_b_temp = (shape_b_temp[0], shape_b_temp[1], shape_b_temp[2], shape_b_temp[3])
    format_b = "FRACTAL_NZ"

    print("=======================================")
    print(shape_a_temp, shape_b_temp)
    print(format_a, format_b)
@@ -378,67 +382,85 @@ def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=F
                               dtype=src_dtype)
    tensor_b = tvm.placeholder(shape_b_temp, name='tensor_b',
                               dtype=src_dtype)
    if len(shape_bias) > 0:
        tensor_bias = tvm.placeholder(shape_bias, name='tensor_bias',
                                      dtype=dst_dtype)
    if shape_a_temp[0] == 63 and shape_a_temp[1] == 63 and shape_b_temp[0] == 128 and shape_b_temp[1] == 63:
        if util.get_product_version() == util.VERSION_MINI:
            tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
        else:
            tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))
        input_x1 = tik_instance.Tensor("float16", shape_a_temp, name="left_matrix", scope=tik.scope_gm)
        input_x2 = tik_instance.Tensor("float16", shape_b_temp, name="right_matrix", scope=tik.scope_gm)
        resMatmul = tik_instance.Tensor("float16", shape_output, name="output", scope=tik.scope_gm)
        with tik_instance.for_range(0, 32, block_num=32) as block_index:
            resMatmul_local_UB = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_ubuf,
                                                     name="resMatmul_local_UB")
            resMatmul_local_UB_local_L0C = tik_instance.Tensor("float32", (128 * 256,), scope=tik.scope_cc,
                                                               name="resMatmul_local_UB")
            input_1_local_L1_local_L0A = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_ca,
                                                             name="input_1_local_L1_local_L0A")
            input_2_local_L1 = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_cbuf,
                                                   name="input_2_local_L1")
            input_1_local_L1 = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cbuf,
                                                   name="input_1_local_L1")
            input_2_local_L1_local_L0B = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_cb,
                                                             name="input_2_local_L1_local_L0B")
            core_m_idx = block_index % 8
            core_n_idx = block_index // 8
            with tik_instance.if_scope(core_m_idx != 7):
                tik_instance.data_move(input_1_local_L1, input_x1[core_m_idx * (8 * 256 + 128 * 1008)], 0, 8, 128,
                                       55 * 16, 0)
                tik_instance.data_move(input_2_local_L1, input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], 0,
                                       32, 128, 55 * 16, 0)
                with tik_instance.for_range(0, 8) as cc12:
                    tik_instance.load2dv1(input_1_local_L1_local_L0A[cc12 * 2048], input_1_local_L1[cc12 * 256], 0, 8,
                                          8, 0, False)
                with tik_instance.for_range(0, 2) as cc6:
                    with tik_instance.for_range(0, 8) as cc121:
                        tik_instance.load2dv1(input_2_local_L1_local_L0B[cc121 * 4096],
                                              input_2_local_L1[cc6 * 32768 + cc121 * 256], 0, 16, 8, 0, True)
                    tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A,
                                      input_2_local_L1_local_L0B, 128, 128, 256, 0)
                    tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 128, 0, 0, 1)
                    tik_instance.data_move(resMatmul[cc6 * 256 * 1008 + core_m_idx * 8 * 256 + core_n_idx * 512 * 1008],
                                           resMatmul_local_UB, 0, 16, 256 // 2, 0, 55 * 16 * 2 // 2)
            with tik_instance.else_scope():
                tik_instance.data_move(input_1_local_L1, input_x1[core_m_idx * (8 * 256 + 128 * 1008)], 0, 7, 112,
                                       56 * 16, 0)
                tik_instance.data_move(input_2_local_L1, input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], 0,
                                       32, 112, 56 * 16, 0)
                with tik_instance.for_range(0, 7) as cc10:
                    tik_instance.load2dv1(input_1_local_L1_local_L0A[cc10 * 1792], input_1_local_L1[cc10 * 256], 0, 7,
                                          7, 0, False)
                with tik_instance.for_range(0, 2) as cc5:
                    with tik_instance.for_range(0, 7) as cc101:
                        tik_instance.load2dv1(input_2_local_L1_local_L0B[cc101 * 4096],
                                              input_2_local_L1[cc5 * 28672 + cc101 * 256], 0, 16, 7, 0, True)
                    tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A,
                                      input_2_local_L1_local_L0B, 112, 112, 256, 0)
                    tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 112, 0, 0, 1)
                    tik_instance.data_move(resMatmul[cc5 * 256 * 1008 + core_m_idx * 8 * 256 + core_n_idx * 512 * 1008],
                                           resMatmul_local_UB, 0, 16, 224 // 2, 0, 56 * 16 * 2 // 2)
        tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2], outputs=[resMatmul])
        return tik_instance
    else:
        print("come into tbe, shape is error!")
        result = te.lang.cce.matmul(tensor_a, tensor_b, trans_a, trans_b, format_a=format_a,
                                    format_b=format_b, dst_dtype=dst_dtype, tensor_bias=tensor_bias)
        with tvm.target.cce():
            schedule = generic.auto_schedule(result)
        tensor_list = [tensor_a, tensor_b, result]
        if len(shape_bias) > 0:
            tensor_list = [tensor_a, tensor_b, tensor_bias, result]
        config = {"print_ir": False,
                  "name": kernel_name,
                  "tensor_list": tensor_list}
        te.lang.cce.cce_build_code(schedule, config)

@@ -18,15 +18,10 @@ limitations under the License.
matmul
"""
from __future__ import absolute_import

from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
from te import tik
from topi.cce import util

matmul_cube_dense_right_op_info = TBERegOp("CusMatMulCubeDenseRight") \
    .fusion_type("OPAQUE") \
@@ -40,23 +35,26 @@ matmul_cube_dense_right_op_info = TBERegOp("CusMatMulCubeDenseRight") \
    .input(2, "x3", False, "required", "all") \
    .input(3, "x4", False, "optional", "all") \
    .output(0, "y", False, "required", "all") \
    .dtype_format(DataType.F16_FracNZ, DataType.F16_Default, DataType.F32_Default, DataType.F16_Default,
                  DataType.F32_FracNZ) \
    .get_op_info()


@op_info_register(matmul_cube_dense_right_op_info)
def CusMatMulCubeDenseRight(input_x1, input_x2, input_x3, bias=None, output_y={}, trans_a=False, trans_b=False,
                            kernel_name="matmulcube"):
    shape_a_temp = (128, 63, 16, 16)
    shape_b_temp = (128, 128, 16, 16)
    shape_output = output_y.get("shape")
    matrix_max_shape = (1,)
    support_shape = [(shape_a_temp, shape_b_temp, matrix_max_shape), ]
    shape_a_input = input_x1.get("shape")
    shape_b_input = input_x2.get("shape")
    matrix_max_input = input_x3.get("shape")
    input_shape = (tuple(shape_a_input), tuple(shape_b_input), tuple(matrix_max_input))
    if input_shape not in support_shape:
        raise RuntimeError("input_shape %s is not supported" % str(input_shape))

    if shape_a_temp[0] == 128 and shape_a_temp[1] == 63 and shape_b_temp[0] == 128 and shape_b_temp[1] == 128:
        if util.get_product_version() == util.VERSION_MINI:
            tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
@@ -64,79 +62,110 @@ def CusMatMulCubeDenseRight(input_x1, input_x2, input_x3, bias=None, output_y={}
            tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))
        input_x1 = tik_instance.Tensor("float16", shape_a_temp, name="left_matrix", scope=tik.scope_gm)
        input_x2 = tik_instance.Tensor("float16", shape_b_temp, name="right_matrix", scope=tik.scope_gm)
        input_x3 = tik_instance.Tensor("float32", [1, ], name="matrix_max", scope=tik.scope_gm)
        resMatmul = tik_instance.Tensor("float32", shape_output, name="output", scope=tik.scope_gm)
        with tik_instance.for_range(0, 32, block_num=32) as block_index:
            core_m_idx = block_index // 16
            core_n_idx = block_index % 16
            matrix_max_scalar = tik_instance.Scalar("float32")
            matrix_max_local_UB = tik_instance.Tensor("float32", (8,), scope=tik.scope_ubuf, name="matrix_max_local_UB")
            tik_instance.data_move(matrix_max_local_UB, input_x3, 0, 1, 1, 0, 0)
            matrix_max_scalar.set_as(matrix_max_local_UB[0])
            resMatmul_local_UB = tik_instance.Tensor("float32", (256 * 128,), scope=tik.scope_ubuf,
                                                     name="resMatmul_local_UB")
            resMatmul_local_UB1 = tik_instance.Tensor("float32", (240 * 128,), scope=tik.scope_ubuf,
                                                      name="resMatmul_local_UB1")
            resMatmul_local_UB_local_L0C = tik_instance.Tensor("float32", (256 * 128,), scope=tik.scope_cc,
                                                               name="resMatmul_local_UB_local_L0C")
            resMatmul_local_UB_local_L0C1 = tik_instance.Tensor("float32", (240 * 128,), scope=tik.scope_cc,
                                                                name="resMatmul_local_UB_local_L0C1")
            input_1_local_L1_local_L0A = tik_instance.Tensor("float16", (256 * 128,), scope=tik.scope_ca,
                                                             name="input_1_local_L1_local_L0A")
            input_2_local_L1 = tik_instance.Tensor("float16", (8 * 128 * 16,), scope=tik.scope_cbuf,
                                                   name="input_2_local_L1")
            input_2_local_L11 = tik_instance.Tensor("float16", (8 * 128 * 16,), scope=tik.scope_cbuf,
                                                    name="input_2_local_L11")
            input_1_local_L1 = tik_instance.Tensor("float16", (8 * 256 * 16,), scope=tik.scope_cbuf,
                                                   name="input_1_local_L1")
            input_1_local_L11 = tik_instance.Tensor("float16", (8 * 240 * 16,), scope=tik.scope_cbuf,
                                                    name="input_1_local_L11")
            input_2_local_L1_local_L0B = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cb,
                                                             name="input_2_local_L1_local_L0B")
            input_2_local_L1_local_L0B1 = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cb,
                                                              name="input_2_local_L1_local_L0B1")
            with tik_instance.if_scope(core_m_idx == 0):
                with tik_instance.for_range(0, 2) as cc1:
                    tik_instance.data_move(input_2_local_L1, input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8,
                                           128, 1920, 0)
                    tik_instance.data_move(input_1_local_L1, input_x1[core_n_idx * 129024 + cc1 * 4096], 0, 8, 256, 752,
                                           0)
                    with tik_instance.for_range(0, 8) as cc10:
                        tik_instance.load2dv1(input_2_local_L1_local_L0B[cc10 * 2048], input_2_local_L1[cc10 * 256], 0,
                                              8, 8, 0, True)
                    with tik_instance.for_range(0, 16) as cc101:
                        tik_instance.load2dv1(input_1_local_L1_local_L0A[cc101 * 2048], input_1_local_L1[cc101 * 256],
                                              0, 8, 16, 0, False)
                    tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A,
                                      input_2_local_L1_local_L0B, 256, 128, 128, 0)
                    tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 128, 0, 0)
                    tik_instance.vmuls(64, resMatmul_local_UB, resMatmul_local_UB, matrix_max_scalar, 255, 1, 1, 8, 8)
                    tik_instance.vmuls(64, resMatmul_local_UB[255 * 64], resMatmul_local_UB[255 * 64],
                                       matrix_max_scalar, 255, 1, 1, 8, 8)
                    tik_instance.vmuls(64, resMatmul_local_UB[510 * 64], resMatmul_local_UB[510 * 64],
                                       matrix_max_scalar, 2, 1, 1, 8, 8)
                    tik_instance.data_move(resMatmul[core_n_idx * 129024 + cc1 * 4096], resMatmul_local_UB, 0, 8, 512,
                                           0, 1504)
            with tik_instance.else_scope():
                tik_instance.data_move(input_2_local_L1, input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8, 128,
                                       1920, 0)
                tik_instance.data_move(input_1_local_L1, input_x1[core_n_idx * 129024 + 2 * 4096], 0, 8, 256, 752, 0)
                with tik_instance.for_range(0, 8) as cc10:
                    tik_instance.load2dv1(input_2_local_L1_local_L0B[cc10 * 2048], input_2_local_L1[cc10 * 256], 0, 8,
                                          8, 0, True)
                with tik_instance.for_range(0, 16) as cc101:
                    tik_instance.load2dv1(input_1_local_L1_local_L0A[cc101 * 2048], input_1_local_L1[cc101 * 256], 0, 8,
                                          16, 0, False)
                tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A, input_2_local_L1_local_L0B,
                                  256, 128, 128, 0)
                tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 128, 0, 0)
                tik_instance.vmuls(64, resMatmul_local_UB, resMatmul_local_UB, matrix_max_scalar, 255, 1, 1, 8, 8)
                tik_instance.vmuls(64, resMatmul_local_UB[255 * 64], resMatmul_local_UB[255 * 64], matrix_max_scalar,
                                   255, 1, 1, 8, 8)
                tik_instance.vmuls(64, resMatmul_local_UB[510 * 64], resMatmul_local_UB[510 * 64], matrix_max_scalar, 2,
                                   1, 1, 8, 8)
                tik_instance.data_move(resMatmul[core_n_idx * 129024 + 2 * 4096], resMatmul_local_UB, 0, 8, 512, 0,
                                       1504)
                tik_instance.data_move(input_2_local_L11, input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8, 128,
                                       1920, 0)
                tik_instance.data_move(input_1_local_L11, input_x1[core_n_idx * 129024 + 12288], 0, 8, 240, 768, 0)
                with tik_instance.for_range(0, 8) as cc102:
                    tik_instance.load2dv1(input_2_local_L1_local_L0B1[cc102 * 2048], input_2_local_L11[cc102 * 256], 0,
                                          8, 8, 0, True)
                with tik_instance.for_range(0, 16) as cc103:
                    tik_instance.load2dv1(input_1_local_L1_local_L0A[cc103 * 2048], input_1_local_L11[cc103 * 256], 0,
                                          8, 15, 0, False)
                tik_instance.mmad(resMatmul_local_UB_local_L0C1, input_1_local_L1_local_L0A,
                                  input_2_local_L1_local_L0B1, 240, 128, 128, 0)
                tik_instance.data_move(resMatmul_local_UB1, resMatmul_local_UB_local_L0C1, 0, 1, 120, 0, 0)
                tik_instance.vmuls(64, resMatmul_local_UB1, resMatmul_local_UB1, matrix_max_scalar, 255, 1, 1, 8, 8)
                tik_instance.vmuls(64, resMatmul_local_UB1[255 * 64], resMatmul_local_UB1[255 * 64], matrix_max_scalar,
                                   225, 1, 1, 8, 8)
                tik_instance.data_move(resMatmul[core_n_idx * 129024 + 12288], resMatmul_local_UB1, 0, 8, 480, 0, 1536)
        tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2, input_x3], outputs=[resMatmul])
        return tik_instance
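
# Illustrative only: beyond the tiled matmul, the kernel above multiplies every
# element of the result by the scalar loaded from input_x3 ("matrix_max"), so
# in conventional layout terms it computes roughly
# res = (input_x1 @ input_x2) * matrix_max.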

@@ -17,11 +17,12 @@ limitations under the License.
matmul
"""
from __future__ import absolute_import

import te.platform.cce_params as cce
from te import tvm
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
from te import tik
from topi.cce import util

# General limitation of the size for input shape: 2**31
SHAPE_SIZE_LIMIT = 2147483648
NoneType = type(None)
@@ -40,6 +41,7 @@ matmul_cube_fracz_left_cast_op_info = TBERegOp("CusMatMulCubeFraczLeftCast") \
    .dtype_format(DataType.F16_Default, DataType.F32_FracZ, DataType.F16_Default, DataType.F16_FracZ) \
    .get_op_info()


# pylint: disable=locally-disabled,too-many-arguments,too-many-branches, too-many-statements, too-many-locals,
def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
    """
@@ -137,6 +139,7 @@ src_dtype: str
    else:
        raise RuntimeError("unsupport input shape now for batch bias case")


def _get_bias(shape_bias):
    bias_length = shape_bias[0]
    if bias_length % 16 == 0:
@@ -147,6 +150,7 @@ def _get_bias(shape_bias):
        shape_bias.append(bias_length)
        return shape_bias


def _get_input_shape(shape_x):
    dim_a = shape_x[0]
    dim_b = shape_x[1]
@@ -164,6 +168,7 @@ def _get_input_shape(shape_x):
        res.append(dim_b)
    return res


def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"):
    shape_a = input_x1.get("shape")
    shape_b = input_x2.get("shape")
@@ -199,40 +204,41 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t
                return False
            elif shape_a[1] != shape_b[0]:
                return False
            if trans_a_f and trans_b and shape_b[1] == 1:
                return False
        if src_dtype == "float16":
            if len(shape_a) != 2 and len(shape_b) != 2:
                return False
            if trans_a:
                m_shape = shape_a[1]
                k_shape = shape_a[0]
            else:
                m_shape = shape_a[0]
                k_shape = shape_a[1]
            if trans_b:
                n_shape = shape_b[0]
                k_b_shape = shape_b[1]
            else:
                n_shape = shape_b[1]
                k_b_shape = shape_b[0]
            if k_shape != k_b_shape:
                return False
            if m_shape == 1 or n_shape == 1:
                if k_shape % 256 != 0:
                    return False
    except RuntimeError as e:
        return False
    return True


# pylint: disable=locally-disabled,too-many-arguments, too-many-locals, too-many-statements
@op_info_register(matmul_cube_fracz_left_cast_op_info)
def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False,
@@ -278,7 +284,7 @@ def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans
            c1 = 1
        shape_b = [n, c1 * h * w * c0]
        shape_a = [n, n]
    if input_x1.get("format") == "FRACTAL_Z":
        n, c, h, w = shape_a
        c0 = 16
@@ -291,26 +297,26 @@ def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans
    if input_x2.get("format") == "FRACTAL_NZ":
        shape_a = [shape_b[0], shape_b[0]]
        shape_b = shape_b
    if input_x1.get("format") == "FRACTAL_NZ":
        shape_a = shape_a
        shape_b = [shape_a[1], shape_a[1]]
    shape_a = list(shape_a)
    shape_b = list(shape_b)
    shape_a = _get_input_shape(shape_a)
    shape_b = _get_input_shape(shape_b)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_a)
    util.check_shape_rule(shape_b)
    util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT)
    shape_a = [shape_a[1], shape_a[0]]
    trans_a = bool(1 - trans_a)
    shape_b = [shape_b[1], shape_b[0]]
    trans_b = bool(1 - trans_b)
| @@ -319,45 +325,45 @@ def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans | |||
| shape_bias = bias.get("shape") | |||
| shape_bias = list(shape_bias) | |||
| shape_bias = _get_bias(shape_bias) | |||
| src_dtype = input_x1.get("dtype").lower() | |||
| _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b) | |||
| m_shape = shape_a[len(shape_a) - 2] | |||
| km_shape = shape_a[len(shape_a) - 1] | |||
| kn_shape = shape_b[len(shape_b) - 2] | |||
| n_shape = shape_b[len(shape_b) - 1] | |||
| if src_dtype == "float16": | |||
| block_reduce = cce.BLOCK_REDUCE | |||
| block_in = cce.BLOCK_IN | |||
| block_out = cce.BLOCK_OUT | |||
| if trans_a and km_shape == 1: | |||
| block_in = cce.BLOCK_VECTOR | |||
| if not trans_a and m_shape == 1: | |||
| block_in = cce.BLOCK_VECTOR | |||
| if trans_b and kn_shape == 1: | |||
| block_out = cce.BLOCK_VECTOR | |||
| if not trans_b and n_shape == 1: | |||
| block_out = cce.BLOCK_VECTOR | |||
| if trans_a: | |||
| shape_a_temp = (m_shape // block_reduce, km_shape // block_in, block_reduce, block_in) | |||
| else: | |||
| shape_a_temp = (m_shape // block_in, km_shape // block_reduce, block_in, block_reduce) | |||
| if trans_b: | |||
| shape_b_temp = (kn_shape // block_out, n_shape // block_reduce, block_reduce, block_out) | |||
| else: | |||
| shape_b_temp = (kn_shape // block_reduce, n_shape // block_out, block_out, block_reduce) | |||
| shape_a_temp = (shape_a_temp[0], shape_a_temp[1], shape_a_temp[2], shape_a_temp[3]) | |||
| shape_b_temp = (shape_b_temp[0], shape_b_temp[1], shape_b_temp[2], shape_b_temp[3]) | |||
| if util.get_product_version() == util.VERSION_MINI: | |||
| tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) | |||
| else: | |||
| @@ -372,7 +378,8 @@ def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans | |||
| diag_opt=diag_opt, diag_size=DIAG_SIZE) | |||
| tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2], outputs=[res_matmul]) | |||
| return tik_instance | |||
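| # Note: the dimension swap and trans-flag inversion above reflect how this | |||
| # kernel treats the fractal (FRACTAL_Z / FRACTAL_NZ) layouts as transposed | |||
| # views of the logical matrices before choosing a tiling. | |||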
| def get_cus_tile_info(input_x1, input_x2, diag_size): | |||
| tile_map = { | |||
| ((32, 32, 16, 16), (128, 32, 16, 16)): (8, 8, 16), | |||
| @@ -381,10 +388,10 @@ def get_cus_tile_info(input_x1, input_x2, diag_size): | |||
| ((128, 128, 16, 16), (32, 128, 16, 16)): (8, 8, 16), | |||
| ((16, 16, 16, 16), (144, 16, 16, 16)): (8, 8, 9), | |||
| ((64, 64, 16, 16), (16, 64, 16, 16)): (8, 8, 4), | |||
| ((16, 16, 16, 16), (64, 16, 16, 16)): (8, 8, 4), | |||
| ((32, 32, 16, 16), (8, 32, 16, 16)): (8, 8, 1), | |||
| ((128, 128, 16, 16), (64, 128, 16, 16)): (8, 8, 16), | |||
| ((16, 16, 16, 16), (4, 16, 16, 16)): (8, 8, 1), | |||
| ((16, 16, 16, 16), (32, 16, 16, 16)): (8, 8, 2), | |||
| ((64, 64, 16, 16), (32, 64, 16, 16)): (8, 8, 8), | |||
| ((32, 32, 16, 16), (64, 32, 16, 16)): (8, 8, 8), | |||
| @@ -398,13 +405,14 @@ def get_cus_tile_info(input_x1, input_x2, diag_size): | |||
| } | |||
| shape_info = (tuple(input_x1.shape), tuple(input_x2.shape)) | |||
| diag_opt = False | |||
| if input_x1.shape[0] * input_x1.shape[3] > diag_size: | |||
| diag_opt = True | |||
| if shape_info not in tile_map: | |||
| raise ValueError("shape %s is not supported" % str(shape_info)) | |||
| mo_tile, ko_tile, no_tile = tile_map[shape_info] | |||
| return mo_tile, ko_tile, no_tile, diag_opt | |||
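| # Tiling is a fixed table lookup keyed by the fractal shapes of both | |||
| # inputs, so unsupported shape pairs fail fast. diag_opt turns on when x1's | |||
| # full dimension (shape[0] * shape[3]) exceeds diag_size; the matmul then | |||
| # walks only the k blocks near the diagonal (loop_k_num = diag_outer // ko_tile). | |||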
| def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b, | |||
| res, mo_tile, ko_tile, no_tile, diag_opt=False, diag_size=128): | |||
| ko, mo, mi, ki = input_x1.shape | |||
| @@ -420,7 +428,7 @@ def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b, | |||
| raise ValueError("shape of input_x1 or input_x2 is not supported!") | |||
| if not trans_a or not trans_b: | |||
| raise ValueError("only trans_a=False and trans_b=False be supported!") | |||
| core_m_num = mo // mo_tile | |||
| loop_n_num = no // no_tile | |||
| if loop_n_num * core_m_num <= maxblocknum: | |||
| @@ -432,7 +440,7 @@ def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b, | |||
| else: | |||
| raise ValueError("Does not support this scenario!") | |||
| block_num = core_m_num * core_n_num | |||
| loop_k_num = ko // ko_tile | |||
| if diag_opt: | |||
| loop_k_num = diag_outer // ko_tile | |||
| @@ -445,7 +453,7 @@ def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b, | |||
| core_n = block_idx % core_n_num | |||
| with tik_instance.for_range(0, loop_n_num) as cc_n: | |||
| res_L0C = tik_instance.Tensor("float32", [no_tile, mo_tile, c0, c0], | |||
| name="resMatmul_L0C", scope=tik.scope_cc) | |||
| name="resMatmul_L0C", scope=tik.scope_cc) | |||
| with tik_instance.for_range(0, loop_k_num, thread_num=thread_num_k) as thread_idx_k: | |||
| # input_x2 -> input_x2_ub -(fp322fp16)-> input_x2_cast_ub -> input_x2_L1 | |||
| input_x2_ub = tik_instance.Tensor("float32", [no_tile, ko_tile_inner, c0, c0], name="input_x2_ub", | |||
| @@ -476,41 +484,41 @@ def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b, | |||
| input_x2_cast_ub[count * repeate_times_max * vectorfp32_size], | |||
| input_x2_ub[count * repeate_times_max * vectorfp32_size], repeate_num, | |||
| 1, 1, 4, 8) | |||
| input_x2_L1 = tik_instance.Tensor("float16", [no_tile, ko_tile_inner, c0, c0], | |||
| name="input_x2_L1", scope=tik.scope_cbuf) | |||
| tik_instance.data_move(input_x2_L1, input_x2_cast_ub, 0, 1, | |||
| no_tile * ko_tile_inner * c0 * c0 * fp16_size // blocksize, 0, 0) | |||
| # input_x1 -> input_x1_L1 | |||
| input_x1_L1 = tik_instance.Tensor(input_x1.dtype, [ko_tile_inner, mo_tile, c0, c0], | |||
| name="input_x1_L1", scope=tik.scope_cbuf) | |||
| tik_instance.data_move(input_x1_L1, | |||
| input_x1[k_idx, | |||
| core_m * mo_tile, 0, 0], | |||
| 0, ko_tile_inner, mo_tile * c0 * c0 * fp16_size // blocksize, | |||
| (mo - mo_tile) * c0 * c0 * fp16_size // blocksize, 0) | |||
| # input_x2_L1 -> input_x2_L0B | |||
| input_x2_L0B = tik_instance.Tensor("float16", [ko_tile_inner, no_tile, c0, c0], | |||
| name="input_x2_L0B", scope=tik.scope_cb) | |||
| with tik_instance.for_range(0, ko_tile_inner) as cc2: | |||
| tik_instance.load2dv1(input_x2_L0B[cc2, 0, 0, 0], input_x2_L1[0, cc2, 0, 0], 0, no_tile, | |||
| ko_tile_inner, | |||
| 0, True) | |||
| # input_x1_L1 -> input_x1_L0A | |||
| input_x1_L0A = tik_instance.Tensor(input_x1.dtype, [mo_tile, ko_tile_inner, c0, c0], | |||
| name="input_x1_L0A", scope=tik.scope_ca) | |||
| with tik_instance.for_range(0, mo_tile) as cc1: | |||
| tik_instance.load2dv1(input_x1_L0A[cc1, 0, 0, 0], input_x1_L1[0, cc1, 0, 0], 0, ko_tile_inner, | |||
| mo_tile, 0, False) | |||
| with tik_instance.if_scope(thread_idx_k == 0): | |||
| tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B, mo_tile * c0, | |||
| ko_tile_inner * c0, no_tile * c0, 0) | |||
| with tik_instance.else_scope(): | |||
| tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B, mo_tile * c0, | |||
| ko_tile_inner * c0, no_tile * c0, 1) | |||
| res_ub = tik_instance.Tensor(input_x1.dtype, [no_tile, mo_tile, c0, c0], | |||
| name="resMatmul_ub", scope=tik.scope_ubuf) | |||
| tik_instance.data_move(res_ub, res_L0C, 0, 1, no_tile * mo_tile, 0, 0, 1) | |||
| tik_instance.data_move(res[(core_n * loop_n_num + cc_n) * no_tile, core_m * mo_tile, 0, 0], | |||
| res_ub, 0, no_tile, | |||
| mo_tile * c0 * c0 * fp16_size // blocksize, 0, | |||
| (mo - mo_tile) * c0 * c0 * fp16_size // blocksize) | |||
| input_x2_L1 = tik_instance.Tensor("float16", [no_tile, ko_tile_inner, c0, c0], | |||
| name="input_x2_L1", scope=tik.scope_cbuf) | |||
| tik_instance.data_move(input_x2_L1, input_x2_cast_ub, 0, 1, | |||
| no_tile * ko_tile_inner * c0 * c0 * fp16_size // blocksize, 0, 0) | |||
| # input_x1 -> input_x1_L1 | |||
| input_x1_L1 = tik_instance.Tensor(input_x1.dtype, [ko_tile_inner, mo_tile, c0, c0], | |||
| name="input_x1_L1", scope=tik.scope_cbuf) | |||
| tik_instance.data_move(input_x1_L1, | |||
| input_x1[k_idx, | |||
| core_m * mo_tile, 0, 0], | |||
| 0, ko_tile_inner, mo_tile * c0 * c0 * fp16_size // blocksize, | |||
| (mo - mo_tile) * c0 * c0 * fp16_size // blocksize, 0) | |||
| # input_x2_L1 -> input_x2_L0B | |||
| input_x2_L0B = tik_instance.Tensor("float16", [ko_tile_inner, no_tile, c0, c0], | |||
| name="input_x2_L0B", scope=tik.scope_cb) | |||
| with tik_instance.for_range(0, ko_tile_inner) as cc2: | |||
| tik_instance.load2dv1(input_x2_L0B[cc2, 0, 0, 0], input_x2_L1[0, cc2, 0, 0], 0, no_tile, | |||
| ko_tile_inner, | |||
| 0, True) | |||
| # input_x1_L1 -> input_x1_L0A | |||
| input_x1_L0A = tik_instance.Tensor(input_x1.dtype, [mo_tile, ko_tile_inner, c0, c0], | |||
| name="input_x1_L0A", scope=tik.scope_ca) | |||
| with tik_instance.for_range(0, mo_tile) as cc1: | |||
| tik_instance.load2dv1(input_x1_L0A[cc1, 0, 0, 0], input_x1_L1[0, cc1, 0, 0], 0, ko_tile_inner, | |||
| mo_tile, 0, False) | |||
| with tik_instance.if_scope(thread_idx_k == 0): | |||
| tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B, mo_tile * c0, | |||
| ko_tile_inner * c0, no_tile * c0, 0) | |||
| with tik_instance.else_scope(): | |||
| tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B, mo_tile * c0, | |||
| ko_tile_inner * c0, no_tile * c0, 1) | |||
| res_ub = tik_instance.Tensor(input_x1.dtype, [no_tile, mo_tile, c0, c0], | |||
| name="resMatmul_ub", scope=tik.scope_ubuf) | |||
| tik_instance.data_move(res_ub, res_L0C, 0, 1, no_tile * mo_tile, 0, 0, 1) | |||
| tik_instance.data_move(res[(core_n * loop_n_num + cc_n) * no_tile, core_m * mo_tile, 0, 0], | |||
| res_ub, 0, no_tile, | |||
| mo_tile * c0 * c0 * fp16_size // blocksize, 0, | |||
| (mo - mo_tile) * c0 * c0 * fp16_size // blocksize) | |||
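| # Rough per-tile dataflow of cus_cube_matmul_cast: | |||
| #   x2: GM (fp32) -> UB -> vconv cast to fp16 -> L1 -> L0B | |||
| #   x1: GM (fp16) -> L1 -> L0A | |||
| #   mmad accumulates in L0C over the k loop (initialized when | |||
| #   thread_idx_k == 0), then L0C -> UB -> GM for the result tile. | |||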
| @@ -18,37 +18,35 @@ limitations under the License. | |||
| matmul | |||
| """ | |||
| from __future__ import absolute_import | |||
| import te.lang.cce | |||
| import te.platform.cce_params as cce | |||
| from te.platform.fusion_manager import fusion_manager | |||
| from te import tvm | |||
| from topi import generic | |||
| from topi.cce import util | |||
| from te import tik | |||
| from impl.matmul_vector import matmul_vector_cce | |||
| from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType | |||
| # General limitation of the size for input shape: 2**31 | |||
| SHAPE_SIZE_LIMIT = 2147483648 | |||
| NoneType = type(None) | |||
| cus_matmul_cube_fracz_right_mul_op_info = TBERegOp("CusMatMulCubeFraczRightMul") \ | |||
| .fusion_type("OPAQUE") \ | |||
| .async_flag(False) \ | |||
| .binfile_name("matmulcubefraczrightmul.so") \ | |||
| .compute_cost(10) \ | |||
| .kernel_name("CusMatMulCubeFraczRightMul") \ | |||
| .partial_flag(True) \ | |||
| .input(0, "x1", False, "required", "all") \ | |||
| .input(1, "x2", False, "required", "all") \ | |||
| .input(2, "x3", False, "required", "all") \ | |||
| .input(3, "x4", False, "optional", "all") \ | |||
| .output(0, "y", False, "required", "all") \ | |||
| .dtype_format(DataType.F16_FracZ, DataType.F16_Default, DataType.F32_Default, DataType.F16_Default, DataType.F32_FracZ) \ | |||
| .get_op_info() | |||
| .fusion_type("OPAQUE") \ | |||
| .async_flag(False) \ | |||
| .binfile_name("matmulcubefraczrightmul.so") \ | |||
| .compute_cost(10) \ | |||
| .kernel_name("CusMatMulCubeFraczRightMul") \ | |||
| .partial_flag(True) \ | |||
| .input(0, "x1", False, "required", "all") \ | |||
| .input(1, "x2", False, "required", "all") \ | |||
| .input(2, "x3", False, "required", "all") \ | |||
| .input(3, "x4", False, "optional", "all") \ | |||
| .output(0, "y", False, "required", "all") \ | |||
| .dtype_format(DataType.F16_FracZ, DataType.F16_Default, DataType.F32_Default, DataType.F16_Default, | |||
| DataType.F32_FracZ) \ | |||
| .get_op_info() | |||
| @op_info_register(cus_matmul_cube_fracz_right_mul_op_info) | |||
| def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y={}, trans_a=False, trans_b=False, | |||
| kernel_name="matmulcube"): | |||
| if util.get_product_version() == util.VERSION_MINI: | |||
| tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) | |||
| else: | |||
| @@ -61,10 +59,10 @@ def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y | |||
| input_x3_shape = input_x3.get("shape") | |||
| input_x3_dtype = input_x3.get("dtype").lower() | |||
| output_shape = output_y.get("shape") | |||
| Supported = [((72, 8, 16, 16),"float16", (72, 72, 16, 16), "float16", (1,), "float32"), | |||
| ((32, 8, 16, 16),"float16", (32, 32, 16, 16), "float16", (1,), "float32"), | |||
| ((8, 32, 16, 16),"float16", (8, 8, 16, 16), "float16", (1,), "float32"), | |||
| ((4, 4, 16, 16),"float16", (4, 4, 16, 16), "float16", (1,), "float32"), | |||
| Supported = [((72, 8, 16, 16), "float16", (72, 72, 16, 16), "float16", (1,), "float32"), | |||
| ((32, 8, 16, 16), "float16", (32, 32, 16, 16), "float16", (1,), "float32"), | |||
| ((8, 32, 16, 16), "float16", (8, 8, 16, 16), "float16", (1,), "float32"), | |||
| ((4, 4, 16, 16), "float16", (4, 4, 16, 16), "float16", (1,), "float32"), | |||
| ((4, 16, 16, 16), 'float16', (4, 4, 16, 16), 'float16', (1,), 'float32'), | |||
| ((49, 4, 16, 16), 'float16', (49, 49, 16, 16), 'float16', (1,), 'float32'), | |||
| ((36, 4, 16, 16), 'float16', (36, 36, 16, 16), 'float16', (1,), 'float32'), | |||
| @@ -81,7 +79,8 @@ def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y | |||
| ((32, 128, 16, 16), 'float16', (32, 32, 16, 16), 'float16', (1,), 'float32'), | |||
| ((64, 32, 16, 16), 'float16', (64, 64, 16, 16), 'float16', (1,), 'float32'), | |||
| ((16, 64, 16, 16), 'float16', (16, 16, 16, 16), 'float16', (1,), 'float32')] | |||
| input_shape = ( | |||
| tuple(input_x1_shape), input_x1_dtype, tuple(input_x2_shape), input_x2_dtype, tuple(input_x3_shape), input_x3_dtype) | |||
| if input_shape not in Supported: | |||
| raise RuntimeError("input_shape %s is not supported" % str(input_shape)) | |||
| @@ -93,6 +92,7 @@ def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y | |||
| tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2, input_x3], outputs=[resMatmul]) | |||
| return tik_instance | |||
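| # CusMatMulCubeFraczRightMul is a fused kernel: beyond the whitelist check | |||
| # above, cus_cube_matmul_right_mul multiplies x1 by x2 and scales every | |||
| # element of the result by the scalar read from input_x3 (matrix_max_scalar). | |||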
| def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3, | |||
| res): | |||
| diag_size = 128 | |||
| @@ -176,7 +176,7 @@ def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3, | |||
| name="resMatmul_L0C", scope=tik.scope_cc) | |||
| with tik_instance.for_range(0, loop_k_num, thread_num=thread_num_k) as thread_idx_k: | |||
| if diag_opt: | |||
| k_idx = (core_n * loop_n_num + cc_n) * no_tile + thread_idx_k * ko_tile_inner | |||
| else: | |||
| k_idx = thread_idx_k * ko_tile_inner | |||
| # input_x1 -> input_x1_L1 | |||
| @@ -191,7 +191,7 @@ def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3, | |||
| input_x2_L1 = tik_instance.Tensor("float16", [no_tile, ko_tile_inner, c0, c0], | |||
| name="input_x2_L1", scope=tik.scope_cbuf) | |||
| tik_instance.data_move(input_x2_L1, | |||
| input_x2[(core_n * loop_n_num + cc_n) * no_tile, | |||
| k_idx, 0, 0], | |||
| 0, no_tile, ko_tile_inner * c0 * c0 * fp16_size // blocksize, | |||
| (ko - ko_tile_inner) * c0 * c0 * fp16_size // blocksize, 0) | |||
| @@ -215,9 +215,9 @@ def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3, | |||
| tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B, mo_tile * c0, | |||
| ko_tile_inner * c0, no_tile * c0, 1) | |||
| res_ub = tik_instance.Tensor("float32", [no_tile, mo_tile, c0, c0], | |||
| name="resMatmul_ub", scope=tik.scope_ubuf) | |||
| name="resMatmul_ub", scope=tik.scope_ubuf) | |||
| tik_instance.data_move(res_ub, res_L0C, 0, 1, no_tile * mo_tile, 0, 0) | |||
| input_3_local_UB = tik_instance.Tensor("float32", (8,), scope=tik.scope_ubuf, name="input_3_local_UB") | |||
| tik_instance.data_move(input_3_local_UB, input_x3, 0, 1, 1, 0, 0) | |||
| matrix_max_scalar = tik_instance.Scalar("float32") | |||
| @@ -236,7 +236,7 @@ def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3, | |||
| res_ub[count * repeate_times_max * vectorfp32_size], | |||
| res_ub[count * repeate_times_max * vectorfp32_size], | |||
| matrix_max_scalar, repeate_num, 1, 1, 8, 8) | |||
| tik_instance.data_move(res[(core_n * loop_n_num + cc_n) * no_tile, | |||
| (core_m * loop_m_num + cc_m) * mo_tile, 0, 0], | |||
| res_ub, 0, no_tile, | |||
| @@ -18,13 +18,15 @@ limitations under the License. | |||
| matmul | |||
| """ | |||
| from __future__ import absolute_import | |||
| import te.lang.cce | |||
| import te.platform.cce_params as cce | |||
| from impl.matmul_vector import matmul_vector_cce | |||
| from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType | |||
| from te import tvm | |||
| from topi import generic | |||
| from topi.cce import util | |||
| # General limitation of the size for input shape: 2**31 | |||
| SHAPE_SIZE_LIMIT = 2147483648 | |||
| NoneType = type(None) | |||
| @@ -36,8 +38,8 @@ matmul_cube_op_info = TBERegOp("CusMatMulCube") \ | |||
| .compute_cost(10) \ | |||
| .kernel_name("CusMatMulCube") \ | |||
| .partial_flag(True) \ | |||
| .attr("transpose_a", "required", "bool", "all")\ | |||
| .attr("transpose_b", "required", "bool", "all")\ | |||
| .attr("transpose_a", "required", "bool", "all") \ | |||
| .attr("transpose_b", "required", "bool", "all") \ | |||
| .input(0, "x1", False, "required", "all") \ | |||
| .input(1, "x2", False, "required", "all") \ | |||
| .input(2, "x3", False, "optional", "all") \ | |||
| @@ -45,6 +47,7 @@ matmul_cube_op_info = TBERegOp("CusMatMulCube") \ | |||
| .dtype_format(DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_Default, DataType.F32_FracNZ) \ | |||
| .get_op_info() | |||
| # pylint: disable=locally-disabled,too-many-arguments,too-many-branches, too-many-statements, too-many-locals, | |||
| def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b): | |||
| """ | |||
| @@ -113,16 +116,16 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b): | |||
| if m_shape != 1: | |||
| if n_shape == 1: | |||
| if km_shape % (cce.BLOCK_IN * cce.BLOCK_IN) != 0: | |||
| raise RuntimeError("input shape K1 should be multiple of %d" | |||
| % (cce.BLOCK_IN * cce.BLOCK_IN)) | |||
| elif km_shape % k_block_size != 0: | |||
| raise RuntimeError( | |||
| "input shape K1 should be multiple of %d" % cce.BLOCK_IN) | |||
| else: | |||
| if km_shape % (cce.BLOCK_IN * cce.BLOCK_IN) != 0: | |||
| raise RuntimeError("input shape K1 should be multiple of %d" | |||
| % (cce.BLOCK_IN * cce.BLOCK_IN)) | |||
| if n_shape % cce.BLOCK_IN != 0 and n_shape != 1: | |||
| raise RuntimeError("input shape N should be 1 or multiple of %d" % cce.BLOCK_IN) | |||
| @@ -130,7 +133,7 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b): | |||
| if len(shape_bias): | |||
| if len(shape_bias) == 1: | |||
| if is_gevm or is_gemv: | |||
| if shape_bias[0] != m_shape * n_shape: | |||
| raise RuntimeError("broadcast case shape bias for gemv must be equal m*n") | |||
| else: | |||
| if shape_bias[0] != n_shape: | |||
| @@ -141,33 +144,36 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b): | |||
| else: | |||
| raise RuntimeError("unsupport input shape now for batch bias case") | |||
| def _get_bias(shape_bias): | |||
| bias_length = shape_bias[0] | |||
| if bias_length % 16 == 0: | |||
| return shape_bias | |||
| else: | |||
| bias_length = (bias_length // 16) * 16 + 16 | |||
| shape_bias = [] | |||
| shape_bias.append(bias_length) | |||
| return shape_bias | |||
| def _get_input_shape(shape_x): | |||
| dim_a = shape_x[0] | |||
| dim_b = shape_x[1] | |||
| res = [] | |||
| if dim_a % 16 != 0: | |||
| dim_a = (dim_a // 16) * 16 + 16 | |||
| res.append(dim_a) | |||
| else: | |||
| res.append(dim_a) | |||
| if dim_b % 16 != 0: | |||
| dim_b = (dim_b // 16) * 16 + 16 | |||
| res.append(dim_b) | |||
| else: | |||
| res.append(dim_b) | |||
| return res | |||
| def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): | |||
| shape_a = input_x1.get("shape") | |||
| shape_b = input_x2.get("shape") | |||
| @@ -182,7 +188,7 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t | |||
| if bias is not None and bool(bias): | |||
| shape_bias = bias.get("shape") | |||
| try: | |||
| trans_a_f = bool(1 - trans_a) | |||
| if src_dtype == "float32" or src_dtype == "int32": | |||
| if len(shape_a) != 2 and len(shape_b) != 2: | |||
| return False | |||
| @@ -203,10 +209,10 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t | |||
| return False | |||
| elif shape_a[1] != shape_b[0]: | |||
| return False | |||
| if trans_a_f and trans_b and shape_b[1] == 1: | |||
| return False | |||
| if src_dtype == "float16": | |||
| if len(shape_a) != 2 and len(shape_b) != 2: | |||
| return False | |||
| @@ -217,26 +223,27 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t | |||
| else: | |||
| m_shape = shape_a[0] | |||
| k_shape = shape_a[1] | |||
| if trans_b: | |||
| n_shape = shape_b[0] | |||
| k_b_shape = shape_b[1] | |||
| else: | |||
| n_shape = shape_b[1] | |||
| k_b_shape = shape_b[0] | |||
| if k_shape != k_b_shape: | |||
| return False | |||
| if m_shape == 1 or n_shape == 1: | |||
| if k_shape % 256 != 0: | |||
| return False | |||
| except RuntimeError as e: | |||
| return False | |||
| return True | |||
| # pylint: disable=locally-disabled,too-many-arguments, too-many-locals, too-many-statements | |||
| @op_info_register(matmul_cube_op_info) | |||
| def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): | |||
| @@ -269,18 +276,18 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra | |||
| """ | |||
| shape_a = input_x1.get("ori_shape") | |||
| shape_b = input_x2.get("ori_shape") | |||
| if shape_a is not None: | |||
| if len(shape_a) < 2: | |||
| shape_a = input_x1.get("shape") | |||
| if shape_b is not None: | |||
| if len(shape_b) < 2: | |||
| shape_b = input_x2.get("shape") | |||
| shape_a = list(shape_a) | |||
| shape_b = list(shape_b) | |||
| if input_x1.get("format") == "FRACTAL_NZ": | |||
| shape_a = _get_input_shape(shape_a) | |||
| shape_b = _get_input_shape(shape_b) | |||
| @@ -290,21 +297,21 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra | |||
| util.check_shape_rule(shape_b) | |||
| util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT) | |||
| util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT) | |||
| if input_x1.get("format") == "FRACTAL_NZ": | |||
| shape_a = [shape_a[1], shape_a[0]] | |||
| trans_a = bool(1 - trans_a) | |||
| if input_x2.get("format") == "FRACTAL_NZ": | |||
| shape_b = [shape_b[1], shape_b[0]] | |||
| trans_b = bool(1 - trans_b) | |||
| shape_bias = () | |||
| if bias is not None and bool(bias): | |||
| shape_bias = bias.get("shape") | |||
| shape_bias = list(shape_bias) | |||
| shape_bias = _get_bias(shape_bias) | |||
| src_dtype = input_x1.get("dtype").lower() | |||
| dst_dtype = output_y.get("dtype").lower() | |||
| if src_dtype == "float32" or src_dtype == "int32": | |||
| @@ -338,12 +345,12 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra | |||
| shape_a_temp = (m_shape // block_reduce, km_shape // block_in, block_reduce, block_in) | |||
| else: | |||
| shape_a_temp = (m_shape // block_in, km_shape // block_reduce, block_in, block_reduce) | |||
| if trans_b: | |||
| shape_b_temp = (kn_shape // block_out, n_shape // block_reduce, block_reduce, block_out) | |||
| else: | |||
| shape_b_temp = (kn_shape // block_reduce, n_shape // block_out, block_out, block_reduce) | |||
| if input_x1.get("format") == "FORMAT_FRACTAL_Z": | |||
| shape_a_temp = (shape_a_temp[0], shape_a_temp[1], shape_a_temp[2], shape_a_temp[3]) | |||
| format_a = "fractal" | |||
| @@ -353,7 +360,7 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra | |||
| else: | |||
| shape_a_temp = (shape_a[len(shape_a) - 2], shape_a[len(shape_a) - 1]) | |||
| format_a = "ND" | |||
| if input_x2.get("format") == "FORMAT_FRACTAL_Z": | |||
| shape_b_temp = (shape_b_temp[0], shape_b_temp[1], shape_b_temp[2], shape_b_temp[3]) | |||
| format_b = "fractal" | |||
| @@ -363,28 +370,28 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra | |||
| else: | |||
| shape_b_temp = (shape_b[len(shape_b) - 2], shape_b[len(shape_b) - 1]) | |||
| format_b = "ND" | |||
| tensor_bias = None | |||
| tensor_a = tvm.placeholder(shape_a_temp, name='tensor_a', | |||
| dtype=src_dtype) | |||
| tensor_b = tvm.placeholder(shape_b_temp, name='tensor_b', | |||
| dtype=src_dtype) | |||
| if len(shape_bias) > 0: | |||
| tensor_bias = tvm.placeholder(shape_bias, name='tensor_bias', | |||
| dtype=dst_dtype) | |||
| result = te.lang.cce.matmul(tensor_a, tensor_b, trans_a, trans_b, format_a=format_a, | |||
| format_b=format_b, dst_dtype=dst_dtype, tensor_bias=tensor_bias) | |||
| with tvm.target.cce(): | |||
| schedule = generic.auto_schedule(result) | |||
| tensor_list = [tensor_a, tensor_b, result] | |||
| if len(shape_bias) > 0: | |||
| tensor_list = [tensor_a, tensor_b, tensor_bias, result] | |||
| config = {"print_ir": False, | |||
| "name": kernel_name, | |||
| "tensor_list": tensor_list} | |||
| te.lang.cce.cce_build_code(schedule, config) | |||
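| # Unlike the hand-written TIK kernels elsewhere in this file set, | |||
| # CusMatMulCube lowers through the TE stack: tvm placeholders feed | |||
| # te.lang.cce.matmul, generic.auto_schedule plans the kernel, and | |||
| # cce_build_code emits it under kernel_name. | |||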
| @@ -13,24 +13,25 @@ | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """CusMatrixCombine""" | |||
| from te import tik | |||
| from topi.cce import util | |||
| from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType | |||
| cus_matrix_combine_op_info = TBERegOp("CusMatrixCombine") \ | |||
| .fusion_type("OPAQUE") \ | |||
| .async_flag(False) \ | |||
| .binfile_name("matrixcombine.so") \ | |||
| .compute_cost(10) \ | |||
| .kernel_name("CusMatrixCombine") \ | |||
| .partial_flag(True) \ | |||
| .input(0, "x1", False, "required", "all") \ | |||
| .output(0, "y", False, "required", "all") \ | |||
| .dtype_format(DataType.F32_Default, DataType.F32_Default) \ | |||
| .get_op_info() | |||
| .fusion_type("OPAQUE") \ | |||
| .async_flag(False) \ | |||
| .binfile_name("matrixcombine.so") \ | |||
| .compute_cost(10) \ | |||
| .kernel_name("CusMatrixCombine") \ | |||
| .partial_flag(True) \ | |||
| .input(0, "x1", False, "required", "all") \ | |||
| .output(0, "y", False, "required", "all") \ | |||
| .dtype_format(DataType.F32_Default, DataType.F32_Default) \ | |||
| .get_op_info() | |||
| @op_info_register(cus_matrix_combine_op_info) | |||
| def CusMatrixCombine(input_x, output, kernel_name="matrix_combine"): | |||
| input_x_shape = input_x.get("shape") | |||
| output_shape = output.get("shape") | |||
| split_dim = 128 | |||
| @@ -45,18 +46,20 @@ def CusMatrixCombine(input_x, output,kernel_name="matrix_combine"): | |||
| blocks = 32 | |||
| matrix_dim = input_x_shape[0] * input_x_shape[1] | |||
| if input_x_shape[0] == 1 and input_x_shape[1] == 64: | |||
| tiling_dim = 2 | |||
| bs = 1 | |||
| with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: | |||
| input_x_ub = tik_instance.Tensor("float32", (tiling_dim, matrix_dim), name="input_x_ub", | |||
| scope=tik.scope_ubuf) | |||
| tik_instance.data_move(input_x_ub, input_x[0, block_index * tiling_dim, 0], 0, 1, 16, 0, 0) | |||
| tik_instance.data_move(res[block_index * tiling_dim, 0], input_x_ub, 0, 1, 16, 0, 0) | |||
| else: | |||
| tiling_dim = 4 | |||
| bs = input_x_shape[0] | |||
| with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: | |||
| input_x_ub = tik_instance.Tensor("float32", (tiling_dim, matrix_dim), name="input_x_ub", | |||
| scope=tik.scope_ubuf) | |||
| zero = tik_instance.Scalar("float32") | |||
| zero.set_as(0.0) | |||
| with tik_instance.for_range(0, bs) as i: | |||
| @@ -69,7 +72,9 @@ def CusMatrixCombine(input_x, output,kernel_name="matrix_combine"): | |||
| tik_instance.vector_dup(64, input_x_ub, zero, repeat_1, 1, 8) | |||
| tik_instance.vector_dup(64, input_x_ub[255 * 64], zero, repeat_2, 1, 8) | |||
| with tik_instance.for_range(0, tiling_dim) as j: | |||
| tik_instance.data_move(input_x_ub[j, split_dim * i], input_x[i, block_index * tiling_dim + j, 0], 0, | |||
| 1, 16, 0, 0) | |||
| tik_instance.data_move(res[i * split_dim + block_index * tiling_dim, 0], input_x_ub, 0, 1, | |||
| tiling_dim * matrix_dim * 4 // 32, 0, 0) | |||
| tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x], outputs=[res]) | |||
| return tik_instance | |||
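| # CusMatrixCombine assembles a block-diagonal matrix: each 128x128 block of | |||
| # input_x is copied onto the diagonal of the matrix_dim x matrix_dim output, | |||
| # with vector_dup zero-filling the columns outside the current block. | |||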
| @@ -13,40 +13,41 @@ | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """CusTranspose02314""" | |||
| from te import tik | |||
| from topi.cce import util | |||
| from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType | |||
| cus_transpose02314_op_info = TBERegOp("CusTranspose02314") \ | |||
| .fusion_type("OPAQUE") \ | |||
| .async_flag(False) \ | |||
| .binfile_name("transpose02314.so") \ | |||
| .compute_cost(10) \ | |||
| .kernel_name("CusTranspose02314") \ | |||
| .partial_flag(True) \ | |||
| .input(0, "x1", False, "required", "all") \ | |||
| .output(0, "y", False, "required", "all") \ | |||
| .dtype_format(DataType.F16_5HD, DataType.F16_Default) \ | |||
| .get_op_info() | |||
| .fusion_type("OPAQUE") \ | |||
| .async_flag(False) \ | |||
| .binfile_name("transpose02314.so") \ | |||
| .compute_cost(10) \ | |||
| .kernel_name("CusTranspose02314") \ | |||
| .partial_flag(True) \ | |||
| .input(0, "x1", False, "required", "all") \ | |||
| .output(0, "y", False, "required", "all") \ | |||
| .dtype_format(DataType.F16_5HD, DataType.F16_Default) \ | |||
| .get_op_info() | |||
| @op_info_register(cus_transpose02314_op_info) | |||
| def CusTranspose02314(input_x, output, kernel_name="transpose02314"): | |||
| input_x_shape = input_x.get("shape") | |||
| output_shape = output.get("shape") | |||
| perm = (0, 2, 3, 1, 4) | |||
| input_x_shape = tuple(input_x_shape) | |||
| support_shape = [(32, 128, 7, 7, 16), | |||
| (32, 32, 7, 7, 16), | |||
| (32, 32, 14, 14, 16), | |||
| (32, 64, 14, 14, 16), | |||
| (32, 16, 14, 14, 16), | |||
| (32, 16, 28, 28, 16), | |||
| (32, 32, 28, 28, 16), | |||
| (32, 8, 28, 28, 16), | |||
| (32, 8, 56, 56, 16), | |||
| (32, 16, 56, 56, 16), | |||
| (32, 4, 56, 56, 16), | |||
| (32, 4, 112, 112, 16)] | |||
| if input_x_shape not in support_shape: | |||
| raise RuntimeError("input_shape %s is not supported" % str(input_x_shape)) | |||
| @@ -59,125 +60,172 @@ def CusTranspose02314(input_x, output, kernel_name="transpose021354"): | |||
| res = tik_instance.Tensor("float16", output_shape, name="res", scope=tik.scope_gm) | |||
| dtype = "float16" | |||
| if tuple(input_x_shape) == (32, 4, 112, 112, 16): | |||
| with tik_instance.for_range(0, 32, block_num=32) as block_idx: | |||
| with tik_instance.for_range(0, 14) as cc1_db: | |||
| with tik_instance.for_range(0, 2, thread_num=2) as db_idx: | |||
| input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", | |||
| scope=tik.scope_ubuf) | |||
| T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", | |||
| scope=tik.scope_ubuf) | |||
| zero = tik_instance.Scalar(dtype="float16", init_value=0) | |||
| tik_instance.data_move(input_1_local_UB, | |||
| input_x[block_idx * 802816 + cc1_db * 14336 + 7168 * db_idx], 0, 4, 448, | |||
| 12096, 0) | |||
| with tik_instance.for_range(0, 448) as cc7: | |||
| with tik_instance.for_range(0, 4) as cc8: | |||
| tik_instance.vadds(16, T_transpose_local_UB[cc7 * 64 + cc8 * 16], | |||
| input_1_local_UB[7168 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) | |||
| tik_instance.data_move(res[block_idx * 802816 + cc1_db * 57344 + 28672 * db_idx], | |||
| T_transpose_local_UB, 0, 1, 1792, 0, 0) | |||
| elif tuple(input_x_shape) == (32, 4, 56, 56, 16): | |||
| with tik_instance.for_range(0, 32, block_num=32) as block_idx: | |||
| zero = tik_instance.Scalar(dtype="float16", init_value=0) | |||
| with tik_instance.for_range(0, 3) as cc1_db: | |||
| with tik_instance.for_range(0, 2, thread_num=2) as db_idx: | |||
| input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", | |||
| scope=tik.scope_ubuf) | |||
| T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", | |||
| scope=tik.scope_ubuf) | |||
| tik_instance.data_move(input_1_local_UB, | |||
| input_x[block_idx * 200704 + cc1_db * 14336 + 7168 * db_idx], 0, 4, 448, | |||
| 2688, 0) | |||
| with tik_instance.for_range(0, 448) as cc7: | |||
| with tik_instance.for_range(0, 4) as cc8: | |||
| tik_instance.vadds(16, T_transpose_local_UB[cc7 * 64 + cc8 * 16], | |||
| input_1_local_UB[7168 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) | |||
| tik_instance.data_move(res[block_idx * 200704 + cc1_db * 57344 + 28672 * db_idx], | |||
| T_transpose_local_UB, 0, 1, 1792, 0, 0) | |||
| input_1_local_UB2 = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB2", scope=tik.scope_ubuf) | |||
| T_transpose_local_UB2 = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB2", | |||
| scope=tik.scope_ubuf) | |||
| tik_instance.data_move(input_1_local_UB2, input_x[block_idx * 200704 + 43008], 0, 4, 448, 2688, 0) | |||
| with tik_instance.for_range(0, 448) as cc72: | |||
| with tik_instance.for_range(0, 4) as cc82: | |||
| tik_instance.vadds(16, T_transpose_local_UB2[cc72 * 64 + cc82 * 16], | |||
| input_1_local_UB2[7168 * cc82 + cc72 * 16], zero, 1, 1, 1, 0, 0) | |||
| tik_instance.data_move(res[block_idx * 200704 + 172032], T_transpose_local_UB2, 0, 1, 1792, 0, 0) | |||
| elif tuple(input_x_shape) == (32, 16, 56, 56, 16): | |||
| with tik_instance.for_range(0, 32, block_num=32) as block_idx: | |||
| zero = tik_instance.Scalar(dtype="float16", init_value=0) | |||
| with tik_instance.for_range(0, 14) as cc1_db: | |||
| with tik_instance.for_range(0, 2, thread_num=2) as db_idx: | |||
| input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", | |||
| scope=tik.scope_ubuf) | |||
| T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", | |||
| scope=tik.scope_ubuf) | |||
| tik_instance.data_move(input_1_local_UB, | |||
| input_x[block_idx * 802816 + cc1_db * 3584 + 1792 * db_idx], 0, 16, 112, | |||
| 3024, 0) | |||
| with tik_instance.for_range(0, 112) as cc7: | |||
| with tik_instance.for_range(0, 16) as cc8: | |||
| tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16], | |||
| input_1_local_UB[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) | |||
| tik_instance.data_move(res[block_idx * 802816 + cc1_db * 57344 + 28672 * db_idx], | |||
| T_transpose_local_UB, 0, 1, 1792, 0, 0) | |||
| elif tuple(input_x_shape) == (32, 8, 56, 56, 16): | |||
| with tik_instance.for_range(0, 32, block_num=32) as block_idx: | |||
| zero = tik_instance.Scalar(dtype="float16", init_value=0) | |||
| with tik_instance.for_range(0, 7) as cc1_db: | |||
| with tik_instance.for_range(0, 2, thread_num=2) as db_idx: | |||
| input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", | |||
| scope=tik.scope_ubuf) | |||
| T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", | |||
| scope=tik.scope_ubuf) | |||
| tik_instance.data_move(input_1_local_UB, | |||
| input_x[block_idx * 401408 + cc1_db * 7168 + 3584 * db_idx], 0, 8, 224, 2912, | |||
| 0) | |||
| with tik_instance.for_range(0, 224) as cc7: | |||
| with tik_instance.for_range(0, 16) as cc8: | |||
| tik_instance.vadds(16, T_transpose_local_UB[cc7 * 128 + cc8 * 16], | |||
| input_1_local_UB[3584 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) | |||
| tik_instance.data_move(res[block_idx * 401408 + cc1_db * 57344 + 28672 * db_idx], | |||
| T_transpose_local_UB, 0, 1, 1792, 0, 0) | |||
| elif tuple(input_x_shape) == (32, 8, 28, 28, 16): | |||
| with tik_instance.for_range(0, 32, block_num=32) as block_idx: | |||
| zero = tik_instance.Scalar(dtype="float16", init_value=0) | |||
| with tik_instance.for_range(0, 2) as cc1_db: | |||
| with tik_instance.for_range(0, 2, thread_num=2) as db_idx: | |||
| input_1_local_UB = tik_instance.Tensor(dtype, [25088], name="input_1_local_UB", | |||
| scope=tik.scope_ubuf) | |||
| T_transpose_local_UB = tik_instance.Tensor(dtype, [25088], name="T_transpose_local_UB", | |||
| scope=tik.scope_ubuf) | |||
| tik_instance.data_move(input_1_local_UB, | |||
| input_x[block_idx * 100352 + cc1_db * 6272 + 3136 * db_idx], 0, 8, 196, 588, | |||
| 0) | |||
| with tik_instance.for_range(0, 196) as cc7: | |||
| with tik_instance.for_range(0, 8) as cc8: | |||
| tik_instance.vadds(16, T_transpose_local_UB[cc7 * 128 + cc8 * 16], | |||
| input_1_local_UB[3136 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) | |||
| tik_instance.data_move(res[block_idx * 100352 + cc1_db * 50176 + 25088 * db_idx], | |||
| T_transpose_local_UB, 0, 1, 1568, 0, 0) | |||
| elif tuple(input_x_shape) == (32, 32, 28, 28, 16): | |||
| with tik_instance.for_range(0, 32, block_num=32) as block_idx: | |||
| zero = tik_instance.Scalar(dtype="float16", init_value=0) | |||
| with tik_instance.for_range(0, 7) as cc1_db: | |||
| with tik_instance.for_range(0, 2, thread_num=2) as db_idx: | |||
| input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", | |||
| scope=tik.scope_ubuf) | |||
| T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", | |||
| scope=tik.scope_ubuf) | |||
| tik_instance.data_move(input_1_local_UB, input_x[block_idx * 401408 + cc1_db * 1792 + 896 * db_idx], | |||
| 0, 32, 56, 728, 0) | |||
| with tik_instance.for_range(0, 56) as cc7: | |||
| with tik_instance.for_range(0, 32) as cc8: | |||
| tik_instance.vadds(16, T_transpose_local_UB[cc7 * 512 + cc8 * 16], | |||
| input_1_local_UB[896 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) | |||
| tik_instance.data_move(res[block_idx * 401408 + cc1_db * 57344 + 28672 * db_idx], | |||
| T_transpose_local_UB, 0, 1, 1792, 0, 0) | |||
| elif tuple(input_x_shape) == (32, 16, 28, 28, 16): | |||
| with tik_instance.for_range(0, 32, block_num=32) as block_idx: | |||
| zero = tik_instance.Scalar(dtype="float16", init_value=0) | |||
| with tik_instance.for_range(0, 3) as cc1_db: | |||
| with tik_instance.for_range(0, 2, thread_num=2) as db_idx: | |||
| input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", | |||
| scope=tik.scope_ubuf) | |||
| T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", | |||
| scope=tik.scope_ubuf) | |||
| tik_instance.data_move(input_1_local_UB, | |||
| input_x[block_idx * 200704 + cc1_db * 3584 + 1792 * db_idx], 0, 16, 112, 672, | |||
| 0) | |||
| with tik_instance.for_range(0, 112) as cc7: | |||
| with tik_instance.for_range(0, 16) as cc8: | |||
| tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16], | |||
| input_1_local_UB[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) | |||
| tik_instance.data_move(res[block_idx * 200704 + cc1_db * 57344 + 28672 * db_idx], | |||
| T_transpose_local_UB, 0, 1, 1792, 0, 0) | |||
| input_1_local_UB2 = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB2", scope=tik.scope_ubuf) | |||
| T_transpose_local_UB2 = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB2", | |||
| scope=tik.scope_ubuf) | |||
| tik_instance.data_move(input_1_local_UB2, input_x[block_idx * 200704 + 10752], 0, 16, 112, 672, 0) | |||
| with tik_instance.for_range(0, 112) as cc7: | |||
| with tik_instance.for_range(0, 16) as cc8: | |||
| tik_instance.vadds(16, T_transpose_local_UB2[cc7 * 256 + cc8 * 16], | |||
| input_1_local_UB2[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) | |||
| tik_instance.data_move(res[block_idx * 200704 + 172032], T_transpose_local_UB2, 0, 1, 1792, 0, 0) | |||
| elif tuple(input_x_shape) == (32, 16, 14, 14, 16): | |||
| with tik_instance.for_range(0, 32, block_num=32) as block_idx: | |||
| zero = tik_instance.Scalar(dtype="float16", init_value=0) | |||
| with tik_instance.for_range(0, 2, thread_num=2) as db_idx: | |||
| input_1_local_UB = tik_instance.Tensor(dtype, [25088], name="input_1_local_UB", scope=tik.scope_ubuf) | |||
| T_transpose_local_UB = tik_instance.Tensor(dtype, [25088], name="T_transpose_local_UB", | |||
| scope=tik.scope_ubuf) | |||
| tik_instance.data_move(input_1_local_UB, input_x[block_idx * 50176 + 1568 * db_idx], 0, 16, 98, 98, 0) | |||
| with tik_instance.for_range(0, 98) as cc7: | |||
| with tik_instance.for_range(0, 16) as cc8: | |||
| tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16], | |||
| input_1_local_UB[1568 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) | |||
| tik_instance.data_move(res[block_idx * 50176 + 25088 * db_idx], T_transpose_local_UB, 0, 1, 1568, 0, 0) | |||
| elif tuple(input_x_shape) == (32, 128, 7, 7, 16) and tuple(perm) == (0, 2, 3, 1, 4) and dtype == "float16": | |||
| with tik_instance.for_range(0, 32, block_num=32) as block_idx: | |||
| with tik_instance.for_range(0, 7, thread_num=2) as cc1: | |||
| input_x_ub = tik_instance.Tensor(dtype, [1, 128, 1, 7, 16], name="input_1_local_UB", | |||
| scope=tik.scope_ubuf) | |||
| transpose_ub = tik_instance.Tensor(dtype, [1, 1, 7, 128, 16], name="transpose_local_UB", | |||
| scope=tik.scope_ubuf) | |||
| tik_instance.data_move(input_x_ub, input_x[block_idx, 0, cc1, 0, 0], 0, 128, 7, 42, 0) | |||
| with tik_instance.for_range(0, 7) as cc7: | |||
| with tik_instance.for_range(0, 128) as cc8: | |||
| @@ -193,7 +241,7 @@ def CusTranspose02314(input_x, output, kernel_name="transpose021354"): | |||
| scope=tik.scope_ubuf) | |||
| tik_instance.data_move(input_x_ub, input_x[block_idx, 0, 0, 0, 0], 0, 1, 1568, 0, 0) | |||
| with tik_instance.for_range(0, 7) as cc1: | |||
| with tik_instance.for_range(0, 7) as cc2: | |||
| with tik_instance.for_range(0, 32) as cc3: | |||
| tik_instance.vadds(16, transpose_ub[0, cc1, cc2, cc3, 0], input_x_ub[0, cc3, cc1, cc2, 0], 0, | |||
| 1, 1, 1, 0, 0) | |||
| @@ -212,11 +260,12 @@ def CusTranspose02314(input_x, output, kernel_name="transpose021354"): | |||
| tik_instance.vadds(16, transpose_ub[0, cc2, cc3, cc4, 0], input_x_ub[0, cc4, cc2, cc3, 0], | |||
| 0, 1, 1, 1, 0, 0) | |||
| tik_instance.data_move(res[block_idx * 100352 + split_index * 2 * 7168], transpose_ub, 0, 1, 896, 0, 0) | |||
| with tik_instance.for_range(0, 32, block_num=32) as block_idx: | |||
| with tik_instance.for_range(0, 6, thread_num=2) as cc1: | |||
| _inner_compute(cc1) | |||
| _inner_compute(6) | |||
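| # Pattern note: the loop above runs _inner_compute for split indices 0..5 | |||
| # with thread_num=2 (double buffering); the final split is handled by the | |||
| # explicit _inner_compute(6) tail call outside the loop. | |||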
| elif tuple(input_x_shape) == (32, 64, 14, 14, 16) and tuple(perm) == (0, 2, 3, 1, 4) and dtype == "float16": | |||
| def _inner_compute(split_index, block_idx): | |||
| input_x_ub = tik_instance.Tensor(dtype, [1, 64, 2, 14, 16], name="input_1_local_UB", | |||
| scope=tik.scope_ubuf) | |||
| @@ -229,6 +278,7 @@ def CusTranspose02314(input_x, output, kernel_name="transpose021354"): | |||
| tik_instance.vadds(16, transpose_ub[0, cc2, cc3, cc4, 0], input_x_ub[0, cc4, cc2, cc3, 0], | |||
| 0, 1, 1, 1, 0, 0) | |||
| tik_instance.data_move(res[block_idx * 200704 + split_index * 2 * 14336], transpose_ub, 0, 1, 1792, 0, 0) | |||
| with tik_instance.for_range(0, 32, block_num=32) as block_idx: | |||
| with tik_instance.for_range(0, 6, thread_num=2) as cc1: | |||
| _inner_compute(cc1, block_idx) | |||
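| # Same multi-core pattern as the previous branch: block_num=32 spreads work | |||
| # across 32 cores and thread_num=2 double-buffers the six pipelined splits; | |||
| # a tail call for the seventh split presumably follows, as in the branch | |||
| # above. | |||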