From 2d0ee05454eccafcd35f303b69575a36d6c33330 Mon Sep 17 00:00:00 2001 From: z00478463 Date: Tue, 26 May 2020 09:14:41 +0800 Subject: [PATCH] for pylint 2nd --- mindspore/ops/_op_impl/_custom_op/__init__.py | 11 - .../_op_impl/_custom_op/batch_matmul_impl.py | 163 +- .../_op_impl/_custom_op/cholesky_trsm_impl.py | 66 +- .../_custom_op/fused_abs_max1_impl.py | 864 +++++---- .../ops/_op_impl/_custom_op/img2col_impl.py | 1641 +++++++++-------- .../_custom_op/matmul_cube_dense_left_impl.py | 182 +- .../matmul_cube_dense_right_impl.py | 155 +- .../matmul_cube_fracz_left_cast_impl.py | 158 +- .../matmul_cube_fracz_right_mul_impl.py | 64 +- .../_op_impl/_custom_op/matmul_cube_impl.py | 95 +- .../_custom_op/matrix_combine_impl.py | 43 +- .../_custom_op/transpose02314_impl.py | 206 ++- 12 files changed, 2056 insertions(+), 1592 deletions(-) diff --git a/mindspore/ops/_op_impl/_custom_op/__init__.py b/mindspore/ops/_op_impl/_custom_op/__init__.py index d22ac30635..5fe583a60f 100644 --- a/mindspore/ops/_op_impl/_custom_op/__init__.py +++ b/mindspore/ops/_op_impl/_custom_op/__init__.py @@ -14,14 +14,3 @@ # ============================================================================ """custom ops""" -from .batch_matmul_impl import CusBatchMatMul -from .cholesky_trsm_impl import CusCholeskyTrsm -from .fused_abs_max1_impl import CusFusedAbsMax1 -from .img2col_impl import CusImg2Col -from .matmul_cube_dense_left_impl import CusMatMulCubeDenseLeft -from .matmul_cube_dense_right_impl import CusMatMulCubeDenseRight -from .matmul_cube_fracz_left_cast_impl import CusMatMulCubeFraczLeftCast -from .matmul_cube_fracz_right_mul_impl import CusMatMulCubeFraczRightMul -from .matmul_cube_impl import CusMatMulCube -from .matrix_combine_impl import CusMatrixCombine -from .transpose02314_impl import CusTranspose02314 diff --git a/mindspore/ops/_op_impl/_custom_op/batch_matmul_impl.py b/mindspore/ops/_op_impl/_custom_op/batch_matmul_impl.py index ddbc26a03d..d8395c1e81 100644 --- 
a/mindspore/ops/_op_impl/_custom_op/batch_matmul_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/batch_matmul_impl.py @@ -14,29 +14,31 @@ # ============================================================================ """batch_matmul_impl""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType from te import tik from topi.cce import util -from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType - + cus_batchmatmul_op_info = TBERegOp("CusBatchMatMul") \ - .fusion_type("OPAQUE") \ - .async_flag(False) \ - .binfile_name("batchmatmul.so") \ - .compute_cost(10) \ - .kernel_name("CusBatchMatMul") \ - .partial_flag(True) \ - .input(0, "x1", False, "required", "all") \ - .input(1, "x2", False, "required", "all") \ - .output(0, "y", False, "required", "all") \ - .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ - .get_op_info() - + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("batchmatmul.so") \ + .compute_cost(10) \ + .kernel_name("CusBatchMatMul") \ + .partial_flag(True) \ + .input(0, "x1", False, "required", "all") \ + .input(1, "x2", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ + .get_op_info() + + def _get_flattern_shape(shape): flattern_shape = 1 for dim in shape: flattern_shape *= dim return (flattern_shape,) - + + def _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index, res, res_index): input_1_local_UB = tik_instance.Tensor(dtype, [128], name="input_1_local_UB", scope=tik.scope_ubuf) t_1_0_local_UB = tik_instance.Tensor(dtype, [64 * 128], name="t_1_0_local_UB", scope=tik.scope_ubuf) @@ -66,12 +68,13 @@ def _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_ matmul_hybrid_f_t_local_UB, 1, 1, 1, 1, 8, 8, 8) tik_instance.data_move(res[res_index + thread_idx2 * 64], matmul_hybrid_f_t_local_UB, 0, 1, 8, 0, 
0) - + + def _inner_matmul_new_1_64_32_64(tik_instance, dtype, input1, input1_index, input2, input2_index, res, res_index): input_1_local_UB = tik_instance.Tensor(dtype, [64], name="input_1_local_UB", scope=tik.scope_ubuf) tik_instance.data_move(input_1_local_UB, input1[input1_index], 0, 1, 8, 0, 0) with tik_instance.for_range(0, 2, thread_num=2) as thread_idx2: - input_2_local_UB = tik_instance.Tensor(dtype, [32*64], name="input_2_local_UB", + input_2_local_UB = tik_instance.Tensor(dtype, [32 * 64], name="input_2_local_UB", scope=tik.scope_ubuf) t_1_local_UB = input_2_local_UB matmul_hybrid_f_t_local_UB = tik_instance.Tensor(dtype, [32], name="matmul_hybrid_f_t_local_UB", @@ -83,6 +86,8 @@ def _inner_matmul_new_1_64_32_64(tik_instance, dtype, input1, input1_index, inpu 1, 1, 1, 8) tik_instance.data_move(res[res_index + thread_idx2 * 32], matmul_hybrid_f_t_local_UB, 0, 1, 4, 0, 0) + + @op_info_register(cus_batchmatmul_op_info) def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=True, kernel_name="batchmatmul"): if util.get_product_version() == util.VERSION_MINI: @@ -97,51 +102,54 @@ def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=Tr dtype, input_x2.get("dtype").lower())) input_shape = (tuple(x1_shape), tuple(x2_shape), dtype, transpose_a, transpose_b) support_shape = [((8, 128, 128), (8, 128, 128), "float32", False, True), - ((36, 128, 128), (36, 128, 128), "float32", False, True), - ((5, 128, 128), (5, 128, 128), "float32", False, True), - ((18, 128, 128), (18, 128, 128), "float32", False, True), - ((16, 128, 128), (16, 128, 128), "float32", False, True), - ((9, 128, 128), (9, 128, 128), "float32", False, True), - ((1, 64, 64), (1, 64, 64), "float32", False, True), - ((1, 128, 128), (1, 128, 128), "float32", False, True), - ((4, 128, 128), (4, 128, 128), "float32", False, True), - ((2, 128, 128), (2, 128, 128), "float32", False, True)] + ((36, 128, 128), (36, 128, 128), "float32", False, True), + ((5, 128, 
128), (5, 128, 128), "float32", False, True), + ((18, 128, 128), (18, 128, 128), "float32", False, True), + ((16, 128, 128), (16, 128, 128), "float32", False, True), + ((9, 128, 128), (9, 128, 128), "float32", False, True), + ((1, 64, 64), (1, 64, 64), "float32", False, True), + ((1, 128, 128), (1, 128, 128), "float32", False, True), + ((4, 128, 128), (4, 128, 128), "float32", False, True), + ((2, 128, 128), (2, 128, 128), "float32", False, True)] if input_shape not in support_shape: raise RuntimeError("input_shape %s is not supported" % str(input_shape)) - - + # if not transpose_a and transpose_b: batch, m, k = x1_shape _, n, _ = x2_shape - + input1_shape = _get_flattern_shape(x1_shape) input1 = tik_instance.Tensor(dtype, input1_shape, name="input1", scope=tik.scope_gm) input2_shape = _get_flattern_shape(x2_shape) input2 = tik_instance.Tensor(dtype, input2_shape, name="input2", scope=tik.scope_gm) - + output_shape = x1_shape res_shape = _get_flattern_shape(output_shape) res = tik_instance.Tensor(dtype, res_shape, name="res", scope=tik.scope_gm) - + if input_shape == ((36, 128, 128), (36, 128, 128), "float32", False, True): with tik_instance.for_range(0, 18, block_num=18) as block_idx: with tik_instance.for_range(0, 2) as cc0: with tik_instance.for_range(0, 128, thread_num=2) as cc1: - input1_index = block_idx * 32768 + cc0*16384 + cc1 * 128 - input2_index = block_idx * 32768 + cc0*16384 - res_index = block_idx*32768 + cc0*16384 + cc1*128 + input1_index = block_idx * 32768 + cc0 * 16384 + cc1 * 128 + input2_index = block_idx * 32768 + cc0 * 16384 + res_index = block_idx * 32768 + cc0 * 16384 + cc1 * 128 _inner_matmul_new(tik_instance, dtype, - input1, input1_index, - input2, input2_index, - res, res_index) + input1, input1_index, + input2, input2_index, + res, res_index) if input_shape == ((5, 128, 128), (5, 128, 128), "float32", False, True): with tik_instance.for_range(0, 30, block_num=30) as block_idx: with tik_instance.for_range(0, 11) as cc1_db: with 
tik_instance.for_range(0, 2, thread_num=2) as thread_idx: with tik_instance.if_scope(((((block_idx % 6) * 22) + (cc1_db * 2) + thread_idx) < 128)): - input_1_local_UB = tik_instance.Tensor(dtype, [128], name="input_1_local_UB", scope=tik.scope_ubuf) - t_1_0_local_UB = tik_instance.Tensor(dtype, [64 * 128], name="t_1_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_1_local_UB, input1[(block_idx//6)*16384 + (block_idx % 6)*2816 + cc1_db * 256 + thread_idx*128], 0, 1, 16, 0, 0) + input_1_local_UB = tik_instance.Tensor(dtype, [128], name="input_1_local_UB", + scope=tik.scope_ubuf) + t_1_0_local_UB = tik_instance.Tensor(dtype, [64 * 128], name="t_1_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_1_local_UB, input1[ + (block_idx // 6) * 16384 + (block_idx % 6) * 2816 + cc1_db * 256 + thread_idx * 128], 0, 1, + 16, 0, 0) with tik_instance.for_range(0, 2) as vec_i: tik_instance.vadds(64, t_1_0_local_UB[vec_i * 64], input_1_local_UB[vec_i * 64], 0, 64, 1, 1, 16, 0) @@ -150,58 +158,61 @@ def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=Tr scope=tik.scope_ubuf) t_1_local_UB = input_2_local_UB bisec_last_axis_local_UB = input_2_local_UB - matmul_hybrid_f_t_local_UB = tik_instance.Tensor(dtype, [64], name="matmul_hybrid_f_t_local_UB", + matmul_hybrid_f_t_local_UB = tik_instance.Tensor(dtype, [64], + name="matmul_hybrid_f_t_local_UB", scope=tik.scope_ubuf) matmul_hybrid_f_t_local_UB_dst_tmp = tik_instance.Tensor(dtype, [64], name="matmul_hybrid_f_t_local_UB_dst_tmp", scope=tik.scope_ubuf) tik_instance.vector_dup(64, matmul_hybrid_f_t_local_UB, 0, 1, 1, 8) - tik_instance.data_move(input_2_local_UB, input2[(block_idx//6) * 16384 + thread_idx2*8192], 0, 1, + tik_instance.data_move(input_2_local_UB, + input2[(block_idx // 6) * 16384 + thread_idx2 * 8192], 0, 1, 1024, 0, 0) tik_instance.vmul(64, t_1_local_UB, t_1_0_local_UB, input_2_local_UB, 128, 1, 1, 1, 8, 8, 8) tik_instance.vadd(64, bisec_last_axis_local_UB, 
t_1_local_UB, t_1_local_UB[64], 64, 1, 1, 1, 16, 16, 16) tik_instance.vector_dup(64, matmul_hybrid_f_t_local_UB_dst_tmp, 0, 1, 1, 8) with tik_instance.for_range(0, 64) as cc6: - tik_instance.vcadd(64, matmul_hybrid_f_t_local_UB_dst_tmp[cc6], bisec_last_axis_local_UB[cc6*128], + tik_instance.vcadd(64, matmul_hybrid_f_t_local_UB_dst_tmp[cc6], + bisec_last_axis_local_UB[cc6 * 128], 1, 1, 1, 8) tik_instance.vadd(64, matmul_hybrid_f_t_local_UB, matmul_hybrid_f_t_local_UB_dst_tmp, matmul_hybrid_f_t_local_UB, 1, 1, 1, 1, 8, 8, 8) - tik_instance.data_move(res[(block_idx//6)*16384 + (block_idx%6)*2816 + cc1_db*256 + - thread_idx*128 + thread_idx2*64], - matmul_hybrid_f_t_local_UB, 0, 1, 8, 0, 0) - + tik_instance.data_move( + res[(block_idx // 6) * 16384 + (block_idx % 6) * 2816 + cc1_db * 256 + + thread_idx * 128 + thread_idx2 * 64], + matmul_hybrid_f_t_local_UB, 0, 1, 8, 0, 0) + if input_shape == ((18, 128, 128), (18, 128, 128), "float32", False, True): with tik_instance.for_range(0, 18, block_num=18) as block_idx: with tik_instance.for_range(0, 128, thread_num=2) as cc0: input1_index = block_idx * 16384 + cc0 * 128 input2_index = block_idx * 16384 - res_index = block_idx*16384 + cc0*128 + res_index = block_idx * 16384 + cc0 * 128 _inner_matmul_new(tik_instance, dtype, - input1, input1_index, - input2, input2_index, - res, res_index) - - + input1, input1_index, + input2, input2_index, + res, res_index) + if input_shape == ((9, 128, 128), (9, 128, 128), "float32", False, True): with tik_instance.for_range(0, 27, block_num=27) as block_idx: with tik_instance.for_range(0, 42, thread_num=2) as cc0: - input1_index = (block_idx//3) * 16384 + (block_idx % 3)*5504 + cc0 * 128 - input2_index = (block_idx//3) * 16384 - res_index = (block_idx//3) * 16384 + (block_idx % 3)*5504 + cc0*128 + input1_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + cc0 * 128 + input2_index = (block_idx // 3) * 16384 + res_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + cc0 * 128 
_inner_matmul_new(tik_instance, dtype, - input1, input1_index, - input2, input2_index, - res, res_index) + input1, input1_index, + input2, input2_index, + res, res_index) with tik_instance.if_scope((block_idx % 3) < 2): - input1_index = (block_idx//3) * 16384 + (block_idx % 3)*5504 + 42*128 + input1_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + 42 * 128 input2_index = (block_idx // 3) * 16384 - res_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + 42*128 + res_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + 42 * 128 _inner_matmul_new(tik_instance, dtype, - input1, input1_index, - input2, input2_index, - res, res_index) - + input1, input1_index, + input2, input2_index, + res, res_index) + if input_shape == ((1, 64, 64), (1, 64, 64), "float32", False, True): with tik_instance.for_range(0, 32, block_num=32) as block_idx: with tik_instance.for_range(0, 2, thread_num=2) as cc0: @@ -209,35 +220,35 @@ def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=Tr input2_index = 0 res_index = block_idx * 128 + cc0 * 64 _inner_matmul_new_1_64_32_64(tik_instance, dtype, - input1, input1_index, - input2, input2_index, - res, res_index) - + input1, input1_index, + input2, input2_index, + res, res_index) + input_shape_list = [((1, 128, 128), (1, 128, 128), "float32", False, True), ((2, 128, 128), (2, 128, 128), "float32", False, True), ((4, 128, 128), (4, 128, 128), "float32", False, True), ((8, 128, 128), (8, 128, 128), "float32", False, True), ((16, 128, 128), (16, 128, 128), "float32", False, True) - ] + ] if input_shape in input_shape_list: block_num = 32 input1_unit_size = 128 - input2_unint_size = 128*128 + input2_unint_size = 128 * 128 with tik_instance.for_range(0, block_num, block_num=block_num) as block_idx: block_process_ele_num = (batch * m * k) // block_num - loop_time = (batch*m*k)//block_num//input1_unit_size + loop_time = (batch * m * k) // block_num // input1_unit_size thread_num = 2 with 
tik_instance.for_range(0, loop_time, thread_num=thread_num) as cc0: - input1_index = block_idx*block_process_ele_num + cc0*input1_unit_size + input1_index = block_idx * block_process_ele_num + cc0 * input1_unit_size if batch > 1: - input2_index = block_idx//(block_num//batch) * input2_unint_size + input2_index = block_idx // (block_num // batch) * input2_unint_size else: input2_index = 0 - res_index = block_idx*block_process_ele_num + cc0*input1_unit_size + res_index = block_idx * block_process_ele_num + cc0 * input1_unit_size _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index, res, res_index) - + tik_instance.BuildCCE(kernel_name, inputs=[input1, input2], outputs=[res]) return tik_instance diff --git a/mindspore/ops/_op_impl/_custom_op/cholesky_trsm_impl.py b/mindspore/ops/_op_impl/_custom_op/cholesky_trsm_impl.py index 9092b8a9ff..50830fe0f6 100644 --- a/mindspore/ops/_op_impl/_custom_op/cholesky_trsm_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/cholesky_trsm_impl.py @@ -13,24 +13,25 @@ # limitations under the License. 
# ============================================================================ """CusCholeskyTrsm""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType from te import tik from topi.cce import util -from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType cus_cholesky_trsm_op_info = TBERegOp("CusCholeskyTrsm") \ - .fusion_type("OPAQUE") \ - .async_flag(False) \ - .binfile_name("choleskytrsm.so") \ - .compute_cost(10) \ - .kernel_name("CusCholeskyTrsm") \ - .partial_flag(True) \ - .input(0, "x1", False, "required", "all") \ - .output(0, "y", False, "required", "all") \ - .dtype_format(DataType.F32_Default, DataType.F32_Default) \ - .get_op_info() + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("choleskytrsm.so") \ + .compute_cost(10) \ + .kernel_name("CusCholeskyTrsm") \ + .partial_flag(True) \ + .input(0, "x1", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F32_Default, DataType.F32_Default) \ + .get_op_info() + @op_info_register(cus_cholesky_trsm_op_info) -def CusCholeskyTrsm(input_x,output, kernel_name): +def CusCholeskyTrsm(input_x, output, kernel_name): input_x_shape = input_x.get("shape") output_shape = output.get("shape") split_dim = 128 @@ -47,34 +48,36 @@ def CusCholeskyTrsm(input_x,output, kernel_name): input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (split_dim,split_dim), name="input_x_ub", scope=tik.scope_ubuf) - temp_ub = tik_instance.Tensor("float32", (split_dim,split_dim), name="temp_ub", scope=tik.scope_ubuf) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (split_dim, split_dim), name="input_x_ub", 
scope=tik.scope_ubuf) + temp_ub = tik_instance.Tensor("float32", (split_dim, split_dim), name="temp_ub", scope=tik.scope_ubuf) assist_1_ub = tik_instance.Tensor("float32", (split_dim,), name="assist_1_ub", scope=tik.scope_ubuf) assist_2_ub = tik_instance.Tensor("float32", (split_dim,), name="assist_2_ub", scope=tik.scope_ubuf) - with tik_instance.for_range(0,split_dim) as i: - tik_instance.data_move(input_x_ub[i,0], input_x[block_index * split_dim + i, block_index * split_dim], 0, 1, vector_repeat_times * 8, 0, 0) - scalar1 = tik_instance.Scalar("float32", init_value = -0.5) + with tik_instance.for_range(0, split_dim) as i: + tik_instance.data_move(input_x_ub[i, 0], input_x[block_index * split_dim + i, block_index * split_dim], 0, + 1, vector_repeat_times * 8, 0, 0) + scalar1 = tik_instance.Scalar("float32", init_value=-0.5) with tik_instance.for_range(0, split_dim) as i: - scalar2= tik_instance.Scalar("float32") - tik_instance.vln(64, assist_1_ub[0], input_x_ub[i,0], vector_repeat_times, 1, 1, 8, 8) + scalar2 = tik_instance.Scalar("float32") + tik_instance.vln(64, assist_1_ub[0], input_x_ub[i, 0], vector_repeat_times, 1, 1, 8, 8) tik_instance.vmuls(64, assist_2_ub[0], assist_1_ub[0], scalar1, vector_repeat_times, 1, 1, 8, 8) tik_instance.vexp(64, assist_1_ub[0], assist_2_ub[0], vector_repeat_times, 1, 1, 8, 8) scalar2.set_as(assist_1_ub[i]) - tik_instance.vmuls(64, input_x_ub[i,0], input_x_ub[i,0], scalar2, vector_repeat_times, 1, 1, 8, 8) + tik_instance.vmuls(64, input_x_ub[i, 0], input_x_ub[i, 0], scalar2, vector_repeat_times, 1, 1, 8, 8) with tik_instance.for_range(i + 1, split_dim) as j: - scalar3= tik_instance.Scalar("float32") + scalar3 = tik_instance.Scalar("float32") scalar3.set_as(input_x_ub[i, j]) - tik_instance.vmuls(64,temp_ub[j, 0], input_x_ub[i, 0], scalar3, vector_repeat_times, 1, 1, 8, 8) - tik_instance.vsub(64,input_x_ub[i+1,0], input_x_ub[i+1,0], temp_ub[i+1,0], (split_dim-1-i) * vector_repeat_times, 1, 1, 1, 8, 8, 8) + tik_instance.vmuls(64, 
temp_ub[j, 0], input_x_ub[i, 0], scalar3, vector_repeat_times, 1, 1, 8, 8) + tik_instance.vsub(64, input_x_ub[i + 1, 0], input_x_ub[i + 1, 0], temp_ub[i + 1, 0], + (split_dim - 1 - i) * vector_repeat_times, 1, 1, 1, 8, 8, 8) zero = tik_instance.Scalar("float32") zero.set_as(0.0) one = tik_instance.Scalar("float32") one.set_as(1.0) with tik_instance.for_range(0, split_dim) as i: - tik_instance.vector_dup(64, temp_ub[i,0], zero, vector_repeat_times, 1, 8) + tik_instance.vector_dup(64, temp_ub[i, 0], zero, vector_repeat_times, 1, 8) temp_ub.__setitem__(i * split_dim + i, one) chol_diag_element_final = tik_instance.Scalar("float32") @@ -89,16 +92,19 @@ def CusCholeskyTrsm(input_x,output, kernel_name): with tik_instance.for_range(0, i) as j: chol_diag_element_loop = tik_instance.Scalar("float32") chol_diag_element_loop.set_as(input_x_ub[index, index + 1 + j]) - tik_instance.vmuls(64, assist_2_ub, temp_ub[j + index + 1, 0], chol_diag_element_loop, vector_repeat_times,1,1,8,8) - tik_instance.vadd(64, assist_1_ub, assist_2_ub, assist_1_ub, vector_repeat_times,1,1,1,8,8,8) + tik_instance.vmuls(64, assist_2_ub, temp_ub[j + index + 1, 0], chol_diag_element_loop, + vector_repeat_times, 1, 1, 8, 8) + tik_instance.vadd(64, assist_1_ub, assist_2_ub, assist_1_ub, vector_repeat_times, 1, 1, 1, 8, 8, 8) temp_scalar = tik_instance.Scalar("float32") temp_scalar.set_as(input_x_ub[index, index]) chol_diag_element = tik_instance.Scalar("float32") chol_diag_element.set_as(1.0 / temp_scalar) - tik_instance.vsub(64,temp_ub[index, 0], temp_ub[index, 0], assist_1_ub, vector_repeat_times,1,1,1,8,8,8) - tik_instance.vmuls(64, temp_ub[index, 0], temp_ub[index, 0], chol_diag_element,vector_repeat_times,1,1,8,8) + tik_instance.vsub(64, temp_ub[index, 0], temp_ub[index, 0], assist_1_ub, vector_repeat_times, 1, 1, 1, 8, 8, + 8) + tik_instance.vmuls(64, temp_ub[index, 0], temp_ub[index, 0], chol_diag_element, vector_repeat_times, 1, 1, + 8, 8) - tik_instance.data_move(res[block_index,0,0], temp_ub, 
0, 1, 8 * vector_repeat_times * split_dim,0,0) + tik_instance.data_move(res[block_index, 0, 0], temp_ub, 0, 1, 8 * vector_repeat_times * split_dim, 0, 0) tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x], outputs=[res]) return tik_instance diff --git a/mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py b/mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py index 2092c658c2..0c47ce78b1 100644 --- a/mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py @@ -13,25 +13,26 @@ # limitations under the License. # ============================================================================ """CusFusedAbsMax1""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType from te import tik from topi.cce import util -from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType cus_fused_abs_max1_op_info = TBERegOp("CusFusedAbsMax1") \ - .fusion_type("OPAQUE") \ - .async_flag(False) \ - .binfile_name("fusedabsmax1.so") \ - .compute_cost(10) \ - .kernel_name("CusFusedAbsMax1") \ - .partial_flag(True) \ - .attr("origin_shape", "required", "listInt", "all") \ - .input(0, "x1", False, "required", "all") \ - .output(0, "y", False, "required", "all") \ - .dtype_format(DataType.F32_Default, DataType.F32_Default) \ - .get_op_info() + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("fusedabsmax1.so") \ + .compute_cost(10) \ + .kernel_name("CusFusedAbsMax1") \ + .partial_flag(True) \ + .attr("origin_shape", "required", "listInt", "all") \ + .input(0, "x1", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F32_Default, DataType.F32_Default) \ + .get_op_info() + @op_info_register(cus_fused_abs_max1_op_info) -def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs_max1"): +def CusFusedAbsMax1(input_x, output, origin_shape=None, kernel_name="fused_abs_max1"): input_x_shape = 
input_x.get("shape") output_shape = output.get("shape") @@ -40,9 +41,9 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs else: tik_instance = tik.Tik(tik.Dprofile("v100", "cloud")) - if len(input_x_shape) > 2: - if (input_x_shape[0] == 1 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or (input_x_shape[0] == 4 and input_x_shape[1] == 16) or (input_x_shape[0] == 16 and input_x_shape[1] == 4): + if (input_x_shape[0] == 1 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or ( + input_x_shape[0] == 4 and input_x_shape[1] == 16) or (input_x_shape[0] == 16 and input_x_shape[1] == 4): input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) total_elements = 1 @@ -50,28 +51,39 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) repeat_time = each_block_element // 64 - tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time,1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub, 
input_x_ub, repeat_time, 1, 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) tik_instance.data_move(res[block_index, 0], 
broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif (input_x_shape[0] == 2 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or (input_x_shape[0] == 16 and input_x_shape[1] == 8): - if origin_shape[0] == 147 and (input_x_shape[0] == 2 and input_x_shape[1] == 128 and input_x_shape[2] == 128) : + elif (input_x_shape[0] == 2 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or ( + input_x_shape[0] == 16 and input_x_shape[1] == 8): + if origin_shape[0] == 147 and ( + input_x_shape[0] == 2 and input_x_shape[1] == 128 and input_x_shape[2] == 128): assert origin_shape[0] == 147 assert origin_shape[1] == 147 phase_1 = 16384 @@ -80,9 +92,11 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs each_block_element = phase_1 // blocks + 64 input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) tik_instance.data_move(input_x_ub, input_x[512 * block_index], 0, 1, 512 // 8, 0, 0) line_id = block_index % 19 tik_instance.data_move(input_x_ub[512], input_x[16384 + 128 * line_id], 0, 1, 8, 0, 0) @@ -92,16 +106,22 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, 
input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, + 1, 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) else: input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) @@ -111,28 +131,38 @@ def 
CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, 
broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, + 1, 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif (input_x_shape[0] == 4 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or (input_x_shape[0] == 8 and input_x_shape[1] == 32) or (input_x_shape[0] == 32 and input_x_shape[1] == 8): + elif (input_x_shape[0] == 4 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or ( + input_x_shape[0] == 8 and input_x_shape[1] == 32) or (input_x_shape[0] == 32 and input_x_shape[1] == 8): input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) total_elements = 1 @@ -140,10 +170,13 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with 
tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) @@ -151,36 +184,50 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - 
tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif (input_x_shape[0] == 8 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or (input_x_shape[0] == 32 and input_x_shape[1] == 16) or (input_x_shape[0] == 16 and input_x_shape[1] == 32): - if (input_x_shape[0] == 8 and input_x_shape[1] == 128 and input_x_shape[2] == 128) and origin_shape[0] == 1000: + elif (input_x_shape[0] == 8 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or ( + input_x_shape[0] == 32 and input_x_shape[1] == 16) or ( + input_x_shape[0] == 16 and input_x_shape[1] == 32): + if (input_x_shape[0] == 8 and input_x_shape[1] == 128 and input_x_shape[2] == 128) and origin_shape[ + 0] == 1000: input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) blocks = 32 each_block_element = 7 * 128 * 128 // 32 + 
4 * 128 phase_1 = 7 * 128 * 128 // 32 - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) tik_instance.data_move(input_x_ub, input_x[phase_1 * block_index], 0, 1, phase_1 // 8, 0, 0) - tik_instance.data_move(input_x_ub[phase_1], input_x[114688 + block_index * 384], 0, 1, 384 // 8, 0, 0) + tik_instance.data_move(input_x_ub[phase_1], input_x[114688 + block_index * 384], 0, 1, 384 // 8, 0, + 0) move_idx = block_index % 8 - tik_instance.data_move(input_x_ub[phase_1 + 384], input_x[114688 + 96 * 128 + move_idx * 128], 0, 1, 128 // 8, 0, 0) + tik_instance.data_move(input_x_ub[phase_1 + 384], input_x[114688 + 96 * 128 + move_idx * 128], 0, 1, + 128 // 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) vmask = 1000 - 7 * 128 - 64 - with tik_instance.for_range(0, 4) as loop_idx : - tik_instance.vmax(vmask, input_x_ub[3584 + 128 * loop_idx], input_x_ub[3584 + 128 * loop_idx], input_x_ub[3584 + 128 * loop_idx + 64], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0, 4) as loop_idx: + tik_instance.vmax(vmask, input_x_ub[3584 + 128 * loop_idx], input_x_ub[3584 + 128 * loop_idx], + input_x_ub[3584 + 128 * loop_idx + 64], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub[512], input_x_ub[2048], 24, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, 
input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) @@ -189,38 +236,52 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) with tik_instance.for_range(0, 4) as loop_idx: - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[3584 + 128 * loop_idx], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[3584 + 128 * loop_idx], 1, 1, 1, 1, 8, + 8, 8) + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 
1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, + 1, 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif (input_x_shape[0] == 8 and input_x_shape[1] == 128 and input_x_shape[2] == 128) and origin_shape[0] == 1001: + elif (input_x_shape[0] == 8 and input_x_shape[1] == 128 and input_x_shape[2] == 128) and origin_shape[ + 0] == 1001: input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) blocks = 32 each_block_element = 7 * 128 * 128 // 32 + 4 * 128 phase_1 = 7 * 128 * 128 // 32 - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) tik_instance.data_move(input_x_ub, input_x[phase_1 * block_index], 0, 1, phase_1 // 8, 0, 0) - tik_instance.data_move(input_x_ub[phase_1], input_x[114688 + block_index * 384], 0, 1, 384 // 8, 0, 0) - tik_instance.data_move(input_x_ub[phase_1], input_x[114688 + block_index * 384], 0, 1, 384 // 8, 0, 0) + tik_instance.data_move(input_x_ub[phase_1], input_x[114688 + block_index * 384], 0, 1, 384 // 8, 0, + 0) + tik_instance.data_move(input_x_ub[phase_1], input_x[114688 + block_index * 384], 0, 1, 384 // 8, 0, + 0) move_idx = block_index % 9 - tik_instance.data_move(input_x_ub[phase_1 + 384], input_x[114688 + 96 * 128 + move_idx * 128], 0, 1, 128 // 8, 0, 0) + 
tik_instance.data_move(input_x_ub[phase_1 + 384], input_x[114688 + 96 * 128 + move_idx * 128], 0, 1, + 128 // 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) vmask = 1001 - 7 * 128 - 64 with tik_instance.for_range(0, 4) as loop_idx: - tik_instance.vmax(vmask, input_x_ub[3584 + 128 * loop_idx], input_x_ub[3584 + 128 * loop_idx], input_x_ub[3584 + 128 * loop_idx + 64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(vmask, input_x_ub[3584 + 128 * loop_idx], input_x_ub[3584 + 128 * loop_idx], + input_x_ub[3584 + 128 * loop_idx + 64], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub[512], input_x_ub[2048], 24, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) @@ -228,17 +289,24 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) with tik_instance.for_range(0, 4) as loop_idx: - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[3584 + 128 * loop_idx], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[3584 + 128 * loop_idx], 1, 1, 1, 1, 8, + 8, 8) + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 
8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, + 1, 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) else: input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) @@ -248,10 +316,13 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), 
name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) @@ -260,18 +331,26 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, + 
1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, + 1, 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif (input_x_shape[0] == 16 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or (input_x_shape[0] == 16 and input_x_shape[1] == 64) or (input_x_shape[0] == 64 and input_x_shape[1] == 16): + elif (input_x_shape[0] == 16 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or ( + input_x_shape[0] == 16 and input_x_shape[1] == 64) or ( + input_x_shape[0] == 64 and input_x_shape[1] == 16): input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) total_elements = 1 @@ -279,10 +358,13 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = 
tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) @@ -292,16 +374,22 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, 
broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) elif input_x_shape[0] == 5 and input_x_shape[1] == 128 and input_x_shape[2] == 128 and origin_shape[0] == 576: input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) @@ -311,33 +399,43 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs each_block_element = total_elements // blocks phase_1 = 2048 phase_2 = 128 - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) tik_instance.data_move(input_x_ub, input_x[phase_1 * block_index], 0, 1, phase_1 // 8, 0, 0) tik_instance.data_move(input_x_ub[phase_1], input_x[65536 + phase_2 * block_index * 2], 0, 1, 8, 0, 0) - tik_instance.data_move(input_x_ub[phase_1 + 64], input_x[65536 + 128 + phase_2 * block_index * 2], 0, 1, 8, 0, 0) + tik_instance.data_move(input_x_ub[phase_1 + 64], input_x[65536 + 128 + phase_2 * block_index * 2], 0, 1, + 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, 
input_x_ub, repeat_time, 1, 1, 8, 8) - tik_instance.vmax(64, input_x_ub[2048], input_x_ub[2048], input_x_ub[2048+64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[2048], input_x_ub[2048], input_x_ub[2048 + 64], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, 
broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif (input_x_shape[0] == 9 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or (input_x_shape[0] == 72 and input_x_shape[1] == 8): + elif (input_x_shape[0] == 9 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or ( + input_x_shape[0] == 72 and input_x_shape[1] == 8): input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) total_elements = 1 @@ -345,10 +443,13 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element 
// 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) @@ -357,20 +458,26 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[4096], input_x_ub[4096], input_x_ub[4096+256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[4096], input_x_ub[4096], input_x_ub[4096+128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[4096], input_x_ub[4096], input_x_ub[4096+64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[4096], input_x_ub[4096], input_x_ub[4096 + 256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[4096], input_x_ub[4096], input_x_ub[4096 + 128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[4096], input_x_ub[4096], input_x_ub[4096 + 64], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - 
tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) elif input_x_shape[0] == 18 and input_x_shape[1] == 128 and input_x_shape[2] == 128: input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) @@ -380,10 +487,13 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + 
broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) @@ -393,23 +503,30 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[8192], input_x_ub[8192], input_x_ub[8192+512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[8192], input_x_ub[8192], input_x_ub[8192+256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[8192], input_x_ub[8192], input_x_ub[8192+128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[8192], input_x_ub[8192], input_x_ub[8192+64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[8192], input_x_ub[8192], input_x_ub[8192 + 512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[8192], input_x_ub[8192], input_x_ub[8192 + 256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[8192], input_x_ub[8192], input_x_ub[8192 + 128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[8192], input_x_ub[8192], input_x_ub[8192 + 64], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, 
broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif (input_x_shape[0] == 36 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or (input_x_shape[0] == 144 and input_x_shape[1] == 16): + elif (input_x_shape[0] == 36 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or ( + input_x_shape[0] == 144 and input_x_shape[1] == 16): input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) total_elements = 1 @@ -417,14 +534,18 @@ def CusFusedAbsMax1(input_x, output, 
origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) repeat_time_1 = 255 repeat_time_2 = each_block_element // 64 - 255 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_2, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_2, 1, + 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 128, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) @@ -433,22 +554,32 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384], 
input_x_ub[16384], input_x_ub[16384+1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384+512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384+256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384+128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384+64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384 + 1024], 16, 1, 1, 1, 8, 8, + 8) + tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384 + 512], 8, 1, 1, 1, 8, 8, + 8) + tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384 + 256], 4, 1, 1, 1, 8, 8, + 8) + tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384 + 128], 2, 1, 1, 1, 8, 8, + 8) + tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384 + 64], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, 
broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) elif input_x_shape[0] == 128 and input_x_shape[1] == 63: input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) @@ -458,32 +589,47 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) repeat_time_1 = 255 repeat_time_2 = each_block_element // 64 - 255 * 3 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], 
input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 3 * 64], input_x_ub[repeat_time_1 * 3 * 64], repeat_time_2, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, + 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], + repeat_time_1, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 3 * 64], input_x_ub[repeat_time_1 * 3 * 64], + repeat_time_2, 1, 1, 8, 8) loop_size = each_block_element // 16384 with tik_instance.for_range(0, loop_size) as loop_idx: - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 8192], 128, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 4096], 64, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + 
input_x_ub[16384 * loop_idx + 8192], 128, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 4096], 64, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 64], 1, 1, 1, 1, 8, 8, 8) with tik_instance.for_range(0, loop_size - 1) as loop_idx: - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384 * (loop_idx + 1)], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384 * (loop_idx + 1)], 1, 1, 1, 1, 8, 8, + 8) tail_element = each_block_element - 16384 * loop_size repeats = tail_element // 64 - with tik_instance.for_range(0, repeats) as i : - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384 * loop_size + i * 64], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, repeats) as i: + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384 * loop_size + i * 64], 1, 1, 1, 1, 8, + 8, 8) + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, input_x_ub[64 + cc0 * 64], data_temp, 1, 1, 8) @@ -494,7 +640,8 @@ 
def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub[64], input_x_ub[64], input_x_ub[128 + 64], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub[64], input_x_ub[64], input_x_ub[64 + 64], 1, 1, 1, 1, 8, 8, 8) tik_instance.data_move(res[block_index, 0], input_x_ub[64], 0, 1, 8, 0, 0) - elif (input_x_shape[0] == 32 and input_x_shape[1] == 128) or (input_x_shape[0] == 128 and input_x_shape[1] == 32): + elif (input_x_shape[0] == 32 and input_x_shape[1] == 128) or ( + input_x_shape[0] == 128 and input_x_shape[1] == 32): input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) total_elements = 1 @@ -502,37 +649,57 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) repeat_time_1 = 255 repeat_time_2 = each_block_element // 64 - 255 * 2 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 
64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], repeat_time_2, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, + 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], + repeat_time_2, 1, 1, 8, 8) loop_size = each_block_element // 16384 with tik_instance.for_range(0, loop_size) as loop_idx: - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 8192], 128, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 4096], 64, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 8192], 128, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 4096], 64, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * 
loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 64], 1, 1, 1, 1, 8, 8, 8) with tik_instance.for_range(0, loop_size - 1) as loop_idx: - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384 * (loop_idx + 1)], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384 * (loop_idx + 1)], 1, 1, 1, 1, 8, 8, + 8) + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, 
broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) elif input_x_shape[0] == 288 and input_x_shape[1] == 32: input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) @@ -542,20 +709,23 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: assist_ub = tik_instance.Tensor("float32", (64,), name="assist_ub", scope=tik.scope_ubuf) zero = tik_instance.Scalar("float32") zero.set_as(0) tik_instance.vector_dup(64, assist_ub, zero, 1, 1, 8) input_x_ub = tik_instance.Tensor("float32", (32768,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) repeat_time_1 = 255 repeat_time_2 = 32768 // 64 - 255 * 2 - + tik_instance.data_move(input_x_ub[0], input_x[each_block_element * block_index + 0], 0, 1, 
4096, 0, 0) tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], repeat_time_2, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, + 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], + repeat_time_2, 1, 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384], 255, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub[16320], input_x_ub[16320], input_x_ub[32704], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 128, 1, 1, 1, 8, 8, 8) @@ -567,11 +737,14 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, assist_ub, assist_ub, input_x_ub, 1, 1, 1, 1, 8, 8, 8) - - tik_instance.data_move(input_x_ub[0], input_x[each_block_element * block_index + 32768], 0, 1, 4096, 0, 0) + + tik_instance.data_move(input_x_ub[0], input_x[each_block_element * block_index + 32768], 0, 1, 4096, 0, + 0) tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], repeat_time_2, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, + 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], + repeat_time_2, 1, 1, 8, 8) tik_instance.vmax(64, input_x_ub, 
input_x_ub, input_x_ub[16384], 255, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub[16320], input_x_ub[16320], input_x_ub[32704], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 128, 1, 1, 1, 8, 8, 8) @@ -583,7 +756,8 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, assist_ub, assist_ub, input_x_ub, 1, 1, 1, 1, 8, 8, 8) - tik_instance.data_move(input_x_ub[0], input_x[each_block_element * block_index + 65536], 0, 1, 1024, 0, 0) + tik_instance.data_move(input_x_ub[0], input_x[each_block_element * block_index + 65536], 0, 1, 1024, 0, + 0) tik_instance.vabs(64, input_x_ub, input_x_ub, 128, 1, 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) @@ -593,17 +767,23 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, assist_ub, assist_ub, input_x_ub, 1, 1, 1, 1, 8, 8, 8) - - with tik_instance.for_range(0,64) as cc0: + + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(assist_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - 
tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) elif input_x_shape[0] == 64 and input_x_shape[1] == 128: input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) @@ -613,20 +793,23 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: assist_ub = tik_instance.Tensor("float32", (64,), name="assist_ub", scope=tik.scope_ubuf) zero = tik_instance.Scalar("float32") zero.set_as(0) tik_instance.vector_dup(64, assist_ub, zero, 1, 1, 8) input_x_ub = tik_instance.Tensor("float32", (32768,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), 
name="broadcast_0_local_UB", scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) repeat_time_1 = 255 repeat_time_2 = 32768 // 64 - 255 * 2 tik_instance.data_move(input_x_ub[0], input_x[each_block_element * block_index + 0], 0, 1, 4096, 0, 0) tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], repeat_time_2, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, + 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], + repeat_time_2, 1, 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384], 255, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub[16320], input_x_ub[16320], input_x_ub[32704], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 128, 1, 1, 1, 8, 8, 8) @@ -639,10 +822,13 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, assist_ub, assist_ub, input_x_ub, 1, 1, 1, 1, 8, 8, 8) - tik_instance.data_move(input_x_ub[0], input_x[each_block_element * block_index + 32768], 0, 1, 4096, 0, 0) + tik_instance.data_move(input_x_ub[0], input_x[each_block_element * block_index + 32768], 0, 1, 4096, 0, + 0) tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], repeat_time_2, 1, 1, 8, 8) + tik_instance.vabs(64, 
input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, + 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], + repeat_time_2, 1, 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384], 255, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub[16320], input_x_ub[16320], input_x_ub[32704], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 128, 1, 1, 1, 8, 8, 8) @@ -655,16 +841,22 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, assist_ub, assist_ub, input_x_ub, 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(assist_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, 
broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) elif (input_x_shape[0] == 64 and input_x_shape[1] == 32) or (input_x_shape[0] == 32 and input_x_shape[1] == 64): input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) @@ -674,14 +866,18 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) repeat_time_1 = 255 repeat_time_2 = each_block_element // 64 - 255 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_2, 1, 1, 8, 8) + 
tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_2, 1, + 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 128, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) @@ -690,16 +886,22 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, 
broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) elif input_x_shape[0] == 36 and input_x_shape[1] == 4: input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) @@ -709,10 +911,13 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) @@ -722,16 +927,22 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, 
input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) elif input_x_shape[0] == 4 and input_x_shape[1] == 4: input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", 
scope=tik.scope_gm) @@ -741,24 +952,33 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, 
broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) elif input_x_shape[0] == 49 and input_x_shape[1] == 4: input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) @@ -768,10 +988,13 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), 
name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, 24, 1, 1, 8, 8) @@ -781,20 +1004,26 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[1024], input_x_ub[1024], input_x_ub[1024+256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[1024], input_x_ub[1024], input_x_ub[1024+128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[1024], input_x_ub[1024], input_x_ub[1024+64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[1024], input_x_ub[1024], input_x_ub[1024 + 256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[1024], input_x_ub[1024], input_x_ub[1024 + 128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[1024], input_x_ub[1024], input_x_ub[1024 + 64], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, 
broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) elif input_x_shape[0] == 1 and input_x_shape[1] == 64 and input_x_shape[2] == 64: input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) @@ -804,31 +1033,40 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", 
(each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, 
broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - + else: raise RuntimeError("UnSupportedShape") elif len(input_x_shape) == 2 and (input_x_shape[0] == 32 and input_x_shape[1] == 64): input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) - input_x_ub = tik_instance.Tensor("float32", (32*64,), name="input_x_ub", scope=tik.scope_ubuf) + input_x_ub = tik_instance.Tensor("float32", (32 * 64,), name="input_x_ub", scope=tik.scope_ubuf) tik_instance.data_move(input_x_ub, input_x, 0, 1, 256, 0, 0) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) @@ -838,6 +1076,6 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.data_move(res[0], input_x_ub, 0, 1, 1, 0, 0) else: raise RuntimeError("UnSupportedShape") - + tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x], outputs=[res]) return tik_instance diff --git a/mindspore/ops/_op_impl/_custom_op/img2col_impl.py b/mindspore/ops/_op_impl/_custom_op/img2col_impl.py index 0d69240dc4..8c1fd1262f 100644 --- a/mindspore/ops/_op_impl/_custom_op/img2col_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/img2col_impl.py @@ -13,26 +13,27 @@ # limitations under the License. 
# ============================================================================ """CusImg2ColNC1HWC0""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType from te import tik from topi.cce import util -from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType - + cus_img2col_info = TBERegOp("CusImg2Col") \ - .fusion_type("OPAQUE") \ - .async_flag(False) \ - .binfile_name("img2col.so") \ - .compute_cost(10) \ - .kernel_name("CusImg2Col") \ - .partial_flag(True) \ - .attr("ksizes", "required", "listInt", "all") \ - .attr("strides", "required", "listInt", "all") \ - .attr("dilates", "required", "listInt", "all") \ - .attr("mode", "required", "str", "all") \ - .input(0, "x1", False, "required", "all") \ - .output(0, "y", False, "required", "all") \ - .dtype_format(DataType.F16_5HD, DataType.F16_FracNZ) \ - .get_op_info() - + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("img2col.so") \ + .compute_cost(10) \ + .kernel_name("CusImg2Col") \ + .partial_flag(True) \ + .attr("ksizes", "required", "listInt", "all") \ + .attr("strides", "required", "listInt", "all") \ + .attr("dilates", "required", "listInt", "all") \ + .attr("mode", "required", "str", "all") \ + .input(0, "x1", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_5HD, DataType.F16_FracNZ) \ + .get_op_info() + + @op_info_register(cus_img2col_info) def CusImg2Col(input_x, output, ksizes, strides, dilates, mode, kernel_name="img2col"): input_x_shape = input_x.get("shape") @@ -43,7 +44,7 @@ def CusImg2Col(input_x, output, ksizes, strides, dilates, mode, kernel_name="img _, filter_h, filter_w, _ = ksizes _, stride_h, stride_w, _ = strides _, dilation_filter_h, dilation_filter_w, _ = dilates - + input_shape = (tuple(input_x_shape), input_x_dtype, (filter_h, filter_w), (stride_h, stride_w)) supported_shape = [((32, 32, 14, 14, 16), 'float16', (3, 3), (2, 2)), ((32, 1, 224, 224, 16), 'float16', (7, 
7), (2, 2)), @@ -63,102 +64,106 @@ def CusImg2Col(input_x, output, ksizes, strides, dilates, mode, kernel_name="img ((32, 8, 28, 28, 16), 'float16', (1, 1), (1, 1)), ((32, 32, 28, 28, 16), 'float16', (1, 1), (1, 1)), ((32, 16, 14, 14, 16), 'float16', (1, 1), (1, 1)), - ((32, 16, 56, 56, 16), 'float16', (1, 1), (1, 1)),] - - + ((32, 16, 56, 56, 16), 'float16', (1, 1), (1, 1)), ] + if input_shape not in supported_shape: raise RuntimeError("input_shape %s is not supported" % str(input_shape)) - + output_tmp = [N * int(H // stride_h) * int(W // stride_w), filter_h * filter_w * C] output_shape = [output_tmp[1] // 16, output_tmp[0] // 16, 16, 16] if util.get_product_version() == util.VERSION_MINI: tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) else: tik_instance = tik.Tik(tik.Dprofile("v100", "cloud")) - + input_x = tik_instance.Tensor("float16", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float16", output_shape, name="res", scope=tik.scope_gm) - + if input_shape == ((32, 1, 224, 224, 16), 'float16', (7, 7), (2, 2)): - pad = [3,3,3,3] + pad = [3, 3, 3, 3] l1_h = 56 l1_w = 224 c1_index = 0 jump_stride = 1 repeat_mode = 1 - - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (200704,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (53760,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - - tik_instance.data_move(input_1_1_local_L1, input_x[block_index,0,0,0,0], 0, 1, 12544, 0, 0) - with tik_instance.for_range(0,7) as eeb: - with tik_instance.for_range(0,7) as cc0: + + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (200704,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (53760,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + + 
tik_instance.data_move(input_1_1_local_L1, input_x[block_index, 0, 0, 0, 0], 0, 1, 12544, 0, 0) + with tik_instance.for_range(0, 7) as eeb: + with tik_instance.for_range(0, 7) as cc0: temp = eeb % 2 rep = ((55 - temp - (-3 + eeb)) // 2 + 1) * 7 fetch_filter_w = cc0 fetch_filter_h = eeb left_top_w = -3 left_top_h = -3 - + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB, input_1_1_local_L1, - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) - + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0, rep) as cc1: - tik_instance.data_move(res[cc0 + eeb * 7,cc1 + 784 * block_index,0,0], input_1_1_fractal_L1_local_UB[cc1 * 256], 0, 1, 16, 0, 0) - - with tik_instance.for_range(1,3) as eeb0: - tik_instance.data_move(input_1_1_local_L1, input_x[block_index,0,56*eeb0,0,0], 0, 1, 12544, 0, 0) - with tik_instance.for_range(0,7) as eeb: - with tik_instance.for_range(0,7) as cc0: + tik_instance.data_move(res[cc0 + eeb * 7, cc1 + 784 * block_index, 0, 0], + input_1_1_fractal_L1_local_UB[cc1 * 256], 0, 1, 16, 0, 0) + + with tik_instance.for_range(1, 3) as eeb0: + tik_instance.data_move(input_1_1_local_L1, input_x[block_index, 0, 56 * eeb0, 0, 0], 0, 1, 12544, 0, 0) + with tik_instance.for_range(0, 7) as eeb: + with tik_instance.for_range(0, 7) as cc0: temp = eeb % 2 rep_prefix = ((55 - temp - (-3 + eeb)) // 2 + 1) * 7 rep = 196 fetch_filter_w = cc0 fetch_filter_h = eeb left_top_w = -3 - + left_top_h = 1 + ((55 - temp - (-3 + eeb)) // 2 - 29) * 2 - + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB, input_1_1_local_L1, - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - 
left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) with tik_instance.for_range(0, rep) as cc1: - tik_instance.data_move(res[cc0 + eeb * 7,cc1 + rep_prefix + (eeb0 - 1) * rep + 784 * block_index, 0,0], input_1_1_fractal_L1_local_UB[cc1 * 256], 0, 1, 16, 0, 0) - - tik_instance.data_move(input_1_1_local_L1, input_x[block_index,0,56*3,0,0], 0, 1, 12544, 0, 0) - + tik_instance.data_move( + res[cc0 + eeb * 7, cc1 + rep_prefix + (eeb0 - 1) * rep + 784 * block_index, 0, 0], + input_1_1_fractal_L1_local_UB[cc1 * 256], 0, 1, 16, 0, 0) + + tik_instance.data_move(input_1_1_local_L1, input_x[block_index, 0, 56 * 3, 0, 0], 0, 1, 12544, 0, 0) + with tik_instance.for_range(0, 7) as eeb: with tik_instance.for_range(0, 7) as cc0: temp = eeb % 2 @@ -169,37 +174,40 @@ def CusImg2Col(input_x, output, ksizes, strides, dilates, mode, kernel_name="img left_top_w = -3 left_top_h = 1 + ((55 - temp - (-3 + eeb)) // 2 - 29) * 2 tik_instance.load3dv1(input_1_1_fractal_L1_local_UB, input_1_1_local_L1, - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) - + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0, rep) as cc1: - tik_instance.data_move(res[cc0 + eeb * 7,cc1 + rep_prefix + 784 * block_index,0,0], input_1_1_fractal_L1_local_UB[cc1 * 256], 0, 1, 16, 0, 0) - + tik_instance.data_move(res[cc0 + eeb * 7, cc1 + 
rep_prefix + 784 * block_index, 0, 0], + input_1_1_fractal_L1_local_UB[cc1 * 256], 0, 1, 16, 0, 0) + if input_shape == ((32, 4, 56, 56, 16), 'float16', (3, 3), (1, 1)): - pad = [1,1,1,1] + pad = [1, 1, 1, 1] l1_h = 56 l1_w = 56 jump_stride = 1 repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (200704,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (50176,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - tik_instance.data_move(input_1_1_local_L1, input_x[block_index,0,0,0,0], 0, 1, 12544, 0, 0) + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (200704,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (50176,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + tik_instance.data_move(input_1_1_local_L1, input_x[block_index, 0, 0, 0, 0], 0, 1, 12544, 0, 0) with tik_instance.for_range(0, 9) as eeb0: rep = 196 fetch_filter_w = eeb0 % 3 @@ -209,260 +217,288 @@ def CusImg2Col(input_x, output, ksizes, strides, dilates, mode, kernel_name="img with tik_instance.for_range(0, 4) as eeb1: c1_index = eeb1 tik_instance.load3dv1(input_1_1_fractal_L1_local_UB, input_1_1_local_L1, - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) with tik_instance.for_range(0, rep) as i: - tik_instance.data_move(res[eeb1 * 9 + eeb0,i + 196 * block_index,0,0], 
input_1_1_fractal_L1_local_UB[i * 256], 0, 1, 16, 0, 0) + tik_instance.data_move(res[eeb1 * 9 + eeb0, i + 196 * block_index, 0, 0], + input_1_1_fractal_L1_local_UB[i * 256], 0, 1, 16, 0, 0) if input_shape == ((32, 8, 56, 56, 16), 'float16', (3, 3), (2, 2)): - pad = [1,1,1,1] + pad = [1, 1, 1, 1] l1_h = 56 l1_w = 56 jump_stride = 1 repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (401408,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (112896,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - tik_instance.data_move(input_1_1_local_L1, input_x[block_index,0,0,0,0], 0, 1, 25088, 0, 0) - with tik_instance.for_range(0,8) as eeb0: - with tik_instance.for_range(0,9) as eeb1: + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (401408,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (112896,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + tik_instance.data_move(input_1_1_local_L1, input_x[block_index, 0, 0, 0, 0], 0, 1, 25088, 0, 0) + with tik_instance.for_range(0, 8) as eeb0: + with tik_instance.for_range(0, 9) as eeb1: rep = 49 fetch_filter_w = eeb1 % 3 fetch_filter_h = eeb1 // 3 left_top_w = -1 left_top_h = -1 c1_index = eeb0 - tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[49*256*eeb1], input_1_1_local_L1, - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) - with tik_instance.for_range(0,9) as eeb1: - with tik_instance.for_range(0,49) as i: - tik_instance.data_move(res[eeb1 + eeb0 * 9,49 * block_index + i,0,0], input_1_1_fractal_L1_local_UB[i * 256 + eeb1 * 49 * 256], 
0, 1, 16, 0, 0) + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[49 * 256 * eeb1], input_1_1_local_L1, + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0, 9) as eeb1: + with tik_instance.for_range(0, 49) as i: + tik_instance.data_move(res[eeb1 + eeb0 * 9, 49 * block_index + i, 0, 0], + input_1_1_fractal_L1_local_UB[i * 256 + eeb1 * 49 * 256], 0, 1, 16, 0, 0) if input_shape == ((32, 8, 28, 28, 16), 'float16', (3, 3), (1, 1)): - pad = [1,1,1,1] + pad = [1, 1, 1, 1] l1_h = 28 l1_w = 28 jump_stride = 1 repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (100352,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (112896,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - tik_instance.data_move(input_1_1_local_L1, input_x[block_index,0,0,0,0], 0, 1, 6272, 0, 0) - with tik_instance.for_range(0,8) as eeb0: - with tik_instance.for_range(0,9) as eeb1: + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (100352,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (112896,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + tik_instance.data_move(input_1_1_local_L1, input_x[block_index, 0, 0, 0, 0], 0, 1, 6272, 0, 0) + with tik_instance.for_range(0, 8) as eeb0: + with tik_instance.for_range(0, 9) as eeb1: rep = 49 fetch_filter_w = eeb1 % 3 fetch_filter_h = eeb1 // 3 left_top_w = -1 left_top_h = -1 c1_index = eeb0 - tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[49*256*eeb1], input_1_1_local_L1, - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - 
fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) - with tik_instance.for_range(0,9) as eeb1: - with tik_instance.for_range(0,49) as i: - tik_instance.data_move(res[eeb1 + eeb0 * 9,49 * block_index + i,0,0], input_1_1_fractal_L1_local_UB[i * 256 + eeb1 * 49 * 256], 0, 1, 16, 0, 0) + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[49 * 256 * eeb1], input_1_1_local_L1, + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0, 9) as eeb1: + with tik_instance.for_range(0, 49) as i: + tik_instance.data_move(res[eeb1 + eeb0 * 9, 49 * block_index + i, 0, 0], + input_1_1_fractal_L1_local_UB[i * 256 + eeb1 * 49 * 256], 0, 1, 16, 0, 0) if input_shape == ((32, 16, 28, 28, 16), 'float16', (3, 3), (2, 2)): - pad = [1,1,1,1] + pad = [1, 1, 1, 1] l1_h = 28 l1_w = 28 jump_stride = 1 repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: + with tik_instance.for_range(0, 32, block_num=32) as block_index: eeb0 = block_index % 2 eeb1 = block_index // 2 - input_1_1_local_L1 = tik_instance.Tensor("float16", (200704,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (53248,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (50176,), scope=tik.scope_ubuf, name = "input_1_2_fractal_L1_local_UB") - with tik_instance.for_range(0,16) as i: - tik_instance.data_move(input_1_1_local_L1[i * 12544], input_x[i + 16 * eeb0,eeb1,0,0,0], 0, 1, 784, 0, 0) + input_1_1_local_L1 = tik_instance.Tensor("float16", (200704,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = 
tik_instance.Tensor("float16", (53248,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (50176,), scope=tik.scope_ubuf, + name="input_1_2_fractal_L1_local_UB") + with tik_instance.for_range(0, 16) as i: + tik_instance.data_move(input_1_1_local_L1[i * 12544], input_x[i + 16 * eeb0, eeb1, 0, 0, 0], 0, 1, 784, + 0, 0) - with tik_instance.for_range(0,9) as eeb3: + with tik_instance.for_range(0, 9) as eeb3: rep = 13 fetch_filter_w = eeb3 % 3 fetch_filter_h = eeb3 // 3 left_top_w = -1 left_top_h = -1 c1_index = 0 - with tik_instance.for_range(0,16) as i: + with tik_instance.for_range(0, 16) as i: tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[3328 * i], input_1_1_local_L1[12544 * i], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) - with tik_instance.for_range(0,16) as i: - tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], input_1_1_fractal_L1_local_UB[i * 3328], 0, 1, 196,0,0) + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0, 16) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], + input_1_1_fractal_L1_local_UB[i * 3328], 0, 1, 196, 0, 0) with tik_instance.for_range(196 * eeb0, 196 * (eeb0 + 1)) as i: - tik_instance.data_move(res[eeb1 * 9 + eeb3,i,0,0], input_1_2_fractal_L1_local_UB[256 * (i - 196 * eeb0)],0,1,16,0,0) + tik_instance.data_move(res[eeb1 * 9 + eeb3, i, 0, 0], + input_1_2_fractal_L1_local_UB[256 * (i - 196 * eeb0)], 0, 1, 16, 0, 0) - if input_shape == ((32, 16, 14, 14, 16), 'float16', (3, 3), (1, 1)): - pad = [1,1,1,1] + if input_shape == ((32, 16, 14, 
14, 16), 'float16', (3, 3), (1, 1)): + pad = [1, 1, 1, 1] l1_h = 14 l1_w = 14 jump_stride = 1 repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: + with tik_instance.for_range(0, 32, block_num=32) as block_index: eeb0 = block_index % 2 eeb1 = block_index // 2 - input_1_1_local_L1 = tik_instance.Tensor("float16", (50176,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (53248,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (50176,), scope=tik.scope_ubuf, name = "input_1_2_fractal_L1_local_UB") - with tik_instance.for_range(0,16) as i: - tik_instance.data_move(input_1_1_local_L1[i * 3136], input_x[i + 16 * eeb0,eeb1,0,0,0], 0, 1, 196, 0, 0) + input_1_1_local_L1 = tik_instance.Tensor("float16", (50176,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (53248,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (50176,), scope=tik.scope_ubuf, + name="input_1_2_fractal_L1_local_UB") + with tik_instance.for_range(0, 16) as i: + tik_instance.data_move(input_1_1_local_L1[i * 3136], input_x[i + 16 * eeb0, eeb1, 0, 0, 0], 0, 1, 196, + 0, 0) - with tik_instance.for_range(0,9) as eeb3: + with tik_instance.for_range(0, 9) as eeb3: rep = 13 fetch_filter_w = eeb3 % 3 fetch_filter_h = eeb3 // 3 left_top_w = -1 left_top_h = -1 c1_index = 0 - with tik_instance.for_range(0,16) as i: + with tik_instance.for_range(0, 16) as i: tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[3328 * i], input_1_1_local_L1[3136 * i], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) - with 
tik_instance.for_range(0,16) as i: - tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], input_1_1_fractal_L1_local_UB[i * 3328], 0, 1, 196,0,0) + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0, 16) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], + input_1_1_fractal_L1_local_UB[i * 3328], 0, 1, 196, 0, 0) with tik_instance.for_range(196 * eeb0, 196 * (eeb0 + 1)) as i: - tik_instance.data_move(res[eeb1 * 9 + eeb3,i,0,0], input_1_2_fractal_L1_local_UB[256 * (i - 196 * eeb0)],0,1,16,0,0) + tik_instance.data_move(res[eeb1 * 9 + eeb3, i, 0, 0], + input_1_2_fractal_L1_local_UB[256 * (i - 196 * eeb0)], 0, 1, 16, 0, 0) if input_shape == ((32, 32, 14, 14, 16), 'float16', (3, 3), (2, 2)): - pad = [1,1,1,1] + pad = [1, 1, 1, 1] l1_h = 14 l1_w = 14 jump_stride = 1 repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (100352,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (32768,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (25088,), scope=tik.scope_ubuf, name = "input_1_2_fractal_L1_local_UB") - with tik_instance.for_range(0,32) as i: - tik_instance.data_move(input_1_1_local_L1[i * 3136], input_x[i,block_index,0,0,0], 0, 1, 196, 0, 0) - with tik_instance.for_range(0,9) as eeb: + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (100352,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (32768,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + 
input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (25088,), scope=tik.scope_ubuf, + name="input_1_2_fractal_L1_local_UB") + with tik_instance.for_range(0, 32) as i: + tik_instance.data_move(input_1_1_local_L1[i * 3136], input_x[i, block_index, 0, 0, 0], 0, 1, 196, 0, 0) + with tik_instance.for_range(0, 9) as eeb: rep = 4 - fetch_filter_w = eeb % 3 + fetch_filter_w = eeb % 3 fetch_filter_h = eeb // 3 left_top_w = -1 left_top_h = -1 c1_index = 0 - with tik_instance.for_range(0,32) as i: + with tik_instance.for_range(0, 32) as i: tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[1024 * i], input_1_1_local_L1[3136 * i], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) - with tik_instance.for_range(0,32) as i: - tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 49 * 16], input_1_1_fractal_L1_local_UB[i*1024],0,1,49,0,0) + with tik_instance.for_range(0, 32) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 49 * 16], + input_1_1_fractal_L1_local_UB[i * 1024], 0, 1, 49, 0, 0) - with tik_instance.for_range(0,98) as i: - tik_instance.data_move(res[eeb + block_index * 9, i, 0, 0], input_1_2_fractal_L1_local_UB[256*i], 0,1,16,0,0) + with tik_instance.for_range(0, 98) as i: + tik_instance.data_move(res[eeb + block_index * 9, i, 0, 0], input_1_2_fractal_L1_local_UB[256 * i], + 0, 1, 16, 0, 0) if input_shape == ((32, 64, 14, 14, 16), 'float16', (1, 1), (2, 2)): - pad = [0,0,0,0] + pad = [0, 0, 0, 0] l1_h = 14 l1_w = 14 jump_stride = 1 repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = 
tik_instance.Tensor("float16", (100352,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (32768,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (25088,), scope=tik.scope_ubuf, name = "input_1_2_fractal_L1_local_UB") + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (100352,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (32768,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (25088,), scope=tik.scope_ubuf, + name="input_1_2_fractal_L1_local_UB") - with tik_instance.for_range(0,2) as eeb0: - with tik_instance.for_range(0,32) as i: - tik_instance.data_move(input_1_1_local_L1[i * 3136], input_x[i,block_index * 2 + eeb0,0,0,0], 0, 1, 196, 0, 0) - with tik_instance.for_range(0,32) as i: + with tik_instance.for_range(0, 2) as eeb0: + with tik_instance.for_range(0, 32) as i: + tik_instance.data_move(input_1_1_local_L1[i * 3136], input_x[i, block_index * 2 + eeb0, 0, 0, 0], 0, + 1, 196, 0, 0) + with tik_instance.for_range(0, 32) as i: rep = 4 fetch_filter_w = 0 fetch_filter_h = 0 @@ -470,41 +506,46 @@ def CusImg2Col(input_x, output, ksizes, strides, dilates, mode, kernel_name="img left_top_h = 0 c1_index = 0 tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[1024 * i], input_1_1_local_L1[3136 * i], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) - with tik_instance.for_range(0,32) as i: - tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 49 * 16], input_1_1_fractal_L1_local_UB[i*1024],0,1,49,0,0) + pad, + l1_h, + l1_w, + 
c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0, 32) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 49 * 16], + input_1_1_fractal_L1_local_UB[i * 1024], 0, 1, 49, 0, 0) - with tik_instance.for_range(0,98) as i: - tik_instance.data_move(res[eeb0 + block_index * 2, i, 0, 0], input_1_2_fractal_L1_local_UB[256*i], 0,1,16,0,0) + with tik_instance.for_range(0, 98) as i: + tik_instance.data_move(res[eeb0 + block_index * 2, i, 0, 0], input_1_2_fractal_L1_local_UB[256 * i], + 0, 1, 16, 0, 0) if input_shape == ((32, 32, 7, 7, 16), 'float16', (3, 3), (1, 1)): - pad = [1,1,1,1] + pad = [1, 1, 1, 1] l1_h = 7 l1_w = 7 jump_stride = 1 repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (25088,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (32768,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (25088,), scope=tik.scope_ubuf, name = "input_1_2_fractal_L1_local_UB") - with tik_instance.for_range(0,32) as i: - tik_instance.data_move(input_1_1_local_L1[i * 784], input_x[i,block_index,0,0,0], 0, 1, 49, 0, 0) + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (25088,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (32768,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (25088,), scope=tik.scope_ubuf, + name="input_1_2_fractal_L1_local_UB") + with tik_instance.for_range(0, 32) as i: + tik_instance.data_move(input_1_1_local_L1[i * 784], 
input_x[i, block_index, 0, 0, 0], 0, 1, 49, 0, 0) with tik_instance.for_range(0, 9) as eeb: rep = 4 @@ -513,92 +554,106 @@ def CusImg2Col(input_x, output, ksizes, strides, dilates, mode, kernel_name="img left_top_w = -1 left_top_h = -1 c1_index = 0 - with tik_instance.for_range(0,32) as i: + with tik_instance.for_range(0, 32) as i: tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[1024 * i], input_1_1_local_L1[784 * i], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) - with tik_instance.for_range(0,32) as i: - tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 49 * 16], input_1_1_fractal_L1_local_UB[i*1024],0,1,49,0,0) + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0, 32) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 49 * 16], + input_1_1_fractal_L1_local_UB[i * 1024], 0, 1, 49, 0, 0) - with tik_instance.for_range(0,98) as i: - tik_instance.data_move(res[eeb + block_index * 9, i, 0, 0], input_1_2_fractal_L1_local_UB[256*i], 0,1,16,0,0) + with tik_instance.for_range(0, 98) as i: + tik_instance.data_move(res[eeb + block_index * 9, i, 0, 0], input_1_2_fractal_L1_local_UB[256 * i], + 0, 1, 16, 0, 0) if input_shape == ((32, 128, 7, 7, 16), 'float16', (1, 1), (1, 1)): - pad = [0,0,0,0] + pad = [0, 0, 0, 0] l1_h = 7 l1_w = 7 jump_stride = 1 repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (25088,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (32768,), scope=tik.scope_ubuf, name = 
"input_1_1_fractal_L1_local_UB") - input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (25088,), scope=tik.scope_ubuf, name = "input_1_2_fractal_L1_local_UB") - with tik_instance.for_range(0,4) as eeb0: - with tik_instance.for_range(0,32) as i: - tik_instance.data_move(input_1_1_local_L1[i * 784], input_x[i,eeb0 + block_index * 4,0,0,0], 0, 1, 49, 0, 0) - with tik_instance.for_range(0,32) as i: + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (25088,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (32768,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (25088,), scope=tik.scope_ubuf, + name="input_1_2_fractal_L1_local_UB") + with tik_instance.for_range(0, 4) as eeb0: + with tik_instance.for_range(0, 32) as i: + tik_instance.data_move(input_1_1_local_L1[i * 784], input_x[i, eeb0 + block_index * 4, 0, 0, 0], 0, + 1, 49, 0, 0) + with tik_instance.for_range(0, 32) as i: rep = 4 fetch_filter_w = 0 fetch_filter_h = 0 left_top_w = 0 left_top_h = 0 c1_index = 0 - with tik_instance.for_range(0,32) as i: + with tik_instance.for_range(0, 32) as i: tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[1024 * i], input_1_1_local_L1[784 * i], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) - with tik_instance.for_range(0,32) as i: - tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 49 * 16], input_1_1_fractal_L1_local_UB[i*1024],0,1,49,0,0) + with 
tik_instance.for_range(0, 32) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 49 * 16], + input_1_1_fractal_L1_local_UB[i * 1024], 0, 1, 49, 0, 0) - with tik_instance.for_range(0,98) as i: - tik_instance.data_move(res[eeb0 + block_index * 4, i, 0, 0], input_1_2_fractal_L1_local_UB[256*i], 0,1,16,0,0) + with tik_instance.for_range(0, 98) as i: + tik_instance.data_move(res[eeb0 + block_index * 4, i, 0, 0], input_1_2_fractal_L1_local_UB[256 * i], + 0, 1, 16, 0, 0) if input_shape == ((32, 64, 14, 14, 16), 'float16', (1, 1), (1, 1)): - pad = [0,0,0,0] + pad = [0, 0, 0, 0] l1_h = 14 l1_w = 14 jump_stride = 1 repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (100352,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_2_local_L1 = tik_instance.Tensor("float16", (100352,), scope=tik.scope_cbuf, name = "input_1_2_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (53248,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (50176,), scope=tik.scope_ubuf, name = "input_1_2_fractal_L1_local_UB") - with tik_instance.for_range(0,32) as i: - tik_instance.data_move(input_1_1_local_L1[i * 3136], input_x[i,block_index * 2,0,0,0], 0, 1, 196, 0, 0) - tik_instance.data_move(input_1_2_local_L1[i * 3136], input_x[i,block_index * 2 + 1,0,0,0], 0, 1, 196, 0, 0) - with tik_instance.for_range(0,2) as eeb1: + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (100352,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_2_local_L1 = tik_instance.Tensor("float16", (100352,), scope=tik.scope_cbuf, + name="input_1_2_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (53248,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + input_1_2_fractal_L1_local_UB = 
tik_instance.Tensor("float16", (50176,), scope=tik.scope_ubuf, + name="input_1_2_fractal_L1_local_UB") + with tik_instance.for_range(0, 32) as i: + tik_instance.data_move(input_1_1_local_L1[i * 3136], input_x[i, block_index * 2, 0, 0, 0], 0, 1, 196, 0, + 0) + tik_instance.data_move(input_1_2_local_L1[i * 3136], input_x[i, block_index * 2 + 1, 0, 0, 0], 0, 1, + 196, 0, 0) + with tik_instance.for_range(0, 2) as eeb1: with tik_instance.for_range(eeb1 * 16, (eeb1 + 1) * 16) as i: rep = 13 fetch_filter_w = 0 @@ -606,30 +661,33 @@ def CusImg2Col(input_x, output, ksizes, strides, dilates, mode, kernel_name="img left_top_w = 0 left_top_h = 0 c1_index = 0 - tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[3328 * (i - eeb1 * 16)], input_1_1_local_L1[3136 * i], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) - with tik_instance.for_range(0,16) as i: - tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], input_1_1_fractal_L1_local_UB[i*3328],0,1,196,0,0) + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[3328 * (i - eeb1 * 16)], + input_1_1_local_L1[3136 * i], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0, 16) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], + input_1_1_fractal_L1_local_UB[i * 3328], 0, 1, 196, 0, 0) with tik_instance.for_range(eeb1 * 196, (eeb1 + 1) * 196) as i: - tik_instance.data_move(res[block_index * 2, i, 0, 0], input_1_2_fractal_L1_local_UB[256*(i - eeb1 * 196)], 0,1,16,0,0) - - with tik_instance.for_range(0,2) as eeb1: + tik_instance.data_move(res[block_index * 2, i, 0, 0], + input_1_2_fractal_L1_local_UB[256 * (i - eeb1 * 
196)], 0, 1, 16, 0, 0) + + with tik_instance.for_range(0, 2) as eeb1: with tik_instance.for_range(eeb1 * 16, (eeb1 + 1) * 16) as i: rep = 13 fetch_filter_w = 0 @@ -637,42 +695,48 @@ def CusImg2Col(input_x, output, ksizes, strides, dilates, mode, kernel_name="img left_top_w = 0 left_top_h = 0 c1_index = 0 - tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[3328 * (i - eeb1 * 16)], input_1_2_local_L1[3136 * i], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) - with tik_instance.for_range(0,16) as i: - tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], input_1_1_fractal_L1_local_UB[i*3328],0,1,196,0,0) + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[3328 * (i - eeb1 * 16)], + input_1_2_local_L1[3136 * i], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0, 16) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], + input_1_1_fractal_L1_local_UB[i * 3328], 0, 1, 196, 0, 0) with tik_instance.for_range(eeb1 * 196, (eeb1 + 1) * 196) as i: - tik_instance.data_move(res[block_index * 2 + 1, i, 0, 0], input_1_2_fractal_L1_local_UB[256*(i - eeb1 * 196)], 0,1,16,0,0) - + tik_instance.data_move(res[block_index * 2 + 1, i, 0, 0], + input_1_2_fractal_L1_local_UB[256 * (i - eeb1 * 196)], 0, 1, 16, 0, 0) + if input_shape == ((32, 32, 28, 28, 16), 'float16', (1, 1), (2, 2)): - pad = [0,0,0,0] + pad = [0, 0, 0, 0] l1_h = 28 l1_w = 28 jump_stride = 1 repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (401408,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - 
input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (53248,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (50176,), scope=tik.scope_ubuf, name = "input_1_2_fractal_L1_local_UB") - with tik_instance.for_range(0,32) as i: - tik_instance.data_move(input_1_1_local_L1[i * 12544], input_x[i,block_index,0,0,0], 0, 1, 784, 0, 0) - with tik_instance.for_range(0,16) as i: + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (401408,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (53248,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (50176,), scope=tik.scope_ubuf, + name="input_1_2_fractal_L1_local_UB") + with tik_instance.for_range(0, 32) as i: + tik_instance.data_move(input_1_1_local_L1[i * 12544], input_x[i, block_index, 0, 0, 0], 0, 1, 784, 0, 0) + with tik_instance.for_range(0, 16) as i: rep = 13 fetch_filter_w = 0 fetch_filter_h = 0 @@ -680,29 +744,31 @@ def CusImg2Col(input_x, output, ksizes, strides, dilates, mode, kernel_name="img left_top_h = 0 c1_index = 0 tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[3328 * i], input_1_1_local_L1[12544 * i], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) - with tik_instance.for_range(0,16) as i: - tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], input_1_1_fractal_L1_local_UB[i*3328],0,1,196,0,0) - with tik_instance.for_range(0,196) as i: - tik_instance.data_move(res[block_index, i, 0, 0], input_1_2_fractal_L1_local_UB[256*i], 0,1,16,0,0) - - with tik_instance.for_range(16,32) as i: + pad, + l1_h, + l1_w, + c1_index, + 
fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0, 16) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], + input_1_1_fractal_L1_local_UB[i * 3328], 0, 1, 196, 0, 0) + with tik_instance.for_range(0, 196) as i: + tik_instance.data_move(res[block_index, i, 0, 0], input_1_2_fractal_L1_local_UB[256 * i], 0, 1, 16, 0, + 0) + + with tik_instance.for_range(16, 32) as i: rep = 13 fetch_filter_w = 0 fetch_filter_h = 0 @@ -710,342 +776,375 @@ def CusImg2Col(input_x, output, ksizes, strides, dilates, mode, kernel_name="img left_top_h = 0 c1_index = 0 tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[3328 * (i - 16)], input_1_1_local_L1[12544 * i], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) - with tik_instance.for_range(0,16) as i: - tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], input_1_1_fractal_L1_local_UB[i*3328],0,1,196,0,0) - with tik_instance.for_range(196,392) as i: - tik_instance.data_move(res[block_index, i, 0, 0], input_1_2_fractal_L1_local_UB[256*(i-196)], 0,1,16,0,0) - + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0, 16) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], + input_1_1_fractal_L1_local_UB[i * 3328], 0, 1, 196, 0, 0) + with tik_instance.for_range(196, 392) as i: + tik_instance.data_move(res[block_index, i, 0, 0], input_1_2_fractal_L1_local_UB[256 * (i - 196)], 0, 1, + 16, 0, 0) + if input_shape == ((32, 32, 7, 7, 16), 
'float16', (1, 1), (1, 1)): - if padding == 'SAME': - padding_left = 0 - padding_right = 0 - padding_top = 0 - padding_bottom = 0 - pad = [padding_left, padding_right, padding_top, padding_bottom] - l1_h = 7 - l1_w = 7 - c1_index = 0 - jump_stride = 1 - repeat_mode = 1 - with tik_instance.for_range(0,32, block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (25088, ), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (32768,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (25088, ), scope=tik.scope_ubuf, name = "input_1_2_fractal_L1_local_UB") - - with tik_instance.for_range(0,32) as i: - tik_instance.data_move(input_1_1_local_L1[i * 784], input_x[i, block_index, 0, 0, 0], 0, 1, 49, 0, 0) - - with tik_instance.for_range(0,32) as i: - fetch_filter_w = 0 - fetch_filter_h = 0 - left_top_h = 0 - left_top_w = 0 - tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[1024 * i], input_1_1_local_L1[784 * i], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - 4) - - with tik_instance.for_range(0,32) as i: - tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 49 * 16], input_1_1_fractal_L1_local_UB[i * 1024], 0, 1, 49, 0, 0) - with tik_instance.for_range(0,98) as i: - tik_instance.data_move(res[block_index, i, 0, 0], input_1_2_fractal_L1_local_UB[i * 256], 0, 1, 16, 0, 0) - + if padding == 'SAME': + padding_left = 0 + padding_right = 0 + padding_top = 0 + padding_bottom = 0 + pad = [padding_left, padding_right, padding_top, padding_bottom] + l1_h = 7 + l1_w = 7 + c1_index = 0 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", 
(25088,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (32768,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (25088,), scope=tik.scope_ubuf, + name="input_1_2_fractal_L1_local_UB") + + with tik_instance.for_range(0, 32) as i: + tik_instance.data_move(input_1_1_local_L1[i * 784], input_x[i, block_index, 0, 0, 0], 0, 1, 49, 0, 0) + + with tik_instance.for_range(0, 32) as i: + fetch_filter_w = 0 + fetch_filter_h = 0 + left_top_h = 0 + left_top_w = 0 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[1024 * i], input_1_1_local_L1[784 * i], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + 4) + + with tik_instance.for_range(0, 32) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 49 * 16], + input_1_1_fractal_L1_local_UB[i * 1024], 0, 1, 49, 0, 0) + with tik_instance.for_range(0, 98) as i: + tik_instance.data_move(res[block_index, i, 0, 0], input_1_2_fractal_L1_local_UB[i * 256], 0, 1, 16, 0, + 0) + if input_shape == ((32, 4, 56, 56, 16), 'float16', (1, 1), (1, 1)): - if padding == 'SAME': - padding_left = 0 - padding_right = 0 - padding_top = 0 - padding_bottom = 0 - pad = [padding_left, padding_right, padding_top, padding_bottom] - l1_h = 56 - l1_w = 56 - c1_index = 0 - jump_stride = 1 - repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (12544 * 32 // 2, ), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (100352 // 2,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - tik_instance.data_move(input_1_1_local_L1, input_x[block_index,0,0,0,0], 0, 1, 12544, 0, 0) - with 
tik_instance.for_range(0, 4) as eeb: - fetch_filter_w = 0 - fetch_filter_h = 0 - left_top_h = 0 - left_top_w = 0 - tik_instance.load3dv1(input_1_1_fractal_L1_local_UB, input_1_1_local_L1[eeb * 56 * 56 * 16], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - 196) - with tik_instance.for_range(0, 196) as rep: - tik_instance.data_move(res[eeb, rep + block_index * 196, 0, 0], input_1_1_fractal_L1_local_UB[rep * 256], 0, 1, 16, 0, 0) + if padding == 'SAME': + padding_left = 0 + padding_right = 0 + padding_top = 0 + padding_bottom = 0 + pad = [padding_left, padding_right, padding_top, padding_bottom] + l1_h = 56 + l1_w = 56 + c1_index = 0 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (12544 * 32 // 2,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (100352 // 2,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + tik_instance.data_move(input_1_1_local_L1, input_x[block_index, 0, 0, 0, 0], 0, 1, 12544, 0, 0) + with tik_instance.for_range(0, 4) as eeb: + fetch_filter_w = 0 + fetch_filter_h = 0 + left_top_h = 0 + left_top_w = 0 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB, input_1_1_local_L1[eeb * 56 * 56 * 16], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + 196) + with tik_instance.for_range(0, 196) as rep: + tik_instance.data_move(res[eeb, rep + block_index * 196, 0, 0], + input_1_1_fractal_L1_local_UB[rep * 256], 0, 1, 16, 0, 0) if input_shape == ((32, 8, 28, 28, 16), 'float16', (1, 1), (1, 1)): - if padding == 'SAME': - 
padding_left = 0 - padding_right = 0 - padding_top = 0 - padding_bottom = 0 - pad = [padding_left, padding_right, padding_top, padding_bottom] - l1_h = 28 - l1_w = 28 - c1_index = 0 - jump_stride = 1 - repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (6272 * 32 // 2, ), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (49 * 256 * 8,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - tik_instance.data_move(input_1_1_local_L1, input_x[block_index,0,0,0,0], 0, 1, 6272, 0, 0) - with tik_instance.for_range(0, 1) as eeb0: - with tik_instance.for_range(0, 8) as eeb1: - fetch_filter_w = 0 - fetch_filter_h = 0 - left_top_h = 0 - left_top_w = 0 - tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[eeb1 * 49 * 256], input_1_1_local_L1[(eeb1 + eeb0 * 8) * 28 * 28 * 16], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - 49) - with tik_instance.for_range(0, 8) as eeb1: - with tik_instance.for_range(0, 49) as i: - tik_instance.data_move(res[eeb0 * 8 + eeb1, i + block_index * 49, 0, 0], input_1_1_fractal_L1_local_UB[i * 256 + eeb1 * 49 * 256], 0, 1, 16, 0, 0) + if padding == 'SAME': + padding_left = 0 + padding_right = 0 + padding_top = 0 + padding_bottom = 0 + pad = [padding_left, padding_right, padding_top, padding_bottom] + l1_h = 28 + l1_w = 28 + c1_index = 0 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (6272 * 32 // 2,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (49 * 256 * 8,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + 
tik_instance.data_move(input_1_1_local_L1, input_x[block_index, 0, 0, 0, 0], 0, 1, 6272, 0, 0) + with tik_instance.for_range(0, 1) as eeb0: + with tik_instance.for_range(0, 8) as eeb1: + fetch_filter_w = 0 + fetch_filter_h = 0 + left_top_h = 0 + left_top_w = 0 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[eeb1 * 49 * 256], + input_1_1_local_L1[(eeb1 + eeb0 * 8) * 28 * 28 * 16], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + 49) + with tik_instance.for_range(0, 8) as eeb1: + with tik_instance.for_range(0, 49) as i: + tik_instance.data_move(res[eeb0 * 8 + eeb1, i + block_index * 49, 0, 0], + input_1_1_fractal_L1_local_UB[i * 256 + eeb1 * 49 * 256], 0, 1, 16, 0, 0) if input_shape == ((32, 32, 28, 28, 16), 'float16', (1, 1), (1, 1)): - if padding == 'SAME': - padding_left = 0 - padding_right = 0 - padding_top = 0 - padding_bottom = 0 - pad = [padding_left, padding_right, padding_top, padding_bottom] - l1_h = 28 - l1_w = 28 - c1_index = 0 - jump_stride = 1 - repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (25088 * 32 // 2, ), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (49 * 256 * 8,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - tik_instance.data_move(input_1_1_local_L1, input_x[block_index,0,0,0,0], 0, 1, 25088, 0, 0) - with tik_instance.for_range(0, 4) as eeb0: - with tik_instance.for_range(0, 8) as eeb1: - fetch_filter_w = 0 - fetch_filter_h = 0 - left_top_h = 0 - left_top_w = 0 - tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[eeb1 * 49 * 256], input_1_1_local_L1[(eeb1 + eeb0 * 8) * 28 * 28 * 16], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - 
stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - 49) - with tik_instance.for_range(0, 8) as eeb1: - with tik_instance.for_range(0, 49) as i: - tik_instance.data_move(res[eeb0 * 8 + eeb1, i + block_index * 49, 0, 0], input_1_1_fractal_L1_local_UB[i * 256 + eeb1 * 49 * 256], 0, 1, 16, 0, 0) + if padding == 'SAME': + padding_left = 0 + padding_right = 0 + padding_top = 0 + padding_bottom = 0 + pad = [padding_left, padding_right, padding_top, padding_bottom] + l1_h = 28 + l1_w = 28 + c1_index = 0 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (25088 * 32 // 2,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (49 * 256 * 8,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + tik_instance.data_move(input_1_1_local_L1, input_x[block_index, 0, 0, 0, 0], 0, 1, 25088, 0, 0) + with tik_instance.for_range(0, 4) as eeb0: + with tik_instance.for_range(0, 8) as eeb1: + fetch_filter_w = 0 + fetch_filter_h = 0 + left_top_h = 0 + left_top_w = 0 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[eeb1 * 49 * 256], + input_1_1_local_L1[(eeb1 + eeb0 * 8) * 28 * 28 * 16], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + 49) + with tik_instance.for_range(0, 8) as eeb1: + with tik_instance.for_range(0, 49) as i: + tik_instance.data_move(res[eeb0 * 8 + eeb1, i + block_index * 49, 0, 0], + input_1_1_fractal_L1_local_UB[i * 256 + eeb1 * 49 * 256], 0, 1, 16, 0, 0) if input_shape == ((32, 16, 14, 14, 16), 'float16', (1, 1), (1, 1)): - if padding == 'SAME': - padding_left = 0 - padding_right = 0 - padding_top = 0 - padding_bottom = 0 - pad = [padding_left, padding_right, 
padding_top, padding_bottom] - l1_h = 14 - l1_w = 14 - c1_index = 0 - jump_stride = 1 - repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - eeb0 = block_index % 2 - eeb1 = block_index // 2 - input_1_1_local_L1 = tik_instance.Tensor("float16", (196 * 32 * 16,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (106496 // 2,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (196 * 16 * 16,), scope=tik.scope_ubuf, name = "input_1_2_fractal_L1_local_UB") - with tik_instance.for_range(0,32) as i: - tik_instance.data_move(input_1_1_local_L1[i * 3136], input_x[i,eeb1,0,0,0], 0, 1, 196, 0, 0) - with tik_instance.for_range(0,16) as i: - fetch_filter_w = 0 - fetch_filter_h = 0 - left_top_h = 0 - left_top_w = 0 - tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[i * 3328], input_1_1_local_L1[i * 3136 + eeb0 * 16 * 3136], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - 13) - with tik_instance.for_range(0,16) as i: - tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], input_1_1_fractal_L1_local_UB[i * 3328], 0, 1, 196,0,0) - with tik_instance.for_range(0, 196) as i: - tik_instance.data_move(res[eeb1, i + 196 * eeb0, 0, 0],input_1_2_fractal_L1_local_UB[256 * i], 0, 1, 16, 0, 0) + if padding == 'SAME': + padding_left = 0 + padding_right = 0 + padding_top = 0 + padding_bottom = 0 + pad = [padding_left, padding_right, padding_top, padding_bottom] + l1_h = 14 + l1_w = 14 + c1_index = 0 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0, 32, block_num=32) as block_index: + eeb0 = block_index % 2 + eeb1 = block_index // 2 + input_1_1_local_L1 = tik_instance.Tensor("float16", (196 * 32 * 16,), 
scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (106496 // 2,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (196 * 16 * 16,), scope=tik.scope_ubuf, + name="input_1_2_fractal_L1_local_UB") + with tik_instance.for_range(0, 32) as i: + tik_instance.data_move(input_1_1_local_L1[i * 3136], input_x[i, eeb1, 0, 0, 0], 0, 1, 196, 0, 0) + with tik_instance.for_range(0, 16) as i: + fetch_filter_w = 0 + fetch_filter_h = 0 + left_top_h = 0 + left_top_w = 0 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[i * 3328], + input_1_1_local_L1[i * 3136 + eeb0 * 16 * 3136], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + 13) + with tik_instance.for_range(0, 16) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], + input_1_1_fractal_L1_local_UB[i * 3328], 0, 1, 196, 0, 0) + with tik_instance.for_range(0, 196) as i: + tik_instance.data_move(res[eeb1, i + 196 * eeb0, 0, 0], input_1_2_fractal_L1_local_UB[256 * i], 0, 1, + 16, 0, 0) if input_shape == ((32, 16, 56, 56, 16), 'float16', (1, 1), (1, 1)): - if padding == 'SAME': - padding_left = 0 - padding_right = 0 - padding_top = 0 - padding_bottom = 0 - pad = [padding_left, padding_right, padding_top, padding_bottom] - l1_h = 56 - l1_w = 56 - c1_index = 0 - jump_stride = 1 - repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (25088 * 32 // 2,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (196 * 256 * 2,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - with tik_instance.for_range(0,2) as eeb0: - 
tik_instance.data_move(input_1_1_local_L1, input_x[block_index,eeb0 * 8,0,0,0], 0, 1, 25088, 0, 0) - with tik_instance.for_range(0,4) as eeb1: - with tik_instance.for_range(0,2) as eeb2: - fetch_filter_w = 0 - fetch_filter_h = 0 - left_top_h = 0 - left_top_w = 0 - tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[eeb2 * 196 * 256], input_1_1_local_L1[(eeb2 + eeb1 * 2) * 56 * 56 * 16], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - 196) - with tik_instance.for_range(0,2) as eeb2: - with tik_instance.for_range(0,196) as i: - tik_instance.data_move(res[eeb0 * 8 + eeb1 * 2 + eeb2, i + block_index * 196, 0, 0],input_1_1_fractal_L1_local_UB[256 * i + eeb2 * 196 * 256], 0, 1, 16, 0, 0) + if padding == 'SAME': + padding_left = 0 + padding_right = 0 + padding_top = 0 + padding_bottom = 0 + pad = [padding_left, padding_right, padding_top, padding_bottom] + l1_h = 56 + l1_w = 56 + c1_index = 0 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (25088 * 32 // 2,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (196 * 256 * 2,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + with tik_instance.for_range(0, 2) as eeb0: + tik_instance.data_move(input_1_1_local_L1, input_x[block_index, eeb0 * 8, 0, 0, 0], 0, 1, 25088, 0, 0) + with tik_instance.for_range(0, 4) as eeb1: + with tik_instance.for_range(0, 2) as eeb2: + fetch_filter_w = 0 + fetch_filter_h = 0 + left_top_h = 0 + left_top_w = 0 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[eeb2 * 196 * 256], + input_1_1_local_L1[(eeb2 + eeb1 * 2) * 56 * 56 * 16], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + 
stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + 196) + with tik_instance.for_range(0, 2) as eeb2: + with tik_instance.for_range(0, 196) as i: + tik_instance.data_move(res[eeb0 * 8 + eeb1 * 2 + eeb2, i + block_index * 196, 0, 0], + input_1_1_fractal_L1_local_UB[256 * i + eeb2 * 196 * 256], 0, 1, 16, + 0, 0) if input_shape == ((32, 16, 56, 56, 16), 'float16', (1, 1), (2, 2)): - if padding == 'SAME': - padding_left = 0 - padding_right = 0 - padding_top = 0 - padding_bottom = 0 - pad = [padding_left, padding_right, padding_top, padding_bottom] - l1_h = 56 - l1_w = 56 - c1_index = 0 - jump_stride = 1 - repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (25088 * 32 // 2,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (49 * 256 * 8,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - with tik_instance.for_range(0,2) as eeb0: - tik_instance.data_move(input_1_1_local_L1, input_x[block_index,eeb0 * 8,0,0,0], 0, 1, 25088, 0, 0) - with tik_instance.for_range(0,8) as eeb1: - fetch_filter_w = 0 - fetch_filter_h = 0 - left_top_h = 0 - left_top_w = 0 - tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[eeb1 * 49 * 256], input_1_1_local_L1[eeb1 * 56 * 56 * 16], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - 49) - with tik_instance.for_range(0,8) as eeb1: - with tik_instance.for_range(0,49) as i: - tik_instance.data_move(res[eeb0 * 8 + eeb1, i + block_index * 49, 0, 0],input_1_1_fractal_L1_local_UB[256 * i + eeb1 * 49 * 256], 0, 1, 16, 0, 0) + if padding == 'SAME': + padding_left = 0 + padding_right = 0 + padding_top = 0 + padding_bottom = 0 + pad = [padding_left, 
padding_right, padding_top, padding_bottom] + l1_h = 56 + l1_w = 56 + c1_index = 0 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (25088 * 32 // 2,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (49 * 256 * 8,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + with tik_instance.for_range(0, 2) as eeb0: + tik_instance.data_move(input_1_1_local_L1, input_x[block_index, eeb0 * 8, 0, 0, 0], 0, 1, 25088, 0, 0) + with tik_instance.for_range(0, 8) as eeb1: + fetch_filter_w = 0 + fetch_filter_h = 0 + left_top_h = 0 + left_top_w = 0 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[eeb1 * 49 * 256], + input_1_1_local_L1[eeb1 * 56 * 56 * 16], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + 49) + with tik_instance.for_range(0, 8) as eeb1: + with tik_instance.for_range(0, 49) as i: + tik_instance.data_move(res[eeb0 * 8 + eeb1, i + block_index * 49, 0, 0], + input_1_1_fractal_L1_local_UB[256 * i + eeb1 * 49 * 256], 0, 1, 16, 0, 0) tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x], outputs=[res]) return tik_instance diff --git a/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py b/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py index a3517dce75..0458363a6d 100644 --- a/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py @@ -17,17 +17,15 @@ limitations under the License. 
matmul """ from __future__ import absolute_import + import te.lang.cce import te.platform.cce_params as cce -from te.platform.fusion_manager import fusion_manager +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType +from te import tik from te import tvm from topi import generic from topi.cce import util -from impl.matmul_vector import matmul_vector_cce - -from te import tik -from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType # General limitation of the size for input shape: 2**31 SHAPE_SIZE_LIMIT = 2147483648 NoneType = type(None) @@ -46,6 +44,7 @@ matmul_cube_dense_left_op_info = TBERegOp("CusMatMulCubeDenseLeft") \ .dtype_format(DataType.F16_Default, DataType.F16_FracNZ, DataType.F16_Default, DataType.F16_FracNZ) \ .get_op_info() + # pylint: disable=locally-disabled,too-many-arguments,too-many-branches, too-many-statements, too-many-locals, def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b): """ @@ -115,16 +114,16 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b): if m_shape != 1: if n_shape == 1: - if km_shape % (cce.BLOCK_IN*cce.BLOCK_IN) != 0: + if km_shape % (cce.BLOCK_IN * cce.BLOCK_IN) != 0: raise RuntimeError("input shape K1 should be multiple of %d" - % (cce.BLOCK_IN*cce.BLOCK_IN)) - elif km_shape%k_block_size != 0: + % (cce.BLOCK_IN * cce.BLOCK_IN)) + elif km_shape % k_block_size != 0: raise RuntimeError( "input shape K1 should be multiple of %d" % cce.BLOCK_IN) else: - if km_shape % (cce.BLOCK_IN*cce.BLOCK_IN) != 0: + if km_shape % (cce.BLOCK_IN * cce.BLOCK_IN) != 0: raise RuntimeError("input shape K1 should be multiple of %d" - % (cce.BLOCK_IN*cce.BLOCK_IN)) + % (cce.BLOCK_IN * cce.BLOCK_IN)) if n_shape % cce.BLOCK_IN != 0 and n_shape != 1: raise RuntimeError("input shape N should be 1 or multiple of %d" % cce.BLOCK_IN) @@ -132,7 +131,7 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b): if len(shape_bias): if 
len(shape_bias) == 1: if is_gevm or is_gemv: - if shape_bias[0] != m_shape*n_shape: + if shape_bias[0] != m_shape * n_shape: raise RuntimeError("broadcast case shape bias for gemv must be equal m*n") else: if shape_bias[0] != n_shape: @@ -143,33 +142,36 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b): else: raise RuntimeError("unsupport input shape now for batch bias case") + def _get_bias(shape_bias): bias_length = shape_bias[0] - if bias_length % 16 ==0: + if bias_length % 16 == 0: return shape_bias else: - bias_length = (bias_length // 16)*16 + 16 + bias_length = (bias_length // 16) * 16 + 16 shape_bias = [] shape_bias.append(bias_length) return shape_bias + def _get_input_shape(shape_x): dim_a = shape_x[0] dim_b = shape_x[1] res = [] - if dim_a % 16 !=0: - dim_a = (dim_a // 16)*16 + 16 + if dim_a % 16 != 0: + dim_a = (dim_a // 16) * 16 + 16 res.append(dim_a) else: res.append(dim_a) - if dim_b % 16 !=0: - dim_b = (dim_b // 16)*16 + 16 + if dim_b % 16 != 0: + dim_b = (dim_b // 16) * 16 + 16 res.append(dim_b) else: res.append(dim_b) return res + def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): shape_a = input_x1.get("shape") shape_b = input_x2.get("shape") @@ -184,7 +186,7 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t if bias is not None and bool(bias): shape_bias = bias.get("shape") try: - trans_a_f = bool(1-trans_a) + trans_a_f = bool(1 - trans_a) if src_dtype == "float32" or src_dtype == "int32": if len(shape_a) != 2 and len(shape_b) != 2: return False @@ -205,44 +207,46 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t return False elif shape_a[1] != shape_b[0]: return False - + if trans_a_f and trans_b and shape_b[1] == 1: return False - + if src_dtype == "float16": if len(shape_a) != 2 and len(shape_b) != 2: return False - + if trans_a: m_shape = shape_a[1] k_shape = shape_a[0] else: 
m_shape = shape_a[0] k_shape = shape_a[1] - + if trans_b: n_shape = shape_b[0] k_b_shape = shape_b[1] else: n_shape = shape_b[1] k_b_shape = shape_b[0] - + if k_shape != k_b_shape: return False - + if m_shape == 1 or n_shape == 1: if k_shape % 256 != 0: return False - + except RuntimeError as e: return False - + return True - + + # pylint: disable=locally-disabled,too-many-arguments, too-many-locals, too-many-statements # @util.check_input_type(dict, dict, (dict, NoneType), dict, bool, bool, str) @op_info_register(matmul_cube_dense_left_op_info) -def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): +def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, + kernel_name="matmulcube"): """ calculating matrix multiplication with bias, C = A*B + bias, support input data with fractal format. @@ -279,87 +283,87 @@ def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=F print(shape_a, shape_b) print("============") if input_x2.get("format") == "FRACTAL_Z": - n,c,h,w = shape_b + n, c, h, w = shape_b c0 = 16 c1 = c // c0 if c1 == 0: c1 = 1 shape_b = [n, c1 * h * w * c0] - shape_a = [n,n] + shape_a = [n, n] if input_x1.get("format") == "FRACTAL_Z": - n,c,h,w = shape_a + n, c, h, w = shape_a c0 = 16 c1 = c // c0 if c1 == 0: c1 = 1 shape_a = [n, c1 * h * w * c0] shape_b = [c1 * h * w * c0, c1 * h * w * c0] - + if input_x2.get("format") == "FRACTAL_NZ": shape_a = [shape_b[0], shape_b[0]] shape_b = shape_b - + if input_x1.get("format") == "FRACTAL_NZ": shape_a = shape_a shape_b = [shape_a[1], shape_a[1]] - + shape_a = list(shape_a) shape_b = list(shape_b) - + shape_a = _get_input_shape(shape_a) shape_b = _get_input_shape(shape_b) - + util.check_kernel_name(kernel_name) util.check_shape_rule(shape_a) util.check_shape_rule(shape_b) util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT) util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT) - + shape_a 
= [shape_a[1], shape_a[0]] - trans_a = bool(1-trans_a) - + trans_a = bool(1 - trans_a) + shape_b = [shape_b[1], shape_b[0]] - trans_b = bool(1-trans_b) - + trans_b = bool(1 - trans_b) + shape_bias = () if bias is not None and bool(bias): shape_bias = bias.get("shape") shape_bias = list(shape_bias) shape_bias = _get_bias(shape_bias) - + src_dtype = input_x1.get("dtype").lower() dst_dtype = output_y.get("dtype").lower() _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b) - + m_shape = shape_a[len(shape_a) - 2] km_shape = shape_a[len(shape_a) - 1] kn_shape = shape_b[len(shape_a) - 2] n_shape = shape_b[len(shape_a) - 1] - + if src_dtype == "float16": block_reduce = cce.BLOCK_REDUCE - + block_in = cce.BLOCK_IN block_out = cce.BLOCK_OUT - + if trans_a and km_shape == 1: block_in = cce.BLOCK_VECTOR - + if not trans_a and m_shape == 1: block_in = cce.BLOCK_VECTOR - + if trans_b and kn_shape == 1: block_out = cce.BLOCK_VECTOR - + if not trans_b and n_shape == 1: block_out = cce.BLOCK_VECTOR - + if trans_a: shape_a_temp = (m_shape // block_reduce, km_shape // block_in, block_reduce, block_in) else: shape_a_temp = (m_shape // block_in, km_shape // block_reduce, block_in, block_reduce) - + if trans_b: shape_b_temp = (kn_shape // block_out, n_shape // block_reduce, block_reduce, block_out) else: @@ -368,7 +372,7 @@ def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=F format_a = "FRACTAL_NZ" shape_b_temp = (shape_b_temp[0], shape_b_temp[1], shape_b_temp[2], shape_b_temp[3]) format_b = "FRACTAL_NZ" - + print("=======================================") print(shape_a_temp, shape_b_temp) print(format_a, format_b) @@ -378,67 +382,85 @@ def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=F dtype=src_dtype) tensor_b = tvm.placeholder(shape_b_temp, name='tensor_b', dtype=src_dtype) - + if len(shape_bias) > 0: tensor_bias = tvm.placeholder(shape_bias, name='tensor_bias', dtype=dst_dtype) - + if shape_a_temp[0] == 
63 and shape_a_temp[1] == 63 and shape_b_temp[0] == 128 and shape_b_temp[1] == 63: if util.get_product_version() == util.VERSION_MINI: - tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) + tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) else: - tik_instance = tik.Tik(tik.Dprofile("v100", "cloud")) - + tik_instance = tik.Tik(tik.Dprofile("v100", "cloud")) + input_x1 = tik_instance.Tensor("float16", shape_a_temp, name="left_matrix", scope=tik.scope_gm) input_x2 = tik_instance.Tensor("float16", shape_b_temp, name="right_matrix", scope=tik.scope_gm) resMatmul = tik_instance.Tensor("float16", shape_output, name="output", scope=tik.scope_gm) - with tik_instance.for_range(0,32,block_num=32) as block_index: - resMatmul_local_UB = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_ubuf, name = "resMatmul_local_UB") - resMatmul_local_UB_local_L0C = tik_instance.Tensor("float32", (128 * 256,), scope=tik.scope_cc, name = "resMatmul_local_UB") - input_1_local_L1_local_L0A = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_ca, name = "input_1_local_L1_local_L0A") - input_2_local_L1 = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_cbuf, name = "input_2_local_L1") - input_1_local_L1 = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cbuf, name = "input_1_local_L1") - input_2_local_L1_local_L0B = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_cb, name = "input_2_local_L1_local_L0B") + with tik_instance.for_range(0, 32, block_num=32) as block_index: + resMatmul_local_UB = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_ubuf, + name="resMatmul_local_UB") + resMatmul_local_UB_local_L0C = tik_instance.Tensor("float32", (128 * 256,), scope=tik.scope_cc, + name="resMatmul_local_UB") + input_1_local_L1_local_L0A = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_ca, + name="input_1_local_L1_local_L0A") + input_2_local_L1 = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_cbuf, + 
name="input_2_local_L1") + input_1_local_L1 = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cbuf, + name="input_1_local_L1") + input_2_local_L1_local_L0B = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_cb, + name="input_2_local_L1_local_L0B") core_m_idx = block_index % 8 core_n_idx = block_index // 8 with tik_instance.if_scope(core_m_idx != 7): - tik_instance.data_move(input_1_local_L1, input_x1[core_m_idx * (8 * 256 + 128 * 1008)], 0, 8, 128, 55 * 16, 0) - tik_instance.data_move(input_2_local_L1, input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], 0, 32, 128, 55 * 16, 0) + tik_instance.data_move(input_1_local_L1, input_x1[core_m_idx * (8 * 256 + 128 * 1008)], 0, 8, 128, + 55 * 16, 0) + tik_instance.data_move(input_2_local_L1, input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], 0, + 32, 128, 55 * 16, 0) with tik_instance.for_range(0, 8) as cc12: - tik_instance.load2dv1(input_1_local_L1_local_L0A[cc12 * 2048], input_1_local_L1[cc12 * 256], 0, 8, 8, 0, False) + tik_instance.load2dv1(input_1_local_L1_local_L0A[cc12 * 2048], input_1_local_L1[cc12 * 256], 0, 8, + 8, 0, False) with tik_instance.for_range(0, 2) as cc6: with tik_instance.for_range(0, 8) as cc121: - tik_instance.load2dv1(input_2_local_L1_local_L0B[cc121 * 4096], input_2_local_L1[cc6 * 32768 + cc121 * 256], 0, 16, 8, 0, True) - tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A, input_2_local_L1_local_L0B, 128, 128, 256, 0) + tik_instance.load2dv1(input_2_local_L1_local_L0B[cc121 * 4096], + input_2_local_L1[cc6 * 32768 + cc121 * 256], 0, 16, 8, 0, True) + tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A, + input_2_local_L1_local_L0B, 128, 128, 256, 0) tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 128, 0, 0, 1) - tik_instance.data_move(resMatmul[cc6 * 256 * 1008 + core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], resMatmul_local_UB, 0, 16, 256 // 2 , 0, 55 * 16 * 2 // 2) + 
tik_instance.data_move(resMatmul[cc6 * 256 * 1008 + core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], + resMatmul_local_UB, 0, 16, 256 // 2, 0, 55 * 16 * 2 // 2) with tik_instance.else_scope(): - tik_instance.data_move(input_1_local_L1, input_x1[core_m_idx * (8 * 256 + 128 * 1008)], 0, 7, 112, 56 * 16, 0) - tik_instance.data_move(input_2_local_L1, input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], 0, 32, 112, 56 * 16, 0) + tik_instance.data_move(input_1_local_L1, input_x1[core_m_idx * (8 * 256 + 128 * 1008)], 0, 7, 112, + 56 * 16, 0) + tik_instance.data_move(input_2_local_L1, input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], 0, + 32, 112, 56 * 16, 0) with tik_instance.for_range(0, 7) as cc10: - tik_instance.load2dv1(input_1_local_L1_local_L0A[cc10 * 1792], input_1_local_L1[cc10 * 256], 0, 7, 7, 0, False) + tik_instance.load2dv1(input_1_local_L1_local_L0A[cc10 * 1792], input_1_local_L1[cc10 * 256], 0, 7, + 7, 0, False) with tik_instance.for_range(0, 2) as cc5: with tik_instance.for_range(0, 7) as cc101: - tik_instance.load2dv1(input_2_local_L1_local_L0B[cc101 * 4096], input_2_local_L1[cc5 * 28672 + cc101 * 256], 0, 16, 7, 0, True) - tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A, input_2_local_L1_local_L0B, 112, 112, 256, 0) + tik_instance.load2dv1(input_2_local_L1_local_L0B[cc101 * 4096], + input_2_local_L1[cc5 * 28672 + cc101 * 256], 0, 16, 7, 0, True) + tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A, + input_2_local_L1_local_L0B, 112, 112, 256, 0) tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 112, 0, 0, 1) - tik_instance.data_move(resMatmul[cc5 * 256 * 1008 + core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], resMatmul_local_UB, 0, 16, 224 // 2 , 0, 56 * 16 * 2 // 2) + tik_instance.data_move(resMatmul[cc5 * 256 * 1008 + core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], + resMatmul_local_UB, 0, 16, 224 // 2, 0, 56 * 16 * 2 // 2) 
tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2], outputs=[resMatmul]) return tik_instance else: print("come into tbe, shape is error!") result = te.lang.cce.matmul(tensor_a, tensor_b, trans_a, trans_b, format_a=format_a, format_b=format_b, dst_dtype=dst_dtype, tensor_bias=tensor_bias) - + with tvm.target.cce(): schedule = generic.auto_schedule(result) - + tensor_list = [tensor_a, tensor_b, result] if len(shape_bias) > 0: tensor_list = [tensor_a, tensor_b, tensor_bias, result] - + config = {"print_ir": False, "name": kernel_name, "tensor_list": tensor_list} - + te.lang.cce.cce_build_code(schedule, config) diff --git a/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_right_impl.py b/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_right_impl.py index d0522824ca..5cae9afda0 100644 --- a/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_right_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_right_impl.py @@ -18,15 +18,10 @@ limitations under the License. 
matmul """ from __future__ import absolute_import -import te.lang.cce -import te.platform.cce_params as cce -from te.platform.fusion_manager import fusion_manager -from te import tvm -from topi import generic -from topi.cce import util -from impl.matmul_vector import matmul_vector_cce -from te import tik + from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType +from te import tik +from topi.cce import util matmul_cube_dense_right_op_info = TBERegOp("CusMatMulCubeDenseRight") \ .fusion_type("OPAQUE") \ @@ -40,23 +35,26 @@ matmul_cube_dense_right_op_info = TBERegOp("CusMatMulCubeDenseRight") \ .input(2, "x3", False, "required", "all") \ .input(3, "x4", False, "optional", "all") \ .output(0, "y", False, "required", "all") \ - .dtype_format(DataType.F16_FracNZ, DataType.F16_Default, DataType.F32_Default, DataType.F16_Default, DataType.F32_FracNZ) \ + .dtype_format(DataType.F16_FracNZ, DataType.F16_Default, DataType.F32_Default, DataType.F16_Default, + DataType.F32_FracNZ) \ .get_op_info() - + + @op_info_register(matmul_cube_dense_right_op_info) -def CusMatMulCubeDenseRight(input_x1, input_x2, input_x3, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): +def CusMatMulCubeDenseRight(input_x1, input_x2, input_x3, bias=None, output_y={}, trans_a=False, trans_b=False, + kernel_name="matmulcube"): shape_a_temp = (128, 63, 16, 16) shape_b_temp = (128, 128, 16, 16) shape_output = output_y.get("shape") matrix_max_shape = (1,) - support_shape = [(shape_a_temp, shape_b_temp, matrix_max_shape),] + support_shape = [(shape_a_temp, shape_b_temp, matrix_max_shape), ] shape_a_input = input_x1.get("shape") shape_b_input = input_x2.get("shape") matrix_max_input = input_x3.get("shape") input_shape = (tuple(shape_a_input), tuple(shape_b_input), tuple(matrix_max_input)) if input_shape not in support_shape: raise RuntimeError("input_shape %s is not supported" % str(input_shape)) - + if shape_a_temp[0] == 128 and shape_a_temp[1] == 63 and 
shape_b_temp[0] == 128 and shape_b_temp[1] == 128: if util.get_product_version() == util.VERSION_MINI: tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) @@ -64,79 +62,110 @@ def CusMatMulCubeDenseRight(input_x1, input_x2, input_x3, bias=None, output_y={} tik_instance = tik.Tik(tik.Dprofile("v100", "cloud")) input_x1 = tik_instance.Tensor("float16", shape_a_temp, name="left_matrix", scope=tik.scope_gm) input_x2 = tik_instance.Tensor("float16", shape_b_temp, name="right_matrix", scope=tik.scope_gm) - input_x3 = tik_instance.Tensor("float32", [1,], name="matrix_max", scope=tik.scope_gm) + input_x3 = tik_instance.Tensor("float32", [1, ], name="matrix_max", scope=tik.scope_gm) resMatmul = tik_instance.Tensor("float32", shape_output, name="output", scope=tik.scope_gm) with tik_instance.for_range(0, 32, block_num=32) as block_index: core_m_idx = block_index // 16 core_n_idx = block_index % 16 matrix_max_scalar = tik_instance.Scalar("float32") - matrix_max_local_UB = tik_instance.Tensor("float32", (8,), scope = tik.scope_ubuf, name = "matrix_max_local_UB") + matrix_max_local_UB = tik_instance.Tensor("float32", (8,), scope=tik.scope_ubuf, name="matrix_max_local_UB") tik_instance.data_move(matrix_max_local_UB, input_x3, 0, 1, 1, 0, 0) matrix_max_scalar.set_as(matrix_max_local_UB[0]) - - resMatmul_local_UB = tik_instance.Tensor("float32", (256 * 128,), scope=tik.scope_ubuf, name = "resMatmul_local_UB") - resMatmul_local_UB1 = tik_instance.Tensor("float32", (240 * 128,), scope=tik.scope_ubuf, name = "resMatmul_local_UB1") - resMatmul_local_UB_local_L0C = tik_instance.Tensor("float32", (256 * 128,), scope=tik.scope_cc, name = "resMatmul_local_UB_local_L0C") - resMatmul_local_UB_local_L0C1 = tik_instance.Tensor("float32", (240 * 128,), scope=tik.scope_cc, name = "resMatmul_local_UB_local_L0C1") + resMatmul_local_UB = tik_instance.Tensor("float32", (256 * 128,), scope=tik.scope_ubuf, + name="resMatmul_local_UB") + resMatmul_local_UB1 = tik_instance.Tensor("float32", (240 * 
128,), scope=tik.scope_ubuf, + name="resMatmul_local_UB1") + + resMatmul_local_UB_local_L0C = tik_instance.Tensor("float32", (256 * 128,), scope=tik.scope_cc, + name="resMatmul_local_UB_local_L0C") + resMatmul_local_UB_local_L0C1 = tik_instance.Tensor("float32", (240 * 128,), scope=tik.scope_cc, + name="resMatmul_local_UB_local_L0C1") + + input_1_local_L1_local_L0A = tik_instance.Tensor("float16", (256 * 128,), scope=tik.scope_ca, + name="input_1_local_L1_local_L0A") + input_2_local_L1 = tik_instance.Tensor("float16", (8 * 128 * 16,), scope=tik.scope_cbuf, + name="input_2_local_L1") + input_2_local_L11 = tik_instance.Tensor("float16", (8 * 128 * 16,), scope=tik.scope_cbuf, + name="input_2_local_L11") + + input_1_local_L1 = tik_instance.Tensor("float16", (8 * 256 * 16,), scope=tik.scope_cbuf, + name="input_1_local_L1") + input_1_local_L11 = tik_instance.Tensor("float16", (8 * 240 * 16,), scope=tik.scope_cbuf, + name="input_1_local_L11") + + input_2_local_L1_local_L0B = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cb, + name="input_2_local_L1_local_L0B") + input_2_local_L1_local_L0B1 = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cb, + name="input_2_local_L1_local_L0B1") - input_1_local_L1_local_L0A = tik_instance.Tensor("float16", (256 * 128,), scope=tik.scope_ca, name = "input_1_local_L1_local_L0A") - input_2_local_L1 = tik_instance.Tensor("float16", (8 * 128 * 16,), scope=tik.scope_cbuf, name = "input_2_local_L1") - input_2_local_L11 = tik_instance.Tensor("float16", (8 * 128 * 16,), scope=tik.scope_cbuf, name = "input_2_local_L11") - - input_1_local_L1 = tik_instance.Tensor("float16", (8 * 256 * 16,), scope=tik.scope_cbuf, name = "input_1_local_L1") - input_1_local_L11 = tik_instance.Tensor("float16", (8 * 240 * 16,), scope=tik.scope_cbuf, name = "input_1_local_L11") - - input_2_local_L1_local_L0B = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cb, name = "input_2_local_L1_local_L0B") - input_2_local_L1_local_L0B1 = 
tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cb, name = "input_2_local_L1_local_L0B1") - with tik_instance.if_scope(core_m_idx == 0): with tik_instance.for_range(0, 2) as cc1: - tik_instance.data_move(input_2_local_L1, input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8, 128, 1920, 0) - tik_instance.data_move(input_1_local_L1, input_x1[core_n_idx * 129024 + cc1 * 4096], 0, 8, 256, 752, 0) + tik_instance.data_move(input_2_local_L1, input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8, + 128, 1920, 0) + tik_instance.data_move(input_1_local_L1, input_x1[core_n_idx * 129024 + cc1 * 4096], 0, 8, 256, 752, + 0) with tik_instance.for_range(0, 8) as cc10: - tik_instance.load2dv1(input_2_local_L1_local_L0B[cc10 * 2048], input_2_local_L1[cc10 * 256], 0, 8, 8, 0, True) + tik_instance.load2dv1(input_2_local_L1_local_L0B[cc10 * 2048], input_2_local_L1[cc10 * 256], 0, + 8, 8, 0, True) with tik_instance.for_range(0, 16) as cc101: - tik_instance.load2dv1(input_1_local_L1_local_L0A[cc101 * 2048], input_1_local_L1[cc101 * 256], 0, 8, 16, 0, False) - - tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A, input_2_local_L1_local_L0B, 256, 128, 128, 0) + tik_instance.load2dv1(input_1_local_L1_local_L0A[cc101 * 2048], input_1_local_L1[cc101 * 256], + 0, 8, 16, 0, False) + + tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A, + input_2_local_L1_local_L0B, 256, 128, 128, 0) tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 128, 0, 0) - tik_instance.vmuls(64, resMatmul_local_UB, resMatmul_local_UB, matrix_max_scalar, 255,1,1,8,8) - tik_instance.vmuls(64, resMatmul_local_UB[255*64], resMatmul_local_UB[255*64], matrix_max_scalar, 255,1,1,8,8) - tik_instance.vmuls(64, resMatmul_local_UB[510*64], resMatmul_local_UB[510*64], matrix_max_scalar, 2,1,1,8,8) - - tik_instance.data_move(resMatmul[core_n_idx * 129024 + cc1 * 4096], resMatmul_local_UB, 0, 8, 512, 0, 1504) + tik_instance.vmuls(64, 
resMatmul_local_UB, resMatmul_local_UB, matrix_max_scalar, 255, 1, 1, 8, 8) + tik_instance.vmuls(64, resMatmul_local_UB[255 * 64], resMatmul_local_UB[255 * 64], + matrix_max_scalar, 255, 1, 1, 8, 8) + tik_instance.vmuls(64, resMatmul_local_UB[510 * 64], resMatmul_local_UB[510 * 64], + matrix_max_scalar, 2, 1, 1, 8, 8) + + tik_instance.data_move(resMatmul[core_n_idx * 129024 + cc1 * 4096], resMatmul_local_UB, 0, 8, 512, + 0, 1504) with tik_instance.else_scope(): - tik_instance.data_move(input_2_local_L1, input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8, 128, 1920, 0) + tik_instance.data_move(input_2_local_L1, input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8, 128, + 1920, 0) tik_instance.data_move(input_1_local_L1, input_x1[core_n_idx * 129024 + 2 * 4096], 0, 8, 256, 752, 0) with tik_instance.for_range(0, 8) as cc10: - tik_instance.load2dv1(input_2_local_L1_local_L0B[cc10 * 2048], input_2_local_L1[cc10 * 256], 0, 8, 8, 0, True) + tik_instance.load2dv1(input_2_local_L1_local_L0B[cc10 * 2048], input_2_local_L1[cc10 * 256], 0, 8, + 8, 0, True) with tik_instance.for_range(0, 16) as cc101: - tik_instance.load2dv1(input_1_local_L1_local_L0A[cc101 * 2048], input_1_local_L1[cc101 * 256], 0, 8, 16, 0, False) - - tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A, input_2_local_L1_local_L0B, 256, 128, 128, 0) + tik_instance.load2dv1(input_1_local_L1_local_L0A[cc101 * 2048], input_1_local_L1[cc101 * 256], 0, 8, + 16, 0, False) + + tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A, input_2_local_L1_local_L0B, + 256, 128, 128, 0) tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 128, 0, 0) - tik_instance.vmuls(64, resMatmul_local_UB, resMatmul_local_UB, matrix_max_scalar, 255,1,1,8,8) - tik_instance.vmuls(64, resMatmul_local_UB[255*64], resMatmul_local_UB[255*64], matrix_max_scalar, 255,1,1,8,8) - tik_instance.vmuls(64, resMatmul_local_UB[510*64], resMatmul_local_UB[510*64], 
matrix_max_scalar, 2,1,1,8,8) - - tik_instance.data_move(resMatmul[core_n_idx * 129024 + 2 * 4096], resMatmul_local_UB, 0, 8, 512, 0, 1504) - - tik_instance.data_move(input_2_local_L11, input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8, 128, 1920, 0) + tik_instance.vmuls(64, resMatmul_local_UB, resMatmul_local_UB, matrix_max_scalar, 255, 1, 1, 8, 8) + tik_instance.vmuls(64, resMatmul_local_UB[255 * 64], resMatmul_local_UB[255 * 64], matrix_max_scalar, + 255, 1, 1, 8, 8) + tik_instance.vmuls(64, resMatmul_local_UB[510 * 64], resMatmul_local_UB[510 * 64], matrix_max_scalar, 2, + 1, 1, 8, 8) + + tik_instance.data_move(resMatmul[core_n_idx * 129024 + 2 * 4096], resMatmul_local_UB, 0, 8, 512, 0, + 1504) + + tik_instance.data_move(input_2_local_L11, input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8, 128, + 1920, 0) tik_instance.data_move(input_1_local_L11, input_x1[core_n_idx * 129024 + 12288], 0, 8, 240, 768, 0) - + with tik_instance.for_range(0, 8) as cc102: - tik_instance.load2dv1(input_2_local_L1_local_L0B1[cc102 * 2048], input_2_local_L11[cc102 * 256], 0, 8, 8, 0, True) + tik_instance.load2dv1(input_2_local_L1_local_L0B1[cc102 * 2048], input_2_local_L11[cc102 * 256], 0, + 8, 8, 0, True) with tik_instance.for_range(0, 16) as cc103: - tik_instance.load2dv1(input_1_local_L1_local_L0A[cc103 * 2048], input_1_local_L11[cc103 * 256], 0, 8, 15, 0, False) - - tik_instance.mmad(resMatmul_local_UB_local_L0C1, input_1_local_L1_local_L0A, input_2_local_L1_local_L0B1, 240, 128, 128, 0) + tik_instance.load2dv1(input_1_local_L1_local_L0A[cc103 * 2048], input_1_local_L11[cc103 * 256], 0, + 8, 15, 0, False) + + tik_instance.mmad(resMatmul_local_UB_local_L0C1, input_1_local_L1_local_L0A, + input_2_local_L1_local_L0B1, 240, 128, 128, 0) tik_instance.data_move(resMatmul_local_UB1, resMatmul_local_UB_local_L0C1, 0, 1, 120, 0, 0) - - tik_instance.vmuls(64, resMatmul_local_UB1, resMatmul_local_UB1, matrix_max_scalar, 255,1,1,8,8) - tik_instance.vmuls(64, 
resMatmul_local_UB1[255*64], resMatmul_local_UB1[255*64], matrix_max_scalar, 225,1,1,8,8) - + + tik_instance.vmuls(64, resMatmul_local_UB1, resMatmul_local_UB1, matrix_max_scalar, 255, 1, 1, 8, 8) + tik_instance.vmuls(64, resMatmul_local_UB1[255 * 64], resMatmul_local_UB1[255 * 64], matrix_max_scalar, + 225, 1, 1, 8, 8) + tik_instance.data_move(resMatmul[core_n_idx * 129024 + 12288], resMatmul_local_UB1, 0, 8, 480, 0, 1536) - + tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2, input_x3], outputs=[resMatmul]) return tik_instance diff --git a/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py b/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py index aeef521d1e..ebff84d889 100644 --- a/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py @@ -17,11 +17,12 @@ limitations under the License. matmul """ from __future__ import absolute_import + import te.platform.cce_params as cce -from te import tvm -from topi.cce import util -from te import tik from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType +from te import tik +from topi.cce import util + # General limitation of the size for input shape: 2**31 SHAPE_SIZE_LIMIT = 2147483648 NoneType = type(None) @@ -40,6 +41,7 @@ matmul_cube_fracz_left_cast_op_info = TBERegOp("CusMatMulCubeFraczLeftCast") \ .dtype_format(DataType.F16_Default, DataType.F32_FracZ, DataType.F16_Default, DataType.F16_FracZ) \ .get_op_info() + # pylint: disable=locally-disabled,too-many-arguments,too-many-branches, too-many-statements, too-many-locals, def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b): """ @@ -137,6 +139,7 @@ src_dtype: str else: raise RuntimeError("unsupport input shape now for batch bias case") + def _get_bias(shape_bias): bias_length = shape_bias[0] if bias_length % 16 == 0: @@ -147,6 +150,7 @@ def _get_bias(shape_bias): 
shape_bias.append(bias_length) return shape_bias + def _get_input_shape(shape_x): dim_a = shape_x[0] dim_b = shape_x[1] @@ -164,6 +168,7 @@ def _get_input_shape(shape_x): res.append(dim_b) return res + def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): shape_a = input_x1.get("shape") shape_b = input_x2.get("shape") @@ -199,40 +204,41 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t return False elif shape_a[1] != shape_b[0]: return False - + if trans_a_f and trans_b and shape_b[1] == 1: return False - + if src_dtype == "float16": if len(shape_a) != 2 and len(shape_b) != 2: return False - + if trans_a: m_shape = shape_a[1] k_shape = shape_a[0] else: m_shape = shape_a[0] k_shape = shape_a[1] - + if trans_b: n_shape = shape_b[0] k_b_shape = shape_b[1] else: n_shape = shape_b[1] k_b_shape = shape_b[0] - + if k_shape != k_b_shape: return False - + if m_shape == 1 or n_shape == 1: if k_shape % 256 != 0: return False - + except RuntimeError as e: return False - + return True - + + # pylint: disable=locally-disabled,too-many-arguments, too-many-locals, too-many-statements @op_info_register(matmul_cube_fracz_left_cast_op_info) def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, @@ -278,7 +284,7 @@ def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans c1 = 1 shape_b = [n, c1 * h * w * c0] shape_a = [n, n] - + if input_x1.get("format") == "FRACTAL_Z": n, c, h, w = shape_a c0 = 16 @@ -291,26 +297,26 @@ def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans if input_x2.get("format") == "FRACTAL_NZ": shape_a = [shape_b[0], shape_b[0]] shape_b = shape_b - + if input_x1.get("format") == "FRACTAL_NZ": shape_a = shape_a shape_b = [shape_a[1], shape_a[1]] shape_a = list(shape_a) shape_b = list(shape_b) - + shape_a = _get_input_shape(shape_a) shape_b = _get_input_shape(shape_b) 
- + util.check_kernel_name(kernel_name) util.check_shape_rule(shape_a) util.check_shape_rule(shape_b) util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT) util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT) - + shape_a = [shape_a[1], shape_a[0]] trans_a = bool(1 - trans_a) - + shape_b = [shape_b[1], shape_b[0]] trans_b = bool(1 - trans_b) @@ -319,45 +325,45 @@ def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans shape_bias = bias.get("shape") shape_bias = list(shape_bias) shape_bias = _get_bias(shape_bias) - + src_dtype = input_x1.get("dtype").lower() _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b) - + m_shape = shape_a[len(shape_a) - 2] km_shape = shape_a[len(shape_a) - 1] kn_shape = shape_b[len(shape_a) - 2] n_shape = shape_b[len(shape_a) - 1] - + if src_dtype == "float16": block_reduce = cce.BLOCK_REDUCE - + block_in = cce.BLOCK_IN block_out = cce.BLOCK_OUT - + if trans_a and km_shape == 1: block_in = cce.BLOCK_VECTOR - + if not trans_a and m_shape == 1: block_in = cce.BLOCK_VECTOR - + if trans_b and kn_shape == 1: block_out = cce.BLOCK_VECTOR - + if not trans_b and n_shape == 1: block_out = cce.BLOCK_VECTOR - + if trans_a: shape_a_temp = (m_shape // block_reduce, km_shape // block_in, block_reduce, block_in) else: shape_a_temp = (m_shape // block_in, km_shape // block_reduce, block_in, block_reduce) - + if trans_b: shape_b_temp = (kn_shape // block_out, n_shape // block_reduce, block_reduce, block_out) else: shape_b_temp = (kn_shape // block_reduce, n_shape // block_out, block_out, block_reduce) shape_a_temp = (shape_a_temp[0], shape_a_temp[1], shape_a_temp[2], shape_a_temp[3]) shape_b_temp = (shape_b_temp[0], shape_b_temp[1], shape_b_temp[2], shape_b_temp[3]) - + if util.get_product_version() == util.VERSION_MINI: tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) else: @@ -372,7 +378,8 @@ def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans diag_opt=diag_opt, diag_size=DIAG_SIZE) 
tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2], outputs=[res_matmul]) return tik_instance - + + def get_cus_tile_info(input_x1, input_x2, diag_size): tile_map = { ((32, 32, 16, 16), (128, 32, 16, 16)): (8, 8, 16), @@ -381,10 +388,10 @@ def get_cus_tile_info(input_x1, input_x2, diag_size): ((128, 128, 16, 16), (32, 128, 16, 16)): (8, 8, 16), ((16, 16, 16, 16), (144, 16, 16, 16)): (8, 8, 9), ((64, 64, 16, 16), (16, 64, 16, 16)): (8, 8, 4), - ((16, 16, 16, 16), (64, 16, 16, 16)): (8, 8, 4), - ((32, 32, 16, 16), (8, 32, 16, 16)): (8, 8, 1), + ((16, 16, 16, 16), (64, 16, 16, 16)): (8, 8, 4), + ((32, 32, 16, 16), (8, 32, 16, 16)): (8, 8, 1), ((128, 128, 16, 16), (64, 128, 16, 16)): (8, 8, 16), - ((16, 16, 16, 16), (4, 16, 16, 16)): (8, 8, 1), + ((16, 16, 16, 16), (4, 16, 16, 16)): (8, 8, 1), ((16, 16, 16, 16), (32, 16, 16, 16)): (8, 8, 2), ((64, 64, 16, 16), (32, 64, 16, 16)): (8, 8, 8), ((32, 32, 16, 16), (64, 32, 16, 16)): (8, 8, 8), @@ -398,13 +405,14 @@ def get_cus_tile_info(input_x1, input_x2, diag_size): } shape_info = (tuple(input_x1.shape), tuple(input_x2.shape)) diag_opt = False - if input_x1.shape[0]*input_x1.shape[3] > diag_size: + if input_x1.shape[0] * input_x1.shape[3] > diag_size: diag_opt = True if shape_info not in tile_map: raise ValueError("shape %s is not supported" % str(shape_info)) mo_tile, ko_tile, no_tile = tile_map[shape_info] return mo_tile, ko_tile, no_tile, diag_opt + def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b, res, mo_tile, ko_tile, no_tile, diag_opt=False, diag_size=128): ko, mo, mi, ki = input_x1.shape @@ -420,7 +428,7 @@ def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b, raise ValueError("shape of input_x1 or input_x2 is not supported!") if not trans_a or not trans_b: raise ValueError("only trans_a=False and trans_b=False be supported!") - + core_m_num = mo // mo_tile loop_n_num = no // no_tile if loop_n_num * core_m_num <= maxblocknum: @@ -432,7 +440,7 
@@ def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b, else: raise ValueError("Does not support this scenario!") block_num = core_m_num * core_n_num - + loop_k_num = ko // ko_tile if diag_opt: loop_k_num = diag_outer // ko_tile @@ -445,7 +453,7 @@ def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b, core_n = block_idx % core_n_num with tik_instance.for_range(0, loop_n_num) as cc_n: res_L0C = tik_instance.Tensor("float32", [no_tile, mo_tile, c0, c0], - name="resMatmul_L0C", scope=tik.scope_cc) + name="resMatmul_L0C", scope=tik.scope_cc) with tik_instance.for_range(0, loop_k_num, thread_num=thread_num_k) as thread_idx_k: # input_x2 -> input_x2_ub -(fp322fp16)-> input_x2_cast_ub -> input_x2_L1 input_x2_ub = tik_instance.Tensor("float32", [no_tile, ko_tile_inner, c0, c0], name="input_x2_ub", @@ -476,41 +484,41 @@ def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b, input_x2_cast_ub[count * repeate_times_max * vectorfp32_size], input_x2_ub[count * repeate_times_max * vectorfp32_size], repeate_num, 1, 1, 4, 8) - input_x2_L1 = tik_instance.Tensor("float16", [no_tile, ko_tile_inner, c0, c0], - name="input_x2_L1", scope=tik.scope_cbuf) - tik_instance.data_move(input_x2_L1, input_x2_cast_ub, 0, 1, - no_tile * ko_tile_inner * c0 * c0 * fp16_size // blocksize, 0, 0) - # input_x1 -> input_x1_L1 - input_x1_L1 = tik_instance.Tensor(input_x1.dtype, [ko_tile_inner, mo_tile, c0, c0], - name="input_x1_L1", scope=tik.scope_cbuf) - tik_instance.data_move(input_x1_L1, - input_x1[k_idx, - core_m * mo_tile, 0, 0], - 0, ko_tile_inner, mo_tile * c0 * c0 * fp16_size // blocksize, - (mo - mo_tile) * c0 * c0 * fp16_size // blocksize, 0) - # input_x2_L1 -> input_x2_L0B - input_x2_L0B = tik_instance.Tensor("float16", [ko_tile_inner, no_tile, c0, c0], - name="input_x2_L0B", scope=tik.scope_cb) - with tik_instance.for_range(0, ko_tile_inner) as cc2: - tik_instance.load2dv1(input_x2_L0B[cc2, 0, 0, 0], input_x2_L1[0, cc2, 0, 
0], 0, no_tile, - ko_tile_inner, - 0, True) - # input_x1_L1 -> input_x1_L0A - input_x1_L0A = tik_instance.Tensor(input_x1.dtype, [mo_tile, ko_tile_inner, c0, c0], - name="input_x1_L0A", scope=tik.scope_ca) - with tik_instance.for_range(0, mo_tile) as cc1: - tik_instance.load2dv1(input_x1_L0A[cc1, 0, 0, 0], input_x1_L1[0, cc1, 0, 0], 0, ko_tile_inner, - mo_tile, 0, False) - with tik_instance.if_scope(thread_idx_k == 0): - tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B, mo_tile * c0, - ko_tile_inner * c0, no_tile * c0, 0) - with tik_instance.else_scope(): - tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B, mo_tile * c0, - ko_tile_inner * c0, no_tile * c0, 1) - res_ub = tik_instance.Tensor(input_x1.dtype, [no_tile, mo_tile, c0, c0], - name="resMatmul_ub", scope=tik.scope_ubuf) - tik_instance.data_move(res_ub, res_L0C, 0, 1, no_tile * mo_tile, 0, 0, 1) - tik_instance.data_move(res[(core_n * loop_n_num + cc_n) * no_tile, core_m * mo_tile, 0, 0], - res_ub, 0, no_tile, - mo_tile * c0 * c0 * fp16_size // blocksize, 0, - (mo - mo_tile) * c0 * c0 * fp16_size // blocksize) + input_x2_L1 = tik_instance.Tensor("float16", [no_tile, ko_tile_inner, c0, c0], + name="input_x2_L1", scope=tik.scope_cbuf) + tik_instance.data_move(input_x2_L1, input_x2_cast_ub, 0, 1, + no_tile * ko_tile_inner * c0 * c0 * fp16_size // blocksize, 0, 0) + # input_x1 -> input_x1_L1 + input_x1_L1 = tik_instance.Tensor(input_x1.dtype, [ko_tile_inner, mo_tile, c0, c0], + name="input_x1_L1", scope=tik.scope_cbuf) + tik_instance.data_move(input_x1_L1, + input_x1[k_idx, + core_m * mo_tile, 0, 0], + 0, ko_tile_inner, mo_tile * c0 * c0 * fp16_size // blocksize, + (mo - mo_tile) * c0 * c0 * fp16_size // blocksize, 0) + # input_x2_L1 -> input_x2_L0B + input_x2_L0B = tik_instance.Tensor("float16", [ko_tile_inner, no_tile, c0, c0], + name="input_x2_L0B", scope=tik.scope_cb) + with tik_instance.for_range(0, ko_tile_inner) as cc2: + tik_instance.load2dv1(input_x2_L0B[cc2, 0, 0, 0], input_x2_L1[0, cc2, 0, 0], 
0, no_tile, + ko_tile_inner, + 0, True) + # input_x1_L1 -> input_x1_L0A + input_x1_L0A = tik_instance.Tensor(input_x1.dtype, [mo_tile, ko_tile_inner, c0, c0], + name="input_x1_L0A", scope=tik.scope_ca) + with tik_instance.for_range(0, mo_tile) as cc1: + tik_instance.load2dv1(input_x1_L0A[cc1, 0, 0, 0], input_x1_L1[0, cc1, 0, 0], 0, ko_tile_inner, + mo_tile, 0, False) + with tik_instance.if_scope(thread_idx_k == 0): + tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B, mo_tile * c0, + ko_tile_inner * c0, no_tile * c0, 0) + with tik_instance.else_scope(): + tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B, mo_tile * c0, + ko_tile_inner * c0, no_tile * c0, 1) + res_ub = tik_instance.Tensor(input_x1.dtype, [no_tile, mo_tile, c0, c0], + name="resMatmul_ub", scope=tik.scope_ubuf) + tik_instance.data_move(res_ub, res_L0C, 0, 1, no_tile * mo_tile, 0, 0, 1) + tik_instance.data_move(res[(core_n * loop_n_num + cc_n) * no_tile, core_m * mo_tile, 0, 0], + res_ub, 0, no_tile, + mo_tile * c0 * c0 * fp16_size // blocksize, 0, + (mo - mo_tile) * c0 * c0 * fp16_size // blocksize) diff --git a/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_right_mul_impl.py b/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_right_mul_impl.py index 321ec602f7..b5f8ee9d82 100644 --- a/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_right_mul_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_right_mul_impl.py @@ -18,37 +18,35 @@ limitations under the License. 
matmul """ from __future__ import absolute_import -import te.lang.cce -import te.platform.cce_params as cce -from te.platform.fusion_manager import fusion_manager -from te import tvm -from topi import generic -from topi.cce import util -from te import tik -from impl.matmul_vector import matmul_vector_cce + from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType +from te import tik +from topi.cce import util + # General limitation of the size for input shape: 2**31 SHAPE_SIZE_LIMIT = 2147483648 NoneType = type(None) cus_matmul_cube_fracz_right_mul_op_info = TBERegOp("CusMatMulCubeFraczRightMul") \ - .fusion_type("OPAQUE") \ - .async_flag(False) \ - .binfile_name("matmulcubefraczrightmul.so") \ - .compute_cost(10) \ - .kernel_name("CusMatMulCubeFraczRightMul") \ - .partial_flag(True) \ - .input(0, "x1", False, "required", "all") \ - .input(1, "x2", False, "required", "all") \ - .input(2, "x3", False, "required", "all") \ - .input(3, "x4", False, "optional", "all") \ - .output(0, "y", False, "required", "all") \ - .dtype_format(DataType.F16_FracZ, DataType.F16_Default, DataType.F32_Default, DataType.F16_Default, DataType.F32_FracZ) \ - .get_op_info() + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("matmulcubefraczrightmul.so") \ + .compute_cost(10) \ + .kernel_name("CusMatMulCubeFraczRightMul") \ + .partial_flag(True) \ + .input(0, "x1", False, "required", "all") \ + .input(1, "x2", False, "required", "all") \ + .input(2, "x3", False, "required", "all") \ + .input(3, "x4", False, "optional", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_FracZ, DataType.F16_Default, DataType.F32_Default, DataType.F16_Default, + DataType.F32_FracZ) \ + .get_op_info() @op_info_register(cus_matmul_cube_fracz_right_mul_op_info) -def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): +def CusMatMulCubeFraczRightMul(input_x1, 
input_x2, input_x3, bias=None, output_y={}, trans_a=False, trans_b=False, + kernel_name="matmulcube"): if util.get_product_version() == util.VERSION_MINI: tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) else: @@ -61,10 +59,10 @@ def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y input_x3_shape = input_x3.get("shape") input_x3_dtype = input_x3.get("dtype").lower() output_shape = output_y.get("shape") - Supported = [((72, 8, 16, 16),"float16", (72, 72, 16, 16), "float16", (1,), "float32"), - ((32, 8, 16, 16),"float16", (32, 32, 16, 16), "float16", (1,), "float32"), - ((8, 32, 16, 16),"float16", (8, 8, 16, 16), "float16", (1,), "float32"), - ((4, 4, 16, 16),"float16", (4, 4, 16, 16), "float16", (1,), "float32"), + Supported = [((72, 8, 16, 16), "float16", (72, 72, 16, 16), "float16", (1,), "float32"), + ((32, 8, 16, 16), "float16", (32, 32, 16, 16), "float16", (1,), "float32"), + ((8, 32, 16, 16), "float16", (8, 8, 16, 16), "float16", (1,), "float32"), + ((4, 4, 16, 16), "float16", (4, 4, 16, 16), "float16", (1,), "float32"), ((4, 16, 16, 16), 'float16', (4, 4, 16, 16), 'float16', (1,), 'float32'), ((49, 4, 16, 16), 'float16', (49, 49, 16, 16), 'float16', (1,), 'float32'), ((36, 4, 16, 16), 'float16', (36, 36, 16, 16), 'float16', (1,), 'float32'), @@ -81,7 +79,8 @@ def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y ((32, 128, 16, 16), 'float16', (32, 32, 16, 16), 'float16', (1,), 'float32'), ((64, 32, 16, 16), 'float16', (64, 64, 16, 16), 'float16', (1,), 'float32'), ((16, 64, 16, 16), 'float16', (16, 16, 16, 16), 'float16', (1,), 'float32')] - input_shape = (tuple(input_x1_shape), input_x1_dtype, tuple(input_x2_shape), input_x2_dtype, tuple(input_x3_shape), input_x3_dtype) + input_shape = ( + tuple(input_x1_shape), input_x1_dtype, tuple(input_x2_shape), input_x2_dtype, tuple(input_x3_shape), input_x3_dtype) if input_shape not in Supported: raise RuntimeError("input_shape %s is not supported" % 
str(input_shape)) @@ -93,6 +92,7 @@ def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2, input_x3], outputs=[resMatmul]) return tik_instance + def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3, res): diag_size = 128 @@ -176,7 +176,7 @@ def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3, name="resMatmul_L0C", scope=tik.scope_cc) with tik_instance.for_range(0, loop_k_num, thread_num=thread_num_k) as thread_idx_k: if diag_opt: - k_idx = (core_n*loop_n_num + cc_n) * no_tile + thread_idx_k * ko_tile_inner + k_idx = (core_n * loop_n_num + cc_n) * no_tile + thread_idx_k * ko_tile_inner else: k_idx = thread_idx_k * ko_tile_inner # input_x1 -> input_x1_L1 @@ -191,7 +191,7 @@ def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3, input_x2_L1 = tik_instance.Tensor("float16", [no_tile, ko_tile_inner, c0, c0], name="input_x2_L1", scope=tik.scope_cbuf) tik_instance.data_move(input_x2_L1, - input_x2[(core_n*loop_n_num + cc_n) * no_tile, + input_x2[(core_n * loop_n_num + cc_n) * no_tile, k_idx, 0, 0], 0, no_tile, ko_tile_inner * c0 * c0 * fp16_size // blocksize, (ko - ko_tile_inner) * c0 * c0 * fp16_size // blocksize, 0) @@ -215,9 +215,9 @@ def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3, tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B, mo_tile * c0, ko_tile_inner * c0, no_tile * c0, 1) res_ub = tik_instance.Tensor("float32", [no_tile, mo_tile, c0, c0], - name="resMatmul_ub", scope=tik.scope_ubuf) + name="resMatmul_ub", scope=tik.scope_ubuf) tik_instance.data_move(res_ub, res_L0C, 0, 1, no_tile * mo_tile, 0, 0) - + input_3_local_UB = tik_instance.Tensor("float32", (8,), scope=tik.scope_ubuf, name="input_3_local_UB") tik_instance.data_move(input_3_local_UB, input_x3, 0, 1, 1, 0, 0) matrix_max_scalar = tik_instance.Scalar("float32") @@ -236,7 +236,7 @@ def 
cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3, res_ub[count * repeate_times_max * vectorfp32_size], res_ub[count * repeate_times_max * vectorfp32_size], matrix_max_scalar, repeate_num, 1, 1, 8, 8) - + tik_instance.data_move(res[(core_n * loop_n_num + cc_n) * no_tile, (core_m * loop_m_num + cc_m) * mo_tile, 0, 0], res_ub, 0, no_tile, diff --git a/mindspore/ops/_op_impl/_custom_op/matmul_cube_impl.py b/mindspore/ops/_op_impl/_custom_op/matmul_cube_impl.py index 4bbfcf7f33..dfa83c4fb7 100644 --- a/mindspore/ops/_op_impl/_custom_op/matmul_cube_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/matmul_cube_impl.py @@ -18,13 +18,15 @@ limitations under the License. matmul """ from __future__ import absolute_import + import te.lang.cce import te.platform.cce_params as cce +from impl.matmul_vector import matmul_vector_cce +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType from te import tvm from topi import generic from topi.cce import util -from impl.matmul_vector import matmul_vector_cce -from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + # General limitation of the size for input shape: 2**31 SHAPE_SIZE_LIMIT = 2147483648 NoneType = type(None) @@ -36,8 +38,8 @@ matmul_cube_op_info = TBERegOp("CusMatMulCube") \ .compute_cost(10) \ .kernel_name("CusMatMulCube") \ .partial_flag(True) \ - .attr("transpose_a", "required", "bool", "all")\ - .attr("transpose_b", "required", "bool", "all")\ + .attr("transpose_a", "required", "bool", "all") \ + .attr("transpose_b", "required", "bool", "all") \ .input(0, "x1", False, "required", "all") \ .input(1, "x2", False, "required", "all") \ .input(2, "x3", False, "optional", "all") \ @@ -45,6 +47,7 @@ matmul_cube_op_info = TBERegOp("CusMatMulCube") \ .dtype_format(DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_Default, DataType.F32_FracNZ) \ .get_op_info() + # pylint: disable=locally-disabled,too-many-arguments,too-many-branches, too-many-statements, 
too-many-locals, def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b): """ @@ -113,16 +116,16 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b): if m_shape != 1: if n_shape == 1: - if km_shape % (cce.BLOCK_IN*cce.BLOCK_IN) != 0: + if km_shape % (cce.BLOCK_IN * cce.BLOCK_IN) != 0: raise RuntimeError("input shape K1 should be multiple of %d" - % (cce.BLOCK_IN*cce.BLOCK_IN)) - elif km_shape%k_block_size != 0: + % (cce.BLOCK_IN * cce.BLOCK_IN)) + elif km_shape % k_block_size != 0: raise RuntimeError( "input shape K1 should be multiple of %d" % cce.BLOCK_IN) else: - if km_shape % (cce.BLOCK_IN*cce.BLOCK_IN) != 0: + if km_shape % (cce.BLOCK_IN * cce.BLOCK_IN) != 0: raise RuntimeError("input shape K1 should be multiple of %d" - % (cce.BLOCK_IN*cce.BLOCK_IN)) + % (cce.BLOCK_IN * cce.BLOCK_IN)) if n_shape % cce.BLOCK_IN != 0 and n_shape != 1: raise RuntimeError("input shape N should be 1 or multiple of %d" % cce.BLOCK_IN) @@ -130,7 +133,7 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b): if len(shape_bias): if len(shape_bias) == 1: if is_gevm or is_gemv: - if shape_bias[0] != m_shape*n_shape: + if shape_bias[0] != m_shape * n_shape: raise RuntimeError("broadcast case shape bias for gemv must be equal m*n") else: if shape_bias[0] != n_shape: @@ -141,33 +144,36 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b): else: raise RuntimeError("unsupport input shape now for batch bias case") + def _get_bias(shape_bias): bias_length = shape_bias[0] - if bias_length % 16 ==0: + if bias_length % 16 == 0: return shape_bias else: - bias_length = (bias_length // 16)*16 + 16 + bias_length = (bias_length // 16) * 16 + 16 shape_bias = [] shape_bias.append(bias_length) return shape_bias + def _get_input_shape(shape_x): dim_a = shape_x[0] dim_b = shape_x[1] res = [] - if dim_a % 16 !=0: - dim_a = (dim_a // 16)*16 + 16 + if dim_a % 16 != 0: + dim_a = (dim_a // 16) * 16 + 16 
res.append(dim_a) else: res.append(dim_a) - if dim_b % 16 !=0: - dim_b = (dim_b // 16)*16 + 16 + if dim_b % 16 != 0: + dim_b = (dim_b // 16) * 16 + 16 res.append(dim_b) else: res.append(dim_b) return res + def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): shape_a = input_x1.get("shape") shape_b = input_x2.get("shape") @@ -182,7 +188,7 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t if bias is not None and bool(bias): shape_bias = bias.get("shape") try: - trans_a_f = bool(1-trans_a) + trans_a_f = bool(1 - trans_a) if src_dtype == "float32" or src_dtype == "int32": if len(shape_a) != 2 and len(shape_b) != 2: return False @@ -203,10 +209,10 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t return False elif shape_a[1] != shape_b[0]: return False - + if trans_a_f and trans_b and shape_b[1] == 1: return False - + if src_dtype == "float16": if len(shape_a) != 2 and len(shape_b) != 2: return False @@ -217,26 +223,27 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t else: m_shape = shape_a[0] k_shape = shape_a[1] - + if trans_b: n_shape = shape_b[0] k_b_shape = shape_b[1] else: n_shape = shape_b[1] k_b_shape = shape_b[0] - + if k_shape != k_b_shape: return False - + if m_shape == 1 or n_shape == 1: if k_shape % 256 != 0: return False - + except RuntimeError as e: return False - + return True - + + # pylint: disable=locally-disabled,too-many-arguments, too-many-locals, too-many-statements @op_info_register(matmul_cube_op_info) def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): @@ -269,18 +276,18 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra """ shape_a = input_x1.get("ori_shape") shape_b = input_x2.get("ori_shape") - + if shape_a is not None: if len(shape_a) < 2: shape_a = input_x1.get("shape") - + 
if shape_b is not None: if len(shape_b) < 2: shape_b = input_x2.get("shape") - + shape_a = list(shape_a) shape_b = list(shape_b) - + if input_x1.get("format") == "FRACTAL_NZ": shape_a = _get_input_shape(shape_a) shape_b = _get_input_shape(shape_b) @@ -290,21 +297,21 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra util.check_shape_rule(shape_b) util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT) util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT) - + if input_x1.get("format") == "FRACTAL_NZ": shape_a = [shape_a[1], shape_a[0]] - trans_a = bool(1-trans_a) - + trans_a = bool(1 - trans_a) + if input_x2.get("format") == "FRACTAL_NZ": shape_b = [shape_b[1], shape_b[0]] - trans_b = bool(1-trans_b) - + trans_b = bool(1 - trans_b) + shape_bias = () if bias is not None and bool(bias): shape_bias = bias.get("shape") shape_bias = list(shape_bias) shape_bias = _get_bias(shape_bias) - + src_dtype = input_x1.get("dtype").lower() dst_dtype = output_y.get("dtype").lower() if src_dtype == "float32" or src_dtype == "int32": @@ -338,12 +345,12 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra shape_a_temp = (m_shape // block_reduce, km_shape // block_in, block_reduce, block_in) else: shape_a_temp = (m_shape // block_in, km_shape // block_reduce, block_in, block_reduce) - + if trans_b: shape_b_temp = (kn_shape // block_out, n_shape // block_reduce, block_reduce, block_out) else: shape_b_temp = (kn_shape // block_reduce, n_shape // block_out, block_out, block_reduce) - + if input_x1.get("format") == "FORMAT_FRACTAL_Z": shape_a_temp = (shape_a_temp[0], shape_a_temp[1], shape_a_temp[2], shape_a_temp[3]) format_a = "fractal" @@ -353,7 +360,7 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra else: shape_a_temp = (shape_a[len(shape_a) - 2], shape_a[len(shape_a) - 1]) format_a = "ND" - + if input_x2.get("format") == "FORMAT_FRACTAL_Z": shape_b_temp = (shape_b_temp[0], shape_b_temp[1], 
shape_b_temp[2], shape_b_temp[3]) format_b = "fractal" @@ -363,28 +370,28 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra else: shape_b_temp = (shape_b[len(shape_b) - 2], shape_b[len(shape_b) - 1]) format_b = "ND" - + tensor_bias = None tensor_a = tvm.placeholder(shape_a_temp, name='tensor_a', dtype=src_dtype) tensor_b = tvm.placeholder(shape_b_temp, name='tensor_b', dtype=src_dtype) - + if len(shape_bias) > 0: tensor_bias = tvm.placeholder(shape_bias, name='tensor_bias', dtype=dst_dtype) result = te.lang.cce.matmul(tensor_a, tensor_b, trans_a, trans_b, format_a=format_a, format_b=format_b, dst_dtype=dst_dtype, tensor_bias=tensor_bias) - + with tvm.target.cce(): schedule = generic.auto_schedule(result) - + tensor_list = [tensor_a, tensor_b, result] if len(shape_bias) > 0: tensor_list = [tensor_a, tensor_b, tensor_bias, result] - + config = {"print_ir": False, "name": kernel_name, "tensor_list": tensor_list} - + te.lang.cce.cce_build_code(schedule, config) diff --git a/mindspore/ops/_op_impl/_custom_op/matrix_combine_impl.py b/mindspore/ops/_op_impl/_custom_op/matrix_combine_impl.py index 14c4b590ce..b001e0ba33 100644 --- a/mindspore/ops/_op_impl/_custom_op/matrix_combine_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/matrix_combine_impl.py @@ -13,24 +13,25 @@ # limitations under the License. 
# ============================================================================ """CusMatrixCombine""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType from te import tik from topi.cce import util -from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType cus_matrix_combine_op_info = TBERegOp("CusMatrixCombine") \ - .fusion_type("OPAQUE") \ - .async_flag(False) \ - .binfile_name("matrixcombine.so") \ - .compute_cost(10) \ - .kernel_name("CusMatrixCombine") \ - .partial_flag(True) \ - .input(0, "x1", False, "required", "all") \ - .output(0, "y", False, "required", "all") \ - .dtype_format(DataType.F32_Default, DataType.F32_Default) \ - .get_op_info() + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("matrixcombine.so") \ + .compute_cost(10) \ + .kernel_name("CusMatrixCombine") \ + .partial_flag(True) \ + .input(0, "x1", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F32_Default, DataType.F32_Default) \ + .get_op_info() + @op_info_register(cus_matrix_combine_op_info) -def CusMatrixCombine(input_x, output,kernel_name="matrix_combine"): +def CusMatrixCombine(input_x, output, kernel_name="matrix_combine"): input_x_shape = input_x.get("shape") output_shape = output.get("shape") split_dim = 128 @@ -45,18 +46,20 @@ def CusMatrixCombine(input_x, output,kernel_name="matrix_combine"): blocks = 32 matrix_dim = input_x_shape[0] * input_x_shape[1] - if input_x_shape[0] == 1 and input_x_shape[1] == 64 : + if input_x_shape[0] == 1 and input_x_shape[1] == 64: tiling_dim = 2 bs = 1 - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (tiling_dim, matrix_dim), name="input_x_ub", scope=tik.scope_ubuf) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (tiling_dim, matrix_dim), name="input_x_ub", + scope=tik.scope_ubuf) 
tik_instance.data_move(input_x_ub, input_x[0, block_index * tiling_dim, 0], 0, 1, 16, 0, 0) tik_instance.data_move(res[block_index * tiling_dim, 0], input_x_ub, 0, 1, 16, 0, 0) else: tiling_dim = 4 bs = input_x_shape[0] - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (tiling_dim, matrix_dim), name="input_x_ub", scope=tik.scope_ubuf) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (tiling_dim, matrix_dim), name="input_x_ub", + scope=tik.scope_ubuf) zero = tik_instance.Scalar("float32") zero.set_as(0.0) with tik_instance.for_range(0, bs) as i: @@ -69,7 +72,9 @@ def CusMatrixCombine(input_x, output,kernel_name="matrix_combine"): tik_instance.vector_dup(64, input_x_ub, zero, repeat_1, 1, 8) tik_instance.vector_dup(64, input_x_ub[255 * 64], zero, repeat_2, 1, 8) with tik_instance.for_range(0, tiling_dim) as j: - tik_instance.data_move(input_x_ub[j, split_dim * i], input_x[i, block_index * tiling_dim + j, 0], 0, 1, 16, 0, 0) - tik_instance.data_move(res[i * split_dim + block_index * tiling_dim, 0], input_x_ub, 0, 1, tiling_dim * matrix_dim *4 // 32, 0, 0) + tik_instance.data_move(input_x_ub[j, split_dim * i], input_x[i, block_index * tiling_dim + j, 0], 0, + 1, 16, 0, 0) + tik_instance.data_move(res[i * split_dim + block_index * tiling_dim, 0], input_x_ub, 0, 1, + tiling_dim * matrix_dim * 4 // 32, 0, 0) tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x], outputs=[res]) return tik_instance diff --git a/mindspore/ops/_op_impl/_custom_op/transpose02314_impl.py b/mindspore/ops/_op_impl/_custom_op/transpose02314_impl.py index 76fe9625a4..f341efe4b7 100644 --- a/mindspore/ops/_op_impl/_custom_op/transpose02314_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/transpose02314_impl.py @@ -13,40 +13,41 @@ # limitations under the License. 
# ============================================================================ """CusTranspose02314""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType from te import tik from topi.cce import util -from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType cus_transpose02314_op_info = TBERegOp("CusTranspose02314") \ - .fusion_type("OPAQUE") \ - .async_flag(False) \ - .binfile_name("transpose02314.so") \ - .compute_cost(10) \ - .kernel_name("CusTranspose02314") \ - .partial_flag(True) \ - .input(0, "x1", False, "required", "all") \ - .output(0, "y", False, "required", "all") \ - .dtype_format(DataType.F16_5HD, DataType.F16_Default) \ - .get_op_info() + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("transpose02314.so") \ + .compute_cost(10) \ + .kernel_name("CusTranspose02314") \ + .partial_flag(True) \ + .input(0, "x1", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_5HD, DataType.F16_Default) \ + .get_op_info() + @op_info_register(cus_transpose02314_op_info) def CusTranspose02314(input_x, output, kernel_name="transpose021354"): input_x_shape = input_x.get("shape") output_shape = output.get("shape") - perm = (0,2,3,1,4) + perm = (0, 2, 3, 1, 4) input_x_shape = tuple(input_x_shape) - support_shape = [ (32,128,7,7,16), - (32,32,7,7,16), - (32,32,14,14,16), - (32,64,14,14,16), - (32,16,14,14,16), - (32,16,28,28,16), - (32,32,28,28,16), - (32,8,28,28,16), - (32,8,56,56,16), - (32,16,56,56,16), - (32,4,56,56,16), - (32,4,112,112,16)] + support_shape = [(32, 128, 7, 7, 16), + (32, 32, 7, 7, 16), + (32, 32, 14, 14, 16), + (32, 64, 14, 14, 16), + (32, 16, 14, 14, 16), + (32, 16, 28, 28, 16), + (32, 32, 28, 28, 16), + (32, 8, 28, 28, 16), + (32, 8, 56, 56, 16), + (32, 16, 56, 56, 16), + (32, 4, 56, 56, 16), + (32, 4, 112, 112, 16)] if input_x_shape not in support_shape: raise RuntimeError("input_shape %s is not supported" % str(input_x_shape)) @@ 
-59,125 +60,172 @@ def CusTranspose02314(input_x, output, kernel_name="transpose021354"): res = tik_instance.Tensor("float16", output_shape, name="res", scope=tik.scope_gm) dtype = "float16" - if tuple(input_x_shape) == (32,4,112,112,16): + if tuple(input_x_shape) == (32, 4, 112, 112, 16): with tik_instance.for_range(0, 32, block_num=32) as block_idx: with tik_instance.for_range(0, 14) as cc1_db: with tik_instance.for_range(0, 2, thread_num=2) as db_idx: - input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", scope=tik.scope_ubuf) - T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", scope=tik.scope_ubuf) + input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", + scope=tik.scope_ubuf) + T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", + scope=tik.scope_ubuf) zero = tik_instance.Scalar(dtype="float16", init_value=0) - tik_instance.data_move(input_1_local_UB, input_x[block_idx * 802816 + cc1_db * 14336 + 7168 * db_idx], 0, 4, 448, 12096, 0) + tik_instance.data_move(input_1_local_UB, + input_x[block_idx * 802816 + cc1_db * 14336 + 7168 * db_idx], 0, 4, 448, + 12096, 0) with tik_instance.for_range(0, 448) as cc7: with tik_instance.for_range(0, 4) as cc8: - tik_instance.vadds(16, T_transpose_local_UB[cc7 * 64 + cc8 * 16], input_1_local_UB[7168 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) - tik_instance.data_move(res[block_idx * 802816 + cc1_db * 57344 + 28672 * db_idx], T_transpose_local_UB, 0, 1, 1792, 0, 0) - elif tuple(input_x_shape) == (32,4,56,56,16): + tik_instance.vadds(16, T_transpose_local_UB[cc7 * 64 + cc8 * 16], + input_1_local_UB[7168 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) + tik_instance.data_move(res[block_idx * 802816 + cc1_db * 57344 + 28672 * db_idx], + T_transpose_local_UB, 0, 1, 1792, 0, 0) + elif tuple(input_x_shape) == (32, 4, 56, 56, 16): with tik_instance.for_range(0, 32, block_num=32) as block_idx: zero = 
tik_instance.Scalar(dtype="float16", init_value=0) with tik_instance.for_range(0, 3) as cc1_db: with tik_instance.for_range(0, 2, thread_num=2) as db_idx: - input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", scope=tik.scope_ubuf) - T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_1_local_UB, input_x[block_idx * 200704 + cc1_db * 14336 + 7168 * db_idx], 0, 4, 448, 2688, 0) + input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", + scope=tik.scope_ubuf) + T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_1_local_UB, + input_x[block_idx * 200704 + cc1_db * 14336 + 7168 * db_idx], 0, 4, 448, + 2688, 0) with tik_instance.for_range(0, 448) as cc7: with tik_instance.for_range(0, 4) as cc8: - tik_instance.vadds(16, T_transpose_local_UB[cc7 * 64 + cc8 * 16], input_1_local_UB[7168 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) - tik_instance.data_move(res[block_idx * 200704 + cc1_db * 57344 + 28672 * db_idx], T_transpose_local_UB, 0, 1, 1792, 0, 0) + tik_instance.vadds(16, T_transpose_local_UB[cc7 * 64 + cc8 * 16], + input_1_local_UB[7168 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) + tik_instance.data_move(res[block_idx * 200704 + cc1_db * 57344 + 28672 * db_idx], + T_transpose_local_UB, 0, 1, 1792, 0, 0) input_1_local_UB2 = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB2", scope=tik.scope_ubuf) - T_transpose_local_UB2 = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB2", scope=tik.scope_ubuf) + T_transpose_local_UB2 = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB2", + scope=tik.scope_ubuf) tik_instance.data_move(input_1_local_UB2, input_x[block_idx * 200704 + 43008], 0, 4, 448, 2688, 0) with tik_instance.for_range(0, 448) as cc72: with tik_instance.for_range(0, 4) as cc82: - tik_instance.vadds(16, 
T_transpose_local_UB2[cc72 * 64 + cc82 * 16], input_1_local_UB2[7168 * cc82 + cc72 * 16], zero, 1, 1, 1, 0, 0) + tik_instance.vadds(16, T_transpose_local_UB2[cc72 * 64 + cc82 * 16], + input_1_local_UB2[7168 * cc82 + cc72 * 16], zero, 1, 1, 1, 0, 0) tik_instance.data_move(res[block_idx * 200704 + 172032], T_transpose_local_UB2, 0, 1, 1792, 0, 0) - elif tuple(input_x_shape) == (32,16,56,56,16): + elif tuple(input_x_shape) == (32, 16, 56, 56, 16): with tik_instance.for_range(0, 32, block_num=32) as block_idx: zero = tik_instance.Scalar(dtype="float16", init_value=0) with tik_instance.for_range(0, 14) as cc1_db: with tik_instance.for_range(0, 2, thread_num=2) as db_idx: - input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", scope=tik.scope_ubuf) - T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_1_local_UB, input_x[block_idx * 802816 + cc1_db * 3584 + 1792 * db_idx], 0, 16, 112, 3024, 0) + input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", + scope=tik.scope_ubuf) + T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_1_local_UB, + input_x[block_idx * 802816 + cc1_db * 3584 + 1792 * db_idx], 0, 16, 112, + 3024, 0) with tik_instance.for_range(0, 112) as cc7: with tik_instance.for_range(0, 16) as cc8: - tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16], input_1_local_UB[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) - tik_instance.data_move(res[block_idx * 802816 + cc1_db * 57344 + 28672 * db_idx], T_transpose_local_UB, 0, 1, 1792, 0, 0) - elif tuple(input_x_shape) == (32,8,56,56,16): + tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16], + input_1_local_UB[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) + tik_instance.data_move(res[block_idx * 802816 + cc1_db * 57344 + 28672 * db_idx], + T_transpose_local_UB, 0, 
1, 1792, 0, 0) + elif tuple(input_x_shape) == (32, 8, 56, 56, 16): with tik_instance.for_range(0, 32, block_num=32) as block_idx: zero = tik_instance.Scalar(dtype="float16", init_value=0) with tik_instance.for_range(0, 7) as cc1_db: with tik_instance.for_range(0, 2, thread_num=2) as db_idx: - input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", scope=tik.scope_ubuf) - T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_1_local_UB, input_x[block_idx * 401408 + cc1_db * 7168 + 3584 * db_idx], 0, 8, 224, 2912, 0) + input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", + scope=tik.scope_ubuf) + T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_1_local_UB, + input_x[block_idx * 401408 + cc1_db * 7168 + 3584 * db_idx], 0, 8, 224, 2912, + 0) with tik_instance.for_range(0, 224) as cc7: with tik_instance.for_range(0, 16) as cc8: - tik_instance.vadds(16, T_transpose_local_UB[cc7 * 128 + cc8 * 16], input_1_local_UB[3584 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) - tik_instance.data_move(res[block_idx * 401408 + cc1_db * 57344 + 28672 * db_idx], T_transpose_local_UB, 0, 1, 1792, 0, 0) - elif tuple(input_x_shape) == (32,8,28,28,16): + tik_instance.vadds(16, T_transpose_local_UB[cc7 * 128 + cc8 * 16], + input_1_local_UB[3584 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) + tik_instance.data_move(res[block_idx * 401408 + cc1_db * 57344 + 28672 * db_idx], + T_transpose_local_UB, 0, 1, 1792, 0, 0) + elif tuple(input_x_shape) == (32, 8, 28, 28, 16): with tik_instance.for_range(0, 32, block_num=32) as block_idx: zero = tik_instance.Scalar(dtype="float16", init_value=0) with tik_instance.for_range(0, 2) as cc1_db: with tik_instance.for_range(0, 2, thread_num=2) as db_idx: - input_1_local_UB = tik_instance.Tensor(dtype, [25088], name="input_1_local_UB", 
scope=tik.scope_ubuf) - T_transpose_local_UB = tik_instance.Tensor(dtype, [25088], name="T_transpose_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_1_local_UB, input_x[block_idx * 100352 + cc1_db * 6272 + 3136 * db_idx], 0, 8, 196, 588, 0) + input_1_local_UB = tik_instance.Tensor(dtype, [25088], name="input_1_local_UB", + scope=tik.scope_ubuf) + T_transpose_local_UB = tik_instance.Tensor(dtype, [25088], name="T_transpose_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_1_local_UB, + input_x[block_idx * 100352 + cc1_db * 6272 + 3136 * db_idx], 0, 8, 196, 588, + 0) with tik_instance.for_range(0, 196) as cc7: with tik_instance.for_range(0, 8) as cc8: - tik_instance.vadds(16, T_transpose_local_UB[cc7 * 128 + cc8 * 16], input_1_local_UB[3136 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) - tik_instance.data_move(res[block_idx * 100352 + cc1_db * 50176 + 25088 * db_idx], T_transpose_local_UB, 0, 1, 1568, 0, 0) - elif tuple(input_x_shape) == (32,32,28,28,16): + tik_instance.vadds(16, T_transpose_local_UB[cc7 * 128 + cc8 * 16], + input_1_local_UB[3136 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) + tik_instance.data_move(res[block_idx * 100352 + cc1_db * 50176 + 25088 * db_idx], + T_transpose_local_UB, 0, 1, 1568, 0, 0) + elif tuple(input_x_shape) == (32, 32, 28, 28, 16): with tik_instance.for_range(0, 32, block_num=32) as block_idx: zero = tik_instance.Scalar(dtype="float16", init_value=0) with tik_instance.for_range(0, 7) as cc1_db: with tik_instance.for_range(0, 2, thread_num=2) as db_idx: - input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", scope=tik.scope_ubuf) - T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_1_local_UB, input_x[block_idx * 401408 + cc1_db * 1792 + 896 * db_idx], 0, 32, 56, 728, 0) + input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", + scope=tik.scope_ubuf) + T_transpose_local_UB = 
tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_1_local_UB, input_x[block_idx * 401408 + cc1_db * 1792 + 896 * db_idx], + 0, 32, 56, 728, 0) with tik_instance.for_range(0, 56) as cc7: with tik_instance.for_range(0, 32) as cc8: - tik_instance.vadds(16, T_transpose_local_UB[cc7 * 512 + cc8 * 16], input_1_local_UB[896 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) - tik_instance.data_move(res[block_idx * 401408 + cc1_db * 57344 + 28672 * db_idx], T_transpose_local_UB, 0, 1, 1792, 0, 0) - elif tuple(input_x_shape) == (32,16,28,28,16): + tik_instance.vadds(16, T_transpose_local_UB[cc7 * 512 + cc8 * 16], + input_1_local_UB[896 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) + tik_instance.data_move(res[block_idx * 401408 + cc1_db * 57344 + 28672 * db_idx], + T_transpose_local_UB, 0, 1, 1792, 0, 0) + elif tuple(input_x_shape) == (32, 16, 28, 28, 16): with tik_instance.for_range(0, 32, block_num=32) as block_idx: zero = tik_instance.Scalar(dtype="float16", init_value=0) with tik_instance.for_range(0, 3) as cc1_db: with tik_instance.for_range(0, 2, thread_num=2) as db_idx: - input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", scope=tik.scope_ubuf) - T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_1_local_UB, input_x[block_idx * 200704 + cc1_db * 3584 + 1792 * db_idx], 0, 16, 112, 672, 0) + input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", + scope=tik.scope_ubuf) + T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_1_local_UB, + input_x[block_idx * 200704 + cc1_db * 3584 + 1792 * db_idx], 0, 16, 112, 672, + 0) with tik_instance.for_range(0, 112) as cc7: with tik_instance.for_range(0, 16) as cc8: - tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16], 
input_1_local_UB[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) - tik_instance.data_move(res[block_idx * 200704 + cc1_db * 57344 + 28672 * db_idx], T_transpose_local_UB, 0, 1, 1792, 0, 0) + tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16], + input_1_local_UB[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) + tik_instance.data_move(res[block_idx * 200704 + cc1_db * 57344 + 28672 * db_idx], + T_transpose_local_UB, 0, 1, 1792, 0, 0) input_1_local_UB2 = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB2", scope=tik.scope_ubuf) - T_transpose_local_UB2 = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB2", scope=tik.scope_ubuf) + T_transpose_local_UB2 = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB2", + scope=tik.scope_ubuf) tik_instance.data_move(input_1_local_UB2, input_x[block_idx * 200704 + 10752], 0, 16, 112, 672, 0) with tik_instance.for_range(0, 112) as cc7: with tik_instance.for_range(0, 16) as cc8: - tik_instance.vadds(16, T_transpose_local_UB2[cc7 * 256 + cc8 * 16], input_1_local_UB2[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) + tik_instance.vadds(16, T_transpose_local_UB2[cc7 * 256 + cc8 * 16], + input_1_local_UB2[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) tik_instance.data_move(res[block_idx * 200704 + 172032], T_transpose_local_UB2, 0, 1, 1792, 0, 0) - elif tuple(input_x_shape) == (32,16,14,14,16): + elif tuple(input_x_shape) == (32, 16, 14, 14, 16): with tik_instance.for_range(0, 32, block_num=32) as block_idx: zero = tik_instance.Scalar(dtype="float16", init_value=0) with tik_instance.for_range(0, 2, thread_num=2) as db_idx: input_1_local_UB = tik_instance.Tensor(dtype, [25088], name="input_1_local_UB", scope=tik.scope_ubuf) - T_transpose_local_UB = tik_instance.Tensor(dtype, [25088], name="T_transpose_local_UB", scope=tik.scope_ubuf) + T_transpose_local_UB = tik_instance.Tensor(dtype, [25088], name="T_transpose_local_UB", + scope=tik.scope_ubuf) tik_instance.data_move(input_1_local_UB, input_x[block_idx 
* 50176 + 1568 * db_idx], 0, 16, 98, 98, 0) with tik_instance.for_range(0, 98) as cc7: with tik_instance.for_range(0, 16) as cc8: - tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16], input_1_local_UB[1568 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) + tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16], + input_1_local_UB[1568 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) tik_instance.data_move(res[block_idx * 50176 + 25088 * db_idx], T_transpose_local_UB, 0, 1, 1568, 0, 0) - elif tuple(input_x_shape) == (32, 128, 7, 7, 16) and tuple(perm) == (0,2,3,1,4) and dtype == "float16": + elif tuple(input_x_shape) == (32, 128, 7, 7, 16) and tuple(perm) == (0, 2, 3, 1, 4) and dtype == "float16": with tik_instance.for_range(0, 32, block_num=32) as block_idx: with tik_instance.for_range(0, 7, thread_num=2) as cc1: input_x_ub = tik_instance.Tensor(dtype, [1, 128, 1, 7, 16], name="input_1_local_UB", scope=tik.scope_ubuf) transpose_ub = tik_instance.Tensor(dtype, [1, 1, 7, 128, 16], name="transpose_local_UB", - scope=tik.scope_ubuf) + scope=tik.scope_ubuf) tik_instance.data_move(input_x_ub, input_x[block_idx, 0, cc1, 0, 0], 0, 128, 7, 42, 0) with tik_instance.for_range(0, 7) as cc7: with tik_instance.for_range(0, 128) as cc8: @@ -193,7 +241,7 @@ def CusTranspose02314(input_x, output, kernel_name="transpose021354"): scope=tik.scope_ubuf) tik_instance.data_move(input_x_ub, input_x[block_idx, 0, 0, 0, 0], 0, 1, 1568, 0, 0) with tik_instance.for_range(0, 7) as cc1: - with tik_instance.for_range(0, 7) as cc2: + with tik_instance.for_range(0, 7) as cc2: with tik_instance.for_range(0, 32) as cc3: tik_instance.vadds(16, transpose_ub[0, cc1, cc2, cc3, 0], input_x_ub[0, cc3, cc1, cc2, 0], 0, 1, 1, 1, 0, 0) @@ -212,11 +260,12 @@ def CusTranspose02314(input_x, output, kernel_name="transpose021354"): tik_instance.vadds(16, transpose_ub[0, cc2, cc3, cc4, 0], input_x_ub[0, cc4, cc2, cc3, 0], 0, 1, 1, 1, 0, 0) tik_instance.data_move(res[block_idx * 100352 + split_index * 2 * 
7168], transpose_ub, 0, 1, 896, 0, 0) + with tik_instance.for_range(0, 32, block_num=32) as block_idx: with tik_instance.for_range(0, 6, thread_num=2) as cc1: - _inner_+compute(cc1) + _inner_compute(cc1) _inner_compute(6) - elif tuple(input_x_shape) == (32,64,14,14,16) and tuple(perm) == (0,2,3,1,4) and dtype == "float16": + elif tuple(input_x_shape) == (32, 64, 14, 14, 16) and tuple(perm) == (0, 2, 3, 1, 4) and dtype == "float16": def _inner_compute(split_index, block_idx): input_x_ub = tik_instance.Tensor(dtype, [1, 64, 2, 14, 16], name="input_1_local_UB", scope=tik.scope_ubuf) @@ -229,6 +278,7 @@ def CusTranspose02314(input_x, output, kernel_name="transpose021354"): tik_instance.vadds(16, transpose_ub[0, cc2, cc3, cc4, 0], input_x_ub[0, cc4, cc2, cc3, 0], 0, 1, 1, 1, 0, 0) tik_instance.data_move(res[block_idx * 200704 + split_index * 2 * 14336], transpose_ub, 0, 1, 1792, 0, 0) + with tik_instance.for_range(0, 32, block_num=32) as block_idx: with tik_instance.for_range(0, 6, thread_num=2) as cc1: _inner_compute(cc1, block_idx)