Merge pull request !3961 from lixian/mastertags/v0.7.0-beta
| @@ -0,0 +1,161 @@ | |||
#ifdef __arm__
#ifndef __aarch64__
    .text
    .align 5
    .global ConvDwFp32Center
#ifndef __APPLE__
    .type ConvDwFp32Center, %function
#endif

// void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height,
//                       size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,
//                       size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step,
//                       size_t relu, size_t relu6);
// r0: dst, r1: src, r2: weight, r3: bias
// stack args (offsets valid after the sp rewind below):
//   #48: height, #52: width, #56: kernel_h, #60: kernel_w, #64: out_h_step, #68: block_channel,
//   #72: in_sh_step, #76: in_sw_step, #80: in_kh_step, #84: in_kw_step, #88: relu, #92: relu6
ConvDwFp32Center:
    // r4-r8/r10-r11 and q4-q7 are callee-saved per AAPCS32; r0-r3 are pushed too so the
    // current dst/src/weight pointers can be reloaded from [sp], [sp, #4], [sp, #8].
    push {r0-r8, r10, r11, lr}
    vpush {q4-q7}                // FIX: "vpush {v4-v7}" is not valid ARM32 syntax
    add sp, sp, #112             // rewind sp (48 + 64 bytes pushed) so stack args keep their offsets
    ldr r4, [sp, #48]            // height
    vld1.32 {q13}, [r3]          // bias (4 channels)
    vmov.i32 q14, #6
    vcvt.f32.s32 q14, q14        // 6.0f for relu6 clamping
    veor q15, q15, q15           // 0.0f for relu clamping
LoopH:
    ldr r1, [sp, #4]             // src_w
    ldr r5, [sp, #52]            // width
    ldr r0, [sp]                 // dst_w
    cmp r5, #4
    blt LoopW                    // fewer than 4 output columns: scalar path
LoopW4:
    ldr r11, [sp, #76]           // in_sw_step (FIX: was invalid "mov r11, [sp, #76]")
    mov r8, r1                   // src_kh
    ldr r2, [sp, #8]             // weight_kh
    ldr r6, [sp, #56]            // kernel_h
    vmov q0, q13                 // FIX: all four accumulators must start from the bias;
    vmov q1, q13                 //      previously only q0 was initialized and q1-q3
    vmov q2, q13                 //      accumulated whatever was left in the registers
    vmov q3, q13
LoopKh4:
    ldr r12, [sp, #84]           // in_kw_step (FIX: was [sp, #80] = in_kh_step)
    ldr r7, [sp, #60]            // kernel_w
    mov lr, r8                   // src_kw (kernel-column base)
LoopKw4:
    mov r10, lr
    vld1.32 {q12}, [r2]!         // one kernel tap, shared by the 4 outputs
    vld1.32 {q4}, [r10]
    add r10, r10, r11
    vmla.f32 q0, q4, q12
    vld1.32 {q5}, [r10]
    add r10, r10, r11
    vmla.f32 q1, q5, q12
    vld1.32 {q6}, [r10]
    add r10, r10, r11
    vmla.f32 q2, q6, q12
    vld1.32 {q7}, [r10]
    add r10, r10, r11
    vmla.f32 q3, q7, q12
    subs r7, r7, #1
    add lr, lr, r12              // advance by in_kw_step to the next kernel column
    bne LoopKw4
    ldr r12, [sp, #80]           // in_kh_step
    add r8, r8, r12
    subs r6, r6, #1
    bne LoopKh4
    ldr r12, [sp, #92]           // relu6 flag
    cmp r12, #0
    bne Relu64
    ldr r12, [sp, #88]           // relu flag
    cmp r12, #0
    bne Relu4
    b Write4
Relu64:
    vmin.f32 q0, q0, q14
    vmin.f32 q1, q1, q14
    vmin.f32 q2, q2, q14
    vmin.f32 q3, q3, q14
Relu4:
    vmax.f32 q0, q0, q15
    vmax.f32 q1, q1, q15
    vmax.f32 q2, q2, q15
    vmax.f32 q3, q3, q15
Write4:
    ldr r12, [sp, #68]           // block_channel
    vst1.32 {q0}, [r0]
    add r0, r0, r12
    vst1.32 {q1}, [r0]
    add r0, r0, r12
    vst1.32 {q2}, [r0]
    add r0, r0, r12
    vst1.32 {q3}, [r0]
    add r0, r0, r12
    mov r12, #4
    mul r11, r11, r12            // 4 * in_sw_step
    add r1, r1, r11
    subs r5, r5, #4              // FIX: was "sub" followed by invalid "cmp r5, r5, #0"
    ble LoopWEnd
    cmp r5, #4
    bge LoopW4                   // FIX: was "bge LoopW", which defeated the 4x path
LoopW:
    mov r8, r1                   // src_kh
    ldr r2, [sp, #8]             // weight_kh
    ldr r6, [sp, #56]            // kernel_h
    vmov q0, q13                 // accumulator = bias
LoopKh:
    ldr r12, [sp, #84]           // in_kw_step
    ldr r7, [sp, #60]            // kernel_w
    mov r10, r8                  // src_kw
LoopKw:
    vld1.32 {q1}, [r10]
    add r10, r10, r12
    vld1.32 {q12}, [r2]!
    vmla.f32 q0, q1, q12
    subs r7, r7, #1
    bne LoopKw
    ldr r12, [sp, #80]           // in_kh_step
    add r8, r8, r12
    subs r6, r6, #1
    bne LoopKh
    ldr r12, [sp, #92]
    cmp r12, #0
    bne Relu6
    ldr r12, [sp, #88]
    cmp r12, #0
    bne Relu
    b Write
Relu6:
    vmin.f32 q0, q0, q14
Relu:
    vmax.f32 q0, q0, q15
Write:
    ldr r12, [sp, #68]           // block_channel
    vst1.32 {q0}, [r0]
    add r0, r0, r12
    ldr r12, [sp, #76]           // in_sw_step
    add r1, r1, r12
    subs r5, r5, #1
    bne LoopW
LoopWEnd:
    // FIX: LoopWEnd must precede the per-row pointer update; it previously sat after
    // "bne LoopH", so a width divisible by 4 exited after processing a single row.
    ldr r3, [sp, #64]            // out_h_step
    ldr r12, [sp]
    add r12, r12, r3
    str r12, [sp]                // saved dst += out_h_step
    ldr r3, [sp, #72]            // in_sh_step
    ldr r12, [sp, #4]
    add r12, r12, r3
    str r12, [sp, #4]            // saved src += in_sh_step
    subs r4, r4, #1
    bne LoopH
    sub sp, sp, #112
    vpop {q4-q7}                 // FIX: matches the corrected vpush
    pop {r0-r8, r10, r11, pc}
#endif
#endif
| @@ -0,0 +1,207 @@ | |||
#ifdef __arm__
#ifndef __aarch64__
    .text
    .align 5
    .global ConvDwInt8Center
#ifndef __APPLE__
    .type ConvDwInt8Center, %function
#endif

// void ConvDwInt8Center(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, size_t height,
//                       size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,
//                       size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step,
//                       int out_multiplier, int left_shift, int right_shift, int32_t out_zp,
//                       int32_t acc_min, int32_t acc_max);
// r0: dst, r1: src, r2: weight, r3: bias
// stack args (offsets valid after the sp rewind below):
//   #48: height, #52: width, #56: kernel_h, #60: kernel_w, #64: out_h_step, #68: block_channel,
//   #72: in_sh_step, #76: in_sw_step, #80: in_kh_step, #84: in_kw_step,
//   #88: out_multiplier, #92: left_shift, #96: right_shift, #100: out_zp, #104: acc_min, #108: acc_max
//
// Register plan: q0-q3 accumulators; d8/d10/d12/d14 (q4-q7, callee-saved) source loads;
// d16 (q8) weight tap; q9..q14 quantization constants; q15 bias.
ConvDwInt8Center:
    // r4-r8/r10-r11 and q4-q7 are callee-saved per AAPCS32; r0-r3 are pushed too so the
    // current dst/src/weight pointers can be reloaded from [sp], [sp, #4], [sp, #8].
    push {r0-r8, r10, r11, lr}
    vpush {q4-q7}
    add sp, sp, #112             // rewind sp (48 + 64 bytes pushed) so stack args keep their offsets
    ldr r4, [sp, #48]            // height
    ldr r12, [sp, #92]
    vdup.32 q9, r12              // left_shift
    ldr r11, [sp, #88]
    vdup.32 q10, r11             // out_multiplier
    ldr r10, [sp, #96]
    vdup.32 q11, r10             // right_shift
    ldr r8, [sp, #100]
    vdup.32 q12, r8              // out_zp
    ldr r7, [sp, #104]
    vdup.32 q13, r7              // acc_min
    ldr r6, [sp, #108]
    vdup.32 q14, r6              // acc_max
    vld1.32 {q15}, [r3]          // bias (4 channels)
LoopH:
    ldr r1, [sp, #4]             // src_w
    ldr r5, [sp, #52]            // width
    ldr r0, [sp]                 // dst_w
    cmp r5, #4                   // FIX: the 4x path previously ran unconditionally,
    blt LoopW                    //      reading/writing out of bounds when width < 4
LoopW4:
    ldr r11, [sp, #76]           // in_sw_step (FIX: was invalid "mov r11, [sp, #76]")
    mov r8, r1                   // src_kh
    ldr r2, [sp, #8]             // weight_kh
    ldr r6, [sp, #56]            // kernel_h
    vmov q0, q15                 // FIX: all four accumulators must start from the bias;
    vmov q1, q15                 //      q1-q3 were previously uninitialized
    vmov q2, q15
    vmov q3, q15
LoopKh4:
    ldr r12, [sp, #84]           // in_kw_step (the kw advance below needs it)
    ldr r7, [sp, #60]            // kernel_w
    mov lr, r8                   // src_kw (kernel-column base)
LoopKw4:
    mov r10, lr
    // FIX: weight tap now goes to d16; the original used d24, which is the low half of
    // q12 and silently corrupted the out_zp broadcast consumed by the vadd below.
    vld1.16 {d16}, [r2]!
    vld1.16 {d8}, [r10]
    add r10, r10, r11
    vmlal.s16 q0, d8, d16
    vld1.16 {d10}, [r10]
    add r10, r10, r11
    vmlal.s16 q1, d10, d16
    vld1.16 {d12}, [r10]
    add r10, r10, r11
    vmlal.s16 q2, d12, d16
    vld1.16 {d14}, [r10]
    add r10, r10, r11
    vmlal.s16 q3, d14, d16
    subs r7, r7, #1
    add lr, lr, r12              // FIX: advance the kernel-column base by in_kw_step;
    bne LoopKw4                  //      the original never moved it
    ldr r12, [sp, #80]           // in_kh_step
    add r8, r8, r12
    subs r6, r6, #1
    bne LoopKh4
    // Requantize: saturating left shift, fixed-point multiply, rounding right shift,
    // add zero point, clamp to [acc_min, acc_max], narrow 32 -> 16 -> 8 bits.
    vshl.s32 q0, q0, q9
    vshl.s32 q1, q1, q9
    vshl.s32 q2, q2, q9
    vshl.s32 q3, q3, q9
    vqrdmulh.s32 q0, q0, q10
    vqrdmulh.s32 q1, q1, q10
    vqrdmulh.s32 q2, q2, q10
    vqrdmulh.s32 q3, q3, q10
    vrshl.s32 q0, q0, q11
    vrshl.s32 q1, q1, q11
    vrshl.s32 q2, q2, q11
    vrshl.s32 q3, q3, q11
    vadd.i32 q0, q0, q12
    vadd.i32 q1, q1, q12
    vadd.i32 q2, q2, q12
    vadd.i32 q3, q3, q12
    vmax.s32 q0, q0, q13
    vmax.s32 q1, q1, q13
    vmax.s32 q2, q2, q13
    vmax.s32 q3, q3, q13
    vmin.s32 q0, q0, q14
    vmin.s32 q1, q1, q14
    vmin.s32 q2, q2, q14
    vmin.s32 q3, q3, q14
    vqmovn.s32 d0, q0
    vqmovn.s32 d2, q1
    vqmovn.s32 d4, q2
    vqmovn.s32 d6, q3
    vqmovn.s16 d0, q0
    vqmovn.s16 d2, q1
    vqmovn.s16 d4, q2
    vqmovn.s16 d6, q3
    ldr r12, [sp, #68]           // block_channel
    mov r3, r0
    vst1.8 {d0[0]}, [r3]!
    vst1.8 {d0[1]}, [r3]!
    vst1.8 {d0[2]}, [r3]!
    vst1.8 {d0[3]}, [r3]!
    add r0, r0, r12
    mov r3, r0
    vst1.8 {d2[0]}, [r3]!
    vst1.8 {d2[1]}, [r3]!
    vst1.8 {d2[2]}, [r3]!
    vst1.8 {d2[3]}, [r3]!
    add r0, r0, r12
    mov r3, r0
    vst1.8 {d4[0]}, [r3]!
    vst1.8 {d4[1]}, [r3]!
    vst1.8 {d4[2]}, [r3]!
    vst1.8 {d4[3]}, [r3]!
    add r0, r0, r12
    mov r3, r0
    vst1.8 {d6[0]}, [r3]!
    vst1.8 {d6[1]}, [r3]!
    vst1.8 {d6[2]}, [r3]!
    vst1.8 {d6[3]}, [r3]!
    add r0, r0, r12
    mov r12, #4
    mul r11, r11, r12            // 4 * in_sw_step
    add r1, r1, r11
    subs r5, r5, #4              // FIX: 4 outputs were produced but the counter only
    ble LoopWEnd                 //      dropped by 1, and there was no residual path
    cmp r5, #4
    bge LoopW4
LoopW:
    mov r8, r1                   // src_kh
    ldr r2, [sp, #8]             // weight_kh
    ldr r6, [sp, #56]            // kernel_h
    vmov q0, q15                 // accumulator = bias
LoopKh:
    ldr r12, [sp, #84]           // in_kw_step
    ldr r7, [sp, #60]            // kernel_w
    mov r10, r8                  // src_kw
LoopKw:
    vld1.16 {d2}, [r10]
    add r10, r10, r12
    vld1.16 {d16}, [r2]!         // FIX: d16, not d24 (d24 aliases q12 = out_zp)
    vmlal.s16 q0, d2, d16
    subs r7, r7, #1
    bne LoopKw
    ldr r12, [sp, #80]           // in_kh_step
    add r8, r8, r12
    subs r6, r6, #1
    bne LoopKh
    vshl.s32 q0, q0, q9
    vqrdmulh.s32 q0, q0, q10
    vrshl.s32 q0, q0, q11
    vadd.i32 q0, q0, q12
    vmax.s32 q0, q0, q13
    vmin.s32 q0, q0, q14
    vqmovn.s32 d0, q0
    vqmovn.s16 d0, q0
    mov r3, r0
    ldr r12, [sp, #68]           // block_channel
    vst1.8 {d0[0]}, [r3]!
    vst1.8 {d0[1]}, [r3]!
    vst1.8 {d0[2]}, [r3]!
    vst1.8 {d0[3]}, [r3]!
    add r0, r0, r12
    ldr r12, [sp, #76]           // in_sw_step
    add r1, r1, r12
    subs r5, r5, #1
    bne LoopW
LoopWEnd:
    ldr r3, [sp, #64]            // out_h_step
    ldr r12, [sp]
    add r12, r12, r3
    str r12, [sp]                // saved dst += out_h_step
    ldr r3, [sp, #72]            // in_sh_step
    ldr r12, [sp, #4]
    add r12, r12, r3
    str r12, [sp, #4]            // saved src += in_sh_step
    subs r4, r4, #1
    bne LoopH
    sub sp, sp, #112
    vpop {q4-q7}
    pop {r0-r8, r10, r11, pc}
#endif
#endif
| @@ -0,0 +1,69 @@ | |||
#ifdef __arm__
#ifndef __aarch64__
    .text
    .align 5
    .global DeconvDwFp32Center
#ifndef __APPLE__
    .type DeconvDwFp32Center, %function
#endif

// void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width,
//                         size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,
//                         size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
// r0: dst, r1: src, r2: weight, r3: height
// stack args: #48: width, #52: kernel_h, #56: kernel_w, #60: out_h_step,
//             #64: block_channel, #68: in_sh_step, #72: in_sw_step, #76: in_kh_step, #80: in_kw_step
//
// Depthwise deconvolution: each 4-channel input pixel is multiplied by every kernel
// tap and scattered (read-modify-write) into the destination window.
DeconvDwFp32Center:
    // Callee-saved core registers are spilled; r0-r3 are included so the current
    // dst/src/weight pointers can be re-read from [sp], [sp, #4], [sp, #8] in the loops.
    push {r0-r8, r10, r11, lr}
    ldr r11, [sp, #76]        // in_kh_step
    ldr r10, [sp, #80]        // in_kw_step
LoopH:
    ldr r0, [sp]              // dst_w: output cursor for this row
    ldr r1, [sp, #4]          // src_w: input cursor for this row
    ldr r4, [sp, #48]         // width countdown
LoopW:
    ldr r2, [sp, #8]          // weight_kh: rewind the kernel for every input pixel
    mov r6, r0                // dst_kh
    ldr r5, [sp, #52]         // kernel_h countdown
    vld1.32 {q1}, [r1]        // one input pixel (4 channels), reused for all taps
LoopKh:
    ldr r12, [sp, #56]        // kernel_w countdown
    mov r7, r6                // dst_kw
LoopKw:
    vld1.32 {q2}, [r2]!       // next kernel tap
    vld1.32 {q0}, [r7]        // accumulate into the destination in place
    vmla.f32 q0, q1, q2
    vst1.32 {q0}, [r7]
    add r7, r7, r10
    subs r12, r12, #1
    bne LoopKw
    add r6, r6, r11
    subs r5, r5, #1
    bne LoopKh
    ldr r12, [sp, #72]        // in_sw_step
    add r0, r0, r12
    ldr r8, [sp, #64]         // block_channel
    add r1, r1, r8
    subs r4, r4, #1
    bne LoopW
    ldr r8, [sp, #68]         // in_sh_step
    ldr r12, [sp]
    add r12, r12, r8
    str r12, [sp]             // advance the saved dst row pointer
    ldr r8, [sp, #60]         // out_h_step
    ldr r12, [sp, #4]
    add r12, r12, r8
    str r12, [sp, #4]         // advance the saved src row pointer
    subs r3, r3, #1
    bne LoopH
    pop {r0-r8, r10, r11, pc}
#endif
#endif
| @@ -0,0 +1,69 @@ | |||
#ifdef __arm__
#ifndef __aarch64__
    .text
    .align 5
    .global DeconvDwInt8Center
#ifndef __APPLE__
    .type DeconvDwInt8Center, %function
#endif

// void DeconvDwInt8Center(int32_t *dst, const int16_t *src, const int16_t *weight, size_t height, size_t width,
//                         size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,
//                         size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
// r0: dst, r1: src, r2: weight, r3: height
// stack args: #48: width, #52: kernel_h, #56: kernel_w, #60: out_h_step,
//             #64: block_channel, #68: in_sh_step, #72: in_sw_step, #76: in_kh_step, #80: in_kw_step
//
// Int16 x int16 -> int32 depthwise deconvolution: each 4-channel input pixel is
// widening-multiplied by every kernel tap and accumulated into the int32 destination.
DeconvDwInt8Center:
    // Callee-saved core registers are spilled; r0-r3 are included so the current
    // dst/src/weight pointers can be re-read from [sp], [sp, #4], [sp, #8] in the loops.
    push {r0-r8, r10, r11, lr}
    ldr r11, [sp, #76]        // in_kh_step
    ldr r10, [sp, #80]        // in_kw_step
LoopH:
    ldr r0, [sp]              // dst_w: output cursor for this row
    ldr r1, [sp, #4]          // src_w: input cursor for this row
    ldr r4, [sp, #48]         // width countdown
LoopW:
    ldr r2, [sp, #8]          // weight_kh: rewind the kernel for every input pixel
    mov r6, r0                // dst_kh
    ldr r5, [sp, #52]         // kernel_h countdown
    vld1.16 {d2}, [r1]        // one input pixel (4 x int16), reused for all taps
LoopKh:
    ldr r12, [sp, #56]        // kernel_w countdown
    mov r7, r6                // dst_kw
LoopKw:
    vld1.16 {d24}, [r2]!      // next kernel tap (4 x int16)
    vld1.32 {q0}, [r7]        // accumulate (widening) into the destination in place
    vmlal.s16 q0, d2, d24
    vst1.32 {q0}, [r7]
    add r7, r7, r10
    subs r12, r12, #1
    bne LoopKw
    add r6, r6, r11
    subs r5, r5, #1
    bne LoopKh
    ldr r12, [sp, #72]        // in_sw_step
    add r0, r0, r12
    ldr r8, [sp, #64]         // block_channel
    add r1, r1, r8
    subs r4, r4, #1
    bne LoopW
    ldr r8, [sp, #68]         // in_sh_step
    ldr r12, [sp]
    add r12, r12, r8
    str r12, [sp]             // advance the saved dst row pointer
    ldr r8, [sp, #60]         // out_h_step
    ldr r12, [sp, #4]
    add r12, r12, r8
    str r12, [sp, #4]         // advance the saved src row pointer
    subs r3, r3, #1
    bne LoopH
    pop {r0-r8, r10, r11, pc}
#endif
#endif
| @@ -32,24 +32,238 @@ ConvDwFp32Center: | |||
| ldr x14, [sp, #48] | |||
| ldr x15, [sp, #56] | |||
| ld1 {v5.4s}, [x3] | |||
| ld1 {v24.4s}, [x3] | |||
| movi v26.4s, #6 | |||
| scvtf v26.4s, v26.4s | |||
| dup v27.4s, wzr | |||
| LoopH: | |||
| mov x23, x1 | |||
| mov x24, x5 | |||
| mov x3, x0 | |||
| cmp x24, #8 | |||
| blt LoopW | |||
| cmp x24, #16 | |||
| blt LoopW8 | |||
| LoopW16: | |||
| mov x19, #16 | |||
| mul x19, x19, x11 | |||
| mov x16, x23 | |||
| mov x17, x2 | |||
| mov x20, x6 | |||
| mov v0.16b, v24.16b | |||
| mov v1.16b, v24.16b | |||
| mov v2.16b, v24.16b | |||
| mov v3.16b, v24.16b | |||
| mov v4.16b, v24.16b | |||
| mov v5.16b, v24.16b | |||
| mov v6.16b, v24.16b | |||
| mov v7.16b, v24.16b | |||
| mov v8.16b, v24.16b | |||
| mov v9.16b, v24.16b | |||
| mov v10.16b, v24.16b | |||
| mov v11.16b, v24.16b | |||
| mov v12.16b, v24.16b | |||
| mov v13.16b, v24.16b | |||
| mov v14.16b, v24.16b | |||
| mov v15.16b, v24.16b | |||
| LoopKh16: | |||
| mov x18, x7 | |||
| mov x21, x16 | |||
| LoopKw16: | |||
| mov x22, x21 | |||
| ld1 {v25.4s}, [x17], #16 | |||
| ld1 {v16.4s}, [x22], x11 | |||
| ld1 {v17.4s}, [x22], x11 | |||
| fmla v0.4s, v16.4s, v25.4s | |||
| fmla v1.4s, v17.4s, v25.4s | |||
| ld1 {v18.4s}, [x22], x11 | |||
| ld1 {v19.4s}, [x22], x11 | |||
| fmla v2.4s, v18.4s, v25.4s | |||
| fmla v3.4s, v19.4s, v25.4s | |||
| ld1 {v20.4s}, [x22], x11 | |||
| ld1 {v21.4s}, [x22], x11 | |||
| fmla v4.4s, v20.4s, v25.4s | |||
| fmla v5.4s, v21.4s, v25.4s | |||
| ld1 {v22.4s}, [x22], x11 | |||
| ld1 {v23.4s}, [x22], x11 | |||
| fmla v6.4s, v22.4s, v25.4s | |||
| fmla v7.4s, v23.4s, v25.4s | |||
| ld1 {v16.4s}, [x22], x11 | |||
| ld1 {v17.4s}, [x22], x11 | |||
| fmla v8.4s, v16.4s, v25.4s | |||
| fmla v9.4s, v17.4s, v25.4s | |||
| ld1 {v18.4s}, [x22], x11 | |||
| ld1 {v19.4s}, [x22], x11 | |||
| fmla v10.4s, v18.4s, v25.4s | |||
| fmla v11.4s, v19.4s, v25.4s | |||
| ld1 {v20.4s}, [x22], x11 | |||
| ld1 {v21.4s}, [x22], x11 | |||
| fmla v12.4s, v20.4s, v25.4s | |||
| fmla v13.4s, v21.4s, v25.4s | |||
| ld1 {v22.4s}, [x22], x11 | |||
| ld1 {v23.4s}, [x22], x11 | |||
| fmla v14.4s, v22.4s, v25.4s | |||
| fmla v15.4s, v23.4s, v25.4s | |||
| subs x18, x18, #1 | |||
| add x21, x21, x13 | |||
| bne LoopKw16 | |||
| add x16, x16, x12 | |||
| subs x20, x20, #1 | |||
| bne LoopKh16 | |||
| cbnz x15, Relu616 | |||
| cbnz x14, Relu16 | |||
| b Write16 | |||
| Relu616: | |||
| fmin v0.4s, v0.4s, v26.4s | |||
| fmin v1.4s, v1.4s, v26.4s | |||
| fmin v2.4s, v2.4s, v26.4s | |||
| fmin v3.4s, v3.4s, v26.4s | |||
| fmin v4.4s, v4.4s, v26.4s | |||
| fmin v5.4s, v5.4s, v26.4s | |||
| fmin v6.4s, v6.4s, v26.4s | |||
| fmin v7.4s, v7.4s, v26.4s | |||
| fmin v8.4s, v8.4s, v26.4s | |||
| fmin v9.4s, v9.4s, v26.4s | |||
| fmin v10.4s, v10.4s, v26.4s | |||
| fmin v11.4s, v11.4s, v26.4s | |||
| fmin v12.4s, v12.4s, v26.4s | |||
| fmin v13.4s, v13.4s, v26.4s | |||
| fmin v14.4s, v14.4s, v26.4s | |||
| fmin v15.4s, v15.4s, v26.4s | |||
| Relu16: | |||
| fmax v0.4s, v0.4s, v27.4s | |||
| fmax v1.4s, v1.4s, v27.4s | |||
| fmax v2.4s, v2.4s, v27.4s | |||
| fmax v3.4s, v3.4s, v27.4s | |||
| fmax v4.4s, v4.4s, v27.4s | |||
| fmax v5.4s, v5.4s, v27.4s | |||
| fmax v6.4s, v6.4s, v27.4s | |||
| fmax v7.4s, v7.4s, v27.4s | |||
| fmax v8.4s, v8.4s, v27.4s | |||
| fmax v9.4s, v9.4s, v27.4s | |||
| fmax v10.4s, v10.4s, v27.4s | |||
| fmax v11.4s, v11.4s, v27.4s | |||
| fmax v12.4s, v12.4s, v27.4s | |||
| fmax v13.4s, v13.4s, v27.4s | |||
| fmax v14.4s, v14.4s, v27.4s | |||
| fmax v15.4s, v15.4s, v27.4s | |||
| Write16: | |||
| st1 {v0.4s}, [x3], x9 | |||
| st1 {v1.4s}, [x3], x9 | |||
| st1 {v2.4s}, [x3], x9 | |||
| st1 {v3.4s}, [x3], x9 | |||
| st1 {v4.4s}, [x3], x9 | |||
| st1 {v5.4s}, [x3], x9 | |||
| st1 {v6.4s}, [x3], x9 | |||
| st1 {v7.4s}, [x3], x9 | |||
| st1 {v8.4s}, [x3], x9 | |||
| st1 {v9.4s}, [x3], x9 | |||
| st1 {v10.4s}, [x3], x9 | |||
| st1 {v11.4s}, [x3], x9 | |||
| st1 {v12.4s}, [x3], x9 | |||
| st1 {v13.4s}, [x3], x9 | |||
| st1 {v14.4s}, [x3], x9 | |||
| st1 {v15.4s}, [x3], x9 | |||
| add x23, x23, x19 | |||
| sub x24, x24, #16 | |||
| cmp x24, #0 | |||
| ble LoopWEnd | |||
| cmp x24, #8 | |||
| blt LoopW | |||
| cmp x24, #16 | |||
| bge LoopW16 | |||
| LoopW8: | |||
| mov x19, #8 | |||
| mul x19, x19, x11 | |||
| mov x16, x23 | |||
| mov x17, x2 | |||
| mov x20, x6 | |||
| mov v0.16b, v24.16b | |||
| mov v1.16b, v24.16b | |||
| mov v2.16b, v24.16b | |||
| mov v3.16b, v24.16b | |||
| mov v4.16b, v24.16b | |||
| mov v5.16b, v24.16b | |||
| mov v6.16b, v24.16b | |||
| mov v7.16b, v24.16b | |||
| LoopKh8: | |||
| mov x18, x7 | |||
| mov x21, x16 | |||
| LoopKw8: | |||
| mov x22, x21 | |||
| ld1 {v25.4s}, [x17], #16 | |||
| ld1 {v16.4s}, [x22], x11 | |||
| ld1 {v17.4s}, [x22], x11 | |||
| fmla v0.4s, v16.4s, v25.4s | |||
| fmla v1.4s, v17.4s, v25.4s | |||
| ld1 {v18.4s}, [x22], x11 | |||
| ld1 {v19.4s}, [x22], x11 | |||
| fmla v2.4s, v18.4s, v25.4s | |||
| fmla v3.4s, v19.4s, v25.4s | |||
| ld1 {v20.4s}, [x22], x11 | |||
| ld1 {v21.4s}, [x22], x11 | |||
| fmla v4.4s, v20.4s, v25.4s | |||
| fmla v5.4s, v21.4s, v25.4s | |||
| ld1 {v22.4s}, [x22], x11 | |||
| ld1 {v23.4s}, [x22], x11 | |||
| fmla v6.4s, v22.4s, v25.4s | |||
| fmla v7.4s, v23.4s, v25.4s | |||
| subs x18, x18, #1 | |||
| add x21, x21, x13 | |||
| bne LoopKw8 | |||
| add x16, x16, x12 | |||
| subs x20, x20, #1 | |||
| bne LoopKh8 | |||
| cbnz x15, Relu68 | |||
| cbnz x14, Relu8 | |||
| b Write8 | |||
| Relu68: | |||
| fmin v0.4s, v0.4s, v26.4s | |||
| fmin v1.4s, v1.4s, v26.4s | |||
| fmin v2.4s, v2.4s, v26.4s | |||
| fmin v3.4s, v3.4s, v26.4s | |||
| fmin v4.4s, v4.4s, v26.4s | |||
| fmin v5.4s, v5.4s, v26.4s | |||
| fmin v6.4s, v6.4s, v26.4s | |||
| fmin v7.4s, v7.4s, v26.4s | |||
| Relu8: | |||
| fmax v0.4s, v0.4s, v27.4s | |||
| fmax v1.4s, v1.4s, v27.4s | |||
| fmax v2.4s, v2.4s, v27.4s | |||
| fmax v3.4s, v3.4s, v27.4s | |||
| fmax v4.4s, v4.4s, v27.4s | |||
| fmax v5.4s, v5.4s, v27.4s | |||
| fmax v6.4s, v6.4s, v27.4s | |||
| fmax v7.4s, v7.4s, v27.4s | |||
| Write8: | |||
| st1 {v0.4s}, [x3], x9 | |||
| st1 {v1.4s}, [x3], x9 | |||
| st1 {v2.4s}, [x3], x9 | |||
| st1 {v3.4s}, [x3], x9 | |||
| st1 {v4.4s}, [x3], x9 | |||
| st1 {v5.4s}, [x3], x9 | |||
| st1 {v6.4s}, [x3], x9 | |||
| st1 {v7.4s}, [x3], x9 | |||
| add x23, x23, x19 | |||
| sub x24, x24, #8 | |||
| cmp x24, #0 | |||
| ble LoopWEnd | |||
| cmp x24, #8 | |||
| bge LoopW8 | |||
| LoopW: | |||
| mov x16, x23 | |||
| mov x17, x2 | |||
| mov x20, x6 | |||
| mov v0.16b, v5.16b | |||
| mov v0.16b, v24.16b | |||
| LoopKh: | |||
| mov x18, x7 | |||
| mov x22, x16 | |||
| LoopKw: | |||
| ld1 {v1.4s}, [x22], x13 | |||
| ld1 {v2.4s}, [x17], #16 | |||
| fmla v0.4s, v1.4s, v2.4s | |||
| ld1 {v16.4s}, [x22], x13 | |||
| ld1 {v25.4s}, [x17], #16 | |||
| fmla v0.4s, v16.4s, v25.4s | |||
| subs x18, x18, #1 | |||
| bne LoopKw | |||
| add x16, x16, x12 | |||
| @@ -59,17 +273,15 @@ ConvDwFp32Center: | |||
| cbnz x14, Relu | |||
| b Write | |||
| Relu6: | |||
| movi v4.4s, #6 | |||
| scvtf v4.4s, v4.4s | |||
| fmin v0.4s, v0.4s, v4.4s | |||
| fmin v0.4s, v0.4s, v26.4s | |||
| Relu: | |||
| dup v3.4s, wzr | |||
| fmax v0.4s, v0.4s, v3.4s | |||
| fmax v0.4s, v0.4s, v27.4s | |||
| Write: | |||
| st1 {v0.4s}, [x3], x9 | |||
| add x23, x23, x11 | |||
| subs x24, x24, #1 | |||
| bne LoopW | |||
| LoopWEnd: | |||
| add x0, x0, x8 | |||
| add x1, x1, x10 | |||
| subs x4, x4, #1 | |||
| @@ -0,0 +1,558 @@ | |||
| #ifdef __aarch64__ | |||
| .text | |||
| .align 5 | |||
| .global ConvDwInt8Center | |||
| #ifndef __APPLE__ | |||
| .type ConvDwInt8Center, %function | |||
| #endif | |||
| // void ConvDwInt8Center(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, size_t height, size_t width, | |||
| // size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step, | |||
| // size_t in_kh_step, size_t in_kw_step, int out_multiplier, int left_shift, | |||
| // int right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max); | |||
| // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: weight, x6: kernel_h, x7: kernel_w, | |||
| // x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step | |||
| // x14: out_multiplier, #56: left_shift, #64: right_shift, #72:out_zp, #80: acc_min, #88: acc_max | |||
| ConvDwInt8Center: | |||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | |||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | |||
| // x19 ~ x29 should be also preserved | |||
| // whereas our coding style do not permit such amount of parameters | |||
| sub sp, sp, #48 | |||
| stp x19, x20, [sp], #16 | |||
| stp x21, x22, [sp], #16 | |||
| stp x23, x24, [sp], #16 | |||
| ldr x8, [sp] | |||
| ldr x9, [sp, #8] | |||
| ldr x10, [sp, #16] | |||
| ldr x11, [sp, #24] | |||
| ldr x12, [sp, #32] | |||
| ldr x13, [sp, #40] | |||
| ldr w14, [sp, #56] | |||
| dup v26.4s, w14 | |||
| ldr x15, [sp, #48] | |||
| dup v27.4s, w15 | |||
| ldr w16, [sp, #64] | |||
| dup v28.4s, w16 | |||
| ldr w17, [sp, #72] | |||
| dup v29.4s, w17 | |||
| ldr w18, [sp, #80] | |||
| dup v30.4s, w18 | |||
| ldr w19, [sp, #88] | |||
| dup v31.4s, w19 | |||
| ld1 {v24.4s}, [x3] | |||
| LoopH: | |||
| mov x23, x1 | |||
| mov x24, x5 | |||
| mov x3, x0 | |||
| cmp x24, #8 | |||
| blt LoopW | |||
| cmp x24, #16 | |||
| blt LoopW8 | |||
| LoopW16: | |||
| mov x19, #16 | |||
| mul x19, x19, x11 | |||
| mov x16, x23 | |||
| mov x17, x2 | |||
| mov x20, x6 | |||
| mov v0.16b, v24.16b | |||
| mov v1.16b, v24.16b | |||
| mov v2.16b, v24.16b | |||
| mov v3.16b, v24.16b | |||
| mov v4.16b, v24.16b | |||
| mov v5.16b, v24.16b | |||
| mov v6.16b, v24.16b | |||
| mov v7.16b, v24.16b | |||
| mov v8.16b, v24.16b | |||
| mov v9.16b, v24.16b | |||
| mov v10.16b, v24.16b | |||
| mov v11.16b, v24.16b | |||
| mov v12.16b, v24.16b | |||
| mov v13.16b, v24.16b | |||
| mov v14.16b, v24.16b | |||
| mov v15.16b, v24.16b | |||
| LoopKh16: | |||
| mov x18, x7 | |||
| mov x21, x16 | |||
| LoopKw16: | |||
| mov x22, x21 | |||
| ld1 {v25.4h}, [x17], #8 | |||
| ld1 {v16.4h}, [x22], x13 | |||
| ld1 {v17.4h}, [x22], x13 | |||
| smlal v0.4s, v16.4h, v25.4h | |||
| smlal v1.4s, v17.4h, v25.4h | |||
| ld1 {v18.4h}, [x22], x13 | |||
| ld1 {v19.4h}, [x22], x13 | |||
| smlal v2.4s, v18.4h, v25.4h | |||
| smlal v3.4s, v19.4h, v25.4h | |||
| ld1 {v20.4h}, [x22], x13 | |||
| ld1 {v21.4h}, [x22], x13 | |||
| smlal v4.4s, v20.4h, v25.4h | |||
| smlal v5.4s, v21.4h, v25.4h | |||
| ld1 {v22.4h}, [x22], x13 | |||
| ld1 {v23.4h}, [x22], x13 | |||
| smlal v6.4s, v22.4h, v25.4h | |||
| smlal v7.4s, v23.4h, v25.4h | |||
| ld1 {v16.4h}, [x22], x13 | |||
| ld1 {v17.4h}, [x22], x13 | |||
| smlal v8.4s, v16.4h, v25.4h | |||
| smlal v9.4s, v17.4h, v25.4h | |||
| ld1 {v18.4h}, [x22], x13 | |||
| ld1 {v19.4h}, [x22], x13 | |||
| smlal v10.4s, v18.4h, v25.4h | |||
| smlal v11.4s, v19.4h, v25.4h | |||
| ld1 {v20.4h}, [x22], x13 | |||
| ld1 {v21.4h}, [x22], x13 | |||
| smlal v12.4s, v20.4h, v25.4h | |||
| smlal v13.4s, v21.4h, v25.4h | |||
| ld1 {v22.4h}, [x22], x13 | |||
| ld1 {v23.4h}, [x22], x13 | |||
| smlal v14.4s, v22.4h, v25.4h | |||
| smlal v15.4s, v23.4h, v25.4h | |||
| subs x18, x18, #1 | |||
| add x21, x21, x13 | |||
| bne LoopKw16 | |||
| add x16, x16, x12 | |||
| subs x20, x20, #1 | |||
| bne LoopKh16 | |||
| sqshl v0.4s, v0.4s ,v26.4s | |||
| sqshl v1.4s, v1.4s ,v26.4s | |||
| sqshl v2.4s, v2.4s ,v26.4s | |||
| sqshl v3.4s, v3.4s ,v26.4s | |||
| sqshl v4.4s, v4.4s ,v26.4s | |||
| sqshl v5.4s, v5.4s ,v26.4s | |||
| sqshl v6.4s, v6.4s ,v26.4s | |||
| sqshl v7.4s, v7.4s ,v26.4s | |||
| sqshl v8.4s, v8.4s ,v26.4s | |||
| sqshl v9.4s, v9.4s ,v26.4s | |||
| sqshl v10.4s, v10.4s ,v26.4s | |||
| sqshl v11.4s, v11.4s ,v26.4s | |||
| sqshl v12.4s, v12.4s ,v26.4s | |||
| sqshl v13.4s, v13.4s ,v26.4s | |||
| sqshl v14.4s, v14.4s ,v26.4s | |||
| sqshl v15.4s, v15.4s ,v26.4s | |||
| sqrdmulh v0.4s, v0.4s ,v27.4s | |||
| sqrdmulh v1.4s, v1.4s ,v27.4s | |||
| sqrdmulh v2.4s, v2.4s ,v27.4s | |||
| sqrdmulh v3.4s, v3.4s ,v27.4s | |||
| sqrdmulh v4.4s, v4.4s ,v27.4s | |||
| sqrdmulh v5.4s, v5.4s ,v27.4s | |||
| sqrdmulh v6.4s, v6.4s ,v27.4s | |||
| sqrdmulh v7.4s, v7.4s ,v27.4s | |||
| sqrdmulh v8.4s, v8.4s ,v27.4s | |||
| sqrdmulh v9.4s, v9.4s ,v27.4s | |||
| sqrdmulh v10.4s, v10.4s ,v27.4s | |||
| sqrdmulh v11.4s, v11.4s ,v27.4s | |||
| sqrdmulh v12.4s, v12.4s ,v27.4s | |||
| sqrdmulh v13.4s, v13.4s ,v27.4s | |||
| sqrdmulh v14.4s, v14.4s ,v27.4s | |||
| sqrdmulh v15.4s, v15.4s ,v27.4s | |||
| sqrshl v0.4s, v0.4s ,v28.4s | |||
| sqrshl v1.4s, v1.4s ,v28.4s | |||
| sqrshl v2.4s, v2.4s ,v28.4s | |||
| sqrshl v3.4s, v3.4s ,v28.4s | |||
| sqrshl v4.4s, v4.4s ,v28.4s | |||
| sqrshl v5.4s, v5.4s ,v28.4s | |||
| sqrshl v6.4s, v6.4s ,v28.4s | |||
| sqrshl v7.4s, v7.4s ,v28.4s | |||
| sqrshl v8.4s, v8.4s ,v28.4s | |||
| sqrshl v9.4s, v9.4s ,v28.4s | |||
| sqrshl v10.4s, v10.4s ,v28.4s | |||
| sqrshl v11.4s, v11.4s ,v28.4s | |||
| sqrshl v12.4s, v12.4s ,v28.4s | |||
| sqrshl v13.4s, v13.4s ,v28.4s | |||
| sqrshl v14.4s, v14.4s ,v28.4s | |||
| sqrshl v15.4s, v15.4s ,v28.4s | |||
| add v0.4s, v0.4s ,v29.4s | |||
| add v1.4s, v1.4s ,v29.4s | |||
| add v2.4s, v2.4s ,v29.4s | |||
| add v3.4s, v3.4s ,v29.4s | |||
| add v4.4s, v4.4s ,v29.4s | |||
| add v5.4s, v5.4s ,v29.4s | |||
| add v6.4s, v6.4s ,v29.4s | |||
| add v7.4s, v7.4s ,v29.4s | |||
| add v8.4s, v8.4s ,v29.4s | |||
| add v9.4s, v9.4s ,v29.4s | |||
| add v10.4s, v10.4s ,v29.4s | |||
| add v11.4s, v11.4s ,v29.4s | |||
| add v12.4s, v12.4s ,v29.4s | |||
| add v13.4s, v13.4s ,v29.4s | |||
| add v14.4s, v14.4s ,v29.4s | |||
| add v15.4s, v15.4s ,v29.4s | |||
| smax v0.4s, v0.4s ,v30.4s | |||
| smax v1.4s, v1.4s ,v30.4s | |||
| smax v2.4s, v2.4s ,v30.4s | |||
| smax v3.4s, v3.4s ,v30.4s | |||
| smax v4.4s, v4.4s ,v30.4s | |||
| smax v5.4s, v5.4s ,v30.4s | |||
| smax v6.4s, v6.4s ,v30.4s | |||
| smax v7.4s, v7.4s ,v30.4s | |||
| smax v8.4s, v8.4s ,v30.4s | |||
| smax v9.4s, v9.4s ,v30.4s | |||
| smax v10.4s, v10.4s ,v30.4s | |||
| smax v11.4s, v11.4s ,v30.4s | |||
| smax v12.4s, v12.4s ,v30.4s | |||
| smax v13.4s, v13.4s ,v30.4s | |||
| smax v14.4s, v14.4s ,v30.4s | |||
| smax v15.4s, v15.4s ,v30.4s | |||
| smin v0.4s, v0.4s ,v31.4s | |||
| smin v1.4s, v1.4s ,v31.4s | |||
| smin v2.4s, v2.4s ,v31.4s | |||
| smin v3.4s, v3.4s ,v31.4s | |||
| smin v4.4s, v4.4s ,v31.4s | |||
| smin v5.4s, v5.4s ,v31.4s | |||
| smin v6.4s, v6.4s ,v31.4s | |||
| smin v7.4s, v7.4s ,v31.4s | |||
| smin v8.4s, v8.4s ,v31.4s | |||
| smin v9.4s, v9.4s ,v31.4s | |||
| smin v10.4s, v10.4s ,v31.4s | |||
| smin v11.4s, v11.4s ,v31.4s | |||
| smin v12.4s, v12.4s ,v31.4s | |||
| smin v13.4s, v13.4s ,v31.4s | |||
| smin v14.4s, v14.4s ,v31.4s | |||
| smin v15.4s, v15.4s ,v31.4s | |||
| sqxtn v0.4h, v0.4s | |||
| sqxtn v1.4h, v1.4s | |||
| sqxtn v2.4h, v2.4s | |||
| sqxtn v3.4h, v3.4s | |||
| sqxtn v4.4h, v4.4s | |||
| sqxtn v5.4h, v5.4s | |||
| sqxtn v6.4h, v6.4s | |||
| sqxtn v7.4h, v7.4s | |||
| sqxtn v8.4h, v8.4s | |||
| sqxtn v9.4h, v9.4s | |||
| sqxtn v10.4h, v10.4s | |||
| sqxtn v11.4h, v11.4s | |||
| sqxtn v12.4h, v12.4s | |||
| sqxtn v13.4h, v13.4s | |||
| sqxtn v14.4h, v14.4s | |||
| sqxtn v15.4h, v15.4s | |||
| sqxtn v0.8b, v0.8h | |||
| sqxtn v1.8b, v1.8h | |||
| sqxtn v2.8b, v2.8h | |||
| sqxtn v3.8b, v3.8h | |||
| sqxtn v4.8b, v4.8h | |||
| sqxtn v5.8b, v5.8h | |||
| sqxtn v6.8b, v6.8h | |||
| sqxtn v7.8b, v7.8h | |||
| sqxtn v8.8b, v8.8h | |||
| sqxtn v9.8b, v9.8h | |||
| sqxtn v10.8b, v10.8h | |||
| sqxtn v11.8b, v11.8h | |||
| sqxtn v12.8b, v12.8h | |||
| sqxtn v13.8b, v13.8h | |||
| sqxtn v14.8b, v14.8h | |||
| sqxtn v15.8b, v15.8h | |||
| add x17, x3, #1 | |||
| add x18, x3, #2 | |||
| add x21, x3, #3 | |||
| st1 {v0.b}[0], [x3], x9 | |||
| st1 {v0.b}[1], [x17], x9 | |||
| st1 {v0.b}[2], [x18], x9 | |||
| st1 {v0.b}[3], [x21], x9 | |||
| st1 {v1.b}[0], [x3], x9 | |||
| st1 {v1.b}[1], [x17], x9 | |||
| st1 {v1.b}[2], [x18], x9 | |||
| st1 {v1.b}[3], [x21], x9 | |||
| st1 {v2.b}[0], [x3], x9 | |||
| st1 {v2.b}[1], [x17], x9 | |||
| st1 {v2.b}[2], [x18], x9 | |||
| st1 {v2.b}[3], [x21], x9 | |||
| st1 {v3.b}[0], [x3], x9 | |||
| st1 {v3.b}[1], [x17], x9 | |||
| st1 {v3.b}[2], [x18], x9 | |||
| st1 {v3.b}[3], [x21], x9 | |||
| st1 {v4.b}[0], [x3], x9 | |||
| st1 {v4.b}[1], [x17], x9 | |||
| st1 {v4.b}[2], [x18], x9 | |||
| st1 {v4.b}[3], [x21], x9 | |||
| st1 {v5.b}[0], [x3], x9 | |||
| st1 {v5.b}[1], [x17], x9 | |||
| st1 {v5.b}[2], [x18], x9 | |||
| st1 {v5.b}[3], [x21], x9 | |||
| st1 {v6.b}[0], [x3], x9 | |||
| st1 {v6.b}[1], [x17], x9 | |||
| st1 {v6.b}[2], [x18], x9 | |||
| st1 {v6.b}[3], [x21], x9 | |||
| st1 {v7.b}[0], [x3], x9 | |||
| st1 {v7.b}[1], [x17], x9 | |||
| st1 {v7.b}[2], [x18], x9 | |||
| st1 {v7.b}[3], [x21], x9 | |||
| st1 {v8.b}[0], [x3], x9 | |||
| st1 {v8.b}[1], [x17], x9 | |||
| st1 {v8.b}[2], [x18], x9 | |||
| st1 {v8.b}[3], [x21], x9 | |||
| st1 {v9.b}[0], [x3], x9 | |||
| st1 {v9.b}[1], [x17], x9 | |||
| st1 {v9.b}[2], [x18], x9 | |||
| st1 {v9.b}[3], [x21], x9 | |||
| st1 {v10.b}[0], [x3], x9 | |||
| st1 {v10.b}[1], [x17], x9 | |||
| st1 {v10.b}[2], [x18], x9 | |||
| st1 {v10.b}[3], [x21], x9 | |||
| st1 {v11.b}[0], [x3], x9 | |||
| st1 {v11.b}[1], [x17], x9 | |||
| st1 {v11.b}[2], [x18], x9 | |||
| st1 {v11.b}[3], [x21], x9 | |||
| st1 {v12.b}[0], [x3], x9 | |||
| st1 {v12.b}[1], [x17], x9 | |||
| st1 {v12.b}[2], [x18], x9 | |||
| st1 {v12.b}[3], [x21], x9 | |||
| st1 {v13.b}[0], [x3], x9 | |||
| st1 {v13.b}[1], [x17], x9 | |||
| st1 {v13.b}[2], [x18], x9 | |||
| st1 {v13.b}[3], [x21], x9 | |||
| st1 {v14.b}[0], [x3], x9 | |||
| st1 {v14.b}[1], [x17], x9 | |||
| st1 {v14.b}[2], [x18], x9 | |||
| st1 {v14.b}[3], [x21], x9 | |||
| st1 {v15.b}[0], [x3], x9 | |||
| st1 {v15.b}[1], [x17], x9 | |||
| st1 {v15.b}[2], [x18], x9 | |||
| st1 {v15.b}[3], [x21], x9 | |||
| add x23, x23, x19 | |||
| sub x24, x24, #16 | |||
| cmp x24, #0 | |||
| ble LoopWEnd | |||
| cmp x24, #8 | |||
| blt LoopW | |||
| cmp x24, #16 | |||
| bge LoopW16 | |||
| LoopW8: | |||
| mov x19, #8 | |||
| mul x19, x19, x11 | |||
| mov x16, x23 | |||
| mov x17, x2 | |||
| mov x20, x6 | |||
| mov v0.16b, v24.16b | |||
| mov v1.16b, v24.16b | |||
| mov v2.16b, v24.16b | |||
| mov v3.16b, v24.16b | |||
| mov v4.16b, v24.16b | |||
| mov v5.16b, v24.16b | |||
| mov v6.16b, v24.16b | |||
| mov v7.16b, v24.16b | |||
| LoopKh8: | |||
| mov x18, x7 | |||
| mov x21, x16 | |||
| LoopKw8: | |||
| mov x22, x21 | |||
| ld1 {v25.4h}, [x17], #8 | |||
| ld1 {v16.4h}, [x22], x13 | |||
| ld1 {v17.4h}, [x22], x13 | |||
| smlal v0.4s, v16.4h, v25.4h | |||
| smlal v1.4s, v17.4h, v25.4h | |||
| ld1 {v18.4h}, [x22], x13 | |||
| ld1 {v19.4h}, [x22], x13 | |||
| smlal v2.4s, v18.4h, v25.4h | |||
| smlal v3.4s, v19.4h, v25.4h | |||
| ld1 {v20.4h}, [x22], x13 | |||
| ld1 {v21.4h}, [x22], x13 | |||
| smlal v4.4s, v20.4h, v25.4h | |||
| smlal v5.4s, v21.4h, v25.4h | |||
| ld1 {v22.4h}, [x22], x13 | |||
| ld1 {v23.4h}, [x22], x13 | |||
| smlal v6.4s, v22.4h, v25.4h | |||
| smlal v7.4s, v23.4h, v25.4h | |||
| subs x18, x18, #1 | |||
| add x21, x21, x13 | |||
| bne LoopKw8 | |||
| add x16, x16, x12 | |||
| subs x20, x20, #1 | |||
| bne LoopKh8 | |||
| sqshl v0.4s, v0.4s ,v26.4s | |||
| sqshl v1.4s, v1.4s ,v26.4s | |||
| sqshl v2.4s, v2.4s ,v26.4s | |||
| sqshl v3.4s, v3.4s ,v26.4s | |||
| sqshl v4.4s, v4.4s ,v26.4s | |||
| sqshl v5.4s, v5.4s ,v26.4s | |||
| sqshl v6.4s, v6.4s ,v26.4s | |||
| sqshl v7.4s, v7.4s ,v26.4s | |||
| sqrdmulh v0.4s, v0.4s ,v27.4s | |||
| sqrdmulh v1.4s, v1.4s ,v27.4s | |||
| sqrdmulh v2.4s, v2.4s ,v27.4s | |||
| sqrdmulh v3.4s, v3.4s ,v27.4s | |||
| sqrdmulh v4.4s, v4.4s ,v27.4s | |||
| sqrdmulh v5.4s, v5.4s ,v27.4s | |||
| sqrdmulh v6.4s, v6.4s ,v27.4s | |||
| sqrdmulh v7.4s, v7.4s ,v27.4s | |||
| sqrshl v0.4s, v0.4s ,v28.4s | |||
| sqrshl v1.4s, v1.4s ,v28.4s | |||
| sqrshl v2.4s, v2.4s ,v28.4s | |||
| sqrshl v3.4s, v3.4s ,v28.4s | |||
| sqrshl v4.4s, v4.4s ,v28.4s | |||
| sqrshl v5.4s, v5.4s ,v28.4s | |||
| sqrshl v6.4s, v6.4s ,v28.4s | |||
| sqrshl v7.4s, v7.4s ,v28.4s | |||
| add v0.4s, v0.4s ,v29.4s | |||
| add v1.4s, v1.4s ,v29.4s | |||
| add v2.4s, v2.4s ,v29.4s | |||
| add v3.4s, v3.4s ,v29.4s | |||
| add v4.4s, v4.4s ,v29.4s | |||
| add v5.4s, v5.4s ,v29.4s | |||
| add v6.4s, v6.4s ,v29.4s | |||
| add v7.4s, v7.4s ,v29.4s | |||
| smax v0.4s, v0.4s ,v30.4s | |||
| smax v1.4s, v1.4s ,v30.4s | |||
| smax v2.4s, v2.4s ,v30.4s | |||
| smax v3.4s, v3.4s ,v30.4s | |||
| smax v4.4s, v4.4s ,v30.4s | |||
| smax v5.4s, v5.4s ,v30.4s | |||
| smax v6.4s, v6.4s ,v30.4s | |||
| smax v7.4s, v7.4s ,v30.4s | |||
| smin v0.4s, v0.4s ,v31.4s | |||
| smin v1.4s, v1.4s ,v31.4s | |||
| smin v2.4s, v2.4s ,v31.4s | |||
| smin v3.4s, v3.4s ,v31.4s | |||
| smin v4.4s, v4.4s ,v31.4s | |||
| smin v5.4s, v5.4s ,v31.4s | |||
| smin v6.4s, v6.4s ,v31.4s | |||
| smin v7.4s, v7.4s ,v31.4s | |||
| sqxtn v0.4h, v0.4s | |||
| sqxtn v1.4h, v1.4s | |||
| sqxtn v2.4h, v2.4s | |||
| sqxtn v3.4h, v3.4s | |||
| sqxtn v4.4h, v4.4s | |||
| sqxtn v5.4h, v5.4s | |||
| sqxtn v6.4h, v6.4s | |||
| sqxtn v7.4h, v7.4s | |||
| sqxtn v0.8b, v0.8h | |||
| sqxtn v1.8b, v1.8h | |||
| sqxtn v2.8b, v2.8h | |||
| sqxtn v3.8b, v3.8h | |||
| sqxtn v4.8b, v4.8h | |||
| sqxtn v5.8b, v5.8h | |||
| sqxtn v6.8b, v6.8h | |||
| sqxtn v7.8b, v7.8h | |||
| add x17, x3, #1 | |||
| add x18, x3, #2 | |||
| add x21, x3, #3 | |||
| st1 {v0.b}[0], [x3], x9 | |||
| st1 {v0.b}[1], [x17], x9 | |||
| st1 {v0.b}[2], [x18], x9 | |||
| st1 {v0.b}[3], [x21], x9 | |||
| st1 {v1.b}[0], [x3], x9 | |||
| st1 {v1.b}[1], [x17], x9 | |||
| st1 {v1.b}[2], [x18], x9 | |||
| st1 {v1.b}[3], [x21], x9 | |||
| st1 {v2.b}[0], [x3], x9 | |||
| st1 {v2.b}[1], [x17], x9 | |||
| st1 {v2.b}[2], [x18], x9 | |||
| st1 {v2.b}[3], [x21], x9 | |||
| st1 {v3.b}[0], [x3], x9 | |||
| st1 {v3.b}[1], [x17], x9 | |||
| st1 {v3.b}[2], [x18], x9 | |||
| st1 {v3.b}[3], [x21], x9 | |||
| st1 {v4.b}[0], [x3], x9 | |||
| st1 {v4.b}[1], [x17], x9 | |||
| st1 {v4.b}[2], [x18], x9 | |||
| st1 {v4.b}[3], [x21], x9 | |||
| st1 {v5.b}[0], [x3], x9 | |||
| st1 {v5.b}[1], [x17], x9 | |||
| st1 {v5.b}[2], [x18], x9 | |||
| st1 {v5.b}[3], [x21], x9 | |||
| st1 {v6.b}[0], [x3], x9 | |||
| st1 {v6.b}[1], [x17], x9 | |||
| st1 {v6.b}[2], [x18], x9 | |||
| st1 {v6.b}[3], [x21], x9 | |||
| st1 {v7.b}[0], [x3], x9 | |||
| st1 {v7.b}[1], [x17], x9 | |||
| st1 {v7.b}[2], [x18], x9 | |||
| st1 {v7.b}[3], [x21], x9 | |||
| add x23, x23, x19 | |||
| sub x24, x24, #8 | |||
| cmp x24, #0 | |||
| ble LoopWEnd | |||
| cmp x24, #8 | |||
| bge LoopW8 | |||
| LoopW: | |||
| mov x16, x23 | |||
| mov x17, x2 | |||
| mov x20, x6 | |||
| mov v0.16b, v24.16b | |||
| LoopKh: | |||
| mov x18, x7 | |||
| mov x22, x16 | |||
| LoopKw: | |||
| ld1 {v16.4h}, [x22], x13 | |||
| ld1 {v25.4h}, [x17], #8 | |||
| smlal v0.4s, v16.4h, v25.4h | |||
| subs x18, x18, #1 | |||
| bne LoopKw | |||
| add x16, x16, x12 | |||
| subs x20, x20, #1 | |||
| bne LoopKh | |||
| sqshl v0.4s, v0.4s ,v26.4s | |||
| sqrdmulh v0.4s, v0.4s ,v27.4s | |||
| sqrshl v0.4s, v0.4s ,v28.4s | |||
| add v0.4s, v0.4s ,v29.4s | |||
| smax v0.4s, v0.4s ,v30.4s | |||
| smin v0.4s, v0.4s ,v31.4s | |||
| sqxtn v0.4h, v0.4s | |||
| sqxtn v0.8b, v0.8h | |||
| mov x17, x3 | |||
| st1 {v0.b}[0], [x17], #1 | |||
| st1 {v0.b}[1], [x17], #1 | |||
| st1 {v0.b}[2], [x17], #1 | |||
| st1 {v0.b}[3], [x17], #1 | |||
| add x3, x3, x9 | |||
| add x23, x23, x11 | |||
| subs x24, x24, #1 | |||
| bne LoopW | |||
| LoopWEnd: | |||
| add x0, x0, x8 | |||
| add x1, x1, x10 | |||
| subs x4, x4, #1 | |||
| bne LoopH | |||
| sub sp, sp, #48 | |||
| ldp x19, x20, [sp], #16 | |||
| ldp x21, x22, [sp], #16 | |||
| ldp x23, x24, [sp], #16 | |||
| ret | |||
| #endif | |||
| @@ -35,12 +35,12 @@ DeconvDwFp32Center: | |||
| mov x18, x15 | |||
| mov x19, x2 | |||
| mov x20, x5 | |||
| dup v0.4s, wzr | |||
| ld1 {v1.4s}, [x16], x8 | |||
| LoopKh: | |||
| mov x21, x18 | |||
| mov x13, x6 | |||
| LoopKw: | |||
| ld1 {v1.4s}, [x16] | |||
| ld1 {v0.4s}, [x21] | |||
| ld1 {v2.4s}, [x19], #16 | |||
| fmla v0.4s, v1.4s, v2.4s | |||
| st1 {v0.4s}, [x21], x12 | |||
| @@ -50,7 +50,6 @@ DeconvDwFp32Center: | |||
| subs x20, x20, #1 | |||
| bne LoopKh | |||
| add x15, x15, x10 | |||
| add x16, x16, x8 | |||
| subs x17, x17, #1 | |||
| bne LoopW | |||
| add x0, x0, x9 | |||
| @@ -0,0 +1,65 @@ | |||
#ifdef __aarch64__
    .text
    .align 5
    .global DeconvDwInt8Center
#ifndef __APPLE__
    .type DeconvDwInt8Center, %function
#endif

// void DeconvDwInt8Center(int32_t *dst, const int16_t *src, const int16_t *weight, size_t height, size_t width,
//                         size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,
//                         size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
// Register arguments: x0: dst, x1: src, x2: weight, x3: height, x4: width,
//                     x5: kernel_h, x6: kernel_w, x7: out_h_step
// Stack arguments (all step values are byte offsets, used directly as address increments):
//                     block_channel, in_sh_step, in_sw_step, in_kh_step, in_kw_step
DeconvDwInt8Center:
    // AAPCS64: x19-x22 are callee-saved, so they are spilled to a frame kept
    // ABOVE sp for the whole function. (The previous prologue post-indexed sp
    // back to its entry value, leaving the saved registers BELOW sp where a
    // signal handler may legally clobber them -- AArch64 has no red zone.)
    // v8-v15 are not touched, so no FP spills are needed.
    sub sp, sp, #32
    stp x19, x20, [sp]
    stp x21, x22, [sp, #16]
    // Incoming stack arguments now sit above the 32-byte frame.
    ldr x8, [sp, #32]     // block_channel (bytes)
    ldr x9, [sp, #40]     // in_sh_step (bytes)
    ldr x10, [sp, #48]    // in_sw_step (bytes)
    ldr x11, [sp, #56]    // in_kh_step (bytes)
    ldr x12, [sp, #64]    // in_kw_step (bytes)
    // NOTE(review): x18 is the reserved platform register on Apple/Windows
    // AArch64 targets; using it as a scratch pointer is only safe on Linux/
    // Android -- confirm the supported target set.
LoopH:
    mov x15, x0           // dst_w: output row cursor
    mov x16, x1           // src_w: input row cursor
    mov x17, x4           // width counter
LoopW:
    mov x18, x15          // dst_kh
    mov x19, x2           // weight_kh: weights restart every output pixel
    mov x20, x5           // kernel_h counter
    // Load 4 int16 input channels once per output pixel and advance src by
    // block_channel via the post-index writeback. (Previously a redundant
    // "add x16, x16, x8" at the bottom of LoopW advanced src a SECOND time
    // per iteration, skipping every other pixel -- the fp32 and fp16 twins
    // of this kernel advance src exactly once.)
    ld1 {v1.4h}, [x16], x8
LoopKh:
    mov x21, x18          // dst_kw
    mov x13, x6           // kernel_w counter
LoopKw:
    ld1 {v0.4s}, [x21]            // 4 int32 accumulators (read-modify-write)
    ld1 {v2.4h}, [x19], #8        // 4 int16 weights
    smlal v0.4s, v1.4h, v2.4h     // widening multiply-accumulate into int32
    st1 {v0.4s}, [x21], x12       // scatter back, stride in_kw_step
    subs x13, x13, #1
    bne LoopKw
    add x18, x18, x11             // next kernel row: dst += in_kh_step
    subs x20, x20, #1
    bne LoopKh
    add x15, x15, x10             // next output column: dst += in_sw_step
    subs x17, x17, #1
    bne LoopW
    add x0, x0, x9                // next output row: dst += in_sh_step
    add x1, x1, x7                // next input row: src += out_h_step
    subs x3, x3, #1
    bne LoopH
    ldp x19, x20, [sp]
    ldp x21, x22, [sp, #16]
    add sp, sp, #32
    ret
#endif
| @@ -0,0 +1,294 @@ | |||
#ifdef __aarch64__
    .text
    .align 5
    .global ConvDwFp16Center
#ifndef __APPLE__
    .type ConvDwFp16Center, %function
#endif

// void ConvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
//                       size_t height, size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step,
//                       size_t block_channel, size_t in_sh_step, size_t in_sw_step, size_t in_kh_step,
//                       size_t in_kw_step, size_t relu, size_t relu6);
// Register arguments: x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width,
//                     x6: kernel_h, x7: kernel_w
// Stack arguments (all step values are byte offsets, used directly as address increments):
//                     out_h_step, block_channel, in_sh_step, in_sw_step, in_kh_step,
//                     in_kw_step, relu, relu6
ConvDwFp16Center:
    // AAPCS64: x19-x28 and the low 64 bits of v8-v15 are callee-saved. This
    // kernel clobbers x19-x24 AND v8-v15 (the 16-wide accumulator bank below),
    // so both sets are spilled to a proper frame kept ABOVE sp. (The previous
    // version post-indexed sp back to its entry value -- leaving the saves
    // below sp, which a signal handler may clobber -- and never preserved
    // d8-d15 at all, silently destroying the caller's FP state.)
    sub sp, sp, #112
    stp x19, x20, [sp]
    stp x21, x22, [sp, #16]
    stp x23, x24, [sp, #32]
    stp d8, d9, [sp, #48]
    stp d10, d11, [sp, #64]
    stp d12, d13, [sp, #80]
    stp d14, d15, [sp, #96]
    // Incoming stack arguments now sit above the 112-byte frame.
    ldr x8, [sp, #112]    // out_h_step (bytes)
    ldr x9, [sp, #120]    // block_channel (bytes)
    ldr x10, [sp, #128]   // in_sh_step (bytes)
    ldr x11, [sp, #136]   // in_sw_step (bytes)
    ldr x12, [sp, #144]   // in_kh_step (bytes)
    ldr x13, [sp, #152]   // in_kw_step (bytes)
    ldr x14, [sp, #160]   // relu flag
    ldr x15, [sp, #168]   // relu6 flag
    ld1 {v24.8h}, [x3]            // bias: seeds every accumulator
    movi v26.8h, #0x46, lsl #8    // fp16 6.0 (0x4600) for the relu6 upper clamp
    dup v27.4s, wzr               // all-zero vector for the relu lower clamp
    // NOTE(review): x18 is the reserved platform register on Apple/Windows
    // AArch64 targets -- confirm the supported target set before using it as
    // a scratch counter.
LoopH:
    mov x23, x1           // src row cursor
    mov x24, x5           // remaining output width
    mov x3, x0            // dst row cursor (x3 = bias ptr no longer needed)
    cmp x24, #8
    blt LoopW
    cmp x24, #16
    blt LoopW8
// 16 output pixels per iteration; accumulators v0-v15.
LoopW16:
    mov x19, #16
    mul x19, x19, x11     // src advance per 16-wide tile = 16 * in_sw_step
    mov x16, x23          // src_kh
    mov x17, x2           // weight cursor
    mov x20, x6           // kernel_h counter
    mov v0.16b, v24.16b
    mov v1.16b, v24.16b
    mov v2.16b, v24.16b
    mov v3.16b, v24.16b
    mov v4.16b, v24.16b
    mov v5.16b, v24.16b
    mov v6.16b, v24.16b
    mov v7.16b, v24.16b
    mov v8.16b, v24.16b
    mov v9.16b, v24.16b
    mov v10.16b, v24.16b
    mov v11.16b, v24.16b
    mov v12.16b, v24.16b
    mov v13.16b, v24.16b
    mov v14.16b, v24.16b
    mov v15.16b, v24.16b
LoopKh16:
    mov x18, x7           // kernel_w counter
    mov x21, x16          // src_kw
LoopKw16:
    mov x22, x21
    ld1 {v25.8h}, [x17], #16      // one weight vector shared by all 16 pixels
    ld1 {v16.8h}, [x22], x11
    ld1 {v17.8h}, [x22], x11
    fmla v0.8h, v16.8h, v25.8h
    fmla v1.8h, v17.8h, v25.8h
    ld1 {v18.8h}, [x22], x11
    ld1 {v19.8h}, [x22], x11
    fmla v2.8h, v18.8h, v25.8h
    fmla v3.8h, v19.8h, v25.8h
    ld1 {v20.8h}, [x22], x11
    ld1 {v21.8h}, [x22], x11
    fmla v4.8h, v20.8h, v25.8h
    fmla v5.8h, v21.8h, v25.8h
    ld1 {v22.8h}, [x22], x11
    ld1 {v23.8h}, [x22], x11
    fmla v6.8h, v22.8h, v25.8h
    fmla v7.8h, v23.8h, v25.8h
    ld1 {v16.8h}, [x22], x11
    ld1 {v17.8h}, [x22], x11
    fmla v8.8h, v16.8h, v25.8h
    fmla v9.8h, v17.8h, v25.8h
    ld1 {v18.8h}, [x22], x11
    ld1 {v19.8h}, [x22], x11
    fmla v10.8h, v18.8h, v25.8h
    fmla v11.8h, v19.8h, v25.8h
    ld1 {v20.8h}, [x22], x11
    ld1 {v21.8h}, [x22], x11
    fmla v12.8h, v20.8h, v25.8h
    fmla v13.8h, v21.8h, v25.8h
    ld1 {v22.8h}, [x22], x11
    ld1 {v23.8h}, [x22], x11
    fmla v14.8h, v22.8h, v25.8h
    fmla v15.8h, v23.8h, v25.8h
    subs x18, x18, #1
    add x21, x21, x13             // next kernel column: src += in_kw_step
    bne LoopKw16
    add x16, x16, x12             // next kernel row: src += in_kh_step
    subs x20, x20, #1
    bne LoopKh16
    cbnz x15, Relu616             // relu6 implies both clamps (falls through to Relu16)
    cbnz x14, Relu16
    b Write16
Relu616:
    fmin v0.8h, v0.8h, v26.8h
    fmin v1.8h, v1.8h, v26.8h
    fmin v2.8h, v2.8h, v26.8h
    fmin v3.8h, v3.8h, v26.8h
    fmin v4.8h, v4.8h, v26.8h
    fmin v5.8h, v5.8h, v26.8h
    fmin v6.8h, v6.8h, v26.8h
    fmin v7.8h, v7.8h, v26.8h
    fmin v8.8h, v8.8h, v26.8h
    fmin v9.8h, v9.8h, v26.8h
    fmin v10.8h, v10.8h, v26.8h
    fmin v11.8h, v11.8h, v26.8h
    fmin v12.8h, v12.8h, v26.8h
    fmin v13.8h, v13.8h, v26.8h
    fmin v14.8h, v14.8h, v26.8h
    fmin v15.8h, v15.8h, v26.8h
Relu16:
    fmax v0.8h, v0.8h, v27.8h
    fmax v1.8h, v1.8h, v27.8h
    fmax v2.8h, v2.8h, v27.8h
    fmax v3.8h, v3.8h, v27.8h
    fmax v4.8h, v4.8h, v27.8h
    fmax v5.8h, v5.8h, v27.8h
    fmax v6.8h, v6.8h, v27.8h
    fmax v7.8h, v7.8h, v27.8h
    fmax v8.8h, v8.8h, v27.8h
    fmax v9.8h, v9.8h, v27.8h
    fmax v10.8h, v10.8h, v27.8h
    fmax v11.8h, v11.8h, v27.8h
    fmax v12.8h, v12.8h, v27.8h
    fmax v13.8h, v13.8h, v27.8h
    fmax v14.8h, v14.8h, v27.8h
    fmax v15.8h, v15.8h, v27.8h
Write16:
    st1 {v0.8h}, [x3], x9         // dst advances by block_channel per pixel
    st1 {v1.8h}, [x3], x9
    st1 {v2.8h}, [x3], x9
    st1 {v3.8h}, [x3], x9
    st1 {v4.8h}, [x3], x9
    st1 {v5.8h}, [x3], x9
    st1 {v6.8h}, [x3], x9
    st1 {v7.8h}, [x3], x9
    st1 {v8.8h}, [x3], x9
    st1 {v9.8h}, [x3], x9
    st1 {v10.8h}, [x3], x9
    st1 {v11.8h}, [x3], x9
    st1 {v12.8h}, [x3], x9
    st1 {v13.8h}, [x3], x9
    st1 {v14.8h}, [x3], x9
    st1 {v15.8h}, [x3], x9
    add x23, x23, x19
    sub x24, x24, #16
    cmp x24, #0
    ble LoopWEnd
    cmp x24, #8
    blt LoopW
    cmp x24, #16
    bge LoopW16
// 8 output pixels per iteration; accumulators v0-v7 only.
LoopW8:
    mov x19, #8
    mul x19, x19, x11     // src advance per 8-wide tile = 8 * in_sw_step
    mov x16, x23
    mov x17, x2
    mov x20, x6
    mov v0.16b, v24.16b
    mov v1.16b, v24.16b
    mov v2.16b, v24.16b
    mov v3.16b, v24.16b
    mov v4.16b, v24.16b
    mov v5.16b, v24.16b
    mov v6.16b, v24.16b
    mov v7.16b, v24.16b
LoopKh8:
    mov x18, x7
    mov x21, x16
LoopKw8:
    mov x22, x21
    ld1 {v25.8h}, [x17], #16
    ld1 {v16.8h}, [x22], x11
    ld1 {v17.8h}, [x22], x11
    fmla v0.8h, v16.8h, v25.8h
    fmla v1.8h, v17.8h, v25.8h
    ld1 {v18.8h}, [x22], x11
    ld1 {v19.8h}, [x22], x11
    fmla v2.8h, v18.8h, v25.8h
    fmla v3.8h, v19.8h, v25.8h
    ld1 {v20.8h}, [x22], x11
    ld1 {v21.8h}, [x22], x11
    fmla v4.8h, v20.8h, v25.8h
    fmla v5.8h, v21.8h, v25.8h
    ld1 {v22.8h}, [x22], x11
    ld1 {v23.8h}, [x22], x11
    fmla v6.8h, v22.8h, v25.8h
    fmla v7.8h, v23.8h, v25.8h
    subs x18, x18, #1
    add x21, x21, x13
    bne LoopKw8
    add x16, x16, x12
    subs x20, x20, #1
    bne LoopKh8
    cbnz x15, Relu68
    cbnz x14, Relu8
    b Write8
Relu68:
    fmin v0.8h, v0.8h, v26.8h
    fmin v1.8h, v1.8h, v26.8h
    fmin v2.8h, v2.8h, v26.8h
    fmin v3.8h, v3.8h, v26.8h
    fmin v4.8h, v4.8h, v26.8h
    fmin v5.8h, v5.8h, v26.8h
    fmin v6.8h, v6.8h, v26.8h
    fmin v7.8h, v7.8h, v26.8h
Relu8:
    fmax v0.8h, v0.8h, v27.8h
    fmax v1.8h, v1.8h, v27.8h
    fmax v2.8h, v2.8h, v27.8h
    fmax v3.8h, v3.8h, v27.8h
    fmax v4.8h, v4.8h, v27.8h
    fmax v5.8h, v5.8h, v27.8h
    fmax v6.8h, v6.8h, v27.8h
    fmax v7.8h, v7.8h, v27.8h
Write8:
    st1 {v0.8h}, [x3], x9
    st1 {v1.8h}, [x3], x9
    st1 {v2.8h}, [x3], x9
    st1 {v3.8h}, [x3], x9
    st1 {v4.8h}, [x3], x9
    st1 {v5.8h}, [x3], x9
    st1 {v6.8h}, [x3], x9
    st1 {v7.8h}, [x3], x9
    add x23, x23, x19
    sub x24, x24, #8
    cmp x24, #0
    ble LoopWEnd
    cmp x24, #8
    bge LoopW8
// Scalar tail: one output pixel per iteration.
LoopW:
    mov x16, x23
    mov x17, x2
    mov x20, x6
    mov v0.16b, v24.16b
LoopKh:
    mov x18, x7
    mov x22, x16
LoopKw:
    ld1 {v16.8h}, [x22], x13
    ld1 {v25.8h}, [x17], #16
    fmla v0.8h, v16.8h, v25.8h
    subs x18, x18, #1
    bne LoopKw
    add x16, x16, x12
    subs x20, x20, #1
    bne LoopKh
    cbnz x15, Relu6
    cbnz x14, Relu
    b Write
Relu6:
    fmin v0.8h, v0.8h, v26.8h
Relu:
    fmax v0.8h, v0.8h, v27.8h
Write:
    st1 {v0.8h}, [x3], x9
    add x23, x23, x11
    subs x24, x24, #1
    bne LoopW
LoopWEnd:
    add x0, x0, x8        // next output row: dst += out_h_step
    add x1, x1, x10       // next input row: src += in_sh_step
    subs x4, x4, #1
    bne LoopH
    ldp x19, x20, [sp]
    ldp x21, x22, [sp, #16]
    ldp x23, x24, [sp, #32]
    ldp d8, d9, [sp, #48]
    ldp d10, d11, [sp, #64]
    ldp d12, d13, [sp, #80]
    ldp d14, d15, [sp, #96]
    add sp, sp, #112
    ret
#endif
| @@ -0,0 +1,64 @@ | |||
#ifdef __aarch64__
    .text
    .align 5
    .global DeconvDwFp16Center
#ifndef __APPLE__
    .type DeconvDwFp16Center, %function
#endif

// void DeconvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width,
//                         size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,
//                         size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
// Register arguments: x0: dst, x1: src, x2: weight, x3: height, x4: width,
//                     x5: kernel_h, x6: kernel_w, x7: out_h_step
// Stack arguments (all step values are byte offsets, used directly as address increments):
//                     block_channel, in_sh_step, in_sw_step, in_kh_step, in_kw_step
DeconvDwFp16Center:
    // AAPCS64: x19-x22 are callee-saved, so they are spilled to a frame kept
    // ABOVE sp for the whole function. (The previous prologue post-indexed sp
    // back to its entry value, leaving the saved registers BELOW sp where a
    // signal handler may legally clobber them -- AArch64 has no red zone.)
    // v8-v15 are not touched, so no FP spills are needed.
    sub sp, sp, #32
    stp x19, x20, [sp]
    stp x21, x22, [sp, #16]
    // Incoming stack arguments now sit above the 32-byte frame.
    ldr x8, [sp, #32]     // block_channel (bytes)
    ldr x9, [sp, #40]     // in_sh_step (bytes)
    ldr x10, [sp, #48]    // in_sw_step (bytes)
    ldr x11, [sp, #56]    // in_kh_step (bytes)
    ldr x12, [sp, #64]    // in_kw_step (bytes)
    // NOTE(review): x18 is the reserved platform register on Apple/Windows
    // AArch64 targets -- confirm the supported target set.
LoopH:
    mov x15, x0           // dst_w: output row cursor
    mov x16, x1           // src_w: input row cursor
    mov x17, x4           // width counter
LoopW:
    mov x18, x15          // dst_kh
    mov x19, x2           // weight_kh: weights restart every output pixel
    mov x20, x5           // kernel_h counter
    // Load 8 fp16 input channels once per output pixel; the post-index
    // writeback advances src by block_channel.
    ld1 {v1.8h}, [x16], x8
LoopKh:
    mov x21, x18          // dst_kw
    mov x13, x6           // kernel_w counter
LoopKw:
    ld1 {v0.8h}, [x21]            // read-modify-write scatter accumulate
    ld1 {v2.8h}, [x19], #16       // 8 fp16 weights
    fmla v0.8h, v1.8h, v2.8h
    st1 {v0.8h}, [x21], x12       // stride in_kw_step
    subs x13, x13, #1
    bne LoopKw
    add x18, x18, x11             // next kernel row: dst += in_kh_step
    subs x20, x20, #1
    bne LoopKh
    add x15, x15, x10             // next output column: dst += in_sw_step
    subs x17, x17, #1
    bne LoopW
    add x0, x0, x9                // next output row: dst += in_sh_step
    add x1, x1, x7                // next input row: src += out_h_step
    subs x3, x3, #1
    bne LoopH
    ldp x19, x20, [sp]
    ldp x21, x22, [sp, #16]
    add sp, sp, #32
    ret
#endif
| @@ -0,0 +1,44 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP16_COMMON_FUNC_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP16_COMMON_FUNC_H_ | |||
| #include <stdint.h> | |||
| #include <stdio.h> | |||
| #include <string.h> | |||
| #include "src/runtime/kernel/arm/opclib/op_base.h" | |||
| #include "src/runtime/kernel/arm/opclib/conv_parameter.h" | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| #endif | |||
| #ifdef ENABLE_ARM64 | |||
| void ConvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, | |||
| size_t height, size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, | |||
| size_t block_channel, size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, | |||
| size_t in_kw_step, size_t relu, size_t relu6); | |||
| void DeconvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width, | |||
| size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, | |||
| size_t in_sw_step, size_t in_kh_step, size_t in_kw_step); | |||
| #endif | |||
| #ifdef __cplusplus | |||
| } | |||
| #endif | |||
| #endif /* MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_COMMON_FUNC_H_ */ | |||
| @@ -16,6 +16,7 @@ | |||
| #include "src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.h" | |||
| #include <arm_neon.h> | |||
| #include "src/runtime/kernel/arm/opclib/fp16/common_func.h" | |||
| /*conv depthwise fp16 begin*/ | |||
| void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, | |||
| @@ -79,6 +80,7 @@ void DepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float16_t * | |||
| } // height loop | |||
| } | |||
| #ifndef ENABLE_ARM64 | |||
| void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, | |||
| int height, int width, int kernel_h, int kernel_w, int out_h_step, int block_channel, | |||
| int in_sh_step, int in_sw_step, int in_kh_step, int in_kw_step, bool is_relu, bool is_relu6) { | |||
| @@ -97,12 +99,17 @@ void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t * | |||
| const float16_t *src_kw = src_kh; | |||
| const float16_t *weight_kw = weight_kh; | |||
| for (int kw = 0; kw < kernel_w; kw++) { | |||
| #ifdef ENABLE_ARM64 | |||
| float16x8_t src_8 = vld1q_f16(src_kw); | |||
| float16x8_t weight_8 = vld1q_f16(weight_kw); | |||
| float16x8_t dst_8 = vld1q_f16(dst_w); | |||
| dst_8 = vfmaq_f16(dst_8, src_8, weight_8); | |||
| vst1q_f16(dst_w, dst_8); | |||
| #else | |||
| for (int c = 0; c < C8NUM; c++) { | |||
| dst_w[c] += src_kw[c] * weight_kw[c]; | |||
| } | |||
| #endif | |||
| src_kw += in_kw_step; | |||
| weight_kw += C8NUM; | |||
| } // kernel_w loop | |||
| @@ -122,6 +129,7 @@ void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t * | |||
| src_h += in_sh_step; | |||
| } // dst_height loop | |||
| } | |||
| #endif | |||
| // conv depthwise fp16: sliding window | |||
| void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data, | |||
| @@ -149,11 +157,19 @@ void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const flo | |||
| int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_; | |||
| const float16_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_; | |||
| float16_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; | |||
| #ifdef ENABLE_ARM64 | |||
| ConvDwFp16Center(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, | |||
| sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_, | |||
| sliding->out_h_step_ * sizeof(float16_t), sliding->block_channel_ * sizeof(float16_t), | |||
| sliding->in_sh_step_ * sizeof(float16_t), sliding->in_sw_step_ * sizeof(float16_t), | |||
| sliding->in_kh_step_ * sizeof(float16_t), sliding->in_kw_step_ * sizeof(float16_t), | |||
| conv_param->is_relu_, conv_param->is_relu6_); | |||
| #else | |||
| DepthwiseCenterFp16(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, | |||
| sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_, | |||
| sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_, sliding->in_sw_step_, | |||
| sliding->in_kh_step_, sliding->in_kw_step_, conv_param->is_relu_, conv_param->is_relu6_); | |||
| #endif | |||
| } | |||
| } // output C8 loop | |||
| src += sliding->in_step_; | |||
| @@ -214,6 +230,7 @@ void DeconvDepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float | |||
| } // height loop | |||
| } | |||
| #ifndef ENABLE_ARM64 | |||
| void DeconvDepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t *weight, int height, int width, | |||
| int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step, | |||
| int in_sw_step, int in_kh_step, int in_kw_step) { | |||
| @@ -229,12 +246,17 @@ void DeconvDepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float | |||
| float16_t *dst_kw = dst_kh; | |||
| const float16_t *weight_kw = weight_kh; | |||
| for (int kw = 0; kw < kernel_w; kw++) { | |||
| #ifdef ENABLE_ARM64 | |||
| float16x8_t src_8 = vld1q_f16(src_w); | |||
| float16x8_t weight_8 = vld1q_f16(weight_kw); | |||
| float16x8_t dst_8 = vld1q_f16(dst_kw); | |||
| dst_8 = vfmaq_f16(dst_8, src_8, weight_8); | |||
| vst1q_f16(dst_kw, dst_8); | |||
| #else | |||
| for (int c = 0; c < C8NUM; c++) { | |||
| dst_kw[c] += src_w[c] * weight_kw[c]; | |||
| } | |||
| #endif | |||
| dst_kw += in_kw_step; | |||
| weight_kw += C8NUM; | |||
| } // kernel_w loop | |||
| @@ -248,6 +270,7 @@ void DeconvDepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float | |||
| src_h += out_h_step; | |||
| } // dst_height loop | |||
| } | |||
| #endif | |||
| void DeconvDepthwisePostFuncFp16(float16_t *dst, const float16_t *bias, int block_channel, | |||
| const ConvParameter *conv_param) { | |||
| @@ -289,11 +312,18 @@ void DeconvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const f | |||
| float16_t *out_t = dst_data + oh_h_start * sliding->in_h_step_ + oh_w_start * sliding->block_channel_; | |||
| const float16_t *in_t = | |||
| src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; | |||
| #ifdef ENABLE_ARM64 | |||
| DeconvDwFp16Center(out_t, in_t, weight, sliding->bottom_ - sliding->top_, | |||
| sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_, | |||
| sliding->out_h_step_ * sizeof(float16_t), sliding->block_channel_ * sizeof(float16_t), | |||
| sliding->in_sh_step_ * sizeof(float16_t), sliding->in_sw_step_ * sizeof(float16_t), | |||
| sliding->in_kh_step_ * sizeof(float16_t), sliding->in_kw_step_ * sizeof(float16_t)); | |||
| #else | |||
| DeconvDepthwiseCenterFp16(out_t, in_t, weight, sliding->bottom_ - sliding->top_, | |||
| sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_, | |||
| sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_, | |||
| sliding->in_sw_step_, sliding->in_kh_step_, sliding->in_kw_step_); | |||
| #endif | |||
| } | |||
| DeconvDepthwisePostFuncFp16(dst_data, bias, sliding->block_channel_, conv_param); | |||
| } // output C8 loop | |||
| @@ -38,6 +38,15 @@ void MatrixSub(const float *a_ptr, const float *b_ptr, float *dst, size_t a_stri | |||
| void MatrixMultiAdd(float *c11, float *c12, float *c21, float *c22, float *x_ptr, size_t row, size_t col, | |||
| size_t c_stride, size_t x_stride); | |||
| #ifdef ENABLE_ARM | |||
| void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, | |||
| size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, | |||
| size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6); | |||
| void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width, | |||
| size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, | |||
| size_t in_sw_step, size_t in_kh_step, size_t in_kw_step); | |||
| #endif | |||
| #ifdef ENABLE_ARM64 | |||
| void BiasAdd(const float *bias, float *data, size_t oc4, size_t plan_size); | |||
| void BiasAddRelu6(const float *bias, float *data, size_t oc4, size_t plan_size); | |||
| @@ -49,12 +58,6 @@ void C4BiasAddRelu(float *dst, const float *input, const float* bias, size_t oc, | |||
| void C4BiasAddRelu6(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride); | |||
| void C4Relu(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride); | |||
| void C4Relu6(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride); | |||
| void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, | |||
| size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, | |||
| size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6); | |||
| void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width, | |||
| size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, | |||
| size_t in_sw_step, size_t in_kh_step, size_t in_kw_step); | |||
| #endif | |||
| #ifdef __cplusplus | |||
| @@ -0,0 +1,62 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_COMMON_FUNC_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_COMMON_FUNC_H_ | |||
| #include <stdint.h> | |||
| #include <stdio.h> | |||
| #include <string.h> | |||
| #include "src/runtime/kernel/arm/opclib/op_base.h" | |||
| #include "src/runtime/kernel/arm/opclib/conv_parameter.h" | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| #endif | |||
| #ifdef ENABLE_ARM | |||
| void IndirectGemmInt16to32_8x4(int32_t *dst, const int16_t *src, const int16_t *weight, size_t ksize, size_t ic8, | |||
| size_t oc4, size_t offset); | |||
| #ifdef ENABLE_ARM64 | |||
| void IndirectGemmInt8_4x4(int8_t *output, const int8_t *input, const int8_t *weight, const int32_t *bias, size_t ksize, | |||
| size_t ic4, size_t oc, size_t offset, const int32_t *input_sum, size_t act_min, | |||
| size_t act_max, size_t out_zp, size_t out_multiplier, size_t shift_before, | |||
| size_t shift_after); | |||
| #elif defined(ENABLE_ARM32) | |||
| void IndirectGemmInt8_2x4(int8_t *output, const int8_t *input, const int8_t *weight, const int32_t *bias, size_t ksize, | |||
| size_t ic4, size_t oc, size_t offset, const int32_t *input_sum, size_t act_min, | |||
| size_t act_max, size_t out_zp, size_t out_multiplier, size_t shift_before, | |||
| size_t shift_after); | |||
| #endif | |||
| #endif | |||
| #ifdef ENABLE_ARM | |||
| void DeconvDwInt8Center(int32_t *dst, const int16_t *src, const int16_t *weight, size_t height, size_t width, | |||
| size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, | |||
| size_t in_sw_step, size_t in_kh_step, size_t in_kw_step); | |||
| void ConvDwInt8Center(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, size_t height, | |||
| size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, | |||
| size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, int out_multiplier, | |||
| int left_shift, int right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max); | |||
| #endif | |||
| #ifdef __cplusplus | |||
| } | |||
| #endif | |||
| #endif /* MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_COMMON_FUNC_H_ */ | |||
| @@ -17,6 +17,7 @@ | |||
| #include "src/runtime/kernel/arm/opclib/int8/conv_depthwise_int8.h" | |||
| #include <string.h> | |||
| #include "src/runtime/kernel/arm/opclib/quantization/fixed_point.h" | |||
| #include "src/runtime/kernel/arm/opclib/int8/common_func.h" | |||
| /*conv depthwise int8 begin*/ | |||
| void DepthwiseBorderPixelInt8(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, int height, | |||
| @@ -85,6 +86,7 @@ void DepthwiseBorderInt8(int8_t *dst, const int16_t *src, const int16_t *weight, | |||
| } // height loop | |||
| } | |||
| #ifndef ENABLE_ARM64 | |||
| void DepthwiseCenterInt8(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, int height, | |||
| int width, int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step, | |||
| int in_sw_step, int in_kh_step, int in_kw_step, int out_multiplier, int left_shift, | |||
| @@ -133,6 +135,7 @@ void DepthwiseCenterInt8(int8_t *dst, const int16_t *src, const int16_t *weight, | |||
| src_h += in_sh_step; | |||
| } // dst_height loop | |||
| } | |||
| #endif | |||
| void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data, | |||
| const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id) { | |||
| @@ -158,7 +161,17 @@ void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *w | |||
| int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_; | |||
| const int16_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * C4NUM; | |||
| int8_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * C4NUM; | |||
| #ifdef ENABLE_ARM64 | |||
| ConvDwInt8Center( | |||
| out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_, | |||
| conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_ * sizeof(int8_t), | |||
| sliding->block_channel_ * sizeof(int8_t), sliding->in_sh_step_ * sizeof(int16_t), | |||
| sliding->in_sw_step_ * sizeof(int16_t), sliding->in_kh_step_ * sizeof(int16_t), | |||
| sliding->in_kw_step_ * sizeof(int16_t), conv_param->conv_quant_arg_.quant_multiplier_[0], | |||
| conv_param->conv_quant_arg_.left_shift_[0], conv_param->conv_quant_arg_.right_shift_[0], | |||
| conv_param->conv_quant_arg_.quant_args_[2][0].zp_, conv_param->conv_quant_arg_.out_act_min_[0], | |||
| conv_param->conv_quant_arg_.out_act_max_[0]); | |||
| #else | |||
| DepthwiseCenterInt8( | |||
| out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_, | |||
| conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_, sliding->block_channel_, | |||
| @@ -166,6 +179,7 @@ void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *w | |||
| conv_param->conv_quant_arg_.quant_multiplier_[0], conv_param->conv_quant_arg_.left_shift_[0], | |||
| conv_param->conv_quant_arg_.right_shift_[0], conv_param->conv_quant_arg_.quant_args_[2][0].zp_, | |||
| conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0]); | |||
| #endif | |||
| } | |||
| } // output C4 loop | |||
| src += sliding->in_step_; | |||
| @@ -222,6 +236,7 @@ void DeconvDepthwiseBorderInt8(int32_t *dst, const int16_t *src, const int16_t * | |||
| } // height loop | |||
| } | |||
| #ifndef ENABLE_ARM64 | |||
| void DeconvDepthwiseCenterInt8(int32_t *dst, const int16_t *src, const int16_t *weight, int height, int width, | |||
| int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step, | |||
| int in_sw_step, int in_kh_step, int in_kw_step) { | |||
| @@ -253,6 +268,7 @@ void DeconvDepthwiseCenterInt8(int32_t *dst, const int16_t *src, const int16_t * | |||
| src_h += out_h_step; | |||
| } // dst_height loop | |||
| } | |||
| #endif | |||
| void DeconvDepthwisePostFuncInt8(int8_t *dst, int32_t *output_buffer, const int32_t *bias, int block_channel, | |||
| const ConvParameter *conv_param, int out_multiplier, int left_shift, int right_shift, | |||
| @@ -302,11 +318,18 @@ void DeconvDwInt8(int8_t *output_data, int32_t *output_buffer, const int16_t *in | |||
| int32_t *out_t = output_buffer + oh_h_start * sliding->in_h_step_ + oh_w_start * C4NUM; | |||
| const int16_t *in_t = | |||
| src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; | |||
| #ifdef ENABLE_ARM64 | |||
| DeconvDwInt8Center(out_t, in_t, weight, sliding->bottom_ - sliding->top_, | |||
| sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_, | |||
| sliding->out_h_step_ * sizeof(int16_t), sliding->block_channel_ * sizeof(int16_t), | |||
| sliding->in_sh_step_ * sizeof(int32_t), sliding->in_sw_step_ * sizeof(int32_t), | |||
| sliding->in_kh_step_ * sizeof(int32_t), sliding->in_kw_step_ * sizeof(int32_t)); | |||
| #else | |||
| DeconvDepthwiseCenterInt8(out_t, in_t, weight, sliding->bottom_ - sliding->top_, | |||
| sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_, | |||
| sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_, | |||
| sliding->in_sw_step_, sliding->in_kh_step_, sliding->in_kw_step_); | |||
| #endif | |||
| } | |||
| DeconvDepthwisePostFuncInt8( | |||
| dst_data, output_buffer, bias, sliding->block_channel_, conv_param, | |||
| @@ -17,25 +17,7 @@ | |||
| #include "src/runtime/kernel/arm/opclib/int8/conv_int8.h" | |||
| #include <string.h> | |||
| #include "src/runtime/kernel/arm/opclib/winograd_transform.h" | |||
| extern "C" { | |||
| #ifdef ENABLE_ARM | |||
| void IndirectGemmInt16to32_8x4(int32_t *dst, const int16_t *src, const int16_t *weight, size_t ksize, size_t ic8, | |||
| size_t oc4, size_t offset); | |||
| #ifdef ENABLE_ARM64 | |||
| void IndirectGemmInt8_4x4(int8_t *output, const int8_t *input, const int8_t *weight, const int32_t *bias, size_t ksize, | |||
| size_t ic4, size_t oc, size_t offset, const int32_t *input_sum, size_t act_min, | |||
| size_t act_max, size_t out_zp, size_t out_multiplier, size_t shift_before, | |||
| size_t shift_after); | |||
| #elif defined(ENABLE_ARM32) | |||
| void IndirectGemmInt8_2x4(int8_t *output, const int8_t *input, const int8_t *weight, const int32_t *bias, size_t ksize, | |||
| size_t ic4, size_t oc, size_t offset, const int32_t *input_sum, size_t act_min, | |||
| size_t act_max, size_t out_zp, size_t out_multiplier, size_t shift_before, | |||
| size_t shift_after); | |||
| #endif | |||
| #endif | |||
| } | |||
| #include "src/runtime/kernel/arm/opclib/int8/common_func.h" | |||
| void IndirectGemmInt8(int8_t *dst, int32_t *tmp_dst, const int8_t *src, const int8_t *weight, const int32_t *bias, | |||
| int ic4, size_t kernel_plane, size_t output_channel, const int32_t *input_sum, | |||