Merge pull request !6960 from lixian/master
tags/v1.1.0
| @@ -39,7 +39,8 @@ if (PLATFORM_ARM64) | |||
| # assembly | |||
| file(GLOB ASSEMBLY_SRC | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/assembly/arm64/MatmulFp32OptRemain.S | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/assembly/arm64/MatmulFp32Opt.S) | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/assembly/arm64/MatmulFp32Opt.S | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/assembly/arm64/MatmulFp32.S) | |||
| set_property(SOURCE ${ASSEMBLY_SRC} PROPERTY LANGUAGE C) | |||
| set(KERNEL_SRC ${KERNEL_SRC} ${ASSEMBLY_SRC}) | |||
| add_library(mslite_internal SHARED ${CCSRC} ${KERNEL_SRC} ${TRAIN_KERNEL_SRC}) | |||
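For reference, the kernels touched by this change are exported as plain C-callable functions; below is a minimal header sketch of the two matmul entry points, copied from the prototype comments further down in the diff (the exact header placement is an assumption of this sketch, not part of the change):

#include <stddef.h>

/* arm32 fp32 matmul kernel added in this change (prototype as commented in the new .S file) */
void MatmulFloatNeon32(const float *a, const float *b, float *c, const float *bias, int act_type,
                       int depth, int row, int col, size_t stride, size_t writeNhwc, size_t WriteWino);

/* arm64 fp32 matmul kernel updated in this change (prototype as commented in MatmulFp32.S) */
void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type,
                       int depth, int row, int col, size_t stride, size_t writeNhwc, size_t WriteWino);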
| @@ -1,302 +0,0 @@ | |||
| #ifdef __arm__ | |||
| #ifndef __aarch64__ | |||
| .text | |||
| .align 5 | |||
| .global IndirectGemmFp32_8x4 | |||
| #ifndef __APPLE__ | |||
| .type IndirectGemmFp32_8x4, %function | |||
| #endif | |||
| // void IndirectGemmFp32_8x4(float *output, float *input, float *weight, float *bias, | |||
| // size_t kSize, size_t ic4, size_t oc8, size_t offset, size_t mode, size_t writeC4, size_t relu, size_t relu6); | |||
| // r0: output, r1: input, r2: weight, r3: bias, r4: kSize, r5: ic4, r6: oc, r7: offset | |||
| // r8: mode; writeMode ([sp, #20]), relu ([sp, #24]) and relu6 ([sp, #28]) are loaded into r10 on demand | |||
| // mode = 0 for general convolution, where one conv unit is a row | |||
| // mode = 1 for winograd/common gemm, where the total channels of one input is a row | |||
| IndirectGemmFp32_8x4: | |||
| .macro INIT_BIAS | |||
| veor q8, q8, q8 | |||
| cmp r3, #0 | |||
| beq InitBias | |||
| vld1.32 {q8}, [r3] | |||
| InitBias: | |||
| vmov q9, q8 | |||
| vmov q10, q8 | |||
| vmov q11, q8 | |||
| vmov q12, q8 | |||
| vmov q13, q8 | |||
| vmov q14, q8 | |||
| vmov q15, q8 | |||
| .endm | |||
| // at return, clang generates "push {lr}", "pop {pc}" while gcc generates "bx lr" | |||
| // according to https://stackoverflow.com/questions/53625807 | |||
| // even if we jumped to the link register instead of saving it, we would still have to save it across subroutine calls anyway | |||
| // clang's rule seems simpler, though there are no subroutine calls here | |||
| // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf | |||
| push {r4-r8, r10, r11, lr} | |||
| vpush {q4-q7} | |||
| add sp, sp, #96 | |||
| ldr r4, [sp] | |||
| ldr r5, [sp, #4] | |||
| ldr r6, [sp, #8] | |||
| ldr r7, [sp, #12] | |||
| ldr r8, [sp, #16] | |||
| cmp r8, #0 | |||
| bne LoopOc | |||
| // step is one for common convolution, where ic4 is multiplied by the kernel size | |||
| // step is (a+b-1) for F(a,b) in winograd | |||
| mul r5, r4, r5 | |||
| mov r4, #1 | |||
| LoopOc: | |||
| mov r8, r4 | |||
| mov r12, r1 | |||
| LoopKsize: | |||
| mov r11, r0 | |||
| INIT_BIAS | |||
| // load input for output 1-2 | |||
| vld1.32 {q0, q1}, [r12]! | |||
| vld1.32 {q2, q3}, [r12]! | |||
| // load weight | |||
| vld1.32 {q4, q5}, [r2]! | |||
| // step for output 1-2 | |||
| vmla.f32 q8, q4, d0[0] | |||
| vmla.f32 q9, q4, d2[0] | |||
| vmla.f32 q8, q5, d0[1] | |||
| vmla.f32 q9, q5, d2[1] | |||
| vld1.32 {q6, q7}, [r2]! | |||
| subs r10, r5, #1 | |||
| beq LoopIcEnd | |||
| LoopIc: | |||
| vmla.f32 q8, q6, d1[0] | |||
| vmla.f32 q9, q6, d3[0] | |||
| vmla.f32 q8, q7, d1[1] | |||
| vmla.f32 q9, q7, d3[1] | |||
| vmla.f32 q10, q4, d4[0] | |||
| vmla.f32 q11, q4, d6[0] | |||
| vmla.f32 q10, q5, d4[1] | |||
| vmla.f32 q11, q5, d6[1] | |||
| vld1.s32 {q0, q1}, [r12]! | |||
| vmla.f32 q10, q6, d5[0] | |||
| vmla.f32 q11, q6, d7[0] | |||
| vmla.f32 q10, q7, d5[1] | |||
| vmla.f32 q11, q7, d7[1] | |||
| vld1.s32 {q2, q3}, [r12]! | |||
| vmla.f32 q12, q4, d0[0] | |||
| vmla.f32 q13, q4, d2[0] | |||
| vmla.f32 q12, q5, d0[1] | |||
| vmla.f32 q13, q5, d2[1] | |||
| vmla.f32 q14, q4, d4[0] | |||
| vmla.f32 q15, q4, d6[0] | |||
| vmla.f32 q14, q5, d4[1] | |||
| vmla.f32 q15, q5, d6[1] | |||
| vld1.s32 {q4, q5}, [r2]! | |||
| vmla.f32 q12, q6, d1[0] | |||
| vmla.f32 q13, q6, d3[0] | |||
| vmla.f32 q12, q7, d1[1] | |||
| vmla.f32 q13, q7, d3[1] | |||
| vld1.s32 {q0, q1}, [r12]! | |||
| vmla.f32 q14, q6, d5[0] | |||
| vmla.f32 q15, q6, d7[0] | |||
| vmla.f32 q14, q7, d5[1] | |||
| vmla.f32 q15, q7, d7[1] | |||
| vld1.s32 {q6, q7}, [r2]! | |||
| vmla.f32 q8, q4, d0[0] | |||
| vmla.f32 q9, q4, d2[0] | |||
| vmla.f32 q8, q5, d0[1] | |||
| vmla.f32 q9, q5, d2[1] | |||
| vld1.s32 {q2, q3}, [r12]! | |||
| subs r10, r10, #1 | |||
| bne LoopIc | |||
| LoopIcEnd: | |||
| vmla.f32 q8, q6, d1[0] | |||
| vmla.f32 q9, q6, d3[0] | |||
| vmla.f32 q8, q7, d1[1] | |||
| vmla.f32 q9, q7, d3[1] | |||
| vmla.f32 q10, q4, d4[0] | |||
| vmla.f32 q11, q4, d6[0] | |||
| vmla.f32 q10, q5, d4[1] | |||
| vmla.f32 q11, q5, d6[1] | |||
| vld1.s32 {q0, q1}, [r12]! | |||
| vmla.f32 q10, q6, d5[0] | |||
| vmla.f32 q11, q6, d7[0] | |||
| vmla.f32 q10, q7, d5[1] | |||
| vmla.f32 q11, q7, d7[1] | |||
| vld1.s32 {q2, q3}, [r12]! | |||
| vmla.f32 q12, q4, d0[0] | |||
| vmla.f32 q13, q4, d2[0] | |||
| vmla.f32 q12, q5, d0[1] | |||
| vmla.f32 q13, q5, d2[1] | |||
| vmla.f32 q14, q4, d4[0] | |||
| vmla.f32 q15, q4, d6[0] | |||
| vmla.f32 q14, q5, d4[1] | |||
| vmla.f32 q15, q5, d6[1] | |||
| vmla.f32 q12, q6, d1[0] | |||
| vmla.f32 q13, q6, d3[0] | |||
| vmla.f32 q12, q7, d1[1] | |||
| vmla.f32 q13, q7, d3[1] | |||
| vmla.f32 q14, q6, d5[0] | |||
| vmla.f32 q15, q6, d7[0] | |||
| vmla.f32 q14, q7, d5[1] | |||
| vmla.f32 q15, q7, d7[1] | |||
| ldr r10, [sp, #28] | |||
| cmp r10, #0 | |||
| bne Relu6 | |||
| ldr r10, [sp, #24] | |||
| cmp r10, #0 | |||
| bne Relu | |||
| b WriteStart | |||
| Relu6: | |||
| vmov.i32 q7, #6 | |||
| vcvt.f32.s32 q7, q7 | |||
| vmin.f32 q8, q8, q7 | |||
| vmin.f32 q9, q9, q7 | |||
| vmin.f32 q10, q10, q7 | |||
| vmin.f32 q11, q11, q7 | |||
| vmin.f32 q12, q12, q7 | |||
| vmin.f32 q13, q13, q7 | |||
| vmin.f32 q14, q14, q7 | |||
| vmin.f32 q15, q15, q7 | |||
| Relu: | |||
| veor q7, q7, q7 | |||
| vmax.f32 q8, q8, q7 | |||
| vmax.f32 q9, q9, q7 | |||
| vmax.f32 q10, q10, q7 | |||
| vmax.f32 q11, q11, q7 | |||
| vmax.f32 q12, q12, q7 | |||
| vmax.f32 q13, q13, q7 | |||
| vmax.f32 q14, q14, q7 | |||
| vmax.f32 q15, q15, q7 | |||
| WriteStart: | |||
| ldr r10, [sp, #20] | |||
| cmp r10, #0 | |||
| bne Write4 | |||
| cmp r6, #1 | |||
| beq Write1 | |||
| cmp r6, #2 | |||
| beq Write2 | |||
| cmp r6, #3 | |||
| beq Write3 | |||
| b Write4 | |||
| Write1: | |||
| vst1.32 d16[0], [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d18[0], [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d20[0], [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d22[0], [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d24[0], [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d26[0], [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d28[0], [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d30[0], [r11] | |||
| add r11, r11, r7 | |||
| add r0, r0, #4 | |||
| b WriteEnd | |||
| Write2: | |||
| vst1.32 d16, [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d18, [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d20, [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d22, [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d24, [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d26, [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d28, [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d30, [r11] | |||
| add r11, r11, r7 | |||
| add r0, r0, #8 | |||
| b WriteEnd | |||
| Write3: | |||
| add lr, r11, #8 | |||
| vst1.32 d16, [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d17[0], [lr] | |||
| add lr, lr, r7 | |||
| vst1.32 d18, [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d19[0], [lr] | |||
| add lr, lr, r7 | |||
| vst1.32 d20, [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d21[0], [lr] | |||
| add lr, lr, r7 | |||
| vst1.32 d22, [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d23[0], [lr] | |||
| add lr, lr, r7 | |||
| vst1.32 d24, [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d25[0], [lr] | |||
| add lr, lr, r7 | |||
| vst1.32 d26, [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d27[0], [lr] | |||
| add lr, lr, r7 | |||
| vst1.32 d28, [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d29[0], [lr] | |||
| add lr, lr, r7 | |||
| vst1.32 d30, [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d31[0], [lr] | |||
| add lr, lr, r7 | |||
| add r0, r0, #12 | |||
| b WriteEnd | |||
| Write4: | |||
| // prefetching is not preferred while writing results, despite cache misses | |||
| // you could try pld | |||
| // there are almost no benefits observed though | |||
| vst1.32 {q8}, [r11], r7 | |||
| vst1.32 {q9}, [r11], r7 | |||
| vst1.32 {q10}, [r11], r7 | |||
| vst1.32 {q11}, [r11], r7 | |||
| vst1.32 {q12}, [r11], r7 | |||
| vst1.32 {q13}, [r11], r7 | |||
| vst1.32 {q14}, [r11], r7 | |||
| vst1.32 {q15}, [r11], r7 | |||
| add r0, r0, #16 | |||
| WriteEnd: | |||
| subs r8, r8, #1 | |||
| bne LoopKsize | |||
| cmp r6, #4 | |||
| ble LoopOcEnd | |||
| sub r6, r6, #4 | |||
| cmp r3, #0 | |||
| beq NoStepFowrard | |||
| add r3, r3, #16 | |||
| NoStepFowrard: | |||
| b LoopOc | |||
| LoopOcEnd: | |||
| sub sp, sp, #96 | |||
| vpop {q4-q7} | |||
| pop {r4-r8, r10, r11, pc} | |||
| #endif | |||
| #endif | |||
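The file removed above implemented IndirectGemmFp32_8x4 for armv7 NEON. As a rough scalar picture of what that kernel accumulated (the write-out variants, partial-oc handling and relu/relu6 clamping are elided, and the packed layouts are inferred from the register comments, so treat this as a sketch rather than the nnacl reference code):

#include <stddef.h>

/* Assumed packing per (kSize x ic4) step: in[8 output pixels][4 input channels],
 * w[4 input channels][4 output channels]; offset is the byte stride between output rows. */
static void IndirectGemmFp32_8x4_sketch(float *output, const float *input, const float *weight,
                                        const float *bias, size_t kSize, size_t ic4, size_t offset) {
  float acc[8][4];
  for (int r = 0; r < 8; ++r)
    for (int oc = 0; oc < 4; ++oc) acc[r][oc] = bias ? bias[oc] : 0.0f;
  for (size_t k = 0; k < kSize; ++k) {
    for (size_t blk = 0; blk < ic4; ++blk) {
      const float *in = input + (k * ic4 + blk) * 8 * 4;
      const float *w = weight + (k * ic4 + blk) * 4 * 4;
      for (int r = 0; r < 8; ++r)
        for (int ic = 0; ic < 4; ++ic)
          for (int oc = 0; oc < 4; ++oc) acc[r][oc] += in[r * 4 + ic] * w[ic * 4 + oc];
    }
  }
  for (int r = 0; r < 8; ++r) {
    float *out = (float *)((char *)output + r * offset);
    for (int oc = 0; oc < 4; ++oc) out[oc] = acc[r][oc];
  }
}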
| @@ -0,0 +1,368 @@ | |||
| #ifdef ENABLE_ARM32 | |||
| .text | |||
| .align 5 | |||
| .global MatmulFloatNeon32 | |||
| #ifndef __APPLE__ | |||
| .type MatmulFloatNeon32, %function | |||
| #endif | |||
| // void MatmulFloatNeon32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, | |||
| // int row, int col, size_t stride, size_t writeNhwc, size_t WriteWino) | |||
| // r0: a | |||
| // r1: b | |||
| // r2: c | |||
| // r3: bias | |||
| // r4: act_type | |||
| // r5: depth | |||
| // r6: row | |||
| // r7: col | |||
| // r8: stride | |||
| // lr: writeNhwc/writeWino | |||
| MatmulFloatNeon32: | |||
| // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf | |||
| push {r0-r8, r10, r11, lr} | |||
| add sp, sp, #48 | |||
| ldr r5, [sp, #4] | |||
| ldr r7, [sp, #12] | |||
| ldr r8, [sp, #16] | |||
| mov lr, #32 // sizeof(float) * 8 | |||
| mul r12, r5, lr // block stride of lhs/rhs: sizeof(float) * 8 * depth | |||
| ldr lr, [sp, #24] | |||
| cmp lr, #0 | |||
| beq NoWinoSteps | |||
| mov lr, #4 | |||
| mul r11, r7, r8 // stride * col * sizeof(float) | |||
| mul r11, r11, lr | |||
| mov lr, #32 | |||
| mul r10, r8, lr // stride * 8 * sizeof(float) | |||
| NoWinoSteps: | |||
| mov lr, #4 | |||
| mul r8, r8, lr // stride * sizeof(float) | |||
| LoopCol: | |||
| ldr r6, [sp, #8] // reload lhs row | |||
| ldr r0, [sp, #-48] // reload lhs ptr | |||
| ldr r2, [sp, #-40] // reload dst ptr | |||
| LoopRow: | |||
| ldr r1, [sp, #-44] // reload rhs ptr | |||
| ldr r5, [sp, #4] // reload depth | |||
| veor q8, q8, q8 | |||
| veor q9, q9, q9 | |||
| veor q10, q10, q10 | |||
| veor q11, q11, q11 | |||
| veor q12, q12, q12 | |||
| veor q13, q13, q13 | |||
| veor q14, q14, q14 | |||
| veor q15, q15, q15 | |||
| LoopDepth: | |||
| vld1.32 {q0}, [r0]! | |||
| vld1.32 {q1, q2}, [r1]! | |||
| vmla.f32 q8, q1, d0[0] | |||
| vmla.f32 q9, q2, d0[0] | |||
| vmla.f32 q10, q1, d0[1] | |||
| vmla.f32 q11, q2, d0[1] | |||
| vmla.f32 q12, q1, d1[0] | |||
| vmla.f32 q13, q2, d1[0] | |||
| vmla.f32 q14, q1, d1[1] | |||
| vmla.f32 q15, q2, d1[1] | |||
| subs r5, r5, #1 | |||
| bne LoopDepth | |||
| Bias: | |||
| cmp r3, #0 | |||
| beq Activation | |||
| vld1.32 {q0}, [r3]! | |||
| vld1.32 {q1}, [r3] | |||
| sub r3, r3, #16 | |||
| vadd.f32 q8, q8, q0 | |||
| vadd.f32 q9, q9, q1 | |||
| vadd.f32 q10, q10, q0 | |||
| vadd.f32 q11, q11, q1 | |||
| vadd.f32 q12, q12, q0 | |||
| vadd.f32 q13, q13, q1 | |||
| vadd.f32 q14, q14, q0 | |||
| vadd.f32 q15, q15, q1 | |||
| Activation: | |||
| ldr lr, [sp] | |||
| cmp lr, #2 | |||
| beq Relu6 | |||
| cmp lr, #1 | |||
| beq Relu | |||
| b Write | |||
| Relu6: | |||
| vmov.i32 q2, #6 | |||
| vcvt.f32.s32 q2, q2 | |||
| vmin.f32 q8, q8, q2 | |||
| vmin.f32 q9, q9, q2 | |||
| vmin.f32 q10, q10, q2 | |||
| vmin.f32 q11, q11, q2 | |||
| vmin.f32 q12, q12, q2 | |||
| vmin.f32 q13, q13, q2 | |||
| vmin.f32 q14, q14, q2 | |||
| vmin.f32 q15, q15, q2 | |||
| Relu: | |||
| veor q3, q3, q3 | |||
| vmax.f32 q8, q8, q3 | |||
| vmax.f32 q9, q9, q3 | |||
| vmax.f32 q10, q10, q3 | |||
| vmax.f32 q11, q11, q3 | |||
| vmax.f32 q12, q12, q3 | |||
| vmax.f32 q13, q13, q3 | |||
| vmax.f32 q14, q14, q3 | |||
| vmax.f32 q15, q15, q3 | |||
| Write: | |||
| ldr lr, [sp, #24] | |||
| cmp lr, #0 | |||
| bne WriteWino | |||
| ldr lr, [sp, #20] | |||
| cmp lr, #0 | |||
| beq WriteC8 | |||
| cmp r7, #1 | |||
| beq Write1 | |||
| cmp r7, #2 | |||
| beq Write2 | |||
| cmp r7, #3 | |||
| beq Write3 | |||
| cmp r7, #4 | |||
| beq Write4 | |||
| cmp r7, #5 | |||
| beq Write5 | |||
| cmp r7, #6 | |||
| beq Write6 | |||
| cmp r7, #7 | |||
| beq Write7 | |||
| b Write8 | |||
| Write1: | |||
| vst1.32 d16[0], [r2] | |||
| cmp r6, #1 | |||
| beq WriteEnd | |||
| add r2, r2, r8 | |||
| vst1.32 d20[0], [r2] | |||
| cmp r6, #2 | |||
| beq WriteEnd | |||
| add r2, r2, r8 | |||
| vst1.32 d24[0], [r2] | |||
| cmp r6, #3 | |||
| beq WriteEnd | |||
| add r2, r2, r8 | |||
| vst1.32 d28[0], [r2] | |||
| add r2, r2, r8 | |||
| b WriteEnd | |||
| Write2: | |||
| vst1.32 d16, [r2] | |||
| cmp r6, #1 | |||
| beq WriteEnd | |||
| add r2, r2, r8 | |||
| vst1.32 d20, [r2] | |||
| cmp r6, #2 | |||
| beq WriteEnd | |||
| add r2, r2, r8 | |||
| vst1.32 d24, [r2] | |||
| cmp r6, #3 | |||
| beq WriteEnd | |||
| add r2, r2, r8 | |||
| vst1.32 d28, [r2] | |||
| add r2, r2, r8 | |||
| b WriteEnd | |||
| Write3: | |||
| add r4, r2, #8 | |||
| vst1.32 d16, [r2] | |||
| vst1.32 d17[0], [r4] | |||
| cmp r6, #1 | |||
| beq WriteEnd | |||
| add r2, r2, r8 | |||
| add r4, r4, r8 | |||
| vst1.32 d20, [r2] | |||
| vst1.32 d21[0], [r4] | |||
| cmp r6, #2 | |||
| beq WriteEnd | |||
| add r2, r2, r8 | |||
| add r4, r4, r8 | |||
| vst1.32 d24, [r2] | |||
| vst1.32 d25[0], [r4] | |||
| cmp r6, #3 | |||
| beq WriteEnd | |||
| add r2, r2, r8 | |||
| add r4, r4, r8 | |||
| vst1.32 d28, [r2] | |||
| vst1.32 d29[0], [r4] | |||
| add r2, r2, r8 | |||
| b WriteEnd | |||
| Write4: | |||
| vst1.32 q8, [r2] | |||
| cmp r6, #1 | |||
| beq WriteEnd | |||
| add r2, r2, r8 | |||
| vst1.32 q10, [r2] | |||
| cmp r6, #2 | |||
| beq WriteEnd | |||
| add r2, r2, r8 | |||
| vst1.32 q12, [r2] | |||
| cmp r6, #3 | |||
| beq WriteEnd | |||
| add r2, r2, r8 | |||
| vst1.32 q14, [r2] | |||
| add r2, r2, r8 | |||
| b WriteEnd | |||
| Write5: | |||
| add r4, r2, #16 | |||
| vst1.32 q8, [r2] | |||
| vst1.32 d18[0], [r4] | |||
| cmp r6, #1 | |||
| beq WriteEnd | |||
| add r2, r2, r8 | |||
| add r4, r4, r8 | |||
| vst1.32 q10, [r2] | |||
| vst1.32 d22[0], [r4] | |||
| cmp r6, #2 | |||
| beq WriteEnd | |||
| add r2, r2, r8 | |||
| add r4, r4, r8 | |||
| vst1.32 q12, [r2] | |||
| vst1.32 d26[0], [r4] | |||
| cmp r6, #3 | |||
| beq WriteEnd | |||
| add r2, r2, r8 | |||
| add r4, r4, r8 | |||
| vst1.32 q14, [r2] | |||
| vst1.32 d30[0], [r4] | |||
| add r2, r2, r8 | |||
| b WriteEnd | |||
| Write6: | |||
| add r4, r2, #16 | |||
| vst1.32 q8, [r2] | |||
| vst1.32 d18, [r4] | |||
| cmp r6, #1 | |||
| beq WriteEnd | |||
| add r2, r2, r8 | |||
| add r4, r4, r8 | |||
| vst1.32 q10, [r2] | |||
| vst1.32 d22, [r4] | |||
| cmp r6, #2 | |||
| beq WriteEnd | |||
| add r2, r2, r8 | |||
| add r4, r4, r8 | |||
| vst1.32 q12, [r2] | |||
| vst1.32 d26, [r4] | |||
| cmp r6, #3 | |||
| beq WriteEnd | |||
| add r2, r2, r8 | |||
| add r4, r4, r8 | |||
| vst1.32 q14, [r2] | |||
| vst1.32 d30, [r4] | |||
| add r2, r2, r8 | |||
| b WriteEnd | |||
| Write7: | |||
| add lr, r2, #24 | |||
| add r4, r2, #16 | |||
| vst1.32 q8, [r2] | |||
| vst1.32 d18, [r4] | |||
| vst1.32 d19[0], [lr] | |||
| cmp r6, #1 | |||
| beq WriteEnd | |||
| add r2, r2, r8 | |||
| add r4, r4, r8 | |||
| add lr, lr, r8 | |||
| vst1.32 q10, [r2] | |||
| vst1.32 d22, [r4] | |||
| vst1.32 d23[0], [lr] | |||
| cmp r6, #2 | |||
| beq WriteEnd | |||
| add r2, r2, r8 | |||
| add r4, r4, r8 | |||
| add lr, lr, r8 | |||
| vst1.32 q12, [r2] | |||
| vst1.32 d26, [r4] | |||
| vst1.32 d27[0], [lr] | |||
| cmp r6, #3 | |||
| beq WriteEnd | |||
| add r2, r2, r8 | |||
| add r4, r4, r8 | |||
| add lr, lr, r8 | |||
| vst1.32 q14, [r2] | |||
| vst1.32 d30, [r4] | |||
| vst1.32 d31[0], [lr] | |||
| add r2, r2, r8 | |||
| b WriteEnd | |||
| WriteC8: | |||
| vst1.32 {q8, q9}, [r2]! | |||
| vst1.32 {q10, q11}, [r2]! | |||
| vst1.32 {q12, q13}, [r2]! | |||
| vst1.32 {q14, q15}, [r2]! | |||
| str r2, [sp, #-40] | |||
| b WriteEnd | |||
| WriteWino: | |||
| vst1.32 {q8, q9}, [r2] | |||
| add r2, r2, r11 | |||
| vst1.32 {q10, q11}, [r2] | |||
| add r2, r2, r11 | |||
| vst1.32 {q12, q13}, [r2] | |||
| add r2, r2, r11 | |||
| vst1.32 {q14, q15}, [r2] | |||
| add r2, r2, r11 | |||
| b WriteEnd | |||
| Write8: | |||
| vst1.32 {q8, q9}, [r2] | |||
| cmp r6, #1 | |||
| beq WriteEnd | |||
| add r2, r2, r8 | |||
| vst1.32 {q10, q11}, [r2] | |||
| cmp r6, #2 | |||
| beq WriteEnd | |||
| add r2, r2, r8 | |||
| vst1.32 {q12, q13}, [r2] | |||
| cmp r6, #3 | |||
| beq WriteEnd | |||
| add r2, r2, r8 | |||
| vst1.32 {q14, q15}, [r2] | |||
| add r2, r2, r8 | |||
| WriteEnd: | |||
| cmp r6, #4 | |||
| ble LoopRowEnd | |||
| sub r6, r6, #4 // lhs row - 4 | |||
| b LoopRow | |||
| LoopRowEnd: | |||
| ldr r1, [sp, #-44] | |||
| add r1, r1, r12 // rhs ptr + stride | |||
| str r1, [sp, #-44] | |||
| cmp r3, #0 | |||
| beq NoBiasStep | |||
| add r3, r3, #32 // bias ptr + stride | |||
| NoBiasStep: | |||
| ldr lr, [sp, #24] | |||
| cmp lr, #0 | |||
| bne WinoDstStep | |||
| ldr lr, [sp, #20] | |||
| cmp lr, #0 | |||
| beq NoDstStep | |||
| ldr r2, [sp, #-40] | |||
| add r2, r2, #32 // dst ptr + stride | |||
| str r2, [sp, #-40] | |||
| b NoDstStep | |||
| WinoDstStep: | |||
| ldr r2, [sp, #-40] | |||
| add r2, r2, r10 | |||
| str r2, [sp, #-40] | |||
| NoDstStep: | |||
| cmp r7, #8 | |||
| ble LoopColEnd | |||
| sub r7, r7, #8 // rhs col - 8 | |||
| b LoopCol | |||
| LoopColEnd: | |||
| sub sp, sp, #48 | |||
| pop {r0-r8, r10, r11, pc} | |||
| #endif | |||
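The new MatmulFloatNeon32 above keeps a 4-row by 8-column accumulator tile in q8-q15 and consumes 4 lhs floats and 8 rhs floats per depth step. A scalar sketch of one tile follows, assuming the 4-row / 8-column block packing implied by LoopDepth (the remainder writes Write1-Write7, the C8 output layout and the Winograd destination stepping are elided):

#include <stddef.h>

/* act_type mapping as read from the Activation block: 2 -> relu6, 1 -> relu, otherwise none.
 * Relu6 falls through into Relu in the assembly, so relu6 clamps to [0, 6]. */
static void MatmulTile4x8_sketch(const float *a, const float *b, float *c, const float *bias,
                                 int act_type, int depth, size_t stride /* bytes per output row */) {
  float acc[4][8] = {{0.0f}};
  for (int d = 0; d < depth; ++d)
    for (int r = 0; r < 4; ++r)
      for (int i = 0; i < 8; ++i) acc[r][i] += a[d * 4 + r] * b[d * 8 + i];
  for (int r = 0; r < 4; ++r) {
    float *out = (float *)((char *)c + r * stride);
    for (int i = 0; i < 8; ++i) {
      float v = acc[r][i] + (bias ? bias[i] : 0.0f);
      if (act_type == 2 && v > 6.0f) v = 6.0f;
      if ((act_type == 1 || act_type == 2) && v < 0.0f) v = 0.0f;
      out[i] = v;
    }
  }
}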
| @@ -1,730 +0,0 @@ | |||
| #ifdef __aarch64__ | |||
| .text | |||
| .align 5 | |||
| .global IndirectGemmFp32_8x8 | |||
| #ifndef __APPLE__ | |||
| .type IndirectGemmFp32_8x8, %function | |||
| #endif | |||
| // void IndirectGemmFp32_8x8(float *output, float *input, float *weight, float *bias, | |||
| // size_t kSize, size_t ic4, size_t oc8, size_t offset, size_t mode, size_t writeC4, size_t relu, size_t relu6); | |||
| // x0: output, x1: input, x2: weight, x3: bias, x4: kSize, x5: ic4, x6: oc, x7: offset | |||
| // x8:mode, x9: writeMode, x10: relu, x11:relu6 | |||
| // mode = 0 for general convolution, where one conv unit is a row | |||
| // mode = 1 for winograd/common gemm, where the total channels of one input is a row | |||
| IndirectGemmFp32_8x8: | |||
| .macro INIT_BIAS | |||
| dup v16.4s, wzr | |||
| dup v17.4s, wzr | |||
| cbz x3, InitBias | |||
| ld1 {v16.4s, v17.4s}, [x3] | |||
| InitBias: | |||
| mov v18.16b, v16.16b | |||
| mov v19.16b, v17.16b | |||
| mov v20.16b, v16.16b | |||
| mov v21.16b, v17.16b | |||
| mov v22.16b, v16.16b | |||
| mov v23.16b, v17.16b | |||
| mov v24.16b, v16.16b | |||
| mov v25.16b, v17.16b | |||
| mov v26.16b, v16.16b | |||
| mov v27.16b, v17.16b | |||
| mov v28.16b, v16.16b | |||
| mov v29.16b, v17.16b | |||
| mov v30.16b, v16.16b | |||
| mov v31.16b, v17.16b | |||
| .endm | |||
| .macro INIT_BIAS_HALF | |||
| dup v16.4s, wzr | |||
| cbz x3, InitBiasHalf | |||
| ld1 {v16.4s}, [x3] | |||
| InitBiasHalf: | |||
| mov v18.16b, v16.16b | |||
| mov v20.16b, v16.16b | |||
| mov v22.16b, v16.16b | |||
| mov v24.16b, v16.16b | |||
| mov v26.16b, v16.16b | |||
| mov v28.16b, v16.16b | |||
| mov v30.16b, v16.16b | |||
| .endm | |||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | |||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | |||
| // r19 ~ r29 should also be preserved | |||
| // whereas our coding style does not permit such an amount of parameters | |||
| sub sp, sp, #128 | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| ldr x8, [sp, #0] | |||
| ldr x9, [sp, #8] | |||
| ldr x10, [sp, #16] | |||
| ldr x11, [sp, #24] | |||
| cbnz x8, NoStepShuffle | |||
| // step is one for common convolution, where ic4 is multiplied by the kernel size | |||
| // step is (a+b-1) for F(a,b) in winograd | |||
| mul x5, x4, x5 | |||
| mov x4, #1 | |||
| NoStepShuffle: | |||
| // x8 is used to store offset now | |||
| // only useful for WriteC4 | |||
| mov x8, #16 | |||
| mul x8, x8, x4 | |||
| IndirectGemmStart: | |||
| cmp x6, #4 | |||
| ble LoopOcHalf | |||
| LoopOc: | |||
| mov x14, x4 | |||
| mov x12, x1 | |||
| LoopKsize: | |||
| mov x15, x0 | |||
| INIT_BIAS | |||
| // load input for output 1-2 | |||
| ld1 {v0.4s, v1.4s}, [x12], #32 | |||
| // load weight | |||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x2], #64 | |||
| // step for output 1-2 | |||
| fmla v16.4s, v8.4s, v0.s[0] | |||
| fmla v17.4s, v9.4s, v0.s[0] | |||
| fmla v18.4s, v8.4s, v1.s[0] | |||
| fmla v19.4s, v9.4s, v1.s[0] | |||
| // load input for output 3-4 | |||
| ld1 {v2.4s, v3.4s}, [x12], #32 | |||
| // another step for output 1-2 | |||
| fmla v16.4s, v10.4s, v0.s[1] | |||
| fmla v17.4s, v11.4s, v0.s[1] | |||
| fmla v18.4s, v10.4s, v1.s[1] | |||
| fmla v19.4s, v11.4s, v1.s[1] | |||
| // load input for output 5-8 | |||
| // input cache should be refreshed after loading | |||
| // ATTENTION: advancing is preferred, but advancing too much may lead to invalid prefetching | |||
| ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x12], #64 | |||
| // step for output 3-8 | |||
| fmla v20.4s, v8.4s, v2.s[0] | |||
| fmla v21.4s, v9.4s, v2.s[0] | |||
| fmla v22.4s, v8.4s, v3.s[0] | |||
| fmla v23.4s, v9.4s, v3.s[0] | |||
| subs x13, x5, #1 | |||
| beq LoopIcEnd | |||
| LoopIc: | |||
| fmla v24.4s, v8.4s, v4.s[0] | |||
| fmla v25.4s, v9.4s, v4.s[0] | |||
| fmla v26.4s, v8.4s, v5.s[0] | |||
| fmla v27.4s, v9.4s, v5.s[0] | |||
| fmla v28.4s, v8.4s, v6.s[0] | |||
| fmla v29.4s, v9.4s, v6.s[0] | |||
| fmla v30.4s, v8.4s, v7.s[0] | |||
| fmla v31.4s, v9.4s, v7.s[0] | |||
| // load weight | |||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x2], #64 | |||
| // step for output 3-8 | |||
| fmla v20.4s, v10.4s, v2.s[1] | |||
| fmla v21.4s, v11.4s, v2.s[1] | |||
| fmla v22.4s, v10.4s, v3.s[1] | |||
| fmla v23.4s, v11.4s, v3.s[1] | |||
| fmla v24.4s, v10.4s, v4.s[1] | |||
| fmla v25.4s, v11.4s, v4.s[1] | |||
| fmla v26.4s, v10.4s, v5.s[1] | |||
| fmla v27.4s, v11.4s, v5.s[1] | |||
| fmla v28.4s, v10.4s, v6.s[1] | |||
| fmla v29.4s, v11.4s, v6.s[1] | |||
| fmla v30.4s, v10.4s, v7.s[1] | |||
| fmla v31.4s, v11.4s, v7.s[1] | |||
| // another step for output 1-8 | |||
| fmla v16.4s, v12.4s, v0.s[2] | |||
| fmla v17.4s, v13.4s, v0.s[2] | |||
| fmla v18.4s, v12.4s, v1.s[2] | |||
| fmla v19.4s, v13.4s, v1.s[2] | |||
| fmla v20.4s, v12.4s, v2.s[2] | |||
| fmla v21.4s, v13.4s, v2.s[2] | |||
| fmla v22.4s, v12.4s, v3.s[2] | |||
| fmla v23.4s, v13.4s, v3.s[2] | |||
| fmla v24.4s, v12.4s, v4.s[2] | |||
| fmla v25.4s, v13.4s, v4.s[2] | |||
| fmla v26.4s, v12.4s, v5.s[2] | |||
| fmla v27.4s, v13.4s, v5.s[2] | |||
| fmla v28.4s, v12.4s, v6.s[2] | |||
| fmla v29.4s, v13.4s, v6.s[2] | |||
| fmla v30.4s, v12.4s, v7.s[2] | |||
| fmla v31.4s, v13.4s, v7.s[2] | |||
| // load weight | |||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x2], #64 | |||
| // another step for output 1-8 | |||
| fmla v16.4s, v14.4s, v0.s[3] | |||
| fmla v17.4s, v15.4s, v0.s[3] | |||
| fmla v18.4s, v14.4s, v1.s[3] | |||
| fmla v19.4s, v15.4s, v1.s[3] | |||
| fmla v20.4s, v14.4s, v2.s[3] | |||
| fmla v21.4s, v15.4s, v2.s[3] | |||
| fmla v22.4s, v14.4s, v3.s[3] | |||
| fmla v23.4s, v15.4s, v3.s[3] | |||
| // load input for output 1-4 | |||
| ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x12], #64 | |||
| fmla v24.4s, v14.4s, v4.s[3] | |||
| fmla v25.4s, v15.4s, v4.s[3] | |||
| fmla v26.4s, v14.4s, v5.s[3] | |||
| fmla v27.4s, v15.4s, v5.s[3] | |||
| fmla v28.4s, v14.4s, v6.s[3] | |||
| fmla v29.4s, v15.4s, v6.s[3] | |||
| fmla v30.4s, v14.4s, v7.s[3] | |||
| fmla v31.4s, v15.4s, v7.s[3] | |||
| // load input for output 5-8 | |||
| ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x12], #64 | |||
| // step for output 1-8 | |||
| fmla v16.4s, v8.4s, v0.s[0] | |||
| fmla v17.4s, v9.4s, v0.s[0] | |||
| fmla v18.4s, v8.4s, v1.s[0] | |||
| fmla v19.4s, v9.4s, v1.s[0] | |||
| fmla v16.4s, v10.4s, v0.s[1] | |||
| fmla v17.4s, v11.4s, v0.s[1] | |||
| fmla v18.4s, v10.4s, v1.s[1] | |||
| fmla v19.4s, v11.4s, v1.s[1] | |||
| fmla v20.4s, v8.4s, v2.s[0] | |||
| fmla v21.4s, v9.4s, v2.s[0] | |||
| fmla v22.4s, v8.4s, v3.s[0] | |||
| fmla v23.4s, v9.4s, v3.s[0] | |||
| subs x13, x13, #1 | |||
| bne LoopIc | |||
| LoopIcEnd: | |||
| fmla v24.4s, v8.4s, v4.s[0] | |||
| fmla v25.4s, v9.4s, v4.s[0] | |||
| fmla v26.4s, v8.4s, v5.s[0] | |||
| fmla v27.4s, v9.4s, v5.s[0] | |||
| fmla v28.4s, v8.4s, v6.s[0] | |||
| fmla v29.4s, v9.4s, v6.s[0] | |||
| fmla v30.4s, v8.4s, v7.s[0] | |||
| fmla v31.4s, v9.4s, v7.s[0] | |||
| // load weight | |||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x2], #64 | |||
| // step for output 3-8 | |||
| fmla v20.4s, v10.4s, v2.s[1] | |||
| fmla v21.4s, v11.4s, v2.s[1] | |||
| fmla v22.4s, v10.4s, v3.s[1] | |||
| fmla v23.4s, v11.4s, v3.s[1] | |||
| fmla v24.4s, v10.4s, v4.s[1] | |||
| fmla v25.4s, v11.4s, v4.s[1] | |||
| fmla v26.4s, v10.4s, v5.s[1] | |||
| fmla v27.4s, v11.4s, v5.s[1] | |||
| fmla v28.4s, v10.4s, v6.s[1] | |||
| fmla v29.4s, v11.4s, v6.s[1] | |||
| fmla v30.4s, v10.4s, v7.s[1] | |||
| fmla v31.4s, v11.4s, v7.s[1] | |||
| // another step for output 1-8 | |||
| fmla v16.4s, v12.4s, v0.s[2] | |||
| fmla v17.4s, v13.4s, v0.s[2] | |||
| fmla v18.4s, v12.4s, v1.s[2] | |||
| fmla v19.4s, v13.4s, v1.s[2] | |||
| fmla v20.4s, v12.4s, v2.s[2] | |||
| fmla v21.4s, v13.4s, v2.s[2] | |||
| fmla v22.4s, v12.4s, v3.s[2] | |||
| fmla v23.4s, v13.4s, v3.s[2] | |||
| fmla v24.4s, v12.4s, v4.s[2] | |||
| fmla v25.4s, v13.4s, v4.s[2] | |||
| fmla v26.4s, v12.4s, v5.s[2] | |||
| fmla v27.4s, v13.4s, v5.s[2] | |||
| fmla v28.4s, v12.4s, v6.s[2] | |||
| fmla v29.4s, v13.4s, v6.s[2] | |||
| fmla v30.4s, v12.4s, v7.s[2] | |||
| fmla v31.4s, v13.4s, v7.s[2] | |||
| // another step for output 1-8 | |||
| fmla v16.4s, v14.4s, v0.s[3] | |||
| fmla v17.4s, v15.4s, v0.s[3] | |||
| fmla v18.4s, v14.4s, v1.s[3] | |||
| fmla v19.4s, v15.4s, v1.s[3] | |||
| fmla v20.4s, v14.4s, v2.s[3] | |||
| fmla v21.4s, v15.4s, v2.s[3] | |||
| fmla v22.4s, v14.4s, v3.s[3] | |||
| fmla v23.4s, v15.4s, v3.s[3] | |||
| fmla v24.4s, v14.4s, v4.s[3] | |||
| fmla v25.4s, v15.4s, v4.s[3] | |||
| fmla v26.4s, v14.4s, v5.s[3] | |||
| fmla v27.4s, v15.4s, v5.s[3] | |||
| fmla v28.4s, v14.4s, v6.s[3] | |||
| fmla v29.4s, v15.4s, v6.s[3] | |||
| fmla v30.4s, v14.4s, v7.s[3] | |||
| fmla v31.4s, v15.4s, v7.s[3] | |||
| // prefetching is not preferred while writing results, despite cache misses | |||
| // you could try prfm pstl2strm | |||
| // there are almost no benefits observed though | |||
| cbnz x11, Relu6 | |||
| cbnz x10, Relu | |||
| b WriteStart | |||
| Relu6: | |||
| movi v1.4s, #6 | |||
| scvtf v1.4s, v1.4s | |||
| fmin v16.4s, v16.4s, v1.4s | |||
| fmin v17.4s, v17.4s, v1.4s | |||
| fmin v18.4s, v18.4s, v1.4s | |||
| fmin v19.4s, v19.4s, v1.4s | |||
| fmin v20.4s, v20.4s, v1.4s | |||
| fmin v21.4s, v21.4s, v1.4s | |||
| fmin v22.4s, v22.4s, v1.4s | |||
| fmin v23.4s, v23.4s, v1.4s | |||
| fmin v24.4s, v24.4s, v1.4s | |||
| fmin v25.4s, v25.4s, v1.4s | |||
| fmin v26.4s, v26.4s, v1.4s | |||
| fmin v27.4s, v27.4s, v1.4s | |||
| fmin v28.4s, v28.4s, v1.4s | |||
| fmin v29.4s, v29.4s, v1.4s | |||
| fmin v30.4s, v30.4s, v1.4s | |||
| fmin v31.4s, v31.4s, v1.4s | |||
| Relu: | |||
| dup v0.4s, wzr | |||
| fmax v16.4s, v16.4s, v0.4s | |||
| fmax v17.4s, v17.4s, v0.4s | |||
| fmax v18.4s, v18.4s, v0.4s | |||
| fmax v19.4s, v19.4s, v0.4s | |||
| fmax v20.4s, v20.4s, v0.4s | |||
| fmax v21.4s, v21.4s, v0.4s | |||
| fmax v22.4s, v22.4s, v0.4s | |||
| fmax v23.4s, v23.4s, v0.4s | |||
| fmax v24.4s, v24.4s, v0.4s | |||
| fmax v25.4s, v25.4s, v0.4s | |||
| fmax v26.4s, v26.4s, v0.4s | |||
| fmax v27.4s, v27.4s, v0.4s | |||
| fmax v28.4s, v28.4s, v0.4s | |||
| fmax v29.4s, v29.4s, v0.4s | |||
| fmax v30.4s, v30.4s, v0.4s | |||
| fmax v31.4s, v31.4s, v0.4s | |||
| WriteStart: | |||
| cbnz x9, WriteC4 | |||
| cmp x6, #5 | |||
| beq Write5 | |||
| cmp x6, #6 | |||
| beq Write6 | |||
| cmp x6, #7 | |||
| beq Write7 | |||
| b Write8 | |||
| Write5: | |||
| add x17, x15, #16 | |||
| st1 {v16.4s}, [x15], x7 | |||
| str s17, [x17] | |||
| add x17, x17, x7 | |||
| st1 {v18.4s}, [x15], x7 | |||
| str s19, [x17] | |||
| add x17, x17, x7 | |||
| st1 {v20.4s}, [x15], x7 | |||
| str s21, [x17] | |||
| add x17, x17, x7 | |||
| st1 {v22.4s}, [x15], x7 | |||
| str s23, [x17] | |||
| add x17, x17, x7 | |||
| st1 {v24.4s}, [x15], x7 | |||
| str s25, [x17] | |||
| add x17, x17, x7 | |||
| st1 {v26.4s}, [x15], x7 | |||
| str s27, [x17] | |||
| add x17, x17, x7 | |||
| st1 {v28.4s}, [x15], x7 | |||
| str s29, [x17] | |||
| add x17, x17, x7 | |||
| st1 {v30.4s}, [x15] | |||
| str s31, [x17] | |||
| add x0, x0, #20 | |||
| b WriteEnd | |||
| Write6: | |||
| add x17, x15, #16 | |||
| st1 {v16.4s}, [x15], x7 | |||
| dup s16, v17.s[1] | |||
| stp s17, s16, [x17] | |||
| add x17, x17, x7 | |||
| st1 {v18.4s}, [x15], x7 | |||
| dup s18, v19.s[1] | |||
| stp s19, s18, [x17] | |||
| add x17, x17, x7 | |||
| st1 {v20.4s}, [x15], x7 | |||
| dup s20, v21.s[1] | |||
| stp s21, s20, [x17] | |||
| add x17, x17, x7 | |||
| st1 {v22.4s}, [x15], x7 | |||
| dup s22, v23.s[1] | |||
| stp s23, s22, [x17] | |||
| add x17, x17, x7 | |||
| st1 {v24.4s}, [x15], x7 | |||
| dup s24, v25.s[1] | |||
| stp s25, s24, [x17] | |||
| add x17, x17, x7 | |||
| st1 {v26.4s}, [x15], x7 | |||
| dup s26, v27.s[1] | |||
| stp s27, s26, [x17] | |||
| add x17, x17, x7 | |||
| st1 {v28.4s}, [x15], x7 | |||
| dup s28, v29.s[1] | |||
| stp s29, s28, [x17] | |||
| add x17, x17, x7 | |||
| st1 {v30.4s}, [x15] | |||
| dup s30, v31.s[1] | |||
| stp s31, s30, [x17] | |||
| add x0, x0, #24 | |||
| b WriteEnd | |||
| Write7: | |||
| add x17, x15, #16 | |||
| add x16, x15, #24 | |||
| st1 {v16.4s}, [x15], x7 | |||
| dup s16, v17.s[1] | |||
| stp s17, s16, [x17] | |||
| add x17, x17, x7 | |||
| st1 {v17.s}[2], [x16], x7 | |||
| st1 {v18.4s}, [x15], x7 | |||
| dup s18, v19.s[1] | |||
| stp s19, s18, [x17] | |||
| add x17, x17, x7 | |||
| st1 {v19.s}[2], [x16], x7 | |||
| st1 {v20.4s}, [x15], x7 | |||
| dup s20, v21.s[1] | |||
| stp s21, s20, [x17] | |||
| add x17, x17, x7 | |||
| st1 {v21.s}[2], [x16], x7 | |||
| st1 {v22.4s}, [x15], x7 | |||
| dup s22, v23.s[1] | |||
| stp s23, s22, [x17] | |||
| add x17, x17, x7 | |||
| st1 {v23.s}[2], [x16], x7 | |||
| st1 {v24.4s}, [x15], x7 | |||
| dup s24, v25.s[1] | |||
| stp s25, s24, [x17] | |||
| add x17, x17, x7 | |||
| st1 {v25.s}[2], [x16], x7 | |||
| st1 {v26.4s}, [x15], x7 | |||
| dup s26, v27.s[1] | |||
| stp s27, s26, [x17] | |||
| add x17, x17, x7 | |||
| st1 {v27.s}[2], [x16], x7 | |||
| st1 {v28.4s}, [x15], x7 | |||
| dup s28, v29.s[1] | |||
| stp s29, s28, [x17] | |||
| add x17, x17, x7 | |||
| st1 {v29.s}[2], [x16], x7 | |||
| st1 {v30.4s}, [x15], x7 | |||
| dup s30, v31.s[1] | |||
| stp s31, s30, [x17] | |||
| add x17, x17, x7 | |||
| st1 {v31.s}[2], [x16], x7 | |||
| add x0, x0, #28 | |||
| b WriteEnd | |||
| WriteC4: | |||
| st1 {v16.4s}, [x15], x7 | |||
| st1 {v18.4s}, [x15], x7 | |||
| st1 {v20.4s}, [x15], x7 | |||
| st1 {v22.4s}, [x15], x7 | |||
| st1 {v24.4s}, [x15], x7 | |||
| st1 {v26.4s}, [x15], x7 | |||
| st1 {v28.4s}, [x15], x7 | |||
| st1 {v30.4s}, [x15] | |||
| add x15, x8, x0 | |||
| st1 {v17.4s}, [x15], x7 | |||
| st1 {v19.4s}, [x15], x7 | |||
| st1 {v21.4s}, [x15], x7 | |||
| st1 {v23.4s}, [x15], x7 | |||
| st1 {v25.4s}, [x15], x7 | |||
| st1 {v27.4s}, [x15], x7 | |||
| st1 {v29.4s}, [x15], x7 | |||
| st1 {v31.4s}, [x15] | |||
| add x0, x0, #16 | |||
| b WriteEnd | |||
| Write8: | |||
| st1 {v16.4s, v17.4s}, [x15], x7 | |||
| st1 {v18.4s, v19.4s}, [x15], x7 | |||
| st1 {v20.4s, v21.4s}, [x15], x7 | |||
| st1 {v22.4s, v23.4s}, [x15], x7 | |||
| st1 {v24.4s, v25.4s}, [x15], x7 | |||
| st1 {v26.4s, v27.4s}, [x15], x7 | |||
| st1 {v28.4s, v29.4s}, [x15], x7 | |||
| st1 {v30.4s, v31.4s}, [x15] | |||
| add x0, x0, #32 | |||
| WriteEnd: | |||
| subs x14, x14, #1 | |||
| bne LoopKsize | |||
| subs x6, x6, #8 | |||
| ble LoopOcEnd | |||
| cbz x9, NoStepC4Block | |||
| add x0, x0, x8 | |||
| NoStepC4Block: | |||
| cbz x3, NoStepForward | |||
| add x3, x3, #32 | |||
| NoStepForward: | |||
| cmp x6, #4 | |||
| bgt LoopOc | |||
| LoopOcHalf: | |||
| mov x18, #32 | |||
| mov x14, x4 | |||
| mov x12, x1 | |||
| LoopKsizeHalf: | |||
| mov x15, x0 | |||
| INIT_BIAS_HALF | |||
| // load input for output 1-2 | |||
| ld1 {v0.4s, v1.4s}, [x12], #32 | |||
| // load weight | |||
| ld1 {v8.4s}, [x2], x18 | |||
| ld1 {v10.4s}, [x2], x18 | |||
| // step for output 1-2 | |||
| fmla v16.4s, v8.4s, v0.s[0] | |||
| fmla v18.4s, v8.4s, v1.s[0] | |||
| // load input for output 3-4 | |||
| ld1 {v2.4s, v3.4s}, [x12], #32 | |||
| // another step for output 1-2 | |||
| fmla v16.4s, v10.4s, v0.s[1] | |||
| fmla v18.4s, v10.4s, v1.s[1] | |||
| // load input for output 5-8 | |||
| // input cache should be refreshed after loading | |||
| // ATTENTION: advancing is preferred, but advancing too much may lead to invalid prefetching | |||
| ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x12], #64 | |||
| // step for output 3-8 | |||
| fmla v20.4s, v8.4s, v2.s[0] | |||
| fmla v22.4s, v8.4s, v3.s[0] | |||
| subs x13, x5, #1 | |||
| beq LoopIcEndHalf | |||
| LoopIcHalf: | |||
| fmla v24.4s, v8.4s, v4.s[0] | |||
| fmla v26.4s, v8.4s, v5.s[0] | |||
| fmla v28.4s, v8.4s, v6.s[0] | |||
| fmla v30.4s, v8.4s, v7.s[0] | |||
| // load weight | |||
| ld1 {v12.4s}, [x2], x18 | |||
| // step for output 3-8 | |||
| fmla v20.4s, v10.4s, v2.s[1] | |||
| fmla v22.4s, v10.4s, v3.s[1] | |||
| // load weight | |||
| ld1 {v14.4s}, [x2], x18 | |||
| fmla v24.4s, v10.4s, v4.s[1] | |||
| fmla v26.4s, v10.4s, v5.s[1] | |||
| fmla v28.4s, v10.4s, v6.s[1] | |||
| fmla v30.4s, v10.4s, v7.s[1] | |||
| // another step for output 1-8 | |||
| fmla v16.4s, v12.4s, v0.s[2] | |||
| fmla v18.4s, v12.4s, v1.s[2] | |||
| fmla v20.4s, v12.4s, v2.s[2] | |||
| fmla v22.4s, v12.4s, v3.s[2] | |||
| fmla v24.4s, v12.4s, v4.s[2] | |||
| fmla v26.4s, v12.4s, v5.s[2] | |||
| fmla v28.4s, v12.4s, v6.s[2] | |||
| fmla v30.4s, v12.4s, v7.s[2] | |||
| // load weight | |||
| ld1 {v8.4s}, [x2], x18 | |||
| // another step for output 1-8 | |||
| fmla v16.4s, v14.4s, v0.s[3] | |||
| fmla v18.4s, v14.4s, v1.s[3] | |||
| // load weight | |||
| ld1 {v10.4s}, [x2], x18 | |||
| fmla v20.4s, v14.4s, v2.s[3] | |||
| fmla v22.4s, v14.4s, v3.s[3] | |||
| // load input for output 1-4 | |||
| ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x12], #64 | |||
| fmla v24.4s, v14.4s, v4.s[3] | |||
| fmla v26.4s, v14.4s, v5.s[3] | |||
| fmla v28.4s, v14.4s, v6.s[3] | |||
| fmla v30.4s, v14.4s, v7.s[3] | |||
| // load input for output 5-8 | |||
| ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x12], #64 | |||
| // step for output 1-8 | |||
| fmla v16.4s, v8.4s, v0.s[0] | |||
| fmla v18.4s, v8.4s, v1.s[0] | |||
| fmla v16.4s, v10.4s, v0.s[1] | |||
| fmla v18.4s, v10.4s, v1.s[1] | |||
| fmla v20.4s, v8.4s, v2.s[0] | |||
| fmla v22.4s, v8.4s, v3.s[0] | |||
| subs x13, x13, #1 | |||
| bne LoopIcHalf | |||
| LoopIcEndHalf: | |||
| fmla v24.4s, v8.4s, v4.s[0] | |||
| fmla v26.4s, v8.4s, v5.s[0] | |||
| fmla v28.4s, v8.4s, v6.s[0] | |||
| fmla v30.4s, v8.4s, v7.s[0] | |||
| // load weight | |||
| ld1 {v12.4s}, [x2], x18 | |||
| // step for output 3-8 | |||
| fmla v20.4s, v10.4s, v2.s[1] | |||
| fmla v22.4s, v10.4s, v3.s[1] | |||
| // load weight | |||
| ld1 {v14.4s}, [x2], x18 | |||
| fmla v24.4s, v10.4s, v4.s[1] | |||
| fmla v26.4s, v10.4s, v5.s[1] | |||
| fmla v28.4s, v10.4s, v6.s[1] | |||
| fmla v30.4s, v10.4s, v7.s[1] | |||
| // another step for output 1-8 | |||
| fmla v16.4s, v12.4s, v0.s[2] | |||
| fmla v18.4s, v12.4s, v1.s[2] | |||
| fmla v20.4s, v12.4s, v2.s[2] | |||
| fmla v22.4s, v12.4s, v3.s[2] | |||
| fmla v24.4s, v12.4s, v4.s[2] | |||
| fmla v26.4s, v12.4s, v5.s[2] | |||
| fmla v28.4s, v12.4s, v6.s[2] | |||
| fmla v30.4s, v12.4s, v7.s[2] | |||
| // another step for output 1-8 | |||
| fmla v16.4s, v14.4s, v0.s[3] | |||
| fmla v18.4s, v14.4s, v1.s[3] | |||
| fmla v20.4s, v14.4s, v2.s[3] | |||
| fmla v22.4s, v14.4s, v3.s[3] | |||
| fmla v24.4s, v14.4s, v4.s[3] | |||
| fmla v26.4s, v14.4s, v5.s[3] | |||
| fmla v28.4s, v14.4s, v6.s[3] | |||
| fmla v30.4s, v14.4s, v7.s[3] | |||
| cbnz x11, Relu6Half | |||
| cbnz x10, ReluHalf | |||
| b WriteStartHalf | |||
| Relu6Half: | |||
| movi v1.4s, #6 | |||
| scvtf v1.4s, v1.4s | |||
| fmin v16.4s, v16.4s, v1.4s | |||
| fmin v18.4s, v18.4s, v1.4s | |||
| fmin v20.4s, v20.4s, v1.4s | |||
| fmin v22.4s, v22.4s, v1.4s | |||
| fmin v24.4s, v24.4s, v1.4s | |||
| fmin v26.4s, v26.4s, v1.4s | |||
| fmin v28.4s, v28.4s, v1.4s | |||
| fmin v30.4s, v30.4s, v1.4s | |||
| ReluHalf: | |||
| dup v0.4s, wzr | |||
| fmax v16.4s, v16.4s, v0.4s | |||
| fmax v18.4s, v18.4s, v0.4s | |||
| fmax v20.4s, v20.4s, v0.4s | |||
| fmax v22.4s, v22.4s, v0.4s | |||
| fmax v24.4s, v24.4s, v0.4s | |||
| fmax v26.4s, v26.4s, v0.4s | |||
| fmax v28.4s, v28.4s, v0.4s | |||
| fmax v30.4s, v30.4s, v0.4s | |||
| WriteStartHalf: | |||
| cbnz x9, Write4 | |||
| cmp x6, #1 | |||
| beq Write1 | |||
| cmp x6, #2 | |||
| beq Write2 | |||
| cmp x6, #3 | |||
| beq Write3 | |||
| b Write4 | |||
| Write1: | |||
| str s16, [x15] | |||
| add x15, x15, x7 | |||
| str s18, [x15] | |||
| add x15, x15, x7 | |||
| str s20, [x15] | |||
| add x15, x15, x7 | |||
| str s22, [x15] | |||
| add x15, x15, x7 | |||
| str s24, [x15] | |||
| add x15, x15, x7 | |||
| str s26, [x15] | |||
| add x15, x15, x7 | |||
| str s28, [x15] | |||
| add x15, x15, x7 | |||
| str s30, [x15] | |||
| add x0, x0, #4 | |||
| b WriteEndHalf | |||
| Write2: | |||
| dup s17, v16.s[1] | |||
| stp s16, s17, [x15] | |||
| add x15, x15, x7 | |||
| dup s19, v18.s[1] | |||
| stp s18, s19, [x15] | |||
| add x15, x15, x7 | |||
| dup s21, v20.s[1] | |||
| stp s20, s21, [x15] | |||
| add x15, x15, x7 | |||
| dup s23, v22.s[1] | |||
| stp s22, s23, [x15] | |||
| add x15, x15, x7 | |||
| dup s25, v24.s[1] | |||
| stp s24, s25, [x15] | |||
| add x15, x15, x7 | |||
| dup s27, v26.s[1] | |||
| stp s26, s27, [x15] | |||
| add x15, x15, x7 | |||
| dup s29, v28.s[1] | |||
| stp s28, s29, [x15] | |||
| add x15, x15, x7 | |||
| dup s31, v30.s[1] | |||
| stp s30, s31, [x15] | |||
| add x0, x0, #8 | |||
| b WriteEndHalf | |||
| Write3: | |||
| add x17, x15, #8 | |||
| dup s17, v16.s[1] | |||
| stp s16, s17, [x15] | |||
| add x15, x15, x7 | |||
| st1 {v16.s}[2], [x17], x7 | |||
| dup s19, v18.s[1] | |||
| stp s18, s19, [x15] | |||
| add x15, x15, x7 | |||
| st1 {v18.s}[2], [x17], x7 | |||
| dup s21, v20.s[1] | |||
| stp s20, s21, [x15] | |||
| add x15, x15, x7 | |||
| st1 {v20.s}[2], [x17], x7 | |||
| dup s23, v22.s[1] | |||
| stp s22, s23, [x15] | |||
| add x15, x15, x7 | |||
| st1 {v22.s}[2], [x17], x7 | |||
| dup s25, v24.s[1] | |||
| stp s24, s25, [x15] | |||
| add x15, x15, x7 | |||
| st1 {v24.s}[2], [x17], x7 | |||
| dup s27, v26.s[1] | |||
| stp s26, s27, [x15] | |||
| add x15, x15, x7 | |||
| st1 {v26.s}[2], [x17], x7 | |||
| dup s29, v28.s[1] | |||
| stp s28, s29, [x15] | |||
| add x15, x15, x7 | |||
| st1 {v28.s}[2], [x17], x7 | |||
| dup s31, v30.s[1] | |||
| stp s30, s31, [x15] | |||
| st1 {v30.s}[2], [x17] | |||
| add x0, x0, #12 | |||
| b WriteEndHalf | |||
| Write4: | |||
| // prefetching is not preferred while writing results, despite cache misses | |||
| // you could try prfm pstl2strm | |||
| // there are almost no benefits observed though | |||
| st1 {v16.4s}, [x15], x7 | |||
| st1 {v18.4s}, [x15], x7 | |||
| st1 {v20.4s}, [x15], x7 | |||
| st1 {v22.4s}, [x15], x7 | |||
| st1 {v24.4s}, [x15], x7 | |||
| st1 {v26.4s}, [x15], x7 | |||
| st1 {v28.4s}, [x15], x7 | |||
| st1 {v30.4s}, [x15] | |||
| add x0, x0, #16 | |||
| WriteEndHalf: | |||
| subs x14, x14, #1 | |||
| bne LoopKsizeHalf | |||
| LoopOcEnd: | |||
| sub sp, sp, #128 | |||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| ret | |||
| #endif | |||
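The removed aarch64 IndirectGemmFp32_8x8 follows the same pattern as the 8x4 sketch above, but with eight output channels per step, kept as two 4-wide halves per row (v16/v17, v18/v19, ...) in NEON. The core accumulation for one ic4 block, under the same assumed layout:

/* Assumed per-block layout: in[8 output pixels][4 input channels], w[4 input channels][8 output channels]. */
static void IndirectGemmBlock8x8_sketch(float acc[8][8], const float *in, const float *w) {
  for (int r = 0; r < 8; ++r)
    for (int ic = 0; ic < 4; ++ic)
      for (int oc = 0; oc < 8; ++oc) acc[r][oc] += in[r * 4 + ic] * w[ic * 8 + oc];
}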
| @@ -7,7 +7,7 @@ | |||
| #endif | |||
| // void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, | |||
| // int row, int col, int stride, bool write_nhwc) | |||
| // int row, int col, size_t stride, size_t writeNhwc, size_t WriteWino) | |||
| // x0: a | |||
| // x1: b | |||
| // x2: c | |||
| @@ -17,18 +17,27 @@ | |||
| // w6: row | |||
| // w7: col | |||
| // w17: stride | |||
| // w13: writeC8 | |||
| // w13: c8_nhwc_c4 | |||
| MatmulFloatNeon64: | |||
| sub sp, sp, #128 | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| ldr x9, [sp, #8] | |||
| ldr x14, [sp, #16] | |||
| mov w18, #32 // sizeof(float) * 8 | |||
| mul w15, w5, w18 // block stride of lhs/rhs: sizeof(float) * 8 * depth | |||
| mov x11, x3 // bias flag | |||
| mov x18, #4 | |||
| ldr x17, [sp] | |||
| cbz x14, NoWinoSteps | |||
| mul x8, x7, x17 | |||
| mov x11, #8 | |||
| mul x11, x11, x17 | |||
| mul x8, x8, x18 | |||
| mul x11, x11, x18 | |||
| NoWinoSteps: | |||
| mul x17, x17, x18 | |||
| L1: | |||
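The rewritten MatmulFloatNeon64 prologue above reads writeNhwc and WriteWino from the stack and precomputes its byte strides up front; the arm32 version carries the same comments. A small C sketch of that arithmetic, with the destination register of each value noted as an assumption inferred from the surrounding code:

#include <stddef.h>

static void MatmulFloatNeon64_strides(int depth, size_t stride, int col, size_t writeWino,
                                      size_t *block_stride, size_t *row_stride,
                                      size_t *wino_col_step, size_t *wino_row_step) {
  *block_stride = sizeof(float) * 8 * depth;                      /* w15: lhs/rhs advance per 8-wide block */
  *row_stride = sizeof(float) * stride;                           /* x17: output row stride in bytes */
  *wino_col_step = writeWino ? sizeof(float) * stride * col : 0;  /* x8: assumed Winograd dst step between tile rows */
  *wino_row_step = writeWino ? sizeof(float) * stride * 8 : 0;    /* x11: assumed Winograd dst advance per 8-col block */
}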
| @@ -39,7 +48,14 @@ L1: | |||
| L2: | |||
| mov x16, x1 // reload rhs ptr | |||
| mov w13, w5 // reload depth | |||
| mov x14, x3 // reload bias ptr | |||
| dup v8.4s, wzr | |||
| dup v9.4s, wzr | |||
| dup v10.4s, wzr | |||
| dup v11.4s, wzr | |||
| dup v12.4s, wzr | |||
| dup v13.4s, wzr | |||
| dup v14.4s, wzr | |||
| dup v15.4s, wzr | |||
| dup v16.4s, wzr | |||
| dup v17.4s, wzr | |||
| dup v18.4s, wzr | |||
| @@ -57,116 +73,86 @@ L2: | |||
| dup v30.4s, wzr | |||
| dup v31.4s, wzr | |||
| cmp w13, #4 | |||
| blt CommLoopMul | |||
| OptLoopMul4: | |||
| ld1 {v0.4s, v1.4s}, [x12], #32 | |||
| ld1 {v8.4s, v9.4s}, [x16], #32 | |||
| fmla v16.4s, v8.4s, v0.s[0] | |||
| fmla v17.4s, v9.4s, v0.s[0] | |||
| fmla v18.4s, v8.4s, v0.s[1] | |||
| fmla v19.4s, v9.4s, v0.s[1] | |||
| fmla v20.4s, v8.4s, v0.s[2] | |||
| fmla v21.4s, v9.4s, v0.s[2] | |||
| fmla v22.4s, v8.4s, v0.s[3] | |||
| fmla v23.4s, v9.4s, v0.s[3] | |||
| ld1 {v10.4s, v11.4s}, [x16], #32 | |||
| fmla v24.4s, v8.4s, v1.s[0] | |||
| fmla v25.4s, v9.4s, v1.s[0] | |||
| fmla v26.4s, v8.4s, v1.s[1] | |||
| fmla v27.4s, v9.4s, v1.s[1] | |||
| ld1 {v2.4s, v3.4s}, [x12], #32 | |||
| fmla v28.4s, v8.4s, v1.s[2] | |||
| fmla v29.4s, v9.4s, v1.s[2] | |||
| fmla v30.4s, v8.4s, v1.s[3] | |||
| fmla v31.4s, v9.4s, v1.s[3] | |||
| fmla v16.4s, v10.4s, v2.s[0] | |||
| fmla v17.4s, v11.4s, v2.s[0] | |||
| fmla v18.4s, v10.4s, v2.s[1] | |||
| fmla v19.4s, v11.4s, v2.s[1] | |||
| fmla v20.4s, v10.4s, v2.s[2] | |||
| fmla v21.4s, v11.4s, v2.s[2] | |||
| fmla v22.4s, v10.4s, v2.s[3] | |||
| fmla v23.4s, v11.4s, v2.s[3] | |||
| ld1 {v12.4s, v13.4s}, [x16], #32 | |||
| fmla v24.4s, v10.4s, v3.s[0] | |||
| fmla v25.4s, v11.4s, v3.s[0] | |||
| fmla v26.4s, v10.4s, v3.s[1] | |||
| fmla v27.4s, v11.4s, v3.s[1] | |||
| ld1 {v4.4s, v5.4s}, [x12], #32 | |||
| fmla v28.4s, v10.4s, v3.s[2] | |||
| fmla v29.4s, v11.4s, v3.s[2] | |||
| fmla v30.4s, v10.4s, v3.s[3] | |||
| fmla v31.4s, v11.4s, v3.s[3] | |||
| fmla v16.4s, v12.4s, v4.s[0] | |||
| fmla v17.4s, v13.4s, v4.s[0] | |||
| fmla v18.4s, v12.4s, v4.s[1] | |||
| fmla v19.4s, v13.4s, v4.s[1] | |||
| fmla v20.4s, v12.4s, v4.s[2] | |||
| fmla v21.4s, v13.4s, v4.s[2] | |||
| fmla v22.4s, v12.4s, v4.s[3] | |||
| fmla v23.4s, v13.4s, v4.s[3] | |||
| ld1 {v6.4s,v7.4s}, [x12], #32 | |||
| fmla v24.4s, v12.4s, v5.s[0] | |||
| fmla v25.4s, v13.4s, v5.s[0] | |||
| fmla v26.4s, v12.4s, v5.s[1] | |||
| fmla v27.4s, v13.4s, v5.s[1] | |||
| ld1 {v14.4s, v15.4s}, [x16], #32 | |||
| fmla v28.4s, v12.4s, v5.s[2] | |||
| fmla v29.4s, v13.4s, v5.s[2] | |||
| fmla v30.4s, v12.4s, v5.s[3] | |||
| fmla v31.4s, v13.4s, v5.s[3] | |||
| fmla v16.4s, v14.4s, v6.s[0] | |||
| fmla v17.4s, v15.4s, v6.s[0] | |||
| fmla v18.4s, v14.4s, v6.s[1] | |||
| fmla v19.4s, v15.4s, v6.s[1] | |||
| fmla v20.4s, v14.4s, v6.s[2] | |||
| fmla v21.4s, v15.4s, v6.s[2] | |||
| fmla v22.4s, v14.4s, v6.s[3] | |||
| fmla v23.4s, v15.4s, v6.s[3] | |||
| fmla v24.4s, v14.4s, v7.s[0] | |||
| fmla v25.4s, v15.4s, v7.s[0] | |||
| fmla v26.4s, v14.4s, v7.s[1] | |||
| fmla v27.4s, v15.4s, v7.s[1] | |||
| fmla v28.4s, v14.4s, v7.s[2] | |||
| fmla v29.4s, v15.4s, v7.s[2] | |||
| fmla v30.4s, v14.4s, v7.s[3] | |||
| fmla v31.4s, v15.4s, v7.s[3] | |||
| LoopStart: | |||
| ld1 {v0.4s, v1.4s, v2.4s}, [x12], #48 | |||
| ld1 {v3.4s, v4.4s}, [x16], #32 | |||
| fmla v8.4s, v3.4s, v0.s[0] | |||
| fmla v10.4s, v3.4s, v0.s[1] | |||
| fmla v12.4s, v3.4s, v0.s[2] | |||
| fmla v14.4s, v3.4s, v0.s[3] | |||
| fmla v9.4s, v4.4s, v0.s[0] | |||
| fmla v11.4s, v4.4s, v0.s[1] | |||
| fmla v13.4s, v4.4s, v0.s[2] | |||
| fmla v15.4s, v4.4s, v0.s[3] | |||
| sub w13, w13, #4 | |||
| cmp w13, #0 | |||
| ble Bias | |||
| cmp w13, #4 | |||
| bge OptLoopMul4 | |||
| subs w13, w13, #1 | |||
| beq LoopEnd | |||
| CommLoopMul: | |||
| ld1 {v0.4s, v1.4s}, [x12], #32 | |||
| ld1 {v2.4s, v3.4s}, [x16], #32 | |||
| fmla v16.4s, v2.4s, v0.s[0] | |||
| fmla v17.4s, v3.4s, v0.s[0] | |||
| fmla v18.4s, v2.4s, v0.s[1] | |||
| fmla v19.4s, v3.4s, v0.s[1] | |||
| fmla v20.4s, v2.4s, v0.s[2] | |||
| fmla v21.4s, v3.4s, v0.s[2] | |||
| fmla v22.4s, v2.4s, v0.s[3] | |||
| fmla v23.4s, v3.4s, v0.s[3] | |||
| fmla v24.4s, v2.4s, v1.s[0] | |||
| fmla v25.4s, v3.4s, v1.s[0] | |||
| fmla v26.4s, v2.4s, v1.s[1] | |||
| fmla v27.4s, v3.4s, v1.s[1] | |||
| fmla v28.4s, v2.4s, v1.s[2] | |||
| fmla v29.4s, v3.4s, v1.s[2] | |||
| fmla v30.4s, v2.4s, v1.s[3] | |||
| fmla v31.4s, v3.4s, v1.s[3] | |||
| Loop: | |||
| ld1 {v0.4s}, [x12], #16 | |||
| fmla v16.4s, v3.4s, v1.s[0] | |||
| fmla v18.4s, v3.4s, v1.s[1] | |||
| fmla v20.4s, v3.4s, v1.s[2] | |||
| fmla v22.4s, v3.4s, v1.s[3] | |||
| fmla v17.4s, v4.4s, v1.s[0] | |||
| fmla v19.4s, v4.4s, v1.s[1] | |||
| fmla v21.4s, v4.4s, v1.s[2] | |||
| fmla v23.4s, v4.4s, v1.s[3] | |||
| ld1 {v1.4s}, [x12], #16 | |||
| fmla v24.4s, v3.4s, v2.s[0] | |||
| fmla v26.4s, v3.4s, v2.s[1] | |||
| fmla v28.4s, v3.4s, v2.s[2] | |||
| fmla v30.4s, v3.4s, v2.s[3] | |||
| ld1 {v3.4s}, [x16], #16 | |||
| fmla v25.4s, v4.4s, v2.s[0] | |||
| fmla v27.4s, v4.4s, v2.s[1] | |||
| fmla v29.4s, v4.4s, v2.s[2] | |||
| fmla v31.4s, v4.4s, v2.s[3] | |||
| ld1 {v4.4s}, [x16], #16 | |||
| fmla v8.4s, v3.4s, v0.s[0] | |||
| fmla v10.4s, v3.4s, v0.s[1] | |||
| fmla v12.4s, v3.4s, v0.s[2] | |||
| fmla v14.4s, v3.4s, v0.s[3] | |||
| ld1 {v2.4s}, [x12], #16 | |||
| fmla v9.4s, v4.4s, v0.s[0] | |||
| fmla v11.4s, v4.4s, v0.s[1] | |||
| fmla v13.4s, v4.4s, v0.s[2] | |||
| fmla v15.4s, v4.4s, v0.s[3] | |||
| subs w13, w13, #1 | |||
| bgt CommLoopMul | |||
| bgt Loop | |||
| LoopEnd: | |||
| fmla v16.4s, v3.4s, v1.s[0] | |||
| fmla v18.4s, v3.4s, v1.s[1] | |||
| fmla v20.4s, v3.4s, v1.s[2] | |||
| fmla v22.4s, v3.4s, v1.s[3] | |||
| fmla v17.4s, v4.4s, v1.s[0] | |||
| fmla v19.4s, v4.4s, v1.s[1] | |||
| fmla v21.4s, v4.4s, v1.s[2] | |||
| fmla v23.4s, v4.4s, v1.s[3] | |||
| fmla v24.4s, v3.4s, v2.s[0] | |||
| fmla v26.4s, v3.4s, v2.s[1] | |||
| fmla v28.4s, v3.4s, v2.s[2] | |||
| fmla v30.4s, v3.4s, v2.s[3] | |||
| fmla v25.4s, v4.4s, v2.s[0] | |||
| fmla v27.4s, v4.4s, v2.s[1] | |||
| fmla v29.4s, v4.4s, v2.s[2] | |||
| fmla v31.4s, v4.4s, v2.s[3] | |||
| Bias: | |||
| cbz x11, Activation | |||
| ld1 {v0.4s}, [x14], #16 | |||
| ld1 {v1.4s}, [x14], #16 | |||
| cbz x3, Activation | |||
| ld1 {v0.4s}, [x3], #16 | |||
| ld1 {v1.4s}, [x3] | |||
| sub x3, x3, #16 | |||
| fadd v8.4s, v8.4s, v0.4s | |||
| fadd v9.4s, v9.4s, v1.4s | |||
| fadd v10.4s, v10.4s, v0.4s | |||
| fadd v11.4s, v11.4s, v1.4s | |||
| fadd v12.4s, v12.4s, v0.4s | |||
| fadd v13.4s, v13.4s, v1.4s | |||
| fadd v14.4s, v14.4s, v0.4s | |||
| fadd v15.4s, v15.4s, v1.4s | |||
| fadd v16.4s, v16.4s, v0.4s | |||
| fadd v17.4s, v17.4s, v1.4s | |||
| fadd v18.4s, v18.4s, v0.4s | |||
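The replacement loop above widens the tile: v8-v31 now hold a 12-row by 8-column fp32 block, and each pass through Loop consumes 12 lhs floats and 8 rhs floats (software-pipelined across v0-v4). A scalar sketch of the depth loop, assuming a[d][12] / b[d][8] packing within a block:

static void MatmulTile12x8_sketch(const float *a, const float *b, float acc[12][8], int depth) {
  for (int d = 0; d < depth; ++d)
    for (int r = 0; r < 12; ++r)
      for (int i = 0; i < 8; ++i) acc[r][i] += a[d * 12 + r] * b[d * 8 + i];
}

The wider row tile is also why the Write paths below now compare w10 against values up to 11 instead of 7.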
| @@ -192,48 +178,64 @@ Activation: | |||
| b Write | |||
| Relu6: | |||
| mov w8, #6 | |||
| dup v15.4s, w8 | |||
| scvtf v15.4s, v15.4s | |||
| fmin v16.4s, v16.4s, v15.4s | |||
| fmin v17.4s, v17.4s, v15.4s | |||
| fmin v18.4s, v18.4s, v15.4s | |||
| fmin v19.4s, v19.4s, v15.4s | |||
| fmin v20.4s, v20.4s, v15.4s | |||
| fmin v21.4s, v21.4s, v15.4s | |||
| fmin v22.4s, v22.4s, v15.4s | |||
| fmin v23.4s, v23.4s, v15.4s | |||
| fmin v24.4s, v24.4s, v15.4s | |||
| fmin v25.4s, v25.4s, v15.4s | |||
| fmin v26.4s, v26.4s, v15.4s | |||
| fmin v27.4s, v27.4s, v15.4s | |||
| fmin v28.4s, v28.4s, v15.4s | |||
| fmin v29.4s, v29.4s, v15.4s | |||
| fmin v30.4s, v30.4s, v15.4s | |||
| fmin v31.4s, v31.4s, v15.4s | |||
| mov w13, #6 | |||
| dup v2.4s, w13 | |||
| scvtf v2.4s, v2.4s | |||
| fmin v8.4s, v8.4s, v2.4s | |||
| fmin v9.4s, v9.4s, v2.4s | |||
| fmin v10.4s, v10.4s, v2.4s | |||
| fmin v11.4s, v11.4s, v2.4s | |||
| fmin v12.4s, v12.4s, v2.4s | |||
| fmin v13.4s, v13.4s, v2.4s | |||
| fmin v14.4s, v14.4s, v2.4s | |||
| fmin v15.4s, v15.4s, v2.4s | |||
| fmin v16.4s, v16.4s, v2.4s | |||
| fmin v17.4s, v17.4s, v2.4s | |||
| fmin v18.4s, v18.4s, v2.4s | |||
| fmin v19.4s, v19.4s, v2.4s | |||
| fmin v20.4s, v20.4s, v2.4s | |||
| fmin v21.4s, v21.4s, v2.4s | |||
| fmin v22.4s, v22.4s, v2.4s | |||
| fmin v23.4s, v23.4s, v2.4s | |||
| fmin v24.4s, v24.4s, v2.4s | |||
| fmin v25.4s, v25.4s, v2.4s | |||
| fmin v26.4s, v26.4s, v2.4s | |||
| fmin v27.4s, v27.4s, v2.4s | |||
| fmin v28.4s, v28.4s, v2.4s | |||
| fmin v29.4s, v29.4s, v2.4s | |||
| fmin v30.4s, v30.4s, v2.4s | |||
| fmin v31.4s, v31.4s, v2.4s | |||
| Relu: | |||
| dup v14.4s, wzr | |||
| fmax v16.4s, v16.4s, v14.4s | |||
| fmax v17.4s, v17.4s, v14.4s | |||
| fmax v18.4s, v18.4s, v14.4s | |||
| fmax v19.4s, v19.4s, v14.4s | |||
| fmax v20.4s, v20.4s, v14.4s | |||
| fmax v21.4s, v21.4s, v14.4s | |||
| fmax v22.4s, v22.4s, v14.4s | |||
| fmax v23.4s, v23.4s, v14.4s | |||
| fmax v24.4s, v24.4s, v14.4s | |||
| fmax v25.4s, v25.4s, v14.4s | |||
| fmax v26.4s, v26.4s, v14.4s | |||
| fmax v27.4s, v27.4s, v14.4s | |||
| fmax v28.4s, v28.4s, v14.4s | |||
| fmax v29.4s, v29.4s, v14.4s | |||
| fmax v30.4s, v30.4s, v14.4s | |||
| fmax v31.4s, v31.4s, v14.4s | |||
| dup v3.4s, wzr | |||
| fmax v8.4s, v8.4s, v3.4s | |||
| fmax v9.4s, v9.4s, v3.4s | |||
| fmax v10.4s, v10.4s, v3.4s | |||
| fmax v11.4s, v11.4s, v3.4s | |||
| fmax v12.4s, v12.4s, v3.4s | |||
| fmax v13.4s, v13.4s, v3.4s | |||
| fmax v14.4s, v14.4s, v3.4s | |||
| fmax v15.4s, v15.4s, v3.4s | |||
| fmax v16.4s, v16.4s, v3.4s | |||
| fmax v17.4s, v17.4s, v3.4s | |||
| fmax v18.4s, v18.4s, v3.4s | |||
| fmax v19.4s, v19.4s, v3.4s | |||
| fmax v20.4s, v20.4s, v3.4s | |||
| fmax v21.4s, v21.4s, v3.4s | |||
| fmax v22.4s, v22.4s, v3.4s | |||
| fmax v23.4s, v23.4s, v3.4s | |||
| fmax v24.4s, v24.4s, v3.4s | |||
| fmax v25.4s, v25.4s, v3.4s | |||
| fmax v26.4s, v26.4s, v3.4s | |||
| fmax v27.4s, v27.4s, v3.4s | |||
| fmax v28.4s, v28.4s, v3.4s | |||
| fmax v29.4s, v29.4s, v3.4s | |||
| fmax v30.4s, v30.4s, v3.4s | |||
| fmax v31.4s, v31.4s, v3.4s | |||
| Write: | |||
| ldrb w13, [sp, #8] | |||
| cbz w13, WriteC8 | |||
| cbnz x14, WriteWino | |||
| cbz x9, WriteC8 | |||
| cmp w7, #1 | |||
| beq Write1 | |||
| cmp w7, #2 | |||
| @@ -251,71 +253,107 @@ Write: | |||
| b Write8 | |||
| Write1: | |||
| str s16, [x18] | |||
| str s8, [x18] | |||
| cmp w10, #1 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| str s18, [x18] | |||
| str s10, [x18] | |||
| cmp w10, #2 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| str s20, [x18] | |||
| str s12, [x18] | |||
| cmp w10, #3 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| str s22, [x18] | |||
| str s14, [x18] | |||
| cmp w10, #4 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| str s24, [x18] | |||
| str s16, [x18] | |||
| cmp w10, #5 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| str s26, [x18] | |||
| str s18, [x18] | |||
| cmp w10, #6 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| str s28, [x18] | |||
| str s20, [x18] | |||
| cmp w10, #7 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| str s22, [x18] | |||
| cmp w10, #8 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| str s24, [x18] | |||
| cmp w10, #9 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| str s26, [x18] | |||
| cmp w10, #10 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| str s28, [x18] | |||
| cmp w10, #11 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| str s30, [x18] | |||
| add x18, x18, x17 | |||
| b WriteEnd | |||
| Write2: | |||
| dup s9, v8.s[1] | |||
| stp s8, s9, [x18] | |||
| cmp w10, #1 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| dup s11, v10.s[1] | |||
| stp s10, s11, [x18] | |||
| cmp w10, #2 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| dup s13, v12.s[1] | |||
| stp s12, s13, [x18] | |||
| cmp w10, #3 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| dup s15, v14.s[1] | |||
| stp s14, s15, [x18] | |||
| cmp w10, #4 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| dup s17, v16.s[1] | |||
| stp s16, s17, [x18] | |||
| cmp w10, #1 | |||
| cmp w10, #5 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| dup s19, v18.s[1] | |||
| stp s18, s19, [x18] | |||
| cmp w10, #2 | |||
| cmp w10, #6 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| dup s21, v20.s[1] | |||
| stp s20, s21, [x18] | |||
| cmp w10, #3 | |||
| cmp w10, #7 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| dup s23, v22.s[1] | |||
| stp s22, s23, [x18] | |||
| cmp w10, #4 | |||
| cmp w10, #8 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| dup s25, v24.s[1] | |||
| stp s24, s25, [x18] | |||
| cmp w10, #5 | |||
| cmp w10, #9 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| dup s27, v26.s[1] | |||
| stp s26, s27, [x18] | |||
| cmp w10, #6 | |||
| cmp w10, #10 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| dup s29, v28.s[1] | |||
| stp s28, s29, [x18] | |||
| cmp w10, #7 | |||
| cmp w10, #11 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| dup s31, v30.s[1] | |||
| @@ -324,47 +362,71 @@ Write2: | |||
| b WriteEnd | |||
| Write3: | |||
| add x13, x18, #8 | |||
| dup s9, v8.s[1] | |||
| stp s8, s9, [x18] | |||
| add x18, x18, x17 | |||
| st1 {v8.s}[2], [x13], x17 | |||
| cmp w10, #1 | |||
| beq WriteEnd | |||
| dup s11, v10.s[1] | |||
| stp s10, s11, [x18] | |||
| add x18, x18, x17 | |||
| st1 {v10.s}[2], [x13], x17 | |||
| cmp w10, #2 | |||
| beq WriteEnd | |||
| dup s13, v12.s[1] | |||
| stp s12, s13, [x18] | |||
| add x18, x18, x17 | |||
| st1 {v12.s}[2], [x13], x17 | |||
| cmp w10, #3 | |||
| beq WriteEnd | |||
| dup s15, v14.s[1] | |||
| stp s14, s15, [x18] | |||
| add x18, x18, x17 | |||
| st1 {v14.s}[2], [x13], x17 | |||
| cmp w10, #4 | |||
| beq WriteEnd | |||
| dup s17, v16.s[1] | |||
| stp s16, s17, [x18] | |||
| add x18, x18, x17 | |||
| st1 {v16.s}[2], [x13], x17 | |||
| cmp w10, #1 | |||
| cmp w10, #5 | |||
| beq WriteEnd | |||
| dup s19, v18.s[1] | |||
| stp s18, s19, [x18] | |||
| add x18, x18, x17 | |||
| st1 {v18.s}[2], [x13], x17 | |||
| cmp w10, #2 | |||
| cmp w10, #6 | |||
| beq WriteEnd | |||
| dup s21, v20.s[1] | |||
| stp s20, s21, [x18] | |||
| add x18, x18, x17 | |||
| st1 {v20.s}[2], [x13], x17 | |||
| cmp w10, #3 | |||
| cmp w10, #7 | |||
| beq WriteEnd | |||
| dup s23, v22.s[1] | |||
| stp s22, s23, [x18] | |||
| add x18, x18, x17 | |||
| st1 {v22.s}[2], [x13], x17 | |||
| cmp w10, #4 | |||
| cmp w10, #8 | |||
| beq WriteEnd | |||
| dup s25, v24.s[1] | |||
| stp s24, s25, [x18] | |||
| add x18, x18, x17 | |||
| st1 {v24.s}[2], [x13], x17 | |||
| cmp w10, #5 | |||
| cmp w10, #9 | |||
| beq WriteEnd | |||
| dup s27, v26.s[1] | |||
| stp s26, s27, [x18] | |||
| add x18, x18, x17 | |||
| st1 {v26.s}[2], [x13], x17 | |||
| cmp w10, #6 | |||
| cmp w10, #10 | |||
| beq WriteEnd | |||
| dup s29, v28.s[1] | |||
| stp s28, s29, [x18] | |||
| add x18, x18, x17 | |||
| st1 {v28.s}[2], [x13], x17 | |||
| cmp w10, #7 | |||
| cmp w10, #11 | |||
| beq WriteEnd | |||
| dup s31, v30.s[1] | |||
| stp s30, s31, [x18] | |||
| @@ -372,64 +434,96 @@ Write3: | |||
| st1 {v30.s}[2], [x13] | |||
| b WriteEnd | |||
| Write4: | |||
| st1 {v16.4s}, [x18], x17 | |||
| st1 {v8.4s}, [x18], x17 | |||
| cmp w10, #1 | |||
| beq WriteEnd | |||
| st1 {v18.4s}, [x18], x17 | |||
| st1 {v10.4s}, [x18], x17 | |||
| cmp w10, #2 | |||
| beq WriteEnd | |||
| st1 {v20.4s}, [x18], x17 | |||
| st1 {v12.4s}, [x18], x17 | |||
| cmp w10, #3 | |||
| beq WriteEnd | |||
| st1 {v22.4s}, [x18], x17 | |||
| st1 {v14.4s}, [x18], x17 | |||
| cmp w10, #4 | |||
| beq WriteEnd | |||
| st1 {v24.4s}, [x18], x17 | |||
| st1 {v16.4s}, [x18], x17 | |||
| cmp w10, #5 | |||
| beq WriteEnd | |||
| st1 {v26.4s}, [x18], x17 | |||
| st1 {v18.4s}, [x18], x17 | |||
| cmp w10, #6 | |||
| beq WriteEnd | |||
| st1 {v28.4s}, [x18], x17 | |||
| st1 {v20.4s}, [x18], x17 | |||
| cmp w10, #7 | |||
| beq WriteEnd | |||
| st1 {v22.4s}, [x18], x17 | |||
| cmp w10, #8 | |||
| beq WriteEnd | |||
| st1 {v24.4s}, [x18], x17 | |||
| cmp w10, #9 | |||
| beq WriteEnd | |||
| st1 {v26.4s}, [x18], x17 | |||
| cmp w10, #10 | |||
| beq WriteEnd | |||
| st1 {v28.4s}, [x18], x17 | |||
| cmp w10, #11 | |||
| beq WriteEnd | |||
| st1 {v30.4s}, [x18], x17 | |||
| b WriteEnd | |||
| Write5: | |||
| add x13, x18, #16 | |||
| st1 {v8.4s}, [x18], x17 | |||
| str s9, [x13] | |||
| cmp w10, #1 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v10.4s}, [x18], x17 | |||
| str s11, [x13] | |||
| cmp w10, #2 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v12.4s}, [x18], x17 | |||
| str s13, [x13] | |||
| cmp w10, #3 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v14.4s}, [x18], x17 | |||
| str s15, [x13] | |||
| cmp w10, #4 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v16.4s}, [x18], x17 | |||
| str s17, [x13] | |||
| cmp w10, #1 | |||
| cmp w10, #5 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v18.4s}, [x18], x17 | |||
| str s19, [x13] | |||
| cmp w10, #2 | |||
| cmp w10, #6 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v20.4s}, [x18], x17 | |||
| str s21, [x13] | |||
| cmp w10, #3 | |||
| cmp w10, #7 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v22.4s}, [x18], x17 | |||
| str s23, [x13] | |||
| cmp w10, #4 | |||
| cmp w10, #8 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v24.4s}, [x18], x17 | |||
| str s25, [x13] | |||
| cmp w10, #5 | |||
| cmp w10, #9 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v26.4s}, [x18], x17 | |||
| str s27, [x13] | |||
| cmp w10, #6 | |||
| cmp w10, #10 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v28.4s}, [x18], x17 | |||
| str s29, [x13] | |||
| cmp w10, #7 | |||
| cmp w10, #11 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v30.4s}, [x18], x17 | |||
| @@ -437,46 +531,70 @@ Write5: | |||
| b WriteEnd | |||
| Write6: | |||
| add x13, x18, #16 | |||
| st1 {v8.4s}, [x18], x17 | |||
| dup s8, v9.s[1] | |||
| stp s9, s8, [x13] | |||
| cmp w10, #1 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v10.4s}, [x18], x17 | |||
| dup s10, v11.s[1] | |||
| stp s11, s10, [x13] | |||
| cmp w10, #2 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v12.4s}, [x18], x17 | |||
| dup s12, v13.s[1] | |||
| stp s13, s12, [x13] | |||
| cmp w10, #3 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v14.4s}, [x18], x17 | |||
| dup s14, v15.s[1] | |||
| stp s15, s14, [x13] | |||
| cmp w10, #4 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v16.4s}, [x18], x17 | |||
| dup s16, v17.s[1] | |||
| stp s17, s16, [x13] | |||
| cmp w10, #1 | |||
| cmp w10, #5 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v18.4s}, [x18], x17 | |||
| dup s18, v19.s[1] | |||
| stp s19, s18, [x13] | |||
| cmp w10, #2 | |||
| cmp w10, #6 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v20.4s}, [x18], x17 | |||
| dup s20, v21.s[1] | |||
| stp s21, s20, [x13] | |||
| cmp w10, #3 | |||
| cmp w10, #7 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v22.4s}, [x18], x17 | |||
| dup s22, v23.s[1] | |||
| stp s23, s22, [x13] | |||
| cmp w10, #4 | |||
| cmp w10, #8 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v24.4s}, [x18], x17 | |||
| dup s24, v25.s[1] | |||
| stp s25, s24, [x13] | |||
| cmp w10, #5 | |||
| cmp w10, #9 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v26.4s}, [x18], x17 | |||
| dup s26, v27.s[1] | |||
| stp s27, s26, [x13] | |||
| cmp w10, #6 | |||
| cmp w10, #10 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v28.4s}, [x18], x17 | |||
| dup s28, v29.s[1] | |||
| stp s29, s28, [x13] | |||
| cmp w10, #7 | |||
| cmp w10, #11 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v30.4s}, [x18], x17 | |||
| @@ -486,54 +604,82 @@ Write6: | |||
| Write7: | |||
| add x13, x18, #16 | |||
| add x16, x18, #24 | |||
| st1 {v8.4s}, [x18], x17 | |||
| dup s8, v9.s[1] | |||
| stp s9, s8, [x13] | |||
| add x13, x13, x17 | |||
| st1 {v9.s}[2], [x16], x17 | |||
| cmp w10, #1 | |||
| beq WriteEnd | |||
| st1 {v10.4s}, [x18], x17 | |||
| dup s10, v11.s[1] | |||
| stp s11, s10, [x13] | |||
| add x13, x13, x17 | |||
| st1 {v11.s}[2], [x16], x17 | |||
| cmp w10, #2 | |||
| beq WriteEnd | |||
| st1 {v12.4s}, [x18], x17 | |||
| dup s12, v13.s[1] | |||
| stp s13, s12, [x13] | |||
| add x13, x13, x17 | |||
| st1 {v13.s}[2], [x16], x17 | |||
| cmp w10, #3 | |||
| beq WriteEnd | |||
| st1 {v14.4s}, [x18], x17 | |||
| dup s14, v15.s[1] | |||
| stp s15, s14, [x13] | |||
| add x13, x13, x17 | |||
| st1 {v15.s}[2], [x16], x17 | |||
| cmp w10, #4 | |||
| beq WriteEnd | |||
| st1 {v16.4s}, [x18], x17 | |||
| dup s16, v17.s[1] | |||
| stp s17, s16, [x13] | |||
| add x13, x13, x17 | |||
| st1 {v17.s}[2], [x16], x17 | |||
| cmp w10, #1 | |||
| cmp w10, #5 | |||
| beq WriteEnd | |||
| st1 {v18.4s}, [x18], x17 | |||
| dup s18, v19.s[1] | |||
| stp s19, s18, [x13] | |||
| add x13, x13, x17 | |||
| st1 {v19.s}[2], [x16], x17 | |||
| cmp w10, #2 | |||
| cmp w10, #6 | |||
| beq WriteEnd | |||
| st1 {v20.4s}, [x18], x17 | |||
| dup s20, v21.s[1] | |||
| stp s21, s20, [x13] | |||
| add x13, x13, x17 | |||
| st1 {v21.s}[2], [x16], x17 | |||
| cmp w10, #3 | |||
| cmp w10, #7 | |||
| beq WriteEnd | |||
| st1 {v22.4s}, [x18], x17 | |||
| dup s22, v23.s[1] | |||
| stp s23, s22, [x13] | |||
| add x13, x13, x17 | |||
| st1 {v23.s}[2], [x16], x17 | |||
| cmp w10, #4 | |||
| cmp w10, #8 | |||
| beq WriteEnd | |||
| st1 {v24.4s}, [x18], x17 | |||
| dup s24, v25.s[1] | |||
| stp s25, s24, [x13] | |||
| add x13, x13, x17 | |||
| st1 {v25.s}[2], [x16], x17 | |||
| cmp w10, #5 | |||
| cmp w10, #9 | |||
| beq WriteEnd | |||
| st1 {v26.4s}, [x18], x17 | |||
| dup s26, v27.s[1] | |||
| stp s27, s26, [x13] | |||
| add x13, x13, x17 | |||
| st1 {v27.s}[2], [x16], x17 | |||
| cmp w10, #6 | |||
| cmp w10, #10 | |||
| beq WriteEnd | |||
| st1 {v28.4s}, [x18], x17 | |||
| dup s28, v29.s[1] | |||
| stp s29, s28, [x13] | |||
| add x13, x13, x17 | |||
| st1 {v29.s}[2], [x16], x17 | |||
| cmp w10, #7 | |||
| cmp w10, #11 | |||
| beq WriteEnd | |||
| st1 {v30.4s}, [x18], x17 | |||
| dup s30, v31.s[1] | |||
| @@ -542,46 +688,79 @@ Write7: | |||
| st1 {v31.s}[2], [x16], x17 | |||
| b WriteEnd | |||
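| // Store paths for the finished 12x8 tile: WriteC8 dumps it contiguously into the | |||
| // C8-packed dst, WriteWino strides by x8 between rows for the winograd layout, and | |||
| // Write1..Write8 stream NHWC rows, branching to WriteEnd once the remaining row | |||
| // count in w10 has been written. | |||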
| WriteC8: | |||
| st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64 | |||
| st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64 | |||
| st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x2], #64 | |||
| st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x2], #64 | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x2], #64 | |||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x2], #64 | |||
| st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x2], #64 | |||
| st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x2], #64 | |||
| st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x2], #64 | |||
| st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x2], #64 | |||
| b WriteEnd | |||
| WriteWino: | |||
| st1 {v8.4s, v9.4s}, [x18], x8 | |||
| st1 {v10.4s, v11.4s}, [x18], x8 | |||
| st1 {v12.4s, v13.4s}, [x18], x8 | |||
| st1 {v14.4s, v15.4s}, [x18], x8 | |||
| st1 {v16.4s, v17.4s}, [x18], x8 | |||
| st1 {v18.4s, v19.4s}, [x18], x8 | |||
| st1 {v20.4s, v21.4s}, [x18], x8 | |||
| st1 {v22.4s, v23.4s}, [x18], x8 | |||
| st1 {v24.4s, v25.4s}, [x18], x8 | |||
| st1 {v26.4s, v27.4s}, [x18], x8 | |||
| st1 {v28.4s, v29.4s}, [x18], x8 | |||
| st1 {v30.4s, v31.4s}, [x18], x8 | |||
| b WriteEnd | |||
| Write8: | |||
| st1 {v16.4s, v17.4s}, [x18], x17 | |||
| st1 {v8.4s, v9.4s}, [x18], x17 | |||
| cmp w10, #1 | |||
| beq WriteEnd | |||
| st1 {v18.4s, v19.4s}, [x18], x17 | |||
| st1 {v10.4s, v11.4s}, [x18], x17 | |||
| cmp w10, #2 | |||
| beq WriteEnd | |||
| st1 {v20.4s, v21.4s}, [x18], x17 | |||
| st1 {v12.4s, v13.4s}, [x18], x17 | |||
| cmp w10, #3 | |||
| beq WriteEnd | |||
| st1 {v22.4s, v23.4s}, [x18], x17 | |||
| st1 {v14.4s, v15.4s}, [x18], x17 | |||
| cmp w10, #4 | |||
| beq WriteEnd | |||
| st1 {v24.4s, v25.4s}, [x18], x17 | |||
| st1 {v16.4s, v17.4s}, [x18], x17 | |||
| cmp w10, #5 | |||
| beq WriteEnd | |||
| st1 {v26.4s, v27.4s}, [x18], x17 | |||
| st1 {v18.4s, v19.4s}, [x18], x17 | |||
| cmp w10, #6 | |||
| beq WriteEnd | |||
| st1 {v28.4s, v29.4s}, [x18], x17 | |||
| st1 {v20.4s, v21.4s}, [x18], x17 | |||
| cmp w10, #7 | |||
| beq WriteEnd | |||
| st1 {v22.4s, v23.4s}, [x18], x17 | |||
| cmp w10, #8 | |||
| beq WriteEnd | |||
| st1 {v24.4s, v25.4s}, [x18], x17 | |||
| cmp w10, #9 | |||
| beq WriteEnd | |||
| st1 {v26.4s, v27.4s}, [x18], x17 | |||
| cmp w10, #10 | |||
| beq WriteEnd | |||
| st1 {v28.4s, v29.4s}, [x18], x17 | |||
| cmp w10, #11 | |||
| beq WriteEnd | |||
| st1 {v30.4s, v31.4s}, [x18], x17 | |||
| WriteEnd: | |||
| subs w10, w10, #8 // lhs row - 8 | |||
| subs w10, w10, #12 // lhs row - 12 | |||
| bgt L2 | |||
| End2: | |||
| subs w7, w7, #8 // rhs col - 8 | |||
| add x1, x1, x15 // rhs ptr + stride | |||
| cbz x3, NoBiasStep | |||
| add x3, x3, #32 // bias ptr + stride | |||
| ldrb w13, [sp, #8] | |||
| cbz w13, NoDstStep | |||
| NoBiasStep: | |||
| cbnz x14, WinoDstStep | |||
| cbz x9, NoDstStep | |||
| add x2, x2, #32 // dst ptr + stride | |||
| b NoDstStep | |||
| WinoDstStep: | |||
| add x2, x2, x11 | |||
| NoDstStep: | |||
| bgt L1 | |||
| @@ -6,139 +6,761 @@ | |||
| .type MatmulFloatNeon64OptRemain, %function | |||
| #endif | |||
| // void MatmulFloatNeon64(const float *a, const float *b, float *c, int depth | |||
| // int row, int col, size_t stride) | |||
| // void MatmulFloatNeon64OptRemain(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, | |||
| // int row, int col, size_t stride, size_t writeMode) | |||
| // x0: a | |||
| // x1: b | |||
| // x2: c | |||
| // x3: depth | |||
| // x4: row | |||
| // x5: col | |||
| // x6: stride | |||
| // only for winograd | |||
| // x3: bias | |||
| // x4: act_type | |||
| // x5: depth | |||
| // x6: row | |||
| // x7: col | |||
| // x8: stride | |||
| // x9: writeMode | |||
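| // A minimal caller-side sketch (names such as ActType_Relu, OutType_Nhwc and the stride | |||
| // value are assumptions for illustration, not taken from this patch): | |||
| //   MatmulFloatNeon64OptRemain(a12_pack, b8_pack, c, bias, ActType_Relu, depth, row, col, | |||
| //                              col * sizeof(float), OutType_Nhwc); | |||
| // where a12_pack is the lhs packed to 12-row blocks, b8_pack the rhs packed to 8-column | |||
| // blocks, and act_type 1/2 selects the Relu/Relu6 clamps implemented below. | |||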
| MatmulFloatNeon64OptRemain: | |||
| mov x18, #32 // sizeof(float) * 8 | |||
| mul x9, x3, x18 // block stride of lhs/rhs: sizeof(float) * 8 * depth | |||
| sub sp, sp, #144 | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| stp x19, x20, [sp], #16 | |||
| ldr x8, [sp] | |||
| ldr x9, [sp, #8] | |||
| mov x18, #48 // sizeof(float) * 12 | |||
| mul x17, x5, x18 // block stride of lhs: sizeof(float) * 12 * depth | |||
| cbnz x9, NoC8Steps | |||
| mov x11, x2 | |||
| mov x18, #32 | |||
| mul x16, x6, x18 // row * 8 * sizeof(float) | |||
| NoC8Steps: | |||
| cmp x9, #2 | |||
| bne NoWinoSteps | |||
| mov x18, #4 | |||
| mul x15, x7, x8 | |||
| mul x15, x15, x18 // kernel_size * col * sizeof(float) | |||
| mov x18, #32 | |||
| mul x16, x8, x18 // kernel_size * 8 * sizeof(float) | |||
| NoWinoSteps: | |||
| mov x18, #4 | |||
| mul x8, x5, x6 | |||
| mov x11, #8 | |||
| mul x11, x11, x6 | |||
| mul x8, x8, x18 | |||
| mul x11, x11, x18 | |||
| cmp x4, #4 | |||
| ble LoopH4 | |||
| LoopH8: | |||
| mov x10, x4 // reload lhs row | |||
| mov x12, x0 // reload lhs ptr | |||
| mov x18, x2 // reload dst ptr | |||
| LoopW8: | |||
| mov x16, x1 // reload rhs ptr | |||
| mov x13, x3 // reload depth | |||
| dup v16.4s, wzr | |||
| dup v17.4s, wzr | |||
| dup v18.4s, wzr | |||
| dup v19.4s, wzr | |||
| dup v20.4s, wzr | |||
| dup v21.4s, wzr | |||
| dup v22.4s, wzr | |||
| dup v23.4s, wzr | |||
| dup v24.4s, wzr | |||
| dup v25.4s, wzr | |||
| dup v26.4s, wzr | |||
| dup v27.4s, wzr | |||
| dup v28.4s, wzr | |||
| dup v29.4s, wzr | |||
| dup v30.4s, wzr | |||
| dup v31.4s, wzr | |||
| LoopD8: | |||
| ld1 {v0.4s, v1.4s, v2.4s}, [x12], #48 | |||
| ld1 {v3.4s, v4.4s}, [x16], #32 | |||
| fmla v16.4s, v3.4s, v0.s[0] | |||
| fmla v18.4s, v3.4s, v0.s[1] | |||
| fmla v20.4s, v3.4s, v0.s[2] | |||
| fmla v22.4s, v3.4s, v0.s[3] | |||
| fmla v17.4s, v4.4s, v0.s[0] | |||
| fmla v19.4s, v4.4s, v0.s[1] | |||
| fmla v21.4s, v4.4s, v0.s[2] | |||
| fmla v23.4s, v4.4s, v0.s[3] | |||
| fmla v24.4s, v3.4s, v1.s[0] | |||
| fmla v26.4s, v3.4s, v1.s[1] | |||
| fmla v28.4s, v3.4s, v1.s[2] | |||
| fmla v30.4s, v3.4s, v1.s[3] | |||
| fmla v25.4s, v4.4s, v1.s[0] | |||
| fmla v27.4s, v4.4s, v1.s[1] | |||
| fmla v29.4s, v4.4s, v1.s[2] | |||
| fmla v31.4s, v4.4s, v1.s[3] | |||
| subs w13, w13, #1 | |||
| bgt LoopD8 | |||
| st1 {v16.4s, v17.4s}, [x18], x8 | |||
| st1 {v18.4s, v19.4s}, [x18], x8 | |||
| st1 {v20.4s, v21.4s}, [x18], x8 | |||
| st1 {v22.4s, v23.4s}, [x18], x8 | |||
| st1 {v24.4s, v25.4s}, [x18], x8 | |||
| st1 {v26.4s, v27.4s}, [x18], x8 | |||
| st1 {v28.4s, v29.4s}, [x18], x8 | |||
| st1 {v30.4s, v31.4s}, [x18], x8 | |||
| subs x10, x10, #8 // lhs row - 8 | |||
| bgt LoopW8 | |||
| subs x5, x5, #8 // rhs col - 8 | |||
| add x1, x1, x9 // rhs ptr + stride | |||
| add x2, x2, x11 | |||
| bgt LoopH8 | |||
| ret | |||
| LoopH4: | |||
| mov x10, x4 // reload lhs row | |||
| mov x12, x0 // reload lhs ptr | |||
| mov x18, x2 // reload dst ptr | |||
| LoopW4: | |||
| mov x16, x1 // reload rhs ptr | |||
| mov x13, x3 // reload depth | |||
| dup v16.4s, wzr | |||
| dup v17.4s, wzr | |||
| dup v18.4s, wzr | |||
| dup v19.4s, wzr | |||
| dup v20.4s, wzr | |||
| dup v21.4s, wzr | |||
| dup v22.4s, wzr | |||
| dup v23.4s, wzr | |||
| LoopD4: | |||
| ld1 {v0.4s, v1.4s, v2.4s}, [x12], #48 | |||
| ld1 {v3.4s, v4.4s}, [x16], #32 | |||
| fmla v16.4s, v3.4s, v0.s[0] | |||
| fmla v18.4s, v3.4s, v0.s[1] | |||
| fmla v20.4s, v3.4s, v0.s[2] | |||
| fmla v22.4s, v3.4s, v0.s[3] | |||
| fmla v17.4s, v4.4s, v0.s[0] | |||
| fmla v19.4s, v4.4s, v0.s[1] | |||
| fmla v21.4s, v4.4s, v0.s[2] | |||
| fmla v23.4s, v4.4s, v0.s[3] | |||
| subs x13, x13, #1 | |||
| bgt LoopD4 | |||
| st1 {v16.4s, v17.4s}, [x18], x8 | |||
| st1 {v18.4s, v19.4s}, [x18], x8 | |||
| st1 {v20.4s, v21.4s}, [x18], x8 | |||
| st1 {v22.4s, v23.4s}, [x18], x8 | |||
| subs x10, x10, #4 // lhs row - 4 | |||
| bgt LoopW4 | |||
| subs x5, x5, #8 // rhs col - 8 | |||
| add x1, x1, x9 // rhs ptr + stride | |||
| add x2, x2, x11 | |||
| bgt LoopH4 | |||
| ret | |||
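| // Row-tiled main path: LoopRow steps through the packed lhs in 12-row strides, picking the | |||
| // 8-row (LoopRow8) or 4-row (LoopRow4) register tile; LoopCol8/LoopCol4 walk the rhs in | |||
| // 8-column blocks, with the *Half* variants covering a <=4-column tail; LoopDepth* | |||
| // accumulates over depth, then Bias*/Activation*/Write post-process and store each tile. | |||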
| LoopRow: | |||
| cmp x6, #4 | |||
| ble LoopRow4 | |||
| LoopRow8: | |||
| mov x14, x1 // reload rhs ptr | |||
| mov x13, x7 // reload rhs col | |||
| mov x12, x3 // reload bias | |||
| LoopCol8: | |||
| cbz x9, NoReloadDst8 | |||
| mov x11, x2 | |||
| NoReloadDst8: | |||
| mov x10, x0 // reload lhs ptr | |||
| mov x19, x5 // reload depth | |||
| cmp x13, #4 | |||
| ble LoopDepthStartHalf8 | |||
| LoopDepthStart8: | |||
| ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48 | |||
| ld1 {v3.4s, v4.4s}, [x14], #32 | |||
| fmul v8.4s, v3.4s, v0.s[0] | |||
| fmul v10.4s, v3.4s, v0.s[1] | |||
| fmul v12.4s, v3.4s, v0.s[2] | |||
| fmul v14.4s, v3.4s, v0.s[3] | |||
| fmul v9.4s, v4.4s, v0.s[0] | |||
| fmul v11.4s, v4.4s, v0.s[1] | |||
| fmul v13.4s, v4.4s, v0.s[2] | |||
| fmul v15.4s, v4.4s, v0.s[3] | |||
| fmul v16.4s, v3.4s, v1.s[0] | |||
| fmul v18.4s, v3.4s, v1.s[1] | |||
| fmul v20.4s, v3.4s, v1.s[2] | |||
| fmul v22.4s, v3.4s, v1.s[3] | |||
| fmul v17.4s, v4.4s, v1.s[0] | |||
| fmul v19.4s, v4.4s, v1.s[1] | |||
| fmul v21.4s, v4.4s, v1.s[2] | |||
| fmul v23.4s, v4.4s, v1.s[3] | |||
| subs x19, x19, #1 | |||
| beq Bias8 | |||
| LoopDepth8: | |||
| ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48 | |||
| ld1 {v3.4s, v4.4s}, [x14], #32 | |||
| fmla v8.4s, v3.4s, v0.s[0] | |||
| fmla v10.4s, v3.4s, v0.s[1] | |||
| fmla v12.4s, v3.4s, v0.s[2] | |||
| fmla v14.4s, v3.4s, v0.s[3] | |||
| fmla v9.4s, v4.4s, v0.s[0] | |||
| fmla v11.4s, v4.4s, v0.s[1] | |||
| fmla v13.4s, v4.4s, v0.s[2] | |||
| fmla v15.4s, v4.4s, v0.s[3] | |||
| fmla v16.4s, v3.4s, v1.s[0] | |||
| fmla v18.4s, v3.4s, v1.s[1] | |||
| fmla v20.4s, v3.4s, v1.s[2] | |||
| fmla v22.4s, v3.4s, v1.s[3] | |||
| fmla v17.4s, v4.4s, v1.s[0] | |||
| fmla v19.4s, v4.4s, v1.s[1] | |||
| fmla v21.4s, v4.4s, v1.s[2] | |||
| fmla v23.4s, v4.4s, v1.s[3] | |||
| subs x19, x19, #1 | |||
| bgt LoopDepth8 | |||
| Bias8: | |||
| cbz x3, Activation8 | |||
| ld1 {v0.4s}, [x12], #16 | |||
| ld1 {v1.4s}, [x12], #16 | |||
| fadd v8.4s, v8.4s, v0.4s | |||
| fadd v9.4s, v9.4s, v1.4s | |||
| fadd v10.4s, v10.4s, v0.4s | |||
| fadd v11.4s, v11.4s, v1.4s | |||
| fadd v12.4s, v12.4s, v0.4s | |||
| fadd v13.4s, v13.4s, v1.4s | |||
| fadd v14.4s, v14.4s, v0.4s | |||
| fadd v15.4s, v15.4s, v1.4s | |||
| fadd v16.4s, v16.4s, v0.4s | |||
| fadd v17.4s, v17.4s, v1.4s | |||
| fadd v18.4s, v18.4s, v0.4s | |||
| fadd v19.4s, v19.4s, v1.4s | |||
| fadd v20.4s, v20.4s, v0.4s | |||
| fadd v21.4s, v21.4s, v1.4s | |||
| fadd v22.4s, v22.4s, v0.4s | |||
| fadd v23.4s, v23.4s, v1.4s | |||
| Activation8: | |||
| cmp x4, #2 | |||
| beq Relu68 | |||
| cmp x4, #1 | |||
| beq Relu8 | |||
| b Write | |||
| Relu68: | |||
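| // build the 6.0f clamp: broadcast the integer 6 into v2, then convert its lanes to float | |||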
| mov w19, #6 | |||
| dup v2.4s, w19 | |||
| scvtf v2.4s, v2.4s | |||
| fmin v8.4s, v8.4s, v2.4s | |||
| fmin v9.4s, v9.4s, v2.4s | |||
| fmin v10.4s, v10.4s, v2.4s | |||
| fmin v11.4s, v11.4s, v2.4s | |||
| fmin v12.4s, v12.4s, v2.4s | |||
| fmin v13.4s, v13.4s, v2.4s | |||
| fmin v14.4s, v14.4s, v2.4s | |||
| fmin v15.4s, v15.4s, v2.4s | |||
| fmin v16.4s, v16.4s, v2.4s | |||
| fmin v17.4s, v17.4s, v2.4s | |||
| fmin v18.4s, v18.4s, v2.4s | |||
| fmin v19.4s, v19.4s, v2.4s | |||
| fmin v20.4s, v20.4s, v2.4s | |||
| fmin v21.4s, v21.4s, v2.4s | |||
| fmin v22.4s, v22.4s, v2.4s | |||
| fmin v23.4s, v23.4s, v2.4s | |||
| Relu8: | |||
| dup v3.4s, wzr | |||
| fmax v8.4s, v8.4s, v3.4s | |||
| fmax v9.4s, v9.4s, v3.4s | |||
| fmax v10.4s, v10.4s, v3.4s | |||
| fmax v11.4s, v11.4s, v3.4s | |||
| fmax v12.4s, v12.4s, v3.4s | |||
| fmax v13.4s, v13.4s, v3.4s | |||
| fmax v14.4s, v14.4s, v3.4s | |||
| fmax v15.4s, v15.4s, v3.4s | |||
| fmax v16.4s, v16.4s, v3.4s | |||
| fmax v17.4s, v17.4s, v3.4s | |||
| fmax v18.4s, v18.4s, v3.4s | |||
| fmax v19.4s, v19.4s, v3.4s | |||
| fmax v20.4s, v20.4s, v3.4s | |||
| fmax v21.4s, v21.4s, v3.4s | |||
| fmax v22.4s, v22.4s, v3.4s | |||
| fmax v23.4s, v23.4s, v3.4s | |||
| b Write | |||
| LoopDepthStartHalf8: | |||
| ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48 | |||
| ld1 {v3.4s, v4.4s}, [x14], #32 | |||
| fmul v8.4s, v3.4s, v0.s[0] | |||
| fmul v10.4s, v3.4s, v0.s[1] | |||
| fmul v12.4s, v3.4s, v0.s[2] | |||
| fmul v14.4s, v3.4s, v0.s[3] | |||
| fmul v16.4s, v3.4s, v1.s[0] | |||
| fmul v18.4s, v3.4s, v1.s[1] | |||
| fmul v20.4s, v3.4s, v1.s[2] | |||
| fmul v22.4s, v3.4s, v1.s[3] | |||
| subs x19, x19, #1 | |||
| beq BiasHalf8 | |||
| LoopDepthHalf8: | |||
| ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48 | |||
| ld1 {v3.4s, v4.4s}, [x14], #32 | |||
| fmla v8.4s, v3.4s, v0.s[0] | |||
| fmla v10.4s, v3.4s, v0.s[1] | |||
| fmla v12.4s, v3.4s, v0.s[2] | |||
| fmla v14.4s, v3.4s, v0.s[3] | |||
| fmla v16.4s, v3.4s, v1.s[0] | |||
| fmla v18.4s, v3.4s, v1.s[1] | |||
| fmla v20.4s, v3.4s, v1.s[2] | |||
| fmla v22.4s, v3.4s, v1.s[3] | |||
| subs x19, x19, #1 | |||
| bgt LoopDepthHalf8 | |||
| BiasHalf8: | |||
| cbz x3, ActivationHalf8 | |||
| ld1 {v0.4s}, [x12], #16 | |||
| ld1 {v1.4s}, [x12], #16 | |||
| fadd v8.4s, v8.4s, v0.4s | |||
| fadd v10.4s, v10.4s, v0.4s | |||
| fadd v12.4s, v12.4s, v0.4s | |||
| fadd v14.4s, v14.4s, v0.4s | |||
| fadd v16.4s, v16.4s, v0.4s | |||
| fadd v18.4s, v18.4s, v0.4s | |||
| fadd v20.4s, v20.4s, v0.4s | |||
| fadd v22.4s, v22.4s, v0.4s | |||
| ActivationHalf8: | |||
| cmp x4, #2 | |||
| beq Relu6Half8 | |||
| cmp x4, #1 | |||
| beq ReluHalf8 | |||
| b Write | |||
| Relu6Half8: | |||
| mov w19, #6 | |||
| dup v2.4s, w19 | |||
| scvtf v2.4s, v2.4s | |||
| fmin v8.4s, v8.4s, v2.4s | |||
| fmin v10.4s, v10.4s, v2.4s | |||
| fmin v12.4s, v12.4s, v2.4s | |||
| fmin v14.4s, v14.4s, v2.4s | |||
| fmin v16.4s, v16.4s, v2.4s | |||
| fmin v18.4s, v18.4s, v2.4s | |||
| fmin v20.4s, v20.4s, v2.4s | |||
| fmin v22.4s, v22.4s, v2.4s | |||
| ReluHalf8: | |||
| dup v3.4s, wzr | |||
| fmax v8.4s, v8.4s, v3.4s | |||
| fmax v10.4s, v10.4s, v3.4s | |||
| fmax v12.4s, v12.4s, v3.4s | |||
| fmax v14.4s, v14.4s, v3.4s | |||
| fmax v16.4s, v16.4s, v3.4s | |||
| fmax v18.4s, v18.4s, v3.4s | |||
| fmax v20.4s, v20.4s, v3.4s | |||
| fmax v22.4s, v22.4s, v3.4s | |||
| b Write | |||
| LoopRow4: | |||
| mov x14, x1 // reload rhs ptr | |||
| mov x13, x7 // reload rhs col | |||
| mov x12, x3 // reload bias | |||
| LoopCol4: | |||
| cbz x9, NoReloadDst4 | |||
| mov x11, x2 | |||
| NoReloadDst4: | |||
| mov x10, x0 // reload lhs ptr | |||
| mov x19, x5 // reload depth | |||
| cmp x13, #4 | |||
| ble LoopDepthStartHalf4 | |||
| LoopDepthStart4: | |||
| ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48 | |||
| ld1 {v3.4s, v4.4s}, [x14], #32 | |||
| fmul v8.4s, v3.4s, v0.s[0] | |||
| fmul v10.4s, v3.4s, v0.s[1] | |||
| fmul v12.4s, v3.4s, v0.s[2] | |||
| fmul v14.4s, v3.4s, v0.s[3] | |||
| fmul v9.4s, v4.4s, v0.s[0] | |||
| fmul v11.4s, v4.4s, v0.s[1] | |||
| fmul v13.4s, v4.4s, v0.s[2] | |||
| fmul v15.4s, v4.4s, v0.s[3] | |||
| subs x19, x19, #1 | |||
| beq Bias4 | |||
| LoopDepth4: | |||
| ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48 | |||
| ld1 {v3.4s, v4.4s}, [x14], #32 | |||
| fmla v8.4s, v3.4s, v0.s[0] | |||
| fmla v10.4s, v3.4s, v0.s[1] | |||
| fmla v12.4s, v3.4s, v0.s[2] | |||
| fmla v14.4s, v3.4s, v0.s[3] | |||
| fmla v9.4s, v4.4s, v0.s[0] | |||
| fmla v11.4s, v4.4s, v0.s[1] | |||
| fmla v13.4s, v4.4s, v0.s[2] | |||
| fmla v15.4s, v4.4s, v0.s[3] | |||
| subs x19, x19, #1 | |||
| bgt LoopDepth4 | |||
| Bias4: | |||
| cbz x3, Activation4 | |||
| ld1 {v0.4s}, [x12], #16 | |||
| ld1 {v1.4s}, [x12], #16 | |||
| fadd v8.4s, v8.4s, v0.4s | |||
| fadd v9.4s, v9.4s, v1.4s | |||
| fadd v10.4s, v10.4s, v0.4s | |||
| fadd v11.4s, v11.4s, v1.4s | |||
| fadd v12.4s, v12.4s, v0.4s | |||
| fadd v13.4s, v13.4s, v1.4s | |||
| fadd v14.4s, v14.4s, v0.4s | |||
| fadd v15.4s, v15.4s, v1.4s | |||
| Activation4: | |||
| cmp x4, #2 | |||
| beq Relu64 | |||
| cmp x4, #1 | |||
| beq Relu4 | |||
| b Write | |||
| Relu64: | |||
| mov w19, #6 | |||
| dup v2.4s, w19 | |||
| scvtf v2.4s, v2.4s | |||
| fmin v8.4s, v8.4s, v2.4s | |||
| fmin v9.4s, v9.4s, v2.4s | |||
| fmin v10.4s, v10.4s, v2.4s | |||
| fmin v11.4s, v11.4s, v2.4s | |||
| fmin v12.4s, v12.4s, v2.4s | |||
| fmin v13.4s, v13.4s, v2.4s | |||
| fmin v14.4s, v14.4s, v2.4s | |||
| fmin v15.4s, v15.4s, v2.4s | |||
| Relu4: | |||
| dup v3.4s, wzr | |||
| fmax v8.4s, v8.4s, v3.4s | |||
| fmax v9.4s, v9.4s, v3.4s | |||
| fmax v10.4s, v10.4s, v3.4s | |||
| fmax v11.4s, v11.4s, v3.4s | |||
| fmax v12.4s, v12.4s, v3.4s | |||
| fmax v13.4s, v13.4s, v3.4s | |||
| fmax v14.4s, v14.4s, v3.4s | |||
| fmax v15.4s, v15.4s, v3.4s | |||
| b Write | |||
| LoopDepthStartHalf4: | |||
| ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48 | |||
| ld1 {v3.4s, v4.4s}, [x14], #32 | |||
| fmul v8.4s, v3.4s, v0.s[0] | |||
| fmul v10.4s, v3.4s, v0.s[1] | |||
| fmul v12.4s, v3.4s, v0.s[2] | |||
| fmul v14.4s, v3.4s, v0.s[3] | |||
| subs x19, x19, #1 | |||
| beq BiasHalf4 | |||
| LoopDepthHalf4: | |||
| ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48 | |||
| ld1 {v3.4s, v4.4s}, [x14], #32 | |||
| fmla v8.4s, v3.4s, v0.s[0] | |||
| fmla v10.4s, v3.4s, v0.s[1] | |||
| fmla v12.4s, v3.4s, v0.s[2] | |||
| fmla v14.4s, v3.4s, v0.s[3] | |||
| subs x19, x19, #1 | |||
| bgt LoopDepthHalf4 | |||
| BiasHalf4: | |||
| cbz x3, ActivationHalf4 | |||
| ld1 {v0.4s}, [x12], #16 | |||
| ld1 {v1.4s}, [x12], #16 | |||
| fadd v8.4s, v8.4s, v0.4s | |||
| fadd v10.4s, v10.4s, v0.4s | |||
| fadd v12.4s, v12.4s, v0.4s | |||
| fadd v14.4s, v14.4s, v0.4s | |||
| ActivationHalf4: | |||
| cmp x4, #2 | |||
| beq Relu6Half4 | |||
| cmp x4, #1 | |||
| beq ReluHalf4 | |||
| b Write | |||
| Relu6Half4: | |||
| mov w19, #6 | |||
| dup v2.4s, w19 | |||
| scvtf v2.4s, v2.4s | |||
| fmin v8.4s, v8.4s, v2.4s | |||
| fmin v10.4s, v10.4s, v2.4s | |||
| fmin v12.4s, v12.4s, v2.4s | |||
| fmin v14.4s, v14.4s, v2.4s | |||
| ReluHalf4: | |||
| dup v3.4s, wzr | |||
| fmax v8.4s, v8.4s, v3.4s | |||
| fmax v10.4s, v10.4s, v3.4s | |||
| fmax v12.4s, v12.4s, v3.4s | |||
| fmax v14.4s, v14.4s, v3.4s | |||
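| // Write dispatch: x9 selects the layout (0 writes the C8-packed block, 2 the winograd | |||
| // layout); otherwise x13 holds the remaining output columns (<=8) and picks the matching | |||
| // partial-store path Write1..Write8. | |||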
| Write: | |||
| cmp x9, #2 | |||
| beq WriteWino | |||
| cbz x9, WriteC8 | |||
| cmp x13, #1 | |||
| beq Write1 | |||
| cmp x13, #2 | |||
| beq Write2 | |||
| cmp x13, #3 | |||
| beq Write3 | |||
| cmp x13, #4 | |||
| beq Write4 | |||
| cmp x13, #5 | |||
| beq Write5 | |||
| cmp x13, #6 | |||
| beq Write6 | |||
| cmp x13, #7 | |||
| beq Write7 | |||
| b Write8 | |||
| Write1: | |||
| add x2, x2, #4 | |||
| str s8, [x11] | |||
| cmp x6, #1 | |||
| beq WriteEnd | |||
| add x11, x11, x8 | |||
| str s10, [x11] | |||
| cmp x6, #2 | |||
| beq WriteEnd | |||
| add x11, x11, x8 | |||
| str s12, [x11] | |||
| cmp x6, #3 | |||
| beq WriteEnd | |||
| add x11, x11, x8 | |||
| str s14, [x11] | |||
| cmp x6, #4 | |||
| beq WriteEnd | |||
| add x11, x11, x8 | |||
| str s16, [x11] | |||
| cmp x6, #5 | |||
| beq WriteEnd | |||
| add x11, x11, x8 | |||
| str s18, [x11] | |||
| cmp x6, #6 | |||
| beq WriteEnd | |||
| add x11, x11, x8 | |||
| str s20, [x11] | |||
| cmp x6, #7 | |||
| beq WriteEnd | |||
| add x11, x11, x8 | |||
| str s22, [x11] | |||
| add x11, x11, x8 | |||
| add x11, x11, #4 | |||
| b WriteEnd | |||
| Write2: | |||
| add x2, x2, #8 | |||
| str d8, [x11] | |||
| cmp x6, #1 | |||
| beq WriteEnd | |||
| add x11, x11, x8 | |||
| str d10, [x11] | |||
| cmp x6, #2 | |||
| beq WriteEnd | |||
| add x11, x11, x8 | |||
| str d12, [x11] | |||
| cmp x6, #3 | |||
| beq WriteEnd | |||
| add x11, x11, x8 | |||
| str d14, [x11] | |||
| cmp x6, #4 | |||
| beq WriteEnd | |||
| add x11, x11, x8 | |||
| str d16, [x11] | |||
| cmp x6, #5 | |||
| beq WriteEnd | |||
| add x11, x11, x8 | |||
| str d18, [x11] | |||
| cmp x6, #6 | |||
| beq WriteEnd | |||
| add x11, x11, x8 | |||
| str d20, [x11] | |||
| cmp x6, #7 | |||
| beq WriteEnd | |||
| add x11, x11, x8 | |||
| str d22, [x11] | |||
| add x11, x11, x8 | |||
| add x11, x11, #8 | |||
| b WriteEnd | |||
| Write3: | |||
| add x2, x2, #12 | |||
| add x19, x11, #8 | |||
| str d8, [x11] | |||
| st1 {v8.s}[2], [x19], x8 | |||
| cmp x6, #1 | |||
| beq WriteEnd | |||
| add x11, x11, x8 | |||
| str d10, [x11] | |||
| st1 {v10.s}[2], [x19], x8 | |||
| cmp x6, #2 | |||
| beq WriteEnd | |||
| add x11, x11, x8 | |||
| str d12, [x11] | |||
| st1 {v12.s}[2], [x19], x8 | |||
| cmp x6, #3 | |||
| beq WriteEnd | |||
| add x11, x11, x8 | |||
| str d14, [x11] | |||
| st1 {v14.s}[2], [x19], x8 | |||
| cmp x6, #4 | |||
| beq WriteEnd | |||
| add x11, x11, x8 | |||
| str d16, [x11] | |||
| st1 {v16.s}[2], [x19], x8 | |||
| cmp x6, #5 | |||
| beq WriteEnd | |||
| add x11, x11, x8 | |||
| str d18, [x11] | |||
| st1 {v18.s}[2], [x19], x8 | |||
| cmp x6, #6 | |||
| beq WriteEnd | |||
| add x11, x11, x8 | |||
| str d20, [x11] | |||
| st1 {v20.s}[2], [x19], x8 | |||
| cmp x6, #7 | |||
| beq WriteEnd | |||
| add x11, x11, x8 | |||
| str d22, [x11] | |||
| st1 {v22.s}[2], [x19], x8 | |||
| add x11, x11, x8 | |||
| add x11, x11, #12 | |||
| b WriteEnd | |||
| Write4: | |||
| add x2, x2, #16 | |||
| st1 {v8.4s}, [x11], x8 | |||
| cmp x6, #1 | |||
| beq WriteEnd | |||
| st1 {v10.4s}, [x11], x8 | |||
| cmp x6, #2 | |||
| beq WriteEnd | |||
| st1 {v12.4s}, [x11], x8 | |||
| cmp x6, #3 | |||
| beq WriteEnd | |||
| st1 {v14.4s}, [x11], x8 | |||
| cmp x6, #4 | |||
| beq WriteEnd | |||
| st1 {v16.4s}, [x11], x8 | |||
| cmp x6, #5 | |||
| beq WriteEnd | |||
| st1 {v18.4s}, [x11], x8 | |||
| cmp x6, #6 | |||
| beq WriteEnd | |||
| st1 {v20.4s}, [x11], x8 | |||
| cmp x6, #7 | |||
| beq WriteEnd | |||
| st1 {v22.4s}, [x11], x8 | |||
| add x11, x11, #16 | |||
| b WriteEnd | |||
| Write5: | |||
| add x2, x2, #20 | |||
| add x19, x11, #16 | |||
| st1 {v8.4s}, [x11], x8 | |||
| str s9, [x19] | |||
| cmp x6, #1 | |||
| beq WriteEnd | |||
| add x19, x19, x8 | |||
| st1 {v10.4s}, [x11], x8 | |||
| str s11, [x19] | |||
| cmp x6, #2 | |||
| beq WriteEnd | |||
| add x19, x19, x8 | |||
| st1 {v12.4s}, [x11], x8 | |||
| str s13, [x19] | |||
| cmp x6, #3 | |||
| beq WriteEnd | |||
| add x19, x19, x8 | |||
| st1 {v14.4s}, [x11], x8 | |||
| str s15, [x19] | |||
| cmp x6, #4 | |||
| beq WriteEnd | |||
| add x19, x19, x8 | |||
| st1 {v16.4s}, [x11], x8 | |||
| str s17, [x19] | |||
| cmp x6, #5 | |||
| beq WriteEnd | |||
| add x19, x19, x8 | |||
| st1 {v18.4s}, [x11], x8 | |||
| str s19, [x19] | |||
| cmp x6, #6 | |||
| beq WriteEnd | |||
| add x19, x19, x8 | |||
| st1 {v20.4s}, [x11], x8 | |||
| str s21, [x19] | |||
| cmp x6, #7 | |||
| beq WriteEnd | |||
| add x19, x19, x8 | |||
| st1 {v22.4s}, [x11], x8 | |||
| str s23, [x19] | |||
| add x11, x11, #20 | |||
| b WriteEnd | |||
| Write6: | |||
| add x2, x2, #24 | |||
| add x19, x11, #16 | |||
| st1 {v8.4s}, [x11], x8 | |||
| str d9, [x19] | |||
| cmp x6, #1 | |||
| beq WriteEnd | |||
| add x19, x19, x8 | |||
| st1 {v10.4s}, [x11], x8 | |||
| str d11, [x19] | |||
| cmp x6, #2 | |||
| beq WriteEnd | |||
| add x19, x19, x8 | |||
| st1 {v12.4s}, [x11], x8 | |||
| str d13, [x19] | |||
| cmp x6, #3 | |||
| beq WriteEnd | |||
| add x19, x19, x8 | |||
| st1 {v14.4s}, [x11], x8 | |||
| str d15, [x19] | |||
| cmp x6, #4 | |||
| beq WriteEnd | |||
| add x19, x19, x8 | |||
| st1 {v16.4s}, [x11], x8 | |||
| str d17, [x19] | |||
| cmp x6, #5 | |||
| beq WriteEnd | |||
| add x19, x19, x8 | |||
| st1 {v18.4s}, [x11], x8 | |||
| str d19, [x19] | |||
| cmp x6, #6 | |||
| beq WriteEnd | |||
| add x19, x19, x8 | |||
| st1 {v20.4s}, [x11], x8 | |||
| str d21, [x19] | |||
| cmp x6, #7 | |||
| beq WriteEnd | |||
| add x19, x19, x8 | |||
| st1 {v22.4s}, [x11], x8 | |||
| str d23, [x19] | |||
| add x11, x11, #24 | |||
| b WriteEnd | |||
| Write7: | |||
| add x2, x2, #28 | |||
| add x19, x11, #16 | |||
| add x20, x11, #24 | |||
| st1 {v8.4s}, [x11], x8 | |||
| str d9, [x19] | |||
| st1 {v9.s}[2], [x20], x8 | |||
| cmp x6, #1 | |||
| beq WriteEnd | |||
| add x19, x19, x8 | |||
| st1 {v10.4s}, [x11], x8 | |||
| str d11, [x19] | |||
| st1 {v11.s}[2], [x20], x8 | |||
| cmp x6, #2 | |||
| beq WriteEnd | |||
| add x19, x19, x8 | |||
| st1 {v12.4s}, [x11], x8 | |||
| str d13, [x19] | |||
| st1 {v13.s}[2], [x20], x8 | |||
| cmp x6, #3 | |||
| beq WriteEnd | |||
| add x19, x19, x8 | |||
| st1 {v14.4s}, [x11], x8 | |||
| str d15, [x19] | |||
| st1 {v15.s}[2], [x20], x8 | |||
| cmp x6, #4 | |||
| beq WriteEnd | |||
| add x19, x19, x8 | |||
| st1 {v16.4s}, [x11], x8 | |||
| str d17, [x19] | |||
| st1 {v17.s}[2], [x20], x8 | |||
| cmp x6, #5 | |||
| beq WriteEnd | |||
| add x19, x19, x8 | |||
| st1 {v18.4s}, [x11], x8 | |||
| str d19, [x19] | |||
| st1 {v19.s}[2], [x20], x8 | |||
| cmp x6, #6 | |||
| beq WriteEnd | |||
| add x19, x19, x8 | |||
| st1 {v20.4s}, [x11], x8 | |||
| str d21, [x19] | |||
| st1 {v21.s}[2], [x20], x8 | |||
| cmp x6, #7 | |||
| beq WriteEnd | |||
| add x19, x19, x8 | |||
| st1 {v22.4s}, [x11], x8 | |||
| str d23, [x19] | |||
| st1 {v23.s}[2], [x20], x8 | |||
| add x11, x11, #28 | |||
| b WriteEnd | |||
| WriteC8: | |||
| mov x19, x11 | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x19], #64 | |||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x19], #64 | |||
| st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x19], #64 | |||
| st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x19], #64 | |||
| add x11, x11, x16 | |||
| b WriteEnd | |||
| WriteWino: | |||
| add x2, x11, x16 | |||
| st1 {v8.4s, v9.4s}, [x11], x15 | |||
| st1 {v10.4s, v11.4s}, [x11], x15 | |||
| st1 {v12.4s, v13.4s}, [x11], x15 | |||
| st1 {v14.4s, v15.4s}, [x11], x15 | |||
| st1 {v16.4s, v17.4s}, [x11], x15 | |||
| st1 {v18.4s, v19.4s}, [x11], x15 | |||
| st1 {v20.4s, v21.4s}, [x11], x15 | |||
| st1 {v22.4s, v23.4s}, [x11], x15 | |||
| b WriteEnd | |||
| Write8: | |||
| add x2, x2, #32 | |||
| st1 {v8.4s, v9.4s}, [x11], x8 | |||
| cmp x6, #1 | |||
| beq WriteEnd | |||
| st1 {v10.4s, v11.4s}, [x11], x8 | |||
| cmp x6, #2 | |||
| beq WriteEnd | |||
| st1 {v12.4s, v13.4s}, [x11], x8 | |||
| cmp x6, #3 | |||
| beq WriteEnd | |||
| st1 {v14.4s, v15.4s}, [x11], x8 | |||
| cmp x6, #4 | |||
| beq WriteEnd | |||
| st1 {v16.4s, v17.4s}, [x11], x8 | |||
| cmp x6, #5 | |||
| beq WriteEnd | |||
| st1 {v18.4s, v19.4s}, [x11], x8 | |||
| cmp x6, #6 | |||
| beq WriteEnd | |||
| st1 {v20.4s, v21.4s}, [x11], x8 | |||
| cmp x6, #7 | |||
| beq WriteEnd | |||
| st1 {v22.4s, v23.4s}, [x11], x8 | |||
| add x11, x11, #32 | |||
| WriteEnd: | |||
| subs x13, x13, #8 // rhs col - 8 | |||
| ble LoopColEnd | |||
| cmp x6, #4 | |||
| ble LoopCol4 | |||
| b LoopCol8 | |||
| LoopColEnd: | |||
| add x0, x0, x17 | |||
| cbz x9, C8DstStep | |||
| mov x18, #4 | |||
| mul x18, x18, x7 | |||
| sub x11, x11, x18 | |||
| mov x2, x11 | |||
| b NoDstStep | |||
| C8DstStep: | |||
| add x2, x2, #384 // dst ptr + 12 * 8 * sizeof(float) | |||
| mov x11, x2 | |||
| NoDstStep: | |||
| subs x6, x6, #12 | |||
| bgt LoopRow | |||
| sub sp, sp, #144 | |||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| ldp x19, x20, [sp], #16 | |||
| ret | |||
| #endif | |||
| @@ -27,137 +27,6 @@ int offsetComm(const int *shape, const int dim0, const int dim1, const int dim2) | |||
| int offset4d(const int *shape, const int *dims) { return offset(shape, dims[0], dims[1], dims[2], dims[3]); } | |||
| #ifndef ENABLE_ARM64 | |||
| void IndirectGemmFp32(float *output, const float *input, const float *weight, const float *bias, size_t step, int ic4, | |||
| int output_channel, size_t offset, size_t relu, size_t relu6) { | |||
| for (int i = 0; i < TILE_NUM; i++) { | |||
| int input_tile_offset = i * C4NUM; | |||
| int output_tile_offset = i * output_channel; | |||
| for (int j = 0; j < output_channel; j++) { | |||
| int oc8_block = j / C8NUM; | |||
| int oc8_res = j % C8NUM; | |||
| int weight_oc_offset = oc8_block * step * ic4 * C4NUM * C8NUM + oc8_res; | |||
| int out_oc_offset = output_tile_offset + j; | |||
| float acc = 0; | |||
| for (int n = 0; n < step; n++) { | |||
| int input_kw_offset = input_tile_offset + n * ic4 * C4NUM * TILE_NUM; | |||
| int weight_kw_offset = weight_oc_offset + n * ic4 * C4NUM * C8NUM; | |||
| for (int k = 0; k < ic4; k++) { | |||
| int input_ic4_offset = input_kw_offset + k * TILE_NUM * C4NUM; | |||
| int weight_ic4_offset = weight_kw_offset + k * C4NUM * C8NUM; | |||
| for (int m = 0; m < C4NUM; m++) { | |||
| int input_ic_offset = input_ic4_offset + m; | |||
| int weight_ic_offset = weight_ic4_offset + m * C8NUM; | |||
| acc += (weight + weight_ic_offset)[0] * (input + input_ic_offset)[0]; | |||
| } | |||
| } | |||
| } | |||
| acc += bias[j]; | |||
| if (relu) { | |||
| acc = acc > 0 ? acc : 0; | |||
| } else if (relu6) { | |||
| if (acc < 0) { | |||
| acc = 0; | |||
| } else if (acc > 6) { | |||
| acc = 6; | |||
| } else { | |||
| } | |||
| } | |||
| (output + out_oc_offset)[0] = acc; | |||
| } | |||
| } | |||
| } | |||
| void IndirectGemmFp32_8x8(float *output, const float *input, const float *weight, const float *bias, size_t step, | |||
| size_t ic4, size_t output_channel, size_t offset, size_t mode, size_t writeC4, size_t relu, | |||
| size_t relu6) { | |||
| int oc4 = UP_DIV(output_channel, C4NUM); | |||
| if (mode && writeC4) { | |||
| for (int i = 0; i < TILE_NUM; i++) { | |||
| int input_tile_offset = i * C4NUM; | |||
| int output_tile_offset = i * oc4 * C4NUM * step; | |||
| for (int j = 0; j < output_channel; j++) { | |||
| int oc4_block = j / 4; | |||
| int oc4_res = j % 4; | |||
| int oc8_block = oc4_block / 2; | |||
| int oc8_res = oc4_block % 2; | |||
| int weight_oc_offset = oc8_block * step * ic4 * C4NUM * C8NUM + oc8_res * C4NUM + oc4_res; | |||
| int out_oc_offset = output_tile_offset + oc4_block * step * C4NUM + oc4_res; | |||
| for (int n = 0; n < step; n++) { | |||
| int input_kw_offset = input_tile_offset + n * ic4 * C4NUM * TILE_NUM; | |||
| int weight_kw_offset = weight_oc_offset + n * ic4 * C4NUM * C8NUM; | |||
| int output_kw_offset = out_oc_offset + n * C4NUM; | |||
| float acc = 0; | |||
| for (int k = 0; k < ic4; k++) { | |||
| int input_ic4_offset = input_kw_offset + k * TILE_NUM * C4NUM; | |||
| int weight_ic4_offset = weight_kw_offset + k * C4NUM * C8NUM; | |||
| for (int m = 0; m < 4; m++) { | |||
| int input_ic_offset = input_ic4_offset + m; | |||
| int weight_ic_offset = weight_ic4_offset + m * C8NUM; | |||
| acc += (weight + weight_ic_offset)[0] * (input + input_ic_offset)[0]; | |||
| } | |||
| } | |||
| (output + output_kw_offset)[0] = acc; | |||
| } | |||
| } | |||
| } | |||
| } else if (mode) { | |||
| IndirectGemmFp32_Comm(output, input, weight, ic4, C8NUM, output_channel, offset); | |||
| } else { | |||
| IndirectGemmFp32(output, input, weight, bias, step, ic4, output_channel, offset, relu, relu6); | |||
| } | |||
| } | |||
| #endif | |||
| #ifndef ENABLE_ARM32 | |||
| void IndirectGemmFp32_8x4(float *output, const float *input, const float *weight, const float *bias, size_t step, | |||
| size_t ic4, size_t output_channel, size_t offset, size_t mode, size_t writeC4, size_t relu, | |||
| size_t relu6) { | |||
| for (int i = 0; i < TILE_NUM; i++) { | |||
| int input_tile_offset = i * C4NUM; | |||
| int output_tile_offset = i * output_channel; | |||
| for (int j = 0; j < output_channel; j++) { | |||
| int oc4_block = j / C4NUM; | |||
| int oc4_res = j % C4NUM; | |||
| int weight_oc_offset = oc4_block * step * ic4 * C4NUM * C4NUM + oc4_res; | |||
| int out_oc_offset = output_tile_offset + j; | |||
| float acc = 0; | |||
| for (int n = 0; n < step; n++) { | |||
| int input_kw_offset = input_tile_offset + n * ic4 * C4NUM * TILE_NUM; | |||
| int weight_kw_offset = weight_oc_offset + n * ic4 * C4NUM * C4NUM; | |||
| for (int k = 0; k < ic4; k++) { | |||
| int input_ic4_offset = input_kw_offset + k * TILE_NUM * C4NUM; | |||
| int weight_ic4_offset = weight_kw_offset + k * C4NUM * C4NUM; | |||
| for (int m = 0; m < C4NUM; m++) { | |||
| int input_ic_offset = input_ic4_offset + m; | |||
| int weight_ic_offset = weight_ic4_offset + m * C4NUM; | |||
| acc += (weight + weight_ic_offset)[0] * (input + input_ic_offset)[0]; | |||
| } | |||
| } | |||
| } | |||
| acc += bias[j]; | |||
| if (relu) { | |||
| acc = acc > 0 ? acc : 0; | |||
| } else if (relu6) { | |||
| if (acc < 0) { | |||
| acc = 0; | |||
| } else if (acc > 6) { | |||
| acc = 6; | |||
| } else { | |||
| } | |||
| } | |||
| (output + out_oc_offset)[0] = acc; | |||
| } | |||
| } | |||
| } | |||
| #endif | |||
| int8_t MinInt8(int8_t a, int8_t b) { return b ^ ((a ^ b) & -(a < b)); } | |||
| int8_t MaxInt8(int8_t a, int8_t b) { return a ^ ((a ^ b) & -(a < b)); } | |||
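| /* Branchless select: -(a < b) is all-ones when a < b and zero otherwise, so | |||
|  * b ^ ((a ^ b) & mask) returns a (the smaller operand) when the mask is set and b otherwise; | |||
|  * MaxInt8 applies the same identity starting from a. Example: a = 3, b = 7 gives | |||
|  * mask = -1 and MinInt8 = 7 ^ (3 ^ 7) = 3. */ | |||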
| @@ -210,21 +79,3 @@ void Relu6Fp32(float *data, float *dst, int ele_num) { | |||
| data[j] = data[j] > 6 ? 6 : data[j]; | |||
| } | |||
| } | |||
| void IndirectGemmFp32_Comm(float *output, const float *input, const float *weight, size_t ic4, size_t hw, size_t oc, | |||
| size_t offset) { | |||
| for (int r = 0; r < hw; r++) { | |||
| for (int c = 0; c < oc; c++) { | |||
| float value = 0; | |||
| for (int deep = 0; deep < ic4; deep++) { | |||
| int d4mod = deep % 4; | |||
| int d4div = deep / 4; | |||
| int a_index = d4div * 4 * 8 + r * 4 + d4mod; | |||
| const int b_index = 8 * deep + c; | |||
| value += input[a_index] * weight[b_index]; | |||
| } | |||
| output[r * offset + c] = value; | |||
| } | |||
| } | |||
| return; | |||
| } | |||
| @@ -31,18 +31,6 @@ int8_t MinInt8(int8_t a, int8_t b); | |||
| int8_t MaxInt8(int8_t a, int8_t b); | |||
| void ReluFp32(float *data, float *dst, int ele_num); | |||
| void Relu6Fp32(float *data, float *dst, int ele_num); | |||
| void SimplePostFuncInt8(const int *in, int8_t *out, int oc, int plane, int plane8, int32_t multiplier, | |||
| int32_t left_shift, int32_t right_shift, int32_t zp); | |||
| void IndirectGemmFp32_8x8(float *output, const float *input, const float *weight, const float *bias, size_t step, | |||
| size_t ic4, size_t output_channel, size_t offset, size_t mode, size_t writeC4, size_t relu, | |||
| size_t relu6); | |||
| void IndirectGemmFp32_8x4(float *output, const float *input, const float *weight, const float *bias, size_t step, | |||
| size_t ic4, size_t output_channel, size_t offset, size_t mode, size_t writeC4, size_t relu, | |||
| size_t relu6); | |||
| void IndirectGemmFp32_Comm(float *output, const float *input, const float *weight, size_t ic4, size_t hw, size_t oc, | |||
| size_t offset); | |||
| void IndirectGemmFp32(float *output, const float *input, const float *weight, const float *bias, size_t step, int ic4, | |||
| int output_channel, size_t offset, size_t relu, size_t relu6); | |||
| int offset(const int *shape, const int dim0, const int dim1, const int dim2, const int dim3); | |||
| int offsetComm(const int *shape, const int dim0, const int dim1, const int dim2); | |||
| int offset4d(const int *shape, const int *dims); | |||
| @@ -470,14 +470,19 @@ void MatMul4x8(const float *a, const float *b, float *dst, const float *bias, Ac | |||
| void MatMulOpt(const float *a, const float *b, float *c, const float *bias, ActType act_type, int deep, int row, | |||
| int col, size_t stride, int out_type) { | |||
| #ifdef ENABLE_ARM64 | |||
| if (out_type == 2 && row <= 8) { | |||
| MatmulFloatNeon64OptRemain(a, b, c, deep, row, col, stride); | |||
| if (out_type == OutType_C8) { | |||
| MatmulFloatNeon64(a, b, c, bias, (int)act_type, deep, row, col, stride, 0, 0); | |||
| } else if (row <= 8) { | |||
| MatmulFloatNeon64OptRemain(a, b, c, bias, (int)act_type, deep, row, col, stride, (int)(out_type)); | |||
| } else { | |||
| MatmulFloatNeon64Opt(a, b, c, bias, (int)act_type, deep, row, col, stride, (int)(out_type == OutType_Nhwc), | |||
| (int)(out_type == OutType_TileC8)); | |||
| MatmulFloatNeon64Opt(a, b, c, bias, (int)act_type, deep, row, col, stride, (int)(out_type)); | |||
| } | |||
| #elif ENABLE_ARM32 | |||
| MatmulFloatNeon32Opt(a, b, c, bias, (int)act_type, deep, row, col, stride, (int)(out_type)); | |||
| if (out_type == OutType_C8) { | |||
| MatmulFloatNeon32(a, b, c, bias, (int)act_type, deep, row, col, stride, 0, 0); | |||
| } else { | |||
| MatmulFloatNeon32Opt(a, b, c, bias, (int)act_type, deep, row, col, stride, (int)(out_type)); | |||
| } | |||
| #else | |||
| MatMul12x8(a, b, c, bias, act_type, deep, row, col, stride, out_type); | |||
| #endif | |||
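| // Dispatch summary: OutType_C8 keeps the original 8-row kernels; on arm64 a row count of at | |||
| // most 8 falls to the Remain kernel and everything else uses the 12x8 Opt kernel, with | |||
| // out_type now forwarded directly as the kernels' write_mode. | |||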
| @@ -36,11 +36,14 @@ void RowMajor2Col8Major(float *src_ptr, float *dst_ptr, size_t row, size_t col); | |||
| void RowMajor2Col12Major(float *src_ptr, float *dst_ptr, size_t row, size_t col); | |||
| #ifdef ENABLE_ARM64 | |||
| void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row, | |||
| int col, size_t stride, bool write_nhwc); | |||
| int col, size_t stride, size_t writeNhwc, size_t WriteWino); | |||
| void MatmulFloatNeon64Opt(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row, | |||
| int col, size_t stride, size_t write_nhwc, size_t write_c4); | |||
| void MatmulFloatNeon64OptRemain(const float *a, const float *b, float *c, int depth, int row, int col, size_t stride); | |||
| int col, size_t stride, size_t write_mode); | |||
| void MatmulFloatNeon64OptRemain(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, | |||
| int row, int col, size_t stride, size_t write_mode); | |||
| #elif ENABLE_ARM32 | |||
| void MatmulFloatNeon32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row, | |||
| int col, int stride, size_t writeNhwc, size_t WriteWino); | |||
| void MatmulFloatNeon32Opt(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row, | |||
| int col, int stride, int write_mode); | |||
| #endif | |||
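| // Illustrative use of the updated entry point, as a sketch only: the enum names | |||
| // (ActType_Relu, OutType_Nhwc), the stride choice, and the packing step are assumptions | |||
| // inferred from the call sites above rather than verified code. | |||
| //   /* a packed to 12-row blocks via RowMajor2Col12Major, b to 8-column blocks via | |||
| //      RowMajor2Col8Major */ | |||
| //   MatMulOpt(a_pack, b_pack, c, bias, ActType_Relu, deep, row, col, col, OutType_Nhwc); | |||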