diff --git a/mindspore/lite/internal/CMakeLists.txt b/mindspore/lite/internal/CMakeLists.txt index cb8e6c076c..65fef46415 100644 --- a/mindspore/lite/internal/CMakeLists.txt +++ b/mindspore/lite/internal/CMakeLists.txt @@ -38,7 +38,6 @@ set(CCSRC if (PLATFORM_ARM64) # assembly file(GLOB ASSEMBLY_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/assembly/arm64/MatmulFp32OptRemain.S ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/assembly/arm64/MatmulFp32Opt.S ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/assembly/arm64/MatmulFp32.S) set_property(SOURCE ${ASSEMBLY_SRC} PROPERTY LANGUAGE C) diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S index 16d6bba647..0e07a10347 100644 --- a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S +++ b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S @@ -46,6 +46,12 @@ NoWinoSteps: mov x18, #4 mul x8, x8, x18 +LoopRowStart: + cmp x6, #4 + ble LoopRow4 + cmp x6, #8 + ble LoopRow8 // rows 5..8 take the 8-row kernel, matching the LoopCol8 dispatch at WriteEnd + LoopRow: mov x14, x1 // reload rhs ptr mov x13, x7 // reload rhs col @@ -309,6 +315,357 @@ LoopRow: fmax v26.4s, v26.4s, v3.4s fmax v28.4s, v28.4s, v3.4s fmax v30.4s, v30.4s, v3.4s + b Write + + +LoopRow8: + mov x14, x1 // reload rhs ptr + mov x13, x7 // reload rhs col + mov x12, x3 // reload bias + + LoopCol8: + cbz x9, NoReloadDst8 + mov x11, x2 + NoReloadDst8: + mov x10, x0 // reload lhs ptr + mov x19, x5 // reload depth + + cmp x13, #4 + ble LoopDepthStartHalf8 + + LoopDepthStart8: + ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48 + ld1 {v3.4s, v4.4s}, [x14], #32 + fmul v8.4s, v3.4s, v0.s[0] + fmul v10.4s, v3.4s, v0.s[1] + fmul v12.4s, v3.4s, v0.s[2] + fmul v14.4s, v3.4s, v0.s[3] + fmul v9.4s, v4.4s, v0.s[0] + fmul v11.4s, v4.4s, v0.s[1] + fmul v13.4s, v4.4s, v0.s[2] + fmul v15.4s, v4.4s, v0.s[3] + fmul v16.4s, v3.4s, v1.s[0] + fmul v18.4s, v3.4s, v1.s[1] + fmul v20.4s, v3.4s, v1.s[2] + fmul v22.4s, v3.4s, v1.s[3] + fmul v17.4s, v4.4s, v1.s[0] + fmul v19.4s, v4.4s, v1.s[1] + fmul v21.4s, v4.4s, v1.s[2] + fmul v23.4s, 
v4.4s, v1.s[3] + + subs x19, x19, #1 + beq Bias8 + + LoopDepth8: + ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48 + ld1 {v3.4s, v4.4s}, [x14], #32 + fmla v8.4s, v3.4s, v0.s[0] + fmla v10.4s, v3.4s, v0.s[1] + fmla v12.4s, v3.4s, v0.s[2] + fmla v14.4s, v3.4s, v0.s[3] + fmla v9.4s, v4.4s, v0.s[0] + fmla v11.4s, v4.4s, v0.s[1] + fmla v13.4s, v4.4s, v0.s[2] + fmla v15.4s, v4.4s, v0.s[3] + fmla v16.4s, v3.4s, v1.s[0] + fmla v18.4s, v3.4s, v1.s[1] + fmla v20.4s, v3.4s, v1.s[2] + fmla v22.4s, v3.4s, v1.s[3] + fmla v17.4s, v4.4s, v1.s[0] + fmla v19.4s, v4.4s, v1.s[1] + fmla v21.4s, v4.4s, v1.s[2] + fmla v23.4s, v4.4s, v1.s[3] + + subs x19, x19, #1 + bgt LoopDepth8 + + Bias8: + cbz x3, Activation8 + ld1 {v0.4s}, [x12], #16 + ld1 {v1.4s}, [x12], #16 + fadd v8.4s, v8.4s, v0.4s + fadd v9.4s, v9.4s, v1.4s + fadd v10.4s, v10.4s, v0.4s + fadd v11.4s, v11.4s, v1.4s + fadd v12.4s, v12.4s, v0.4s + fadd v13.4s, v13.4s, v1.4s + fadd v14.4s, v14.4s, v0.4s + fadd v15.4s, v15.4s, v1.4s + fadd v16.4s, v16.4s, v0.4s + fadd v17.4s, v17.4s, v1.4s + fadd v18.4s, v18.4s, v0.4s + fadd v19.4s, v19.4s, v1.4s + fadd v20.4s, v20.4s, v0.4s + fadd v21.4s, v21.4s, v1.4s + fadd v22.4s, v22.4s, v0.4s + fadd v23.4s, v23.4s, v1.4s + + Activation8: + cmp x4, #2 + beq Relu68 + cmp x4, #1 + beq Relu8 + b Write + + Relu68: + mov w19, #6 + dup v2.4s, w19 + scvtf v2.4s, v2.4s + fmin v8.4s, v8.4s, v2.4s + fmin v9.4s, v9.4s, v2.4s + fmin v10.4s, v10.4s, v2.4s + fmin v11.4s, v11.4s, v2.4s + fmin v12.4s, v12.4s, v2.4s + fmin v13.4s, v13.4s, v2.4s + fmin v14.4s, v14.4s, v2.4s + fmin v15.4s, v15.4s, v2.4s + fmin v16.4s, v16.4s, v2.4s + fmin v17.4s, v17.4s, v2.4s + fmin v18.4s, v18.4s, v2.4s + fmin v19.4s, v19.4s, v2.4s + fmin v20.4s, v20.4s, v2.4s + fmin v21.4s, v21.4s, v2.4s + fmin v22.4s, v22.4s, v2.4s + fmin v23.4s, v23.4s, v2.4s + + Relu8: + dup v3.4s, wzr + fmax v8.4s, v8.4s, v3.4s + fmax v9.4s, v9.4s, v3.4s + fmax v10.4s, v10.4s, v3.4s + fmax v11.4s, v11.4s, v3.4s + fmax v12.4s, v12.4s, v3.4s + fmax v13.4s, v13.4s, v3.4s 
+ fmax v14.4s, v14.4s, v3.4s + fmax v15.4s, v15.4s, v3.4s + fmax v16.4s, v16.4s, v3.4s + fmax v17.4s, v17.4s, v3.4s + fmax v18.4s, v18.4s, v3.4s + fmax v19.4s, v19.4s, v3.4s + fmax v20.4s, v20.4s, v3.4s + fmax v21.4s, v21.4s, v3.4s + fmax v22.4s, v22.4s, v3.4s + fmax v23.4s, v23.4s, v3.4s + b Write + + LoopDepthStartHalf8: + ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48 + ld1 {v3.4s, v4.4s}, [x14], #32 + fmul v8.4s, v3.4s, v0.s[0] + fmul v10.4s, v3.4s, v0.s[1] + fmul v12.4s, v3.4s, v0.s[2] + fmul v14.4s, v3.4s, v0.s[3] + fmul v16.4s, v3.4s, v1.s[0] + fmul v18.4s, v3.4s, v1.s[1] + fmul v20.4s, v3.4s, v1.s[2] + fmul v22.4s, v3.4s, v1.s[3] + + subs x19, x19, #1 + beq BiasHalf8 + + LoopDepthHalf8: + ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48 + ld1 {v3.4s, v4.4s}, [x14], #32 + fmla v8.4s, v3.4s, v0.s[0] + fmla v10.4s, v3.4s, v0.s[1] + fmla v12.4s, v3.4s, v0.s[2] + fmla v14.4s, v3.4s, v0.s[3] + fmla v16.4s, v3.4s, v1.s[0] + fmla v18.4s, v3.4s, v1.s[1] + fmla v20.4s, v3.4s, v1.s[2] + fmla v22.4s, v3.4s, v1.s[3] + + subs x19, x19, #1 + bgt LoopDepthHalf8 + + BiasHalf8: + cbz x3, ActivationHalf8 + ld1 {v0.4s}, [x12], #16 + ld1 {v1.4s}, [x12], #16 + fadd v8.4s, v8.4s, v0.4s + fadd v10.4s, v10.4s, v0.4s + fadd v12.4s, v12.4s, v0.4s + fadd v14.4s, v14.4s, v0.4s + fadd v16.4s, v16.4s, v0.4s + fadd v18.4s, v18.4s, v0.4s + fadd v20.4s, v20.4s, v0.4s + fadd v22.4s, v22.4s, v0.4s + + ActivationHalf8: + cmp x4, #2 + beq Relu6Half8 + cmp x4, #1 + beq ReluHalf8 + b Write + + Relu6Half8: + mov w19, #6 + dup v2.4s, w19 + scvtf v2.4s, v2.4s + fmin v8.4s, v8.4s, v2.4s + fmin v10.4s, v10.4s, v2.4s + fmin v12.4s, v12.4s, v2.4s + fmin v14.4s, v14.4s, v2.4s + fmin v16.4s, v16.4s, v2.4s + fmin v18.4s, v18.4s, v2.4s + fmin v20.4s, v20.4s, v2.4s + fmin v22.4s, v22.4s, v2.4s + + ReluHalf8: + dup v3.4s, wzr + fmax v8.4s, v8.4s, v3.4s + fmax v10.4s, v10.4s, v3.4s + fmax v12.4s, v12.4s, v3.4s + fmax v14.4s, v14.4s, v3.4s + fmax v16.4s, v16.4s, v3.4s + fmax v18.4s, v18.4s, v3.4s + fmax v20.4s, v20.4s, v3.4s + 
fmax v22.4s, v22.4s, v3.4s + b Write + +LoopRow4: + mov x14, x1 // reload rhs ptr + mov x13, x7 // reload rhs col + mov x12, x3 // reload bias + + LoopCol4: + cbz x9, NoReloadDst4 + mov x11, x2 + NoReloadDst4: + mov x10, x0 // reload lhs ptr + mov x19, x5 // reload depth + + cmp x13, #4 + ble LoopDepthStartHalf4 + + LoopDepthStart4: + ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48 + ld1 {v3.4s, v4.4s}, [x14], #32 + fmul v8.4s, v3.4s, v0.s[0] + fmul v10.4s, v3.4s, v0.s[1] + fmul v12.4s, v3.4s, v0.s[2] + fmul v14.4s, v3.4s, v0.s[3] + fmul v9.4s, v4.4s, v0.s[0] + fmul v11.4s, v4.4s, v0.s[1] + fmul v13.4s, v4.4s, v0.s[2] + fmul v15.4s, v4.4s, v0.s[3] + + subs x19, x19, #1 + beq Bias4 + + LoopDepth4: + ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48 + ld1 {v3.4s, v4.4s}, [x14], #32 + fmla v8.4s, v3.4s, v0.s[0] + fmla v10.4s, v3.4s, v0.s[1] + fmla v12.4s, v3.4s, v0.s[2] + fmla v14.4s, v3.4s, v0.s[3] + fmla v9.4s, v4.4s, v0.s[0] + fmla v11.4s, v4.4s, v0.s[1] + fmla v13.4s, v4.4s, v0.s[2] + fmla v15.4s, v4.4s, v0.s[3] + + subs x19, x19, #1 + bgt LoopDepth4 + + Bias4: + cbz x3, Activation4 + ld1 {v0.4s}, [x12], #16 + ld1 {v1.4s}, [x12], #16 + fadd v8.4s, v8.4s, v0.4s + fadd v9.4s, v9.4s, v1.4s + fadd v10.4s, v10.4s, v0.4s + fadd v11.4s, v11.4s, v1.4s + fadd v12.4s, v12.4s, v0.4s + fadd v13.4s, v13.4s, v1.4s + fadd v14.4s, v14.4s, v0.4s + fadd v15.4s, v15.4s, v1.4s + + Activation4: + cmp x4, #2 + beq Relu64 + cmp x4, #1 + beq Relu4 + b Write + + Relu64: + mov w19, #6 + dup v2.4s, w19 + scvtf v2.4s, v2.4s + fmin v8.4s, v8.4s, v2.4s + fmin v9.4s, v9.4s, v2.4s + fmin v10.4s, v10.4s, v2.4s + fmin v11.4s, v11.4s, v2.4s + fmin v12.4s, v12.4s, v2.4s + fmin v13.4s, v13.4s, v2.4s + fmin v14.4s, v14.4s, v2.4s + fmin v15.4s, v15.4s, v2.4s + + Relu4: + dup v3.4s, wzr + fmax v8.4s, v8.4s, v3.4s + fmax v9.4s, v9.4s, v3.4s + fmax v10.4s, v10.4s, v3.4s + fmax v11.4s, v11.4s, v3.4s + fmax v12.4s, v12.4s, v3.4s + fmax v13.4s, v13.4s, v3.4s + fmax v14.4s, v14.4s, v3.4s + fmax v15.4s, v15.4s, v3.4s + b Write + + 
LoopDepthStartHalf4: + ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48 + ld1 {v3.4s, v4.4s}, [x14], #32 + fmul v8.4s, v3.4s, v0.s[0] + fmul v10.4s, v3.4s, v0.s[1] + fmul v12.4s, v3.4s, v0.s[2] + fmul v14.4s, v3.4s, v0.s[3] + + subs x19, x19, #1 + beq BiasHalf4 + + LoopDepthHalf4: + ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48 + ld1 {v3.4s, v4.4s}, [x14], #32 + fmla v8.4s, v3.4s, v0.s[0] + fmla v10.4s, v3.4s, v0.s[1] + fmla v12.4s, v3.4s, v0.s[2] + fmla v14.4s, v3.4s, v0.s[3] + + subs x19, x19, #1 + bgt LoopDepthHalf4 + + BiasHalf4: + cbz x3, ActivationHalf4 + ld1 {v0.4s}, [x12], #16 + ld1 {v1.4s}, [x12], #16 + fadd v8.4s, v8.4s, v0.4s + fadd v10.4s, v10.4s, v0.4s + fadd v12.4s, v12.4s, v0.4s + fadd v14.4s, v14.4s, v0.4s + + ActivationHalf4: + cmp x4, #2 + beq Relu6Half4 + cmp x4, #1 + beq ReluHalf4 + b Write + + Relu6Half4: + mov w19, #6 + dup v2.4s, w19 + scvtf v2.4s, v2.4s + fmin v8.4s, v8.4s, v2.4s + fmin v10.4s, v10.4s, v2.4s + fmin v12.4s, v12.4s, v2.4s + fmin v14.4s, v14.4s, v2.4s + + ReluHalf4: + dup v3.4s, wzr + fmax v8.4s, v8.4s, v3.4s + fmax v10.4s, v10.4s, v3.4s + fmax v12.4s, v12.4s, v3.4s + fmax v14.4s, v14.4s, v3.4s Write: cmp x9, #2 @@ -796,8 +1153,14 @@ LoopRow: WriteEnd: subs x13, x13, #8 // rhs col - 8 - bgt LoopCol + ble LoopColEnd + cmp x6, #4 + ble LoopCol4 + cmp x6, #8 + ble LoopCol8 + b LoopCol +LoopColEnd: add x0, x0, x17 cbz x9, C8DstStep mov x18, #4 @@ -810,7 +1173,7 @@ LoopRow: mov x11, x2 NoDstStep: subs x6, x6, #12 - bgt LoopRow + bgt LoopRowStart sub sp, sp, #144 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32OptRemain.S b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32OptRemain.S deleted file mode 100644 index ccc6ce534f..0000000000 --- a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32OptRemain.S +++ /dev/null @@ -1,766 +0,0 @@ -#ifdef __aarch64__ - .text - .align 5 - .global MatmulFloatNeon64OptRemain -#ifndef __APPLE__ - .type MatmulFloatNeon64OptRemain, %function -#endif - -// void 
MatmulFloatNeon64Remain(const float *a, const float *b, float *c, const float *bias, int act_type, int depth -// int row, int col, size_t stride, size_t writeMode) -// x0: a -// x1: b -// x2: c -// x3: bias -// x4: act_type -// x5: depth -// x6: row -// x7: col -// x8: stride -// x9: writeMode - -MatmulFloatNeon64OptRemain: - sub sp, sp, #144 - st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 - st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 - stp x19, x20, [sp], #16 - - ldr x8, [sp] - ldr x9, [sp, #8] - - mov x18, #48 // sizeof(float) * 12 - mul x17, x5, x18 // block stride of lhs/rhs: sizeof(float) * 12 * depth - cbnz x9, NoC8Steps - mov x11, x2 - mov x18, #32 - mul x16, x6, x18 // row * 8 * sizeof(float) -NoC8Steps: - cmp x9, #2 - bne NoWinoSteps - mov x18, #4 - mul x15, x7, x8 - mul x15, x15, x18 // kernel_size * col *sizeof(float) - mov x18, #32 - mul x16, x8, x18 // kernel_size * 8 * sizeof(float) -NoWinoSteps: - mov x18, #4 - mul x8, x8, x18 - -LoopRow: - cmp x6, #4 - ble LoopRow4 - -LoopRow8: - mov x14, x1 // reload rhs ptr - mov x13, x7 // reload rhs col - mov x12, x3 // reload bias - - LoopCol8: - cbz x9, NoReloadDst8 - mov x11, x2 - NoReloadDst8: - mov x10, x0 // reload lhs ptr - mov x19, x5 // reload depth - - cmp x13, #4 - ble LoopDepthStartHalf8 - - LoopDepthStart8: - ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48 - ld1 {v3.4s, v4.4s}, [x14], #32 - fmul v8.4s, v3.4s, v0.s[0] - fmul v10.4s, v3.4s, v0.s[1] - fmul v12.4s, v3.4s, v0.s[2] - fmul v14.4s, v3.4s, v0.s[3] - fmul v9.4s, v4.4s, v0.s[0] - fmul v11.4s, v4.4s, v0.s[1] - fmul v13.4s, v4.4s, v0.s[2] - fmul v15.4s, v4.4s, v0.s[3] - fmul v16.4s, v3.4s, v1.s[0] - fmul v18.4s, v3.4s, v1.s[1] - fmul v20.4s, v3.4s, v1.s[2] - fmul v22.4s, v3.4s, v1.s[3] - fmul v17.4s, v4.4s, v1.s[0] - fmul v19.4s, v4.4s, v1.s[1] - fmul v21.4s, v4.4s, v1.s[2] - fmul v23.4s, v4.4s, v1.s[3] - - subs x19, x19, #1 - beq Bias8 - - LoopDepth8: - ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48 - ld1 {v3.4s, v4.4s}, [x14], #32 - fmla v8.4s, v3.4s, 
v0.s[0] - fmla v10.4s, v3.4s, v0.s[1] - fmla v12.4s, v3.4s, v0.s[2] - fmla v14.4s, v3.4s, v0.s[3] - fmla v9.4s, v4.4s, v0.s[0] - fmla v11.4s, v4.4s, v0.s[1] - fmla v13.4s, v4.4s, v0.s[2] - fmla v15.4s, v4.4s, v0.s[3] - fmla v16.4s, v3.4s, v1.s[0] - fmla v18.4s, v3.4s, v1.s[1] - fmla v20.4s, v3.4s, v1.s[2] - fmla v22.4s, v3.4s, v1.s[3] - fmla v17.4s, v4.4s, v1.s[0] - fmla v19.4s, v4.4s, v1.s[1] - fmla v21.4s, v4.4s, v1.s[2] - fmla v23.4s, v4.4s, v1.s[3] - - subs x19, x19, #1 - bgt LoopDepth8 - - Bias8: - cbz x3, Activation8 - ld1 {v0.4s}, [x12], #16 - ld1 {v1.4s}, [x12], #16 - fadd v8.4s, v8.4s, v0.4s - fadd v9.4s, v9.4s, v1.4s - fadd v10.4s, v10.4s, v0.4s - fadd v11.4s, v11.4s, v1.4s - fadd v12.4s, v12.4s, v0.4s - fadd v13.4s, v13.4s, v1.4s - fadd v14.4s, v14.4s, v0.4s - fadd v15.4s, v15.4s, v1.4s - fadd v16.4s, v16.4s, v0.4s - fadd v17.4s, v17.4s, v1.4s - fadd v18.4s, v18.4s, v0.4s - fadd v19.4s, v19.4s, v1.4s - fadd v20.4s, v20.4s, v0.4s - fadd v21.4s, v21.4s, v1.4s - fadd v22.4s, v22.4s, v0.4s - fadd v23.4s, v23.4s, v1.4s - - Activation8: - cmp x4, #2 - beq Relu68 - cmp x4, #1 - beq Relu8 - b Write - - Relu68: - mov w19, #6 - dup v2.4s, w19 - scvtf v2.4s, v2.4s - fmin v8.4s, v8.4s, v2.4s - fmin v9.4s, v9.4s, v2.4s - fmin v10.4s, v10.4s, v2.4s - fmin v11.4s, v11.4s, v2.4s - fmin v12.4s, v12.4s, v2.4s - fmin v13.4s, v13.4s, v2.4s - fmin v14.4s, v14.4s, v2.4s - fmin v15.4s, v15.4s, v2.4s - fmin v16.4s, v16.4s, v2.4s - fmin v17.4s, v17.4s, v2.4s - fmin v18.4s, v18.4s, v2.4s - fmin v19.4s, v19.4s, v2.4s - fmin v20.4s, v20.4s, v2.4s - fmin v21.4s, v21.4s, v2.4s - fmin v22.4s, v22.4s, v2.4s - fmin v23.4s, v23.4s, v2.4s - - Relu8: - dup v3.4s, wzr - fmax v8.4s, v8.4s, v3.4s - fmax v9.4s, v9.4s, v3.4s - fmax v10.4s, v10.4s, v3.4s - fmax v11.4s, v11.4s, v3.4s - fmax v12.4s, v12.4s, v3.4s - fmax v13.4s, v13.4s, v3.4s - fmax v14.4s, v14.4s, v3.4s - fmax v15.4s, v15.4s, v3.4s - fmax v16.4s, v16.4s, v3.4s - fmax v17.4s, v17.4s, v3.4s - fmax v18.4s, v18.4s, v3.4s - fmax 
v19.4s, v19.4s, v3.4s - fmax v20.4s, v20.4s, v3.4s - fmax v21.4s, v21.4s, v3.4s - fmax v22.4s, v22.4s, v3.4s - fmax v23.4s, v23.4s, v3.4s - b Write - - LoopDepthStartHalf8: - ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48 - ld1 {v3.4s, v4.4s}, [x14], #32 - fmul v8.4s, v3.4s, v0.s[0] - fmul v10.4s, v3.4s, v0.s[1] - fmul v12.4s, v3.4s, v0.s[2] - fmul v14.4s, v3.4s, v0.s[3] - fmul v16.4s, v3.4s, v1.s[0] - fmul v18.4s, v3.4s, v1.s[1] - fmul v20.4s, v3.4s, v1.s[2] - fmul v22.4s, v3.4s, v1.s[3] - - subs x19, x19, #1 - beq BiasHalf8 - - LoopDepthHalf8: - ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48 - ld1 {v3.4s, v4.4s}, [x14], #32 - fmla v8.4s, v3.4s, v0.s[0] - fmla v10.4s, v3.4s, v0.s[1] - fmla v12.4s, v3.4s, v0.s[2] - fmla v14.4s, v3.4s, v0.s[3] - fmla v16.4s, v3.4s, v1.s[0] - fmla v18.4s, v3.4s, v1.s[1] - fmla v20.4s, v3.4s, v1.s[2] - fmla v22.4s, v3.4s, v1.s[3] - - subs x19, x19, #1 - bgt LoopDepthHalf8 - - BiasHalf8: - cbz x3, ActivationHalf8 - ld1 {v0.4s}, [x12], #16 - ld1 {v1.4s}, [x12], #16 - fadd v8.4s, v8.4s, v0.4s - fadd v10.4s, v10.4s, v0.4s - fadd v12.4s, v12.4s, v0.4s - fadd v14.4s, v14.4s, v0.4s - fadd v16.4s, v16.4s, v0.4s - fadd v18.4s, v18.4s, v0.4s - fadd v20.4s, v20.4s, v0.4s - fadd v22.4s, v22.4s, v0.4s - - ActivationHalf8: - cmp x4, #2 - beq Relu6Half8 - cmp x4, #1 - beq ReluHalf8 - b Write - - Relu6Half8: - mov w19, #6 - dup v2.4s, w19 - scvtf v2.4s, v2.4s - fmin v8.4s, v8.4s, v2.4s - fmin v10.4s, v10.4s, v2.4s - fmin v12.4s, v12.4s, v2.4s - fmin v14.4s, v14.4s, v2.4s - fmin v16.4s, v16.4s, v2.4s - fmin v18.4s, v18.4s, v2.4s - fmin v20.4s, v20.4s, v2.4s - fmin v22.4s, v22.4s, v2.4s - - ReluHalf8: - dup v3.4s, wzr - fmax v8.4s, v8.4s, v3.4s - fmax v10.4s, v10.4s, v3.4s - fmax v12.4s, v12.4s, v3.4s - fmax v14.4s, v14.4s, v3.4s - fmax v16.4s, v16.4s, v3.4s - fmax v18.4s, v18.4s, v3.4s - fmax v20.4s, v20.4s, v3.4s - fmax v22.4s, v22.4s, v3.4s - b Write - -LoopRow4: - mov x14, x1 // reload rhs ptr - mov x13, x7 // reload rhs col - mov x12, x3 // reload bias - - 
LoopCol4: - cbz x9, NoReloadDst4 - mov x11, x2 - NoReloadDst4: - mov x10, x0 // reload lhs ptr - mov x19, x5 // reload depth - - cmp x13, #4 - ble LoopDepthStartHalf4 - - LoopDepthStart4: - ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48 - ld1 {v3.4s, v4.4s}, [x14], #32 - fmul v8.4s, v3.4s, v0.s[0] - fmul v10.4s, v3.4s, v0.s[1] - fmul v12.4s, v3.4s, v0.s[2] - fmul v14.4s, v3.4s, v0.s[3] - fmul v9.4s, v4.4s, v0.s[0] - fmul v11.4s, v4.4s, v0.s[1] - fmul v13.4s, v4.4s, v0.s[2] - fmul v15.4s, v4.4s, v0.s[3] - - subs x19, x19, #1 - beq Bias4 - - LoopDepth4: - ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48 - ld1 {v3.4s, v4.4s}, [x14], #32 - fmla v8.4s, v3.4s, v0.s[0] - fmla v10.4s, v3.4s, v0.s[1] - fmla v12.4s, v3.4s, v0.s[2] - fmla v14.4s, v3.4s, v0.s[3] - fmla v9.4s, v4.4s, v0.s[0] - fmla v11.4s, v4.4s, v0.s[1] - fmla v13.4s, v4.4s, v0.s[2] - fmla v15.4s, v4.4s, v0.s[3] - - subs x19, x19, #1 - bgt LoopDepth4 - - Bias4: - cbz x3, Activation4 - ld1 {v0.4s}, [x12], #16 - ld1 {v1.4s}, [x12], #16 - fadd v8.4s, v8.4s, v0.4s - fadd v9.4s, v9.4s, v1.4s - fadd v10.4s, v10.4s, v0.4s - fadd v11.4s, v11.4s, v1.4s - fadd v12.4s, v12.4s, v0.4s - fadd v13.4s, v13.4s, v1.4s - fadd v14.4s, v14.4s, v0.4s - fadd v15.4s, v15.4s, v1.4s - - Activation4: - cmp x4, #2 - beq Relu64 - cmp x4, #1 - beq Relu4 - b Write - - Relu64: - mov w19, #6 - dup v2.4s, w19 - scvtf v2.4s, v2.4s - fmin v8.4s, v8.4s, v2.4s - fmin v9.4s, v9.4s, v2.4s - fmin v10.4s, v10.4s, v2.4s - fmin v11.4s, v11.4s, v2.4s - fmin v12.4s, v12.4s, v2.4s - fmin v13.4s, v13.4s, v2.4s - fmin v14.4s, v14.4s, v2.4s - fmin v15.4s, v15.4s, v2.4s - - Relu4: - dup v3.4s, wzr - fmax v8.4s, v8.4s, v3.4s - fmax v9.4s, v9.4s, v3.4s - fmax v10.4s, v10.4s, v3.4s - fmax v11.4s, v11.4s, v3.4s - fmax v12.4s, v12.4s, v3.4s - fmax v13.4s, v13.4s, v3.4s - fmax v14.4s, v14.4s, v3.4s - fmax v15.4s, v15.4s, v3.4s - b Write - - LoopDepthStartHalf4: - ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48 - ld1 {v3.4s, v4.4s}, [x14], #32 - fmul v8.4s, v3.4s, v0.s[0] - fmul v10.4s, v3.4s, 
v0.s[1] - fmul v12.4s, v3.4s, v0.s[2] - fmul v14.4s, v3.4s, v0.s[3] - - subs x19, x19, #1 - beq BiasHalf4 - - LoopDepthHalf4: - ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48 - ld1 {v3.4s, v4.4s}, [x14], #32 - fmla v8.4s, v3.4s, v0.s[0] - fmla v10.4s, v3.4s, v0.s[1] - fmla v12.4s, v3.4s, v0.s[2] - fmla v14.4s, v3.4s, v0.s[3] - - subs x19, x19, #1 - bgt LoopDepthHalf4 - - BiasHalf4: - cbz x3, ActivationHalf4 - ld1 {v0.4s}, [x12], #16 - ld1 {v1.4s}, [x12], #16 - fadd v8.4s, v8.4s, v0.4s - fadd v10.4s, v10.4s, v0.4s - fadd v12.4s, v12.4s, v0.4s - fadd v14.4s, v14.4s, v0.4s - - ActivationHalf4: - cmp x4, #2 - beq Relu6Half4 - cmp x4, #1 - beq ReluHalf4 - b Write - - Relu6Half4: - mov w19, #6 - dup v2.4s, w19 - scvtf v2.4s, v2.4s - fmin v8.4s, v8.4s, v2.4s - fmin v10.4s, v10.4s, v2.4s - fmin v12.4s, v12.4s, v2.4s - fmin v14.4s, v14.4s, v2.4s - - ReluHalf4: - dup v3.4s, wzr - fmax v8.4s, v8.4s, v3.4s - fmax v10.4s, v10.4s, v3.4s - fmax v12.4s, v12.4s, v3.4s - fmax v14.4s, v14.4s, v3.4s - - Write: - cmp x9, #2 - beq WriteWino - cbz x9, WriteC8 - cmp x13, #1 - beq Write1 - cmp x13, #2 - beq Write2 - cmp x13, #3 - beq Write3 - cmp x13, #4 - beq Write4 - cmp x13, #5 - beq Write5 - cmp x13, #6 - beq Write6 - cmp x13, #7 - beq Write7 - b Write8 - - Write1: - add x2, x2, #4 - str s8, [x11] - cmp x6, #1 - beq WriteEnd - add x11, x11, x8 - str s10, [x11] - cmp x6, #2 - beq WriteEnd - add x11, x11, x8 - str s12, [x11] - cmp x6, #3 - beq WriteEnd - add x11, x11, x8 - str s14, [x11] - cmp x6, #4 - beq WriteEnd - add x11, x11, x8 - str s16, [x11] - cmp x6, #5 - beq WriteEnd - add x11, x11, x8 - str s18, [x11] - cmp x6, #6 - beq WriteEnd - add x11, x11, x8 - str s20, [x11] - cmp x6, #7 - beq WriteEnd - add x11, x11, x8 - str s22, [x11] - add x11, x11, x8 - add x11, x11, #4 - b WriteEnd - Write2: - add x2, x2, #8 - str d8, [x11] - cmp x6, #1 - beq WriteEnd - add x11, x11, x8 - str d10, [x11] - cmp x6, #2 - beq WriteEnd - add x11, x11, x8 - str d12, [x11] - cmp x6, #3 - beq WriteEnd - add x11, 
x11, x8 - str d14, [x11] - cmp x6, #4 - beq WriteEnd - add x11, x11, x8 - str d16, [x11] - cmp x6, #5 - beq WriteEnd - add x11, x11, x8 - str d18, [x11] - cmp x6, #6 - beq WriteEnd - add x11, x11, x8 - str d20, [x11] - cmp x6, #7 - beq WriteEnd - add x11, x11, x8 - str d22, [x11] - add x11, x11, x8 - add x11, x11, #8 - b WriteEnd - Write3: - add x2, x2, #12 - add x19, x11, #8 - str d8, [x11] - st1 {v8.s}[2], [x19], x8 - cmp x6, #1 - beq WriteEnd - add x11, x11, x8 - str d10, [x11] - st1 {v10.s}[2], [x19], x8 - cmp x6, #2 - beq WriteEnd - add x11, x11, x8 - str d12, [x11] - st1 {v12.s}[2], [x19], x8 - cmp x6, #3 - beq WriteEnd - add x11, x11, x8 - str d14, [x11] - st1 {v14.s}[2], [x19], x8 - cmp x6, #4 - beq WriteEnd - add x11, x11, x8 - str d16, [x11] - st1 {v16.s}[2], [x19], x8 - cmp x6, #5 - beq WriteEnd - add x11, x11, x8 - str d18, [x11] - st1 {v18.s}[2], [x19], x8 - cmp x6, #6 - beq WriteEnd - add x11, x11, x8 - str d20, [x11] - st1 {v20.s}[2], [x19], x8 - cmp x6, #7 - beq WriteEnd - add x11, x11, x8 - str d22, [x11] - st1 {v22.s}[2], [x19], x8 - add x11, x11, x8 - add x11, x11, #12 - b WriteEnd - Write4: - add x2, x2, #16 - st1 {v8.4s}, [x11], x8 - cmp x6, #1 - beq WriteEnd - st1 {v10.4s}, [x11], x8 - cmp x6, #2 - beq WriteEnd - st1 {v12.4s}, [x11], x8 - cmp x6, #3 - beq WriteEnd - st1 {v14.4s}, [x11], x8 - cmp x6, #4 - beq WriteEnd - st1 {v16.4s}, [x11], x8 - cmp x6, #5 - beq WriteEnd - st1 {v18.4s}, [x11], x8 - cmp x6, #6 - beq WriteEnd - st1 {v20.4s}, [x11], x8 - cmp x6, #7 - beq WriteEnd - st1 {v22.4s}, [x11], x8 - add x11, x11, #16 - b WriteEnd - Write5: - add x2, x2, #20 - add x19, x11, #16 - st1 {v8.4s}, [x11], x8 - str s9, [x19] - cmp x6, #1 - beq WriteEnd - add x19, x19, x8 - st1 {v10.4s}, [x11], x8 - str s11, [x19] - cmp x6, #2 - beq WriteEnd - add x19, x19, x8 - st1 {v12.4s}, [x11], x8 - str s13, [x19] - cmp x6, #3 - beq WriteEnd - add x19, x19, x8 - st1 {v14.4s}, [x11], x8 - str s15, [x19] - cmp x6, #4 - beq WriteEnd - add x19, x19, x8 - st1 
{v16.4s}, [x11], x8 - str s17, [x19] - cmp x6, #5 - beq WriteEnd - add x19, x19, x8 - st1 {v18.4s}, [x11], x8 - str s19, [x19] - cmp x6, #6 - beq WriteEnd - add x19, x19, x8 - st1 {v20.4s}, [x11], x8 - str s21, [x19] - cmp x6, #7 - beq WriteEnd - add x19, x19, x8 - st1 {v22.4s}, [x11], x8 - str s23, [x19] - add x11, x11, #20 - b WriteEnd - Write6: - add x2, x2, #24 - add x19, x11, #16 - st1 {v8.4s}, [x11], x8 - str d9, [x19] - cmp x6, #1 - beq WriteEnd - add x19, x19, x8 - st1 {v10.4s}, [x11], x8 - str d11, [x19] - cmp x6, #2 - beq WriteEnd - add x19, x19, x8 - st1 {v12.4s}, [x11], x8 - str d13, [x19] - cmp x6, #3 - beq WriteEnd - add x19, x19, x8 - st1 {v14.4s}, [x11], x8 - str d15, [x19] - cmp x6, #4 - beq WriteEnd - add x19, x19, x8 - st1 {v16.4s}, [x11], x8 - str d17, [x19] - cmp x6, #5 - beq WriteEnd - add x19, x19, x8 - st1 {v18.4s}, [x11], x8 - str d19, [x19] - cmp x6, #6 - beq WriteEnd - add x19, x19, x8 - st1 {v20.4s}, [x11], x8 - str d21, [x19] - cmp x6, #7 - beq WriteEnd - add x19, x19, x8 - st1 {v22.4s}, [x11], x8 - str d23, [x19] - add x11, x11, #24 - b WriteEnd - Write7: - add x2, x2, #28 - add x19, x11, #16 - add x20, x11, #24 - st1 {v8.4s}, [x11], x8 - str d9, [x19] - st1 {v9.s}[2], [x20], x8 - cmp x6, #1 - beq WriteEnd - add x19, x19, x8 - st1 {v10.4s}, [x11], x8 - str d11, [x19] - st1 {v11.s}[2], [x20], x8 - cmp x6, #2 - beq WriteEnd - add x19, x19, x8 - st1 {v12.4s}, [x11], x8 - str d13, [x19] - st1 {v13.s}[2], [x20], x8 - cmp x6, #3 - beq WriteEnd - add x19, x19, x8 - st1 {v14.4s}, [x11], x8 - str d15, [x19] - st1 {v15.s}[2], [x20], x8 - cmp x6, #4 - beq WriteEnd - add x19, x19, x8 - st1 {v16.4s}, [x11], x8 - str d17, [x19] - st1 {v17.s}[2], [x20], x8 - cmp x6, #5 - beq WriteEnd - add x19, x19, x8 - st1 {v18.4s}, [x11], x8 - str d19, [x19] - st1 {v19.s}[2], [x20], x8 - cmp x6, #6 - beq WriteEnd - add x19, x19, x8 - st1 {v20.4s}, [x11], x8 - str d21, [x19] - st1 {v21.s}[2], [x20], x8 - cmp x6, #7 - beq WriteEnd - add x19, x19, x8 - st1 {v22.4s}, 
[x11], x8 - str d23, [x19] - st1 {v23.s}[2], [x20], x8 - add x11, x11, #28 - b WriteEnd - WriteC8: - mov x19, x11 - st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x19], #64 - st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x19], #64 - st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x19], #64 - st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x19], #64 - add x11, x11, x16 - b WriteEnd - WriteWino: - add x2, x11, x16 - st1 {v8.4s, v9.4s}, [x11], x15 - st1 {v10.4s, v11.4s}, [x11], x15 - st1 {v12.4s, v13.4s}, [x11], x15 - st1 {v14.4s, v15.4s}, [x11], x15 - st1 {v16.4s, v17.4s}, [x11], x15 - st1 {v18.4s, v19.4s}, [x11], x15 - st1 {v20.4s, v21.4s}, [x11], x15 - st1 {v22.4s, v23.4s}, [x11], x15 - b WriteEnd - Write8: - add x2, x2, #32 - st1 {v8.4s, v9.4s}, [x11], x8 - cmp x6, #1 - beq WriteEnd - st1 {v10.4s, v11.4s}, [x11], x8 - cmp x6, #2 - beq WriteEnd - st1 {v12.4s, v13.4s}, [x11], x8 - cmp x6, #3 - beq WriteEnd - st1 {v14.4s, v15.4s}, [x11], x8 - cmp x6, #4 - beq WriteEnd - st1 {v16.4s, v17.4s}, [x11], x8 - cmp x6, #5 - beq WriteEnd - st1 {v18.4s, v19.4s}, [x11], x8 - cmp x6, #6 - beq WriteEnd - st1 {v20.4s, v21.4s}, [x11], x8 - cmp x6, #7 - beq WriteEnd - st1 {v22.4s, v23.4s}, [x11], x8 - add x11, x11, #32 - - WriteEnd: - subs x13, x13, #8 // rhs col - 8 - ble LoopColEnd - cmp x6, #4 - ble LoopCol4 - b LoopCol8 - -LoopColEnd: - add x0, x0, x17 - cbz x9, C8DstStep - mov x18, #4 - mul x18, x18, x7 - sub x11, x11, x18 - mov x2, x11 - b NoDstStep - C8DstStep: - add x2, x2, #384 - mov x11, x2 - NoDstStep: - subs x6, x6, #12 - bgt LoopRow - - sub sp, sp, #144 - ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 - ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 - ldp x19, x20, [sp], #16 - ret -#endif diff --git a/mindspore/lite/nnacl/fp32/matmul.c b/mindspore/lite/nnacl/fp32/matmul.c index 934480fb28..2c965d6932 100644 --- a/mindspore/lite/nnacl/fp32/matmul.c +++ b/mindspore/lite/nnacl/fp32/matmul.c @@ -472,8 +472,6 @@ void MatMulOpt(const float *a, const float *b, float *c, const float *bias, ActT #ifdef 
ENABLE_ARM64 if (out_type == OutType_C8) { MatmulFloatNeon64(a, b, c, bias, (int)act_type, deep, row, col, stride, 0, 0); - } else if (row <= 8) { - MatmulFloatNeon64OptRemain(a, b, c, bias, (int)act_type, deep, row, col, stride, (int)(out_type)); } else { MatmulFloatNeon64Opt(a, b, c, bias, (int)act_type, deep, row, col, stride, (int)(out_type)); } diff --git a/mindspore/lite/nnacl/fp32/matmul.h b/mindspore/lite/nnacl/fp32/matmul.h index da3d7a7ac2..a258cb5592 100644 --- a/mindspore/lite/nnacl/fp32/matmul.h +++ b/mindspore/lite/nnacl/fp32/matmul.h @@ -39,8 +39,6 @@ void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bi int col, size_t stride, size_t writeNhwc, size_t WriteWino); void MatmulFloatNeon64Opt(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row, int col, size_t stride, size_t write_mode); -void MatmulFloatNeon64OptRemain(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, - int row, int col, size_t stride, size_t write_mode); #elif ENABLE_ARM32 void MatmulFloatNeon32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row, int col, int stride, size_t writeNhwc, size_t WriteWino);