| @@ -28,11 +28,11 @@ asm_function AdderFloatNeon64 | |||||
| ldr x8, [sp] | ldr x8, [sp] | ||||
| mov x18, #48 // sizeof(float) * 12 | |||||
| mul x17, x5, x18 // block stride of lhs/rhs: sizeof(float) * 12 * depth | |||||
| mov x20, #48 // sizeof(float) * 12 | |||||
| mul x17, x5, x20 // block stride of lhs/rhs: sizeof(float) * 12 * depth | |||||
| mov x18, #4 | |||||
| mul x8, x8, x18 | |||||
| mov x20, #4 | |||||
| mul x8, x8, x20 | |||||
| LoopRowStart: | LoopRowStart: | ||||
| cmp x6, #4 | cmp x6, #4 | ||||
| @@ -595,9 +595,9 @@ LoopRow4: | |||||
| LoopColEnd: | LoopColEnd: | ||||
| add x0, x0, x17 | add x0, x0, x17 | ||||
| mov x18, #4 | |||||
| mul x18, x18, x7 | |||||
| sub x11, x11, x18 | |||||
| mov x20, #4 | |||||
| mul x20, x20, x7 | |||||
| sub x11, x11, x20 | |||||
| mov x2, x11 | mov x2, x11 | ||||
| subs x6, x6, #12 | subs x6, x6, #12 | ||||
| bgt LoopRowStart | bgt LoopRowStart | ||||
| @@ -33,12 +33,13 @@ | |||||
| // w16: per_channel | // w16: per_channel | ||||
| asm_function ConvDw3x3Int8Neon64 | asm_function ConvDw3x3Int8Neon64 | ||||
| sub sp, sp, #176 | |||||
| sub sp, sp, #192 | |||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| stp x19, x20, [sp], #16 | stp x19, x20, [sp], #16 | ||||
| stp x21, x22, [sp], #16 | stp x21, x22, [sp], #16 | ||||
| stp x23, x24, [sp], #16 | stp x23, x24, [sp], #16 | ||||
| stp x25, x26, [sp], #16 | |||||
| ldr x8, [sp] | ldr x8, [sp] | ||||
| ldr x9, [sp, #8] | ldr x9, [sp, #8] | ||||
| @@ -84,16 +85,16 @@ asm_function ConvDw3x3Int8Neon64 | |||||
| mov x16, x1 | mov x16, x1 | ||||
| add x17, x16, x5 | add x17, x16, x5 | ||||
| add x18, x17, x5 | |||||
| add x25, x17, x5 | |||||
| ld1 {v9.8b}, [x16], x4 | ld1 {v9.8b}, [x16], x4 | ||||
| ld1 {v10.8b}, [x16], x4 | ld1 {v10.8b}, [x16], x4 | ||||
| ld1 {v11.8b}, [x16], x4 | ld1 {v11.8b}, [x16], x4 | ||||
| ld1 {v13.8b}, [x17], x4 | ld1 {v13.8b}, [x17], x4 | ||||
| ld1 {v14.8b}, [x17], x4 | ld1 {v14.8b}, [x17], x4 | ||||
| ld1 {v15.8b}, [x17], x4 | ld1 {v15.8b}, [x17], x4 | ||||
| ld1 {v17.8b}, [x18], x4 | |||||
| ld1 {v18.8b}, [x18], x4 | |||||
| ld1 {v19.8b}, [x18], x4 | |||||
| ld1 {v17.8b}, [x25], x4 | |||||
| ld1 {v18.8b}, [x25], x4 | |||||
| ld1 {v19.8b}, [x25], x4 | |||||
| ld1 {v21.4s}, [x3] | ld1 {v21.4s}, [x3] | ||||
| ld1 {v22.4s}, [x19] | ld1 {v22.4s}, [x19] | ||||
| @@ -123,13 +124,13 @@ HEIGHT1_LOOP: | |||||
| ld1 {v16.8b}, [x17] | ld1 {v16.8b}, [x17] | ||||
| smlal v23.4s, v0.4h, v10.4h | smlal v23.4s, v0.4h, v10.4h | ||||
| smlal2 v24.4s, v0.8h, v10.8h | smlal2 v24.4s, v0.8h, v10.8h | ||||
| ld1 {v20.8b}, [x18] | |||||
| ld1 {v20.8b}, [x25] | |||||
| add x1, x1, x21 | add x1, x1, x21 | ||||
| ssubl v12.8h, v12.8b, v25.8b | ssubl v12.8h, v12.8b, v25.8b | ||||
| smlal v21.4s, v1.4h, v10.4h | smlal v21.4s, v1.4h, v10.4h | ||||
| mov x16, x1 | mov x16, x1 | ||||
| add x17, x16, x5 | add x17, x16, x5 | ||||
| add x18, x17, x5 | |||||
| add x25, x17, x5 | |||||
| smlal2 v22.4s, v1.8h, v10.8h | smlal2 v22.4s, v1.8h, v10.8h | ||||
| ld1 {v9.8b}, [x16], x4 | ld1 {v9.8b}, [x16], x4 | ||||
| ssubl v16.8h, v16.8b, v25.8b | ssubl v16.8h, v16.8b, v25.8b | ||||
| @@ -159,17 +160,17 @@ HEIGHT1_LOOP: | |||||
| smlal2 v24.4s, v5.8h, v16.8h | smlal2 v24.4s, v5.8h, v16.8h | ||||
| smlal v21.4s, v6.4h, v17.4h | smlal v21.4s, v6.4h, v17.4h | ||||
| smlal2 v22.4s, v6.8h, v17.8h | smlal2 v22.4s, v6.8h, v17.8h | ||||
| ld1 {v17.8b}, [x18], x4 | |||||
| ld1 {v17.8b}, [x25], x4 | |||||
| smlal v23.4s, v6.4h, v18.4h | smlal v23.4s, v6.4h, v18.4h | ||||
| smlal2 v24.4s, v6.8h, v18.8h | smlal2 v24.4s, v6.8h, v18.8h | ||||
| smlal v21.4s, v7.4h, v18.4h | smlal v21.4s, v7.4h, v18.4h | ||||
| smlal2 v22.4s, v7.8h, v18.8h | smlal2 v22.4s, v7.8h, v18.8h | ||||
| ld1 {v18.8b}, [x18], x4 | |||||
| ld1 {v18.8b}, [x25], x4 | |||||
| smlal v23.4s, v7.4h, v19.4h | smlal v23.4s, v7.4h, v19.4h | ||||
| smlal2 v24.4s, v7.8h, v19.8h | smlal2 v24.4s, v7.8h, v19.8h | ||||
| smlal v21.4s, v8.4h, v19.4h | smlal v21.4s, v8.4h, v19.4h | ||||
| smlal2 v22.4s, v8.8h, v19.8h | smlal2 v22.4s, v8.8h, v19.8h | ||||
| ld1 {v19.8b}, [x18], x4 | |||||
| ld1 {v19.8b}, [x25], x4 | |||||
| smlal v23.4s, v8.4h, v20.4h | smlal v23.4s, v8.4h, v20.4h | ||||
| smlal2 v24.4s, v8.8h, v20.8h | smlal2 v24.4s, v8.8h, v20.8h | ||||
| @@ -278,7 +279,7 @@ WIDTH2_LEFT: | |||||
| smlal2 v24.4s, v1.8h, v11.8h | smlal2 v24.4s, v1.8h, v11.8h | ||||
| smlal v21.4s, v2.4h, v11.4h | smlal v21.4s, v2.4h, v11.4h | ||||
| smlal2 v22.4s, v2.8h, v11.8h | smlal2 v22.4s, v2.8h, v11.8h | ||||
| ld1 {v20.8b}, [x18] | |||||
| ld1 {v20.8b}, [x25] | |||||
| smlal v23.4s, v2.4h, v12.4h | smlal v23.4s, v2.4h, v12.4h | ||||
| smlal2 v24.4s, v2.8h, v12.8h | smlal2 v24.4s, v2.8h, v12.8h | ||||
| smlal v21.4s, v3.4h, v13.4h | smlal v21.4s, v3.4h, v13.4h | ||||
| @@ -443,12 +444,13 @@ OUTZP3: | |||||
| st1 {v21.8b}, [x0], x6 | st1 {v21.8b}, [x0], x6 | ||||
| End: | End: | ||||
| sub sp, sp, #176 | |||||
| sub sp, sp, #192 | |||||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| ldp x19, x20, [sp], #16 | ldp x19, x20, [sp], #16 | ||||
| ldp x21, x22, [sp], #16 | ldp x21, x22, [sp], #16 | ||||
| ldp x23, x24, [sp], #16 | ldp x23, x24, [sp], #16 | ||||
| ldp x25, x26, [sp], #16 | |||||
| ret | ret | ||||
| #endif | #endif | ||||
| @@ -33,12 +33,13 @@ | |||||
| // w16: per_channel | // w16: per_channel | ||||
| asm_function ConvDw3x3Int8Stride2 | asm_function ConvDw3x3Int8Stride2 | ||||
| sub sp, sp, #176 | |||||
| sub sp, sp, #192 | |||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| stp x19, x20, [sp], #16 | stp x19, x20, [sp], #16 | ||||
| stp x21, x22, [sp], #16 | stp x21, x22, [sp], #16 | ||||
| stp x23, x24, [sp], #16 | stp x23, x24, [sp], #16 | ||||
| stp x25, x26, [sp], #16 | |||||
| ldr x8, [sp] | ldr x8, [sp] | ||||
| ldr x9, [sp, #8] | ldr x9, [sp, #8] | ||||
| @@ -71,7 +72,7 @@ asm_function ConvDw3x3Int8Stride2 | |||||
| mov x16, x1 | mov x16, x1 | ||||
| add x17, x16, x5 | add x17, x16, x5 | ||||
| add x18, x17, x5 | |||||
| add x25, x17, x5 | |||||
| ld1 {v9.8b}, [x16], x4 | ld1 {v9.8b}, [x16], x4 | ||||
| ld1 {v10.8b}, [x16], x4 | ld1 {v10.8b}, [x16], x4 | ||||
| ssubl v9.8h, v9.8b, v28.8b | ssubl v9.8h, v9.8b, v28.8b | ||||
| @@ -83,11 +84,11 @@ asm_function ConvDw3x3Int8Stride2 | |||||
| ssubl v14.8h, v14.8b, v28.8b | ssubl v14.8h, v14.8b, v28.8b | ||||
| ld1 {v16.8b}, [x17], x4 | ld1 {v16.8b}, [x17], x4 | ||||
| ssubl v15.8h, v15.8b, v28.8b | ssubl v15.8h, v15.8b, v28.8b | ||||
| ld1 {v19.8b}, [x18], x4 | |||||
| ld1 {v19.8b}, [x25], x4 | |||||
| ssubl v16.8h, v16.8b, v28.8b | ssubl v16.8h, v16.8b, v28.8b | ||||
| ld1 {v20.8b}, [x18], x4 | |||||
| ld1 {v20.8b}, [x25], x4 | |||||
| ssubl v19.8h, v19.8b, v28.8b | ssubl v19.8h, v19.8b, v28.8b | ||||
| ld1 {v21.8b}, [x18], x4 | |||||
| ld1 {v21.8b}, [x25], x4 | |||||
| ssubl v20.8h, v20.8b, v28.8b | ssubl v20.8h, v20.8b, v28.8b | ||||
| ssubl v21.8h, v21.8b, v28.8b | ssubl v21.8h, v21.8b, v28.8b | ||||
| @@ -108,7 +109,7 @@ HEIGHT1_LOOP: | |||||
| ld1 {v17.8b}, [x17], x4 | ld1 {v17.8b}, [x17], x4 | ||||
| ssubl v12.8h, v12.8b, v28.8b | ssubl v12.8h, v12.8b, v28.8b | ||||
| smlal v26.4s, v0.4h, v11.4h | smlal v26.4s, v0.4h, v11.4h | ||||
| ld1 {v22.8b}, [x18], x4 | |||||
| ld1 {v22.8b}, [x25], x4 | |||||
| ssubl v17.8h, v17.8b, v28.8b | ssubl v17.8h, v17.8b, v28.8b | ||||
| smlal2 v27.4s, v0.8h, v11.8h | smlal2 v27.4s, v0.8h, v11.8h | ||||
| ld1 {v13.8b}, [x16], x4 | ld1 {v13.8b}, [x16], x4 | ||||
| @@ -117,7 +118,7 @@ HEIGHT1_LOOP: | |||||
| ld1 {v18.8b}, [x17], x4 | ld1 {v18.8b}, [x17], x4 | ||||
| ssubl v13.8h, v13.8b, v28.8b | ssubl v13.8h, v13.8b, v28.8b | ||||
| smlal2 v25.4s, v1.8h, v10.8h | smlal2 v25.4s, v1.8h, v10.8h | ||||
| ld1 {v23.8b}, [x18], x4 | |||||
| ld1 {v23.8b}, [x25], x4 | |||||
| ssubl v18.8h, v18.8b, v28.8b | ssubl v18.8h, v18.8b, v28.8b | ||||
| smlal v26.4s, v1.4h, v12.4h | smlal v26.4s, v1.4h, v12.4h | ||||
| mov v9.16b, v13.16b | mov v9.16b, v13.16b | ||||
| @@ -157,12 +158,12 @@ HEIGHT1_LOOP: | |||||
| smlal2 v27.4s, v6.8h, v21.8h | smlal2 v27.4s, v6.8h, v21.8h | ||||
| smlal v24.4s, v7.4h, v20.4h | smlal v24.4s, v7.4h, v20.4h | ||||
| smlal2 v25.4s, v7.8h, v20.8h | smlal2 v25.4s, v7.8h, v20.8h | ||||
| ld1 {v20.8b}, [x18], x4 | |||||
| ld1 {v20.8b}, [x25], x4 | |||||
| smlal v26.4s, v7.4h, v22.4h | smlal v26.4s, v7.4h, v22.4h | ||||
| smlal2 v27.4s, v7.8h, v22.8h | smlal2 v27.4s, v7.8h, v22.8h | ||||
| smlal v24.4s, v8.4h, v21.4h | smlal v24.4s, v8.4h, v21.4h | ||||
| smlal2 v25.4s, v8.8h, v21.8h | smlal2 v25.4s, v8.8h, v21.8h | ||||
| ld1 {v21.8b}, [x18], x4 | |||||
| ld1 {v21.8b}, [x25], x4 | |||||
| ssubl v20.8h, v20.8b, v28.8b | ssubl v20.8h, v20.8b, v28.8b | ||||
| smlal v26.4s, v8.4h, v23.4h | smlal v26.4s, v8.4h, v23.4h | ||||
| ssubl v21.8h, v21.8b, v28.8b | ssubl v21.8h, v21.8b, v28.8b | ||||
| @@ -260,7 +261,7 @@ WIDTH2_LEFT: | |||||
| ld1 {v17.8b}, [x17], x4 | ld1 {v17.8b}, [x17], x4 | ||||
| ssubl v12.8h, v12.8b, v28.8b | ssubl v12.8h, v12.8b, v28.8b | ||||
| smlal v26.4s, v0.4h, v11.4h | smlal v26.4s, v0.4h, v11.4h | ||||
| ld1 {v22.8b}, [x18], x4 | |||||
| ld1 {v22.8b}, [x25], x4 | |||||
| ssubl v17.8h, v17.8b, v28.8b | ssubl v17.8h, v17.8b, v28.8b | ||||
| smlal2 v27.4s, v0.8h, v11.8h | smlal2 v27.4s, v0.8h, v11.8h | ||||
| ld1 {v13.8b}, [x16], x4 | ld1 {v13.8b}, [x16], x4 | ||||
| @@ -269,7 +270,7 @@ WIDTH2_LEFT: | |||||
| ld1 {v18.8b}, [x17], x4 | ld1 {v18.8b}, [x17], x4 | ||||
| ssubl v13.8h, v13.8b, v28.8b | ssubl v13.8h, v13.8b, v28.8b | ||||
| smlal2 v25.4s, v1.8h, v10.8h | smlal2 v25.4s, v1.8h, v10.8h | ||||
| ld1 {v23.8b}, [x18], x4 | |||||
| ld1 {v23.8b}, [x25], x4 | |||||
| ssubl v18.8h, v18.8b, v28.8b | ssubl v18.8h, v18.8b, v28.8b | ||||
| smlal v26.4s, v1.4h, v12.4h | smlal v26.4s, v1.4h, v12.4h | ||||
| ssubl v23.8h, v23.8b, v28.8b | ssubl v23.8h, v23.8b, v28.8b | ||||
| @@ -452,11 +453,12 @@ OUTZP3: | |||||
| st1 {v24.8b}, [x0], x6 | st1 {v24.8b}, [x0], x6 | ||||
| End: | End: | ||||
| sub sp, sp, #176 | |||||
| sub sp, sp, #192 | |||||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| ldp x19, x20, [sp], #16 | ldp x19, x20, [sp], #16 | ||||
| ldp x21, x22, [sp], #16 | ldp x21, x22, [sp], #16 | ||||
| ldp x23, x24, [sp], #16 | ldp x23, x24, [sp], #16 | ||||
| ldp x25, x26, [sp], #16 | |||||
| ret | ret | ||||
| #endif | #endif | ||||
| @@ -19,12 +19,13 @@ asm_function ConvDwFp32Center | |||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should also be preserved | // x19 ~ x29 should also be preserved | ||||
| // whereas our coding style does not permit such a number of parameters | // whereas our coding style does not permit such a number of parameters | ||||
| sub sp, sp, #176 | |||||
| sub sp, sp, #192 | |||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| stp x19, x20, [sp], #16 | stp x19, x20, [sp], #16 | ||||
| stp x21, x22, [sp], #16 | stp x21, x22, [sp], #16 | ||||
| stp x23, x24, [sp], #16 | stp x23, x24, [sp], #16 | ||||
| stp x25, x26, [sp], #16 | |||||
| ldr x8, [sp] | ldr x8, [sp] | ||||
| ldr x9, [sp, #8] | ldr x9, [sp, #8] | ||||
| @@ -72,7 +73,7 @@ asm_function ConvDwFp32Center | |||||
| mov v14.16b, v24.16b | mov v14.16b, v24.16b | ||||
| mov v15.16b, v24.16b | mov v15.16b, v24.16b | ||||
| LoopKh16: | LoopKh16: | ||||
| mov x18, x7 | |||||
| mov x25, x7 | |||||
| mov x21, x16 | mov x21, x16 | ||||
| LoopKw16: | LoopKw16: | ||||
| mov x22, x21 | mov x22, x21 | ||||
| @@ -109,7 +110,7 @@ asm_function ConvDwFp32Center | |||||
| ld1 {v23.4s}, [x22], x11 | ld1 {v23.4s}, [x22], x11 | ||||
| fmla v14.4s, v22.4s, v25.4s | fmla v14.4s, v22.4s, v25.4s | ||||
| fmla v15.4s, v23.4s, v25.4s | fmla v15.4s, v23.4s, v25.4s | ||||
| subs x18, x18, #1 | |||||
| subs x25, x25, #1 | |||||
| add x21, x21, x13 | add x21, x21, x13 | ||||
| bne LoopKw16 | bne LoopKw16 | ||||
| add x16, x16, x12 | add x16, x16, x12 | ||||
| @@ -192,7 +193,7 @@ asm_function ConvDwFp32Center | |||||
| mov v6.16b, v24.16b | mov v6.16b, v24.16b | ||||
| mov v7.16b, v24.16b | mov v7.16b, v24.16b | ||||
| LoopKh8: | LoopKh8: | ||||
| mov x18, x7 | |||||
| mov x25, x7 | |||||
| mov x21, x16 | mov x21, x16 | ||||
| LoopKw8: | LoopKw8: | ||||
| mov x22, x21 | mov x22, x21 | ||||
| @@ -213,7 +214,7 @@ asm_function ConvDwFp32Center | |||||
| ld1 {v23.4s}, [x22], x11 | ld1 {v23.4s}, [x22], x11 | ||||
| fmla v6.4s, v22.4s, v25.4s | fmla v6.4s, v22.4s, v25.4s | ||||
| fmla v7.4s, v23.4s, v25.4s | fmla v7.4s, v23.4s, v25.4s | ||||
| subs x18, x18, #1 | |||||
| subs x25, x25, #1 | |||||
| add x21, x21, x13 | add x21, x21, x13 | ||||
| bne LoopKw8 | bne LoopKw8 | ||||
| add x16, x16, x12 | add x16, x16, x12 | ||||
| @@ -261,13 +262,13 @@ asm_function ConvDwFp32Center | |||||
| mov x20, x6 | mov x20, x6 | ||||
| mov v0.16b, v24.16b | mov v0.16b, v24.16b | ||||
| LoopKh: | LoopKh: | ||||
| mov x18, x7 | |||||
| mov x25, x7 | |||||
| mov x22, x16 | mov x22, x16 | ||||
| LoopKw: | LoopKw: | ||||
| ld1 {v16.4s}, [x22], x13 | ld1 {v16.4s}, [x22], x13 | ||||
| ld1 {v25.4s}, [x17], #16 | ld1 {v25.4s}, [x17], #16 | ||||
| fmla v0.4s, v16.4s, v25.4s | fmla v0.4s, v16.4s, v25.4s | ||||
| subs x18, x18, #1 | |||||
| subs x25, x25, #1 | |||||
| bne LoopKw | bne LoopKw | ||||
| add x16, x16, x12 | add x16, x16, x12 | ||||
| subs x20, x20, #1 | subs x20, x20, #1 | ||||
| @@ -290,11 +291,12 @@ asm_function ConvDwFp32Center | |||||
| subs x4, x4, #1 | subs x4, x4, #1 | ||||
| bne LoopH | bne LoopH | ||||
| sub sp, sp, #176 | |||||
| sub sp, sp, #192 | |||||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| ldp x19, x20, [sp], #16 | ldp x19, x20, [sp], #16 | ||||
| ldp x21, x22, [sp], #16 | ldp x21, x22, [sp], #16 | ||||
| ldp x23, x24, [sp], #16 | ldp x23, x24, [sp], #16 | ||||
| ldp x25, x26, [sp], #16 | |||||
| ret | ret | ||||
| #endif | #endif | ||||
| @@ -13,8 +13,9 @@ | |||||
| // x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6 | // x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6 | ||||
| asm_function ConvDwFp32Indirect3x3 | asm_function ConvDwFp32Indirect3x3 | ||||
| sub sp, sp, #16 | |||||
| sub sp, sp, #32 | |||||
| stp x19, x20, [sp], #16 | stp x19, x20, [sp], #16 | ||||
| stp x21, x22, [sp], #16 | |||||
| movi v31.4s, #6 | movi v31.4s, #6 | ||||
| scvtf v31.4s, v31.4s | scvtf v31.4s, v31.4s | ||||
| @@ -28,7 +29,7 @@ asm_function ConvDwFp32Indirect3x3 | |||||
| ldp x12, x13, [x1] | ldp x12, x13, [x1] | ||||
| ldp x14, x15, [x1, #16] | ldp x14, x15, [x1, #16] | ||||
| ldp x16, x17, [x1, #32] | ldp x16, x17, [x1, #32] | ||||
| ldp x18, x19, [x1, #48] | |||||
| ldp x21, x19, [x1, #48] | |||||
| ldr x20, [x1, #64] | ldr x20, [x1, #64] | ||||
| mov x9, x2 | mov x9, x2 | ||||
| mov x10, x3 | mov x10, x3 | ||||
| @@ -56,7 +57,7 @@ asm_function ConvDwFp32Indirect3x3 | |||||
| ld1 {v5.4s}, [x17], #16 | ld1 {v5.4s}, [x17], #16 | ||||
| ld1 {v22.4s}, [x9], #16 | ld1 {v22.4s}, [x9], #16 | ||||
| fmla v29.4s, v3.4s, v20.4s | fmla v29.4s, v3.4s, v20.4s | ||||
| ld1 {v6.4s}, [x18], #16 | |||||
| ld1 {v6.4s}, [x21], #16 | |||||
| ld1 {v23.4s}, [x9], #16 | ld1 {v23.4s}, [x9], #16 | ||||
| fmla v29.4s, v4.4s, v21.4s | fmla v29.4s, v4.4s, v21.4s | ||||
| ld1 {v7.4s}, [x19], #16 | ld1 {v7.4s}, [x19], #16 | ||||
| @@ -100,7 +101,7 @@ asm_function ConvDwFp32Indirect3x3 | |||||
| ld1 {v5.4s}, [x17], #16 | ld1 {v5.4s}, [x17], #16 | ||||
| ld1 {v22.4s}, [x9], #16 | ld1 {v22.4s}, [x9], #16 | ||||
| fmla v29.4s, v3.4s, v20.4s | fmla v29.4s, v3.4s, v20.4s | ||||
| ld1 {v6.4s}, [x18], #16 | |||||
| ld1 {v6.4s}, [x21], #16 | |||||
| ld1 {v23.4s}, [x9], #16 | ld1 {v23.4s}, [x9], #16 | ||||
| fmla v29.4s, v4.4s, v21.4s | fmla v29.4s, v4.4s, v21.4s | ||||
| ld1 {v7.4s}, [x19], #16 | ld1 {v7.4s}, [x19], #16 | ||||
| @@ -141,7 +142,8 @@ asm_function ConvDwFp32Indirect3x3 | |||||
| cmp x5, #0 | cmp x5, #0 | ||||
| bgt LoopPixel | bgt LoopPixel | ||||
| End: | End: | ||||
| sub sp, sp, #16 | |||||
| sub sp, sp, #32 | |||||
| ldp x19, x20, [sp], #16 | ldp x19, x20, [sp], #16 | ||||
| ldp x21, x22, [sp], #16 | |||||
| ret | ret | ||||
| #endif | #endif | ||||
| @@ -13,17 +13,18 @@ | |||||
| // x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6 | // x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6 | ||||
| asm_function ConvDwFp32Indirect5x5 | asm_function ConvDwFp32Indirect5x5 | ||||
| sub sp, sp, #160 | |||||
| sub sp, sp, #176 | |||||
| stp x19, x20, [sp, #64] | stp x19, x20, [sp, #64] | ||||
| stp x21, x22, [sp, #80] | stp x21, x22, [sp, #80] | ||||
| stp x23, x24, [sp, #96] | stp x23, x24, [sp, #96] | ||||
| stp x25, x26, [sp, #112] | stp x25, x26, [sp, #112] | ||||
| stp x27, x28, [sp, #128] | stp x27, x28, [sp, #128] | ||||
| stp x29, x30, [sp, #144] | stp x29, x30, [sp, #144] | ||||
| ldrb w8, [sp, #160] | |||||
| ldrb w8, [sp, #176] | |||||
| stp x2, x3, [sp] | stp x2, x3, [sp] | ||||
| stp x4, x6, [sp, #16] | stp x4, x6, [sp, #16] | ||||
| stp x7, x8, [sp, #32] | stp x7, x8, [sp, #32] | ||||
| stp x0, x1, [sp, #160] | |||||
| movi v31.4s, #6 | movi v31.4s, #6 | ||||
| scvtf v31.4s, v31.4s | scvtf v31.4s, v31.4s | ||||
| @@ -44,7 +45,7 @@ asm_function ConvDwFp32Indirect5x5 | |||||
| ldp x12, x13, [x1, #48] | ldp x12, x13, [x1, #48] | ||||
| ldp x14, x15, [x1, #64] | ldp x14, x15, [x1, #64] | ||||
| ldp x16, x17, [x1, #80] | ldp x16, x17, [x1, #80] | ||||
| ldp x18, x19, [x1, #96] | |||||
| ldp x0, x19, [x1, #96] | |||||
| ldp x20, x21, [x1, #112] | ldp x20, x21, [x1, #112] | ||||
| ldp x22, x23, [x1, #128] | ldp x22, x23, [x1, #128] | ||||
| ldp x24, x25, [x1, #144] | ldp x24, x25, [x1, #144] | ||||
| @@ -93,7 +94,7 @@ asm_function ConvDwFp32Indirect5x5 | |||||
| ld1 {v1.4s}, [x17], #16 | ld1 {v1.4s}, [x17], #16 | ||||
| ld1 {v19.4s}, [x5], #16 | ld1 {v19.4s}, [x5], #16 | ||||
| fmla v29.4s, v7.4s, v25.4s | fmla v29.4s, v7.4s, v25.4s | ||||
| ld1 {v2.4s}, [x18], #16 | |||||
| ld1 {v2.4s}, [x0], #16 | |||||
| ld1 {v20.4s}, [x5], #16 | ld1 {v20.4s}, [x5], #16 | ||||
| fmla v29.4s, v16.4s, v26.4s | fmla v29.4s, v16.4s, v26.4s | ||||
| ld1 {v3.4s}, [x19], #16 | ld1 {v3.4s}, [x19], #16 | ||||
| @@ -160,7 +161,9 @@ asm_function ConvDwFp32Indirect5x5 | |||||
| RELU: | RELU: | ||||
| fmax v29.4s, v29.4s, v30.4s | fmax v29.4s, v29.4s, v30.4s | ||||
| WRITE: | WRITE: | ||||
| st1 {v29.4s}, [x0], #16 | |||||
| ldr x4, [sp, #160] | |||||
| st1 {v29.4s}, [x4], #16 | |||||
| str x4, [sp, #160] | |||||
| ldr x4, [sp, #56] | ldr x4, [sp, #56] | ||||
| ld1 {v29.4s}, [x4], #16 | ld1 {v29.4s}, [x4], #16 | ||||
| @@ -195,7 +198,7 @@ asm_function ConvDwFp32Indirect5x5 | |||||
| ld1 {v1.4s}, [x17], #16 | ld1 {v1.4s}, [x17], #16 | ||||
| ld1 {v19.4s}, [x5], #16 | ld1 {v19.4s}, [x5], #16 | ||||
| fmla v29.4s, v7.4s, v25.4s | fmla v29.4s, v7.4s, v25.4s | ||||
| ld1 {v2.4s}, [x18], #16 | |||||
| ld1 {v2.4s}, [x0], #16 | |||||
| ld1 {v20.4s}, [x5], #16 | ld1 {v20.4s}, [x5], #16 | ||||
| fmla v29.4s, v16.4s, v26.4s | fmla v29.4s, v16.4s, v26.4s | ||||
| ld1 {v3.4s}, [x19], #16 | ld1 {v3.4s}, [x19], #16 | ||||
| @@ -253,18 +256,24 @@ asm_function ConvDwFp32Indirect5x5 | |||||
| LeftWrite: | LeftWrite: | ||||
| cmp x2, #4 | cmp x2, #4 | ||||
| bne Write3 | bne Write3 | ||||
| st1 {v29.4s}, [x0], #16 | |||||
| ldr x4, [sp, #160] | |||||
| st1 {v29.4s}, [x4], #16 | |||||
| str x4, [sp, #160] | |||||
| b NextPixel | b NextPixel | ||||
| Write3: | Write3: | ||||
| sxtw x2, w2 | sxtw x2, w2 | ||||
| tbnz w2, #1, Write2 | tbnz w2, #1, Write2 | ||||
| tbnz w2, #0, Write1 | tbnz w2, #0, Write1 | ||||
| Write2: | Write2: | ||||
| st1 {v29.2s}, [x0], #8 | |||||
| ldr x4, [sp, #160] | |||||
| st1 {v29.2s}, [x4], #8 | |||||
| str x4, [sp, #160] | |||||
| ext v29.16b, v29.16b, v29.16b, #8 | ext v29.16b, v29.16b, v29.16b, #8 | ||||
| tbz w2, #0, NextPixel | tbz w2, #0, NextPixel | ||||
| Write1: | Write1: | ||||
| str s29, [x0], #4 | |||||
| ldr x4, [sp, #160] | |||||
| str s29, [x4], #4 | |||||
| str x4, [sp, #160] | |||||
| NextPixel: | NextPixel: | ||||
| ldr x2, [sp, #24] | ldr x2, [sp, #24] | ||||
| @@ -279,6 +288,6 @@ End: | |||||
| ldp x25, x26, [sp, #112] | ldp x25, x26, [sp, #112] | ||||
| ldp x27, x28, [sp, #128] | ldp x27, x28, [sp, #128] | ||||
| ldp x29, x30, [sp, #144] | ldp x29, x30, [sp, #144] | ||||
| add sp, sp, #160 | |||||
| add sp, sp, #176 | |||||
| ret | ret | ||||
| #endif | #endif | ||||
| @@ -22,12 +22,13 @@ asm_function ConvDwInt8Center | |||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should also be preserved | // x19 ~ x29 should also be preserved | ||||
| // whereas our coding style does not permit such a number of parameters | // whereas our coding style does not permit such a number of parameters | ||||
| sub sp, sp, #176 | |||||
| sub sp, sp, #192 | |||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| stp x19, x20, [sp], #16 | stp x19, x20, [sp], #16 | ||||
| stp x21, x22, [sp], #16 | stp x21, x22, [sp], #16 | ||||
| stp x23, x24, [sp], #16 | stp x23, x24, [sp], #16 | ||||
| stp x25, x26, [sp], #16 | |||||
| ldr x8, [sp] | ldr x8, [sp] | ||||
| ldr x9, [sp, #8] | ldr x9, [sp, #8] | ||||
| @@ -51,9 +52,9 @@ asm_function ConvDwInt8Center | |||||
| ld1 {v24.4s}, [x17], #16 | ld1 {v24.4s}, [x17], #16 | ||||
| ld1 {v25.4s}, [x17], #16 | ld1 {v25.4s}, [x17], #16 | ||||
| ldr x18, [sp, #80] // right shift | |||||
| ld1 {v26.4s}, [x18], #16 | |||||
| ld1 {v27.4s}, [x18], #16 | |||||
| ldr x25, [sp, #80] // right shift | |||||
| ld1 {v26.4s}, [x25], #16 | |||||
| ld1 {v27.4s}, [x25], #16 | |||||
| ldr x19, [sp, #88] // acc_min | ldr x19, [sp, #88] // acc_min | ||||
| ld1 {v28.4s}, [x19], #16 | ld1 {v28.4s}, [x19], #16 | ||||
| @@ -90,7 +91,7 @@ asm_function ConvDwInt8Center | |||||
| mov v6.16b, v17.16b | mov v6.16b, v17.16b | ||||
| mov v7.16b, v18.16b | mov v7.16b, v18.16b | ||||
| LoopKh4: | LoopKh4: | ||||
| mov x18, x7 | |||||
| mov x25, x7 | |||||
| mov x21, x16 | mov x21, x16 | ||||
| LoopKw4: | LoopKw4: | ||||
| mov x22, x21 | mov x22, x21 | ||||
| @@ -116,7 +117,7 @@ asm_function ConvDwInt8Center | |||||
| smlal v6.4s, v8.4h, v16.4h | smlal v6.4s, v8.4h, v16.4h | ||||
| smlal2 v7.4s, v8.8h, v16.8h | smlal2 v7.4s, v8.8h, v16.8h | ||||
| subs x18, x18, #1 | |||||
| subs x25, x25, #1 | |||||
| add x21, x21, x13 | add x21, x21, x13 | ||||
| bne LoopKw4 | bne LoopKw4 | ||||
| add x16, x16, x12 | add x16, x16, x12 | ||||
| @@ -194,15 +195,15 @@ asm_function ConvDwInt8Center | |||||
| mov x16, x3 | mov x16, x3 | ||||
| add x17, x16, x9 | add x17, x16, x9 | ||||
| add x18, x17, x9 | |||||
| add x21, x18, x9 | |||||
| add x25, x17, x9 | |||||
| add x21, x25, x9 | |||||
| st1 {v0.s}[0], [x16], #4 | st1 {v0.s}[0], [x16], #4 | ||||
| st1 {v1.s}[0], [x16], #4 | st1 {v1.s}[0], [x16], #4 | ||||
| st1 {v2.s}[0], [x17], #4 | st1 {v2.s}[0], [x17], #4 | ||||
| st1 {v3.s}[0], [x17], #4 | st1 {v3.s}[0], [x17], #4 | ||||
| st1 {v4.s}[0], [x18], #4 | |||||
| st1 {v5.s}[0], [x18], #4 | |||||
| st1 {v4.s}[0], [x25], #4 | |||||
| st1 {v5.s}[0], [x25], #4 | |||||
| st1 {v6.s}[0], [x21], #4 | st1 {v6.s}[0], [x21], #4 | ||||
| st1 {v7.s}[0], [x21], #4 | st1 {v7.s}[0], [x21], #4 | ||||
| @@ -221,7 +222,7 @@ asm_function ConvDwInt8Center | |||||
| mov v0.16b, v17.16b | mov v0.16b, v17.16b | ||||
| mov v1.16b, v18.16b | mov v1.16b, v18.16b | ||||
| LoopKh: | LoopKh: | ||||
| mov x18, x7 | |||||
| mov x25, x7 | |||||
| mov x22, x16 | mov x22, x16 | ||||
| LoopKw: | LoopKw: | ||||
| ld1 {v15.8b}, [x22], x13 | ld1 {v15.8b}, [x22], x13 | ||||
| @@ -229,7 +230,7 @@ asm_function ConvDwInt8Center | |||||
| ld1 {v16.8h}, [x17], #16 | ld1 {v16.8h}, [x17], #16 | ||||
| smlal v0.4s, v14.4h, v16.4h | smlal v0.4s, v14.4h, v16.4h | ||||
| smlal2 v1.4s, v14.8h, v16.8h | smlal2 v1.4s, v14.8h, v16.8h | ||||
| subs x18, x18, #1 | |||||
| subs x25, x25, #1 | |||||
| bne LoopKw | bne LoopKw | ||||
| add x16, x16, x12 | add x16, x16, x12 | ||||
| subs x20, x20, #1 | subs x20, x20, #1 | ||||
| @@ -271,11 +272,12 @@ asm_function ConvDwInt8Center | |||||
| subs x4, x4, #1 | subs x4, x4, #1 | ||||
| bne LoopH | bne LoopH | ||||
| sub sp, sp, #176 | |||||
| sub sp, sp, #192 | |||||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| ldp x19, x20, [sp], #16 | ldp x19, x20, [sp], #16 | ||||
| ldp x21, x22, [sp], #16 | ldp x21, x22, [sp], #16 | ||||
| ldp x23, x24, [sp], #16 | ldp x23, x24, [sp], #16 | ||||
| ldp x25, x26, [sp], #16 | |||||
| ret | ret | ||||
| #endif | #endif | ||||
| @@ -47,11 +47,11 @@ asm_function ConvSwFp32Center | |||||
| LoopH: | LoopH: | ||||
| mov x17, x1 | mov x17, x1 | ||||
| mov x18, x5 | |||||
| mov x28, x5 | |||||
| mov x3, x0 | mov x3, x0 | ||||
| cmp x18, #8 | |||||
| cmp x28, #8 | |||||
| blt LoopW | blt LoopW | ||||
| cmp x18, #16 | |||||
| cmp x28, #16 | |||||
| blt LoopW8 | blt LoopW8 | ||||
| LoopW16: | LoopW16: | ||||
| @@ -244,12 +244,12 @@ asm_function ConvSwFp32Center | |||||
| st1 {v14.4s}, [x3], x9 | st1 {v14.4s}, [x3], x9 | ||||
| st1 {v15.4s}, [x3], x9 | st1 {v15.4s}, [x3], x9 | ||||
| add x17, x17, x19 | add x17, x17, x19 | ||||
| sub x18, x18, #16 | |||||
| cmp x18, #0 | |||||
| sub x28, x28, #16 | |||||
| cmp x28, #0 | |||||
| ble LoopWEnd | ble LoopWEnd | ||||
| cmp x18, #8 | |||||
| cmp x28, #8 | |||||
| blt LoopW | blt LoopW | ||||
| cmp x18, #16 | |||||
| cmp x28, #16 | |||||
| bge LoopW16 | bge LoopW16 | ||||
| LoopW8: | LoopW8: | ||||
| mov x19, #8 | mov x19, #8 | ||||
| @@ -369,10 +369,10 @@ asm_function ConvSwFp32Center | |||||
| st1 {v6.4s}, [x3], x9 | st1 {v6.4s}, [x3], x9 | ||||
| st1 {v7.4s}, [x3], x9 | st1 {v7.4s}, [x3], x9 | ||||
| add x17, x17, x19 | add x17, x17, x19 | ||||
| sub x18, x18, #8 | |||||
| cmp x18, #0 | |||||
| sub x28, x28, #8 | |||||
| cmp x28, #0 | |||||
| ble LoopWEnd | ble LoopWEnd | ||||
| cmp x18, #8 | |||||
| cmp x28, #8 | |||||
| bge LoopW8 | bge LoopW8 | ||||
| LoopW: | LoopW: | ||||
| mov x20, x17 | mov x20, x17 | ||||
| @@ -427,7 +427,7 @@ asm_function ConvSwFp32Center | |||||
| Write: | Write: | ||||
| st1 {v0.4s}, [x3], x9 | st1 {v0.4s}, [x3], x9 | ||||
| add x17, x17, x12 | add x17, x17, x12 | ||||
| subs x18, x18, #1 | |||||
| subs x28, x28, #1 | |||||
| bne LoopW | bne LoopW | ||||
| LoopWEnd: | LoopWEnd: | ||||
| add x0, x0, x8 | add x0, x0, x8 | ||||
| @@ -33,12 +33,12 @@ asm_function DeconvDwFp32Center | |||||
| mov x16, x1 | mov x16, x1 | ||||
| mov x17, x4 | mov x17, x4 | ||||
| LoopW: | LoopW: | ||||
| mov x18, x15 | |||||
| mov x22, x15 | |||||
| mov x19, x2 | mov x19, x2 | ||||
| mov x20, x5 | mov x20, x5 | ||||
| ld1 {v1.4s}, [x16], x8 | ld1 {v1.4s}, [x16], x8 | ||||
| LoopKh: | LoopKh: | ||||
| mov x21, x18 | |||||
| mov x21, x22 | |||||
| mov x13, x6 | mov x13, x6 | ||||
| LoopKw: | LoopKw: | ||||
| ld1 {v0.4s}, [x21] | ld1 {v0.4s}, [x21] | ||||
| @@ -47,7 +47,7 @@ asm_function DeconvDwFp32Center | |||||
| st1 {v0.4s}, [x21], x12 | st1 {v0.4s}, [x21], x12 | ||||
| subs x13, x13, #1 | subs x13, x13, #1 | ||||
| bne LoopKw | bne LoopKw | ||||
| add x18, x18, x11 | |||||
| add x22, x22, x11 | |||||
| subs x20, x20, #1 | subs x20, x20, #1 | ||||
| bne LoopKh | bne LoopKh | ||||
| add x15, x15, x10 | add x15, x15, x10 | ||||
| @@ -21,30 +21,31 @@ | |||||
| // w13: c8_nhwc_c4 | // w13: c8_nhwc_c4 | ||||
| asm_function MatmulFloatNeon64 | asm_function MatmulFloatNeon64 | ||||
| sub sp, sp, #128 | |||||
| sub sp, sp, #144 | |||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| stp x19, x20, [sp], #16 | |||||
| ldr x9, [sp, #8] | ldr x9, [sp, #8] | ||||
| ldr x14, [sp, #16] | ldr x14, [sp, #16] | ||||
| mov w18, #32 // sizeof(float) * 8 | |||||
| mul w15, w5, w18 // block stride of lhs/rhs: sizeof(float) * 8 * depth | |||||
| mov x18, #4 | |||||
| mov w19, #32 // sizeof(float) * 8 | |||||
| mul w15, w5, w19 // block stride of lhs/rhs: sizeof(float) * 8 * depth | |||||
| mov x19, #4 | |||||
| ldr x17, [sp] | ldr x17, [sp] | ||||
| cbz x14, NoWinoSteps | cbz x14, NoWinoSteps | ||||
| mul x8, x7, x17 | mul x8, x7, x17 | ||||
| mov x11, #8 | mov x11, #8 | ||||
| mul x11, x11, x17 | mul x11, x11, x17 | ||||
| mul x8, x8, x18 | |||||
| mul x11, x11, x18 | |||||
| mul x8, x8, x19 | |||||
| mul x11, x11, x19 | |||||
| NoWinoSteps: | NoWinoSteps: | ||||
| mul x17, x17, x18 | |||||
| mul x17, x17, x19 | |||||
| L1: | L1: | ||||
| mov w10, w6 // reload lhs row | mov w10, w6 // reload lhs row | ||||
| mov x12, x0 // reload lhs ptr | mov x12, x0 // reload lhs ptr | ||||
| mov x18, x2 // reload dst ptr | |||||
| mov x19, x2 // reload dst ptr | |||||
| L2: | L2: | ||||
| mov x16, x1 // reload rhs ptr | mov x16, x1 // reload rhs ptr | ||||
| @@ -254,435 +255,435 @@ Write: | |||||
| b Write8 | b Write8 | ||||
| Write1: | Write1: | ||||
| str s8, [x18] | |||||
| str s8, [x19] | |||||
| cmp w10, #1 | cmp w10, #1 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x18, x18, x17 | |||||
| str s10, [x18] | |||||
| add x19, x19, x17 | |||||
| str s10, [x19] | |||||
| cmp w10, #2 | cmp w10, #2 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x18, x18, x17 | |||||
| str s12, [x18] | |||||
| add x19, x19, x17 | |||||
| str s12, [x19] | |||||
| cmp w10, #3 | cmp w10, #3 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x18, x18, x17 | |||||
| str s14, [x18] | |||||
| add x19, x19, x17 | |||||
| str s14, [x19] | |||||
| cmp w10, #4 | cmp w10, #4 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x18, x18, x17 | |||||
| str s16, [x18] | |||||
| add x19, x19, x17 | |||||
| str s16, [x19] | |||||
| cmp w10, #5 | cmp w10, #5 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x18, x18, x17 | |||||
| str s18, [x18] | |||||
| add x19, x19, x17 | |||||
| str s18, [x19] | |||||
| cmp w10, #6 | cmp w10, #6 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x18, x18, x17 | |||||
| str s20, [x18] | |||||
| add x19, x19, x17 | |||||
| str s20, [x19] | |||||
| cmp w10, #7 | cmp w10, #7 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x18, x18, x17 | |||||
| str s22, [x18] | |||||
| add x19, x19, x17 | |||||
| str s22, [x19] | |||||
| cmp w10, #8 | cmp w10, #8 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x18, x18, x17 | |||||
| str s24, [x18] | |||||
| add x19, x19, x17 | |||||
| str s24, [x19] | |||||
| cmp w10, #9 | cmp w10, #9 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x18, x18, x17 | |||||
| str s26, [x18] | |||||
| add x19, x19, x17 | |||||
| str s26, [x19] | |||||
| cmp w10, #10 | cmp w10, #10 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x18, x18, x17 | |||||
| str s28, [x18] | |||||
| add x19, x19, x17 | |||||
| str s28, [x19] | |||||
| cmp w10, #11 | cmp w10, #11 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x18, x18, x17 | |||||
| str s30, [x18] | |||||
| add x18, x18, x17 | |||||
| add x19, x19, x17 | |||||
| str s30, [x19] | |||||
| add x19, x19, x17 | |||||
| b WriteEnd | b WriteEnd | ||||
| Write2: | Write2: | ||||
| dup s9, v8.s[1] | dup s9, v8.s[1] | ||||
| stp s8, s9, [x18] | |||||
| stp s8, s9, [x19] | |||||
| cmp w10, #1 | cmp w10, #1 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x18, x18, x17 | |||||
| add x19, x19, x17 | |||||
| dup s11, v10.s[1] | dup s11, v10.s[1] | ||||
| stp s10, s11, [x18] | |||||
| stp s10, s11, [x19] | |||||
| cmp w10, #2 | cmp w10, #2 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x18, x18, x17 | |||||
| add x19, x19, x17 | |||||
| dup s13, v12.s[1] | dup s13, v12.s[1] | ||||
| stp s12, s13, [x18] | |||||
| stp s12, s13, [x19] | |||||
| cmp w10, #3 | cmp w10, #3 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x18, x18, x17 | |||||
| add x19, x19, x17 | |||||
| dup s15, v14.s[1] | dup s15, v14.s[1] | ||||
| stp s14, s15, [x18] | |||||
| stp s14, s15, [x19] | |||||
| cmp w10, #4 | cmp w10, #4 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x18, x18, x17 | |||||
| add x19, x19, x17 | |||||
| dup s17, v16.s[1] | dup s17, v16.s[1] | ||||
| stp s16, s17, [x18] | |||||
| stp s16, s17, [x19] | |||||
| cmp w10, #5 | cmp w10, #5 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x18, x18, x17 | |||||
| add x19, x19, x17 | |||||
| dup s19, v18.s[1] | dup s19, v18.s[1] | ||||
| stp s18, s19, [x18] | |||||
| stp s18, s19, [x19] | |||||
| cmp w10, #6 | cmp w10, #6 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x18, x18, x17 | |||||
| add x19, x19, x17 | |||||
| dup s21, v20.s[1] | dup s21, v20.s[1] | ||||
| stp s20, s21, [x18] | |||||
| stp s20, s21, [x19] | |||||
| cmp w10, #7 | cmp w10, #7 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x18, x18, x17 | |||||
| add x19, x19, x17 | |||||
| dup s23, v22.s[1] | dup s23, v22.s[1] | ||||
| stp s22, s23, [x18] | |||||
| stp s22, s23, [x19] | |||||
| cmp w10, #8 | cmp w10, #8 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x18, x18, x17 | |||||
| add x19, x19, x17 | |||||
| dup s25, v24.s[1] | dup s25, v24.s[1] | ||||
| stp s24, s25, [x18] | |||||
| stp s24, s25, [x19] | |||||
| cmp w10, #9 | cmp w10, #9 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x18, x18, x17 | |||||
| add x19, x19, x17 | |||||
| dup s27, v26.s[1] | dup s27, v26.s[1] | ||||
| stp s26, s27, [x18] | |||||
| stp s26, s27, [x19] | |||||
| cmp w10, #10 | cmp w10, #10 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x18, x18, x17 | |||||
| add x19, x19, x17 | |||||
| dup s29, v28.s[1] | dup s29, v28.s[1] | ||||
| stp s28, s29, [x18] | |||||
| stp s28, s29, [x19] | |||||
| cmp w10, #11 | cmp w10, #11 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x18, x18, x17 | |||||
| add x19, x19, x17 | |||||
| dup s31, v30.s[1] | dup s31, v30.s[1] | ||||
| stp s30, s31, [x18] | |||||
| add x18, x18, x17 | |||||
| stp s30, s31, [x19] | |||||
| add x19, x19, x17 | |||||
| b WriteEnd | b WriteEnd | ||||
| Write3: | Write3: | ||||
| add x13, x18, #8 | |||||
| add x13, x19, #8 | |||||
| dup s9, v8.s[1] | dup s9, v8.s[1] | ||||
| stp s8, s9, [x18] | |||||
| add x18, x18, x17 | |||||
| stp s8, s9, [x19] | |||||
| add x19, x19, x17 | |||||
| st1 {v8.s}[2], [x13], x17 | st1 {v8.s}[2], [x13], x17 | ||||
| cmp w10, #1 | cmp w10, #1 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| dup s11, v10.s[1] | dup s11, v10.s[1] | ||||
| stp s10, s11, [x18] | |||||
| add x18, x18, x17 | |||||
| stp s10, s11, [x19] | |||||
| add x19, x19, x17 | |||||
| st1 {v10.s}[2], [x13], x17 | st1 {v10.s}[2], [x13], x17 | ||||
| cmp w10, #2 | cmp w10, #2 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| dup s13, v12.s[1] | dup s13, v12.s[1] | ||||
| stp s12, s13, [x18] | |||||
| add x18, x18, x17 | |||||
| stp s12, s13, [x19] | |||||
| add x19, x19, x17 | |||||
| st1 {v12.s}[2], [x13], x17 | st1 {v12.s}[2], [x13], x17 | ||||
| cmp w10, #3 | cmp w10, #3 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| dup s15, v14.s[1] | dup s15, v14.s[1] | ||||
| stp s14, s15, [x18] | |||||
| add x18, x18, x17 | |||||
| stp s14, s15, [x19] | |||||
| add x19, x19, x17 | |||||
| st1 {v14.s}[2], [x13], x17 | st1 {v14.s}[2], [x13], x17 | ||||
| cmp w10, #4 | cmp w10, #4 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| dup s17, v16.s[1] | dup s17, v16.s[1] | ||||
| stp s16, s17, [x18] | |||||
| add x18, x18, x17 | |||||
| stp s16, s17, [x19] | |||||
| add x19, x19, x17 | |||||
| st1 {v16.s}[2], [x13], x17 | st1 {v16.s}[2], [x13], x17 | ||||
| cmp w10, #5 | cmp w10, #5 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| dup s19, v18.s[1] | dup s19, v18.s[1] | ||||
| stp s18, s19, [x18] | |||||
| add x18, x18, x17 | |||||
| stp s18, s19, [x19] | |||||
| add x19, x19, x17 | |||||
| st1 {v18.s}[2], [x13], x17 | st1 {v18.s}[2], [x13], x17 | ||||
| cmp w10, #6 | cmp w10, #6 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| dup s21, v20.s[1] | dup s21, v20.s[1] | ||||
| stp s20, s21, [x18] | |||||
| add x18, x18, x17 | |||||
| stp s20, s21, [x19] | |||||
| add x19, x19, x17 | |||||
| st1 {v20.s}[2], [x13], x17 | st1 {v20.s}[2], [x13], x17 | ||||
| cmp w10, #7 | cmp w10, #7 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| dup s23, v22.s[1] | dup s23, v22.s[1] | ||||
| stp s22, s23, [x18] | |||||
| add x18, x18, x17 | |||||
| stp s22, s23, [x19] | |||||
| add x19, x19, x17 | |||||
| st1 {v22.s}[2], [x13], x17 | st1 {v22.s}[2], [x13], x17 | ||||
| cmp w10, #8 | cmp w10, #8 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| dup s25, v24.s[1] | dup s25, v24.s[1] | ||||
| stp s24, s25, [x18] | |||||
| add x18, x18, x17 | |||||
| stp s24, s25, [x19] | |||||
| add x19, x19, x17 | |||||
| st1 {v24.s}[2], [x13], x17 | st1 {v24.s}[2], [x13], x17 | ||||
| cmp w10, #9 | cmp w10, #9 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| dup s27, v26.s[1] | dup s27, v26.s[1] | ||||
| stp s26, s27, [x18] | |||||
| add x18, x18, x17 | |||||
| stp s26, s27, [x19] | |||||
| add x19, x19, x17 | |||||
| st1 {v26.s}[2], [x13], x17 | st1 {v26.s}[2], [x13], x17 | ||||
| cmp w10, #10 | cmp w10, #10 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| dup s29, v28.s[1] | dup s29, v28.s[1] | ||||
| stp s28, s29, [x18] | |||||
| add x18, x18, x17 | |||||
| stp s28, s29, [x19] | |||||
| add x19, x19, x17 | |||||
| st1 {v28.s}[2], [x13], x17 | st1 {v28.s}[2], [x13], x17 | ||||
| cmp w10, #11 | cmp w10, #11 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| dup s31, v30.s[1] | dup s31, v30.s[1] | ||||
| stp s30, s31, [x18] | |||||
| add x18, x18, x17 | |||||
| stp s30, s31, [x19] | |||||
| add x19, x19, x17 | |||||
| st1 {v30.s}[2], [x13] | st1 {v30.s}[2], [x13] | ||||
| b WriteEnd | b WriteEnd | ||||
| Write4: | Write4: | ||||
| st1 {v8.4s}, [x18], x17 | |||||
| st1 {v8.4s}, [x19], x17 | |||||
| cmp w10, #1 | cmp w10, #1 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v10.4s}, [x18], x17 | |||||
| st1 {v10.4s}, [x19], x17 | |||||
| cmp w10, #2 | cmp w10, #2 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v12.4s}, [x18], x17 | |||||
| st1 {v12.4s}, [x19], x17 | |||||
| cmp w10, #3 | cmp w10, #3 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v14.4s}, [x18], x17 | |||||
| st1 {v14.4s}, [x19], x17 | |||||
| cmp w10, #4 | cmp w10, #4 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v16.4s}, [x18], x17 | |||||
| st1 {v16.4s}, [x19], x17 | |||||
| cmp w10, #5 | cmp w10, #5 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v18.4s}, [x18], x17 | |||||
| st1 {v18.4s}, [x19], x17 | |||||
| cmp w10, #6 | cmp w10, #6 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v20.4s}, [x18], x17 | |||||
| st1 {v20.4s}, [x19], x17 | |||||
| cmp w10, #7 | cmp w10, #7 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v22.4s}, [x18], x17 | |||||
| st1 {v22.4s}, [x19], x17 | |||||
| cmp w10, #8 | cmp w10, #8 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v24.4s}, [x18], x17 | |||||
| st1 {v24.4s}, [x19], x17 | |||||
| cmp w10, #9 | cmp w10, #9 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v26.4s}, [x18], x17 | |||||
| st1 {v26.4s}, [x19], x17 | |||||
| cmp w10, #10 | cmp w10, #10 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v28.4s}, [x18], x17 | |||||
| st1 {v28.4s}, [x19], x17 | |||||
| cmp w10, #11 | cmp w10, #11 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v30.4s}, [x18], x17 | |||||
| st1 {v30.4s}, [x19], x17 | |||||
| b WriteEnd | b WriteEnd | ||||
| Write5: | Write5: | ||||
| add x13, x18, #16 | |||||
| st1 {v8.4s}, [x18], x17 | |||||
| add x13, x19, #16 | |||||
| st1 {v8.4s}, [x19], x17 | |||||
| str s9, [x13] | str s9, [x13] | ||||
| cmp w10, #1 | cmp w10, #1 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v10.4s}, [x18], x17 | |||||
| st1 {v10.4s}, [x19], x17 | |||||
| str s11, [x13] | str s11, [x13] | ||||
| cmp w10, #2 | cmp w10, #2 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v12.4s}, [x18], x17 | |||||
| st1 {v12.4s}, [x19], x17 | |||||
| str s13, [x13] | str s13, [x13] | ||||
| cmp w10, #3 | cmp w10, #3 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v14.4s}, [x18], x17 | |||||
| st1 {v14.4s}, [x19], x17 | |||||
| str s15, [x13] | str s15, [x13] | ||||
| cmp w10, #4 | cmp w10, #4 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v16.4s}, [x18], x17 | |||||
| st1 {v16.4s}, [x19], x17 | |||||
| str s17, [x13] | str s17, [x13] | ||||
| cmp w10, #5 | cmp w10, #5 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v18.4s}, [x18], x17 | |||||
| st1 {v18.4s}, [x19], x17 | |||||
| str s19, [x13] | str s19, [x13] | ||||
| cmp w10, #6 | cmp w10, #6 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v20.4s}, [x18], x17 | |||||
| st1 {v20.4s}, [x19], x17 | |||||
| str s21, [x13] | str s21, [x13] | ||||
| cmp w10, #7 | cmp w10, #7 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v22.4s}, [x18], x17 | |||||
| st1 {v22.4s}, [x19], x17 | |||||
| str s23, [x13] | str s23, [x13] | ||||
| cmp w10, #8 | cmp w10, #8 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v24.4s}, [x18], x17 | |||||
| st1 {v24.4s}, [x19], x17 | |||||
| str s25, [x13] | str s25, [x13] | ||||
| cmp w10, #9 | cmp w10, #9 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v26.4s}, [x18], x17 | |||||
| st1 {v26.4s}, [x19], x17 | |||||
| str s27, [x13] | str s27, [x13] | ||||
| cmp w10, #10 | cmp w10, #10 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v28.4s}, [x18], x17 | |||||
| st1 {v28.4s}, [x19], x17 | |||||
| str s29, [x13] | str s29, [x13] | ||||
| cmp w10, #11 | cmp w10, #11 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v30.4s}, [x18], x17 | |||||
| st1 {v30.4s}, [x19], x17 | |||||
| str s31, [x13] | str s31, [x13] | ||||
| b WriteEnd | b WriteEnd | ||||
| Write6: | Write6: | ||||
| add x13, x18, #16 | |||||
| st1 {v8.4s}, [x18], x17 | |||||
| add x13, x19, #16 | |||||
| st1 {v8.4s}, [x19], x17 | |||||
| dup s8, v9.s[1] | dup s8, v9.s[1] | ||||
| stp s9, s8, [x13] | stp s9, s8, [x13] | ||||
| cmp w10, #1 | cmp w10, #1 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v10.4s}, [x18], x17 | |||||
| st1 {v10.4s}, [x19], x17 | |||||
| dup s10, v11.s[1] | dup s10, v11.s[1] | ||||
| stp s11, s10, [x13] | stp s11, s10, [x13] | ||||
| cmp w10, #2 | cmp w10, #2 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v12.4s}, [x18], x17 | |||||
| st1 {v12.4s}, [x19], x17 | |||||
| dup s12, v13.s[1] | dup s12, v13.s[1] | ||||
| stp s13, s12, [x13] | stp s13, s12, [x13] | ||||
| cmp w10, #3 | cmp w10, #3 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v14.4s}, [x18], x17 | |||||
| st1 {v14.4s}, [x19], x17 | |||||
| dup s14, v15.s[1] | dup s14, v15.s[1] | ||||
| stp s15, s14, [x13] | stp s15, s14, [x13] | ||||
| cmp w10, #4 | cmp w10, #4 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v16.4s}, [x18], x17 | |||||
| st1 {v16.4s}, [x19], x17 | |||||
| dup s16, v17.s[1] | dup s16, v17.s[1] | ||||
| stp s17, s16, [x13] | stp s17, s16, [x13] | ||||
| cmp w10, #5 | cmp w10, #5 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v18.4s}, [x18], x17 | |||||
| st1 {v18.4s}, [x19], x17 | |||||
| dup s18, v19.s[1] | dup s18, v19.s[1] | ||||
| stp s19, s18, [x13] | stp s19, s18, [x13] | ||||
| cmp w10, #6 | cmp w10, #6 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v20.4s}, [x18], x17 | |||||
| st1 {v20.4s}, [x19], x17 | |||||
| dup s20, v21.s[1] | dup s20, v21.s[1] | ||||
| stp s21, s20, [x13] | stp s21, s20, [x13] | ||||
| cmp w10, #7 | cmp w10, #7 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v22.4s}, [x18], x17 | |||||
| st1 {v22.4s}, [x19], x17 | |||||
| dup s22, v23.s[1] | dup s22, v23.s[1] | ||||
| stp s23, s22, [x13] | stp s23, s22, [x13] | ||||
| cmp w10, #8 | cmp w10, #8 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v24.4s}, [x18], x17 | |||||
| st1 {v24.4s}, [x19], x17 | |||||
| dup s24, v25.s[1] | dup s24, v25.s[1] | ||||
| stp s25, s24, [x13] | stp s25, s24, [x13] | ||||
| cmp w10, #9 | cmp w10, #9 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v26.4s}, [x18], x17 | |||||
| st1 {v26.4s}, [x19], x17 | |||||
| dup s26, v27.s[1] | dup s26, v27.s[1] | ||||
| stp s27, s26, [x13] | stp s27, s26, [x13] | ||||
| cmp w10, #10 | cmp w10, #10 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v28.4s}, [x18], x17 | |||||
| st1 {v28.4s}, [x19], x17 | |||||
| dup s28, v29.s[1] | dup s28, v29.s[1] | ||||
| stp s29, s28, [x13] | stp s29, s28, [x13] | ||||
| cmp w10, #11 | cmp w10, #11 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v30.4s}, [x18], x17 | |||||
| st1 {v30.4s}, [x19], x17 | |||||
| dup s30, v31.s[1] | dup s30, v31.s[1] | ||||
| stp s31, s30, [x13] | stp s31, s30, [x13] | ||||
| b WriteEnd | b WriteEnd | ||||
| Write7: | Write7: | ||||
| add x13, x18, #16 | |||||
| add x16, x18, #24 | |||||
| st1 {v8.4s}, [x18], x17 | |||||
| add x13, x19, #16 | |||||
| add x16, x19, #24 | |||||
| st1 {v8.4s}, [x19], x17 | |||||
| dup s8, v9.s[1] | dup s8, v9.s[1] | ||||
| stp s9, s8, [x13] | stp s9, s8, [x13] | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v9.s}[2], [x16], x17 | st1 {v9.s}[2], [x16], x17 | ||||
| cmp w10, #1 | cmp w10, #1 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v10.4s}, [x18], x17 | |||||
| st1 {v10.4s}, [x19], x17 | |||||
| dup s10, v11.s[1] | dup s10, v11.s[1] | ||||
| stp s11, s10, [x13] | stp s11, s10, [x13] | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v11.s}[2], [x16], x17 | st1 {v11.s}[2], [x16], x17 | ||||
| cmp w10, #2 | cmp w10, #2 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v12.4s}, [x18], x17 | |||||
| st1 {v12.4s}, [x19], x17 | |||||
| dup s12, v13.s[1] | dup s12, v13.s[1] | ||||
| stp s13, s12, [x13] | stp s13, s12, [x13] | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v13.s}[2], [x16], x17 | st1 {v13.s}[2], [x16], x17 | ||||
| cmp w10, #3 | cmp w10, #3 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v14.4s}, [x18], x17 | |||||
| st1 {v14.4s}, [x19], x17 | |||||
| dup s14, v15.s[1] | dup s14, v15.s[1] | ||||
| stp s15, s14, [x13] | stp s15, s14, [x13] | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v15.s}[2], [x16], x17 | st1 {v15.s}[2], [x16], x17 | ||||
| cmp w10, #4 | cmp w10, #4 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v16.4s}, [x18], x17 | |||||
| st1 {v16.4s}, [x19], x17 | |||||
| dup s16, v17.s[1] | dup s16, v17.s[1] | ||||
| stp s17, s16, [x13] | stp s17, s16, [x13] | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v17.s}[2], [x16], x17 | st1 {v17.s}[2], [x16], x17 | ||||
| cmp w10, #5 | cmp w10, #5 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v18.4s}, [x18], x17 | |||||
| st1 {v18.4s}, [x19], x17 | |||||
| dup s18, v19.s[1] | dup s18, v19.s[1] | ||||
| stp s19, s18, [x13] | stp s19, s18, [x13] | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v19.s}[2], [x16], x17 | st1 {v19.s}[2], [x16], x17 | ||||
| cmp w10, #6 | cmp w10, #6 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v20.4s}, [x18], x17 | |||||
| st1 {v20.4s}, [x19], x17 | |||||
| dup s20, v21.s[1] | dup s20, v21.s[1] | ||||
| stp s21, s20, [x13] | stp s21, s20, [x13] | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v21.s}[2], [x16], x17 | st1 {v21.s}[2], [x16], x17 | ||||
| cmp w10, #7 | cmp w10, #7 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v22.4s}, [x18], x17 | |||||
| st1 {v22.4s}, [x19], x17 | |||||
| dup s22, v23.s[1] | dup s22, v23.s[1] | ||||
| stp s23, s22, [x13] | stp s23, s22, [x13] | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v23.s}[2], [x16], x17 | st1 {v23.s}[2], [x16], x17 | ||||
| cmp w10, #8 | cmp w10, #8 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v24.4s}, [x18], x17 | |||||
| st1 {v24.4s}, [x19], x17 | |||||
| dup s24, v25.s[1] | dup s24, v25.s[1] | ||||
| stp s25, s24, [x13] | stp s25, s24, [x13] | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v25.s}[2], [x16], x17 | st1 {v25.s}[2], [x16], x17 | ||||
| cmp w10, #9 | cmp w10, #9 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v26.4s}, [x18], x17 | |||||
| st1 {v26.4s}, [x19], x17 | |||||
| dup s26, v27.s[1] | dup s26, v27.s[1] | ||||
| stp s27, s26, [x13] | stp s27, s26, [x13] | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v27.s}[2], [x16], x17 | st1 {v27.s}[2], [x16], x17 | ||||
| cmp w10, #10 | cmp w10, #10 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v28.4s}, [x18], x17 | |||||
| st1 {v28.4s}, [x19], x17 | |||||
| dup s28, v29.s[1] | dup s28, v29.s[1] | ||||
| stp s29, s28, [x13] | stp s29, s28, [x13] | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| st1 {v29.s}[2], [x16], x17 | st1 {v29.s}[2], [x16], x17 | ||||
| cmp w10, #11 | cmp w10, #11 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v30.4s}, [x18], x17 | |||||
| st1 {v30.4s}, [x19], x17 | |||||
| dup s30, v31.s[1] | dup s30, v31.s[1] | ||||
| stp s31, s30, [x13] | stp s31, s30, [x13] | ||||
| add x13, x13, x17 | add x13, x13, x17 | ||||
| @@ -697,54 +698,54 @@ WriteC8: | |||||
| st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x2], #64 | st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x2], #64 | ||||
| b WriteEnd | b WriteEnd | ||||
| WriteWino: | WriteWino: | ||||
| st1 {v8.4s, v9.4s}, [x18], x8 | |||||
| st1 {v10.4s, v11.4s}, [x18], x8 | |||||
| st1 {v12.4s, v13.4s}, [x18], x8 | |||||
| st1 {v14.4s, v15.4s}, [x18], x8 | |||||
| st1 {v16.4s, v17.4s}, [x18], x8 | |||||
| st1 {v18.4s, v19.4s}, [x18], x8 | |||||
| st1 {v20.4s, v21.4s}, [x18], x8 | |||||
| st1 {v22.4s, v23.4s}, [x18], x8 | |||||
| st1 {v24.4s, v25.4s}, [x18], x8 | |||||
| st1 {v26.4s, v27.4s}, [x18], x8 | |||||
| st1 {v28.4s, v29.4s}, [x18], x8 | |||||
| st1 {v30.4s, v31.4s}, [x18], x8 | |||||
| st1 {v8.4s, v9.4s}, [x19], x8 | |||||
| st1 {v10.4s, v11.4s}, [x19], x8 | |||||
| st1 {v12.4s, v13.4s}, [x19], x8 | |||||
| st1 {v14.4s, v15.4s}, [x19], x8 | |||||
| st1 {v16.4s, v17.4s}, [x19], x8 | |||||
| st1 {v18.4s, v19.4s}, [x19], x8 | |||||
| st1 {v20.4s, v21.4s}, [x19], x8 | |||||
| st1 {v22.4s, v23.4s}, [x19], x8 | |||||
| st1 {v24.4s, v25.4s}, [x19], x8 | |||||
| st1 {v26.4s, v27.4s}, [x19], x8 | |||||
| st1 {v28.4s, v29.4s}, [x19], x8 | |||||
| st1 {v30.4s, v31.4s}, [x19], x8 | |||||
| b WriteEnd | b WriteEnd | ||||
| Write8: | Write8: | ||||
| st1 {v8.4s, v9.4s}, [x18], x17 | |||||
| st1 {v8.4s, v9.4s}, [x19], x17 | |||||
| cmp w10, #1 | cmp w10, #1 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v10.4s, v11.4s}, [x18], x17 | |||||
| st1 {v10.4s, v11.4s}, [x19], x17 | |||||
| cmp w10, #2 | cmp w10, #2 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v12.4s, v13.4s}, [x18], x17 | |||||
| st1 {v12.4s, v13.4s}, [x19], x17 | |||||
| cmp w10, #3 | cmp w10, #3 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v14.4s, v15.4s}, [x18], x17 | |||||
| st1 {v14.4s, v15.4s}, [x19], x17 | |||||
| cmp w10, #4 | cmp w10, #4 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v16.4s, v17.4s}, [x18], x17 | |||||
| st1 {v16.4s, v17.4s}, [x19], x17 | |||||
| cmp w10, #5 | cmp w10, #5 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v18.4s, v19.4s}, [x18], x17 | |||||
| st1 {v18.4s, v19.4s}, [x19], x17 | |||||
| cmp w10, #6 | cmp w10, #6 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v20.4s, v21.4s}, [x18], x17 | |||||
| st1 {v20.4s, v21.4s}, [x19], x17 | |||||
| cmp w10, #7 | cmp w10, #7 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v22.4s, v23.4s}, [x18], x17 | |||||
| st1 {v22.4s, v23.4s}, [x19], x17 | |||||
| cmp w10, #8 | cmp w10, #8 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v24.4s, v25.4s}, [x18], x17 | |||||
| st1 {v24.4s, v25.4s}, [x19], x17 | |||||
| cmp w10, #9 | cmp w10, #9 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v26.4s, v27.4s}, [x18], x17 | |||||
| st1 {v26.4s, v27.4s}, [x19], x17 | |||||
| cmp w10, #10 | cmp w10, #10 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v28.4s, v29.4s}, [x18], x17 | |||||
| st1 {v28.4s, v29.4s}, [x19], x17 | |||||
| cmp w10, #11 | cmp w10, #11 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v30.4s, v31.4s}, [x18], x17 | |||||
| st1 {v30.4s, v31.4s}, [x19], x17 | |||||
| WriteEnd: | WriteEnd: | ||||
| subs w10, w10, #12 // lhs row - 12 | subs w10, w10, #12 // lhs row - 12 | ||||
| @@ -766,8 +767,9 @@ NoDstStep: | |||||
| bgt L1 | bgt L1 | ||||
| End1: | End1: | ||||
| sub sp, sp, #128 | |||||
| sub sp, sp, #144 | |||||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| ldp x19, x20, [sp], #16 | |||||
| ret | ret | ||||
| #endif | #endif | ||||
| @@ -21,31 +21,32 @@ | |||||
| // x9: writeMode | // x9: writeMode | ||||
| asm_function MatmulFloatNeon64Opt | asm_function MatmulFloatNeon64Opt | ||||
| sub sp, sp, #144 | |||||
| sub sp, sp, #160 | |||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| stp x19, x20, [sp], #16 | stp x19, x20, [sp], #16 | ||||
| stp x21, x22, [sp], #16 | |||||
| ldr x8, [sp] | ldr x8, [sp] | ||||
| ldr x9, [sp, #8] | ldr x9, [sp, #8] | ||||
| mov x18, #48 // sizeof(float) * 12 | |||||
| mul x17, x5, x18 // block stride of lhs/rhs: sizeof(float) * 12 * depth | |||||
| mov x21, #48 // sizeof(float) * 12 | |||||
| mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth | |||||
| cbnz x9, NoC8Steps | cbnz x9, NoC8Steps | ||||
| mov x11, x2 | mov x11, x2 | ||||
| mov x18, #32 | |||||
| mul x16, x6, x18 // row * 8 * sizeof(float) | |||||
| mov x21, #32 | |||||
| mul x16, x6, x21 // row * 8 * sizeof(float) | |||||
| NoC8Steps: | NoC8Steps: | ||||
| cmp x9, #2 | cmp x9, #2 | ||||
| bne NoWinoSteps | bne NoWinoSteps | ||||
| mov x18, #4 | |||||
| mov x21, #4 | |||||
| mul x15, x7, x8 | mul x15, x7, x8 | ||||
| mul x15, x15, x18 // kernel_size * col *sizeof(float) | |||||
| mov x18, #32 | |||||
| mul x16, x8, x18 // kernel_size * 8 * sizeof(float) | |||||
| mul x15, x15, x21 // kernel_size * col *sizeof(float) | |||||
| mov x21, #32 | |||||
| mul x16, x8, x21 // kernel_size * 8 * sizeof(float) | |||||
| NoWinoSteps: | NoWinoSteps: | ||||
| mov x18, #4 | |||||
| mul x8, x8, x18 | |||||
| mov x21, #4 | |||||
| mul x8, x8, x21 | |||||
| LoopRowStart: | LoopRowStart: | ||||
| cmp x6, #4 | cmp x6, #4 | ||||
| @@ -1117,9 +1118,9 @@ LoopRow4: | |||||
| LoopColEnd: | LoopColEnd: | ||||
| add x0, x0, x17 | add x0, x0, x17 | ||||
| cbz x9, C8DstStep | cbz x9, C8DstStep | ||||
| mov x18, #4 | |||||
| mul x18, x18, x7 | |||||
| sub x11, x11, x18 | |||||
| mov x21, #4 | |||||
| mul x21, x21, x7 | |||||
| sub x11, x11, x21 | |||||
| mov x2, x11 | mov x2, x11 | ||||
| b NoDstStep | b NoDstStep | ||||
| C8DstStep: | C8DstStep: | ||||
| @@ -1129,9 +1130,10 @@ LoopColEnd: | |||||
| subs x6, x6, #12 | subs x6, x6, #12 | ||||
| bgt LoopRowStart | bgt LoopRowStart | ||||
| sub sp, sp, #144 | |||||
| sub sp, sp, #160 | |||||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| ldp x19, x20, [sp], #16 | ldp x19, x20, [sp], #16 | ||||
| ldp x21, x22, [sp], #16 | |||||
| ret | ret | ||||
| #endif | #endif | ||||
| @@ -67,7 +67,7 @@ L2: | |||||
| cmp w16, #0 | cmp w16, #0 | ||||
| beq End2 | beq End2 | ||||
| mov x18, x1 // reload b ptr | |||||
| mov x28, x1 // reload b ptr | |||||
| mov x19, x7 // reload bias ptr | mov x19, x7 // reload bias ptr | ||||
| mov w20, w5 // reload depth | mov w20, w5 // reload depth | ||||
| dup v16.4s, wzr | dup v16.4s, wzr | ||||
| @@ -94,10 +94,10 @@ L3: | |||||
| ld1 {v1.16b}, [x17], #16 | ld1 {v1.16b}, [x17], #16 | ||||
| ld1 {v2.16b}, [x17], #16 | ld1 {v2.16b}, [x17], #16 | ||||
| ld1 {v3.16b}, [x17], #16 | ld1 {v3.16b}, [x17], #16 | ||||
| ld1 {v4.16b}, [x18], #16 | |||||
| ld1 {v5.16b}, [x18], #16 | |||||
| ld1 {v6.16b}, [x18], #16 | |||||
| ld1 {v7.16b}, [x18], #16 | |||||
| ld1 {v4.16b}, [x28], #16 | |||||
| ld1 {v5.16b}, [x28], #16 | |||||
| ld1 {v6.16b}, [x28], #16 | |||||
| ld1 {v7.16b}, [x28], #16 | |||||
| smull v8.8h, v4.8b, v0.8b | smull v8.8h, v4.8b, v0.8b | ||||
| smull v9.8h, v5.8b, v0.8b | smull v9.8h, v5.8b, v0.8b | ||||
| @@ -30,7 +30,7 @@ | |||||
| // x28: filter_zp | // x28: filter_zp | ||||
| asm_function MatmulInt8Opt | asm_function MatmulInt8Opt | ||||
| sub sp, sp, #208 | |||||
| sub sp, sp, #224 | |||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| stp x19, x20, [sp], #16 | stp x19, x20, [sp], #16 | ||||
| @@ -38,6 +38,7 @@ asm_function MatmulInt8Opt | |||||
| stp x23, x24, [sp], #16 | stp x23, x24, [sp], #16 | ||||
| stp x25, x26, [sp], #16 | stp x25, x26, [sp], #16 | ||||
| stp x27, x28, [sp], #16 | stp x27, x28, [sp], #16 | ||||
| stp x29, x30, [sp], #16 | |||||
| ldr w8, [sp] | ldr w8, [sp] | ||||
| ldr w9, [sp, #8] | ldr w9, [sp, #8] | ||||
| @@ -55,7 +56,7 @@ asm_function MatmulInt8Opt | |||||
| LoopRow: | LoopRow: | ||||
| mov x16, x1 // reload rhs ptr | mov x16, x1 // reload rhs ptr | ||||
| mov x17, x4 // reload rhs col | mov x17, x4 // reload rhs col | ||||
| mov x18, x7 // reload bias ptr | |||||
| mov x29, x7 // reload bias ptr | |||||
| mov x27, x2 // reload dst ptr | mov x27, x2 // reload dst ptr | ||||
| ldr x28, [sp, #64] // reload filter_zp | ldr x28, [sp, #64] // reload filter_zp | ||||
| @@ -158,7 +159,7 @@ LoopRow: | |||||
| Bias: | Bias: | ||||
| cbz x7, NoBias | cbz x7, NoBias | ||||
| ld1 {v15.4s}, [x18], #16 | |||||
| ld1 {v15.4s}, [x29], #16 | |||||
| add v16.4s, v16.4s, v15.4s | add v16.4s, v16.4s, v15.4s | ||||
| add v17.4s, v17.4s, v15.4s | add v17.4s, v17.4s, v15.4s | ||||
| add v18.4s, v18.4s, v15.4s | add v18.4s, v18.4s, v15.4s | ||||
| @@ -330,7 +331,7 @@ LoopColEnd: | |||||
| b LoopRow | b LoopRow | ||||
| LoopRowEnd: | LoopRowEnd: | ||||
| sub sp, sp, #208 | |||||
| sub sp, sp, #224 | |||||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| ldp x19, x20, [sp], #16 | ldp x19, x20, [sp], #16 | ||||
| @@ -338,5 +339,6 @@ LoopRowEnd: | |||||
| ldp x23, x24, [sp], #16 | ldp x23, x24, [sp], #16 | ||||
| ldp x25, x26, [sp], #16 | ldp x25, x26, [sp], #16 | ||||
| ldp x27, x28, [sp], #16 | ldp x27, x28, [sp], #16 | ||||
| ldp x29, x30, [sp], #16 | |||||
| ret | ret | ||||
| #endif | #endif | ||||
| @@ -20,9 +20,10 @@ | |||||
| // x7: bias | // x7: bias | ||||
| asm_function MatMulR4Int8Neon64 | asm_function MatMulR4Int8Neon64 | ||||
| sub sp, sp, #128 | |||||
| sub sp, sp, #144 | |||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| stp x19, x20, [sp], #16 | |||||
| mov w15, #0 // b col index | mov w15, #0 // b col index | ||||
| mov w16, #0 // a row index | mov w16, #0 // a row index | ||||
| @@ -40,7 +41,7 @@ L2: | |||||
| cmp w16, w3 | cmp w16, w3 | ||||
| beq End2 | beq End2 | ||||
| mov x18, x1 // reload b ptr | |||||
| mov x19, x1 // reload b ptr | |||||
| mov x10, x7 // reload bias ptr | mov x10, x7 // reload bias ptr | ||||
| mov w11, w5 // reload depth | mov w11, w5 // reload depth | ||||
| dup v16.4s, wzr | dup v16.4s, wzr | ||||
| @@ -67,10 +68,10 @@ L3: | |||||
| ld1 {v1.16b}, [x17], #16 | ld1 {v1.16b}, [x17], #16 | ||||
| ld1 {v2.16b}, [x17], #16 | ld1 {v2.16b}, [x17], #16 | ||||
| ld1 {v3.16b}, [x17], #16 | ld1 {v3.16b}, [x17], #16 | ||||
| ld1 {v4.16b}, [x18], #16 | |||||
| ld1 {v5.16b}, [x18], #16 | |||||
| ld1 {v6.16b}, [x18], #16 | |||||
| ld1 {v7.16b}, [x18], #16 | |||||
| ld1 {v4.16b}, [x19], #16 | |||||
| ld1 {v5.16b}, [x19], #16 | |||||
| ld1 {v6.16b}, [x19], #16 | |||||
| ld1 {v7.16b}, [x19], #16 | |||||
| smull v8.8h, v4.8b, v0.8b | smull v8.8h, v4.8b, v0.8b | ||||
| smull v9.8h, v5.8b, v0.8b | smull v9.8h, v5.8b, v0.8b | ||||
| @@ -172,8 +173,9 @@ End2: | |||||
| b L1 | b L1 | ||||
| End1: | End1: | ||||
| sub sp, sp, #128 | |||||
| sub sp, sp, #144 | |||||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| ldp x19, x20, [sp], #16 | |||||
| ret | ret | ||||
| #endif | #endif | ||||
| @@ -30,13 +30,13 @@ asm_function MatrixMultiplyWinograd | |||||
| mov x14, x1 // mat_b | mov x14, x1 // mat_b | ||||
| LoopN: | LoopN: | ||||
| mov x16, x0 // mat_a_m | mov x16, x0 // mat_a_m | ||||
| sub x18, x5, x15 // ni | |||||
| sub x22, x5, x15 // ni | |||||
| sub x19, x17, x3 // mi | sub x19, x17, x3 // mi | ||||
| mul x18, x18, x17 // ni * m | |||||
| mul x22, x22, x17 // ni * m | |||||
| mov x11, x6 // in_channel | mov x11, x6 // in_channel | ||||
| add x18, x18, x19 // (ni * m) + mi | |||||
| mul x18, x18, x7 // x18 * c4_channel | |||||
| add x20, x2, x18 // dst + offset | |||||
| add x22, x22, x19 // (ni * m) + mi | |||||
| mul x22, x22, x7 // x22 * c4_channel | |||||
| add x20, x2, x22 // dst + offset | |||||
| cmp x11, #16 | cmp x11, #16 | ||||
| bge LoopC16 | bge LoopC16 | ||||
| cmp x11, #8 | cmp x11, #8 | ||||
| @@ -1,6 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | #include "nnacl/assembly_global.h" | ||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| //.p2align 5,,15 | //.p2align 5,,15 | ||||
| @@ -55,16 +55,16 @@ LoopH: | |||||
| ld1 {v0.s}[2], [x17], x10 | ld1 {v0.s}[2], [x17], x10 | ||||
| ld1 {v0.s}[3], [x17], x10 | ld1 {v0.s}[3], [x17], x10 | ||||
| mov x11, x6 | mov x11, x6 | ||||
| mov x18, x17 | |||||
| add x18, x14, x7 | |||||
| add x16, x18, x7 | |||||
| mov x20, x17 | |||||
| add x20, x14, x7 | |||||
| add x16, x20, x7 | |||||
| add x19, x16, x7 | add x19, x16, x7 | ||||
| LoopLength4: | LoopLength4: | ||||
| ld1 {v16.4s}, [x2] | ld1 {v16.4s}, [x2] | ||||
| ld1 {v20.4s}, [x14], #16 | ld1 {v20.4s}, [x14], #16 | ||||
| fmla v16.4s, v20.4s, v0.s[0] | fmla v16.4s, v20.4s, v0.s[0] | ||||
| ld1 {v21.4s}, [x18], #16 | |||||
| ld1 {v21.4s}, [x20], #16 | |||||
| fmul v17.4s, v21.4s, v0.s[1] | fmul v17.4s, v21.4s, v0.s[1] | ||||
| ld1 {v20.4s}, [x16], #16 | ld1 {v20.4s}, [x16], #16 | ||||
| fmla v16.4s, v20.4s, v0.s[2] | fmla v16.4s, v20.4s, v0.s[2] | ||||
| @@ -90,14 +90,14 @@ LoopH: | |||||
| ld1 {v0.s}[1], [x17], x10 | ld1 {v0.s}[1], [x17], x10 | ||||
| ld1 {v0.s}[2], [x17], x10 | ld1 {v0.s}[2], [x17], x10 | ||||
| mov x11, x6 | mov x11, x6 | ||||
| mov x18, x17 | |||||
| add x18, x14, x7 | |||||
| add x16, x18, x7 | |||||
| mov x20, x17 | |||||
| add x20, x14, x7 | |||||
| add x16, x20, x7 | |||||
| LoopLength3: | LoopLength3: | ||||
| ld1 {v16.4s}, [x2] | ld1 {v16.4s}, [x2] | ||||
| ld1 {v20.4s}, [x14], #16 | ld1 {v20.4s}, [x14], #16 | ||||
| fmla v16.4s, v20.4s, v0.s[0] | fmla v16.4s, v20.4s, v0.s[0] | ||||
| ld1 {v21.4s}, [x18], #16 | |||||
| ld1 {v21.4s}, [x20], #16 | |||||
| fmul v17.4s, v21.4s, v0.s[1] | fmul v17.4s, v21.4s, v0.s[1] | ||||
| ld1 {v20.4s}, [x16], #16 | ld1 {v20.4s}, [x16], #16 | ||||
| fmla v16.4s, v20.4s, v0.s[2] | fmla v16.4s, v20.4s, v0.s[2] | ||||
| @@ -18,6 +18,9 @@ asm_function WinogradTransRight | |||||
| //x5: k | //x5: k | ||||
| //x6: length | //x6: length | ||||
| sub sp, sp, #16 | |||||
| stp x19, x20, [sp], #16 | |||||
| mov x8, #16 // 4 * sizeof(float) | mov x8, #16 // 4 * sizeof(float) | ||||
| mul x8, x6, x8 | mul x8, x6, x8 | ||||
| mul x9, x5, x8 // step for S | mul x9, x5, x8 // step for S | ||||
| @@ -43,7 +46,7 @@ LoopH: | |||||
| cmp x12, #4 | cmp x12, #4 | ||||
| blt LoopKStart3 | blt LoopKStart3 | ||||
| mov x16, x15 | mov x16, x15 | ||||
| mov x18, x4 | |||||
| mov x19, x4 | |||||
| LoopK4: | LoopK4: | ||||
| ld1 {v0.s}[0], [x13], x10 | ld1 {v0.s}[0], [x13], x10 | ||||
| ld1 {v0.s}[1], [x13], x10 | ld1 {v0.s}[1], [x13], x10 | ||||
| @@ -54,7 +57,7 @@ LoopH: | |||||
| add x14, x17, x8 | add x14, x17, x8 | ||||
| add x16, x14, x8 | add x16, x14, x8 | ||||
| add x18, x16, x8 | |||||
| add x19, x16, x8 | |||||
| LoopLength4: | LoopLength4: | ||||
| ld1 {v16.4s}, [x2] | ld1 {v16.4s}, [x2] | ||||
| @@ -64,7 +67,7 @@ LoopH: | |||||
| fmul v17.4s, v21.4s, v0.s[1] | fmul v17.4s, v21.4s, v0.s[1] | ||||
| ld1 {v20.4s}, [x16], #16 | ld1 {v20.4s}, [x16], #16 | ||||
| fmla v16.4s, v20.4s, v0.s[2] | fmla v16.4s, v20.4s, v0.s[2] | ||||
| ld1 {v21.4s}, [x18], #16 | |||||
| ld1 {v21.4s}, [x19], #16 | |||||
| fmla v17.4s, v21.4s, v0.s[3] | fmla v17.4s, v21.4s, v0.s[3] | ||||
| fadd v17.4s, v16.4s, v17.4s | fadd v17.4s, v16.4s, v17.4s | ||||
| @@ -73,7 +76,7 @@ LoopH: | |||||
| bne LoopLength4 | bne LoopLength4 | ||||
| sub x2, x2, x8 | sub x2, x2, x8 | ||||
| sub x12, x12, #4 | sub x12, x12, #4 | ||||
| mov x17, x18 | |||||
| mov x17, x19 | |||||
| cmp x12, #4 | cmp x12, #4 | ||||
| bge LoopK4 | bge LoopK4 | ||||
| @@ -107,7 +110,7 @@ LoopH: | |||||
| bne LoopLength3 | bne LoopLength3 | ||||
| sub x2, x2, x8 | sub x2, x2, x8 | ||||
| sub x12, x12, #3 | sub x12, x12, #3 | ||||
| mov x17, x18 | |||||
| mov x17, x19 | |||||
| cmp x12, #3 | cmp x12, #3 | ||||
| bge LoopK3 | bge LoopK3 | ||||
| @@ -141,5 +144,7 @@ LoopH: | |||||
| subs x4, x4, #1 | subs x4, x4, #1 | ||||
| bne LoopH | bne LoopH | ||||
| sub sp, sp, #16 | |||||
| ldp x19, x20, [sp], #16 | |||||
| ret | ret | ||||
| #endif | #endif | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef ENABLE_AVX | #ifdef ENABLE_AVX | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 4 | .align 4 | ||||
| .global ConvDwFp32Avx3x3 | .global ConvDwFp32Avx3x3 | ||||
| @@ -31,7 +32,7 @@ | |||||
| // 56: input_stride | // 56: input_stride | ||||
| // 64: relu | // 64: relu | ||||
| // 72: relu6 | // 72: relu6 | ||||
| ConvDwFp32Avx3x3: | |||||
| asm_function ConvDwFp32Avx3x3 | |||||
| pushq %r15 | pushq %r15 | ||||
| pushq %r14 | pushq %r14 | ||||
| pushq %r13 | pushq %r13 | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef ENABLE_AVX | #ifdef ENABLE_AVX | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 4 | .align 4 | ||||
| .global MatmulFloatAvxOpt | .global MatmulFloatAvxOpt | ||||
| @@ -34,7 +35,7 @@ | |||||
| // 72: stride | // 72: stride | ||||
| // 80: writeMode | // 80: writeMode | ||||
| MatmulFloatAvxOpt: | |||||
| asm_function MatmulFloatAvxOpt | |||||
| // rbx, rsp, rbp, r12-r15 must be saved according to x86 calling convention | // rbx, rsp, rbp, r12-r15 must be saved according to x86 calling convention | ||||
| pushq %r15 | pushq %r15 | ||||
| pushq %r14 | pushq %r14 | ||||
| @@ -19,12 +19,13 @@ asm_function ConvDwFp16Center | |||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should also be preserved | // x19 ~ x29 should also be preserved | ||||
| // whereas our coding style does not permit such a number of parameters | // whereas our coding style does not permit such a number of parameters | ||||
| sub sp, sp, #176 | |||||
| sub sp, sp, #192 | |||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| stp x19, x20, [sp], #16 | stp x19, x20, [sp], #16 | ||||
| stp x21, x22, [sp], #16 | stp x21, x22, [sp], #16 | ||||
| stp x23, x24, [sp], #16 | stp x23, x24, [sp], #16 | ||||
| stp x25, x26, [sp], #16 | |||||
| ldr x8, [sp] | ldr x8, [sp] | ||||
| ldr x9, [sp, #8] | ldr x9, [sp, #8] | ||||
| @@ -71,7 +72,7 @@ asm_function ConvDwFp16Center | |||||
| mov v14.16b, v24.16b | mov v14.16b, v24.16b | ||||
| mov v15.16b, v24.16b | mov v15.16b, v24.16b | ||||
| LoopKh16: | LoopKh16: | ||||
| mov x18, x7 | |||||
| mov x25, x7 | |||||
| mov x21, x16 | mov x21, x16 | ||||
| LoopKw16: | LoopKw16: | ||||
| mov x22, x21 | mov x22, x21 | ||||
| @@ -108,7 +109,7 @@ asm_function ConvDwFp16Center | |||||
| ld1 {v23.8h}, [x22], x11 | ld1 {v23.8h}, [x22], x11 | ||||
| fmla v14.8h, v22.8h, v25.8h | fmla v14.8h, v22.8h, v25.8h | ||||
| fmla v15.8h, v23.8h, v25.8h | fmla v15.8h, v23.8h, v25.8h | ||||
| subs x18, x18, #1 | |||||
| subs x25, x25, #1 | |||||
| add x21, x21, x13 | add x21, x21, x13 | ||||
| bne LoopKw16 | bne LoopKw16 | ||||
| add x16, x16, x12 | add x16, x16, x12 | ||||
| @@ -191,7 +192,7 @@ asm_function ConvDwFp16Center | |||||
| mov v6.16b, v24.16b | mov v6.16b, v24.16b | ||||
| mov v7.16b, v24.16b | mov v7.16b, v24.16b | ||||
| LoopKh8: | LoopKh8: | ||||
| mov x18, x7 | |||||
| mov x25, x7 | |||||
| mov x21, x16 | mov x21, x16 | ||||
| LoopKw8: | LoopKw8: | ||||
| mov x22, x21 | mov x22, x21 | ||||
| @@ -212,7 +213,7 @@ asm_function ConvDwFp16Center | |||||
| ld1 {v23.8h}, [x22], x11 | ld1 {v23.8h}, [x22], x11 | ||||
| fmla v6.8h, v22.8h, v25.8h | fmla v6.8h, v22.8h, v25.8h | ||||
| fmla v7.8h, v23.8h, v25.8h | fmla v7.8h, v23.8h, v25.8h | ||||
| subs x18, x18, #1 | |||||
| subs x25, x25, #1 | |||||
| add x21, x21, x13 | add x21, x21, x13 | ||||
| bne LoopKw8 | bne LoopKw8 | ||||
| add x16, x16, x12 | add x16, x16, x12 | ||||
| @@ -260,13 +261,13 @@ asm_function ConvDwFp16Center | |||||
| mov x20, x6 | mov x20, x6 | ||||
| mov v0.16b, v24.16b | mov v0.16b, v24.16b | ||||
| LoopKh: | LoopKh: | ||||
| mov x18, x7 | |||||
| mov x25, x7 | |||||
| mov x22, x16 | mov x22, x16 | ||||
| LoopKw: | LoopKw: | ||||
| ld1 {v16.8h}, [x22], x13 | ld1 {v16.8h}, [x22], x13 | ||||
| ld1 {v25.8h}, [x17], #16 | ld1 {v25.8h}, [x17], #16 | ||||
| fmla v0.8h, v16.8h, v25.8h | fmla v0.8h, v16.8h, v25.8h | ||||
| subs x18, x18, #1 | |||||
| subs x25, x25, #1 | |||||
| bne LoopKw | bne LoopKw | ||||
| add x16, x16, x12 | add x16, x16, x12 | ||||
| subs x20, x20, #1 | subs x20, x20, #1 | ||||
| @@ -289,11 +290,12 @@ asm_function ConvDwFp16Center | |||||
| subs x4, x4, #1 | subs x4, x4, #1 | ||||
| bne LoopH | bne LoopH | ||||
| sub sp, sp, #176 | |||||
| sub sp, sp, #192 | |||||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| ldp x19, x20, [sp], #16 | ldp x19, x20, [sp], #16 | ||||
| ldp x21, x22, [sp], #16 | ldp x21, x22, [sp], #16 | ||||
| ldp x23, x24, [sp], #16 | ldp x23, x24, [sp], #16 | ||||
| ldp x25, x26, [sp], #16 | |||||
| ret | ret | ||||
| #endif | #endif | ||||
| @@ -33,12 +33,12 @@ asm_function DeconvDwFp16Center | |||||
| mov x16, x1 | mov x16, x1 | ||||
| mov x17, x4 | mov x17, x4 | ||||
| LoopW: | LoopW: | ||||
| mov x18, x15 | |||||
| mov x22, x15 | |||||
| mov x19, x2 | mov x19, x2 | ||||
| mov x20, x5 | mov x20, x5 | ||||
| ld1 {v1.8h}, [x16], x8 | ld1 {v1.8h}, [x16], x8 | ||||
| LoopKh: | LoopKh: | ||||
| mov x21, x18 | |||||
| mov x21, x22 | |||||
| mov x13, x6 | mov x13, x6 | ||||
| LoopKw: | LoopKw: | ||||
| ld1 {v0.8h}, [x21] | ld1 {v0.8h}, [x21] | ||||
| @@ -47,7 +47,7 @@ asm_function DeconvDwFp16Center | |||||
| st1 {v0.8h}, [x21], x12 | st1 {v0.8h}, [x21], x12 | ||||
| subs x13, x13, #1 | subs x13, x13, #1 | ||||
| bne LoopKw | bne LoopKw | ||||
| add x18, x18, x11 | |||||
| add x22, x22, x11 | |||||
| subs x20, x20, #1 | subs x20, x20, #1 | ||||
| bne LoopKh | bne LoopKh | ||||
| add x15, x15, x10 | add x15, x15, x10 | ||||
| @@ -41,11 +41,12 @@ asm_function IndirectGemmFp16_16x8 | |||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should also be preserved | // x19 ~ x29 should also be preserved | ||||
| // whereas our coding style does not permit such a number of parameters | // whereas our coding style does not permit such a number of parameters | ||||
| sub sp, sp, #128 | |||||
| sub sp, sp, #144 | |||||
| // the performance difference between storing 4 registers at once and storing them separately on in-order cores | // the performance difference between storing 4 registers at once and storing them separately on in-order cores | ||||
| // has not been tested yet | // has not been tested yet | ||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| stp x19, x20, [sp], #16 | |||||
| ldr x8, [sp, #0] | ldr x8, [sp, #0] | ||||
| ldr x9, [sp, #8] | ldr x9, [sp, #8] | ||||
| @@ -548,87 +549,87 @@ IndirectGemmStart: | |||||
| b WriteEnd | b WriteEnd | ||||
| Write7: | Write7: | ||||
| add x17, x15, #8 | add x17, x15, #8 | ||||
| add x18, x15, #10 | |||||
| add x19, x15, #10 | |||||
| add x16, x15, #12 | add x16, x15, #12 | ||||
| st1 {v16.4h}, [x15], x7 | st1 {v16.4h}, [x15], x7 | ||||
| ins v0.s[0], v16.s[2] | ins v0.s[0], v16.s[2] | ||||
| st1 {v0.h}[0], [x17], x7 | st1 {v0.h}[0], [x17], x7 | ||||
| st1 {v0.h}[1], [x18], x7 | |||||
| st1 {v0.h}[1], [x19], x7 | |||||
| st1 {v16.h}[6], [x16], x7 | st1 {v16.h}[6], [x16], x7 | ||||
| st1 {v17.4h}, [x15], x7 | st1 {v17.4h}, [x15], x7 | ||||
| ins v1.s[0], v17.s[2] | ins v1.s[0], v17.s[2] | ||||
| st1 {v1.h}[0], [x17], x7 | st1 {v1.h}[0], [x17], x7 | ||||
| st1 {v1.h}[1], [x18], x7 | |||||
| st1 {v1.h}[1], [x19], x7 | |||||
| st1 {v17.h}[6], [x16], x7 | st1 {v17.h}[6], [x16], x7 | ||||
| st1 {v18.4h}, [x15], x7 | st1 {v18.4h}, [x15], x7 | ||||
| ins v2.s[0], v18.s[2] | ins v2.s[0], v18.s[2] | ||||
| st1 {v2.h}[0], [x17], x7 | st1 {v2.h}[0], [x17], x7 | ||||
| st1 {v2.h}[1], [x18], x7 | |||||
| st1 {v2.h}[1], [x19], x7 | |||||
| st1 {v18.h}[6], [x16], x7 | st1 {v18.h}[6], [x16], x7 | ||||
| st1 {v19.4h}, [x15], x7 | st1 {v19.4h}, [x15], x7 | ||||
| ins v3.s[0], v19.s[2] | ins v3.s[0], v19.s[2] | ||||
| st1 {v3.h}[0], [x17], x7 | st1 {v3.h}[0], [x17], x7 | ||||
| st1 {v3.h}[1], [x18], x7 | |||||
| st1 {v3.h}[1], [x19], x7 | |||||
| st1 {v19.h}[6], [x16], x7 | st1 {v19.h}[6], [x16], x7 | ||||
| st1 {v20.4h}, [x15], x7 | st1 {v20.4h}, [x15], x7 | ||||
| ins v4.s[0], v20.s[2] | ins v4.s[0], v20.s[2] | ||||
| st1 {v4.h}[0], [x17], x7 | st1 {v4.h}[0], [x17], x7 | ||||
| st1 {v4.h}[1], [x18], x7 | |||||
| st1 {v4.h}[1], [x19], x7 | |||||
| st1 {v20.h}[6], [x16], x7 | st1 {v20.h}[6], [x16], x7 | ||||
| st1 {v21.4h}, [x15], x7 | st1 {v21.4h}, [x15], x7 | ||||
| ins v5.s[0], v21.s[2] | ins v5.s[0], v21.s[2] | ||||
| st1 {v5.h}[0], [x17], x7 | st1 {v5.h}[0], [x17], x7 | ||||
| st1 {v5.h}[1], [x18], x7 | |||||
| st1 {v5.h}[1], [x19], x7 | |||||
| st1 {v21.h}[6], [x16], x7 | st1 {v21.h}[6], [x16], x7 | ||||
| st1 {v22.4h}, [x15], x7 | st1 {v22.4h}, [x15], x7 | ||||
| ins v6.s[0], v22.s[2] | ins v6.s[0], v22.s[2] | ||||
| st1 {v6.h}[0], [x17], x7 | st1 {v6.h}[0], [x17], x7 | ||||
| st1 {v6.h}[1], [x18], x7 | |||||
| st1 {v6.h}[1], [x19], x7 | |||||
| st1 {v22.h}[6], [x16], x7 | st1 {v22.h}[6], [x16], x7 | ||||
| st1 {v23.4h}, [x15], x7 | st1 {v23.4h}, [x15], x7 | ||||
| ins v7.s[0], v23.s[2] | ins v7.s[0], v23.s[2] | ||||
| st1 {v7.h}[0], [x17], x7 | st1 {v7.h}[0], [x17], x7 | ||||
| st1 {v7.h}[1], [x18], x7 | |||||
| st1 {v7.h}[1], [x19], x7 | |||||
| st1 {v23.h}[6], [x16], x7 | st1 {v23.h}[6], [x16], x7 | ||||
| st1 {v24.4h}, [x15], x7 | st1 {v24.4h}, [x15], x7 | ||||
| ins v8.s[0], v24.s[2] | ins v8.s[0], v24.s[2] | ||||
| st1 {v8.h}[0], [x17], x7 | st1 {v8.h}[0], [x17], x7 | ||||
| st1 {v8.h}[1], [x18], x7 | |||||
| st1 {v8.h}[1], [x19], x7 | |||||
| st1 {v24.h}[6], [x16], x7 | st1 {v24.h}[6], [x16], x7 | ||||
| st1 {v25.4h}, [x15], x7 | st1 {v25.4h}, [x15], x7 | ||||
| ins v9.s[0], v25.s[2] | ins v9.s[0], v25.s[2] | ||||
| st1 {v9.h}[0], [x17], x7 | st1 {v9.h}[0], [x17], x7 | ||||
| st1 {v9.h}[1], [x18], x7 | |||||
| st1 {v9.h}[1], [x19], x7 | |||||
| st1 {v25.h}[6], [x16], x7 | st1 {v25.h}[6], [x16], x7 | ||||
| st1 {v26.4h}, [x15], x7 | st1 {v26.4h}, [x15], x7 | ||||
| ins v10.s[0], v26.s[2] | ins v10.s[0], v26.s[2] | ||||
| st1 {v10.h}[0], [x17], x7 | st1 {v10.h}[0], [x17], x7 | ||||
| st1 {v10.h}[1], [x18], x7 | |||||
| st1 {v10.h}[1], [x19], x7 | |||||
| st1 {v26.h}[6], [x16], x7 | st1 {v26.h}[6], [x16], x7 | ||||
| st1 {v27.4h}, [x15], x7 | st1 {v27.4h}, [x15], x7 | ||||
| ins v11.s[0], v27.s[2] | ins v11.s[0], v27.s[2] | ||||
| st1 {v11.h}[0], [x17], x7 | st1 {v11.h}[0], [x17], x7 | ||||
| st1 {v11.h}[1], [x18], x7 | |||||
| st1 {v11.h}[1], [x19], x7 | |||||
| st1 {v27.h}[6], [x16], x7 | st1 {v27.h}[6], [x16], x7 | ||||
| st1 {v28.4h}, [x15], x7 | st1 {v28.4h}, [x15], x7 | ||||
| ins v12.s[0], v28.s[2] | ins v12.s[0], v28.s[2] | ||||
| st1 {v12.h}[0], [x17], x7 | st1 {v12.h}[0], [x17], x7 | ||||
| st1 {v12.h}[1], [x18], x7 | |||||
| st1 {v12.h}[1], [x19], x7 | |||||
| st1 {v28.h}[6], [x16], x7 | st1 {v28.h}[6], [x16], x7 | ||||
| st1 {v29.4h}, [x15], x7 | st1 {v29.4h}, [x15], x7 | ||||
| ins v13.s[0], v29.s[2] | ins v13.s[0], v29.s[2] | ||||
| st1 {v13.h}[0], [x17], x7 | st1 {v13.h}[0], [x17], x7 | ||||
| st1 {v13.h}[1], [x18], x7 | |||||
| st1 {v13.h}[1], [x19], x7 | |||||
| st1 {v29.h}[6], [x16], x7 | st1 {v29.h}[6], [x16], x7 | ||||
| st1 {v30.4h}, [x15], x7 | st1 {v30.4h}, [x15], x7 | ||||
| ins v14.s[0], v30.s[2] | ins v14.s[0], v30.s[2] | ||||
| st1 {v14.h}[0], [x17], x7 | st1 {v14.h}[0], [x17], x7 | ||||
| st1 {v14.h}[1], [x18], x7 | |||||
| st1 {v14.h}[1], [x19], x7 | |||||
| st1 {v30.h}[6], [x16], x7 | st1 {v30.h}[6], [x16], x7 | ||||
| st1 {v31.4h}, [x15] | st1 {v31.4h}, [x15] | ||||
| ins v15.s[0], v31.s[2] | ins v15.s[0], v31.s[2] | ||||
| st1 {v15.h}[0], [x17] | st1 {v15.h}[0], [x17] | ||||
| st1 {v15.h}[1], [x18] | |||||
| st1 {v15.h}[1], [x19] | |||||
| st1 {v31.h}[6], [x16] | st1 {v31.h}[6], [x16] | ||||
| add x0, x0, #14 | add x0, x0, #14 | ||||
| b WriteEnd | b WriteEnd | ||||
| @@ -661,9 +662,10 @@ IndirectGemmStart: | |||||
| NoStepForward: | NoStepForward: | ||||
| bgt LoopOc | bgt LoopOc | ||||
| sub sp, sp, #128 | |||||
| sub sp, sp, #144 | |||||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| ldp x19, x20, [sp], #16 | |||||
| ret | ret | ||||
| #endif | #endif | ||||
| @@ -21,21 +21,22 @@ | |||||
| // w13: writeC8 | // w13: writeC8 | ||||
| asm_function MatmulFp16Neon64 | asm_function MatmulFp16Neon64 | ||||
| sub sp, sp, #128 | |||||
| sub sp, sp, #144 | |||||
| st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 | st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 | ||||
| st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 | st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 | ||||
| stp x19, x20, [sp], #16 | |||||
| mov w18, #16 // sizeof(float16) * 8 | mov w18, #16 // sizeof(float16) * 8 | ||||
| mul w15, w5, w18 // block stride of lhs/rhs: sizeof(float16) * 8 * depth | mul w15, w5, w18 // block stride of lhs/rhs: sizeof(float16) * 8 * depth | ||||
| mov x11, x3 // bias flag | mov x11, x3 // bias flag | ||||
| mov x18, #2 | |||||
| mov x19, #2 | |||||
| ldr x17, [sp] | ldr x17, [sp] | ||||
| mul x17, x17, x18 | |||||
| mul x17, x17, x19 | |||||
| L1: | L1: | ||||
| mov w10, w6 // reload lhs row | mov w10, w6 // reload lhs row | ||||
| mov x12, x0 // reload lhs ptr | mov x12, x0 // reload lhs ptr | ||||
| mov x18, x2 // reload dst ptr | |||||
| mov x19, x2 // reload dst ptr | |||||
| L2: | L2: | ||||
| mov x16, x1 // reload rhs ptr | mov x16, x1 // reload rhs ptr | ||||
| @@ -314,490 +315,490 @@ Write: | |||||
| b Write8 | b Write8 | ||||
| Write1: | Write1: | ||||
| st1 {v16.h}[0], [x18], x17 | |||||
| st1 {v16.h}[0], [x19], x17 | |||||
| cmp w10, #1 | cmp w10, #1 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v17.h}[0], [x18], x17 | |||||
| st1 {v17.h}[0], [x19], x17 | |||||
| cmp w10, #2 | cmp w10, #2 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v18.h}[0], [x18], x17 | |||||
| st1 {v18.h}[0], [x19], x17 | |||||
| cmp w10, #3 | cmp w10, #3 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v19.h}[0], [x18], x17 | |||||
| st1 {v19.h}[0], [x19], x17 | |||||
| cmp w10, #4 | cmp w10, #4 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v20.h}[0], [x18], x17 | |||||
| st1 {v20.h}[0], [x19], x17 | |||||
| cmp w10, #5 | cmp w10, #5 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v21.h}[0], [x18], x17 | |||||
| st1 {v21.h}[0], [x19], x17 | |||||
| cmp w10, #6 | cmp w10, #6 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v22.h}[0], [x18], x17 | |||||
| st1 {v22.h}[0], [x19], x17 | |||||
| cmp w10, #7 | cmp w10, #7 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v23.h}[0], [x18], x17 | |||||
| st1 {v23.h}[0], [x19], x17 | |||||
| cmp w10, #8 | cmp w10, #8 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v24.h}[0], [x18], x17 | |||||
| st1 {v24.h}[0], [x19], x17 | |||||
| cmp w10, #9 | cmp w10, #9 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v25.h}[0], [x18], x17 | |||||
| st1 {v25.h}[0], [x19], x17 | |||||
| cmp w10, #10 | cmp w10, #10 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v26.h}[0], [x18], x17 | |||||
| st1 {v26.h}[0], [x19], x17 | |||||
| cmp w10, #11 | cmp w10, #11 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v27.h}[0], [x18], x17 | |||||
| st1 {v27.h}[0], [x19], x17 | |||||
| cmp w10, #12 | cmp w10, #12 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v28.h}[0], [x18], x17 | |||||
| st1 {v28.h}[0], [x19], x17 | |||||
| cmp w10, #13 | cmp w10, #13 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v29.h}[0], [x18], x17 | |||||
| st1 {v29.h}[0], [x19], x17 | |||||
| cmp w10, #14 | cmp w10, #14 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v30.h}[0], [x18], x17 | |||||
| st1 {v30.h}[0], [x19], x17 | |||||
| cmp w10, #15 | cmp w10, #15 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v31.h}[0], [x18], x17 | |||||
| st1 {v31.h}[0], [x19], x17 | |||||
| b WriteEnd | b WriteEnd | ||||
| Write2: | Write2: | ||||
| add x13, x18, #2 | |||||
| st1 {v16.h}[0], [x18], x17 | |||||
| add x13, x19, #2 | |||||
| st1 {v16.h}[0], [x19], x17 | |||||
| st1 {v16.h}[1], [x13], x17 | st1 {v16.h}[1], [x13], x17 | ||||
| cmp w10, #1 | cmp w10, #1 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v17.h}[0], [x18], x17 | |||||
| st1 {v17.h}[0], [x19], x17 | |||||
| st1 {v17.h}[1], [x13], x17 | st1 {v17.h}[1], [x13], x17 | ||||
| cmp w10, #2 | cmp w10, #2 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v18.h}[0], [x18], x17 | |||||
| st1 {v18.h}[0], [x19], x17 | |||||
| st1 {v18.h}[1], [x13], x17 | st1 {v18.h}[1], [x13], x17 | ||||
| cmp w10, #3 | cmp w10, #3 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v19.h}[0], [x18], x17 | |||||
| st1 {v19.h}[0], [x19], x17 | |||||
| st1 {v19.h}[1], [x13], x17 | st1 {v19.h}[1], [x13], x17 | ||||
| cmp w10, #4 | cmp w10, #4 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v20.h}[0], [x18], x17 | |||||
| st1 {v20.h}[0], [x19], x17 | |||||
| st1 {v20.h}[1], [x13], x17 | st1 {v20.h}[1], [x13], x17 | ||||
| cmp w10, #5 | cmp w10, #5 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v21.h}[0], [x18], x17 | |||||
| st1 {v21.h}[0], [x19], x17 | |||||
| st1 {v21.h}[1], [x13], x17 | st1 {v21.h}[1], [x13], x17 | ||||
| cmp w10, #6 | cmp w10, #6 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v22.h}[0], [x18], x17 | |||||
| st1 {v22.h}[0], [x19], x17 | |||||
| st1 {v22.h}[1], [x13], x17 | st1 {v22.h}[1], [x13], x17 | ||||
| cmp w10, #7 | cmp w10, #7 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v23.h}[0], [x18], x17 | |||||
| st1 {v23.h}[0], [x19], x17 | |||||
| st1 {v23.h}[1], [x13], x17 | st1 {v23.h}[1], [x13], x17 | ||||
| cmp w10, #8 | cmp w10, #8 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v24.h}[0], [x18], x17 | |||||
| st1 {v24.h}[0], [x19], x17 | |||||
| st1 {v24.h}[1], [x13], x17 | st1 {v24.h}[1], [x13], x17 | ||||
| cmp w10, #9 | cmp w10, #9 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v25.h}[0], [x18], x17 | |||||
| st1 {v25.h}[0], [x19], x17 | |||||
| st1 {v25.h}[1], [x13], x17 | st1 {v25.h}[1], [x13], x17 | ||||
| cmp w10, #10 | cmp w10, #10 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v26.h}[0], [x18], x17 | |||||
| st1 {v26.h}[0], [x19], x17 | |||||
| st1 {v26.h}[1], [x13], x17 | st1 {v26.h}[1], [x13], x17 | ||||
| cmp w10, #11 | cmp w10, #11 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v27.h}[0], [x18], x17 | |||||
| st1 {v27.h}[0], [x19], x17 | |||||
| st1 {v27.h}[1], [x13], x17 | st1 {v27.h}[1], [x13], x17 | ||||
| cmp w10, #12 | cmp w10, #12 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v28.h}[0], [x18], x17 | |||||
| st1 {v28.h}[0], [x19], x17 | |||||
| st1 {v28.h}[1], [x13], x17 | st1 {v28.h}[1], [x13], x17 | ||||
| cmp w10, #13 | cmp w10, #13 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v29.h}[0], [x18], x17 | |||||
| st1 {v29.h}[0], [x19], x17 | |||||
| st1 {v29.h}[1], [x13], x17 | st1 {v29.h}[1], [x13], x17 | ||||
| cmp w10, #14 | cmp w10, #14 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v30.h}[0], [x18], x17 | |||||
| st1 {v30.h}[0], [x19], x17 | |||||
| st1 {v30.h}[1], [x13], x17 | st1 {v30.h}[1], [x13], x17 | ||||
| cmp w10, #15 | cmp w10, #15 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v31.h}[0], [x18], x17 | |||||
| st1 {v31.h}[0], [x19], x17 | |||||
| st1 {v31.h}[1], [x13], x17 | st1 {v31.h}[1], [x13], x17 | ||||
| b WriteEnd | b WriteEnd | ||||
| Write3: | Write3: | ||||
| add x13, x18, #2 | |||||
| add x14, x18, #4 | |||||
| st1 {v16.h}[0], [x18], x17 | |||||
| add x13, x19, #2 | |||||
| add x14, x19, #4 | |||||
| st1 {v16.h}[0], [x19], x17 | |||||
| st1 {v16.h}[1], [x13], x17 | st1 {v16.h}[1], [x13], x17 | ||||
| st1 {v16.h}[2], [x14], x17 | st1 {v16.h}[2], [x14], x17 | ||||
| cmp w10, #1 | cmp w10, #1 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v17.h}[0], [x18], x17 | |||||
| st1 {v17.h}[0], [x19], x17 | |||||
| st1 {v17.h}[1], [x13], x17 | st1 {v17.h}[1], [x13], x17 | ||||
| st1 {v17.h}[2], [x14], x17 | st1 {v17.h}[2], [x14], x17 | ||||
| cmp w10, #2 | cmp w10, #2 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v18.h}[0], [x18], x17 | |||||
| st1 {v18.h}[0], [x19], x17 | |||||
| st1 {v18.h}[1], [x13], x17 | st1 {v18.h}[1], [x13], x17 | ||||
| st1 {v18.h}[2], [x14], x17 | st1 {v18.h}[2], [x14], x17 | ||||
| cmp w10, #3 | cmp w10, #3 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v19.h}[0], [x18], x17 | |||||
| st1 {v19.h}[0], [x19], x17 | |||||
| st1 {v19.h}[1], [x13], x17 | st1 {v19.h}[1], [x13], x17 | ||||
| st1 {v19.h}[2], [x14], x17 | st1 {v19.h}[2], [x14], x17 | ||||
| cmp w10, #4 | cmp w10, #4 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v20.h}[0], [x18], x17 | |||||
| st1 {v20.h}[0], [x19], x17 | |||||
| st1 {v20.h}[1], [x13], x17 | st1 {v20.h}[1], [x13], x17 | ||||
| st1 {v20.h}[2], [x14], x17 | st1 {v20.h}[2], [x14], x17 | ||||
| cmp w10, #5 | cmp w10, #5 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v21.h}[0], [x18], x17 | |||||
| st1 {v21.h}[0], [x19], x17 | |||||
| st1 {v21.h}[1], [x13], x17 | st1 {v21.h}[1], [x13], x17 | ||||
| st1 {v21.h}[2], [x14], x17 | st1 {v21.h}[2], [x14], x17 | ||||
| cmp w10, #6 | cmp w10, #6 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v22.h}[0], [x18], x17 | |||||
| st1 {v22.h}[0], [x19], x17 | |||||
| st1 {v22.h}[1], [x13], x17 | st1 {v22.h}[1], [x13], x17 | ||||
| st1 {v22.h}[2], [x14], x17 | st1 {v22.h}[2], [x14], x17 | ||||
| cmp w10, #7 | cmp w10, #7 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v23.h}[0], [x18], x17 | |||||
| st1 {v23.h}[0], [x19], x17 | |||||
| st1 {v23.h}[1], [x13], x17 | st1 {v23.h}[1], [x13], x17 | ||||
| st1 {v23.h}[2], [x14], x17 | st1 {v23.h}[2], [x14], x17 | ||||
| cmp w10, #8 | cmp w10, #8 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v24.h}[0], [x18], x17 | |||||
| st1 {v24.h}[0], [x19], x17 | |||||
| st1 {v24.h}[1], [x13], x17 | st1 {v24.h}[1], [x13], x17 | ||||
| st1 {v24.h}[2], [x14], x17 | st1 {v24.h}[2], [x14], x17 | ||||
| cmp w10, #9 | cmp w10, #9 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v25.h}[0], [x18], x17 | |||||
| st1 {v25.h}[0], [x19], x17 | |||||
| st1 {v25.h}[1], [x13], x17 | st1 {v25.h}[1], [x13], x17 | ||||
| st1 {v25.h}[2], [x14], x17 | st1 {v25.h}[2], [x14], x17 | ||||
| cmp w10, #10 | cmp w10, #10 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v26.h}[0], [x18], x17 | |||||
| st1 {v26.h}[0], [x19], x17 | |||||
| st1 {v26.h}[1], [x13], x17 | st1 {v26.h}[1], [x13], x17 | ||||
| st1 {v26.h}[2], [x14], x17 | st1 {v26.h}[2], [x14], x17 | ||||
| cmp w10, #11 | cmp w10, #11 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v27.h}[0], [x18], x17 | |||||
| st1 {v27.h}[0], [x19], x17 | |||||
| st1 {v27.h}[1], [x13], x17 | st1 {v27.h}[1], [x13], x17 | ||||
| st1 {v27.h}[2], [x14], x17 | st1 {v27.h}[2], [x14], x17 | ||||
| cmp w10, #12 | cmp w10, #12 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v28.h}[0], [x18], x17 | |||||
| st1 {v28.h}[0], [x19], x17 | |||||
| st1 {v28.h}[1], [x13], x17 | st1 {v28.h}[1], [x13], x17 | ||||
| st1 {v28.h}[2], [x14], x17 | st1 {v28.h}[2], [x14], x17 | ||||
| cmp w10, #13 | cmp w10, #13 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v29.h}[0], [x18], x17 | |||||
| st1 {v29.h}[0], [x19], x17 | |||||
| st1 {v29.h}[1], [x13], x17 | st1 {v29.h}[1], [x13], x17 | ||||
| st1 {v29.h}[2], [x14], x17 | st1 {v29.h}[2], [x14], x17 | ||||
| cmp w10, #14 | cmp w10, #14 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v30.h}[0], [x18], x17 | |||||
| st1 {v30.h}[0], [x19], x17 | |||||
| st1 {v30.h}[1], [x13], x17 | st1 {v30.h}[1], [x13], x17 | ||||
| st1 {v30.h}[2], [x14], x17 | st1 {v30.h}[2], [x14], x17 | ||||
| cmp w10, #15 | cmp w10, #15 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v31.h}[0], [x18], x17 | |||||
| st1 {v31.h}[0], [x19], x17 | |||||
| st1 {v31.h}[1], [x13], x17 | st1 {v31.h}[1], [x13], x17 | ||||
| st1 {v31.h}[2], [x14], x17 | st1 {v31.h}[2], [x14], x17 | ||||
| b WriteEnd | b WriteEnd | ||||
| Write4: | Write4: | ||||
| st1 {v16.4h}, [x18], x17 | |||||
| st1 {v16.4h}, [x19], x17 | |||||
| cmp w10, #1 | cmp w10, #1 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v17.4h}, [x18], x17 | |||||
| st1 {v17.4h}, [x19], x17 | |||||
| cmp w10, #2 | cmp w10, #2 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v18.4h}, [x18], x17 | |||||
| st1 {v18.4h}, [x19], x17 | |||||
| cmp w10, #3 | cmp w10, #3 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v19.4h}, [x18], x17 | |||||
| st1 {v19.4h}, [x19], x17 | |||||
| cmp w10, #4 | cmp w10, #4 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v20.4h}, [x18], x17 | |||||
| st1 {v20.4h}, [x19], x17 | |||||
| cmp w10, #5 | cmp w10, #5 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v21.4h}, [x18], x17 | |||||
| st1 {v21.4h}, [x19], x17 | |||||
| cmp w10, #6 | cmp w10, #6 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v22.4h}, [x18], x17 | |||||
| st1 {v22.4h}, [x19], x17 | |||||
| cmp w10, #7 | cmp w10, #7 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v23.4h}, [x18], x17 | |||||
| st1 {v23.4h}, [x19], x17 | |||||
| cmp w10, #8 | cmp w10, #8 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v24.4h}, [x18], x17 | |||||
| st1 {v24.4h}, [x19], x17 | |||||
| cmp w10, #9 | cmp w10, #9 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v25.4h}, [x18], x17 | |||||
| st1 {v25.4h}, [x19], x17 | |||||
| cmp w10, #10 | cmp w10, #10 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v26.4h}, [x18], x17 | |||||
| st1 {v26.4h}, [x19], x17 | |||||
| cmp w10, #11 | cmp w10, #11 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v27.4h}, [x18], x17 | |||||
| st1 {v27.4h}, [x19], x17 | |||||
| cmp w10, #12 | cmp w10, #12 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v28.4h}, [x18], x17 | |||||
| st1 {v28.4h}, [x19], x17 | |||||
| cmp w10, #13 | cmp w10, #13 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v29.4h}, [x18], x17 | |||||
| st1 {v29.4h}, [x19], x17 | |||||
| cmp w10, #14 | cmp w10, #14 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v30.4h}, [x18], x17 | |||||
| st1 {v30.4h}, [x19], x17 | |||||
| cmp w10, #15 | cmp w10, #15 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v31.4h}, [x18], x17 | |||||
| st1 {v31.4h}, [x19], x17 | |||||
| b WriteEnd | b WriteEnd | ||||
| Write5: | Write5: | ||||
| add x13, x18, #8 | |||||
| st1 {v16.4h}, [x18], x17 | |||||
| add x13, x19, #8 | |||||
| st1 {v16.4h}, [x19], x17 | |||||
| st1 {v16.h}[4], [x13], x17 | st1 {v16.h}[4], [x13], x17 | ||||
| cmp w10, #1 | cmp w10, #1 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v17.4h}, [x18], x17 | |||||
| st1 {v17.4h}, [x19], x17 | |||||
| st1 {v17.h}[4], [x13], x17 | st1 {v17.h}[4], [x13], x17 | ||||
| cmp w10, #2 | cmp w10, #2 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v18.4h}, [x18], x17 | |||||
| st1 {v18.4h}, [x19], x17 | |||||
| st1 {v18.h}[4], [x13], x17 | st1 {v18.h}[4], [x13], x17 | ||||
| cmp w10, #3 | cmp w10, #3 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v19.4h}, [x18], x17 | |||||
| st1 {v19.4h}, [x19], x17 | |||||
| st1 {v19.h}[4], [x13], x17 | st1 {v19.h}[4], [x13], x17 | ||||
| cmp w10, #4 | cmp w10, #4 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v20.4h}, [x18], x17 | |||||
| st1 {v20.4h}, [x19], x17 | |||||
| st1 {v20.h}[4], [x13], x17 | st1 {v20.h}[4], [x13], x17 | ||||
| cmp w10, #5 | cmp w10, #5 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v21.4h}, [x18], x17 | |||||
| st1 {v21.4h}, [x19], x17 | |||||
| st1 {v21.h}[4], [x13], x17 | st1 {v21.h}[4], [x13], x17 | ||||
| cmp w10, #6 | cmp w10, #6 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v22.4h}, [x18], x17 | |||||
| st1 {v22.4h}, [x19], x17 | |||||
| st1 {v22.h}[4], [x13], x17 | st1 {v22.h}[4], [x13], x17 | ||||
| cmp w10, #7 | cmp w10, #7 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v23.4h}, [x18], x17 | |||||
| st1 {v23.4h}, [x19], x17 | |||||
| st1 {v23.h}[4], [x13], x17 | st1 {v23.h}[4], [x13], x17 | ||||
| cmp w10, #8 | cmp w10, #8 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v24.4h}, [x18], x17 | |||||
| st1 {v24.4h}, [x19], x17 | |||||
| st1 {v24.h}[4], [x13], x17 | st1 {v24.h}[4], [x13], x17 | ||||
| cmp w10, #9 | cmp w10, #9 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v25.4h}, [x18], x17 | |||||
| st1 {v25.4h}, [x19], x17 | |||||
| st1 {v25.h}[4], [x13], x17 | st1 {v25.h}[4], [x13], x17 | ||||
| cmp w10, #10 | cmp w10, #10 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v26.4h}, [x18], x17 | |||||
| st1 {v26.4h}, [x19], x17 | |||||
| st1 {v26.h}[4], [x13], x17 | st1 {v26.h}[4], [x13], x17 | ||||
| cmp w10, #11 | cmp w10, #11 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v27.4h}, [x18], x17 | |||||
| st1 {v27.4h}, [x19], x17 | |||||
| st1 {v27.h}[4], [x13], x17 | st1 {v27.h}[4], [x13], x17 | ||||
| cmp w10, #12 | cmp w10, #12 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v28.4h}, [x18], x17 | |||||
| st1 {v28.4h}, [x19], x17 | |||||
| st1 {v28.h}[4], [x13], x17 | st1 {v28.h}[4], [x13], x17 | ||||
| cmp w10, #13 | cmp w10, #13 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v29.4h}, [x18], x17 | |||||
| st1 {v29.4h}, [x19], x17 | |||||
| st1 {v29.h}[4], [x13], x17 | st1 {v29.h}[4], [x13], x17 | ||||
| cmp w10, #14 | cmp w10, #14 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v30.4h}, [x18], x17 | |||||
| st1 {v30.4h}, [x19], x17 | |||||
| st1 {v30.h}[4], [x13], x17 | st1 {v30.h}[4], [x13], x17 | ||||
| cmp w10, #15 | cmp w10, #15 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v31.4h}, [x18], x17 | |||||
| st1 {v31.4h}, [x19], x17 | |||||
| st1 {v31.h}[4], [x13], x17 | st1 {v31.h}[4], [x13], x17 | ||||
| b WriteEnd | b WriteEnd | ||||
| Write6: | Write6: | ||||
| add x13, x18, #8 | |||||
| add x14, x18, #10 | |||||
| st1 {v16.4h}, [x18], x17 | |||||
| add x13, x19, #8 | |||||
| add x14, x19, #10 | |||||
| st1 {v16.4h}, [x19], x17 | |||||
| st1 {v16.h}[4], [x13], x17 | st1 {v16.h}[4], [x13], x17 | ||||
| st1 {v16.h}[5], [x14], x17 | st1 {v16.h}[5], [x14], x17 | ||||
| cmp w10, #1 | cmp w10, #1 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v17.4h}, [x18], x17 | |||||
| st1 {v17.4h}, [x19], x17 | |||||
| st1 {v17.h}[4], [x13], x17 | st1 {v17.h}[4], [x13], x17 | ||||
| st1 {v17.h}[5], [x14], x17 | st1 {v17.h}[5], [x14], x17 | ||||
| cmp w10, #2 | cmp w10, #2 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v18.4h}, [x18], x17 | |||||
| st1 {v18.4h}, [x19], x17 | |||||
| st1 {v18.h}[4], [x13], x17 | st1 {v18.h}[4], [x13], x17 | ||||
| st1 {v18.h}[5], [x14], x17 | st1 {v18.h}[5], [x14], x17 | ||||
| cmp w10, #3 | cmp w10, #3 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v19.4h}, [x18], x17 | |||||
| st1 {v19.4h}, [x19], x17 | |||||
| st1 {v19.h}[4], [x13], x17 | st1 {v19.h}[4], [x13], x17 | ||||
| st1 {v19.h}[5], [x14], x17 | st1 {v19.h}[5], [x14], x17 | ||||
| cmp w10, #4 | cmp w10, #4 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v20.4h}, [x18], x17 | |||||
| st1 {v20.4h}, [x19], x17 | |||||
| st1 {v20.h}[4], [x13], x17 | st1 {v20.h}[4], [x13], x17 | ||||
| st1 {v20.h}[5], [x14], x17 | st1 {v20.h}[5], [x14], x17 | ||||
| cmp w10, #5 | cmp w10, #5 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v21.4h}, [x18], x17 | |||||
| st1 {v21.4h}, [x19], x17 | |||||
| st1 {v21.h}[4], [x13], x17 | st1 {v21.h}[4], [x13], x17 | ||||
| st1 {v21.h}[5], [x14], x17 | st1 {v21.h}[5], [x14], x17 | ||||
| cmp w10, #6 | cmp w10, #6 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v22.4h}, [x18], x17 | |||||
| st1 {v22.4h}, [x19], x17 | |||||
| st1 {v22.h}[4], [x13], x17 | st1 {v22.h}[4], [x13], x17 | ||||
| st1 {v22.h}[5], [x14], x17 | st1 {v22.h}[5], [x14], x17 | ||||
| cmp w10, #7 | cmp w10, #7 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v23.4h}, [x18], x17 | |||||
| st1 {v23.4h}, [x19], x17 | |||||
| st1 {v23.h}[4], [x13], x17 | st1 {v23.h}[4], [x13], x17 | ||||
| st1 {v23.h}[5], [x14], x17 | st1 {v23.h}[5], [x14], x17 | ||||
| cmp w10, #8 | cmp w10, #8 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v24.4h}, [x18], x17 | |||||
| st1 {v24.4h}, [x19], x17 | |||||
| st1 {v24.h}[4], [x13], x17 | st1 {v24.h}[4], [x13], x17 | ||||
| st1 {v24.h}[5], [x14], x17 | st1 {v24.h}[5], [x14], x17 | ||||
| cmp w10, #9 | cmp w10, #9 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v25.4h}, [x18], x17 | |||||
| st1 {v25.4h}, [x19], x17 | |||||
| st1 {v25.h}[4], [x13], x17 | st1 {v25.h}[4], [x13], x17 | ||||
| st1 {v25.h}[5], [x14], x17 | st1 {v25.h}[5], [x14], x17 | ||||
| cmp w10, #10 | cmp w10, #10 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v26.4h}, [x18], x17 | |||||
| st1 {v26.4h}, [x19], x17 | |||||
| st1 {v26.h}[4], [x13], x17 | st1 {v26.h}[4], [x13], x17 | ||||
| st1 {v26.h}[5], [x14], x17 | st1 {v26.h}[5], [x14], x17 | ||||
| cmp w10, #11 | cmp w10, #11 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v27.4h}, [x18], x17 | |||||
| st1 {v27.4h}, [x19], x17 | |||||
| st1 {v27.h}[4], [x13], x17 | st1 {v27.h}[4], [x13], x17 | ||||
| st1 {v27.h}[5], [x14], x17 | st1 {v27.h}[5], [x14], x17 | ||||
| cmp w10, #12 | cmp w10, #12 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v28.4h}, [x18], x17 | |||||
| st1 {v28.4h}, [x19], x17 | |||||
| st1 {v28.h}[4], [x13], x17 | st1 {v28.h}[4], [x13], x17 | ||||
| st1 {v28.h}[5], [x14], x17 | st1 {v28.h}[5], [x14], x17 | ||||
| cmp w10, #13 | cmp w10, #13 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v29.4h}, [x18], x17 | |||||
| st1 {v29.4h}, [x19], x17 | |||||
| st1 {v29.h}[4], [x13], x17 | st1 {v29.h}[4], [x13], x17 | ||||
| st1 {v29.h}[5], [x14], x17 | st1 {v29.h}[5], [x14], x17 | ||||
| cmp w10, #14 | cmp w10, #14 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v30.4h}, [x18], x17 | |||||
| st1 {v30.4h}, [x19], x17 | |||||
| st1 {v30.h}[4], [x13], x17 | st1 {v30.h}[4], [x13], x17 | ||||
| st1 {v30.h}[5], [x14], x17 | st1 {v30.h}[5], [x14], x17 | ||||
| cmp w10, #15 | cmp w10, #15 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v31.4h}, [x18], x17 | |||||
| st1 {v31.4h}, [x19], x17 | |||||
| st1 {v31.h}[4], [x13], x17 | st1 {v31.h}[4], [x13], x17 | ||||
| st1 {v31.h}[5], [x14], x17 | st1 {v31.h}[5], [x14], x17 | ||||
| b WriteEnd | b WriteEnd | ||||
| Write7: | Write7: | ||||
| add x13, x18, #8 | |||||
| add x14, x18, #10 | |||||
| add x16, x18, #12 | |||||
| st1 {v16.4h}, [x18], x17 | |||||
| add x13, x19, #8 | |||||
| add x14, x19, #10 | |||||
| add x16, x19, #12 | |||||
| st1 {v16.4h}, [x19], x17 | |||||
| st1 {v16.h}[4], [x13], x17 | st1 {v16.h}[4], [x13], x17 | ||||
| st1 {v16.h}[5], [x14], x17 | st1 {v16.h}[5], [x14], x17 | ||||
| st1 {v16.h}[6], [x16], x17 | st1 {v16.h}[6], [x16], x17 | ||||
| cmp w10, #1 | cmp w10, #1 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v17.4h}, [x18], x17 | |||||
| st1 {v17.4h}, [x19], x17 | |||||
| st1 {v17.h}[4], [x13], x17 | st1 {v17.h}[4], [x13], x17 | ||||
| st1 {v17.h}[5], [x14], x17 | st1 {v17.h}[5], [x14], x17 | ||||
| st1 {v17.h}[6], [x16], x17 | st1 {v17.h}[6], [x16], x17 | ||||
| cmp w10, #2 | cmp w10, #2 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v18.4h}, [x18], x17 | |||||
| st1 {v18.4h}, [x19], x17 | |||||
| st1 {v18.h}[4], [x13], x17 | st1 {v18.h}[4], [x13], x17 | ||||
| st1 {v18.h}[5], [x14], x17 | st1 {v18.h}[5], [x14], x17 | ||||
| st1 {v18.h}[6], [x16], x17 | st1 {v18.h}[6], [x16], x17 | ||||
| cmp w10, #3 | cmp w10, #3 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v19.4h}, [x18], x17 | |||||
| st1 {v19.4h}, [x19], x17 | |||||
| st1 {v19.h}[4], [x13], x17 | st1 {v19.h}[4], [x13], x17 | ||||
| st1 {v19.h}[5], [x14], x17 | st1 {v19.h}[5], [x14], x17 | ||||
| st1 {v19.h}[6], [x16], x17 | st1 {v19.h}[6], [x16], x17 | ||||
| cmp w10, #4 | cmp w10, #4 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v20.4h}, [x18], x17 | |||||
| st1 {v20.4h}, [x19], x17 | |||||
| st1 {v20.h}[4], [x13], x17 | st1 {v20.h}[4], [x13], x17 | ||||
| st1 {v20.h}[5], [x14], x17 | st1 {v20.h}[5], [x14], x17 | ||||
| st1 {v20.h}[6], [x16], x17 | st1 {v20.h}[6], [x16], x17 | ||||
| cmp w10, #5 | cmp w10, #5 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v21.4h}, [x18], x17 | |||||
| st1 {v21.4h}, [x19], x17 | |||||
| st1 {v21.h}[4], [x13], x17 | st1 {v21.h}[4], [x13], x17 | ||||
| st1 {v21.h}[5], [x14], x17 | st1 {v21.h}[5], [x14], x17 | ||||
| st1 {v21.h}[6], [x16], x17 | st1 {v21.h}[6], [x16], x17 | ||||
| cmp w10, #6 | cmp w10, #6 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v22.4h}, [x18], x17 | |||||
| st1 {v22.4h}, [x19], x17 | |||||
| st1 {v22.h}[4], [x13], x17 | st1 {v22.h}[4], [x13], x17 | ||||
| st1 {v22.h}[5], [x14], x17 | st1 {v22.h}[5], [x14], x17 | ||||
| st1 {v22.h}[6], [x16], x17 | st1 {v22.h}[6], [x16], x17 | ||||
| cmp w10, #7 | cmp w10, #7 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v23.4h}, [x18], x17 | |||||
| st1 {v23.4h}, [x19], x17 | |||||
| st1 {v23.h}[4], [x13], x17 | st1 {v23.h}[4], [x13], x17 | ||||
| st1 {v23.h}[5], [x14], x17 | st1 {v23.h}[5], [x14], x17 | ||||
| st1 {v23.h}[6], [x16], x17 | st1 {v23.h}[6], [x16], x17 | ||||
| cmp w10, #8 | cmp w10, #8 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v24.4h}, [x18], x17 | |||||
| st1 {v24.4h}, [x19], x17 | |||||
| st1 {v24.h}[4], [x13], x17 | st1 {v24.h}[4], [x13], x17 | ||||
| st1 {v24.h}[5], [x14], x17 | st1 {v24.h}[5], [x14], x17 | ||||
| st1 {v24.h}[6], [x16], x17 | st1 {v24.h}[6], [x16], x17 | ||||
| cmp w10, #9 | cmp w10, #9 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v25.4h}, [x18], x17 | |||||
| st1 {v25.4h}, [x19], x17 | |||||
| st1 {v25.h}[4], [x13], x17 | st1 {v25.h}[4], [x13], x17 | ||||
| st1 {v25.h}[5], [x14], x17 | st1 {v25.h}[5], [x14], x17 | ||||
| st1 {v25.h}[6], [x16], x17 | st1 {v25.h}[6], [x16], x17 | ||||
| cmp w10, #10 | cmp w10, #10 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v26.4h}, [x18], x17 | |||||
| st1 {v26.4h}, [x19], x17 | |||||
| st1 {v26.h}[4], [x13], x17 | st1 {v26.h}[4], [x13], x17 | ||||
| st1 {v26.h}[5], [x14], x17 | st1 {v26.h}[5], [x14], x17 | ||||
| st1 {v26.h}[6], [x16], x17 | st1 {v26.h}[6], [x16], x17 | ||||
| cmp w10, #11 | cmp w10, #11 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v27.4h}, [x18], x17 | |||||
| st1 {v27.4h}, [x19], x17 | |||||
| st1 {v27.h}[4], [x13], x17 | st1 {v27.h}[4], [x13], x17 | ||||
| st1 {v27.h}[5], [x14], x17 | st1 {v27.h}[5], [x14], x17 | ||||
| st1 {v27.h}[6], [x16], x17 | st1 {v27.h}[6], [x16], x17 | ||||
| cmp w10, #12 | cmp w10, #12 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v28.4h}, [x18], x17 | |||||
| st1 {v28.4h}, [x19], x17 | |||||
| st1 {v28.h}[4], [x13], x17 | st1 {v28.h}[4], [x13], x17 | ||||
| st1 {v28.h}[5], [x14], x17 | st1 {v28.h}[5], [x14], x17 | ||||
| st1 {v28.h}[6], [x16], x17 | st1 {v28.h}[6], [x16], x17 | ||||
| cmp w10, #13 | cmp w10, #13 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v29.4h}, [x18], x17 | |||||
| st1 {v29.4h}, [x19], x17 | |||||
| st1 {v29.h}[4], [x13], x17 | st1 {v29.h}[4], [x13], x17 | ||||
| st1 {v29.h}[5], [x14], x17 | st1 {v29.h}[5], [x14], x17 | ||||
| st1 {v29.h}[6], [x16], x17 | st1 {v29.h}[6], [x16], x17 | ||||
| cmp w10, #14 | cmp w10, #14 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v30.4h}, [x18], x17 | |||||
| st1 {v30.4h}, [x19], x17 | |||||
| st1 {v30.h}[4], [x13], x17 | st1 {v30.h}[4], [x13], x17 | ||||
| st1 {v30.h}[5], [x14], x17 | st1 {v30.h}[5], [x14], x17 | ||||
| st1 {v30.h}[6], [x16], x17 | st1 {v30.h}[6], [x16], x17 | ||||
| cmp w10, #15 | cmp w10, #15 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v31.4h}, [x18], x17 | |||||
| st1 {v31.4h}, [x19], x17 | |||||
| st1 {v31.h}[4], [x13], x17 | st1 {v31.h}[4], [x13], x17 | ||||
| st1 {v31.h}[5], [x14], x17 | st1 {v31.h}[5], [x14], x17 | ||||
| st1 {v31.h}[6], [x16], x17 | st1 {v31.h}[6], [x16], x17 | ||||
| @@ -809,52 +810,52 @@ WriteC8: | |||||
| st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x2], #64 | st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x2], #64 | ||||
| b WriteEnd | b WriteEnd | ||||
| Write8: | Write8: | ||||
| st1 {v16.8h}, [x18], x17 | |||||
| st1 {v16.8h}, [x19], x17 | |||||
| cmp w10, #1 | cmp w10, #1 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v17.8h}, [x18], x17 | |||||
| st1 {v17.8h}, [x19], x17 | |||||
| cmp w10, #2 | cmp w10, #2 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v18.8h}, [x18], x17 | |||||
| st1 {v18.8h}, [x19], x17 | |||||
| cmp w10, #3 | cmp w10, #3 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v19.8h}, [x18], x17 | |||||
| st1 {v19.8h}, [x19], x17 | |||||
| cmp w10, #4 | cmp w10, #4 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v20.8h}, [x18], x17 | |||||
| st1 {v20.8h}, [x19], x17 | |||||
| cmp w10, #5 | cmp w10, #5 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v21.8h}, [x18], x17 | |||||
| st1 {v21.8h}, [x19], x17 | |||||
| cmp w10, #6 | cmp w10, #6 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v22.8h}, [x18], x17 | |||||
| st1 {v22.8h}, [x19], x17 | |||||
| cmp w10, #7 | cmp w10, #7 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v23.8h}, [x18], x17 | |||||
| st1 {v23.8h}, [x19], x17 | |||||
| cmp w10, #8 | cmp w10, #8 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v24.8h}, [x18], x17 | |||||
| st1 {v24.8h}, [x19], x17 | |||||
| cmp w10, #9 | cmp w10, #9 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v25.8h}, [x18], x17 | |||||
| st1 {v25.8h}, [x19], x17 | |||||
| cmp w10, #10 | cmp w10, #10 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v26.8h}, [x18], x17 | |||||
| st1 {v26.8h}, [x19], x17 | |||||
| cmp w10, #11 | cmp w10, #11 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v27.8h}, [x18], x17 | |||||
| st1 {v27.8h}, [x19], x17 | |||||
| cmp w10, #12 | cmp w10, #12 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v28.8h}, [x18], x17 | |||||
| st1 {v28.8h}, [x19], x17 | |||||
| cmp w10, #13 | cmp w10, #13 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v29.8h}, [x18], x17 | |||||
| st1 {v29.8h}, [x19], x17 | |||||
| cmp w10, #14 | cmp w10, #14 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v30.8h}, [x18], x17 | |||||
| st1 {v30.8h}, [x19], x17 | |||||
| cmp w10, #15 | cmp w10, #15 | ||||
| beq WriteEnd | beq WriteEnd | ||||
| st1 {v31.8h}, [x18], x17 | |||||
| st1 {v31.8h}, [x19], x17 | |||||
| WriteEnd: | WriteEnd: | ||||
| subs w10, w10, #16 // lhs row - 16 | subs w10, w10, #16 // lhs row - 16 | ||||
| @@ -871,8 +872,9 @@ NoDstStep: | |||||
| bgt L1 | bgt L1 | ||||
| End1: | End1: | ||||
| sub sp, sp, #128 | |||||
| sub sp, sp, #144 | |||||
| ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 | ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 | ||||
| ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 | ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 | ||||
| ldp x19, x20, [sp], #16 | |||||
| ret | ret | ||||
| #endif | #endif | ||||
| @@ -21,30 +21,31 @@ | |||||
| // x9: writeMode | // x9: writeMode | ||||
| asm_function MatmulFp16Neon64Opt | asm_function MatmulFp16Neon64Opt | ||||
| sub sp, sp, #80 | |||||
| sub sp, sp, #96 | |||||
| st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 | st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 | ||||
| stp x19, x20, [sp], #16 | stp x19, x20, [sp], #16 | ||||
| stp x21, x22, [sp], #16 | |||||
| ldr x8, [sp] | ldr x8, [sp] | ||||
| ldr x9, [sp, #8] | ldr x9, [sp, #8] | ||||
| mov x18, #32 // sizeof(float16_t) * 16 | |||||
| mul x17, x5, x18 // block stride of lhs/rhs: sizeof(float16_t) * 16 * depth | |||||
| mov x21, #32 // sizeof(float16_t) * 16 | |||||
| mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float16_t) * 16 * depth | |||||
| cbnz x9, NoC8Steps | cbnz x9, NoC8Steps | ||||
| mov x11, x2 | mov x11, x2 | ||||
| mov x18, #16 | |||||
| mul x16, x6, x18 // row * 8 * sizeof(float16_t) | |||||
| mov x21, #16 | |||||
| mul x16, x6, x21 // row * 8 * sizeof(float16_t) | |||||
| NoC8Steps: | NoC8Steps: | ||||
| cmp x9, #2 | cmp x9, #2 | ||||
| bne NoWinoSteps | bne NoWinoSteps | ||||
| mov x18, #2 | |||||
| mov x21, #2 | |||||
| mul x15, x7, x8 | mul x15, x7, x8 | ||||
| mul x15, x15, x18 // kernel_size * col *sizeof(float16_t) | |||||
| mov x18, #16 | |||||
| mul x16, x8, x18 // kernel_size * 8 * sizeof(float16_t) | |||||
| mul x15, x15, x21 // kernel_size * col *sizeof(float16_t) | |||||
| mov x21, #16 | |||||
| mul x16, x8, x21 // kernel_size * 8 * sizeof(float16_t) | |||||
| NoWinoSteps: | NoWinoSteps: | ||||
| mov x18, #2 | |||||
| mul x8, x8, x18 | |||||
| mov x21, #2 | |||||
| mul x8, x8, x21 | |||||
| LoopRowStart: | LoopRowStart: | ||||
| cmp x6, #1 | cmp x6, #1 | ||||
| @@ -1221,9 +1222,9 @@ LoopRow: | |||||
| LoopColEnd: | LoopColEnd: | ||||
| add x0, x0, x17 | add x0, x0, x17 | ||||
| cbz x9, C8DstStep | cbz x9, C8DstStep | ||||
| mov x18, #2 | |||||
| mul x18, x18, x7 | |||||
| sub x11, x11, x18 | |||||
| mov x21, #2 | |||||
| mul x21, x21, x7 | |||||
| sub x11, x11, x21 | |||||
| mov x2, x11 | mov x2, x11 | ||||
| b NoDstStep | b NoDstStep | ||||
| C8DstStep: | C8DstStep: | ||||
| @@ -1233,8 +1234,9 @@ LoopColEnd: | |||||
| subs x6, x6, #16 | subs x6, x6, #16 | ||||
| bgt LoopRowStart | bgt LoopRowStart | ||||
| sub sp, sp, #80 | |||||
| sub sp, sp, #96 | |||||
| ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 | ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 | ||||
| ldp x19, x20, [sp], #16 | ldp x19, x20, [sp], #16 | ||||
| ldp x21, x22, [sp], #16 | |||||
| ret | ret | ||||
| #endif | #endif | ||||
| @@ -31,13 +31,13 @@ asm_function MatrixMultiplyWinogradFp16 | |||||
| mov x14, x1 // mat_b | mov x14, x1 // mat_b | ||||
| LoopN: | LoopN: | ||||
| mov x16, x0 // mat_a_m | mov x16, x0 // mat_a_m | ||||
| sub x18, x5, x15 // ni | |||||
| sub x22, x5, x15 // ni | |||||
| sub x19, x17, x3 // mi | sub x19, x17, x3 // mi | ||||
| mul x18, x18, x17 // ni * m | |||||
| mul x22, x22, x17 // ni * m | |||||
| mov x11, x6 // in_channel | mov x11, x6 // in_channel | ||||
| add x18, x18, x19 // (ni * m) + mi | |||||
| mul x18, x18, x13 // x18 * channel_in * 2 | |||||
| add x20, x2, x18 // dst + offset | |||||
| add x22, x22, x19 // (ni * m) + mi | |||||
| mul x22, x22, x13 // x22 * channel_in * 2 | |||||
| add x20, x2, x22 // dst + offset | |||||
| cmp x11, #32 | cmp x11, #32 | ||||
| bge LoopC32 | bge LoopC32 | ||||
| cmp x11, #16 | cmp x11, #16 | ||||
| @@ -9,8 +9,8 @@ | |||||
| asm_function WinogradTransLeftFp16 | asm_function WinogradTransLeftFp16 | ||||
| sub sp, sp, #32 | |||||
| stp x19, x20, [sp], #32 | |||||
| sub sp, sp, #16 | |||||
| stp x19, x20, [sp], #16 | |||||
| mov x8, #8 // 4 * sizeof(float16) | mov x8, #8 // 4 * sizeof(float16) | ||||
| mul x8, x6, x8 | mul x8, x6, x8 | ||||
| @@ -46,16 +46,16 @@ LoopH: | |||||
| ld1 {v0.h}[2], [x17], x10 | ld1 {v0.h}[2], [x17], x10 | ||||
| ld1 {v0.h}[3], [x17], x10 | ld1 {v0.h}[3], [x17], x10 | ||||
| mov x11, x6 | mov x11, x6 | ||||
| mov x18, x17 | |||||
| add x18, x14, x7 | |||||
| add x16, x18, x7 | |||||
| mov x20, x17 | |||||
| add x20, x14, x7 | |||||
| add x16, x20, x7 | |||||
| add x19, x16, x7 | add x19, x16, x7 | ||||
| LoopLength4: | LoopLength4: | ||||
| ld1 {v16.4h}, [x2] | ld1 {v16.4h}, [x2] | ||||
| ld1 {v20.4h}, [x14], #8 | ld1 {v20.4h}, [x14], #8 | ||||
| fmla v16.4h, v20.4h, v0.h[0] | fmla v16.4h, v20.4h, v0.h[0] | ||||
| ld1 {v21.4h}, [x18], #8 | |||||
| ld1 {v21.4h}, [x20], #8 | |||||
| fmul v17.4h, v21.4h, v0.h[1] | fmul v17.4h, v21.4h, v0.h[1] | ||||
| ld1 {v20.4h}, [x16], #8 | ld1 {v20.4h}, [x16], #8 | ||||
| fmla v16.4h, v20.4h, v0.h[2] | fmla v16.4h, v20.4h, v0.h[2] | ||||
| @@ -81,14 +81,14 @@ LoopH: | |||||
| ld1 {v0.h}[1], [x17], x10 | ld1 {v0.h}[1], [x17], x10 | ||||
| ld1 {v0.h}[2], [x17], x10 | ld1 {v0.h}[2], [x17], x10 | ||||
| mov x11, x6 | mov x11, x6 | ||||
| mov x18, x17 | |||||
| add x18, x14, x7 | |||||
| add x16, x18, x7 | |||||
| mov x20, x17 | |||||
| add x20, x14, x7 | |||||
| add x16, x20, x7 | |||||
| LoopLength3: | LoopLength3: | ||||
| ld1 {v16.4h}, [x2] | ld1 {v16.4h}, [x2] | ||||
| ld1 {v20.4h}, [x14], #8 | ld1 {v20.4h}, [x14], #8 | ||||
| fmla v16.4h, v20.4h, v0.h[0] | fmla v16.4h, v20.4h, v0.h[0] | ||||
| ld1 {v21.4h}, [x18], #8 | |||||
| ld1 {v21.4h}, [x20], #8 | |||||
| fmul v17.4h, v21.4h, v0.h[1] | fmul v17.4h, v21.4h, v0.h[1] | ||||
| ld1 {v20.4h}, [x16], #8 | ld1 {v20.4h}, [x16], #8 | ||||
| fmla v16.4h, v20.4h, v0.h[2] | fmla v16.4h, v20.4h, v0.h[2] | ||||
| @@ -132,6 +132,6 @@ LoopH: | |||||
| subs x4, x4, #1 | subs x4, x4, #1 | ||||
| bne LoopH | bne LoopH | ||||
| sub sp, sp, #32 | |||||
| ldp x19, x20, [sp], #32 | |||||
| sub sp, sp, #16 | |||||
| ldp x19, x20, [sp], #16 | |||||
| ret | ret | ||||
| @@ -9,6 +9,9 @@ | |||||
| asm_function WinogradTransRightFp16 | asm_function WinogradTransRightFp16 | ||||
| sub sp, sp, #16 | |||||
| stp x19, x20, [sp], #16 | |||||
| mov x8, #8 // 4 * sizeof(float16) | mov x8, #8 // 4 * sizeof(float16) | ||||
| mul x8, x6, x8 | mul x8, x6, x8 | ||||
| mul x9, x5, x8 // step for S | mul x9, x5, x8 // step for S | ||||
| @@ -34,7 +37,7 @@ LoopH: | |||||
| cmp x12, #4 | cmp x12, #4 | ||||
| blt LoopKStart3 | blt LoopKStart3 | ||||
| mov x16, x15 | mov x16, x15 | ||||
| mov x18, x4 | |||||
| mov x19, x4 | |||||
| LoopK4: | LoopK4: | ||||
| ld1 {v0.h}[0], [x13], x10 | ld1 {v0.h}[0], [x13], x10 | ||||
| ld1 {v0.h}[1], [x13], x10 | ld1 {v0.h}[1], [x13], x10 | ||||
| @@ -45,7 +48,7 @@ LoopH: | |||||
| add x14, x17, x8 | add x14, x17, x8 | ||||
| add x16, x14, x8 | add x16, x14, x8 | ||||
| add x18, x16, x8 | |||||
| add x19, x16, x8 | |||||
| LoopLength4: | LoopLength4: | ||||
| ld1 {v16.4h}, [x2] | ld1 {v16.4h}, [x2] | ||||
| @@ -55,7 +58,7 @@ LoopH: | |||||
| fmul v17.4h, v21.4h, v0.h[1] | fmul v17.4h, v21.4h, v0.h[1] | ||||
| ld1 {v20.4h}, [x16], #8 | ld1 {v20.4h}, [x16], #8 | ||||
| fmla v16.4h, v20.4h, v0.h[2] | fmla v16.4h, v20.4h, v0.h[2] | ||||
| ld1 {v21.4h}, [x18], #8 | |||||
| ld1 {v21.4h}, [x19], #8 | |||||
| fmla v17.4h, v21.4h, v0.h[3] | fmla v17.4h, v21.4h, v0.h[3] | ||||
| fadd v17.4h, v16.4h, v17.4h | fadd v17.4h, v16.4h, v17.4h | ||||
| @@ -64,7 +67,7 @@ LoopH: | |||||
| bne LoopLength4 | bne LoopLength4 | ||||
| sub x2, x2, x8 | sub x2, x2, x8 | ||||
| sub x12, x12, #4 | sub x12, x12, #4 | ||||
| mov x17, x18 | |||||
| mov x17, x19 | |||||
| cmp x12, #4 | cmp x12, #4 | ||||
| bge LoopK4 | bge LoopK4 | ||||
| @@ -98,7 +101,7 @@ LoopH: | |||||
| bne LoopLength3 | bne LoopLength3 | ||||
| sub x2, x2, x8 | sub x2, x2, x8 | ||||
| sub x12, x12, #3 | sub x12, x12, #3 | ||||
| mov x17, x18 | |||||
| mov x17, x19 | |||||
| cmp x12, #3 | cmp x12, #3 | ||||
| bge LoopK3 | bge LoopK3 | ||||
| @@ -132,4 +135,7 @@ LoopH: | |||||
| subs x4, x4, #1 | subs x4, x4, #1 | ||||
| bne LoopH | bne LoopH | ||||
| sub sp, sp, #16 | |||||
| ldp x19, x20, [sp], #16 | |||||
| ret | ret | ||||
| @@ -66,7 +66,7 @@ L2: | |||||
| cmp w16, #0 | cmp w16, #0 | ||||
| beq End2 | beq End2 | ||||
| mov x18, x1 // reload b ptr | |||||
| mov x28, x1 // reload b ptr | |||||
| mov x19, x7 // reload bias ptr | mov x19, x7 // reload bias ptr | ||||
| mov w20, w5 // reload depth | mov w20, w5 // reload depth | ||||
| dup v16.4s, wzr | dup v16.4s, wzr | ||||
| @@ -91,7 +91,7 @@ L3: | |||||
| LoopD16: | LoopD16: | ||||
| ld1 {v0.16b, v1.16b}, [x17], #32 | ld1 {v0.16b, v1.16b}, [x17], #32 | ||||
| ld1 {v2.16b, v3.16b}, [x18], #32 | |||||
| ld1 {v2.16b, v3.16b}, [x28], #32 | |||||
| sdot v16.4s, v2.16b, v0.4b[0] | sdot v16.4s, v2.16b, v0.4b[0] | ||||
| sdot v18.4s, v2.16b, v0.4b[1] | sdot v18.4s, v2.16b, v0.4b[1] | ||||
| @@ -104,7 +104,7 @@ LoopD16: | |||||
| sdot v28.4s, v2.16b, v1.4b[2] | sdot v28.4s, v2.16b, v1.4b[2] | ||||
| sdot v30.4s, v2.16b, v1.4b[3] | sdot v30.4s, v2.16b, v1.4b[3] | ||||
| ld1 {v6.16b, v7.16b}, [x18], #32 | |||||
| ld1 {v6.16b, v7.16b}, [x28], #32 | |||||
| sdot v17.4s, v3.16b, v0.4b[0] | sdot v17.4s, v3.16b, v0.4b[0] | ||||
| sdot v19.4s, v3.16b, v0.4b[1] | sdot v19.4s, v3.16b, v0.4b[1] | ||||
| sdot v21.4s, v3.16b, v0.4b[2] | sdot v21.4s, v3.16b, v0.4b[2] | ||||
| @@ -126,7 +126,7 @@ LoopD16: | |||||
| sdot v28.4s, v6.16b, v5.4b[2] | sdot v28.4s, v6.16b, v5.4b[2] | ||||
| sdot v30.4s, v6.16b, v5.4b[3] | sdot v30.4s, v6.16b, v5.4b[3] | ||||
| ld1 {v10.16b, v11.16b}, [x18], #32 | |||||
| ld1 {v10.16b, v11.16b}, [x28], #32 | |||||
| sdot v17.4s, v7.16b, v4.4b[0] | sdot v17.4s, v7.16b, v4.4b[0] | ||||
| sdot v19.4s, v7.16b, v4.4b[1] | sdot v19.4s, v7.16b, v4.4b[1] | ||||
| sdot v21.4s, v7.16b, v4.4b[2] | sdot v21.4s, v7.16b, v4.4b[2] | ||||
| @@ -148,7 +148,7 @@ LoopD16: | |||||
| sdot v28.4s, v10.16b, v9.4b[2] | sdot v28.4s, v10.16b, v9.4b[2] | ||||
| sdot v30.4s, v10.16b, v9.4b[3] | sdot v30.4s, v10.16b, v9.4b[3] | ||||
| ld1 {v14.16b, v15.16b}, [x18], #32 | |||||
| ld1 {v14.16b, v15.16b}, [x28], #32 | |||||
| sdot v17.4s, v11.16b, v8.4b[0] | sdot v17.4s, v11.16b, v8.4b[0] | ||||
| sdot v19.4s, v11.16b, v8.4b[1] | sdot v19.4s, v11.16b, v8.4b[1] | ||||
| sdot v21.4s, v11.16b, v8.4b[2] | sdot v21.4s, v11.16b, v8.4b[2] | ||||
| @@ -187,7 +187,7 @@ LoopD4: | |||||
| beq End3 | beq End3 | ||||
| ld1 {v0.16b, v1.16b}, [x17], #32 | ld1 {v0.16b, v1.16b}, [x17], #32 | ||||
| ld1 {v2.16b, v3.16b}, [x18], #32 | |||||
| ld1 {v2.16b, v3.16b}, [x28], #32 | |||||
| sdot v16.4s, v2.16b, v0.4b[0] | sdot v16.4s, v2.16b, v0.4b[0] | ||||
| sdot v18.4s, v2.16b, v0.4b[1] | sdot v18.4s, v2.16b, v0.4b[1] | ||||
| @@ -30,7 +30,7 @@ | |||||
| // x28: filter_zp | // x28: filter_zp | ||||
| asm_function MatmulInt8DpOpt | asm_function MatmulInt8DpOpt | ||||
| sub sp, sp, #208 | |||||
| sub sp, sp, #224 | |||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| stp x19, x20, [sp], #16 | stp x19, x20, [sp], #16 | ||||
| @@ -38,6 +38,7 @@ asm_function MatmulInt8DpOpt | |||||
| stp x23, x24, [sp], #16 | stp x23, x24, [sp], #16 | ||||
| stp x25, x26, [sp], #16 | stp x25, x26, [sp], #16 | ||||
| stp x27, x28, [sp], #16 | stp x27, x28, [sp], #16 | ||||
| stp x29, x30, [sp], #16 | |||||
| ldr w8, [sp] | ldr w8, [sp] | ||||
| ldr w9, [sp, #8] | ldr w9, [sp, #8] | ||||
| @@ -56,7 +57,7 @@ asm_function MatmulInt8DpOpt | |||||
| LoopRow: | LoopRow: | ||||
| mov x16, x1 // reload rhs ptr | mov x16, x1 // reload rhs ptr | ||||
| mov x17, x4 // reload rhs col | mov x17, x4 // reload rhs col | ||||
| mov x18, x7 // reload bias ptr | |||||
| mov x29, x7 // reload bias ptr | |||||
| mov x25, x6 // reload input_sum ptr | mov x25, x6 // reload input_sum ptr | ||||
| mov x27, x2 // reload dst ptr | mov x27, x2 // reload dst ptr | ||||
| ldr x28, [sp, #64] // reload filter_zp | ldr x28, [sp, #64] // reload filter_zp | ||||
| @@ -113,7 +114,7 @@ LoopRow: | |||||
| Bias: | Bias: | ||||
| cbz x7, NoReadBias | cbz x7, NoReadBias | ||||
| ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x18], #64 | |||||
| ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x29], #64 | |||||
| add v16.4s, v16.4s, v0.4s | add v16.4s, v16.4s, v0.4s | ||||
| add v17.4s, v17.4s, v1.4s | add v17.4s, v17.4s, v1.4s | ||||
| add v18.4s, v18.4s, v2.4s | add v18.4s, v18.4s, v2.4s | ||||
| @@ -423,8 +424,8 @@ LoopRow: | |||||
| BiasHalf: | BiasHalf: | ||||
| cbz x7, NoReadBiasHalf | cbz x7, NoReadBiasHalf | ||||
| ld1 {v0.4s, v1.4s}, [x18] | |||||
| add x18, x18, #64 | |||||
| ld1 {v0.4s, v1.4s}, [x29] | |||||
| add x29, x29, #64 | |||||
| add v16.4s, v16.4s, v0.4s | add v16.4s, v16.4s, v0.4s | ||||
| add v17.4s, v17.4s, v1.4s | add v17.4s, v17.4s, v1.4s | ||||
| add v20.4s, v20.4s, v0.4s | add v20.4s, v20.4s, v0.4s | ||||
| @@ -612,8 +613,8 @@ LoopRow: | |||||
| BiasQuarter: | BiasQuarter: | ||||
| cbz x7, NoReadBiasQuarter | cbz x7, NoReadBiasQuarter | ||||
| ld1 {v0.4s}, [x18] | |||||
| add x18, x18, #64 | |||||
| ld1 {v0.4s}, [x29] | |||||
| add x29, x29, #64 | |||||
| add v16.4s, v16.4s, v0.4s | add v16.4s, v16.4s, v0.4s | ||||
| add v20.4s, v20.4s, v0.4s | add v20.4s, v20.4s, v0.4s | ||||
| add v24.4s, v24.4s, v0.4s | add v24.4s, v24.4s, v0.4s | ||||
| @@ -1072,7 +1073,7 @@ LoopColEnd: | |||||
| b LoopRow | b LoopRow | ||||
| LoopRowEnd: | LoopRowEnd: | ||||
| sub sp, sp, #208 | |||||
| sub sp, sp, #224 | |||||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| ldp x19, x20, [sp], #16 | ldp x19, x20, [sp], #16 | ||||
| @@ -1080,5 +1081,6 @@ LoopRowEnd: | |||||
| ldp x23, x24, [sp], #16 | ldp x23, x24, [sp], #16 | ||||
| ldp x25, x26, [sp], #16 | ldp x25, x26, [sp], #16 | ||||
| ldp x27, x28, [sp], #16 | ldp x27, x28, [sp], #16 | ||||
| ldp x29, x30, [sp], #16 | |||||
| ret | ret | ||||
| #endif | #endif | ||||
| @@ -20,9 +20,10 @@ | |||||
| // x7: bias | // x7: bias | ||||
| asm_function MatMulOptR4Int8Neon64 | asm_function MatMulOptR4Int8Neon64 | ||||
| sub sp, sp, #128 | |||||
| sub sp, sp, #144 | |||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| stp x19, x20, [sp], #16 | |||||
| mov w15, #0 // b col index | mov w15, #0 // b col index | ||||
| mov w16, #0 // a row index | mov w16, #0 // a row index | ||||
| @@ -40,7 +41,7 @@ L2: | |||||
| cmp w16, w3 | cmp w16, w3 | ||||
| beq End2 | beq End2 | ||||
| mov x18, x1 // reload b ptr | |||||
| mov x19, x1 // reload b ptr | |||||
| mov x10, x7 // reload bias ptr | mov x10, x7 // reload bias ptr | ||||
| mov w11, w5 // reload depth | mov w11, w5 // reload depth | ||||
| dup v16.4s, wzr | dup v16.4s, wzr | ||||
| @@ -67,10 +68,10 @@ L3: | |||||
| ld1 {v1.16b}, [x17], #16 | ld1 {v1.16b}, [x17], #16 | ||||
| ld1 {v2.16b}, [x17], #16 | ld1 {v2.16b}, [x17], #16 | ||||
| ld1 {v3.16b}, [x17], #16 | ld1 {v3.16b}, [x17], #16 | ||||
| ld1 {v4.16b}, [x18], #16 | |||||
| ld1 {v5.16b}, [x18], #16 | |||||
| ld1 {v6.16b}, [x18], #16 | |||||
| ld1 {v7.16b}, [x18], #16 | |||||
| ld1 {v4.16b}, [x19], #16 | |||||
| ld1 {v5.16b}, [x19], #16 | |||||
| ld1 {v6.16b}, [x19], #16 | |||||
| ld1 {v7.16b}, [x19], #16 | |||||
| sdot v16.4s, v4.16b, v0.16b | sdot v16.4s, v4.16b, v0.16b | ||||
| sdot v17.4s, v5.16b, v0.16b | sdot v17.4s, v5.16b, v0.16b | ||||
| @@ -135,8 +136,9 @@ End2: | |||||
| b L1 | b L1 | ||||
| End1: | End1: | ||||
| sub sp, sp, #128 | |||||
| sub sp, sp, #144 | |||||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| ldp x19, x20, [sp], #16 | |||||
| ret | ret | ||||
| #endif | #endif | ||||