| @@ -28,11 +28,11 @@ asm_function AdderFloatNeon64 | |||
| ldr x8, [sp] | |||
| mov x18, #48 // sizeof(float) * 12 | |||
| mul x17, x5, x18 // block stride of lhs/rhs: sizeof(float) * 12 * depth | |||
| mov x20, #48 // sizeof(float) * 12 | |||
| mul x17, x5, x20 // block stride of lhs/rhs: sizeof(float) * 12 * depth | |||
| mov x18, #4 | |||
| mul x8, x8, x18 | |||
| mov x20, #4 | |||
| mul x8, x8, x20 | |||
| LoopRowStart: | |||
| cmp x6, #4 | |||
| @@ -595,9 +595,9 @@ LoopRow4: | |||
| LoopColEnd: | |||
| add x0, x0, x17 | |||
| mov x18, #4 | |||
| mul x18, x18, x7 | |||
| sub x11, x11, x18 | |||
| mov x20, #4 | |||
| mul x20, x20, x7 | |||
| sub x11, x11, x20 | |||
| mov x2, x11 | |||
| subs x6, x6, #12 | |||
| bgt LoopRowStart | |||
| @@ -33,12 +33,13 @@ | |||
| // w16: per_channel | |||
| asm_function ConvDw3x3Int8Neon64 | |||
| sub sp, sp, #176 | |||
| sub sp, sp, #192 | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| stp x19, x20, [sp], #16 | |||
| stp x21, x22, [sp], #16 | |||
| stp x23, x24, [sp], #16 | |||
| stp x25, x26, [sp], #16 | |||
| ldr x8, [sp] | |||
| ldr x9, [sp, #8] | |||
| @@ -84,16 +85,16 @@ asm_function ConvDw3x3Int8Neon64 | |||
| mov x16, x1 | |||
| add x17, x16, x5 | |||
| add x18, x17, x5 | |||
| add x25, x17, x5 | |||
| ld1 {v9.8b}, [x16], x4 | |||
| ld1 {v10.8b}, [x16], x4 | |||
| ld1 {v11.8b}, [x16], x4 | |||
| ld1 {v13.8b}, [x17], x4 | |||
| ld1 {v14.8b}, [x17], x4 | |||
| ld1 {v15.8b}, [x17], x4 | |||
| ld1 {v17.8b}, [x18], x4 | |||
| ld1 {v18.8b}, [x18], x4 | |||
| ld1 {v19.8b}, [x18], x4 | |||
| ld1 {v17.8b}, [x25], x4 | |||
| ld1 {v18.8b}, [x25], x4 | |||
| ld1 {v19.8b}, [x25], x4 | |||
| ld1 {v21.4s}, [x3] | |||
| ld1 {v22.4s}, [x19] | |||
| @@ -123,13 +124,13 @@ HEIGHT1_LOOP: | |||
| ld1 {v16.8b}, [x17] | |||
| smlal v23.4s, v0.4h, v10.4h | |||
| smlal2 v24.4s, v0.8h, v10.8h | |||
| ld1 {v20.8b}, [x18] | |||
| ld1 {v20.8b}, [x25] | |||
| add x1, x1, x21 | |||
| ssubl v12.8h, v12.8b, v25.8b | |||
| smlal v21.4s, v1.4h, v10.4h | |||
| mov x16, x1 | |||
| add x17, x16, x5 | |||
| add x18, x17, x5 | |||
| add x25, x17, x5 | |||
| smlal2 v22.4s, v1.8h, v10.8h | |||
| ld1 {v9.8b}, [x16], x4 | |||
| ssubl v16.8h, v16.8b, v25.8b | |||
| @@ -159,17 +160,17 @@ HEIGHT1_LOOP: | |||
| smlal2 v24.4s, v5.8h, v16.8h | |||
| smlal v21.4s, v6.4h, v17.4h | |||
| smlal2 v22.4s, v6.8h, v17.8h | |||
| ld1 {v17.8b}, [x18], x4 | |||
| ld1 {v17.8b}, [x25], x4 | |||
| smlal v23.4s, v6.4h, v18.4h | |||
| smlal2 v24.4s, v6.8h, v18.8h | |||
| smlal v21.4s, v7.4h, v18.4h | |||
| smlal2 v22.4s, v7.8h, v18.8h | |||
| ld1 {v18.8b}, [x18], x4 | |||
| ld1 {v18.8b}, [x25], x4 | |||
| smlal v23.4s, v7.4h, v19.4h | |||
| smlal2 v24.4s, v7.8h, v19.8h | |||
| smlal v21.4s, v8.4h, v19.4h | |||
| smlal2 v22.4s, v8.8h, v19.8h | |||
| ld1 {v19.8b}, [x18], x4 | |||
| ld1 {v19.8b}, [x25], x4 | |||
| smlal v23.4s, v8.4h, v20.4h | |||
| smlal2 v24.4s, v8.8h, v20.8h | |||
| @@ -278,7 +279,7 @@ WIDTH2_LEFT: | |||
| smlal2 v24.4s, v1.8h, v11.8h | |||
| smlal v21.4s, v2.4h, v11.4h | |||
| smlal2 v22.4s, v2.8h, v11.8h | |||
| ld1 {v20.8b}, [x18] | |||
| ld1 {v20.8b}, [x25] | |||
| smlal v23.4s, v2.4h, v12.4h | |||
| smlal2 v24.4s, v2.8h, v12.8h | |||
| smlal v21.4s, v3.4h, v13.4h | |||
| @@ -443,12 +444,13 @@ OUTZP3: | |||
| st1 {v21.8b}, [x0], x6 | |||
| End: | |||
| sub sp, sp, #176 | |||
| sub sp, sp, #192 | |||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| ldp x19, x20, [sp], #16 | |||
| ldp x21, x22, [sp], #16 | |||
| ldp x23, x24, [sp], #16 | |||
| ldp x25, x26, [sp], #16 | |||
| ret | |||
| #endif | |||
| @@ -33,12 +33,13 @@ | |||
| // w16: per_channel | |||
| asm_function ConvDw3x3Int8Stride2 | |||
| sub sp, sp, #176 | |||
| sub sp, sp, #192 | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| stp x19, x20, [sp], #16 | |||
| stp x21, x22, [sp], #16 | |||
| stp x23, x24, [sp], #16 | |||
| stp x25, x26, [sp], #16 | |||
| ldr x8, [sp] | |||
| ldr x9, [sp, #8] | |||
| @@ -71,7 +72,7 @@ asm_function ConvDw3x3Int8Stride2 | |||
| mov x16, x1 | |||
| add x17, x16, x5 | |||
| add x18, x17, x5 | |||
| add x25, x17, x5 | |||
| ld1 {v9.8b}, [x16], x4 | |||
| ld1 {v10.8b}, [x16], x4 | |||
| ssubl v9.8h, v9.8b, v28.8b | |||
| @@ -83,11 +84,11 @@ asm_function ConvDw3x3Int8Stride2 | |||
| ssubl v14.8h, v14.8b, v28.8b | |||
| ld1 {v16.8b}, [x17], x4 | |||
| ssubl v15.8h, v15.8b, v28.8b | |||
| ld1 {v19.8b}, [x18], x4 | |||
| ld1 {v19.8b}, [x25], x4 | |||
| ssubl v16.8h, v16.8b, v28.8b | |||
| ld1 {v20.8b}, [x18], x4 | |||
| ld1 {v20.8b}, [x25], x4 | |||
| ssubl v19.8h, v19.8b, v28.8b | |||
| ld1 {v21.8b}, [x18], x4 | |||
| ld1 {v21.8b}, [x25], x4 | |||
| ssubl v20.8h, v20.8b, v28.8b | |||
| ssubl v21.8h, v21.8b, v28.8b | |||
| @@ -108,7 +109,7 @@ HEIGHT1_LOOP: | |||
| ld1 {v17.8b}, [x17], x4 | |||
| ssubl v12.8h, v12.8b, v28.8b | |||
| smlal v26.4s, v0.4h, v11.4h | |||
| ld1 {v22.8b}, [x18], x4 | |||
| ld1 {v22.8b}, [x25], x4 | |||
| ssubl v17.8h, v17.8b, v28.8b | |||
| smlal2 v27.4s, v0.8h, v11.8h | |||
| ld1 {v13.8b}, [x16], x4 | |||
| @@ -117,7 +118,7 @@ HEIGHT1_LOOP: | |||
| ld1 {v18.8b}, [x17], x4 | |||
| ssubl v13.8h, v13.8b, v28.8b | |||
| smlal2 v25.4s, v1.8h, v10.8h | |||
| ld1 {v23.8b}, [x18], x4 | |||
| ld1 {v23.8b}, [x25], x4 | |||
| ssubl v18.8h, v18.8b, v28.8b | |||
| smlal v26.4s, v1.4h, v12.4h | |||
| mov v9.16b, v13.16b | |||
| @@ -157,12 +158,12 @@ HEIGHT1_LOOP: | |||
| smlal2 v27.4s, v6.8h, v21.8h | |||
| smlal v24.4s, v7.4h, v20.4h | |||
| smlal2 v25.4s, v7.8h, v20.8h | |||
| ld1 {v20.8b}, [x18], x4 | |||
| ld1 {v20.8b}, [x25], x4 | |||
| smlal v26.4s, v7.4h, v22.4h | |||
| smlal2 v27.4s, v7.8h, v22.8h | |||
| smlal v24.4s, v8.4h, v21.4h | |||
| smlal2 v25.4s, v8.8h, v21.8h | |||
| ld1 {v21.8b}, [x18], x4 | |||
| ld1 {v21.8b}, [x25], x4 | |||
| ssubl v20.8h, v20.8b, v28.8b | |||
| smlal v26.4s, v8.4h, v23.4h | |||
| ssubl v21.8h, v21.8b, v28.8b | |||
| @@ -260,7 +261,7 @@ WIDTH2_LEFT: | |||
| ld1 {v17.8b}, [x17], x4 | |||
| ssubl v12.8h, v12.8b, v28.8b | |||
| smlal v26.4s, v0.4h, v11.4h | |||
| ld1 {v22.8b}, [x18], x4 | |||
| ld1 {v22.8b}, [x25], x4 | |||
| ssubl v17.8h, v17.8b, v28.8b | |||
| smlal2 v27.4s, v0.8h, v11.8h | |||
| ld1 {v13.8b}, [x16], x4 | |||
| @@ -269,7 +270,7 @@ WIDTH2_LEFT: | |||
| ld1 {v18.8b}, [x17], x4 | |||
| ssubl v13.8h, v13.8b, v28.8b | |||
| smlal2 v25.4s, v1.8h, v10.8h | |||
| ld1 {v23.8b}, [x18], x4 | |||
| ld1 {v23.8b}, [x25], x4 | |||
| ssubl v18.8h, v18.8b, v28.8b | |||
| smlal v26.4s, v1.4h, v12.4h | |||
| ssubl v23.8h, v23.8b, v28.8b | |||
| @@ -452,11 +453,12 @@ OUTZP3: | |||
| st1 {v24.8b}, [x0], x6 | |||
| End: | |||
| sub sp, sp, #176 | |||
| sub sp, sp, #192 | |||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| ldp x19, x20, [sp], #16 | |||
| ldp x21, x22, [sp], #16 | |||
| ldp x23, x24, [sp], #16 | |||
| ldp x25, x26, [sp], #16 | |||
| ret | |||
| #endif | |||
| @@ -19,12 +19,13 @@ asm_function ConvDwFp32Center | |||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | |||
| // x19 ~ x29 should be also preserved | |||
| // whereas our coding style do not permit such amount of parameters | |||
| sub sp, sp, #176 | |||
| sub sp, sp, #192 | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| stp x19, x20, [sp], #16 | |||
| stp x21, x22, [sp], #16 | |||
| stp x23, x24, [sp], #16 | |||
| stp x25, x26, [sp], #16 | |||
| ldr x8, [sp] | |||
| ldr x9, [sp, #8] | |||
| @@ -72,7 +73,7 @@ asm_function ConvDwFp32Center | |||
| mov v14.16b, v24.16b | |||
| mov v15.16b, v24.16b | |||
| LoopKh16: | |||
| mov x18, x7 | |||
| mov x25, x7 | |||
| mov x21, x16 | |||
| LoopKw16: | |||
| mov x22, x21 | |||
| @@ -109,7 +110,7 @@ asm_function ConvDwFp32Center | |||
| ld1 {v23.4s}, [x22], x11 | |||
| fmla v14.4s, v22.4s, v25.4s | |||
| fmla v15.4s, v23.4s, v25.4s | |||
| subs x18, x18, #1 | |||
| subs x25, x25, #1 | |||
| add x21, x21, x13 | |||
| bne LoopKw16 | |||
| add x16, x16, x12 | |||
| @@ -192,7 +193,7 @@ asm_function ConvDwFp32Center | |||
| mov v6.16b, v24.16b | |||
| mov v7.16b, v24.16b | |||
| LoopKh8: | |||
| mov x18, x7 | |||
| mov x25, x7 | |||
| mov x21, x16 | |||
| LoopKw8: | |||
| mov x22, x21 | |||
| @@ -213,7 +214,7 @@ asm_function ConvDwFp32Center | |||
| ld1 {v23.4s}, [x22], x11 | |||
| fmla v6.4s, v22.4s, v25.4s | |||
| fmla v7.4s, v23.4s, v25.4s | |||
| subs x18, x18, #1 | |||
| subs x25, x25, #1 | |||
| add x21, x21, x13 | |||
| bne LoopKw8 | |||
| add x16, x16, x12 | |||
| @@ -261,13 +262,13 @@ asm_function ConvDwFp32Center | |||
| mov x20, x6 | |||
| mov v0.16b, v24.16b | |||
| LoopKh: | |||
| mov x18, x7 | |||
| mov x25, x7 | |||
| mov x22, x16 | |||
| LoopKw: | |||
| ld1 {v16.4s}, [x22], x13 | |||
| ld1 {v25.4s}, [x17], #16 | |||
| fmla v0.4s, v16.4s, v25.4s | |||
| subs x18, x18, #1 | |||
| subs x25, x25, #1 | |||
| bne LoopKw | |||
| add x16, x16, x12 | |||
| subs x20, x20, #1 | |||
| @@ -290,11 +291,12 @@ asm_function ConvDwFp32Center | |||
| subs x4, x4, #1 | |||
| bne LoopH | |||
| sub sp, sp, #176 | |||
| sub sp, sp, #192 | |||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| ldp x19, x20, [sp], #16 | |||
| ldp x21, x22, [sp], #16 | |||
| ldp x23, x24, [sp], #16 | |||
| ldp x25, x26, [sp], #16 | |||
| ret | |||
| #endif | |||
| @@ -13,8 +13,9 @@ | |||
| // x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6 | |||
| asm_function ConvDwFp32Indirect3x3 | |||
| sub sp, sp, #16 | |||
| sub sp, sp, #32 | |||
| stp x19, x20, [sp], #16 | |||
| stp x21, x22, [sp], #16 | |||
| movi v31.4s, #6 | |||
| scvtf v31.4s, v31.4s | |||
| @@ -28,7 +29,7 @@ asm_function ConvDwFp32Indirect3x3 | |||
| ldp x12, x13, [x1] | |||
| ldp x14, x15, [x1, #16] | |||
| ldp x16, x17, [x1, #32] | |||
| ldp x18, x19, [x1, #48] | |||
| ldp x21, x19, [x1, #48] | |||
| ldr x20, [x1, #64] | |||
| mov x9, x2 | |||
| mov x10, x3 | |||
| @@ -56,7 +57,7 @@ asm_function ConvDwFp32Indirect3x3 | |||
| ld1 {v5.4s}, [x17], #16 | |||
| ld1 {v22.4s}, [x9], #16 | |||
| fmla v29.4s, v3.4s, v20.4s | |||
| ld1 {v6.4s}, [x18], #16 | |||
| ld1 {v6.4s}, [x21], #16 | |||
| ld1 {v23.4s}, [x9], #16 | |||
| fmla v29.4s, v4.4s, v21.4s | |||
| ld1 {v7.4s}, [x19], #16 | |||
| @@ -100,7 +101,7 @@ asm_function ConvDwFp32Indirect3x3 | |||
| ld1 {v5.4s}, [x17], #16 | |||
| ld1 {v22.4s}, [x9], #16 | |||
| fmla v29.4s, v3.4s, v20.4s | |||
| ld1 {v6.4s}, [x18], #16 | |||
| ld1 {v6.4s}, [x21], #16 | |||
| ld1 {v23.4s}, [x9], #16 | |||
| fmla v29.4s, v4.4s, v21.4s | |||
| ld1 {v7.4s}, [x19], #16 | |||
| @@ -141,7 +142,8 @@ asm_function ConvDwFp32Indirect3x3 | |||
| cmp x5, #0 | |||
| bgt LoopPixel | |||
| End: | |||
| sub sp, sp, #16 | |||
| sub sp, sp, #32 | |||
| ldp x19, x20, [sp], #16 | |||
| ldp x21, x22, [sp], #16 | |||
| ret | |||
| #endif | |||
| @@ -13,17 +13,18 @@ | |||
| // x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6 | |||
| asm_function ConvDwFp32Indirect5x5 | |||
| sub sp, sp, #160 | |||
| sub sp, sp, #176 | |||
| stp x19, x20, [sp, #64] | |||
| stp x21, x22, [sp, #80] | |||
| stp x23, x24, [sp, #96] | |||
| stp x25, x26, [sp, #112] | |||
| stp x27, x28, [sp, #128] | |||
| stp x29, x30, [sp, #144] | |||
| ldrb w8, [sp, #160] | |||
| ldrb w8, [sp, #176] | |||
| stp x2, x3, [sp] | |||
| stp x4, x6, [sp, #16] | |||
| stp x7, x8, [sp, #32] | |||
| stp x0, x1, [sp, #160] | |||
| movi v31.4s, #6 | |||
| scvtf v31.4s, v31.4s | |||
| @@ -44,7 +45,7 @@ asm_function ConvDwFp32Indirect5x5 | |||
| ldp x12, x13, [x1, #48] | |||
| ldp x14, x15, [x1, #64] | |||
| ldp x16, x17, [x1, #80] | |||
| ldp x18, x19, [x1, #96] | |||
| ldp x0, x19, [x1, #96] | |||
| ldp x20, x21, [x1, #112] | |||
| ldp x22, x23, [x1, #128] | |||
| ldp x24, x25, [x1, #144] | |||
| @@ -93,7 +94,7 @@ asm_function ConvDwFp32Indirect5x5 | |||
| ld1 {v1.4s}, [x17], #16 | |||
| ld1 {v19.4s}, [x5], #16 | |||
| fmla v29.4s, v7.4s, v25.4s | |||
| ld1 {v2.4s}, [x18], #16 | |||
| ld1 {v2.4s}, [x0], #16 | |||
| ld1 {v20.4s}, [x5], #16 | |||
| fmla v29.4s, v16.4s, v26.4s | |||
| ld1 {v3.4s}, [x19], #16 | |||
| @@ -160,7 +161,9 @@ asm_function ConvDwFp32Indirect5x5 | |||
| RELU: | |||
| fmax v29.4s, v29.4s, v30.4s | |||
| WRITE: | |||
| st1 {v29.4s}, [x0], #16 | |||
| ldr x4, [sp, #160] | |||
| st1 {v29.4s}, [x4], #16 | |||
| str x4, [sp, #160] | |||
| ldr x4, [sp, #56] | |||
| ld1 {v29.4s}, [x4], #16 | |||
| @@ -195,7 +198,7 @@ asm_function ConvDwFp32Indirect5x5 | |||
| ld1 {v1.4s}, [x17], #16 | |||
| ld1 {v19.4s}, [x5], #16 | |||
| fmla v29.4s, v7.4s, v25.4s | |||
| ld1 {v2.4s}, [x18], #16 | |||
| ld1 {v2.4s}, [x0], #16 | |||
| ld1 {v20.4s}, [x5], #16 | |||
| fmla v29.4s, v16.4s, v26.4s | |||
| ld1 {v3.4s}, [x19], #16 | |||
| @@ -253,18 +256,24 @@ asm_function ConvDwFp32Indirect5x5 | |||
| LeftWrite: | |||
| cmp x2, #4 | |||
| bne Write3 | |||
| st1 {v29.4s}, [x0], #16 | |||
| ldr x4, [sp, #160] | |||
| st1 {v29.4s}, [x4], #16 | |||
| str x4, [sp, #160] | |||
| b NextPixel | |||
| Write3: | |||
| sxtw x2, w2 | |||
| tbnz w2, #1, Write2 | |||
| tbnz w2, #0, Write1 | |||
| Write2: | |||
| st1 {v29.2s}, [x0], #8 | |||
| ldr x4, [sp, #160] | |||
| st1 {v29.2s}, [x4], #8 | |||
| str x4, [sp, #160] | |||
| ext v29.16b, v29.16b, v29.16b, #8 | |||
| tbz w2, #0, NextPixel | |||
| Write1: | |||
| str s29, [x0], #4 | |||
| ldr x4, [sp, #160] | |||
| str s29, [x4], #4 | |||
| str x4, [sp, #160] | |||
| NextPixel: | |||
| ldr x2, [sp, #24] | |||
| @@ -279,6 +288,6 @@ End: | |||
| ldp x25, x26, [sp, #112] | |||
| ldp x27, x28, [sp, #128] | |||
| ldp x29, x30, [sp, #144] | |||
| add sp, sp, #160 | |||
| add sp, sp, #176 | |||
| ret | |||
| #endif | |||
| @@ -22,12 +22,13 @@ asm_function ConvDwInt8Center | |||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | |||
| // x19 ~ x29 should be also preserved | |||
| // whereas our coding style do not permit such amount of parameters | |||
| sub sp, sp, #176 | |||
| sub sp, sp, #192 | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| stp x19, x20, [sp], #16 | |||
| stp x21, x22, [sp], #16 | |||
| stp x23, x24, [sp], #16 | |||
| stp x25, x26, [sp], #16 | |||
| ldr x8, [sp] | |||
| ldr x9, [sp, #8] | |||
| @@ -51,9 +52,9 @@ asm_function ConvDwInt8Center | |||
| ld1 {v24.4s}, [x17], #16 | |||
| ld1 {v25.4s}, [x17], #16 | |||
| ldr x18, [sp, #80] // right shift | |||
| ld1 {v26.4s}, [x18], #16 | |||
| ld1 {v27.4s}, [x18], #16 | |||
| ldr x25, [sp, #80] // right shift | |||
| ld1 {v26.4s}, [x25], #16 | |||
| ld1 {v27.4s}, [x25], #16 | |||
| ldr x19, [sp, #88] // acc_min | |||
| ld1 {v28.4s}, [x19], #16 | |||
| @@ -90,7 +91,7 @@ asm_function ConvDwInt8Center | |||
| mov v6.16b, v17.16b | |||
| mov v7.16b, v18.16b | |||
| LoopKh4: | |||
| mov x18, x7 | |||
| mov x25, x7 | |||
| mov x21, x16 | |||
| LoopKw4: | |||
| mov x22, x21 | |||
| @@ -116,7 +117,7 @@ asm_function ConvDwInt8Center | |||
| smlal v6.4s, v8.4h, v16.4h | |||
| smlal2 v7.4s, v8.8h, v16.8h | |||
| subs x18, x18, #1 | |||
| subs x25, x25, #1 | |||
| add x21, x21, x13 | |||
| bne LoopKw4 | |||
| add x16, x16, x12 | |||
| @@ -194,15 +195,15 @@ asm_function ConvDwInt8Center | |||
| mov x16, x3 | |||
| add x17, x16, x9 | |||
| add x18, x17, x9 | |||
| add x21, x18, x9 | |||
| add x25, x17, x9 | |||
| add x21, x25, x9 | |||
| st1 {v0.s}[0], [x16], #4 | |||
| st1 {v1.s}[0], [x16], #4 | |||
| st1 {v2.s}[0], [x17], #4 | |||
| st1 {v3.s}[0], [x17], #4 | |||
| st1 {v4.s}[0], [x18], #4 | |||
| st1 {v5.s}[0], [x18], #4 | |||
| st1 {v4.s}[0], [x25], #4 | |||
| st1 {v5.s}[0], [x25], #4 | |||
| st1 {v6.s}[0], [x21], #4 | |||
| st1 {v7.s}[0], [x21], #4 | |||
| @@ -221,7 +222,7 @@ asm_function ConvDwInt8Center | |||
| mov v0.16b, v17.16b | |||
| mov v1.16b, v18.16b | |||
| LoopKh: | |||
| mov x18, x7 | |||
| mov x25, x7 | |||
| mov x22, x16 | |||
| LoopKw: | |||
| ld1 {v15.8b}, [x22], x13 | |||
| @@ -229,7 +230,7 @@ asm_function ConvDwInt8Center | |||
| ld1 {v16.8h}, [x17], #16 | |||
| smlal v0.4s, v14.4h, v16.4h | |||
| smlal2 v1.4s, v14.8h, v16.8h | |||
| subs x18, x18, #1 | |||
| subs x25, x25, #1 | |||
| bne LoopKw | |||
| add x16, x16, x12 | |||
| subs x20, x20, #1 | |||
| @@ -271,11 +272,12 @@ asm_function ConvDwInt8Center | |||
| subs x4, x4, #1 | |||
| bne LoopH | |||
| sub sp, sp, #176 | |||
| sub sp, sp, #192 | |||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| ldp x19, x20, [sp], #16 | |||
| ldp x21, x22, [sp], #16 | |||
| ldp x23, x24, [sp], #16 | |||
| ldp x25, x26, [sp], #16 | |||
| ret | |||
| #endif | |||
| @@ -47,11 +47,11 @@ asm_function ConvSwFp32Center | |||
| LoopH: | |||
| mov x17, x1 | |||
| mov x18, x5 | |||
| mov x28, x5 | |||
| mov x3, x0 | |||
| cmp x18, #8 | |||
| cmp x28, #8 | |||
| blt LoopW | |||
| cmp x18, #16 | |||
| cmp x28, #16 | |||
| blt LoopW8 | |||
| LoopW16: | |||
| @@ -244,12 +244,12 @@ asm_function ConvSwFp32Center | |||
| st1 {v14.4s}, [x3], x9 | |||
| st1 {v15.4s}, [x3], x9 | |||
| add x17, x17, x19 | |||
| sub x18, x18, #16 | |||
| cmp x18, #0 | |||
| sub x28, x28, #16 | |||
| cmp x28, #0 | |||
| ble LoopWEnd | |||
| cmp x18, #8 | |||
| cmp x28, #8 | |||
| blt LoopW | |||
| cmp x18, #16 | |||
| cmp x28, #16 | |||
| bge LoopW16 | |||
| LoopW8: | |||
| mov x19, #8 | |||
| @@ -369,10 +369,10 @@ asm_function ConvSwFp32Center | |||
| st1 {v6.4s}, [x3], x9 | |||
| st1 {v7.4s}, [x3], x9 | |||
| add x17, x17, x19 | |||
| sub x18, x18, #8 | |||
| cmp x18, #0 | |||
| sub x28, x28, #8 | |||
| cmp x28, #0 | |||
| ble LoopWEnd | |||
| cmp x18, #8 | |||
| cmp x28, #8 | |||
| bge LoopW8 | |||
| LoopW: | |||
| mov x20, x17 | |||
| @@ -427,7 +427,7 @@ asm_function ConvSwFp32Center | |||
| Write: | |||
| st1 {v0.4s}, [x3], x9 | |||
| add x17, x17, x12 | |||
| subs x18, x18, #1 | |||
| subs x28, x28, #1 | |||
| bne LoopW | |||
| LoopWEnd: | |||
| add x0, x0, x8 | |||
| @@ -33,12 +33,12 @@ asm_function DeconvDwFp32Center | |||
| mov x16, x1 | |||
| mov x17, x4 | |||
| LoopW: | |||
| mov x18, x15 | |||
| mov x22, x15 | |||
| mov x19, x2 | |||
| mov x20, x5 | |||
| ld1 {v1.4s}, [x16], x8 | |||
| LoopKh: | |||
| mov x21, x18 | |||
| mov x21, x22 | |||
| mov x13, x6 | |||
| LoopKw: | |||
| ld1 {v0.4s}, [x21] | |||
| @@ -47,7 +47,7 @@ asm_function DeconvDwFp32Center | |||
| st1 {v0.4s}, [x21], x12 | |||
| subs x13, x13, #1 | |||
| bne LoopKw | |||
| add x18, x18, x11 | |||
| add x22, x22, x11 | |||
| subs x20, x20, #1 | |||
| bne LoopKh | |||
| add x15, x15, x10 | |||
| @@ -21,30 +21,31 @@ | |||
| // w13: c8_nhwc_c4 | |||
| asm_function MatmulFloatNeon64 | |||
| sub sp, sp, #128 | |||
| sub sp, sp, #144 | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| stp x19, x20, [sp], #16 | |||
| ldr x9, [sp, #8] | |||
| ldr x14, [sp, #16] | |||
| mov w18, #32 // sizeof(float) * 8 | |||
| mul w15, w5, w18 // block stride of lhs/rhs: sizeof(float) * 8 * depth | |||
| mov x18, #4 | |||
| mov w19, #32 // sizeof(float) * 8 | |||
| mul w15, w5, w19 // block stride of lhs/rhs: sizeof(float) * 8 * depth | |||
| mov x19, #4 | |||
| ldr x17, [sp] | |||
| cbz x14, NoWinoSteps | |||
| mul x8, x7, x17 | |||
| mov x11, #8 | |||
| mul x11, x11, x17 | |||
| mul x8, x8, x18 | |||
| mul x11, x11, x18 | |||
| mul x8, x8, x19 | |||
| mul x11, x11, x19 | |||
| NoWinoSteps: | |||
| mul x17, x17, x18 | |||
| mul x17, x17, x19 | |||
| L1: | |||
| mov w10, w6 // reload lhs row | |||
| mov x12, x0 // reload lhs ptr | |||
| mov x18, x2 // reload dst ptr | |||
| mov x19, x2 // reload dst ptr | |||
| L2: | |||
| mov x16, x1 // reload rhs ptr | |||
| @@ -254,435 +255,435 @@ Write: | |||
| b Write8 | |||
| Write1: | |||
| str s8, [x18] | |||
| str s8, [x19] | |||
| cmp w10, #1 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| str s10, [x18] | |||
| add x19, x19, x17 | |||
| str s10, [x19] | |||
| cmp w10, #2 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| str s12, [x18] | |||
| add x19, x19, x17 | |||
| str s12, [x19] | |||
| cmp w10, #3 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| str s14, [x18] | |||
| add x19, x19, x17 | |||
| str s14, [x19] | |||
| cmp w10, #4 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| str s16, [x18] | |||
| add x19, x19, x17 | |||
| str s16, [x19] | |||
| cmp w10, #5 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| str s18, [x18] | |||
| add x19, x19, x17 | |||
| str s18, [x19] | |||
| cmp w10, #6 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| str s20, [x18] | |||
| add x19, x19, x17 | |||
| str s20, [x19] | |||
| cmp w10, #7 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| str s22, [x18] | |||
| add x19, x19, x17 | |||
| str s22, [x19] | |||
| cmp w10, #8 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| str s24, [x18] | |||
| add x19, x19, x17 | |||
| str s24, [x19] | |||
| cmp w10, #9 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| str s26, [x18] | |||
| add x19, x19, x17 | |||
| str s26, [x19] | |||
| cmp w10, #10 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| str s28, [x18] | |||
| add x19, x19, x17 | |||
| str s28, [x19] | |||
| cmp w10, #11 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| str s30, [x18] | |||
| add x18, x18, x17 | |||
| add x19, x19, x17 | |||
| str s30, [x19] | |||
| add x19, x19, x17 | |||
| b WriteEnd | |||
| Write2: | |||
| dup s9, v8.s[1] | |||
| stp s8, s9, [x18] | |||
| stp s8, s9, [x19] | |||
| cmp w10, #1 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| add x19, x19, x17 | |||
| dup s11, v10.s[1] | |||
| stp s10, s11, [x18] | |||
| stp s10, s11, [x19] | |||
| cmp w10, #2 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| add x19, x19, x17 | |||
| dup s13, v12.s[1] | |||
| stp s12, s13, [x18] | |||
| stp s12, s13, [x19] | |||
| cmp w10, #3 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| add x19, x19, x17 | |||
| dup s15, v14.s[1] | |||
| stp s14, s15, [x18] | |||
| stp s14, s15, [x19] | |||
| cmp w10, #4 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| add x19, x19, x17 | |||
| dup s17, v16.s[1] | |||
| stp s16, s17, [x18] | |||
| stp s16, s17, [x19] | |||
| cmp w10, #5 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| add x19, x19, x17 | |||
| dup s19, v18.s[1] | |||
| stp s18, s19, [x18] | |||
| stp s18, s19, [x19] | |||
| cmp w10, #6 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| add x19, x19, x17 | |||
| dup s21, v20.s[1] | |||
| stp s20, s21, [x18] | |||
| stp s20, s21, [x19] | |||
| cmp w10, #7 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| add x19, x19, x17 | |||
| dup s23, v22.s[1] | |||
| stp s22, s23, [x18] | |||
| stp s22, s23, [x19] | |||
| cmp w10, #8 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| add x19, x19, x17 | |||
| dup s25, v24.s[1] | |||
| stp s24, s25, [x18] | |||
| stp s24, s25, [x19] | |||
| cmp w10, #9 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| add x19, x19, x17 | |||
| dup s27, v26.s[1] | |||
| stp s26, s27, [x18] | |||
| stp s26, s27, [x19] | |||
| cmp w10, #10 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| add x19, x19, x17 | |||
| dup s29, v28.s[1] | |||
| stp s28, s29, [x18] | |||
| stp s28, s29, [x19] | |||
| cmp w10, #11 | |||
| beq WriteEnd | |||
| add x18, x18, x17 | |||
| add x19, x19, x17 | |||
| dup s31, v30.s[1] | |||
| stp s30, s31, [x18] | |||
| add x18, x18, x17 | |||
| stp s30, s31, [x19] | |||
| add x19, x19, x17 | |||
| b WriteEnd | |||
| Write3: | |||
| add x13, x18, #8 | |||
| add x13, x19, #8 | |||
| dup s9, v8.s[1] | |||
| stp s8, s9, [x18] | |||
| add x18, x18, x17 | |||
| stp s8, s9, [x19] | |||
| add x19, x19, x17 | |||
| st1 {v8.s}[2], [x13], x17 | |||
| cmp w10, #1 | |||
| beq WriteEnd | |||
| dup s11, v10.s[1] | |||
| stp s10, s11, [x18] | |||
| add x18, x18, x17 | |||
| stp s10, s11, [x19] | |||
| add x19, x19, x17 | |||
| st1 {v10.s}[2], [x13], x17 | |||
| cmp w10, #2 | |||
| beq WriteEnd | |||
| dup s13, v12.s[1] | |||
| stp s12, s13, [x18] | |||
| add x18, x18, x17 | |||
| stp s12, s13, [x19] | |||
| add x19, x19, x17 | |||
| st1 {v12.s}[2], [x13], x17 | |||
| cmp w10, #3 | |||
| beq WriteEnd | |||
| dup s15, v14.s[1] | |||
| stp s14, s15, [x18] | |||
| add x18, x18, x17 | |||
| stp s14, s15, [x19] | |||
| add x19, x19, x17 | |||
| st1 {v14.s}[2], [x13], x17 | |||
| cmp w10, #4 | |||
| beq WriteEnd | |||
| dup s17, v16.s[1] | |||
| stp s16, s17, [x18] | |||
| add x18, x18, x17 | |||
| stp s16, s17, [x19] | |||
| add x19, x19, x17 | |||
| st1 {v16.s}[2], [x13], x17 | |||
| cmp w10, #5 | |||
| beq WriteEnd | |||
| dup s19, v18.s[1] | |||
| stp s18, s19, [x18] | |||
| add x18, x18, x17 | |||
| stp s18, s19, [x19] | |||
| add x19, x19, x17 | |||
| st1 {v18.s}[2], [x13], x17 | |||
| cmp w10, #6 | |||
| beq WriteEnd | |||
| dup s21, v20.s[1] | |||
| stp s20, s21, [x18] | |||
| add x18, x18, x17 | |||
| stp s20, s21, [x19] | |||
| add x19, x19, x17 | |||
| st1 {v20.s}[2], [x13], x17 | |||
| cmp w10, #7 | |||
| beq WriteEnd | |||
| dup s23, v22.s[1] | |||
| stp s22, s23, [x18] | |||
| add x18, x18, x17 | |||
| stp s22, s23, [x19] | |||
| add x19, x19, x17 | |||
| st1 {v22.s}[2], [x13], x17 | |||
| cmp w10, #8 | |||
| beq WriteEnd | |||
| dup s25, v24.s[1] | |||
| stp s24, s25, [x18] | |||
| add x18, x18, x17 | |||
| stp s24, s25, [x19] | |||
| add x19, x19, x17 | |||
| st1 {v24.s}[2], [x13], x17 | |||
| cmp w10, #9 | |||
| beq WriteEnd | |||
| dup s27, v26.s[1] | |||
| stp s26, s27, [x18] | |||
| add x18, x18, x17 | |||
| stp s26, s27, [x19] | |||
| add x19, x19, x17 | |||
| st1 {v26.s}[2], [x13], x17 | |||
| cmp w10, #10 | |||
| beq WriteEnd | |||
| dup s29, v28.s[1] | |||
| stp s28, s29, [x18] | |||
| add x18, x18, x17 | |||
| stp s28, s29, [x19] | |||
| add x19, x19, x17 | |||
| st1 {v28.s}[2], [x13], x17 | |||
| cmp w10, #11 | |||
| beq WriteEnd | |||
| dup s31, v30.s[1] | |||
| stp s30, s31, [x18] | |||
| add x18, x18, x17 | |||
| stp s30, s31, [x19] | |||
| add x19, x19, x17 | |||
| st1 {v30.s}[2], [x13] | |||
| b WriteEnd | |||
| Write4: | |||
| st1 {v8.4s}, [x18], x17 | |||
| st1 {v8.4s}, [x19], x17 | |||
| cmp w10, #1 | |||
| beq WriteEnd | |||
| st1 {v10.4s}, [x18], x17 | |||
| st1 {v10.4s}, [x19], x17 | |||
| cmp w10, #2 | |||
| beq WriteEnd | |||
| st1 {v12.4s}, [x18], x17 | |||
| st1 {v12.4s}, [x19], x17 | |||
| cmp w10, #3 | |||
| beq WriteEnd | |||
| st1 {v14.4s}, [x18], x17 | |||
| st1 {v14.4s}, [x19], x17 | |||
| cmp w10, #4 | |||
| beq WriteEnd | |||
| st1 {v16.4s}, [x18], x17 | |||
| st1 {v16.4s}, [x19], x17 | |||
| cmp w10, #5 | |||
| beq WriteEnd | |||
| st1 {v18.4s}, [x18], x17 | |||
| st1 {v18.4s}, [x19], x17 | |||
| cmp w10, #6 | |||
| beq WriteEnd | |||
| st1 {v20.4s}, [x18], x17 | |||
| st1 {v20.4s}, [x19], x17 | |||
| cmp w10, #7 | |||
| beq WriteEnd | |||
| st1 {v22.4s}, [x18], x17 | |||
| st1 {v22.4s}, [x19], x17 | |||
| cmp w10, #8 | |||
| beq WriteEnd | |||
| st1 {v24.4s}, [x18], x17 | |||
| st1 {v24.4s}, [x19], x17 | |||
| cmp w10, #9 | |||
| beq WriteEnd | |||
| st1 {v26.4s}, [x18], x17 | |||
| st1 {v26.4s}, [x19], x17 | |||
| cmp w10, #10 | |||
| beq WriteEnd | |||
| st1 {v28.4s}, [x18], x17 | |||
| st1 {v28.4s}, [x19], x17 | |||
| cmp w10, #11 | |||
| beq WriteEnd | |||
| st1 {v30.4s}, [x18], x17 | |||
| st1 {v30.4s}, [x19], x17 | |||
| b WriteEnd | |||
| Write5: | |||
| add x13, x18, #16 | |||
| st1 {v8.4s}, [x18], x17 | |||
| add x13, x19, #16 | |||
| st1 {v8.4s}, [x19], x17 | |||
| str s9, [x13] | |||
| cmp w10, #1 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v10.4s}, [x18], x17 | |||
| st1 {v10.4s}, [x19], x17 | |||
| str s11, [x13] | |||
| cmp w10, #2 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v12.4s}, [x18], x17 | |||
| st1 {v12.4s}, [x19], x17 | |||
| str s13, [x13] | |||
| cmp w10, #3 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v14.4s}, [x18], x17 | |||
| st1 {v14.4s}, [x19], x17 | |||
| str s15, [x13] | |||
| cmp w10, #4 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v16.4s}, [x18], x17 | |||
| st1 {v16.4s}, [x19], x17 | |||
| str s17, [x13] | |||
| cmp w10, #5 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v18.4s}, [x18], x17 | |||
| st1 {v18.4s}, [x19], x17 | |||
| str s19, [x13] | |||
| cmp w10, #6 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v20.4s}, [x18], x17 | |||
| st1 {v20.4s}, [x19], x17 | |||
| str s21, [x13] | |||
| cmp w10, #7 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v22.4s}, [x18], x17 | |||
| st1 {v22.4s}, [x19], x17 | |||
| str s23, [x13] | |||
| cmp w10, #8 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v24.4s}, [x18], x17 | |||
| st1 {v24.4s}, [x19], x17 | |||
| str s25, [x13] | |||
| cmp w10, #9 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v26.4s}, [x18], x17 | |||
| st1 {v26.4s}, [x19], x17 | |||
| str s27, [x13] | |||
| cmp w10, #10 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v28.4s}, [x18], x17 | |||
| st1 {v28.4s}, [x19], x17 | |||
| str s29, [x13] | |||
| cmp w10, #11 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v30.4s}, [x18], x17 | |||
| st1 {v30.4s}, [x19], x17 | |||
| str s31, [x13] | |||
| b WriteEnd | |||
| Write6: | |||
| add x13, x18, #16 | |||
| st1 {v8.4s}, [x18], x17 | |||
| add x13, x19, #16 | |||
| st1 {v8.4s}, [x19], x17 | |||
| dup s8, v9.s[1] | |||
| stp s9, s8, [x13] | |||
| cmp w10, #1 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v10.4s}, [x18], x17 | |||
| st1 {v10.4s}, [x19], x17 | |||
| dup s10, v11.s[1] | |||
| stp s11, s10, [x13] | |||
| cmp w10, #2 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v12.4s}, [x18], x17 | |||
| st1 {v12.4s}, [x19], x17 | |||
| dup s12, v13.s[1] | |||
| stp s13, s12, [x13] | |||
| cmp w10, #3 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v14.4s}, [x18], x17 | |||
| st1 {v14.4s}, [x19], x17 | |||
| dup s14, v15.s[1] | |||
| stp s15, s14, [x13] | |||
| cmp w10, #4 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v16.4s}, [x18], x17 | |||
| st1 {v16.4s}, [x19], x17 | |||
| dup s16, v17.s[1] | |||
| stp s17, s16, [x13] | |||
| cmp w10, #5 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v18.4s}, [x18], x17 | |||
| st1 {v18.4s}, [x19], x17 | |||
| dup s18, v19.s[1] | |||
| stp s19, s18, [x13] | |||
| cmp w10, #6 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v20.4s}, [x18], x17 | |||
| st1 {v20.4s}, [x19], x17 | |||
| dup s20, v21.s[1] | |||
| stp s21, s20, [x13] | |||
| cmp w10, #7 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v22.4s}, [x18], x17 | |||
| st1 {v22.4s}, [x19], x17 | |||
| dup s22, v23.s[1] | |||
| stp s23, s22, [x13] | |||
| cmp w10, #8 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v24.4s}, [x18], x17 | |||
| st1 {v24.4s}, [x19], x17 | |||
| dup s24, v25.s[1] | |||
| stp s25, s24, [x13] | |||
| cmp w10, #9 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v26.4s}, [x18], x17 | |||
| st1 {v26.4s}, [x19], x17 | |||
| dup s26, v27.s[1] | |||
| stp s27, s26, [x13] | |||
| cmp w10, #10 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v28.4s}, [x18], x17 | |||
| st1 {v28.4s}, [x19], x17 | |||
| dup s28, v29.s[1] | |||
| stp s29, s28, [x13] | |||
| cmp w10, #11 | |||
| beq WriteEnd | |||
| add x13, x13, x17 | |||
| st1 {v30.4s}, [x18], x17 | |||
| st1 {v30.4s}, [x19], x17 | |||
| dup s30, v31.s[1] | |||
| stp s31, s30, [x13] | |||
| b WriteEnd | |||
| Write7: | |||
| add x13, x18, #16 | |||
| add x16, x18, #24 | |||
| st1 {v8.4s}, [x18], x17 | |||
| add x13, x19, #16 | |||
| add x16, x19, #24 | |||
| st1 {v8.4s}, [x19], x17 | |||
| dup s8, v9.s[1] | |||
| stp s9, s8, [x13] | |||
| add x13, x13, x17 | |||
| st1 {v9.s}[2], [x16], x17 | |||
| cmp w10, #1 | |||
| beq WriteEnd | |||
| st1 {v10.4s}, [x18], x17 | |||
| st1 {v10.4s}, [x19], x17 | |||
| dup s10, v11.s[1] | |||
| stp s11, s10, [x13] | |||
| add x13, x13, x17 | |||
| st1 {v11.s}[2], [x16], x17 | |||
| cmp w10, #2 | |||
| beq WriteEnd | |||
| st1 {v12.4s}, [x18], x17 | |||
| st1 {v12.4s}, [x19], x17 | |||
| dup s12, v13.s[1] | |||
| stp s13, s12, [x13] | |||
| add x13, x13, x17 | |||
| st1 {v13.s}[2], [x16], x17 | |||
| cmp w10, #3 | |||
| beq WriteEnd | |||
| st1 {v14.4s}, [x18], x17 | |||
| st1 {v14.4s}, [x19], x17 | |||
| dup s14, v15.s[1] | |||
| stp s15, s14, [x13] | |||
| add x13, x13, x17 | |||
| st1 {v15.s}[2], [x16], x17 | |||
| cmp w10, #4 | |||
| beq WriteEnd | |||
| st1 {v16.4s}, [x18], x17 | |||
| st1 {v16.4s}, [x19], x17 | |||
| dup s16, v17.s[1] | |||
| stp s17, s16, [x13] | |||
| add x13, x13, x17 | |||
| st1 {v17.s}[2], [x16], x17 | |||
| cmp w10, #5 | |||
| beq WriteEnd | |||
| st1 {v18.4s}, [x18], x17 | |||
| st1 {v18.4s}, [x19], x17 | |||
| dup s18, v19.s[1] | |||
| stp s19, s18, [x13] | |||
| add x13, x13, x17 | |||
| st1 {v19.s}[2], [x16], x17 | |||
| cmp w10, #6 | |||
| beq WriteEnd | |||
| st1 {v20.4s}, [x18], x17 | |||
| st1 {v20.4s}, [x19], x17 | |||
| dup s20, v21.s[1] | |||
| stp s21, s20, [x13] | |||
| add x13, x13, x17 | |||
| st1 {v21.s}[2], [x16], x17 | |||
| cmp w10, #7 | |||
| beq WriteEnd | |||
| st1 {v22.4s}, [x18], x17 | |||
| st1 {v22.4s}, [x19], x17 | |||
| dup s22, v23.s[1] | |||
| stp s23, s22, [x13] | |||
| add x13, x13, x17 | |||
| st1 {v23.s}[2], [x16], x17 | |||
| cmp w10, #8 | |||
| beq WriteEnd | |||
| st1 {v24.4s}, [x18], x17 | |||
| st1 {v24.4s}, [x19], x17 | |||
| dup s24, v25.s[1] | |||
| stp s25, s24, [x13] | |||
| add x13, x13, x17 | |||
| st1 {v25.s}[2], [x16], x17 | |||
| cmp w10, #9 | |||
| beq WriteEnd | |||
| st1 {v26.4s}, [x18], x17 | |||
| st1 {v26.4s}, [x19], x17 | |||
| dup s26, v27.s[1] | |||
| stp s27, s26, [x13] | |||
| add x13, x13, x17 | |||
| st1 {v27.s}[2], [x16], x17 | |||
| cmp w10, #10 | |||
| beq WriteEnd | |||
| st1 {v28.4s}, [x18], x17 | |||
| st1 {v28.4s}, [x19], x17 | |||
| dup s28, v29.s[1] | |||
| stp s29, s28, [x13] | |||
| add x13, x13, x17 | |||
| st1 {v29.s}[2], [x16], x17 | |||
| cmp w10, #11 | |||
| beq WriteEnd | |||
| st1 {v30.4s}, [x18], x17 | |||
| st1 {v30.4s}, [x19], x17 | |||
| dup s30, v31.s[1] | |||
| stp s31, s30, [x13] | |||
| add x13, x13, x17 | |||
| @@ -697,54 +698,54 @@ WriteC8: | |||
| st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x2], #64 | |||
| b WriteEnd | |||
| WriteWino: | |||
| st1 {v8.4s, v9.4s}, [x18], x8 | |||
| st1 {v10.4s, v11.4s}, [x18], x8 | |||
| st1 {v12.4s, v13.4s}, [x18], x8 | |||
| st1 {v14.4s, v15.4s}, [x18], x8 | |||
| st1 {v16.4s, v17.4s}, [x18], x8 | |||
| st1 {v18.4s, v19.4s}, [x18], x8 | |||
| st1 {v20.4s, v21.4s}, [x18], x8 | |||
| st1 {v22.4s, v23.4s}, [x18], x8 | |||
| st1 {v24.4s, v25.4s}, [x18], x8 | |||
| st1 {v26.4s, v27.4s}, [x18], x8 | |||
| st1 {v28.4s, v29.4s}, [x18], x8 | |||
| st1 {v30.4s, v31.4s}, [x18], x8 | |||
| st1 {v8.4s, v9.4s}, [x19], x8 | |||
| st1 {v10.4s, v11.4s}, [x19], x8 | |||
| st1 {v12.4s, v13.4s}, [x19], x8 | |||
| st1 {v14.4s, v15.4s}, [x19], x8 | |||
| st1 {v16.4s, v17.4s}, [x19], x8 | |||
| st1 {v18.4s, v19.4s}, [x19], x8 | |||
| st1 {v20.4s, v21.4s}, [x19], x8 | |||
| st1 {v22.4s, v23.4s}, [x19], x8 | |||
| st1 {v24.4s, v25.4s}, [x19], x8 | |||
| st1 {v26.4s, v27.4s}, [x19], x8 | |||
| st1 {v28.4s, v29.4s}, [x19], x8 | |||
| st1 {v30.4s, v31.4s}, [x19], x8 | |||
| b WriteEnd | |||
| Write8: | |||
| st1 {v8.4s, v9.4s}, [x18], x17 | |||
| st1 {v8.4s, v9.4s}, [x19], x17 | |||
| cmp w10, #1 | |||
| beq WriteEnd | |||
| st1 {v10.4s, v11.4s}, [x18], x17 | |||
| st1 {v10.4s, v11.4s}, [x19], x17 | |||
| cmp w10, #2 | |||
| beq WriteEnd | |||
| st1 {v12.4s, v13.4s}, [x18], x17 | |||
| st1 {v12.4s, v13.4s}, [x19], x17 | |||
| cmp w10, #3 | |||
| beq WriteEnd | |||
| st1 {v14.4s, v15.4s}, [x18], x17 | |||
| st1 {v14.4s, v15.4s}, [x19], x17 | |||
| cmp w10, #4 | |||
| beq WriteEnd | |||
| st1 {v16.4s, v17.4s}, [x18], x17 | |||
| st1 {v16.4s, v17.4s}, [x19], x17 | |||
| cmp w10, #5 | |||
| beq WriteEnd | |||
| st1 {v18.4s, v19.4s}, [x18], x17 | |||
| st1 {v18.4s, v19.4s}, [x19], x17 | |||
| cmp w10, #6 | |||
| beq WriteEnd | |||
| st1 {v20.4s, v21.4s}, [x18], x17 | |||
| st1 {v20.4s, v21.4s}, [x19], x17 | |||
| cmp w10, #7 | |||
| beq WriteEnd | |||
| st1 {v22.4s, v23.4s}, [x18], x17 | |||
| st1 {v22.4s, v23.4s}, [x19], x17 | |||
| cmp w10, #8 | |||
| beq WriteEnd | |||
| st1 {v24.4s, v25.4s}, [x18], x17 | |||
| st1 {v24.4s, v25.4s}, [x19], x17 | |||
| cmp w10, #9 | |||
| beq WriteEnd | |||
| st1 {v26.4s, v27.4s}, [x18], x17 | |||
| st1 {v26.4s, v27.4s}, [x19], x17 | |||
| cmp w10, #10 | |||
| beq WriteEnd | |||
| st1 {v28.4s, v29.4s}, [x18], x17 | |||
| st1 {v28.4s, v29.4s}, [x19], x17 | |||
| cmp w10, #11 | |||
| beq WriteEnd | |||
| st1 {v30.4s, v31.4s}, [x18], x17 | |||
| st1 {v30.4s, v31.4s}, [x19], x17 | |||
| WriteEnd: | |||
| subs w10, w10, #12 // lhs row - 12 | |||
| @@ -766,8 +767,9 @@ NoDstStep: | |||
| bgt L1 | |||
| End1: | |||
| sub sp, sp, #128 | |||
| sub sp, sp, #144 | |||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| ldp x19, x20, [sp], #16 | |||
| ret | |||
| #endif | |||
| @@ -21,31 +21,32 @@ | |||
| // x9: writeMode | |||
| asm_function MatmulFloatNeon64Opt | |||
| sub sp, sp, #144 | |||
| sub sp, sp, #160 | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| stp x19, x20, [sp], #16 | |||
| stp x21, x22, [sp], #16 | |||
| ldr x8, [sp] | |||
| ldr x9, [sp, #8] | |||
| mov x18, #48 // sizeof(float) * 12 | |||
| mul x17, x5, x18 // block stride of lhs/rhs: sizeof(float) * 12 * depth | |||
| mov x21, #48 // sizeof(float) * 12 | |||
| mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth | |||
| cbnz x9, NoC8Steps | |||
| mov x11, x2 | |||
| mov x18, #32 | |||
| mul x16, x6, x18 // row * 8 * sizeof(float) | |||
| mov x21, #32 | |||
| mul x16, x6, x21 // row * 8 * sizeof(float) | |||
| NoC8Steps: | |||
| cmp x9, #2 | |||
| bne NoWinoSteps | |||
| mov x18, #4 | |||
| mov x21, #4 | |||
| mul x15, x7, x8 | |||
| mul x15, x15, x18 // x15 = kernel_size * col * sizeof(float) | |||
| mov x18, #32 | |||
| mul x16, x8, x18 // x16 = kernel_size * 8 * sizeof(float) | |||
| mul x15, x15, x21 // x15 = kernel_size * col * sizeof(float) | |||
| mov x21, #32 | |||
| mul x16, x8, x21 // x16 = kernel_size * 8 * sizeof(float) | |||
| NoWinoSteps: | |||
| mov x18, #4 | |||
| mul x8, x8, x18 | |||
| mov x21, #4 | |||
| mul x8, x8, x21 | |||
| LoopRowStart: | |||
| cmp x6, #4 | |||
| @@ -1117,9 +1118,9 @@ LoopRow4: | |||
| LoopColEnd: | |||
| add x0, x0, x17 | |||
| cbz x9, C8DstStep | |||
| mov x18, #4 | |||
| mul x18, x18, x7 | |||
| sub x11, x11, x18 | |||
| mov x21, #4 | |||
| mul x21, x21, x7 | |||
| sub x11, x11, x21 | |||
| mov x2, x11 | |||
| b NoDstStep | |||
| C8DstStep: | |||
| @@ -1129,9 +1130,10 @@ LoopColEnd: | |||
| subs x6, x6, #12 | |||
| bgt LoopRowStart | |||
| sub sp, sp, #144 | |||
| sub sp, sp, #160 | |||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| ldp x19, x20, [sp], #16 | |||
| ldp x21, x22, [sp], #16 | |||
| ret | |||
| #endif | |||
| @@ -67,7 +67,7 @@ L2: | |||
| cmp w16, #0 | |||
| beq End2 | |||
| mov x18, x1 // reload b ptr | |||
| mov x28, x1 // reload b ptr | |||
| mov x19, x7 // reload bias ptr | |||
| mov w20, w5 // reload depth | |||
| dup v16.4s, wzr | |||
| @@ -94,10 +94,10 @@ L3: | |||
| ld1 {v1.16b}, [x17], #16 | |||
| ld1 {v2.16b}, [x17], #16 | |||
| ld1 {v3.16b}, [x17], #16 | |||
| ld1 {v4.16b}, [x18], #16 | |||
| ld1 {v5.16b}, [x18], #16 | |||
| ld1 {v6.16b}, [x18], #16 | |||
| ld1 {v7.16b}, [x18], #16 | |||
| ld1 {v4.16b}, [x28], #16 | |||
| ld1 {v5.16b}, [x28], #16 | |||
| ld1 {v6.16b}, [x28], #16 | |||
| ld1 {v7.16b}, [x28], #16 | |||
| smull v8.8h, v4.8b, v0.8b | |||
| smull v9.8h, v5.8b, v0.8b | |||
| @@ -30,7 +30,7 @@ | |||
| // x28: filter_zp | |||
| asm_function MatmulInt8Opt | |||
| sub sp, sp, #208 | |||
| sub sp, sp, #224 | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| stp x19, x20, [sp], #16 | |||
| @@ -38,6 +38,7 @@ asm_function MatmulInt8Opt | |||
| stp x23, x24, [sp], #16 | |||
| stp x25, x26, [sp], #16 | |||
| stp x27, x28, [sp], #16 | |||
| stp x29, x30, [sp], #16 | |||
| ldr w8, [sp] | |||
| ldr w9, [sp, #8] | |||
| @@ -55,7 +56,7 @@ asm_function MatmulInt8Opt | |||
| LoopRow: | |||
| mov x16, x1 // reload rhs ptr | |||
| mov x17, x4 // reload rhs col | |||
| mov x18, x7 // reload bias ptr | |||
| mov x29, x7 // reload bias ptr | |||
| mov x27, x2 // reload dst ptr | |||
| ldr x28, [sp, #64] // reload filter_zp | |||
| @@ -158,7 +159,7 @@ LoopRow: | |||
| Bias: | |||
| cbz x7, NoBias | |||
| ld1 {v15.4s}, [x18], #16 | |||
| ld1 {v15.4s}, [x29], #16 | |||
| add v16.4s, v16.4s, v15.4s | |||
| add v17.4s, v17.4s, v15.4s | |||
| add v18.4s, v18.4s, v15.4s | |||
| @@ -330,7 +331,7 @@ LoopColEnd: | |||
| b LoopRow | |||
| LoopRowEnd: | |||
| sub sp, sp, #208 | |||
| sub sp, sp, #224 | |||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| ldp x19, x20, [sp], #16 | |||
| @@ -338,5 +339,6 @@ LoopRowEnd: | |||
| ldp x23, x24, [sp], #16 | |||
| ldp x25, x26, [sp], #16 | |||
| ldp x27, x28, [sp], #16 | |||
| ldp x29, x30, [sp], #16 | |||
| ret | |||
| #endif | |||
| @@ -20,9 +20,10 @@ | |||
| // x7: bias | |||
| asm_function MatMulR4Int8Neon64 | |||
| sub sp, sp, #128 | |||
| sub sp, sp, #144 | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| stp x19, x20, [sp], #16 | |||
| mov w15, #0 // b col index | |||
| mov w16, #0 // a row index | |||
| @@ -40,7 +41,7 @@ L2: | |||
| cmp w16, w3 | |||
| beq End2 | |||
| mov x18, x1 // reload b ptr | |||
| mov x19, x1 // reload b ptr | |||
| mov x10, x7 // reload bias ptr | |||
| mov w11, w5 // reload depth | |||
| dup v16.4s, wzr | |||
| @@ -67,10 +68,10 @@ L3: | |||
| ld1 {v1.16b}, [x17], #16 | |||
| ld1 {v2.16b}, [x17], #16 | |||
| ld1 {v3.16b}, [x17], #16 | |||
| ld1 {v4.16b}, [x18], #16 | |||
| ld1 {v5.16b}, [x18], #16 | |||
| ld1 {v6.16b}, [x18], #16 | |||
| ld1 {v7.16b}, [x18], #16 | |||
| ld1 {v4.16b}, [x19], #16 | |||
| ld1 {v5.16b}, [x19], #16 | |||
| ld1 {v6.16b}, [x19], #16 | |||
| ld1 {v7.16b}, [x19], #16 | |||
| smull v8.8h, v4.8b, v0.8b | |||
| smull v9.8h, v5.8b, v0.8b | |||
| @@ -172,8 +173,9 @@ End2: | |||
| b L1 | |||
| End1: | |||
| sub sp, sp, #128 | |||
| sub sp, sp, #144 | |||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| ldp x19, x20, [sp], #16 | |||
| ret | |||
| #endif | |||
| @@ -30,13 +30,13 @@ asm_function MatrixMultiplyWinograd | |||
| mov x14, x1 // mat_b | |||
| LoopN: | |||
| mov x16, x0 // mat_a_m | |||
| sub x18, x5, x15 // ni | |||
| sub x22, x5, x15 // ni | |||
| sub x19, x17, x3 // mi | |||
| mul x18, x18, x17 // ni * m | |||
| mul x22, x22, x17 // ni * m | |||
| mov x11, x6 // in_channel | |||
| add x18, x18, x19 // (ni * m) + mi | |||
| mul x18, x18, x7 // x18 * c4_channel | |||
| add x20, x2, x18 // dst + offset | |||
| add x22, x22, x19 // (ni * m) + mi | |||
| mul x22, x22, x7 // x22 * c4_channel | |||
| add x20, x2, x22 // dst + offset | |||
| cmp x11, #16 | |||
| bge LoopC16 | |||
| cmp x11, #8 | |||
| @@ -1,6 +1,5 @@ | |||
| #ifdef __aarch64__ | |||
| #include "nnacl/assembly_global.h" | |||
| .text | |||
| .align 5 | |||
| //.p2align 5,,15 | |||
| @@ -55,16 +55,16 @@ LoopH: | |||
| ld1 {v0.s}[2], [x17], x10 | |||
| ld1 {v0.s}[3], [x17], x10 | |||
| mov x11, x6 | |||
| mov x18, x17 | |||
| add x18, x14, x7 | |||
| add x16, x18, x7 | |||
| mov x20, x17 | |||
| add x20, x14, x7 | |||
| add x16, x20, x7 | |||
| add x19, x16, x7 | |||
| LoopLength4: | |||
| ld1 {v16.4s}, [x2] | |||
| ld1 {v20.4s}, [x14], #16 | |||
| fmla v16.4s, v20.4s, v0.s[0] | |||
| ld1 {v21.4s}, [x18], #16 | |||
| ld1 {v21.4s}, [x20], #16 | |||
| fmul v17.4s, v21.4s, v0.s[1] | |||
| ld1 {v20.4s}, [x16], #16 | |||
| fmla v16.4s, v20.4s, v0.s[2] | |||
| @@ -90,14 +90,14 @@ LoopH: | |||
| ld1 {v0.s}[1], [x17], x10 | |||
| ld1 {v0.s}[2], [x17], x10 | |||
| mov x11, x6 | |||
| mov x18, x17 | |||
| add x18, x14, x7 | |||
| add x16, x18, x7 | |||
| mov x20, x17 | |||
| add x20, x14, x7 | |||
| add x16, x20, x7 | |||
| LoopLength3: | |||
| ld1 {v16.4s}, [x2] | |||
| ld1 {v20.4s}, [x14], #16 | |||
| fmla v16.4s, v20.4s, v0.s[0] | |||
| ld1 {v21.4s}, [x18], #16 | |||
| ld1 {v21.4s}, [x20], #16 | |||
| fmul v17.4s, v21.4s, v0.s[1] | |||
| ld1 {v20.4s}, [x16], #16 | |||
| fmla v16.4s, v20.4s, v0.s[2] | |||
| @@ -18,6 +18,9 @@ asm_function WinogradTransRight | |||
| //x5: k | |||
| //x6: length | |||
| sub sp, sp, #16 | |||
| stp x19, x20, [sp], #16 | |||
| mov x8, #16 // 4 * sizeof(float) | |||
| mul x8, x6, x8 | |||
| mul x9, x5, x8 // step for S | |||
| @@ -43,7 +46,7 @@ LoopH: | |||
| cmp x12, #4 | |||
| blt LoopKStart3 | |||
| mov x16, x15 | |||
| mov x18, x4 | |||
| mov x19, x4 | |||
| LoopK4: | |||
| ld1 {v0.s}[0], [x13], x10 | |||
| ld1 {v0.s}[1], [x13], x10 | |||
| @@ -54,7 +57,7 @@ LoopH: | |||
| add x14, x17, x8 | |||
| add x16, x14, x8 | |||
| add x18, x16, x8 | |||
| add x19, x16, x8 | |||
| LoopLength4: | |||
| ld1 {v16.4s}, [x2] | |||
| @@ -64,7 +67,7 @@ LoopH: | |||
| fmul v17.4s, v21.4s, v0.s[1] | |||
| ld1 {v20.4s}, [x16], #16 | |||
| fmla v16.4s, v20.4s, v0.s[2] | |||
| ld1 {v21.4s}, [x18], #16 | |||
| ld1 {v21.4s}, [x19], #16 | |||
| fmla v17.4s, v21.4s, v0.s[3] | |||
| fadd v17.4s, v16.4s, v17.4s | |||
| @@ -73,7 +76,7 @@ LoopH: | |||
| bne LoopLength4 | |||
| sub x2, x2, x8 | |||
| sub x12, x12, #4 | |||
| mov x17, x18 | |||
| mov x17, x19 | |||
| cmp x12, #4 | |||
| bge LoopK4 | |||
| @@ -107,7 +110,7 @@ LoopH: | |||
| bne LoopLength3 | |||
| sub x2, x2, x8 | |||
| sub x12, x12, #3 | |||
| mov x17, x18 | |||
| mov x17, x19 | |||
| cmp x12, #3 | |||
| bge LoopK3 | |||
| @@ -141,5 +144,7 @@ LoopH: | |||
| subs x4, x4, #1 | |||
| bne LoopH | |||
| sub sp, sp, #16 | |||
| ldp x19, x20, [sp], #16 | |||
| ret | |||
| #endif | |||
| @@ -1,4 +1,5 @@ | |||
| #ifdef ENABLE_AVX | |||
| #include "nnacl/assembly_global.h" | |||
| .text | |||
| .align 4 | |||
| .global ConvDwFp32Avx3x3 | |||
| @@ -31,7 +32,7 @@ | |||
| // 56: input_stride | |||
| // 64: relu | |||
| // 72: relu6 | |||
| ConvDwFp32Avx3x3: | |||
| asm_function ConvDwFp32Avx3x3 | |||
| pushq %r15 | |||
| pushq %r14 | |||
| pushq %r13 | |||
| @@ -1,4 +1,5 @@ | |||
| #ifdef ENABLE_AVX | |||
| #include "nnacl/assembly_global.h" | |||
| .text | |||
| .align 4 | |||
| .global MatmulFloatAvxOpt | |||
| @@ -34,7 +35,7 @@ | |||
| // 72: stride | |||
| // 80: writeMode | |||
| MatmulFloatAvxOpt: | |||
| asm_function MatmulFloatAvxOpt | |||
| // rbx, rsp, rbp, r12-r15 must be saved according to x86 calling convention | |||
| pushq %r15 | |||
| pushq %r14 | |||
| @@ -19,12 +19,13 @@ asm_function ConvDwFp16Center | |||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | |||
| // x19 ~ x29 should also be preserved | |||
| // whereas our coding style does not permit that many parameters | |||
| sub sp, sp, #176 | |||
| sub sp, sp, #192 | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| stp x19, x20, [sp], #16 | |||
| stp x21, x22, [sp], #16 | |||
| stp x23, x24, [sp], #16 | |||
| stp x25, x26, [sp], #16 | |||
| ldr x8, [sp] | |||
| ldr x9, [sp, #8] | |||
| @@ -71,7 +72,7 @@ asm_function ConvDwFp16Center | |||
| mov v14.16b, v24.16b | |||
| mov v15.16b, v24.16b | |||
| LoopKh16: | |||
| mov x18, x7 | |||
| mov x25, x7 | |||
| mov x21, x16 | |||
| LoopKw16: | |||
| mov x22, x21 | |||
| @@ -108,7 +109,7 @@ asm_function ConvDwFp16Center | |||
| ld1 {v23.8h}, [x22], x11 | |||
| fmla v14.8h, v22.8h, v25.8h | |||
| fmla v15.8h, v23.8h, v25.8h | |||
| subs x18, x18, #1 | |||
| subs x25, x25, #1 | |||
| add x21, x21, x13 | |||
| bne LoopKw16 | |||
| add x16, x16, x12 | |||
| @@ -191,7 +192,7 @@ asm_function ConvDwFp16Center | |||
| mov v6.16b, v24.16b | |||
| mov v7.16b, v24.16b | |||
| LoopKh8: | |||
| mov x18, x7 | |||
| mov x25, x7 | |||
| mov x21, x16 | |||
| LoopKw8: | |||
| mov x22, x21 | |||
| @@ -212,7 +213,7 @@ asm_function ConvDwFp16Center | |||
| ld1 {v23.8h}, [x22], x11 | |||
| fmla v6.8h, v22.8h, v25.8h | |||
| fmla v7.8h, v23.8h, v25.8h | |||
| subs x18, x18, #1 | |||
| subs x25, x25, #1 | |||
| add x21, x21, x13 | |||
| bne LoopKw8 | |||
| add x16, x16, x12 | |||
| @@ -260,13 +261,13 @@ asm_function ConvDwFp16Center | |||
| mov x20, x6 | |||
| mov v0.16b, v24.16b | |||
| LoopKh: | |||
| mov x18, x7 | |||
| mov x25, x7 | |||
| mov x22, x16 | |||
| LoopKw: | |||
| ld1 {v16.8h}, [x22], x13 | |||
| ld1 {v25.8h}, [x17], #16 | |||
| fmla v0.8h, v16.8h, v25.8h | |||
| subs x18, x18, #1 | |||
| subs x25, x25, #1 | |||
| bne LoopKw | |||
| add x16, x16, x12 | |||
| subs x20, x20, #1 | |||
| @@ -289,11 +290,12 @@ asm_function ConvDwFp16Center | |||
| subs x4, x4, #1 | |||
| bne LoopH | |||
| sub sp, sp, #176 | |||
| sub sp, sp, #192 | |||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| ldp x19, x20, [sp], #16 | |||
| ldp x21, x22, [sp], #16 | |||
| ldp x23, x24, [sp], #16 | |||
| ldp x25, x26, [sp], #16 | |||
| ret | |||
| #endif | |||
| @@ -33,12 +33,12 @@ asm_function DeconvDwFp16Center | |||
| mov x16, x1 | |||
| mov x17, x4 | |||
| LoopW: | |||
| mov x18, x15 | |||
| mov x22, x15 | |||
| mov x19, x2 | |||
| mov x20, x5 | |||
| ld1 {v1.8h}, [x16], x8 | |||
| LoopKh: | |||
| mov x21, x18 | |||
| mov x21, x22 | |||
| mov x13, x6 | |||
| LoopKw: | |||
| ld1 {v0.8h}, [x21] | |||
| @@ -47,7 +47,7 @@ asm_function DeconvDwFp16Center | |||
| st1 {v0.8h}, [x21], x12 | |||
| subs x13, x13, #1 | |||
| bne LoopKw | |||
| add x18, x18, x11 | |||
| add x22, x22, x11 | |||
| subs x20, x20, #1 | |||
| bne LoopKh | |||
| add x15, x15, x10 | |||
| @@ -41,11 +41,12 @@ asm_function IndirectGemmFp16_16x8 | |||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | |||
| // x19 ~ x29 should also be preserved | |||
| // whereas our coding style does not permit that many parameters | |||
| sub sp, sp, #128 | |||
| sub sp, sp, #144 | |||
| // the performance difference between storing 4 registers at once and storing them separately on in-order cores | |||
| // has not been measured yet | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| stp x19, x20, [sp], #16 | |||
| ldr x8, [sp, #0] | |||
| ldr x9, [sp, #8] | |||
| @@ -548,87 +549,87 @@ IndirectGemmStart: | |||
| b WriteEnd | |||
| Write7: | |||
| add x17, x15, #8 | |||
| add x18, x15, #10 | |||
| add x19, x15, #10 | |||
| add x16, x15, #12 | |||
| st1 {v16.4h}, [x15], x7 | |||
| ins v0.s[0], v16.s[2] | |||
| st1 {v0.h}[0], [x17], x7 | |||
| st1 {v0.h}[1], [x18], x7 | |||
| st1 {v0.h}[1], [x19], x7 | |||
| st1 {v16.h}[6], [x16], x7 | |||
| st1 {v17.4h}, [x15], x7 | |||
| ins v1.s[0], v17.s[2] | |||
| st1 {v1.h}[0], [x17], x7 | |||
| st1 {v1.h}[1], [x18], x7 | |||
| st1 {v1.h}[1], [x19], x7 | |||
| st1 {v17.h}[6], [x16], x7 | |||
| st1 {v18.4h}, [x15], x7 | |||
| ins v2.s[0], v18.s[2] | |||
| st1 {v2.h}[0], [x17], x7 | |||
| st1 {v2.h}[1], [x18], x7 | |||
| st1 {v2.h}[1], [x19], x7 | |||
| st1 {v18.h}[6], [x16], x7 | |||
| st1 {v19.4h}, [x15], x7 | |||
| ins v3.s[0], v19.s[2] | |||
| st1 {v3.h}[0], [x17], x7 | |||
| st1 {v3.h}[1], [x18], x7 | |||
| st1 {v3.h}[1], [x19], x7 | |||
| st1 {v19.h}[6], [x16], x7 | |||
| st1 {v20.4h}, [x15], x7 | |||
| ins v4.s[0], v20.s[2] | |||
| st1 {v4.h}[0], [x17], x7 | |||
| st1 {v4.h}[1], [x18], x7 | |||
| st1 {v4.h}[1], [x19], x7 | |||
| st1 {v20.h}[6], [x16], x7 | |||
| st1 {v21.4h}, [x15], x7 | |||
| ins v5.s[0], v21.s[2] | |||
| st1 {v5.h}[0], [x17], x7 | |||
| st1 {v5.h}[1], [x18], x7 | |||
| st1 {v5.h}[1], [x19], x7 | |||
| st1 {v21.h}[6], [x16], x7 | |||
| st1 {v22.4h}, [x15], x7 | |||
| ins v6.s[0], v22.s[2] | |||
| st1 {v6.h}[0], [x17], x7 | |||
| st1 {v6.h}[1], [x18], x7 | |||
| st1 {v6.h}[1], [x19], x7 | |||
| st1 {v22.h}[6], [x16], x7 | |||
| st1 {v23.4h}, [x15], x7 | |||
| ins v7.s[0], v23.s[2] | |||
| st1 {v7.h}[0], [x17], x7 | |||
| st1 {v7.h}[1], [x18], x7 | |||
| st1 {v7.h}[1], [x19], x7 | |||
| st1 {v23.h}[6], [x16], x7 | |||
| st1 {v24.4h}, [x15], x7 | |||
| ins v8.s[0], v24.s[2] | |||
| st1 {v8.h}[0], [x17], x7 | |||
| st1 {v8.h}[1], [x18], x7 | |||
| st1 {v8.h}[1], [x19], x7 | |||
| st1 {v24.h}[6], [x16], x7 | |||
| st1 {v25.4h}, [x15], x7 | |||
| ins v9.s[0], v25.s[2] | |||
| st1 {v9.h}[0], [x17], x7 | |||
| st1 {v9.h}[1], [x18], x7 | |||
| st1 {v9.h}[1], [x19], x7 | |||
| st1 {v25.h}[6], [x16], x7 | |||
| st1 {v26.4h}, [x15], x7 | |||
| ins v10.s[0], v26.s[2] | |||
| st1 {v10.h}[0], [x17], x7 | |||
| st1 {v10.h}[1], [x18], x7 | |||
| st1 {v10.h}[1], [x19], x7 | |||
| st1 {v26.h}[6], [x16], x7 | |||
| st1 {v27.4h}, [x15], x7 | |||
| ins v11.s[0], v27.s[2] | |||
| st1 {v11.h}[0], [x17], x7 | |||
| st1 {v11.h}[1], [x18], x7 | |||
| st1 {v11.h}[1], [x19], x7 | |||
| st1 {v27.h}[6], [x16], x7 | |||
| st1 {v28.4h}, [x15], x7 | |||
| ins v12.s[0], v28.s[2] | |||
| st1 {v12.h}[0], [x17], x7 | |||
| st1 {v12.h}[1], [x18], x7 | |||
| st1 {v12.h}[1], [x19], x7 | |||
| st1 {v28.h}[6], [x16], x7 | |||
| st1 {v29.4h}, [x15], x7 | |||
| ins v13.s[0], v29.s[2] | |||
| st1 {v13.h}[0], [x17], x7 | |||
| st1 {v13.h}[1], [x18], x7 | |||
| st1 {v13.h}[1], [x19], x7 | |||
| st1 {v29.h}[6], [x16], x7 | |||
| st1 {v30.4h}, [x15], x7 | |||
| ins v14.s[0], v30.s[2] | |||
| st1 {v14.h}[0], [x17], x7 | |||
| st1 {v14.h}[1], [x18], x7 | |||
| st1 {v14.h}[1], [x19], x7 | |||
| st1 {v30.h}[6], [x16], x7 | |||
| st1 {v31.4h}, [x15] | |||
| ins v15.s[0], v31.s[2] | |||
| st1 {v15.h}[0], [x17] | |||
| st1 {v15.h}[1], [x18] | |||
| st1 {v15.h}[1], [x19] | |||
| st1 {v31.h}[6], [x16] | |||
| add x0, x0, #14 | |||
| b WriteEnd | |||
| @@ -661,9 +662,10 @@ IndirectGemmStart: | |||
| NoStepForward: | |||
| bgt LoopOc | |||
| sub sp, sp, #128 | |||
| sub sp, sp, #144 | |||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| ldp x19, x20, [sp], #16 | |||
| ret | |||
| #endif | |||
| @@ -21,21 +21,22 @@ | |||
| // w13: writeC8 | |||
| asm_function MatmulFp16Neon64 | |||
| sub sp, sp, #128 | |||
| sub sp, sp, #144 | |||
| st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 | |||
| st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 | |||
| stp x19, x20, [sp], #16 | |||
| mov w18, #16 // sizeof(float16) * 8; NOTE(review): w18 is the 32-bit view of x18 (platform-reserved on some AArch64 OSes) while nearby scratch uses are on x19 - confirm whether this should use w19 too | |||
| mul w15, w5, w18 // w15 = block stride of lhs/rhs: sizeof(float16) * 8 * depth | |||
| mov x11, x3 // bias flag | |||
| mov x18, #2 | |||
| mov x19, #2 | |||
| ldr x17, [sp] | |||
| mul x17, x17, x18 | |||
| mul x17, x17, x19 | |||
| L1: | |||
| mov w10, w6 // reload lhs row | |||
| mov x12, x0 // reload lhs ptr | |||
| mov x18, x2 // reload dst ptr | |||
| mov x19, x2 // reload dst ptr | |||
| L2: | |||
| mov x16, x1 // reload rhs ptr | |||
| @@ -314,490 +315,490 @@ Write: | |||
| b Write8 | |||
| Write1: | |||
| st1 {v16.h}[0], [x18], x17 | |||
| st1 {v16.h}[0], [x19], x17 | |||
| cmp w10, #1 | |||
| beq WriteEnd | |||
| st1 {v17.h}[0], [x18], x17 | |||
| st1 {v17.h}[0], [x19], x17 | |||
| cmp w10, #2 | |||
| beq WriteEnd | |||
| st1 {v18.h}[0], [x18], x17 | |||
| st1 {v18.h}[0], [x19], x17 | |||
| cmp w10, #3 | |||
| beq WriteEnd | |||
| st1 {v19.h}[0], [x18], x17 | |||
| st1 {v19.h}[0], [x19], x17 | |||
| cmp w10, #4 | |||
| beq WriteEnd | |||
| st1 {v20.h}[0], [x18], x17 | |||
| st1 {v20.h}[0], [x19], x17 | |||
| cmp w10, #5 | |||
| beq WriteEnd | |||
| st1 {v21.h}[0], [x18], x17 | |||
| st1 {v21.h}[0], [x19], x17 | |||
| cmp w10, #6 | |||
| beq WriteEnd | |||
| st1 {v22.h}[0], [x18], x17 | |||
| st1 {v22.h}[0], [x19], x17 | |||
| cmp w10, #7 | |||
| beq WriteEnd | |||
| st1 {v23.h}[0], [x18], x17 | |||
| st1 {v23.h}[0], [x19], x17 | |||
| cmp w10, #8 | |||
| beq WriteEnd | |||
| st1 {v24.h}[0], [x18], x17 | |||
| st1 {v24.h}[0], [x19], x17 | |||
| cmp w10, #9 | |||
| beq WriteEnd | |||
| st1 {v25.h}[0], [x18], x17 | |||
| st1 {v25.h}[0], [x19], x17 | |||
| cmp w10, #10 | |||
| beq WriteEnd | |||
| st1 {v26.h}[0], [x18], x17 | |||
| st1 {v26.h}[0], [x19], x17 | |||
| cmp w10, #11 | |||
| beq WriteEnd | |||
| st1 {v27.h}[0], [x18], x17 | |||
| st1 {v27.h}[0], [x19], x17 | |||
| cmp w10, #12 | |||
| beq WriteEnd | |||
| st1 {v28.h}[0], [x18], x17 | |||
| st1 {v28.h}[0], [x19], x17 | |||
| cmp w10, #13 | |||
| beq WriteEnd | |||
| st1 {v29.h}[0], [x18], x17 | |||
| st1 {v29.h}[0], [x19], x17 | |||
| cmp w10, #14 | |||
| beq WriteEnd | |||
| st1 {v30.h}[0], [x18], x17 | |||
| st1 {v30.h}[0], [x19], x17 | |||
| cmp w10, #15 | |||
| beq WriteEnd | |||
| st1 {v31.h}[0], [x18], x17 | |||
| st1 {v31.h}[0], [x19], x17 | |||
| b WriteEnd | |||
| Write2: | |||
| add x13, x18, #2 | |||
| st1 {v16.h}[0], [x18], x17 | |||
| add x13, x19, #2 | |||
| st1 {v16.h}[0], [x19], x17 | |||
| st1 {v16.h}[1], [x13], x17 | |||
| cmp w10, #1 | |||
| beq WriteEnd | |||
| st1 {v17.h}[0], [x18], x17 | |||
| st1 {v17.h}[0], [x19], x17 | |||
| st1 {v17.h}[1], [x13], x17 | |||
| cmp w10, #2 | |||
| beq WriteEnd | |||
| st1 {v18.h}[0], [x18], x17 | |||
| st1 {v18.h}[0], [x19], x17 | |||
| st1 {v18.h}[1], [x13], x17 | |||
| cmp w10, #3 | |||
| beq WriteEnd | |||
| st1 {v19.h}[0], [x18], x17 | |||
| st1 {v19.h}[0], [x19], x17 | |||
| st1 {v19.h}[1], [x13], x17 | |||
| cmp w10, #4 | |||
| beq WriteEnd | |||
| st1 {v20.h}[0], [x18], x17 | |||
| st1 {v20.h}[0], [x19], x17 | |||
| st1 {v20.h}[1], [x13], x17 | |||
| cmp w10, #5 | |||
| beq WriteEnd | |||
| st1 {v21.h}[0], [x18], x17 | |||
| st1 {v21.h}[0], [x19], x17 | |||
| st1 {v21.h}[1], [x13], x17 | |||
| cmp w10, #6 | |||
| beq WriteEnd | |||
| st1 {v22.h}[0], [x18], x17 | |||
| st1 {v22.h}[0], [x19], x17 | |||
| st1 {v22.h}[1], [x13], x17 | |||
| cmp w10, #7 | |||
| beq WriteEnd | |||
| st1 {v23.h}[0], [x18], x17 | |||
| st1 {v23.h}[0], [x19], x17 | |||
| st1 {v23.h}[1], [x13], x17 | |||
| cmp w10, #8 | |||
| beq WriteEnd | |||
| st1 {v24.h}[0], [x18], x17 | |||
| st1 {v24.h}[0], [x19], x17 | |||
| st1 {v24.h}[1], [x13], x17 | |||
| cmp w10, #9 | |||
| beq WriteEnd | |||
| st1 {v25.h}[0], [x18], x17 | |||
| st1 {v25.h}[0], [x19], x17 | |||
| st1 {v25.h}[1], [x13], x17 | |||
| cmp w10, #10 | |||
| beq WriteEnd | |||
| st1 {v26.h}[0], [x18], x17 | |||
| st1 {v26.h}[0], [x19], x17 | |||
| st1 {v26.h}[1], [x13], x17 | |||
| cmp w10, #11 | |||
| beq WriteEnd | |||
| st1 {v27.h}[0], [x18], x17 | |||
| st1 {v27.h}[0], [x19], x17 | |||
| st1 {v27.h}[1], [x13], x17 | |||
| cmp w10, #12 | |||
| beq WriteEnd | |||
| st1 {v28.h}[0], [x18], x17 | |||
| st1 {v28.h}[0], [x19], x17 | |||
| st1 {v28.h}[1], [x13], x17 | |||
| cmp w10, #13 | |||
| beq WriteEnd | |||
| st1 {v29.h}[0], [x18], x17 | |||
| st1 {v29.h}[0], [x19], x17 | |||
| st1 {v29.h}[1], [x13], x17 | |||
| cmp w10, #14 | |||
| beq WriteEnd | |||
| st1 {v30.h}[0], [x18], x17 | |||
| st1 {v30.h}[0], [x19], x17 | |||
| st1 {v30.h}[1], [x13], x17 | |||
| cmp w10, #15 | |||
| beq WriteEnd | |||
| st1 {v31.h}[0], [x18], x17 | |||
| st1 {v31.h}[0], [x19], x17 | |||
| st1 {v31.h}[1], [x13], x17 | |||
| b WriteEnd | |||
| Write3: | |||
| add x13, x18, #2 | |||
| add x14, x18, #4 | |||
| st1 {v16.h}[0], [x18], x17 | |||
| add x13, x19, #2 | |||
| add x14, x19, #4 | |||
| st1 {v16.h}[0], [x19], x17 | |||
| st1 {v16.h}[1], [x13], x17 | |||
| st1 {v16.h}[2], [x14], x17 | |||
| cmp w10, #1 | |||
| beq WriteEnd | |||
| st1 {v17.h}[0], [x18], x17 | |||
| st1 {v17.h}[0], [x19], x17 | |||
| st1 {v17.h}[1], [x13], x17 | |||
| st1 {v17.h}[2], [x14], x17 | |||
| cmp w10, #2 | |||
| beq WriteEnd | |||
| st1 {v18.h}[0], [x18], x17 | |||
| st1 {v18.h}[0], [x19], x17 | |||
| st1 {v18.h}[1], [x13], x17 | |||
| st1 {v18.h}[2], [x14], x17 | |||
| cmp w10, #3 | |||
| beq WriteEnd | |||
| st1 {v19.h}[0], [x18], x17 | |||
| st1 {v19.h}[0], [x19], x17 | |||
| st1 {v19.h}[1], [x13], x17 | |||
| st1 {v19.h}[2], [x14], x17 | |||
| cmp w10, #4 | |||
| beq WriteEnd | |||
| st1 {v20.h}[0], [x18], x17 | |||
| st1 {v20.h}[0], [x19], x17 | |||
| st1 {v20.h}[1], [x13], x17 | |||
| st1 {v20.h}[2], [x14], x17 | |||
| cmp w10, #5 | |||
| beq WriteEnd | |||
| st1 {v21.h}[0], [x18], x17 | |||
| st1 {v21.h}[0], [x19], x17 | |||
| st1 {v21.h}[1], [x13], x17 | |||
| st1 {v21.h}[2], [x14], x17 | |||
| cmp w10, #6 | |||
| beq WriteEnd | |||
| st1 {v22.h}[0], [x18], x17 | |||
| st1 {v22.h}[0], [x19], x17 | |||
| st1 {v22.h}[1], [x13], x17 | |||
| st1 {v22.h}[2], [x14], x17 | |||
| cmp w10, #7 | |||
| beq WriteEnd | |||
| st1 {v23.h}[0], [x18], x17 | |||
| st1 {v23.h}[0], [x19], x17 | |||
| st1 {v23.h}[1], [x13], x17 | |||
| st1 {v23.h}[2], [x14], x17 | |||
| cmp w10, #8 | |||
| beq WriteEnd | |||
| st1 {v24.h}[0], [x18], x17 | |||
| st1 {v24.h}[0], [x19], x17 | |||
| st1 {v24.h}[1], [x13], x17 | |||
| st1 {v24.h}[2], [x14], x17 | |||
| cmp w10, #9 | |||
| beq WriteEnd | |||
| st1 {v25.h}[0], [x18], x17 | |||
| st1 {v25.h}[0], [x19], x17 | |||
| st1 {v25.h}[1], [x13], x17 | |||
| st1 {v25.h}[2], [x14], x17 | |||
| cmp w10, #10 | |||
| beq WriteEnd | |||
| st1 {v26.h}[0], [x18], x17 | |||
| st1 {v26.h}[0], [x19], x17 | |||
| st1 {v26.h}[1], [x13], x17 | |||
| st1 {v26.h}[2], [x14], x17 | |||
| cmp w10, #11 | |||
| beq WriteEnd | |||
| st1 {v27.h}[0], [x18], x17 | |||
| st1 {v27.h}[0], [x19], x17 | |||
| st1 {v27.h}[1], [x13], x17 | |||
| st1 {v27.h}[2], [x14], x17 | |||
| cmp w10, #12 | |||
| beq WriteEnd | |||
| st1 {v28.h}[0], [x18], x17 | |||
| st1 {v28.h}[0], [x19], x17 | |||
| st1 {v28.h}[1], [x13], x17 | |||
| st1 {v28.h}[2], [x14], x17 | |||
| cmp w10, #13 | |||
| beq WriteEnd | |||
| st1 {v29.h}[0], [x18], x17 | |||
| st1 {v29.h}[0], [x19], x17 | |||
| st1 {v29.h}[1], [x13], x17 | |||
| st1 {v29.h}[2], [x14], x17 | |||
| cmp w10, #14 | |||
| beq WriteEnd | |||
| st1 {v30.h}[0], [x18], x17 | |||
| st1 {v30.h}[0], [x19], x17 | |||
| st1 {v30.h}[1], [x13], x17 | |||
| st1 {v30.h}[2], [x14], x17 | |||
| cmp w10, #15 | |||
| beq WriteEnd | |||
| st1 {v31.h}[0], [x18], x17 | |||
| st1 {v31.h}[0], [x19], x17 | |||
| st1 {v31.h}[1], [x13], x17 | |||
| st1 {v31.h}[2], [x14], x17 | |||
| b WriteEnd | |||
| Write4: | |||
| st1 {v16.4h}, [x18], x17 | |||
| st1 {v16.4h}, [x19], x17 | |||
| cmp w10, #1 | |||
| beq WriteEnd | |||
| st1 {v17.4h}, [x18], x17 | |||
| st1 {v17.4h}, [x19], x17 | |||
| cmp w10, #2 | |||
| beq WriteEnd | |||
| st1 {v18.4h}, [x18], x17 | |||
| st1 {v18.4h}, [x19], x17 | |||
| cmp w10, #3 | |||
| beq WriteEnd | |||
| st1 {v19.4h}, [x18], x17 | |||
| st1 {v19.4h}, [x19], x17 | |||
| cmp w10, #4 | |||
| beq WriteEnd | |||
| st1 {v20.4h}, [x18], x17 | |||
| st1 {v20.4h}, [x19], x17 | |||
| cmp w10, #5 | |||
| beq WriteEnd | |||
| st1 {v21.4h}, [x18], x17 | |||
| st1 {v21.4h}, [x19], x17 | |||
| cmp w10, #6 | |||
| beq WriteEnd | |||
| st1 {v22.4h}, [x18], x17 | |||
| st1 {v22.4h}, [x19], x17 | |||
| cmp w10, #7 | |||
| beq WriteEnd | |||
| st1 {v23.4h}, [x18], x17 | |||
| st1 {v23.4h}, [x19], x17 | |||
| cmp w10, #8 | |||
| beq WriteEnd | |||
| st1 {v24.4h}, [x18], x17 | |||
| st1 {v24.4h}, [x19], x17 | |||
| cmp w10, #9 | |||
| beq WriteEnd | |||
| st1 {v25.4h}, [x18], x17 | |||
| st1 {v25.4h}, [x19], x17 | |||
| cmp w10, #10 | |||
| beq WriteEnd | |||
| st1 {v26.4h}, [x18], x17 | |||
| st1 {v26.4h}, [x19], x17 | |||
| cmp w10, #11 | |||
| beq WriteEnd | |||
| st1 {v27.4h}, [x18], x17 | |||
| st1 {v27.4h}, [x19], x17 | |||
| cmp w10, #12 | |||
| beq WriteEnd | |||
| st1 {v28.4h}, [x18], x17 | |||
| st1 {v28.4h}, [x19], x17 | |||
| cmp w10, #13 | |||
| beq WriteEnd | |||
| st1 {v29.4h}, [x18], x17 | |||
| st1 {v29.4h}, [x19], x17 | |||
| cmp w10, #14 | |||
| beq WriteEnd | |||
| st1 {v30.4h}, [x18], x17 | |||
| st1 {v30.4h}, [x19], x17 | |||
| cmp w10, #15 | |||
| beq WriteEnd | |||
| st1 {v31.4h}, [x18], x17 | |||
| st1 {v31.4h}, [x19], x17 | |||
| b WriteEnd | |||
| Write5: | |||
| add x13, x18, #8 | |||
| st1 {v16.4h}, [x18], x17 | |||
| add x13, x19, #8 | |||
| st1 {v16.4h}, [x19], x17 | |||
| st1 {v16.h}[4], [x13], x17 | |||
| cmp w10, #1 | |||
| beq WriteEnd | |||
| st1 {v17.4h}, [x18], x17 | |||
| st1 {v17.4h}, [x19], x17 | |||
| st1 {v17.h}[4], [x13], x17 | |||
| cmp w10, #2 | |||
| beq WriteEnd | |||
| st1 {v18.4h}, [x18], x17 | |||
| st1 {v18.4h}, [x19], x17 | |||
| st1 {v18.h}[4], [x13], x17 | |||
| cmp w10, #3 | |||
| beq WriteEnd | |||
| st1 {v19.4h}, [x18], x17 | |||
| st1 {v19.4h}, [x19], x17 | |||
| st1 {v19.h}[4], [x13], x17 | |||
| cmp w10, #4 | |||
| beq WriteEnd | |||
| st1 {v20.4h}, [x18], x17 | |||
| st1 {v20.4h}, [x19], x17 | |||
| st1 {v20.h}[4], [x13], x17 | |||
| cmp w10, #5 | |||
| beq WriteEnd | |||
| st1 {v21.4h}, [x18], x17 | |||
| st1 {v21.4h}, [x19], x17 | |||
| st1 {v21.h}[4], [x13], x17 | |||
| cmp w10, #6 | |||
| beq WriteEnd | |||
| st1 {v22.4h}, [x18], x17 | |||
| st1 {v22.4h}, [x19], x17 | |||
| st1 {v22.h}[4], [x13], x17 | |||
| cmp w10, #7 | |||
| beq WriteEnd | |||
| st1 {v23.4h}, [x18], x17 | |||
| st1 {v23.4h}, [x19], x17 | |||
| st1 {v23.h}[4], [x13], x17 | |||
| cmp w10, #8 | |||
| beq WriteEnd | |||
| st1 {v24.4h}, [x18], x17 | |||
| st1 {v24.4h}, [x19], x17 | |||
| st1 {v24.h}[4], [x13], x17 | |||
| cmp w10, #9 | |||
| beq WriteEnd | |||
| st1 {v25.4h}, [x18], x17 | |||
| st1 {v25.4h}, [x19], x17 | |||
| st1 {v25.h}[4], [x13], x17 | |||
| cmp w10, #10 | |||
| beq WriteEnd | |||
| st1 {v26.4h}, [x18], x17 | |||
| st1 {v26.4h}, [x19], x17 | |||
| st1 {v26.h}[4], [x13], x17 | |||
| cmp w10, #11 | |||
| beq WriteEnd | |||
| st1 {v27.4h}, [x18], x17 | |||
| st1 {v27.4h}, [x19], x17 | |||
| st1 {v27.h}[4], [x13], x17 | |||
| cmp w10, #12 | |||
| beq WriteEnd | |||
| st1 {v28.4h}, [x18], x17 | |||
| st1 {v28.4h}, [x19], x17 | |||
| st1 {v28.h}[4], [x13], x17 | |||
| cmp w10, #13 | |||
| beq WriteEnd | |||
| st1 {v29.4h}, [x18], x17 | |||
| st1 {v29.4h}, [x19], x17 | |||
| st1 {v29.h}[4], [x13], x17 | |||
| cmp w10, #14 | |||
| beq WriteEnd | |||
| st1 {v30.4h}, [x18], x17 | |||
| st1 {v30.4h}, [x19], x17 | |||
| st1 {v30.h}[4], [x13], x17 | |||
| cmp w10, #15 | |||
| beq WriteEnd | |||
| st1 {v31.4h}, [x18], x17 | |||
| st1 {v31.4h}, [x19], x17 | |||
| st1 {v31.h}[4], [x13], x17 | |||
| b WriteEnd | |||
| Write6: | |||
| add x13, x18, #8 | |||
| add x14, x18, #10 | |||
| st1 {v16.4h}, [x18], x17 | |||
| add x13, x19, #8 | |||
| add x14, x19, #10 | |||
| st1 {v16.4h}, [x19], x17 | |||
| st1 {v16.h}[4], [x13], x17 | |||
| st1 {v16.h}[5], [x14], x17 | |||
| cmp w10, #1 | |||
| beq WriteEnd | |||
| st1 {v17.4h}, [x18], x17 | |||
| st1 {v17.4h}, [x19], x17 | |||
| st1 {v17.h}[4], [x13], x17 | |||
| st1 {v17.h}[5], [x14], x17 | |||
| cmp w10, #2 | |||
| beq WriteEnd | |||
| st1 {v18.4h}, [x18], x17 | |||
| st1 {v18.4h}, [x19], x17 | |||
| st1 {v18.h}[4], [x13], x17 | |||
| st1 {v18.h}[5], [x14], x17 | |||
| cmp w10, #3 | |||
| beq WriteEnd | |||
| st1 {v19.4h}, [x18], x17 | |||
| st1 {v19.4h}, [x19], x17 | |||
| st1 {v19.h}[4], [x13], x17 | |||
| st1 {v19.h}[5], [x14], x17 | |||
| cmp w10, #4 | |||
| beq WriteEnd | |||
| st1 {v20.4h}, [x18], x17 | |||
| st1 {v20.4h}, [x19], x17 | |||
| st1 {v20.h}[4], [x13], x17 | |||
| st1 {v20.h}[5], [x14], x17 | |||
| cmp w10, #5 | |||
| beq WriteEnd | |||
| st1 {v21.4h}, [x18], x17 | |||
| st1 {v21.4h}, [x19], x17 | |||
| st1 {v21.h}[4], [x13], x17 | |||
| st1 {v21.h}[5], [x14], x17 | |||
| cmp w10, #6 | |||
| beq WriteEnd | |||
| st1 {v22.4h}, [x18], x17 | |||
| st1 {v22.4h}, [x19], x17 | |||
| st1 {v22.h}[4], [x13], x17 | |||
| st1 {v22.h}[5], [x14], x17 | |||
| cmp w10, #7 | |||
| beq WriteEnd | |||
| st1 {v23.4h}, [x18], x17 | |||
| st1 {v23.4h}, [x19], x17 | |||
| st1 {v23.h}[4], [x13], x17 | |||
| st1 {v23.h}[5], [x14], x17 | |||
| cmp w10, #8 | |||
| beq WriteEnd | |||
| st1 {v24.4h}, [x18], x17 | |||
| st1 {v24.4h}, [x19], x17 | |||
| st1 {v24.h}[4], [x13], x17 | |||
| st1 {v24.h}[5], [x14], x17 | |||
| cmp w10, #9 | |||
| beq WriteEnd | |||
| st1 {v25.4h}, [x18], x17 | |||
| st1 {v25.4h}, [x19], x17 | |||
| st1 {v25.h}[4], [x13], x17 | |||
| st1 {v25.h}[5], [x14], x17 | |||
| cmp w10, #10 | |||
| beq WriteEnd | |||
| st1 {v26.4h}, [x18], x17 | |||
| st1 {v26.4h}, [x19], x17 | |||
| st1 {v26.h}[4], [x13], x17 | |||
| st1 {v26.h}[5], [x14], x17 | |||
| cmp w10, #11 | |||
| beq WriteEnd | |||
| st1 {v27.4h}, [x18], x17 | |||
| st1 {v27.4h}, [x19], x17 | |||
| st1 {v27.h}[4], [x13], x17 | |||
| st1 {v27.h}[5], [x14], x17 | |||
| cmp w10, #12 | |||
| beq WriteEnd | |||
| st1 {v28.4h}, [x18], x17 | |||
| st1 {v28.4h}, [x19], x17 | |||
| st1 {v28.h}[4], [x13], x17 | |||
| st1 {v28.h}[5], [x14], x17 | |||
| cmp w10, #13 | |||
| beq WriteEnd | |||
| st1 {v29.4h}, [x18], x17 | |||
| st1 {v29.4h}, [x19], x17 | |||
| st1 {v29.h}[4], [x13], x17 | |||
| st1 {v29.h}[5], [x14], x17 | |||
| cmp w10, #14 | |||
| beq WriteEnd | |||
| st1 {v30.4h}, [x18], x17 | |||
| st1 {v30.4h}, [x19], x17 | |||
| st1 {v30.h}[4], [x13], x17 | |||
| st1 {v30.h}[5], [x14], x17 | |||
| cmp w10, #15 | |||
| beq WriteEnd | |||
| st1 {v31.4h}, [x18], x17 | |||
| st1 {v31.4h}, [x19], x17 | |||
| st1 {v31.h}[4], [x13], x17 | |||
| st1 {v31.h}[5], [x14], x17 | |||
| b WriteEnd | |||
| Write7: | |||
| add x13, x18, #8 | |||
| add x14, x18, #10 | |||
| add x16, x18, #12 | |||
| st1 {v16.4h}, [x18], x17 | |||
| add x13, x19, #8 | |||
| add x14, x19, #10 | |||
| add x16, x19, #12 | |||
| st1 {v16.4h}, [x19], x17 | |||
| st1 {v16.h}[4], [x13], x17 | |||
| st1 {v16.h}[5], [x14], x17 | |||
| st1 {v16.h}[6], [x16], x17 | |||
| cmp w10, #1 | |||
| beq WriteEnd | |||
| st1 {v17.4h}, [x18], x17 | |||
| st1 {v17.4h}, [x19], x17 | |||
| st1 {v17.h}[4], [x13], x17 | |||
| st1 {v17.h}[5], [x14], x17 | |||
| st1 {v17.h}[6], [x16], x17 | |||
| cmp w10, #2 | |||
| beq WriteEnd | |||
| st1 {v18.4h}, [x18], x17 | |||
| st1 {v18.4h}, [x19], x17 | |||
| st1 {v18.h}[4], [x13], x17 | |||
| st1 {v18.h}[5], [x14], x17 | |||
| st1 {v18.h}[6], [x16], x17 | |||
| cmp w10, #3 | |||
| beq WriteEnd | |||
| st1 {v19.4h}, [x18], x17 | |||
| st1 {v19.4h}, [x19], x17 | |||
| st1 {v19.h}[4], [x13], x17 | |||
| st1 {v19.h}[5], [x14], x17 | |||
| st1 {v19.h}[6], [x16], x17 | |||
| cmp w10, #4 | |||
| beq WriteEnd | |||
| st1 {v20.4h}, [x18], x17 | |||
| st1 {v20.4h}, [x19], x17 | |||
| st1 {v20.h}[4], [x13], x17 | |||
| st1 {v20.h}[5], [x14], x17 | |||
| st1 {v20.h}[6], [x16], x17 | |||
| cmp w10, #5 | |||
| beq WriteEnd | |||
| st1 {v21.4h}, [x18], x17 | |||
| st1 {v21.4h}, [x19], x17 | |||
| st1 {v21.h}[4], [x13], x17 | |||
| st1 {v21.h}[5], [x14], x17 | |||
| st1 {v21.h}[6], [x16], x17 | |||
| cmp w10, #6 | |||
| beq WriteEnd | |||
| st1 {v22.4h}, [x18], x17 | |||
| st1 {v22.4h}, [x19], x17 | |||
| st1 {v22.h}[4], [x13], x17 | |||
| st1 {v22.h}[5], [x14], x17 | |||
| st1 {v22.h}[6], [x16], x17 | |||
| cmp w10, #7 | |||
| beq WriteEnd | |||
| st1 {v23.4h}, [x18], x17 | |||
| st1 {v23.4h}, [x19], x17 | |||
| st1 {v23.h}[4], [x13], x17 | |||
| st1 {v23.h}[5], [x14], x17 | |||
| st1 {v23.h}[6], [x16], x17 | |||
| cmp w10, #8 | |||
| beq WriteEnd | |||
| st1 {v24.4h}, [x18], x17 | |||
| st1 {v24.4h}, [x19], x17 | |||
| st1 {v24.h}[4], [x13], x17 | |||
| st1 {v24.h}[5], [x14], x17 | |||
| st1 {v24.h}[6], [x16], x17 | |||
| cmp w10, #9 | |||
| beq WriteEnd | |||
| st1 {v25.4h}, [x18], x17 | |||
| st1 {v25.4h}, [x19], x17 | |||
| st1 {v25.h}[4], [x13], x17 | |||
| st1 {v25.h}[5], [x14], x17 | |||
| st1 {v25.h}[6], [x16], x17 | |||
| cmp w10, #10 | |||
| beq WriteEnd | |||
| st1 {v26.4h}, [x18], x17 | |||
| st1 {v26.4h}, [x19], x17 | |||
| st1 {v26.h}[4], [x13], x17 | |||
| st1 {v26.h}[5], [x14], x17 | |||
| st1 {v26.h}[6], [x16], x17 | |||
| cmp w10, #11 | |||
| beq WriteEnd | |||
| st1 {v27.4h}, [x18], x17 | |||
| st1 {v27.4h}, [x19], x17 | |||
| st1 {v27.h}[4], [x13], x17 | |||
| st1 {v27.h}[5], [x14], x17 | |||
| st1 {v27.h}[6], [x16], x17 | |||
| cmp w10, #12 | |||
| beq WriteEnd | |||
| st1 {v28.4h}, [x18], x17 | |||
| st1 {v28.4h}, [x19], x17 | |||
| st1 {v28.h}[4], [x13], x17 | |||
| st1 {v28.h}[5], [x14], x17 | |||
| st1 {v28.h}[6], [x16], x17 | |||
| cmp w10, #13 | |||
| beq WriteEnd | |||
| st1 {v29.4h}, [x18], x17 | |||
| st1 {v29.4h}, [x19], x17 | |||
| st1 {v29.h}[4], [x13], x17 | |||
| st1 {v29.h}[5], [x14], x17 | |||
| st1 {v29.h}[6], [x16], x17 | |||
| cmp w10, #14 | |||
| beq WriteEnd | |||
| st1 {v30.4h}, [x18], x17 | |||
| st1 {v30.4h}, [x19], x17 | |||
| st1 {v30.h}[4], [x13], x17 | |||
| st1 {v30.h}[5], [x14], x17 | |||
| st1 {v30.h}[6], [x16], x17 | |||
| cmp w10, #15 | |||
| beq WriteEnd | |||
| st1 {v31.4h}, [x18], x17 | |||
| st1 {v31.4h}, [x19], x17 | |||
| st1 {v31.h}[4], [x13], x17 | |||
| st1 {v31.h}[5], [x14], x17 | |||
| st1 {v31.h}[6], [x16], x17 | |||
| @@ -809,52 +810,52 @@ WriteC8: | |||
| st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x2], #64 | |||
| b WriteEnd | |||
| Write8: | |||
| st1 {v16.8h}, [x18], x17 | |||
| st1 {v16.8h}, [x19], x17 | |||
| cmp w10, #1 | |||
| beq WriteEnd | |||
| st1 {v17.8h}, [x18], x17 | |||
| st1 {v17.8h}, [x19], x17 | |||
| cmp w10, #2 | |||
| beq WriteEnd | |||
| st1 {v18.8h}, [x18], x17 | |||
| st1 {v18.8h}, [x19], x17 | |||
| cmp w10, #3 | |||
| beq WriteEnd | |||
| st1 {v19.8h}, [x18], x17 | |||
| st1 {v19.8h}, [x19], x17 | |||
| cmp w10, #4 | |||
| beq WriteEnd | |||
| st1 {v20.8h}, [x18], x17 | |||
| st1 {v20.8h}, [x19], x17 | |||
| cmp w10, #5 | |||
| beq WriteEnd | |||
| st1 {v21.8h}, [x18], x17 | |||
| st1 {v21.8h}, [x19], x17 | |||
| cmp w10, #6 | |||
| beq WriteEnd | |||
| st1 {v22.8h}, [x18], x17 | |||
| st1 {v22.8h}, [x19], x17 | |||
| cmp w10, #7 | |||
| beq WriteEnd | |||
| st1 {v23.8h}, [x18], x17 | |||
| st1 {v23.8h}, [x19], x17 | |||
| cmp w10, #8 | |||
| beq WriteEnd | |||
| st1 {v24.8h}, [x18], x17 | |||
| st1 {v24.8h}, [x19], x17 | |||
| cmp w10, #9 | |||
| beq WriteEnd | |||
| st1 {v25.8h}, [x18], x17 | |||
| st1 {v25.8h}, [x19], x17 | |||
| cmp w10, #10 | |||
| beq WriteEnd | |||
| st1 {v26.8h}, [x18], x17 | |||
| st1 {v26.8h}, [x19], x17 | |||
| cmp w10, #11 | |||
| beq WriteEnd | |||
| st1 {v27.8h}, [x18], x17 | |||
| st1 {v27.8h}, [x19], x17 | |||
| cmp w10, #12 | |||
| beq WriteEnd | |||
| st1 {v28.8h}, [x18], x17 | |||
| st1 {v28.8h}, [x19], x17 | |||
| cmp w10, #13 | |||
| beq WriteEnd | |||
| st1 {v29.8h}, [x18], x17 | |||
| st1 {v29.8h}, [x19], x17 | |||
| cmp w10, #14 | |||
| beq WriteEnd | |||
| st1 {v30.8h}, [x18], x17 | |||
| st1 {v30.8h}, [x19], x17 | |||
| cmp w10, #15 | |||
| beq WriteEnd | |||
| st1 {v31.8h}, [x18], x17 | |||
| st1 {v31.8h}, [x19], x17 | |||
| WriteEnd: | |||
| subs w10, w10, #16 // lhs row - 16 | |||
| @@ -871,8 +872,9 @@ NoDstStep: | |||
| bgt L1 | |||
| End1: | |||
| sub sp, sp, #128 | |||
| sub sp, sp, #144 | |||
| ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 | |||
| ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 | |||
| ldp x19, x20, [sp], #16 | |||
| ret | |||
| #endif | |||
| @@ -21,30 +21,31 @@ | |||
| // x9: writeMode | |||
| asm_function MatmulFp16Neon64Opt | |||
| sub sp, sp, #80 | |||
| sub sp, sp, #96 | |||
| st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 | |||
| stp x19, x20, [sp], #16 | |||
| stp x21, x22, [sp], #16 | |||
| ldr x8, [sp] | |||
| ldr x9, [sp, #8] | |||
| mov x18, #32 // sizeof(float16_t) * 16 | |||
| mul x17, x5, x18 // block stride of lhs/rhs: sizeof(float16_t) * 16 * depth | |||
| mov x21, #32 // sizeof(float16_t) * 16 | |||
| mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float16_t) * 16 * depth | |||
| cbnz x9, NoC8Steps | |||
| mov x11, x2 | |||
| mov x18, #16 | |||
| mul x16, x6, x18 // row * 8 * sizeof(float16_t) | |||
| mov x21, #16 | |||
| mul x16, x6, x21 // row * 8 * sizeof(float16_t) | |||
| NoC8Steps: | |||
| cmp x9, #2 | |||
| bne NoWinoSteps | |||
| mov x18, #2 | |||
| mov x21, #2 | |||
| mul x15, x7, x8 | |||
| mul x15, x15, x18 // kernel_size * col *sizeof(float16_t) | |||
| mov x18, #16 | |||
| mul x16, x8, x18 // kernel_size * 8 * sizeof(float16_t) | |||
| mul x15, x15, x21 // kernel_size * col *sizeof(float16_t) | |||
| mov x21, #16 | |||
| mul x16, x8, x21 // kernel_size * 8 * sizeof(float16_t) | |||
| NoWinoSteps: | |||
| mov x18, #2 | |||
| mul x8, x8, x18 | |||
| mov x21, #2 | |||
| mul x8, x8, x21 | |||
| LoopRowStart: | |||
| cmp x6, #1 | |||
| @@ -1221,9 +1222,9 @@ LoopRow: | |||
| LoopColEnd: | |||
| add x0, x0, x17 | |||
| cbz x9, C8DstStep | |||
| mov x18, #2 | |||
| mul x18, x18, x7 | |||
| sub x11, x11, x18 | |||
| mov x21, #2 | |||
| mul x21, x21, x7 | |||
| sub x11, x11, x21 | |||
| mov x2, x11 | |||
| b NoDstStep | |||
| C8DstStep: | |||
| @@ -1233,8 +1234,9 @@ LoopColEnd: | |||
| subs x6, x6, #16 | |||
| bgt LoopRowStart | |||
| sub sp, sp, #80 | |||
| sub sp, sp, #96 | |||
| ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 | |||
| ldp x19, x20, [sp], #16 | |||
| ldp x21, x22, [sp], #16 | |||
| ret | |||
| #endif | |||
| @@ -31,13 +31,13 @@ asm_function MatrixMultiplyWinogradFp16 | |||
| mov x14, x1 // mat_b | |||
| LoopN: | |||
| mov x16, x0 // mat_a_m | |||
| sub x18, x5, x15 // ni | |||
| sub x22, x5, x15 // ni | |||
| sub x19, x17, x3 // mi | |||
| mul x18, x18, x17 // ni * m | |||
| mul x22, x22, x17 // ni * m | |||
| mov x11, x6 // in_channel | |||
| add x18, x18, x19 // (ni * m) + mi | |||
| mul x18, x18, x13 // x18 * channel_in * 2 | |||
| add x20, x2, x18 // dst + offset | |||
| add x22, x22, x19 // (ni * m) + mi | |||
| mul x22, x22, x13 // x22 * channel_in * 2 | |||
| add x20, x2, x22 // dst + offset | |||
| cmp x11, #32 | |||
| bge LoopC32 | |||
| cmp x11, #16 | |||
| @@ -9,8 +9,8 @@ | |||
| asm_function WinogradTransLeftFp16 | |||
| sub sp, sp, #32 | |||
| stp x19, x20, [sp], #32 | |||
| sub sp, sp, #16 | |||
| stp x19, x20, [sp], #16 | |||
| mov x8, #8 // 4 * sizeof(float16) | |||
| mul x8, x6, x8 | |||
| @@ -46,16 +46,16 @@ LoopH: | |||
| ld1 {v0.h}[2], [x17], x10 | |||
| ld1 {v0.h}[3], [x17], x10 | |||
| mov x11, x6 | |||
| mov x18, x17 | |||
| add x18, x14, x7 | |||
| add x16, x18, x7 | |||
| mov x20, x17 | |||
| add x20, x14, x7 | |||
| add x16, x20, x7 | |||
| add x19, x16, x7 | |||
| LoopLength4: | |||
| ld1 {v16.4h}, [x2] | |||
| ld1 {v20.4h}, [x14], #8 | |||
| fmla v16.4h, v20.4h, v0.h[0] | |||
| ld1 {v21.4h}, [x18], #8 | |||
| ld1 {v21.4h}, [x20], #8 | |||
| fmul v17.4h, v21.4h, v0.h[1] | |||
| ld1 {v20.4h}, [x16], #8 | |||
| fmla v16.4h, v20.4h, v0.h[2] | |||
| @@ -81,14 +81,14 @@ LoopH: | |||
| ld1 {v0.h}[1], [x17], x10 | |||
| ld1 {v0.h}[2], [x17], x10 | |||
| mov x11, x6 | |||
| mov x18, x17 | |||
| add x18, x14, x7 | |||
| add x16, x18, x7 | |||
| mov x20, x17 | |||
| add x20, x14, x7 | |||
| add x16, x20, x7 | |||
| LoopLength3: | |||
| ld1 {v16.4h}, [x2] | |||
| ld1 {v20.4h}, [x14], #8 | |||
| fmla v16.4h, v20.4h, v0.h[0] | |||
| ld1 {v21.4h}, [x18], #8 | |||
| ld1 {v21.4h}, [x20], #8 | |||
| fmul v17.4h, v21.4h, v0.h[1] | |||
| ld1 {v20.4h}, [x16], #8 | |||
| fmla v16.4h, v20.4h, v0.h[2] | |||
| @@ -132,6 +132,6 @@ LoopH: | |||
| subs x4, x4, #1 | |||
| bne LoopH | |||
| sub sp, sp, #32 | |||
| ldp x19, x20, [sp], #32 | |||
| sub sp, sp, #16 | |||
| ldp x19, x20, [sp], #16 | |||
| ret | |||
| @@ -9,6 +9,9 @@ | |||
| asm_function WinogradTransRightFp16 | |||
| sub sp, sp, #16 | |||
| stp x19, x20, [sp], #16 | |||
| mov x8, #8 // 4 * sizeof(float16) | |||
| mul x8, x6, x8 | |||
| mul x9, x5, x8 // step for S | |||
| @@ -34,7 +37,7 @@ LoopH: | |||
| cmp x12, #4 | |||
| blt LoopKStart3 | |||
| mov x16, x15 | |||
| mov x18, x4 | |||
| mov x19, x4 | |||
| LoopK4: | |||
| ld1 {v0.h}[0], [x13], x10 | |||
| ld1 {v0.h}[1], [x13], x10 | |||
| @@ -45,7 +48,7 @@ LoopH: | |||
| add x14, x17, x8 | |||
| add x16, x14, x8 | |||
| add x18, x16, x8 | |||
| add x19, x16, x8 | |||
| LoopLength4: | |||
| ld1 {v16.4h}, [x2] | |||
| @@ -55,7 +58,7 @@ LoopH: | |||
| fmul v17.4h, v21.4h, v0.h[1] | |||
| ld1 {v20.4h}, [x16], #8 | |||
| fmla v16.4h, v20.4h, v0.h[2] | |||
| ld1 {v21.4h}, [x18], #8 | |||
| ld1 {v21.4h}, [x19], #8 | |||
| fmla v17.4h, v21.4h, v0.h[3] | |||
| fadd v17.4h, v16.4h, v17.4h | |||
| @@ -64,7 +67,7 @@ LoopH: | |||
| bne LoopLength4 | |||
| sub x2, x2, x8 | |||
| sub x12, x12, #4 | |||
| mov x17, x18 | |||
| mov x17, x19 | |||
| cmp x12, #4 | |||
| bge LoopK4 | |||
| @@ -98,7 +101,7 @@ LoopH: | |||
| bne LoopLength3 | |||
| sub x2, x2, x8 | |||
| sub x12, x12, #3 | |||
| mov x17, x18 | |||
| mov x17, x19 | |||
| cmp x12, #3 | |||
| bge LoopK3 | |||
| @@ -132,4 +135,7 @@ LoopH: | |||
| subs x4, x4, #1 | |||
| bne LoopH | |||
| sub sp, sp, #16 | |||
| ldp x19, x20, [sp], #16 | |||
| ret | |||
| @@ -66,7 +66,7 @@ L2: | |||
| cmp w16, #0 | |||
| beq End2 | |||
| mov x18, x1 // reload b ptr | |||
| mov x28, x1 // reload b ptr | |||
| mov x19, x7 // reload bias ptr | |||
| mov w20, w5 // reload depth | |||
| dup v16.4s, wzr | |||
| @@ -91,7 +91,7 @@ L3: | |||
| LoopD16: | |||
| ld1 {v0.16b, v1.16b}, [x17], #32 | |||
| ld1 {v2.16b, v3.16b}, [x18], #32 | |||
| ld1 {v2.16b, v3.16b}, [x28], #32 | |||
| sdot v16.4s, v2.16b, v0.4b[0] | |||
| sdot v18.4s, v2.16b, v0.4b[1] | |||
| @@ -104,7 +104,7 @@ LoopD16: | |||
| sdot v28.4s, v2.16b, v1.4b[2] | |||
| sdot v30.4s, v2.16b, v1.4b[3] | |||
| ld1 {v6.16b, v7.16b}, [x18], #32 | |||
| ld1 {v6.16b, v7.16b}, [x28], #32 | |||
| sdot v17.4s, v3.16b, v0.4b[0] | |||
| sdot v19.4s, v3.16b, v0.4b[1] | |||
| sdot v21.4s, v3.16b, v0.4b[2] | |||
| @@ -126,7 +126,7 @@ LoopD16: | |||
| sdot v28.4s, v6.16b, v5.4b[2] | |||
| sdot v30.4s, v6.16b, v5.4b[3] | |||
| ld1 {v10.16b, v11.16b}, [x18], #32 | |||
| ld1 {v10.16b, v11.16b}, [x28], #32 | |||
| sdot v17.4s, v7.16b, v4.4b[0] | |||
| sdot v19.4s, v7.16b, v4.4b[1] | |||
| sdot v21.4s, v7.16b, v4.4b[2] | |||
| @@ -148,7 +148,7 @@ LoopD16: | |||
| sdot v28.4s, v10.16b, v9.4b[2] | |||
| sdot v30.4s, v10.16b, v9.4b[3] | |||
| ld1 {v14.16b, v15.16b}, [x18], #32 | |||
| ld1 {v14.16b, v15.16b}, [x28], #32 | |||
| sdot v17.4s, v11.16b, v8.4b[0] | |||
| sdot v19.4s, v11.16b, v8.4b[1] | |||
| sdot v21.4s, v11.16b, v8.4b[2] | |||
| @@ -187,7 +187,7 @@ LoopD4: | |||
| beq End3 | |||
| ld1 {v0.16b, v1.16b}, [x17], #32 | |||
| ld1 {v2.16b, v3.16b}, [x18], #32 | |||
| ld1 {v2.16b, v3.16b}, [x28], #32 | |||
| sdot v16.4s, v2.16b, v0.4b[0] | |||
| sdot v18.4s, v2.16b, v0.4b[1] | |||
| @@ -30,7 +30,7 @@ | |||
| // x28: filter_zp | |||
| asm_function MatmulInt8DpOpt | |||
| sub sp, sp, #208 | |||
| sub sp, sp, #224 | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| stp x19, x20, [sp], #16 | |||
| @@ -38,6 +38,7 @@ asm_function MatmulInt8DpOpt | |||
| stp x23, x24, [sp], #16 | |||
| stp x25, x26, [sp], #16 | |||
| stp x27, x28, [sp], #16 | |||
| stp x29, x30, [sp], #16 | |||
| ldr w8, [sp] | |||
| ldr w9, [sp, #8] | |||
| @@ -56,7 +57,7 @@ asm_function MatmulInt8DpOpt | |||
| LoopRow: | |||
| mov x16, x1 // reload rhs ptr | |||
| mov x17, x4 // reload rhs col | |||
| mov x18, x7 // reload bias ptr | |||
| mov x29, x7 // reload bias ptr | |||
| mov x25, x6 // reload input_sum ptr | |||
| mov x27, x2 // reload dst ptr | |||
| ldr x28, [sp, #64] // reload filter_zp | |||
| @@ -113,7 +114,7 @@ LoopRow: | |||
| Bias: | |||
| cbz x7, NoReadBias | |||
| ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x18], #64 | |||
| ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x29], #64 | |||
| add v16.4s, v16.4s, v0.4s | |||
| add v17.4s, v17.4s, v1.4s | |||
| add v18.4s, v18.4s, v2.4s | |||
| @@ -423,8 +424,8 @@ LoopRow: | |||
| BiasHalf: | |||
| cbz x7, NoReadBiasHalf | |||
| ld1 {v0.4s, v1.4s}, [x18] | |||
| add x18, x18, #64 | |||
| ld1 {v0.4s, v1.4s}, [x29] | |||
| add x29, x29, #64 | |||
| add v16.4s, v16.4s, v0.4s | |||
| add v17.4s, v17.4s, v1.4s | |||
| add v20.4s, v20.4s, v0.4s | |||
| @@ -612,8 +613,8 @@ LoopRow: | |||
| BiasQuarter: | |||
| cbz x7, NoReadBiasQuarter | |||
| ld1 {v0.4s}, [x18] | |||
| add x18, x18, #64 | |||
| ld1 {v0.4s}, [x29] | |||
| add x29, x29, #64 | |||
| add v16.4s, v16.4s, v0.4s | |||
| add v20.4s, v20.4s, v0.4s | |||
| add v24.4s, v24.4s, v0.4s | |||
| @@ -1072,7 +1073,7 @@ LoopColEnd: | |||
| b LoopRow | |||
| LoopRowEnd: | |||
| sub sp, sp, #208 | |||
| sub sp, sp, #224 | |||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| ldp x19, x20, [sp], #16 | |||
| @@ -1080,5 +1081,6 @@ LoopRowEnd: | |||
| ldp x23, x24, [sp], #16 | |||
| ldp x25, x26, [sp], #16 | |||
| ldp x27, x28, [sp], #16 | |||
| ldp x29, x30, [sp], #16 | |||
| ret | |||
| #endif | |||
| @@ -20,9 +20,10 @@ | |||
| // x7: bias | |||
| asm_function MatMulOptR4Int8Neon64 | |||
| sub sp, sp, #128 | |||
| sub sp, sp, #144 | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| stp x19, x20, [sp], #16 | |||
| mov w15, #0 // b col index | |||
| mov w16, #0 // a row index | |||
| @@ -40,7 +41,7 @@ L2: | |||
| cmp w16, w3 | |||
| beq End2 | |||
| mov x18, x1 // reload b ptr | |||
| mov x19, x1 // reload b ptr | |||
| mov x10, x7 // reload bias ptr | |||
| mov w11, w5 // reload depth | |||
| dup v16.4s, wzr | |||
| @@ -67,10 +68,10 @@ L3: | |||
| ld1 {v1.16b}, [x17], #16 | |||
| ld1 {v2.16b}, [x17], #16 | |||
| ld1 {v3.16b}, [x17], #16 | |||
| ld1 {v4.16b}, [x18], #16 | |||
| ld1 {v5.16b}, [x18], #16 | |||
| ld1 {v6.16b}, [x18], #16 | |||
| ld1 {v7.16b}, [x18], #16 | |||
| ld1 {v4.16b}, [x19], #16 | |||
| ld1 {v5.16b}, [x19], #16 | |||
| ld1 {v6.16b}, [x19], #16 | |||
| ld1 {v7.16b}, [x19], #16 | |||
| sdot v16.4s, v4.16b, v0.16b | |||
| sdot v17.4s, v5.16b, v0.16b | |||
| @@ -135,8 +136,9 @@ End2: | |||
| b L1 | |||
| End1: | |||
| sub sp, sp, #128 | |||
| sub sp, sp, #144 | |||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| ldp x19, x20, [sp], #16 | |||
| ret | |||
| #endif | |||