|
|
@@ -1,766 +0,0 @@ |
|
|
#ifdef __aarch64__ |
|
|
|
|
|
.text |
|
|
|
|
|
.align 5 |
|
|
|
|
|
.global MatmulFloatNeon64OptRemain |
|
|
|
|
|
#ifndef __APPLE__ |
|
|
|
|
|
.type MatmulFloatNeon64OptRemain, %function |
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
|
|
// void MatmulFloatNeon64OptRemain(const float *a, const float *b, float *c, const float *bias, int act_type, int depth,
|
|
|
|
|
// int row, int col, size_t stride, size_t writeMode) |
|
|
|
|
|
// x0: a |
|
|
|
|
|
// x1: b |
|
|
|
|
|
// x2: c |
|
|
|
|
|
// x3: bias |
|
|
|
|
|
// x4: act_type |
|
|
|
|
|
// x5: depth |
|
|
|
|
|
// x6: row |
|
|
|
|
|
// x7: col |
|
|
|
|
|
// x8: stride |
|
|
|
|
|
// x9: writeMode |
|
|
|
|
|
|
|
|
|
|
|
// -----------------------------------------------------------------------------
// MatmulFloatNeon64OptRemain
//
// Computes an (up to) 8-row x 8-column tile loop of c = a * b (+ bias), applies
// an optional activation and writes the tile in one of three layouts.
//
// Register roles inside the function:
//   x0  a (lhs) base, advanced by x17 per row block
//   x1  b (rhs) base
//   x2  c (dst) base, advanced as column blocks are written
//   x3  bias base (0 => no bias)
//   x4  act_type  (2 => clamp to [0, 6], 1 => max(x, 0), other => none)
//   x5  depth     x6  row     x7  col
//   x8  stride (from stack), converted to bytes during setup
//   x9  writeMode (from stack): 0 => WriteC8, 2 => WriteWino, other => strided
//   x10 running lhs pointer      x11 running dst pointer
//   x12 running bias pointer     x13 remaining columns
//   x14 running rhs pointer      x15/x16 dst steps for the Wino / C8 layouts
//   x17 lhs block stride in bytes (depth * 12 floats)
//   x19 depth counter / temporary, x20 temporary (both callee-saved, spilled)
//
// Accumulators: row r (0..7) lives in v(8+2r) for columns 0-3 and v(9+2r) for
// columns 4-7. Each depth step consumes 48 bytes of lhs (12 floats, of which
// lanes 0-7 are used here) and 32 bytes of rhs (8 floats).
//
// ABI notes:
//   * x18 (platform register) is deliberately not used.
//   * sp stays lowered by 144 bytes for the whole body so the spilled
//     callee-saved registers can never sit below sp.
//   * NOTE(review): act_type/depth/row/col are documented as `int` but are
//     consumed as full 64-bit x registers; this relies on the caller leaving
//     the upper 32 bits clean - confirm or add explicit sign extension.
// -----------------------------------------------------------------------------
MatmulFloatNeon64OptRemain:
    // Spill callee-saved v8-v15 and x19/x20 into a 144-byte frame. A scratch
    // pointer is used for the post-increment stores so that sp itself keeps
    // pointing at the bottom of the frame.
    sub sp, sp, #144
    mov x10, sp
    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x10], #64
    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x10], #64
    stp x19, x20, [x10]

    // 9th and 10th arguments live just above the frame.
    ldr x8, [sp, #144]                  // stride
    ldr x9, [sp, #152]                  // writeMode

    mov x17, #48                        // sizeof(float) * 12
    mul x17, x5, x17                    // block stride of lhs/rhs: sizeof(float) * 12 * depth
    cbnz x9, NoC8Steps
    mov x11, x2                         // writeMode 0: dst pointer is carried across column blocks
    lsl x16, x6, #5                     // row * 8 * sizeof(float)
NoC8Steps:
    cmp x9, #2
    bne NoWinoSteps
    mul x15, x7, x8
    lsl x15, x15, #2                    // kernel_size * col * sizeof(float)
    lsl x16, x8, #5                     // kernel_size * 8 * sizeof(float)
NoWinoSteps:
    lsl x8, x8, #2                      // stride in bytes

LoopRow:
    cmp x6, #4
    ble LoopRow4                        // at most 4 rows left: use the 4-row kernels

LoopRow8:
    mov x14, x1                         // reload rhs ptr
    mov x13, x7                         // reload rhs col
    mov x12, x3                         // reload bias

LoopCol8:
    cbz x9, NoReloadDst8
    mov x11, x2                         // non-C8 modes restart dst from x2 for every column block
NoReloadDst8:
    mov x10, x0                         // reload lhs ptr
    mov x19, x5                         // reload depth

    cmp x13, #4
    ble LoopDepthStartHalf8             // at most 4 columns left: only the low half is computed

    // ---- 8 rows x 8 columns: first depth step initialises the accumulators
LoopDepthStart8:
    ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48
    ld1 {v3.4s, v4.4s}, [x14], #32
    fmul v8.4s, v3.4s, v0.s[0]
    fmul v10.4s, v3.4s, v0.s[1]
    fmul v12.4s, v3.4s, v0.s[2]
    fmul v14.4s, v3.4s, v0.s[3]
    fmul v9.4s, v4.4s, v0.s[0]
    fmul v11.4s, v4.4s, v0.s[1]
    fmul v13.4s, v4.4s, v0.s[2]
    fmul v15.4s, v4.4s, v0.s[3]
    fmul v16.4s, v3.4s, v1.s[0]
    fmul v18.4s, v3.4s, v1.s[1]
    fmul v20.4s, v3.4s, v1.s[2]
    fmul v22.4s, v3.4s, v1.s[3]
    fmul v17.4s, v4.4s, v1.s[0]
    fmul v19.4s, v4.4s, v1.s[1]
    fmul v21.4s, v4.4s, v1.s[2]
    fmul v23.4s, v4.4s, v1.s[3]

    subs x19, x19, #1
    beq Bias8

    // ---- remaining depth steps accumulate
LoopDepth8:
    ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48
    ld1 {v3.4s, v4.4s}, [x14], #32
    fmla v8.4s, v3.4s, v0.s[0]
    fmla v10.4s, v3.4s, v0.s[1]
    fmla v12.4s, v3.4s, v0.s[2]
    fmla v14.4s, v3.4s, v0.s[3]
    fmla v9.4s, v4.4s, v0.s[0]
    fmla v11.4s, v4.4s, v0.s[1]
    fmla v13.4s, v4.4s, v0.s[2]
    fmla v15.4s, v4.4s, v0.s[3]
    fmla v16.4s, v3.4s, v1.s[0]
    fmla v18.4s, v3.4s, v1.s[1]
    fmla v20.4s, v3.4s, v1.s[2]
    fmla v22.4s, v3.4s, v1.s[3]
    fmla v17.4s, v4.4s, v1.s[0]
    fmla v19.4s, v4.4s, v1.s[1]
    fmla v21.4s, v4.4s, v1.s[2]
    fmla v23.4s, v4.4s, v1.s[3]

    subs x19, x19, #1
    bgt LoopDepth8

Bias8:
    cbz x3, Activation8                 // no bias supplied
    ld1 {v0.4s}, [x12], #16             // bias for columns 0-3
    ld1 {v1.4s}, [x12], #16             // bias for columns 4-7
    fadd v8.4s, v8.4s, v0.4s
    fadd v9.4s, v9.4s, v1.4s
    fadd v10.4s, v10.4s, v0.4s
    fadd v11.4s, v11.4s, v1.4s
    fadd v12.4s, v12.4s, v0.4s
    fadd v13.4s, v13.4s, v1.4s
    fadd v14.4s, v14.4s, v0.4s
    fadd v15.4s, v15.4s, v1.4s
    fadd v16.4s, v16.4s, v0.4s
    fadd v17.4s, v17.4s, v1.4s
    fadd v18.4s, v18.4s, v0.4s
    fadd v19.4s, v19.4s, v1.4s
    fadd v20.4s, v20.4s, v0.4s
    fadd v21.4s, v21.4s, v1.4s
    fadd v22.4s, v22.4s, v0.4s
    fadd v23.4s, v23.4s, v1.4s

Activation8:
    cmp x4, #2
    beq Relu68
    cmp x4, #1
    beq Relu8
    b Write

Relu68:
    mov w19, #6                         // depth counter is finished, x19 is free
    dup v2.4s, w19
    scvtf v2.4s, v2.4s                  // v2 = {6.0f, 6.0f, 6.0f, 6.0f}
    fmin v8.4s, v8.4s, v2.4s
    fmin v9.4s, v9.4s, v2.4s
    fmin v10.4s, v10.4s, v2.4s
    fmin v11.4s, v11.4s, v2.4s
    fmin v12.4s, v12.4s, v2.4s
    fmin v13.4s, v13.4s, v2.4s
    fmin v14.4s, v14.4s, v2.4s
    fmin v15.4s, v15.4s, v2.4s
    fmin v16.4s, v16.4s, v2.4s
    fmin v17.4s, v17.4s, v2.4s
    fmin v18.4s, v18.4s, v2.4s
    fmin v19.4s, v19.4s, v2.4s
    fmin v20.4s, v20.4s, v2.4s
    fmin v21.4s, v21.4s, v2.4s
    fmin v22.4s, v22.4s, v2.4s
    fmin v23.4s, v23.4s, v2.4s
    // falls through: the upper clamp is followed by the lower clamp

Relu8:
    dup v3.4s, wzr                      // v3 = 0.0f in every lane
    fmax v8.4s, v8.4s, v3.4s
    fmax v9.4s, v9.4s, v3.4s
    fmax v10.4s, v10.4s, v3.4s
    fmax v11.4s, v11.4s, v3.4s
    fmax v12.4s, v12.4s, v3.4s
    fmax v13.4s, v13.4s, v3.4s
    fmax v14.4s, v14.4s, v3.4s
    fmax v15.4s, v15.4s, v3.4s
    fmax v16.4s, v16.4s, v3.4s
    fmax v17.4s, v17.4s, v3.4s
    fmax v18.4s, v18.4s, v3.4s
    fmax v19.4s, v19.4s, v3.4s
    fmax v20.4s, v20.4s, v3.4s
    fmax v21.4s, v21.4s, v3.4s
    fmax v22.4s, v22.4s, v3.4s
    fmax v23.4s, v23.4s, v3.4s
    b Write

    // ---- 8 rows x (up to) 4 columns: only the even accumulators are used.
    // The rhs load still fetches 32 bytes so x14 advances by a full 8-wide step.
LoopDepthStartHalf8:
    ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48
    ld1 {v3.4s, v4.4s}, [x14], #32
    fmul v8.4s, v3.4s, v0.s[0]
    fmul v10.4s, v3.4s, v0.s[1]
    fmul v12.4s, v3.4s, v0.s[2]
    fmul v14.4s, v3.4s, v0.s[3]
    fmul v16.4s, v3.4s, v1.s[0]
    fmul v18.4s, v3.4s, v1.s[1]
    fmul v20.4s, v3.4s, v1.s[2]
    fmul v22.4s, v3.4s, v1.s[3]

    subs x19, x19, #1
    beq BiasHalf8

LoopDepthHalf8:
    ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48
    ld1 {v3.4s, v4.4s}, [x14], #32
    fmla v8.4s, v3.4s, v0.s[0]
    fmla v10.4s, v3.4s, v0.s[1]
    fmla v12.4s, v3.4s, v0.s[2]
    fmla v14.4s, v3.4s, v0.s[3]
    fmla v16.4s, v3.4s, v1.s[0]
    fmla v18.4s, v3.4s, v1.s[1]
    fmla v20.4s, v3.4s, v1.s[2]
    fmla v22.4s, v3.4s, v1.s[3]

    subs x19, x19, #1
    bgt LoopDepthHalf8

BiasHalf8:
    cbz x3, ActivationHalf8
    ld1 {v0.4s}, [x12], #16
    ld1 {v1.4s}, [x12], #16             // loaded only to keep x12 advancing by 8 floats
    fadd v8.4s, v8.4s, v0.4s
    fadd v10.4s, v10.4s, v0.4s
    fadd v12.4s, v12.4s, v0.4s
    fadd v14.4s, v14.4s, v0.4s
    fadd v16.4s, v16.4s, v0.4s
    fadd v18.4s, v18.4s, v0.4s
    fadd v20.4s, v20.4s, v0.4s
    fadd v22.4s, v22.4s, v0.4s

ActivationHalf8:
    cmp x4, #2
    beq Relu6Half8
    cmp x4, #1
    beq ReluHalf8
    b Write

Relu6Half8:
    mov w19, #6
    dup v2.4s, w19
    scvtf v2.4s, v2.4s
    fmin v8.4s, v8.4s, v2.4s
    fmin v10.4s, v10.4s, v2.4s
    fmin v12.4s, v12.4s, v2.4s
    fmin v14.4s, v14.4s, v2.4s
    fmin v16.4s, v16.4s, v2.4s
    fmin v18.4s, v18.4s, v2.4s
    fmin v20.4s, v20.4s, v2.4s
    fmin v22.4s, v22.4s, v2.4s
    // falls through into the lower clamp

ReluHalf8:
    dup v3.4s, wzr
    fmax v8.4s, v8.4s, v3.4s
    fmax v10.4s, v10.4s, v3.4s
    fmax v12.4s, v12.4s, v3.4s
    fmax v14.4s, v14.4s, v3.4s
    fmax v16.4s, v16.4s, v3.4s
    fmax v18.4s, v18.4s, v3.4s
    fmax v20.4s, v20.4s, v3.4s
    fmax v22.4s, v22.4s, v3.4s
    b Write

    // ========================= at most 4 rows ================================
LoopRow4:
    mov x14, x1                         // reload rhs ptr
    mov x13, x7                         // reload rhs col
    mov x12, x3                         // reload bias

LoopCol4:
    cbz x9, NoReloadDst4
    mov x11, x2
NoReloadDst4:
    mov x10, x0                         // reload lhs ptr
    mov x19, x5                         // reload depth

    cmp x13, #4
    ble LoopDepthStartHalf4

    // ---- 4 rows x 8 columns (v8-v15 only)
LoopDepthStart4:
    ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48
    ld1 {v3.4s, v4.4s}, [x14], #32
    fmul v8.4s, v3.4s, v0.s[0]
    fmul v10.4s, v3.4s, v0.s[1]
    fmul v12.4s, v3.4s, v0.s[2]
    fmul v14.4s, v3.4s, v0.s[3]
    fmul v9.4s, v4.4s, v0.s[0]
    fmul v11.4s, v4.4s, v0.s[1]
    fmul v13.4s, v4.4s, v0.s[2]
    fmul v15.4s, v4.4s, v0.s[3]

    subs x19, x19, #1
    beq Bias4

LoopDepth4:
    ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48
    ld1 {v3.4s, v4.4s}, [x14], #32
    fmla v8.4s, v3.4s, v0.s[0]
    fmla v10.4s, v3.4s, v0.s[1]
    fmla v12.4s, v3.4s, v0.s[2]
    fmla v14.4s, v3.4s, v0.s[3]
    fmla v9.4s, v4.4s, v0.s[0]
    fmla v11.4s, v4.4s, v0.s[1]
    fmla v13.4s, v4.4s, v0.s[2]
    fmla v15.4s, v4.4s, v0.s[3]

    subs x19, x19, #1
    bgt LoopDepth4

Bias4:
    cbz x3, Activation4
    ld1 {v0.4s}, [x12], #16
    ld1 {v1.4s}, [x12], #16
    fadd v8.4s, v8.4s, v0.4s
    fadd v9.4s, v9.4s, v1.4s
    fadd v10.4s, v10.4s, v0.4s
    fadd v11.4s, v11.4s, v1.4s
    fadd v12.4s, v12.4s, v0.4s
    fadd v13.4s, v13.4s, v1.4s
    fadd v14.4s, v14.4s, v0.4s
    fadd v15.4s, v15.4s, v1.4s

Activation4:
    cmp x4, #2
    beq Relu64
    cmp x4, #1
    beq Relu4
    b Write

Relu64:
    mov w19, #6
    dup v2.4s, w19
    scvtf v2.4s, v2.4s
    fmin v8.4s, v8.4s, v2.4s
    fmin v9.4s, v9.4s, v2.4s
    fmin v10.4s, v10.4s, v2.4s
    fmin v11.4s, v11.4s, v2.4s
    fmin v12.4s, v12.4s, v2.4s
    fmin v13.4s, v13.4s, v2.4s
    fmin v14.4s, v14.4s, v2.4s
    fmin v15.4s, v15.4s, v2.4s
    // falls through into the lower clamp

Relu4:
    dup v3.4s, wzr
    fmax v8.4s, v8.4s, v3.4s
    fmax v9.4s, v9.4s, v3.4s
    fmax v10.4s, v10.4s, v3.4s
    fmax v11.4s, v11.4s, v3.4s
    fmax v12.4s, v12.4s, v3.4s
    fmax v13.4s, v13.4s, v3.4s
    fmax v14.4s, v14.4s, v3.4s
    fmax v15.4s, v15.4s, v3.4s
    b Write

    // ---- 4 rows x (up to) 4 columns (v8, v10, v12, v14 only)
LoopDepthStartHalf4:
    ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48
    ld1 {v3.4s, v4.4s}, [x14], #32
    fmul v8.4s, v3.4s, v0.s[0]
    fmul v10.4s, v3.4s, v0.s[1]
    fmul v12.4s, v3.4s, v0.s[2]
    fmul v14.4s, v3.4s, v0.s[3]

    subs x19, x19, #1
    beq BiasHalf4

LoopDepthHalf4:
    ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48
    ld1 {v3.4s, v4.4s}, [x14], #32
    fmla v8.4s, v3.4s, v0.s[0]
    fmla v10.4s, v3.4s, v0.s[1]
    fmla v12.4s, v3.4s, v0.s[2]
    fmla v14.4s, v3.4s, v0.s[3]

    subs x19, x19, #1
    bgt LoopDepthHalf4

BiasHalf4:
    cbz x3, ActivationHalf4
    ld1 {v0.4s}, [x12], #16
    ld1 {v1.4s}, [x12], #16             // loaded only to keep x12 advancing by 8 floats
    fadd v8.4s, v8.4s, v0.4s
    fadd v10.4s, v10.4s, v0.4s
    fadd v12.4s, v12.4s, v0.4s
    fadd v14.4s, v14.4s, v0.4s

ActivationHalf4:
    cmp x4, #2
    beq Relu6Half4
    cmp x4, #1
    beq ReluHalf4
    b Write

Relu6Half4:
    mov w19, #6
    dup v2.4s, w19
    scvtf v2.4s, v2.4s
    fmin v8.4s, v8.4s, v2.4s
    fmin v10.4s, v10.4s, v2.4s
    fmin v12.4s, v12.4s, v2.4s
    fmin v14.4s, v14.4s, v2.4s
    // falls through into the lower clamp

ReluHalf4:
    dup v3.4s, wzr
    fmax v8.4s, v8.4s, v3.4s
    fmax v10.4s, v10.4s, v3.4s
    fmax v12.4s, v12.4s, v3.4s
    fmax v14.4s, v14.4s, v3.4s
    // falls through into Write

    // ========================= write-back dispatch ===========================
    // writeMode 2 -> WriteWino, writeMode 0 -> WriteC8, otherwise a strided
    // write whose width is the number of columns still outstanding (x13),
    // capped at 8. Each strided writer stops after `row` (x6) rows.
Write:
    cmp x9, #2
    beq WriteWino
    cbz x9, WriteC8
    cmp x13, #1
    beq Write1
    cmp x13, #2
    beq Write2
    cmp x13, #3
    beq Write3
    cmp x13, #4
    beq Write4
    cmp x13, #5
    beq Write5
    cmp x13, #6
    beq Write6
    cmp x13, #7
    beq Write7
    b Write8

Write1:
    add x2, x2, #4
    str s8, [x11]
    cmp x6, #1
    beq WriteEnd
    add x11, x11, x8
    str s10, [x11]
    cmp x6, #2
    beq WriteEnd
    add x11, x11, x8
    str s12, [x11]
    cmp x6, #3
    beq WriteEnd
    add x11, x11, x8
    str s14, [x11]
    cmp x6, #4
    beq WriteEnd
    add x11, x11, x8
    str s16, [x11]
    cmp x6, #5
    beq WriteEnd
    add x11, x11, x8
    str s18, [x11]
    cmp x6, #6
    beq WriteEnd
    add x11, x11, x8
    str s20, [x11]
    cmp x6, #7
    beq WriteEnd
    add x11, x11, x8
    str s22, [x11]
    add x11, x11, x8
    add x11, x11, #4
    b WriteEnd

Write2:
    add x2, x2, #8
    str d8, [x11]
    cmp x6, #1
    beq WriteEnd
    add x11, x11, x8
    str d10, [x11]
    cmp x6, #2
    beq WriteEnd
    add x11, x11, x8
    str d12, [x11]
    cmp x6, #3
    beq WriteEnd
    add x11, x11, x8
    str d14, [x11]
    cmp x6, #4
    beq WriteEnd
    add x11, x11, x8
    str d16, [x11]
    cmp x6, #5
    beq WriteEnd
    add x11, x11, x8
    str d18, [x11]
    cmp x6, #6
    beq WriteEnd
    add x11, x11, x8
    str d20, [x11]
    cmp x6, #7
    beq WriteEnd
    add x11, x11, x8
    str d22, [x11]
    add x11, x11, x8
    add x11, x11, #8
    b WriteEnd

Write3:
    add x2, x2, #12
    add x19, x11, #8                    // x19 tracks the third column of each row
    str d8, [x11]
    st1 {v8.s}[2], [x19], x8
    cmp x6, #1
    beq WriteEnd
    add x11, x11, x8
    str d10, [x11]
    st1 {v10.s}[2], [x19], x8
    cmp x6, #2
    beq WriteEnd
    add x11, x11, x8
    str d12, [x11]
    st1 {v12.s}[2], [x19], x8
    cmp x6, #3
    beq WriteEnd
    add x11, x11, x8
    str d14, [x11]
    st1 {v14.s}[2], [x19], x8
    cmp x6, #4
    beq WriteEnd
    add x11, x11, x8
    str d16, [x11]
    st1 {v16.s}[2], [x19], x8
    cmp x6, #5
    beq WriteEnd
    add x11, x11, x8
    str d18, [x11]
    st1 {v18.s}[2], [x19], x8
    cmp x6, #6
    beq WriteEnd
    add x11, x11, x8
    str d20, [x11]
    st1 {v20.s}[2], [x19], x8
    cmp x6, #7
    beq WriteEnd
    add x11, x11, x8
    str d22, [x11]
    st1 {v22.s}[2], [x19], x8
    add x11, x11, x8
    add x11, x11, #12
    b WriteEnd

Write4:
    add x2, x2, #16
    st1 {v8.4s}, [x11], x8
    cmp x6, #1
    beq WriteEnd
    st1 {v10.4s}, [x11], x8
    cmp x6, #2
    beq WriteEnd
    st1 {v12.4s}, [x11], x8
    cmp x6, #3
    beq WriteEnd
    st1 {v14.4s}, [x11], x8
    cmp x6, #4
    beq WriteEnd
    st1 {v16.4s}, [x11], x8
    cmp x6, #5
    beq WriteEnd
    st1 {v18.4s}, [x11], x8
    cmp x6, #6
    beq WriteEnd
    st1 {v20.4s}, [x11], x8
    cmp x6, #7
    beq WriteEnd
    st1 {v22.4s}, [x11], x8
    add x11, x11, #16
    b WriteEnd

Write5:
    add x2, x2, #20
    add x19, x11, #16                   // x19 tracks column 4 of each row
    st1 {v8.4s}, [x11], x8
    str s9, [x19]
    cmp x6, #1
    beq WriteEnd
    add x19, x19, x8
    st1 {v10.4s}, [x11], x8
    str s11, [x19]
    cmp x6, #2
    beq WriteEnd
    add x19, x19, x8
    st1 {v12.4s}, [x11], x8
    str s13, [x19]
    cmp x6, #3
    beq WriteEnd
    add x19, x19, x8
    st1 {v14.4s}, [x11], x8
    str s15, [x19]
    cmp x6, #4
    beq WriteEnd
    add x19, x19, x8
    st1 {v16.4s}, [x11], x8
    str s17, [x19]
    cmp x6, #5
    beq WriteEnd
    add x19, x19, x8
    st1 {v18.4s}, [x11], x8
    str s19, [x19]
    cmp x6, #6
    beq WriteEnd
    add x19, x19, x8
    st1 {v20.4s}, [x11], x8
    str s21, [x19]
    cmp x6, #7
    beq WriteEnd
    add x19, x19, x8
    st1 {v22.4s}, [x11], x8
    str s23, [x19]
    add x11, x11, #20
    b WriteEnd

Write6:
    add x2, x2, #24
    add x19, x11, #16                   // x19 tracks columns 4-5 of each row
    st1 {v8.4s}, [x11], x8
    str d9, [x19]
    cmp x6, #1
    beq WriteEnd
    add x19, x19, x8
    st1 {v10.4s}, [x11], x8
    str d11, [x19]
    cmp x6, #2
    beq WriteEnd
    add x19, x19, x8
    st1 {v12.4s}, [x11], x8
    str d13, [x19]
    cmp x6, #3
    beq WriteEnd
    add x19, x19, x8
    st1 {v14.4s}, [x11], x8
    str d15, [x19]
    cmp x6, #4
    beq WriteEnd
    add x19, x19, x8
    st1 {v16.4s}, [x11], x8
    str d17, [x19]
    cmp x6, #5
    beq WriteEnd
    add x19, x19, x8
    st1 {v18.4s}, [x11], x8
    str d19, [x19]
    cmp x6, #6
    beq WriteEnd
    add x19, x19, x8
    st1 {v20.4s}, [x11], x8
    str d21, [x19]
    cmp x6, #7
    beq WriteEnd
    add x19, x19, x8
    st1 {v22.4s}, [x11], x8
    str d23, [x19]
    add x11, x11, #24
    b WriteEnd

Write7:
    add x2, x2, #28
    add x19, x11, #16                   // x19 tracks columns 4-5 of each row
    add x20, x11, #24                   // x20 tracks column 6 of each row
    st1 {v8.4s}, [x11], x8
    str d9, [x19]
    st1 {v9.s}[2], [x20], x8
    cmp x6, #1
    beq WriteEnd
    add x19, x19, x8
    st1 {v10.4s}, [x11], x8
    str d11, [x19]
    st1 {v11.s}[2], [x20], x8
    cmp x6, #2
    beq WriteEnd
    add x19, x19, x8
    st1 {v12.4s}, [x11], x8
    str d13, [x19]
    st1 {v13.s}[2], [x20], x8
    cmp x6, #3
    beq WriteEnd
    add x19, x19, x8
    st1 {v14.4s}, [x11], x8
    str d15, [x19]
    st1 {v15.s}[2], [x20], x8
    cmp x6, #4
    beq WriteEnd
    add x19, x19, x8
    st1 {v16.4s}, [x11], x8
    str d17, [x19]
    st1 {v17.s}[2], [x20], x8
    cmp x6, #5
    beq WriteEnd
    add x19, x19, x8
    st1 {v18.4s}, [x11], x8
    str d19, [x19]
    st1 {v19.s}[2], [x20], x8
    cmp x6, #6
    beq WriteEnd
    add x19, x19, x8
    st1 {v20.4s}, [x11], x8
    str d21, [x19]
    st1 {v21.s}[2], [x20], x8
    cmp x6, #7
    beq WriteEnd
    add x19, x19, x8
    st1 {v22.4s}, [x11], x8
    str d23, [x19]
    st1 {v23.s}[2], [x20], x8
    add x11, x11, #28
    b WriteEnd

    // writeMode 0: all 16 accumulators are stored as one contiguous 256-byte
    // tile, then dst advances by x16 (row * 8 floats) to the next column block.
WriteC8:
    mov x19, x11
    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x19], #64
    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x19], #64
    st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x19], #64
    st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x19], #64
    add x11, x11, x16
    b WriteEnd

    // writeMode 2: each row (8 floats) is stored x15 bytes apart; the next
    // column block starts x16 bytes after the current one.
WriteWino:
    add x2, x11, x16
    st1 {v8.4s, v9.4s}, [x11], x15
    st1 {v10.4s, v11.4s}, [x11], x15
    st1 {v12.4s, v13.4s}, [x11], x15
    st1 {v14.4s, v15.4s}, [x11], x15
    st1 {v16.4s, v17.4s}, [x11], x15
    st1 {v18.4s, v19.4s}, [x11], x15
    st1 {v20.4s, v21.4s}, [x11], x15
    st1 {v22.4s, v23.4s}, [x11], x15
    b WriteEnd

Write8:
    add x2, x2, #32
    st1 {v8.4s, v9.4s}, [x11], x8
    cmp x6, #1
    beq WriteEnd
    st1 {v10.4s, v11.4s}, [x11], x8
    cmp x6, #2
    beq WriteEnd
    st1 {v12.4s, v13.4s}, [x11], x8
    cmp x6, #3
    beq WriteEnd
    st1 {v14.4s, v15.4s}, [x11], x8
    cmp x6, #4
    beq WriteEnd
    st1 {v16.4s, v17.4s}, [x11], x8
    cmp x6, #5
    beq WriteEnd
    st1 {v18.4s, v19.4s}, [x11], x8
    cmp x6, #6
    beq WriteEnd
    st1 {v20.4s, v21.4s}, [x11], x8
    cmp x6, #7
    beq WriteEnd
    st1 {v22.4s, v23.4s}, [x11], x8
    add x11, x11, #32
    // falls through into WriteEnd

WriteEnd:
    subs x13, x13, #8                   // rhs col - 8
    ble LoopColEnd
    cmp x6, #4
    ble LoopCol4
    b LoopCol8

LoopColEnd:
    add x0, x0, x17                     // next lhs row block
    cbz x9, C8DstStep
    sub x11, x11, x7, lsl #2            // rewind dst by col * sizeof(float)
    mov x2, x11
    b NoDstStep
C8DstStep:
    add x2, x2, #384                    // presumably 12 rows * 8 floats * 4 bytes - confirm against the packer
    mov x11, x2
NoDstStep:
    subs x6, x6, #12
    bgt LoopRow

    // Restore callee-saved registers; the post-increments walk sp back up to
    // its value on entry (64 + 64 + 16 = 144).
    ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
    ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
    ldp x19, x20, [sp], #16
    ret
|
|
|
|
|
#endif |
|
|
|