|
|
|
@@ -24,8 +24,8 @@ MatmulFp16Neon64: |
|
|
|
st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 |
|
|
|
st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 |
|
|
|
|
|
|
|
mov w18, #16 // sizeof(float) * 8 |
|
|
|
mul w15, w5, w18 // block stride of lhs/rhs: sizeof(float) * 8 * depth |
|
|
|
mov w18, #16 // sizeof(float16) * 8 |
|
|
|
mul w15, w5, w18 // block stride of lhs/rhs: sizeof(float16) * 8 * depth |
|
|
|
mov x11, x3 // bias flag |
|
|
|
mov x18, #2 |
|
|
|
ldr x17, [sp] |
|
|
|
@@ -57,7 +57,7 @@ L2: |
|
|
|
dup v30.4s, wzr |
|
|
|
dup v31.4s, wzr |
|
|
|
|
|
|
|
cmp w13, #4 |
|
|
|
cmp w13, #8 |
|
|
|
blt CommLoopMul |
|
|
|
|
|
|
|
OptLoopMul8: |
|
|
|
|