|
|
@@ -158,36 +158,28 @@ HEIGHT1_LOOP: |
|
|
smlal v23.4s, v8.4h, v20.4h |
|
|
smlal v23.4s, v8.4h, v20.4h |
|
|
smlal2 v24.4s, v8.8h, v20.8h |
|
|
smlal2 v24.4s, v8.8h, v20.8h |
|
|
|
|
|
|
|
|
// Apply left shift |
|
|
|
|
|
|
|
|
cbz w12, SKIP_LEFTSHIFT1 |
|
|
sqshl v21.4s, v21.4s, v26.4s |
|
|
sqshl v21.4s, v21.4s, v26.4s |
|
|
sqshl v22.4s, v22.4s, v26.4s |
|
|
sqshl v22.4s, v22.4s, v26.4s |
|
|
sqshl v23.4s, v23.4s, v26.4s |
|
|
sqshl v23.4s, v23.4s, v26.4s |
|
|
sqshl v24.4s, v24.4s, v26.4s |
|
|
sqshl v24.4s, v24.4s, v26.4s |
|
|
|
|
|
|
|
|
// Apply the fixed-point part of the multiplier. |
|
|
|
|
|
sqrdmulh v21.4s, v21.4s, v27.4s |
|
|
sqrdmulh v21.4s, v21.4s, v27.4s |
|
|
sqrdmulh v22.4s, v22.4s, v27.4s |
|
|
sqrdmulh v22.4s, v22.4s, v27.4s |
|
|
sqrdmulh v23.4s, v23.4s, v27.4s |
|
|
sqrdmulh v23.4s, v23.4s, v27.4s |
|
|
sqrdmulh v24.4s, v24.4s, v27.4s |
|
|
sqrdmulh v24.4s, v24.4s, v27.4s |
|
|
|
|
|
b OUTZP1 |
|
|
|
|
|
|
|
|
// Apply right shift |
|
|
|
|
|
and v12.16b, v28.16b, v21.16b |
|
|
|
|
|
sshr v12.4s, v12.4s, #31 |
|
|
|
|
|
sqadd v21.4s, v21.4s, v12.4s |
|
|
|
|
|
srshl v21.4s, v21.4s, v28.4s |
|
|
|
|
|
and v16.16b, v28.16b, v22.16b |
|
|
|
|
|
sshr v16.4s, v16.4s, #31 |
|
|
|
|
|
sqadd v22.4s, v22.4s, v16.4s |
|
|
|
|
|
srshl v22.4s, v22.4s, v28.4s |
|
|
|
|
|
and v20.16b, v28.16b, v23.16b |
|
|
|
|
|
sshr v20.4s, v20.4s, #31 |
|
|
|
|
|
sqadd v23.4s, v23.4s, v20.4s |
|
|
|
|
|
srshl v23.4s, v23.4s, v28.4s |
|
|
|
|
|
and v12.16b, v28.16b, v24.16b |
|
|
|
|
|
sshr v12.4s, v12.4s, #31 |
|
|
|
|
|
sqadd v24.4s, v24.4s, v12.4s |
|
|
|
|
|
srshl v24.4s, v24.4s, v28.4s |
|
|
|
|
|
|
|
|
SKIP_LEFTSHIFT1: |
|
|
|
|
|
sqrdmulh v21.4s, v21.4s, v27.4s |
|
|
|
|
|
sqrdmulh v22.4s, v22.4s, v27.4s |
|
|
|
|
|
sqrdmulh v23.4s, v23.4s, v27.4s |
|
|
|
|
|
sqrdmulh v24.4s, v24.4s, v27.4s |
|
|
|
|
|
sqrshl v21.4s, v21.4s, v28.4s |
|
|
|
|
|
sqrshl v22.4s, v22.4s, v28.4s |
|
|
|
|
|
sqrshl v23.4s, v23.4s, v28.4s |
|
|
|
|
|
sqrshl v24.4s, v24.4s, v28.4s |
|
|
|
|
|
|
|
|
|
|
|
OUTZP1: |
|
|
// Add output zero point |
|
|
// Add output zero point |
|
|
sqadd v21.4s, v21.4s, v29.4s |
|
|
sqadd v21.4s, v21.4s, v29.4s |
|
|
sqadd v22.4s, v22.4s, v29.4s |
|
|
sqadd v22.4s, v22.4s, v29.4s |
|
|
@@ -279,36 +271,28 @@ WIDTH2_LEFT: |
|
|
smlal v23.4s, v8.4h, v20.4h |
|
|
smlal v23.4s, v8.4h, v20.4h |
|
|
smlal2 v24.4s, v8.8h, v20.8h |
|
|
smlal2 v24.4s, v8.8h, v20.8h |
|
|
|
|
|
|
|
|
// Apply left shift |
|
|
|
|
|
|
|
|
cbz w12, SKIP_LEFTSHIFT2 |
|
|
sqshl v21.4s, v21.4s, v26.4s |
|
|
sqshl v21.4s, v21.4s, v26.4s |
|
|
sqshl v22.4s, v22.4s, v26.4s |
|
|
sqshl v22.4s, v22.4s, v26.4s |
|
|
sqshl v23.4s, v23.4s, v26.4s |
|
|
sqshl v23.4s, v23.4s, v26.4s |
|
|
sqshl v24.4s, v24.4s, v26.4s |
|
|
sqshl v24.4s, v24.4s, v26.4s |
|
|
|
|
|
|
|
|
// Apply the fixed-point part of the multiplier. |
|
|
|
|
|
sqrdmulh v21.4s, v21.4s, v27.4s |
|
|
sqrdmulh v21.4s, v21.4s, v27.4s |
|
|
sqrdmulh v22.4s, v22.4s, v27.4s |
|
|
sqrdmulh v22.4s, v22.4s, v27.4s |
|
|
sqrdmulh v23.4s, v23.4s, v27.4s |
|
|
sqrdmulh v23.4s, v23.4s, v27.4s |
|
|
sqrdmulh v24.4s, v24.4s, v27.4s |
|
|
sqrdmulh v24.4s, v24.4s, v27.4s |
|
|
|
|
|
b OUTZP2 |
|
|
|
|
|
|
|
|
// Apply right shift |
|
|
|
|
|
and v9.16b, v28.16b, v21.16b |
|
|
|
|
|
sshr v9.4s, v9.4s, #31 |
|
|
|
|
|
sqadd v21.4s, v21.4s, v9.4s |
|
|
|
|
|
srshl v21.4s, v21.4s, v28.4s |
|
|
|
|
|
and v10.16b, v28.16b, v22.16b |
|
|
|
|
|
sshr v10.4s, v10.4s, #31 |
|
|
|
|
|
sqadd v22.4s, v22.4s, v10.4s |
|
|
|
|
|
srshl v22.4s, v22.4s, v28.4s |
|
|
|
|
|
and v11.16b, v28.16b, v23.16b |
|
|
|
|
|
sshr v11.4s, v11.4s, #31 |
|
|
|
|
|
sqadd v23.4s, v23.4s, v11.4s |
|
|
|
|
|
srshl v23.4s, v23.4s, v28.4s |
|
|
|
|
|
and v12.16b, v28.16b, v24.16b |
|
|
|
|
|
sshr v12.4s, v12.4s, #31 |
|
|
|
|
|
sqadd v24.4s, v24.4s, v12.4s |
|
|
|
|
|
srshl v24.4s, v24.4s, v28.4s |
|
|
|
|
|
|
|
|
SKIP_LEFTSHIFT2: |
|
|
|
|
|
sqrdmulh v21.4s, v21.4s, v27.4s |
|
|
|
|
|
sqrdmulh v22.4s, v22.4s, v27.4s |
|
|
|
|
|
sqrdmulh v23.4s, v23.4s, v27.4s |
|
|
|
|
|
sqrdmulh v24.4s, v24.4s, v27.4s |
|
|
|
|
|
sqrshl v21.4s, v21.4s, v28.4s |
|
|
|
|
|
sqrshl v22.4s, v22.4s, v28.4s |
|
|
|
|
|
sqrshl v23.4s, v23.4s, v28.4s |
|
|
|
|
|
sqrshl v24.4s, v24.4s, v28.4s |
|
|
|
|
|
|
|
|
|
|
|
OUTZP2: |
|
|
// Add output zero point |
|
|
// Add output zero point |
|
|
sqadd v21.4s, v21.4s, v29.4s |
|
|
sqadd v21.4s, v21.4s, v29.4s |
|
|
sqadd v22.4s, v22.4s, v29.4s |
|
|
sqadd v22.4s, v22.4s, v29.4s |
|
|
@@ -358,24 +342,20 @@ WIDTH1_LEFT: |
|
|
smlal v21.4s, v8.4h, v19.4h |
|
|
smlal v21.4s, v8.4h, v19.4h |
|
|
smlal2 v22.4s, v8.8h, v19.8h |
|
|
smlal2 v22.4s, v8.8h, v19.8h |
|
|
|
|
|
|
|
|
// Apply left shift |
|
|
|
|
|
|
|
|
cbz w12, SKIP_LEFTSHIFT3 |
|
|
sqshl v21.4s, v21.4s, v26.4s |
|
|
sqshl v21.4s, v21.4s, v26.4s |
|
|
sqshl v22.4s, v22.4s, v26.4s |
|
|
sqshl v22.4s, v22.4s, v26.4s |
|
|
|
|
|
|
|
|
// Apply the fixed-point part of the multiplier. |
|
|
|
|
|
sqrdmulh v21.4s, v21.4s, v27.4s |
|
|
sqrdmulh v21.4s, v21.4s, v27.4s |
|
|
sqrdmulh v22.4s, v22.4s, v27.4s |
|
|
sqrdmulh v22.4s, v22.4s, v27.4s |
|
|
|
|
|
b OUTZP3 |
|
|
|
|
|
|
|
|
// Apply right shift |
|
|
|
|
|
and v9.16b, v28.16b, v21.16b |
|
|
|
|
|
sshr v9.4s, v9.4s, #31 |
|
|
|
|
|
sqadd v21.4s, v21.4s, v9.4s |
|
|
|
|
|
srshl v21.4s, v21.4s, v28.4s |
|
|
|
|
|
and v10.16b, v28.16b, v22.16b |
|
|
|
|
|
sshr v10.4s, v10.4s, #31 |
|
|
|
|
|
sqadd v22.4s, v22.4s, v10.4s |
|
|
|
|
|
srshl v22.4s, v22.4s, v28.4s |
|
|
|
|
|
|
|
|
SKIP_LEFTSHIFT3: |
|
|
|
|
|
sqrdmulh v21.4s, v21.4s, v27.4s |
|
|
|
|
|
sqrdmulh v22.4s, v22.4s, v27.4s |
|
|
|
|
|
sqrshl v21.4s, v21.4s, v28.4s |
|
|
|
|
|
sqrshl v22.4s, v22.4s, v28.4s |
|
|
|
|
|
|
|
|
|
|
|
OUTZP3: |
|
|
// Add output zero point |
|
|
// Add output zero point |
|
|
sqadd v21.4s, v21.4s, v29.4s |
|
|
sqadd v21.4s, v21.4s, v29.4s |
|
|
sqadd v22.4s, v22.4s, v29.4s |
|
|
sqadd v22.4s, v22.4s, v29.4s |
|
|
|