| @@ -158,36 +158,28 @@ HEIGHT1_LOOP: | |||
| smlal v23.4s, v8.4h, v20.4h | |||
| smlal2 v24.4s, v8.8h, v20.8h | |||
| // Apply left shift | |||
| cbz w12, SKIP_LEFTSHIFT1 | |||
| sqshl v21.4s, v21.4s, v26.4s | |||
| sqshl v22.4s, v22.4s, v26.4s | |||
| sqshl v23.4s, v23.4s, v26.4s | |||
| sqshl v24.4s, v24.4s, v26.4s | |||
| // Apply the fixed-point part of the multiplier. | |||
| sqrdmulh v21.4s, v21.4s, v27.4s | |||
| sqrdmulh v22.4s, v22.4s, v27.4s | |||
| sqrdmulh v23.4s, v23.4s, v27.4s | |||
| sqrdmulh v24.4s, v24.4s, v27.4s | |||
| b OUTZP1 | |||
| // Apply right shift | |||
| and v12.16b, v28.16b, v21.16b | |||
| sshr v12.4s, v12.4s, #31 | |||
| sqadd v21.4s, v21.4s, v12.4s | |||
| srshl v21.4s, v21.4s, v28.4s | |||
| and v16.16b, v28.16b, v22.16b | |||
| sshr v16.4s, v16.4s, #31 | |||
| sqadd v22.4s, v22.4s, v16.4s | |||
| srshl v22.4s, v22.4s, v28.4s | |||
| and v20.16b, v28.16b, v23.16b | |||
| sshr v20.4s, v20.4s, #31 | |||
| sqadd v23.4s, v23.4s, v20.4s | |||
| srshl v23.4s, v23.4s, v28.4s | |||
| and v12.16b, v28.16b, v24.16b | |||
| sshr v12.4s, v12.4s, #31 | |||
| sqadd v24.4s, v24.4s, v12.4s | |||
| srshl v24.4s, v24.4s, v28.4s | |||
| SKIP_LEFTSHIFT1: | |||
| sqrdmulh v21.4s, v21.4s, v27.4s | |||
| sqrdmulh v22.4s, v22.4s, v27.4s | |||
| sqrdmulh v23.4s, v23.4s, v27.4s | |||
| sqrdmulh v24.4s, v24.4s, v27.4s | |||
| sqrshl v21.4s, v21.4s, v28.4s | |||
| sqrshl v22.4s, v22.4s, v28.4s | |||
| sqrshl v23.4s, v23.4s, v28.4s | |||
| sqrshl v24.4s, v24.4s, v28.4s | |||
| OUTZP1: | |||
| // Add output zero point | |||
| sqadd v21.4s, v21.4s, v29.4s | |||
| sqadd v22.4s, v22.4s, v29.4s | |||
| @@ -279,36 +271,28 @@ WIDTH2_LEFT: | |||
| smlal v23.4s, v8.4h, v20.4h | |||
| smlal2 v24.4s, v8.8h, v20.8h | |||
| // Apply left shift | |||
| cbz w12, SKIP_LEFTSHIFT2 | |||
| sqshl v21.4s, v21.4s, v26.4s | |||
| sqshl v22.4s, v22.4s, v26.4s | |||
| sqshl v23.4s, v23.4s, v26.4s | |||
| sqshl v24.4s, v24.4s, v26.4s | |||
| // Apply the fixed-point part of the multiplier. | |||
| sqrdmulh v21.4s, v21.4s, v27.4s | |||
| sqrdmulh v22.4s, v22.4s, v27.4s | |||
| sqrdmulh v23.4s, v23.4s, v27.4s | |||
| sqrdmulh v24.4s, v24.4s, v27.4s | |||
| b OUTZP2 | |||
| // Apply right shift | |||
| and v9.16b, v28.16b, v21.16b | |||
| sshr v9.4s, v9.4s, #31 | |||
| sqadd v21.4s, v21.4s, v9.4s | |||
| srshl v21.4s, v21.4s, v28.4s | |||
| and v10.16b, v28.16b, v22.16b | |||
| sshr v10.4s, v10.4s, #31 | |||
| sqadd v22.4s, v22.4s, v10.4s | |||
| srshl v22.4s, v22.4s, v28.4s | |||
| and v11.16b, v28.16b, v23.16b | |||
| sshr v11.4s, v11.4s, #31 | |||
| sqadd v23.4s, v23.4s, v11.4s | |||
| srshl v23.4s, v23.4s, v28.4s | |||
| and v12.16b, v28.16b, v24.16b | |||
| sshr v12.4s, v12.4s, #31 | |||
| sqadd v24.4s, v24.4s, v12.4s | |||
| srshl v24.4s, v24.4s, v28.4s | |||
| SKIP_LEFTSHIFT2: | |||
| sqrdmulh v21.4s, v21.4s, v27.4s | |||
| sqrdmulh v22.4s, v22.4s, v27.4s | |||
| sqrdmulh v23.4s, v23.4s, v27.4s | |||
| sqrdmulh v24.4s, v24.4s, v27.4s | |||
| sqrshl v21.4s, v21.4s, v28.4s | |||
| sqrshl v22.4s, v22.4s, v28.4s | |||
| sqrshl v23.4s, v23.4s, v28.4s | |||
| sqrshl v24.4s, v24.4s, v28.4s | |||
| OUTZP2: | |||
| // Add output zero point | |||
| sqadd v21.4s, v21.4s, v29.4s | |||
| sqadd v22.4s, v22.4s, v29.4s | |||
| @@ -358,24 +342,20 @@ WIDTH1_LEFT: | |||
| smlal v21.4s, v8.4h, v19.4h | |||
| smlal2 v22.4s, v8.8h, v19.8h | |||
| // Apply left shift | |||
| cbz w12, SKIP_LEFTSHIFT3 | |||
| sqshl v21.4s, v21.4s, v26.4s | |||
| sqshl v22.4s, v22.4s, v26.4s | |||
| // Apply the fixed-point part of the multiplier. | |||
| sqrdmulh v21.4s, v21.4s, v27.4s | |||
| sqrdmulh v22.4s, v22.4s, v27.4s | |||
| b OUTZP3 | |||
| // Apply right shift | |||
| and v9.16b, v28.16b, v21.16b | |||
| sshr v9.4s, v9.4s, #31 | |||
| sqadd v21.4s, v21.4s, v9.4s | |||
| srshl v21.4s, v21.4s, v28.4s | |||
| and v10.16b, v28.16b, v22.16b | |||
| sshr v10.4s, v10.4s, #31 | |||
| sqadd v22.4s, v22.4s, v10.4s | |||
| srshl v22.4s, v22.4s, v28.4s | |||
| SKIP_LEFTSHIFT3: | |||
| sqrdmulh v21.4s, v21.4s, v27.4s | |||
| sqrdmulh v22.4s, v22.4s, v27.4s | |||
| sqrshl v21.4s, v21.4s, v28.4s | |||
| sqrshl v22.4s, v22.4s, v28.4s | |||
| OUTZP3: | |||
| // Add output zero point | |||
| sqadd v21.4s, v21.4s, v29.4s | |||
| sqadd v22.4s, v22.4s, v29.4s | |||