Merge pull request !4209 from lixian/master [tags/v0.7.0-beta]
| @@ -127,102 +127,102 @@ ConvDwInt8Center: | |||
| subs x20, x20, #1 | |||
| bne LoopKh16 | |||
| sqshl v0.4s, v0.4s ,v26.4s | |||
| sqshl v1.4s, v1.4s ,v26.4s | |||
| sqshl v2.4s, v2.4s ,v26.4s | |||
| sqshl v3.4s, v3.4s ,v26.4s | |||
| sqshl v4.4s, v4.4s ,v26.4s | |||
| sqshl v5.4s, v5.4s ,v26.4s | |||
| sqshl v6.4s, v6.4s ,v26.4s | |||
| sqshl v7.4s, v7.4s ,v26.4s | |||
| sqshl v8.4s, v8.4s ,v26.4s | |||
| sqshl v9.4s, v9.4s ,v26.4s | |||
| sqshl v10.4s, v10.4s ,v26.4s | |||
| sqshl v11.4s, v11.4s ,v26.4s | |||
| sqshl v12.4s, v12.4s ,v26.4s | |||
| sqshl v13.4s, v13.4s ,v26.4s | |||
| sqshl v14.4s, v14.4s ,v26.4s | |||
| sqshl v15.4s, v15.4s ,v26.4s | |||
| sqrdmulh v0.4s, v0.4s ,v27.4s | |||
| sqrdmulh v1.4s, v1.4s ,v27.4s | |||
| sqrdmulh v2.4s, v2.4s ,v27.4s | |||
| sqrdmulh v3.4s, v3.4s ,v27.4s | |||
| sqrdmulh v4.4s, v4.4s ,v27.4s | |||
| sqrdmulh v5.4s, v5.4s ,v27.4s | |||
| sqrdmulh v6.4s, v6.4s ,v27.4s | |||
| sqrdmulh v7.4s, v7.4s ,v27.4s | |||
| sqrdmulh v8.4s, v8.4s ,v27.4s | |||
| sqrdmulh v9.4s, v9.4s ,v27.4s | |||
| sqrdmulh v10.4s, v10.4s ,v27.4s | |||
| sqrdmulh v11.4s, v11.4s ,v27.4s | |||
| sqrdmulh v12.4s, v12.4s ,v27.4s | |||
| sqrdmulh v13.4s, v13.4s ,v27.4s | |||
| sqrdmulh v14.4s, v14.4s ,v27.4s | |||
| sqrdmulh v15.4s, v15.4s ,v27.4s | |||
| sqrshl v0.4s, v0.4s ,v28.4s | |||
| sqrshl v1.4s, v1.4s ,v28.4s | |||
| sqrshl v2.4s, v2.4s ,v28.4s | |||
| sqrshl v3.4s, v3.4s ,v28.4s | |||
| sqrshl v4.4s, v4.4s ,v28.4s | |||
| sqrshl v5.4s, v5.4s ,v28.4s | |||
| sqrshl v6.4s, v6.4s ,v28.4s | |||
| sqrshl v7.4s, v7.4s ,v28.4s | |||
| sqrshl v8.4s, v8.4s ,v28.4s | |||
| sqrshl v9.4s, v9.4s ,v28.4s | |||
| sqrshl v10.4s, v10.4s ,v28.4s | |||
| sqrshl v11.4s, v11.4s ,v28.4s | |||
| sqrshl v12.4s, v12.4s ,v28.4s | |||
| sqrshl v13.4s, v13.4s ,v28.4s | |||
| sqrshl v14.4s, v14.4s ,v28.4s | |||
| sqrshl v15.4s, v15.4s ,v28.4s | |||
| add v0.4s, v0.4s ,v29.4s | |||
| add v1.4s, v1.4s ,v29.4s | |||
| add v2.4s, v2.4s ,v29.4s | |||
| add v3.4s, v3.4s ,v29.4s | |||
| add v4.4s, v4.4s ,v29.4s | |||
| add v5.4s, v5.4s ,v29.4s | |||
| add v6.4s, v6.4s ,v29.4s | |||
| add v7.4s, v7.4s ,v29.4s | |||
| add v8.4s, v8.4s ,v29.4s | |||
| add v9.4s, v9.4s ,v29.4s | |||
| add v10.4s, v10.4s ,v29.4s | |||
| add v11.4s, v11.4s ,v29.4s | |||
| add v12.4s, v12.4s ,v29.4s | |||
| add v13.4s, v13.4s ,v29.4s | |||
| add v14.4s, v14.4s ,v29.4s | |||
| add v15.4s, v15.4s ,v29.4s | |||
| smax v0.4s, v0.4s ,v30.4s | |||
| smax v1.4s, v1.4s ,v30.4s | |||
| smax v2.4s, v2.4s ,v30.4s | |||
| smax v3.4s, v3.4s ,v30.4s | |||
| smax v4.4s, v4.4s ,v30.4s | |||
| smax v5.4s, v5.4s ,v30.4s | |||
| smax v6.4s, v6.4s ,v30.4s | |||
| smax v7.4s, v7.4s ,v30.4s | |||
| smax v8.4s, v8.4s ,v30.4s | |||
| smax v9.4s, v9.4s ,v30.4s | |||
| smax v10.4s, v10.4s ,v30.4s | |||
| smax v11.4s, v11.4s ,v30.4s | |||
| smax v12.4s, v12.4s ,v30.4s | |||
| smax v13.4s, v13.4s ,v30.4s | |||
| smax v14.4s, v14.4s ,v30.4s | |||
| smax v15.4s, v15.4s ,v30.4s | |||
| smin v0.4s, v0.4s ,v31.4s | |||
| smin v1.4s, v1.4s ,v31.4s | |||
| smin v2.4s, v2.4s ,v31.4s | |||
| smin v3.4s, v3.4s ,v31.4s | |||
| smin v4.4s, v4.4s ,v31.4s | |||
| smin v5.4s, v5.4s ,v31.4s | |||
| smin v6.4s, v6.4s ,v31.4s | |||
| smin v7.4s, v7.4s ,v31.4s | |||
| smin v8.4s, v8.4s ,v31.4s | |||
| smin v9.4s, v9.4s ,v31.4s | |||
| smin v10.4s, v10.4s ,v31.4s | |||
| smin v11.4s, v11.4s ,v31.4s | |||
| smin v12.4s, v12.4s ,v31.4s | |||
| smin v13.4s, v13.4s ,v31.4s | |||
| smin v14.4s, v14.4s ,v31.4s | |||
| smin v15.4s, v15.4s ,v31.4s | |||
| sqshl v0.4s, v0.4s, v26.4s | |||
| sqshl v1.4s, v1.4s, v26.4s | |||
| sqshl v2.4s, v2.4s, v26.4s | |||
| sqshl v3.4s, v3.4s, v26.4s | |||
| sqshl v4.4s, v4.4s, v26.4s | |||
| sqshl v5.4s, v5.4s, v26.4s | |||
| sqshl v6.4s, v6.4s, v26.4s | |||
| sqshl v7.4s, v7.4s, v26.4s | |||
| sqshl v8.4s, v8.4s, v26.4s | |||
| sqshl v9.4s, v9.4s, v26.4s | |||
| sqshl v10.4s, v10.4s, v26.4s | |||
| sqshl v11.4s, v11.4s, v26.4s | |||
| sqshl v12.4s, v12.4s, v26.4s | |||
| sqshl v13.4s, v13.4s, v26.4s | |||
| sqshl v14.4s, v14.4s, v26.4s | |||
| sqshl v15.4s, v15.4s, v26.4s | |||
| sqrdmulh v0.4s, v0.4s, v27.4s | |||
| sqrdmulh v1.4s, v1.4s, v27.4s | |||
| sqrdmulh v2.4s, v2.4s, v27.4s | |||
| sqrdmulh v3.4s, v3.4s, v27.4s | |||
| sqrdmulh v4.4s, v4.4s, v27.4s | |||
| sqrdmulh v5.4s, v5.4s, v27.4s | |||
| sqrdmulh v6.4s, v6.4s, v27.4s | |||
| sqrdmulh v7.4s, v7.4s, v27.4s | |||
| sqrdmulh v8.4s, v8.4s, v27.4s | |||
| sqrdmulh v9.4s, v9.4s, v27.4s | |||
| sqrdmulh v10.4s, v10.4s, v27.4s | |||
| sqrdmulh v11.4s, v11.4s, v27.4s | |||
| sqrdmulh v12.4s, v12.4s, v27.4s | |||
| sqrdmulh v13.4s, v13.4s, v27.4s | |||
| sqrdmulh v14.4s, v14.4s, v27.4s | |||
| sqrdmulh v15.4s, v15.4s, v27.4s | |||
| sqrshl v0.4s, v0.4s, v28.4s | |||
| sqrshl v1.4s, v1.4s, v28.4s | |||
| sqrshl v2.4s, v2.4s, v28.4s | |||
| sqrshl v3.4s, v3.4s, v28.4s | |||
| sqrshl v4.4s, v4.4s, v28.4s | |||
| sqrshl v5.4s, v5.4s, v28.4s | |||
| sqrshl v6.4s, v6.4s, v28.4s | |||
| sqrshl v7.4s, v7.4s, v28.4s | |||
| sqrshl v8.4s, v8.4s, v28.4s | |||
| sqrshl v9.4s, v9.4s, v28.4s | |||
| sqrshl v10.4s, v10.4s, v28.4s | |||
| sqrshl v11.4s, v11.4s, v28.4s | |||
| sqrshl v12.4s, v12.4s, v28.4s | |||
| sqrshl v13.4s, v13.4s, v28.4s | |||
| sqrshl v14.4s, v14.4s, v28.4s | |||
| sqrshl v15.4s, v15.4s, v28.4s | |||
| add v0.4s, v0.4s, v29.4s | |||
| add v1.4s, v1.4s, v29.4s | |||
| add v2.4s, v2.4s, v29.4s | |||
| add v3.4s, v3.4s, v29.4s | |||
| add v4.4s, v4.4s, v29.4s | |||
| add v5.4s, v5.4s, v29.4s | |||
| add v6.4s, v6.4s, v29.4s | |||
| add v7.4s, v7.4s, v29.4s | |||
| add v8.4s, v8.4s, v29.4s | |||
| add v9.4s, v9.4s, v29.4s | |||
| add v10.4s, v10.4s, v29.4s | |||
| add v11.4s, v11.4s, v29.4s | |||
| add v12.4s, v12.4s, v29.4s | |||
| add v13.4s, v13.4s, v29.4s | |||
| add v14.4s, v14.4s, v29.4s | |||
| add v15.4s, v15.4s, v29.4s | |||
| smax v0.4s, v0.4s, v30.4s | |||
| smax v1.4s, v1.4s, v30.4s | |||
| smax v2.4s, v2.4s, v30.4s | |||
| smax v3.4s, v3.4s, v30.4s | |||
| smax v4.4s, v4.4s, v30.4s | |||
| smax v5.4s, v5.4s, v30.4s | |||
| smax v6.4s, v6.4s, v30.4s | |||
| smax v7.4s, v7.4s, v30.4s | |||
| smax v8.4s, v8.4s, v30.4s | |||
| smax v9.4s, v9.4s, v30.4s | |||
| smax v10.4s, v10.4s, v30.4s | |||
| smax v11.4s, v11.4s, v30.4s | |||
| smax v12.4s, v12.4s, v30.4s | |||
| smax v13.4s, v13.4s, v30.4s | |||
| smax v14.4s, v14.4s, v30.4s | |||
| smax v15.4s, v15.4s, v30.4s | |||
| smin v0.4s, v0.4s, v31.4s | |||
| smin v1.4s, v1.4s, v31.4s | |||
| smin v2.4s, v2.4s, v31.4s | |||
| smin v3.4s, v3.4s, v31.4s | |||
| smin v4.4s, v4.4s, v31.4s | |||
| smin v5.4s, v5.4s, v31.4s | |||
| smin v6.4s, v6.4s, v31.4s | |||
| smin v7.4s, v7.4s, v31.4s | |||
| smin v8.4s, v8.4s, v31.4s | |||
| smin v9.4s, v9.4s, v31.4s | |||
| smin v10.4s, v10.4s, v31.4s | |||
| smin v11.4s, v11.4s, v31.4s | |||
| smin v12.4s, v12.4s, v31.4s | |||
| smin v13.4s, v13.4s, v31.4s | |||
| smin v14.4s, v14.4s, v31.4s | |||
| smin v15.4s, v15.4s, v31.4s | |||
| sqxtn v0.4h, v0.4s | |||
| sqxtn v1.4h, v1.4s | |||
| @@ -391,54 +391,54 @@ ConvDwInt8Center: | |||
| subs x20, x20, #1 | |||
| bne LoopKh8 | |||
| sqshl v0.4s, v0.4s ,v26.4s | |||
| sqshl v1.4s, v1.4s ,v26.4s | |||
| sqshl v2.4s, v2.4s ,v26.4s | |||
| sqshl v3.4s, v3.4s ,v26.4s | |||
| sqshl v4.4s, v4.4s ,v26.4s | |||
| sqshl v5.4s, v5.4s ,v26.4s | |||
| sqshl v6.4s, v6.4s ,v26.4s | |||
| sqshl v7.4s, v7.4s ,v26.4s | |||
| sqrdmulh v0.4s, v0.4s ,v27.4s | |||
| sqrdmulh v1.4s, v1.4s ,v27.4s | |||
| sqrdmulh v2.4s, v2.4s ,v27.4s | |||
| sqrdmulh v3.4s, v3.4s ,v27.4s | |||
| sqrdmulh v4.4s, v4.4s ,v27.4s | |||
| sqrdmulh v5.4s, v5.4s ,v27.4s | |||
| sqrdmulh v6.4s, v6.4s ,v27.4s | |||
| sqrdmulh v7.4s, v7.4s ,v27.4s | |||
| sqrshl v0.4s, v0.4s ,v28.4s | |||
| sqrshl v1.4s, v1.4s ,v28.4s | |||
| sqrshl v2.4s, v2.4s ,v28.4s | |||
| sqrshl v3.4s, v3.4s ,v28.4s | |||
| sqrshl v4.4s, v4.4s ,v28.4s | |||
| sqrshl v5.4s, v5.4s ,v28.4s | |||
| sqrshl v6.4s, v6.4s ,v28.4s | |||
| sqrshl v7.4s, v7.4s ,v28.4s | |||
| add v0.4s, v0.4s ,v29.4s | |||
| add v1.4s, v1.4s ,v29.4s | |||
| add v2.4s, v2.4s ,v29.4s | |||
| add v3.4s, v3.4s ,v29.4s | |||
| add v4.4s, v4.4s ,v29.4s | |||
| add v5.4s, v5.4s ,v29.4s | |||
| add v6.4s, v6.4s ,v29.4s | |||
| add v7.4s, v7.4s ,v29.4s | |||
| smax v0.4s, v0.4s ,v30.4s | |||
| smax v1.4s, v1.4s ,v30.4s | |||
| smax v2.4s, v2.4s ,v30.4s | |||
| smax v3.4s, v3.4s ,v30.4s | |||
| smax v4.4s, v4.4s ,v30.4s | |||
| smax v5.4s, v5.4s ,v30.4s | |||
| smax v6.4s, v6.4s ,v30.4s | |||
| smax v7.4s, v7.4s ,v30.4s | |||
| smin v0.4s, v0.4s ,v31.4s | |||
| smin v1.4s, v1.4s ,v31.4s | |||
| smin v2.4s, v2.4s ,v31.4s | |||
| smin v3.4s, v3.4s ,v31.4s | |||
| smin v4.4s, v4.4s ,v31.4s | |||
| smin v5.4s, v5.4s ,v31.4s | |||
| smin v6.4s, v6.4s ,v31.4s | |||
| smin v7.4s, v7.4s ,v31.4s | |||
| sqshl v0.4s, v0.4s, v26.4s | |||
| sqshl v1.4s, v1.4s, v26.4s | |||
| sqshl v2.4s, v2.4s, v26.4s | |||
| sqshl v3.4s, v3.4s, v26.4s | |||
| sqshl v4.4s, v4.4s, v26.4s | |||
| sqshl v5.4s, v5.4s, v26.4s | |||
| sqshl v6.4s, v6.4s, v26.4s | |||
| sqshl v7.4s, v7.4s, v26.4s | |||
| sqrdmulh v0.4s, v0.4s, v27.4s | |||
| sqrdmulh v1.4s, v1.4s, v27.4s | |||
| sqrdmulh v2.4s, v2.4s, v27.4s | |||
| sqrdmulh v3.4s, v3.4s, v27.4s | |||
| sqrdmulh v4.4s, v4.4s, v27.4s | |||
| sqrdmulh v5.4s, v5.4s, v27.4s | |||
| sqrdmulh v6.4s, v6.4s, v27.4s | |||
| sqrdmulh v7.4s, v7.4s, v27.4s | |||
| sqrshl v0.4s, v0.4s, v28.4s | |||
| sqrshl v1.4s, v1.4s, v28.4s | |||
| sqrshl v2.4s, v2.4s, v28.4s | |||
| sqrshl v3.4s, v3.4s, v28.4s | |||
| sqrshl v4.4s, v4.4s, v28.4s | |||
| sqrshl v5.4s, v5.4s, v28.4s | |||
| sqrshl v6.4s, v6.4s, v28.4s | |||
| sqrshl v7.4s, v7.4s, v28.4s | |||
| add v0.4s, v0.4s, v29.4s | |||
| add v1.4s, v1.4s, v29.4s | |||
| add v2.4s, v2.4s, v29.4s | |||
| add v3.4s, v3.4s, v29.4s | |||
| add v4.4s, v4.4s, v29.4s | |||
| add v5.4s, v5.4s, v29.4s | |||
| add v6.4s, v6.4s, v29.4s | |||
| add v7.4s, v7.4s, v29.4s | |||
| smax v0.4s, v0.4s, v30.4s | |||
| smax v1.4s, v1.4s, v30.4s | |||
| smax v2.4s, v2.4s, v30.4s | |||
| smax v3.4s, v3.4s, v30.4s | |||
| smax v4.4s, v4.4s, v30.4s | |||
| smax v5.4s, v5.4s, v30.4s | |||
| smax v6.4s, v6.4s, v30.4s | |||
| smax v7.4s, v7.4s, v30.4s | |||
| smin v0.4s, v0.4s, v31.4s | |||
| smin v1.4s, v1.4s, v31.4s | |||
| smin v2.4s, v2.4s, v31.4s | |||
| smin v3.4s, v3.4s, v31.4s | |||
| smin v4.4s, v4.4s, v31.4s | |||
| smin v5.4s, v5.4s, v31.4s | |||
| smin v6.4s, v6.4s, v31.4s | |||
| smin v7.4s, v7.4s, v31.4s | |||
| sqxtn v0.4h, v0.4s | |||
| sqxtn v1.4h, v1.4s | |||
| @@ -524,12 +524,12 @@ ConvDwInt8Center: | |||
| subs x20, x20, #1 | |||
| bne LoopKh | |||
| sqshl v0.4s, v0.4s ,v26.4s | |||
| sqrdmulh v0.4s, v0.4s ,v27.4s | |||
| sqrshl v0.4s, v0.4s ,v28.4s | |||
| add v0.4s, v0.4s ,v29.4s | |||
| smax v0.4s, v0.4s ,v30.4s | |||
| smin v0.4s, v0.4s ,v31.4s | |||
| sqshl v0.4s, v0.4s, v26.4s | |||
| sqrdmulh v0.4s, v0.4s, v27.4s | |||
| sqrshl v0.4s, v0.4s, v28.4s | |||
| add v0.4s, v0.4s, v29.4s | |||
| smax v0.4s, v0.4s, v30.4s | |||
| smin v0.4s, v0.4s, v31.4s | |||
| sqxtn v0.4h, v0.4s | |||
| sqxtn v0.8b, v0.8h | |||
| @@ -268,40 +268,40 @@ IndirectGemmStart: | |||
| Relu6: | |||
| movi v1.4s, #6 | |||
| scvtf v1.4s, v1.4s | |||
| fmin v16.4s, v16.4s ,v1.4s | |||
| fmin v17.4s, v17.4s ,v1.4s | |||
| fmin v18.4s, v18.4s ,v1.4s | |||
| fmin v19.4s, v19.4s ,v1.4s | |||
| fmin v20.4s, v20.4s ,v1.4s | |||
| fmin v21.4s, v21.4s ,v1.4s | |||
| fmin v22.4s, v22.4s ,v1.4s | |||
| fmin v23.4s, v23.4s ,v1.4s | |||
| fmin v24.4s, v24.4s ,v1.4s | |||
| fmin v25.4s, v25.4s ,v1.4s | |||
| fmin v26.4s, v26.4s ,v1.4s | |||
| fmin v27.4s, v27.4s ,v1.4s | |||
| fmin v28.4s, v28.4s ,v1.4s | |||
| fmin v29.4s, v29.4s ,v1.4s | |||
| fmin v30.4s, v30.4s ,v1.4s | |||
| fmin v31.4s, v31.4s ,v1.4s | |||
| fmin v16.4s, v16.4s, v1.4s | |||
| fmin v17.4s, v17.4s, v1.4s | |||
| fmin v18.4s, v18.4s, v1.4s | |||
| fmin v19.4s, v19.4s, v1.4s | |||
| fmin v20.4s, v20.4s, v1.4s | |||
| fmin v21.4s, v21.4s, v1.4s | |||
| fmin v22.4s, v22.4s, v1.4s | |||
| fmin v23.4s, v23.4s, v1.4s | |||
| fmin v24.4s, v24.4s, v1.4s | |||
| fmin v25.4s, v25.4s, v1.4s | |||
| fmin v26.4s, v26.4s, v1.4s | |||
| fmin v27.4s, v27.4s, v1.4s | |||
| fmin v28.4s, v28.4s, v1.4s | |||
| fmin v29.4s, v29.4s, v1.4s | |||
| fmin v30.4s, v30.4s, v1.4s | |||
| fmin v31.4s, v31.4s, v1.4s | |||
| Relu: | |||
| dup v0.4s, wzr | |||
| fmax v16.4s, v16.4s ,v0.4s | |||
| fmax v17.4s, v17.4s ,v0.4s | |||
| fmax v18.4s, v18.4s ,v0.4s | |||
| fmax v19.4s, v19.4s ,v0.4s | |||
| fmax v20.4s, v20.4s ,v0.4s | |||
| fmax v21.4s, v21.4s ,v0.4s | |||
| fmax v22.4s, v22.4s ,v0.4s | |||
| fmax v23.4s, v23.4s ,v0.4s | |||
| fmax v24.4s, v24.4s ,v0.4s | |||
| fmax v25.4s, v25.4s ,v0.4s | |||
| fmax v26.4s, v26.4s ,v0.4s | |||
| fmax v27.4s, v27.4s ,v0.4s | |||
| fmax v28.4s, v28.4s ,v0.4s | |||
| fmax v29.4s, v29.4s ,v0.4s | |||
| fmax v30.4s, v30.4s ,v0.4s | |||
| fmax v31.4s, v31.4s ,v0.4s | |||
| fmax v16.4s, v16.4s, v0.4s | |||
| fmax v17.4s, v17.4s, v0.4s | |||
| fmax v18.4s, v18.4s, v0.4s | |||
| fmax v19.4s, v19.4s, v0.4s | |||
| fmax v20.4s, v20.4s, v0.4s | |||
| fmax v21.4s, v21.4s, v0.4s | |||
| fmax v22.4s, v22.4s, v0.4s | |||
| fmax v23.4s, v23.4s, v0.4s | |||
| fmax v24.4s, v24.4s, v0.4s | |||
| fmax v25.4s, v25.4s, v0.4s | |||
| fmax v26.4s, v26.4s, v0.4s | |||
| fmax v27.4s, v27.4s, v0.4s | |||
| fmax v28.4s, v28.4s, v0.4s | |||
| fmax v29.4s, v29.4s, v0.4s | |||
| fmax v30.4s, v30.4s, v0.4s | |||
| fmax v31.4s, v31.4s, v0.4s | |||
| WriteStart: | |||
| cbnz x9, WriteC4 | |||
| @@ -595,24 +595,24 @@ IndirectGemmStart: | |||
| Relu6Half: | |||
| movi v1.4s, #6 | |||
| scvtf v1.4s, v1.4s | |||
| fmin v16.4s, v16.4s ,v1.4s | |||
| fmin v18.4s, v18.4s ,v1.4s | |||
| fmin v20.4s, v20.4s ,v1.4s | |||
| fmin v22.4s, v22.4s ,v1.4s | |||
| fmin v24.4s, v24.4s ,v1.4s | |||
| fmin v26.4s, v26.4s ,v1.4s | |||
| fmin v28.4s, v28.4s ,v1.4s | |||
| fmin v30.4s, v30.4s ,v1.4s | |||
| fmin v16.4s, v16.4s, v1.4s | |||
| fmin v18.4s, v18.4s, v1.4s | |||
| fmin v20.4s, v20.4s, v1.4s | |||
| fmin v22.4s, v22.4s, v1.4s | |||
| fmin v24.4s, v24.4s, v1.4s | |||
| fmin v26.4s, v26.4s, v1.4s | |||
| fmin v28.4s, v28.4s, v1.4s | |||
| fmin v30.4s, v30.4s, v1.4s | |||
| ReluHalf: | |||
| dup v0.4s, wzr | |||
| fmax v16.4s, v16.4s ,v0.4s | |||
| fmax v18.4s, v18.4s ,v0.4s | |||
| fmax v20.4s, v20.4s ,v0.4s | |||
| fmax v22.4s, v22.4s ,v0.4s | |||
| fmax v24.4s, v24.4s ,v0.4s | |||
| fmax v26.4s, v26.4s ,v0.4s | |||
| fmax v28.4s, v28.4s ,v0.4s | |||
| fmax v30.4s, v30.4s ,v0.4s | |||
| fmax v16.4s, v16.4s, v0.4s | |||
| fmax v18.4s, v18.4s, v0.4s | |||
| fmax v20.4s, v20.4s, v0.4s | |||
| fmax v22.4s, v22.4s, v0.4s | |||
| fmax v24.4s, v24.4s, v0.4s | |||
| fmax v26.4s, v26.4s, v0.4s | |||
| fmax v28.4s, v28.4s, v0.4s | |||
| fmax v30.4s, v30.4s, v0.4s | |||
| WriteStartHalf: | |||
| cbnz x9, Write4 | |||
| @@ -87,14 +87,15 @@ IndirectGemmInt8_4x4: | |||
| ld1 {v2.16b, v3.16b}, [x12], #32 | |||
| smull v10.8h, v0.8b, v6.8b | |||
| smull v11.8h, v0.8b, v7.8b | |||
| saddlp v16.4s, v8.8h | |||
| smlal2 v10.8h, v0.16b, v6.16b | |||
| smlal2 v11.8h, v0.16b, v7.16b | |||
| saddlp v16.4s, v8.8h | |||
| saddlp v17.4s, v9.8h | |||
| smull v14.8h, v1.8b, v6.8b | |||
| smull v15.8h, v1.8b, v7.8b | |||
| saddlp v18.4s, v10.8h | |||
| smlal2 v14.8h, v1.16b, v6.16b | |||
| smlal2 v15.8h, v1.16b, v7.16b | |||
| saddlp v17.4s, v9.8h | |||
| subs x13, x5, #1 | |||
| beq LoopIcEnd | |||
| @@ -102,55 +103,55 @@ IndirectGemmInt8_4x4: | |||
| LoopIc: | |||
| // load input for output 1-8 | |||
| ld1 {v0.16b, v1.16b}, [x12], #32 | |||
| sadalp v18.4s, v10.8h | |||
| sadalp v19.4s, v11.8h | |||
| smull v8.8h, v2.8b, v4.8b | |||
| smull v9.8h, v2.8b, v5.8b | |||
| sadalp v19.4s, v11.8h | |||
| sadalp v20.4s, v12.8h | |||
| smlal2 v8.8h, v2.16b, v4.16b | |||
| smlal2 v9.8h, v2.16b, v5.16b | |||
| sadalp v20.4s, v12.8h | |||
| sadalp v21.4s, v13.8h | |||
| smull v10.8h, v2.8b, v6.8b | |||
| smull v11.8h, v2.8b, v7.8b | |||
| sadalp v21.4s, v13.8h | |||
| sadalp v22.4s, v14.8h | |||
| smlal2 v10.8h, v2.16b, v6.16b | |||
| smlal2 v11.8h, v2.16b, v7.16b | |||
| sadalp v22.4s, v14.8h | |||
| sadalp v23.4s, v15.8h | |||
| smull v12.8h, v3.8b, v4.8b | |||
| smull v13.8h, v3.8b, v5.8b | |||
| sadalp v23.4s, v15.8h | |||
| sadalp v24.4s, v8.8h | |||
| smlal2 v12.8h, v3.16b, v4.16b | |||
| smlal2 v13.8h, v3.16b, v5.16b | |||
| sadalp v24.4s, v8.8h | |||
| ld1 {v4.16b, v5.16b}, [x2], #32 | |||
| sadalp v25.4s, v9.8h | |||
| smull v14.8h, v3.8b, v6.8b | |||
| smull v15.8h, v3.8b, v7.8b | |||
| sadalp v25.4s, v9.8h | |||
| sadalp v26.4s, v10.8h | |||
| smlal2 v14.8h, v3.16b, v6.16b | |||
| smlal2 v15.8h, v3.16b, v7.16b | |||
| sadalp v26.4s, v10.8h | |||
| ld1 {v6.16b, v7.16b}, [x2], #32 | |||
| sadalp v27.4s, v11.8h | |||
| smull v8.8h, v0.8b, v4.8b | |||
| smull v9.8h, v0.8b, v5.8b | |||
| sadalp v27.4s, v11.8h | |||
| sadalp v28.4s, v12.8h | |||
| smlal2 v8.8h, v0.16b, v4.16b | |||
| smlal2 v9.8h, v0.16b, v5.16b | |||
| sadalp v28.4s, v12.8h | |||
| ld1 {v2.16b, v3.16b}, [x12], #32 | |||
| sadalp v29.4s, v13.8h | |||
| smull v12.8h, v1.8b, v4.8b | |||
| smull v13.8h, v1.8b, v5.8b | |||
| sadalp v29.4s, v13.8h | |||
| sadalp v30.4s, v14.8h | |||
| smlal2 v12.8h, v1.16b, v4.16b | |||
| smlal2 v13.8h, v1.16b, v5.16b | |||
| sadalp v30.4s, v14.8h | |||
| sadalp v31.4s, v15.8h | |||
| smull v10.8h, v0.8b, v6.8b | |||
| smull v11.8h, v0.8b, v7.8b | |||
| sadalp v31.4s, v15.8h | |||
| sadalp v16.4s, v8.8h | |||
| smlal2 v10.8h, v0.16b, v6.16b | |||
| smlal2 v11.8h, v0.16b, v7.16b | |||
| sadalp v16.4s, v8.8h | |||
| sadalp v17.4s, v9.8h | |||
| smull v14.8h, v1.8b, v6.8b | |||
| smull v15.8h, v1.8b, v7.8b | |||
| sadalp v17.4s, v9.8h | |||
| saddlp v18.4s, v10.8h | |||
| smlal2 v14.8h, v1.16b, v6.16b | |||
| smlal2 v15.8h, v1.16b, v7.16b | |||
| @@ -158,33 +159,32 @@ IndirectGemmInt8_4x4: | |||
| bne LoopIc | |||
| LoopIcEnd: | |||
| sadalp v18.4s, v10.8h | |||
| sadalp v19.4s, v11.8h | |||
| smull v8.8h, v2.8b, v4.8b | |||
| smull v9.8h, v2.8b, v5.8b | |||
| sadalp v19.4s, v11.8h | |||
| sadalp v20.4s, v12.8h | |||
| smlal2 v8.8h, v2.16b, v4.16b | |||
| smlal2 v9.8h, v2.16b, v5.16b | |||
| sadalp v20.4s, v12.8h | |||
| sadalp v21.4s, v13.8h | |||
| smull v10.8h, v2.8b, v6.8b | |||
| smull v11.8h, v2.8b, v7.8b | |||
| sadalp v21.4s, v13.8h | |||
| sadalp v22.4s, v14.8h | |||
| smlal2 v10.8h, v2.16b, v6.16b | |||
| smlal2 v11.8h, v2.16b, v7.16b | |||
| sadalp v22.4s, v14.8h | |||
| sadalp v23.4s, v15.8h | |||
| smull v12.8h, v3.8b, v4.8b | |||
| smull v13.8h, v3.8b, v5.8b | |||
| sadalp v23.4s, v15.8h | |||
| sadalp v24.4s, v8.8h | |||
| smlal2 v12.8h, v3.16b, v4.16b | |||
| smlal2 v13.8h, v3.16b, v5.16b | |||
| sadalp v24.4s, v8.8h | |||
| sadalp v25.4s, v9.8h | |||
| smull v14.8h, v3.8b, v6.8b | |||
| smull v15.8h, v3.8b, v7.8b | |||
| sadalp v25.4s, v9.8h | |||
| sadalp v26.4s, v10.8h | |||
| smlal2 v14.8h, v3.16b, v6.16b | |||
| smlal2 v15.8h, v3.16b, v7.16b | |||
| sadalp v26.4s, v10.8h | |||
| sadalp v27.4s, v11.8h | |||
| sadalp v28.4s ,v12.8h | |||
| sadalp v28.4s, v12.8h | |||
| sadalp v29.4s, v13.8h | |||
| sadalp v30.4s, v14.8h | |||
| sadalp v31.4s, v15.8h | |||
| @@ -204,6 +204,7 @@ IndirectGemmInt8_4x4: | |||
| addp v26.4s, v26.4s, v27.4s | |||
| addp v28.4s, v28.4s, v29.4s | |||
| addp v30.4s, v30.4s, v31.4s | |||
| dup v12.4s, wzr | |||
| cbz x3, NoReadBias | |||
| ld1 {v12.4s}, [x3] | |||
| NoReadBias: | |||
| @@ -221,40 +222,40 @@ IndirectGemmInt8_4x4: | |||
| add v28.4s, v28.4s, v12.4s | |||
| dup v2.4s, w18 | |||
| sqshl v16.4s, v16.4s ,v2.4s | |||
| sqshl v20.4s, v20.4s ,v2.4s | |||
| sqshl v24.4s, v24.4s ,v2.4s | |||
| sqshl v28.4s, v28.4s ,v2.4s | |||
| sqshl v16.4s, v16.4s, v2.4s | |||
| sqshl v20.4s, v20.4s, v2.4s | |||
| sqshl v24.4s, v24.4s, v2.4s | |||
| sqshl v28.4s, v28.4s, v2.4s | |||
| dup v3.4s, w17 | |||
| sqrdmulh v16.4s, v16.4s ,v3.4s | |||
| sqrdmulh v20.4s, v20.4s ,v3.4s | |||
| sqrdmulh v24.4s, v24.4s ,v3.4s | |||
| sqrdmulh v28.4s, v28.4s ,v3.4s | |||
| sqrdmulh v16.4s, v16.4s, v3.4s | |||
| sqrdmulh v20.4s, v20.4s, v3.4s | |||
| sqrdmulh v24.4s, v24.4s, v3.4s | |||
| sqrdmulh v28.4s, v28.4s, v3.4s | |||
| dup v4.4s, w19 | |||
| sqrshl v16.4s, v16.4s ,v4.4s | |||
| sqrshl v20.4s, v20.4s ,v4.4s | |||
| sqrshl v24.4s, v24.4s ,v4.4s | |||
| sqrshl v28.4s, v28.4s ,v4.4s | |||
| sqrshl v16.4s, v16.4s, v4.4s | |||
| sqrshl v20.4s, v20.4s, v4.4s | |||
| sqrshl v24.4s, v24.4s, v4.4s | |||
| sqrshl v28.4s, v28.4s, v4.4s | |||
| dup v5.4s, w16 | |||
| add v16.4s, v16.4s ,v5.4s | |||
| add v20.4s, v20.4s ,v5.4s | |||
| add v24.4s, v24.4s ,v5.4s | |||
| add v28.4s, v28.4s ,v5.4s | |||
| add v16.4s, v16.4s, v5.4s | |||
| add v20.4s, v20.4s, v5.4s | |||
| add v24.4s, v24.4s, v5.4s | |||
| add v28.4s, v28.4s, v5.4s | |||
| dup v0.4s, w8 | |||
| smax v16.4s, v16.4s ,v0.4s | |||
| smax v20.4s, v20.4s ,v0.4s | |||
| smax v24.4s, v24.4s ,v0.4s | |||
| smax v28.4s, v28.4s ,v0.4s | |||
| smax v16.4s, v16.4s, v0.4s | |||
| smax v20.4s, v20.4s, v0.4s | |||
| smax v24.4s, v24.4s, v0.4s | |||
| smax v28.4s, v28.4s, v0.4s | |||
| dup v1.4s, w9 | |||
| smin v16.4s, v16.4s ,v1.4s | |||
| smin v20.4s, v20.4s ,v1.4s | |||
| smin v24.4s, v24.4s ,v1.4s | |||
| smin v28.4s, v28.4s ,v1.4s | |||
| smin v16.4s, v16.4s, v1.4s | |||
| smin v20.4s, v20.4s, v1.4s | |||
| smin v24.4s, v24.4s, v1.4s | |||
| smin v28.4s, v28.4s, v1.4s | |||
| sqxtn v13.4h, v16.4s | |||
| sqxtn2 v13.8h, v20.4s | |||
| @@ -277,160 +277,160 @@ IndirectGemmInt8_24x4_dp: | |||
| Quantization: | |||
| dup v2.4s, w18 | |||
| sqshl v8.4s, v8.4s ,v2.4s | |||
| sqshl v9.4s, v9.4s ,v2.4s | |||
| sqshl v10.4s, v10.4s ,v2.4s | |||
| sqshl v11.4s, v11.4s ,v2.4s | |||
| sqshl v12.4s, v12.4s ,v2.4s | |||
| sqshl v13.4s, v13.4s ,v2.4s | |||
| sqshl v14.4s, v14.4s ,v2.4s | |||
| sqshl v15.4s, v15.4s ,v2.4s | |||
| sqshl v16.4s, v16.4s ,v2.4s | |||
| sqshl v17.4s, v17.4s ,v2.4s | |||
| sqshl v18.4s, v18.4s ,v2.4s | |||
| sqshl v19.4s, v19.4s ,v2.4s | |||
| sqshl v20.4s, v20.4s ,v2.4s | |||
| sqshl v21.4s, v21.4s ,v2.4s | |||
| sqshl v22.4s, v22.4s ,v2.4s | |||
| sqshl v23.4s, v23.4s ,v2.4s | |||
| sqshl v24.4s, v24.4s ,v2.4s | |||
| sqshl v25.4s, v25.4s ,v2.4s | |||
| sqshl v26.4s, v26.4s ,v2.4s | |||
| sqshl v27.4s, v27.4s ,v2.4s | |||
| sqshl v28.4s, v28.4s ,v2.4s | |||
| sqshl v29.4s, v29.4s ,v2.4s | |||
| sqshl v30.4s, v30.4s ,v2.4s | |||
| sqshl v31.4s, v31.4s ,v2.4s | |||
| sqshl v8.4s, v8.4s, v2.4s | |||
| sqshl v9.4s, v9.4s, v2.4s | |||
| sqshl v10.4s, v10.4s, v2.4s | |||
| sqshl v11.4s, v11.4s, v2.4s | |||
| sqshl v12.4s, v12.4s, v2.4s | |||
| sqshl v13.4s, v13.4s, v2.4s | |||
| sqshl v14.4s, v14.4s, v2.4s | |||
| sqshl v15.4s, v15.4s, v2.4s | |||
| sqshl v16.4s, v16.4s, v2.4s | |||
| sqshl v17.4s, v17.4s, v2.4s | |||
| sqshl v18.4s, v18.4s, v2.4s | |||
| sqshl v19.4s, v19.4s, v2.4s | |||
| sqshl v20.4s, v20.4s, v2.4s | |||
| sqshl v21.4s, v21.4s, v2.4s | |||
| sqshl v22.4s, v22.4s, v2.4s | |||
| sqshl v23.4s, v23.4s, v2.4s | |||
| sqshl v24.4s, v24.4s, v2.4s | |||
| sqshl v25.4s, v25.4s, v2.4s | |||
| sqshl v26.4s, v26.4s, v2.4s | |||
| sqshl v27.4s, v27.4s, v2.4s | |||
| sqshl v28.4s, v28.4s, v2.4s | |||
| sqshl v29.4s, v29.4s, v2.4s | |||
| sqshl v30.4s, v30.4s, v2.4s | |||
| sqshl v31.4s, v31.4s, v2.4s | |||
| dup v3.4s, w17 | |||
| sqrdmulh v8.4s, v8.4s ,v3.4s | |||
| sqrdmulh v9.4s, v9.4s ,v3.4s | |||
| sqrdmulh v10.4s, v10.4s ,v3.4s | |||
| sqrdmulh v11.4s, v11.4s ,v3.4s | |||
| sqrdmulh v12.4s, v12.4s ,v3.4s | |||
| sqrdmulh v13.4s, v13.4s ,v3.4s | |||
| sqrdmulh v14.4s, v14.4s ,v3.4s | |||
| sqrdmulh v15.4s, v15.4s ,v3.4s | |||
| sqrdmulh v16.4s, v16.4s ,v3.4s | |||
| sqrdmulh v17.4s, v17.4s ,v3.4s | |||
| sqrdmulh v18.4s, v18.4s ,v3.4s | |||
| sqrdmulh v19.4s, v19.4s ,v3.4s | |||
| sqrdmulh v20.4s, v20.4s ,v3.4s | |||
| sqrdmulh v21.4s, v21.4s ,v3.4s | |||
| sqrdmulh v22.4s, v22.4s ,v3.4s | |||
| sqrdmulh v23.4s, v23.4s ,v3.4s | |||
| sqrdmulh v24.4s, v24.4s ,v3.4s | |||
| sqrdmulh v25.4s, v25.4s ,v3.4s | |||
| sqrdmulh v26.4s, v26.4s ,v3.4s | |||
| sqrdmulh v27.4s, v27.4s ,v3.4s | |||
| sqrdmulh v28.4s, v28.4s ,v3.4s | |||
| sqrdmulh v29.4s, v29.4s ,v3.4s | |||
| sqrdmulh v30.4s, v30.4s ,v3.4s | |||
| sqrdmulh v31.4s, v31.4s ,v3.4s | |||
| sqrdmulh v8.4s, v8.4s, v3.4s | |||
| sqrdmulh v9.4s, v9.4s, v3.4s | |||
| sqrdmulh v10.4s, v10.4s, v3.4s | |||
| sqrdmulh v11.4s, v11.4s, v3.4s | |||
| sqrdmulh v12.4s, v12.4s, v3.4s | |||
| sqrdmulh v13.4s, v13.4s, v3.4s | |||
| sqrdmulh v14.4s, v14.4s, v3.4s | |||
| sqrdmulh v15.4s, v15.4s, v3.4s | |||
| sqrdmulh v16.4s, v16.4s, v3.4s | |||
| sqrdmulh v17.4s, v17.4s, v3.4s | |||
| sqrdmulh v18.4s, v18.4s, v3.4s | |||
| sqrdmulh v19.4s, v19.4s, v3.4s | |||
| sqrdmulh v20.4s, v20.4s, v3.4s | |||
| sqrdmulh v21.4s, v21.4s, v3.4s | |||
| sqrdmulh v22.4s, v22.4s, v3.4s | |||
| sqrdmulh v23.4s, v23.4s, v3.4s | |||
| sqrdmulh v24.4s, v24.4s, v3.4s | |||
| sqrdmulh v25.4s, v25.4s, v3.4s | |||
| sqrdmulh v26.4s, v26.4s, v3.4s | |||
| sqrdmulh v27.4s, v27.4s, v3.4s | |||
| sqrdmulh v28.4s, v28.4s, v3.4s | |||
| sqrdmulh v29.4s, v29.4s, v3.4s | |||
| sqrdmulh v30.4s, v30.4s, v3.4s | |||
| sqrdmulh v31.4s, v31.4s, v3.4s | |||
| dup v4.4s, w19 | |||
| sqrshl v8.4s, v8.4s ,v4.4s | |||
| sqrshl v9.4s, v9.4s ,v4.4s | |||
| sqrshl v10.4s, v10.4s ,v4.4s | |||
| sqrshl v11.4s, v11.4s ,v4.4s | |||
| sqrshl v12.4s, v12.4s ,v4.4s | |||
| sqrshl v13.4s, v13.4s ,v4.4s | |||
| sqrshl v14.4s, v14.4s ,v4.4s | |||
| sqrshl v15.4s, v15.4s ,v4.4s | |||
| sqrshl v16.4s, v16.4s ,v4.4s | |||
| sqrshl v17.4s, v17.4s ,v4.4s | |||
| sqrshl v18.4s, v18.4s ,v4.4s | |||
| sqrshl v19.4s, v19.4s ,v4.4s | |||
| sqrshl v20.4s, v20.4s ,v4.4s | |||
| sqrshl v21.4s, v21.4s ,v4.4s | |||
| sqrshl v22.4s, v22.4s ,v4.4s | |||
| sqrshl v23.4s, v23.4s ,v4.4s | |||
| sqrshl v24.4s, v24.4s ,v4.4s | |||
| sqrshl v25.4s, v25.4s ,v4.4s | |||
| sqrshl v26.4s, v26.4s ,v4.4s | |||
| sqrshl v27.4s, v27.4s ,v4.4s | |||
| sqrshl v28.4s, v28.4s ,v4.4s | |||
| sqrshl v29.4s, v29.4s ,v4.4s | |||
| sqrshl v30.4s, v30.4s ,v4.4s | |||
| sqrshl v31.4s, v31.4s ,v4.4s | |||
| sqrshl v8.4s, v8.4s, v4.4s | |||
| sqrshl v9.4s, v9.4s, v4.4s | |||
| sqrshl v10.4s, v10.4s, v4.4s | |||
| sqrshl v11.4s, v11.4s, v4.4s | |||
| sqrshl v12.4s, v12.4s, v4.4s | |||
| sqrshl v13.4s, v13.4s, v4.4s | |||
| sqrshl v14.4s, v14.4s, v4.4s | |||
| sqrshl v15.4s, v15.4s, v4.4s | |||
| sqrshl v16.4s, v16.4s, v4.4s | |||
| sqrshl v17.4s, v17.4s, v4.4s | |||
| sqrshl v18.4s, v18.4s, v4.4s | |||
| sqrshl v19.4s, v19.4s, v4.4s | |||
| sqrshl v20.4s, v20.4s, v4.4s | |||
| sqrshl v21.4s, v21.4s, v4.4s | |||
| sqrshl v22.4s, v22.4s, v4.4s | |||
| sqrshl v23.4s, v23.4s, v4.4s | |||
| sqrshl v24.4s, v24.4s, v4.4s | |||
| sqrshl v25.4s, v25.4s, v4.4s | |||
| sqrshl v26.4s, v26.4s, v4.4s | |||
| sqrshl v27.4s, v27.4s, v4.4s | |||
| sqrshl v28.4s, v28.4s, v4.4s | |||
| sqrshl v29.4s, v29.4s, v4.4s | |||
| sqrshl v30.4s, v30.4s, v4.4s | |||
| sqrshl v31.4s, v31.4s, v4.4s | |||
| dup v5.4s, w16 | |||
| add v8.4s, v8.4s ,v5.4s | |||
| add v9.4s, v9.4s ,v5.4s | |||
| add v10.4s, v10.4s ,v5.4s | |||
| add v11.4s, v11.4s ,v5.4s | |||
| add v12.4s, v12.4s ,v5.4s | |||
| add v13.4s, v13.4s ,v5.4s | |||
| add v14.4s, v14.4s ,v5.4s | |||
| add v15.4s, v15.4s ,v5.4s | |||
| add v16.4s, v16.4s ,v5.4s | |||
| add v17.4s, v17.4s ,v5.4s | |||
| add v18.4s, v18.4s ,v5.4s | |||
| add v19.4s, v19.4s ,v5.4s | |||
| add v20.4s, v20.4s ,v5.4s | |||
| add v21.4s, v21.4s ,v5.4s | |||
| add v22.4s, v22.4s ,v5.4s | |||
| add v23.4s, v23.4s ,v5.4s | |||
| add v24.4s, v24.4s ,v5.4s | |||
| add v25.4s, v25.4s ,v5.4s | |||
| add v26.4s, v26.4s ,v5.4s | |||
| add v27.4s, v27.4s ,v5.4s | |||
| add v28.4s, v28.4s ,v5.4s | |||
| add v29.4s, v29.4s ,v5.4s | |||
| add v30.4s, v30.4s ,v5.4s | |||
| add v31.4s, v31.4s ,v5.4s | |||
| add v8.4s, v8.4s, v5.4s | |||
| add v9.4s, v9.4s, v5.4s | |||
| add v10.4s, v10.4s, v5.4s | |||
| add v11.4s, v11.4s, v5.4s | |||
| add v12.4s, v12.4s, v5.4s | |||
| add v13.4s, v13.4s, v5.4s | |||
| add v14.4s, v14.4s, v5.4s | |||
| add v15.4s, v15.4s, v5.4s | |||
| add v16.4s, v16.4s, v5.4s | |||
| add v17.4s, v17.4s, v5.4s | |||
| add v18.4s, v18.4s, v5.4s | |||
| add v19.4s, v19.4s, v5.4s | |||
| add v20.4s, v20.4s, v5.4s | |||
| add v21.4s, v21.4s, v5.4s | |||
| add v22.4s, v22.4s, v5.4s | |||
| add v23.4s, v23.4s, v5.4s | |||
| add v24.4s, v24.4s, v5.4s | |||
| add v25.4s, v25.4s, v5.4s | |||
| add v26.4s, v26.4s, v5.4s | |||
| add v27.4s, v27.4s, v5.4s | |||
| add v28.4s, v28.4s, v5.4s | |||
| add v29.4s, v29.4s, v5.4s | |||
| add v30.4s, v30.4s, v5.4s | |||
| add v31.4s, v31.4s, v5.4s | |||
| dup v0.4s, w8 | |||
| smax v8.4s, v8.4s ,v0.4s | |||
| smax v9.4s, v9.4s ,v0.4s | |||
| smax v10.4s, v10.4s ,v0.4s | |||
| smax v11.4s, v11.4s ,v0.4s | |||
| smax v12.4s, v12.4s ,v0.4s | |||
| smax v13.4s, v13.4s ,v0.4s | |||
| smax v14.4s, v14.4s ,v0.4s | |||
| smax v15.4s, v15.4s ,v0.4s | |||
| smax v16.4s, v16.4s ,v0.4s | |||
| smax v17.4s, v17.4s ,v0.4s | |||
| smax v18.4s, v18.4s ,v0.4s | |||
| smax v19.4s, v19.4s ,v0.4s | |||
| smax v20.4s, v20.4s ,v0.4s | |||
| smax v21.4s, v21.4s ,v0.4s | |||
| smax v22.4s, v22.4s ,v0.4s | |||
| smax v23.4s, v23.4s ,v0.4s | |||
| smax v24.4s, v24.4s ,v0.4s | |||
| smax v25.4s, v25.4s ,v0.4s | |||
| smax v26.4s, v26.4s ,v0.4s | |||
| smax v27.4s, v27.4s ,v0.4s | |||
| smax v28.4s, v28.4s ,v0.4s | |||
| smax v29.4s, v29.4s ,v0.4s | |||
| smax v30.4s, v30.4s ,v0.4s | |||
| smax v31.4s, v31.4s ,v0.4s | |||
| smax v8.4s, v8.4s, v0.4s | |||
| smax v9.4s, v9.4s, v0.4s | |||
| smax v10.4s, v10.4s, v0.4s | |||
| smax v11.4s, v11.4s, v0.4s | |||
| smax v12.4s, v12.4s, v0.4s | |||
| smax v13.4s, v13.4s, v0.4s | |||
| smax v14.4s, v14.4s, v0.4s | |||
| smax v15.4s, v15.4s, v0.4s | |||
| smax v16.4s, v16.4s, v0.4s | |||
| smax v17.4s, v17.4s, v0.4s | |||
| smax v18.4s, v18.4s, v0.4s | |||
| smax v19.4s, v19.4s, v0.4s | |||
| smax v20.4s, v20.4s, v0.4s | |||
| smax v21.4s, v21.4s, v0.4s | |||
| smax v22.4s, v22.4s, v0.4s | |||
| smax v23.4s, v23.4s, v0.4s | |||
| smax v24.4s, v24.4s, v0.4s | |||
| smax v25.4s, v25.4s, v0.4s | |||
| smax v26.4s, v26.4s, v0.4s | |||
| smax v27.4s, v27.4s, v0.4s | |||
| smax v28.4s, v28.4s, v0.4s | |||
| smax v29.4s, v29.4s, v0.4s | |||
| smax v30.4s, v30.4s, v0.4s | |||
| smax v31.4s, v31.4s, v0.4s | |||
| dup v1.4s, w9 | |||
| smin v8.4s, v8.4s ,v1.4s | |||
| smin v9.4s, v9.4s ,v1.4s | |||
| smin v10.4s, v10.4s ,v1.4s | |||
| smin v11.4s, v11.4s ,v1.4s | |||
| smin v12.4s, v12.4s ,v1.4s | |||
| smin v13.4s, v13.4s ,v1.4s | |||
| smin v14.4s, v14.4s ,v1.4s | |||
| smin v15.4s, v15.4s ,v1.4s | |||
| smin v16.4s, v16.4s ,v1.4s | |||
| smin v17.4s, v17.4s ,v1.4s | |||
| smin v18.4s, v18.4s ,v1.4s | |||
| smin v19.4s, v19.4s ,v1.4s | |||
| smin v20.4s, v20.4s ,v1.4s | |||
| smin v21.4s, v21.4s ,v1.4s | |||
| smin v22.4s, v22.4s ,v1.4s | |||
| smin v23.4s, v23.4s ,v1.4s | |||
| smin v24.4s, v24.4s ,v1.4s | |||
| smin v25.4s, v25.4s ,v1.4s | |||
| smin v26.4s, v26.4s ,v1.4s | |||
| smin v27.4s, v27.4s ,v1.4s | |||
| smin v28.4s, v28.4s ,v1.4s | |||
| smin v29.4s, v29.4s ,v1.4s | |||
| smin v30.4s, v30.4s ,v1.4s | |||
| smin v31.4s, v31.4s ,v1.4s | |||
| smin v8.4s, v8.4s, v1.4s | |||
| smin v9.4s, v9.4s, v1.4s | |||
| smin v10.4s, v10.4s, v1.4s | |||
| smin v11.4s, v11.4s, v1.4s | |||
| smin v12.4s, v12.4s, v1.4s | |||
| smin v13.4s, v13.4s, v1.4s | |||
| smin v14.4s, v14.4s, v1.4s | |||
| smin v15.4s, v15.4s, v1.4s | |||
| smin v16.4s, v16.4s, v1.4s | |||
| smin v17.4s, v17.4s, v1.4s | |||
| smin v18.4s, v18.4s, v1.4s | |||
| smin v19.4s, v19.4s, v1.4s | |||
| smin v20.4s, v20.4s, v1.4s | |||
| smin v21.4s, v21.4s, v1.4s | |||
| smin v22.4s, v22.4s, v1.4s | |||
| smin v23.4s, v23.4s, v1.4s | |||
| smin v24.4s, v24.4s, v1.4s | |||
| smin v25.4s, v25.4s, v1.4s | |||
| smin v26.4s, v26.4s, v1.4s | |||
| smin v27.4s, v27.4s, v1.4s | |||
| smin v28.4s, v28.4s, v1.4s | |||
| smin v29.4s, v29.4s, v1.4s | |||
| smin v30.4s, v30.4s, v1.4s | |||
| smin v31.4s, v31.4s, v1.4s | |||
| sqxtn v6.4h, v8.4s | |||
| sqxtn2 v6.8h, v9.4s | |||
| @@ -29,11 +29,13 @@ void IndirectGemmInt8(int8_t *dst, int32_t *tmp_dst, const int8_t *src, const in | |||
| int32_t act_min = conv_param->conv_quant_arg_.out_act_min_[0]; | |||
| int32_t act_max = conv_param->conv_quant_arg_.out_act_max_[0]; | |||
| #ifdef __aarch64__ | |||
| IndirectGemmInt8_4x4(dst, src, weight, bias, kernel_plane, ic4, output_channel, output_channel * sizeof(int8_t), | |||
| input_sum, act_min, act_max, out_zp, out_multiplier, shift_before, shift_after); | |||
| IndirectGemmInt8_4x4(dst, src, weight, bias, UP_DIV(kernel_plane, C4NUM), ic4, output_channel, | |||
| output_channel * sizeof(int8_t), input_sum, act_min, act_max, out_zp, out_multiplier, | |||
| shift_before, shift_after); | |||
| #elif defined(ENABLE_ARM32) | |||
| IndirectGemmInt8_2x4(dst, src, weight, bias, kernel_plane, ic4, output_channel, output_channel * sizeof(int8_t), | |||
| input_sum, act_min, act_max, out_zp, out_multiplier, shift_before, shift_after); | |||
| IndirectGemmInt8_2x4(dst, src, weight, bias, UP_DIV(kernel_plane, C4NUM), ic4, output_channel, | |||
| output_channel * sizeof(int8_t), input_sum, act_min, act_max, out_zp, out_multiplier, | |||
| shift_before, shift_after); | |||
| #else | |||
| int tile_num = conv_param->tile_num_; | |||
| int plane_c4 = UP_DIV(kernel_plane, C4NUM); | |||