diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwInt8Center.S b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwInt8Center.S index 0381b6bdb0..50378de953 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwInt8Center.S +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwInt8Center.S @@ -127,102 +127,102 @@ ConvDwInt8Center: subs x20, x20, #1 bne LoopKh16 - sqshl v0.4s, v0.4s ,v26.4s - sqshl v1.4s, v1.4s ,v26.4s - sqshl v2.4s, v2.4s ,v26.4s - sqshl v3.4s, v3.4s ,v26.4s - sqshl v4.4s, v4.4s ,v26.4s - sqshl v5.4s, v5.4s ,v26.4s - sqshl v6.4s, v6.4s ,v26.4s - sqshl v7.4s, v7.4s ,v26.4s - sqshl v8.4s, v8.4s ,v26.4s - sqshl v9.4s, v9.4s ,v26.4s - sqshl v10.4s, v10.4s ,v26.4s - sqshl v11.4s, v11.4s ,v26.4s - sqshl v12.4s, v12.4s ,v26.4s - sqshl v13.4s, v13.4s ,v26.4s - sqshl v14.4s, v14.4s ,v26.4s - sqshl v15.4s, v15.4s ,v26.4s - sqrdmulh v0.4s, v0.4s ,v27.4s - sqrdmulh v1.4s, v1.4s ,v27.4s - sqrdmulh v2.4s, v2.4s ,v27.4s - sqrdmulh v3.4s, v3.4s ,v27.4s - sqrdmulh v4.4s, v4.4s ,v27.4s - sqrdmulh v5.4s, v5.4s ,v27.4s - sqrdmulh v6.4s, v6.4s ,v27.4s - sqrdmulh v7.4s, v7.4s ,v27.4s - sqrdmulh v8.4s, v8.4s ,v27.4s - sqrdmulh v9.4s, v9.4s ,v27.4s - sqrdmulh v10.4s, v10.4s ,v27.4s - sqrdmulh v11.4s, v11.4s ,v27.4s - sqrdmulh v12.4s, v12.4s ,v27.4s - sqrdmulh v13.4s, v13.4s ,v27.4s - sqrdmulh v14.4s, v14.4s ,v27.4s - sqrdmulh v15.4s, v15.4s ,v27.4s - sqrshl v0.4s, v0.4s ,v28.4s - sqrshl v1.4s, v1.4s ,v28.4s - sqrshl v2.4s, v2.4s ,v28.4s - sqrshl v3.4s, v3.4s ,v28.4s - sqrshl v4.4s, v4.4s ,v28.4s - sqrshl v5.4s, v5.4s ,v28.4s - sqrshl v6.4s, v6.4s ,v28.4s - sqrshl v7.4s, v7.4s ,v28.4s - sqrshl v8.4s, v8.4s ,v28.4s - sqrshl v9.4s, v9.4s ,v28.4s - sqrshl v10.4s, v10.4s ,v28.4s - sqrshl v11.4s, v11.4s ,v28.4s - sqrshl v12.4s, v12.4s ,v28.4s - sqrshl v13.4s, v13.4s ,v28.4s - sqrshl v14.4s, v14.4s ,v28.4s - sqrshl v15.4s, v15.4s ,v28.4s - add v0.4s, v0.4s ,v29.4s - add v1.4s, v1.4s ,v29.4s - add v2.4s, v2.4s ,v29.4s - add v3.4s, v3.4s ,v29.4s - add v4.4s, v4.4s ,v29.4s - add v5.4s, v5.4s ,v29.4s - add v6.4s, v6.4s ,v29.4s - add v7.4s, v7.4s ,v29.4s - add v8.4s, v8.4s ,v29.4s - add v9.4s, v9.4s ,v29.4s - add v10.4s, v10.4s ,v29.4s - add v11.4s, v11.4s ,v29.4s - add v12.4s, v12.4s ,v29.4s - add v13.4s, v13.4s ,v29.4s - add v14.4s, v14.4s ,v29.4s - add v15.4s, v15.4s ,v29.4s - smax v0.4s, v0.4s ,v30.4s - smax v1.4s, v1.4s ,v30.4s - smax v2.4s, v2.4s ,v30.4s - smax v3.4s, v3.4s ,v30.4s - smax v4.4s, v4.4s ,v30.4s - smax v5.4s, v5.4s ,v30.4s - smax v6.4s, v6.4s ,v30.4s - smax v7.4s, v7.4s ,v30.4s - smax v8.4s, v8.4s ,v30.4s - smax v9.4s, v9.4s ,v30.4s - smax v10.4s, v10.4s ,v30.4s - smax v11.4s, v11.4s ,v30.4s - smax v12.4s, v12.4s ,v30.4s - smax v13.4s, v13.4s ,v30.4s - smax v14.4s, v14.4s ,v30.4s - smax v15.4s, v15.4s ,v30.4s - smin v0.4s, v0.4s ,v31.4s - smin v1.4s, v1.4s ,v31.4s - smin v2.4s, v2.4s ,v31.4s - smin v3.4s, v3.4s ,v31.4s - smin v4.4s, v4.4s ,v31.4s - smin v5.4s, v5.4s ,v31.4s - smin v6.4s, v6.4s ,v31.4s - smin v7.4s, v7.4s ,v31.4s - smin v8.4s, v8.4s ,v31.4s - smin v9.4s, v9.4s ,v31.4s - smin v10.4s, v10.4s ,v31.4s - smin v11.4s, v11.4s ,v31.4s - smin v12.4s, v12.4s ,v31.4s - smin v13.4s, v13.4s ,v31.4s - smin v14.4s, v14.4s ,v31.4s - smin v15.4s, v15.4s ,v31.4s + sqshl v0.4s, v0.4s, v26.4s + sqshl v1.4s, v1.4s, v26.4s + sqshl v2.4s, v2.4s, v26.4s + sqshl v3.4s, v3.4s, v26.4s + sqshl v4.4s, v4.4s, v26.4s + sqshl v5.4s, v5.4s, v26.4s + sqshl v6.4s, v6.4s, v26.4s + sqshl v7.4s, v7.4s, v26.4s + sqshl v8.4s, v8.4s, v26.4s + sqshl v9.4s, v9.4s, v26.4s + sqshl v10.4s, v10.4s, v26.4s + sqshl v11.4s, v11.4s, v26.4s + sqshl v12.4s, v12.4s, v26.4s + sqshl v13.4s, v13.4s, v26.4s + sqshl v14.4s, v14.4s, v26.4s + sqshl v15.4s, v15.4s, v26.4s + sqrdmulh v0.4s, v0.4s, v27.4s + sqrdmulh v1.4s, v1.4s, v27.4s + sqrdmulh v2.4s, v2.4s, v27.4s + sqrdmulh v3.4s, v3.4s, v27.4s + sqrdmulh v4.4s, v4.4s, v27.4s + sqrdmulh v5.4s, v5.4s, v27.4s + sqrdmulh v6.4s, v6.4s, v27.4s + sqrdmulh v7.4s, v7.4s, v27.4s + sqrdmulh v8.4s, v8.4s, v27.4s + sqrdmulh v9.4s, v9.4s, v27.4s + sqrdmulh v10.4s, v10.4s, v27.4s + sqrdmulh v11.4s, v11.4s, v27.4s + sqrdmulh v12.4s, v12.4s, v27.4s + sqrdmulh v13.4s, v13.4s, v27.4s + sqrdmulh v14.4s, v14.4s, v27.4s + sqrdmulh v15.4s, v15.4s, v27.4s + sqrshl v0.4s, v0.4s, v28.4s + sqrshl v1.4s, v1.4s, v28.4s + sqrshl v2.4s, v2.4s, v28.4s + sqrshl v3.4s, v3.4s, v28.4s + sqrshl v4.4s, v4.4s, v28.4s + sqrshl v5.4s, v5.4s, v28.4s + sqrshl v6.4s, v6.4s, v28.4s + sqrshl v7.4s, v7.4s, v28.4s + sqrshl v8.4s, v8.4s, v28.4s + sqrshl v9.4s, v9.4s, v28.4s + sqrshl v10.4s, v10.4s, v28.4s + sqrshl v11.4s, v11.4s, v28.4s + sqrshl v12.4s, v12.4s, v28.4s + sqrshl v13.4s, v13.4s, v28.4s + sqrshl v14.4s, v14.4s, v28.4s + sqrshl v15.4s, v15.4s, v28.4s + add v0.4s, v0.4s, v29.4s + add v1.4s, v1.4s, v29.4s + add v2.4s, v2.4s, v29.4s + add v3.4s, v3.4s, v29.4s + add v4.4s, v4.4s, v29.4s + add v5.4s, v5.4s, v29.4s + add v6.4s, v6.4s, v29.4s + add v7.4s, v7.4s, v29.4s + add v8.4s, v8.4s, v29.4s + add v9.4s, v9.4s, v29.4s + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + add v12.4s, v12.4s, v29.4s + add v13.4s, v13.4s, v29.4s + add v14.4s, v14.4s, v29.4s + add v15.4s, v15.4s, v29.4s + smax v0.4s, v0.4s, v30.4s + smax v1.4s, v1.4s, v30.4s + smax v2.4s, v2.4s, v30.4s + smax v3.4s, v3.4s, v30.4s + smax v4.4s, v4.4s, v30.4s + smax v5.4s, v5.4s, v30.4s + smax v6.4s, v6.4s, v30.4s + smax v7.4s, v7.4s, v30.4s + smax v8.4s, v8.4s, v30.4s + smax v9.4s, v9.4s, v30.4s + smax v10.4s, v10.4s, v30.4s + smax v11.4s, v11.4s, v30.4s + smax v12.4s, v12.4s, v30.4s + smax v13.4s, v13.4s, v30.4s + smax v14.4s, v14.4s, v30.4s + smax v15.4s, v15.4s, v30.4s + smin v0.4s, v0.4s, v31.4s + smin v1.4s, v1.4s, v31.4s + smin v2.4s, v2.4s, v31.4s + smin v3.4s, v3.4s, v31.4s + smin v4.4s, v4.4s, v31.4s + smin v5.4s, v5.4s, v31.4s + smin v6.4s, v6.4s, v31.4s + smin v7.4s, v7.4s, v31.4s + smin v8.4s, v8.4s, v31.4s + smin v9.4s, v9.4s, v31.4s + smin v10.4s, v10.4s, v31.4s + smin v11.4s, v11.4s, v31.4s + smin v12.4s, v12.4s, v31.4s + smin v13.4s, v13.4s, v31.4s + smin v14.4s, v14.4s, v31.4s + smin v15.4s, v15.4s, v31.4s sqxtn v0.4h, v0.4s sqxtn v1.4h, v1.4s @@ -391,54 +391,54 @@ ConvDwInt8Center: subs x20, x20, #1 bne LoopKh8 - sqshl v0.4s, v0.4s ,v26.4s - sqshl v1.4s, v1.4s ,v26.4s - sqshl v2.4s, v2.4s ,v26.4s - sqshl v3.4s, v3.4s ,v26.4s - sqshl v4.4s, v4.4s ,v26.4s - sqshl v5.4s, v5.4s ,v26.4s - sqshl v6.4s, v6.4s ,v26.4s - sqshl v7.4s, v7.4s ,v26.4s - sqrdmulh v0.4s, v0.4s ,v27.4s - sqrdmulh v1.4s, v1.4s ,v27.4s - sqrdmulh v2.4s, v2.4s ,v27.4s - sqrdmulh v3.4s, v3.4s ,v27.4s - sqrdmulh v4.4s, v4.4s ,v27.4s - sqrdmulh v5.4s, v5.4s ,v27.4s - sqrdmulh v6.4s, v6.4s ,v27.4s - sqrdmulh v7.4s, v7.4s ,v27.4s - sqrshl v0.4s, v0.4s ,v28.4s - sqrshl v1.4s, v1.4s ,v28.4s - sqrshl v2.4s, v2.4s ,v28.4s - sqrshl v3.4s, v3.4s ,v28.4s - sqrshl v4.4s, v4.4s ,v28.4s - sqrshl v5.4s, v5.4s ,v28.4s - sqrshl v6.4s, v6.4s ,v28.4s - sqrshl v7.4s, v7.4s ,v28.4s - add v0.4s, v0.4s ,v29.4s - add v1.4s, v1.4s ,v29.4s - add v2.4s, v2.4s ,v29.4s - add v3.4s, v3.4s ,v29.4s - add v4.4s, v4.4s ,v29.4s - add v5.4s, v5.4s ,v29.4s - add v6.4s, v6.4s ,v29.4s - add v7.4s, v7.4s ,v29.4s - smax v0.4s, v0.4s ,v30.4s - smax v1.4s, v1.4s ,v30.4s - smax v2.4s, v2.4s ,v30.4s - smax v3.4s, v3.4s ,v30.4s - smax v4.4s, v4.4s ,v30.4s - smax v5.4s, v5.4s ,v30.4s - smax v6.4s, v6.4s ,v30.4s - smax v7.4s, v7.4s ,v30.4s - smin v0.4s, v0.4s ,v31.4s - smin v1.4s, v1.4s ,v31.4s - smin v2.4s, v2.4s ,v31.4s - smin v3.4s, v3.4s ,v31.4s - smin v4.4s, v4.4s ,v31.4s - smin v5.4s, v5.4s ,v31.4s - smin v6.4s, v6.4s ,v31.4s - smin v7.4s, v7.4s ,v31.4s + sqshl v0.4s, v0.4s, v26.4s + sqshl v1.4s, v1.4s, v26.4s + sqshl v2.4s, v2.4s, v26.4s + sqshl v3.4s, v3.4s, v26.4s + sqshl v4.4s, v4.4s, v26.4s + sqshl v5.4s, v5.4s, v26.4s + sqshl v6.4s, v6.4s, v26.4s + sqshl v7.4s, v7.4s, v26.4s + sqrdmulh v0.4s, v0.4s, v27.4s + sqrdmulh v1.4s, v1.4s, v27.4s + sqrdmulh v2.4s, v2.4s, v27.4s + sqrdmulh v3.4s, v3.4s, v27.4s + sqrdmulh v4.4s, v4.4s, v27.4s + sqrdmulh v5.4s, v5.4s, v27.4s + sqrdmulh v6.4s, v6.4s, v27.4s + sqrdmulh v7.4s, v7.4s, v27.4s + sqrshl v0.4s, v0.4s, v28.4s + sqrshl v1.4s, v1.4s, v28.4s + sqrshl v2.4s, v2.4s, v28.4s + sqrshl v3.4s, v3.4s, v28.4s + sqrshl v4.4s, v4.4s, v28.4s + sqrshl v5.4s, v5.4s, v28.4s + sqrshl v6.4s, v6.4s, v28.4s + sqrshl v7.4s, v7.4s, v28.4s + add v0.4s, v0.4s, v29.4s + add v1.4s, v1.4s, v29.4s + add v2.4s, v2.4s, v29.4s + add v3.4s, v3.4s, v29.4s + add v4.4s, v4.4s, v29.4s + add v5.4s, v5.4s, v29.4s + add v6.4s, v6.4s, v29.4s + add v7.4s, v7.4s, v29.4s + smax v0.4s, v0.4s, v30.4s + smax v1.4s, v1.4s, v30.4s + smax v2.4s, v2.4s, v30.4s + smax v3.4s, v3.4s, v30.4s + smax v4.4s, v4.4s, v30.4s + smax v5.4s, v5.4s, v30.4s + smax v6.4s, v6.4s, v30.4s + smax v7.4s, v7.4s, v30.4s + smin v0.4s, v0.4s, v31.4s + smin v1.4s, v1.4s, v31.4s + smin v2.4s, v2.4s, v31.4s + smin v3.4s, v3.4s, v31.4s + smin v4.4s, v4.4s, v31.4s + smin v5.4s, v5.4s, v31.4s + smin v6.4s, v6.4s, v31.4s + smin v7.4s, v7.4s, v31.4s sqxtn v0.4h, v0.4s sqxtn v1.4h, v1.4s @@ -524,12 +524,12 @@ ConvDwInt8Center: subs x20, x20, #1 bne LoopKh - sqshl v0.4s, v0.4s ,v26.4s - sqrdmulh v0.4s, v0.4s ,v27.4s - sqrshl v0.4s, v0.4s ,v28.4s - add v0.4s, v0.4s ,v29.4s - smax v0.4s, v0.4s ,v30.4s - smin v0.4s, v0.4s ,v31.4s + sqshl v0.4s, v0.4s, v26.4s + sqrdmulh v0.4s, v0.4s, v27.4s + sqrshl v0.4s, v0.4s, v28.4s + add v0.4s, v0.4s, v29.4s + smax v0.4s, v0.4s, v30.4s + smin v0.4s, v0.4s, v31.4s sqxtn v0.4h, v0.4s sqxtn v0.8b, v0.8h diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/IndirectGemmFp32_8x8.S b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/IndirectGemmFp32_8x8.S index be649b0e58..6373bb7132 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/IndirectGemmFp32_8x8.S +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/IndirectGemmFp32_8x8.S @@ -268,40 +268,40 @@ IndirectGemmStart: Relu6: movi v1.4s, #6 scvtf v1.4s, v1.4s - fmin v16.4s, v16.4s ,v1.4s - fmin v17.4s, v17.4s ,v1.4s - fmin v18.4s, v18.4s ,v1.4s - fmin v19.4s, v19.4s ,v1.4s - fmin v20.4s, v20.4s ,v1.4s - fmin v21.4s, v21.4s ,v1.4s - fmin v22.4s, v22.4s ,v1.4s - fmin v23.4s, v23.4s ,v1.4s - fmin v24.4s, v24.4s ,v1.4s - fmin v25.4s, v25.4s ,v1.4s - fmin v26.4s, v26.4s ,v1.4s - fmin v27.4s, v27.4s ,v1.4s - fmin v28.4s, v28.4s ,v1.4s - fmin v29.4s, v29.4s ,v1.4s - fmin v30.4s, v30.4s ,v1.4s - fmin v31.4s, v31.4s ,v1.4s + fmin v16.4s, v16.4s, v1.4s + fmin v17.4s, v17.4s, v1.4s + fmin v18.4s, v18.4s, v1.4s + fmin v19.4s, v19.4s, v1.4s + fmin v20.4s, v20.4s, v1.4s + fmin v21.4s, v21.4s, v1.4s + fmin v22.4s, v22.4s, v1.4s + fmin v23.4s, v23.4s, v1.4s + fmin v24.4s, v24.4s, v1.4s + fmin v25.4s, v25.4s, v1.4s + fmin v26.4s, v26.4s, v1.4s + fmin v27.4s, v27.4s, v1.4s + fmin v28.4s, v28.4s, v1.4s + fmin v29.4s, v29.4s, v1.4s + fmin v30.4s, v30.4s, v1.4s + fmin v31.4s, v31.4s, v1.4s Relu: dup v0.4s, wzr - fmax v16.4s, v16.4s ,v0.4s - fmax v17.4s, v17.4s ,v0.4s - fmax v18.4s, v18.4s ,v0.4s - fmax v19.4s, v19.4s ,v0.4s - fmax v20.4s, v20.4s ,v0.4s - fmax v21.4s, v21.4s ,v0.4s - fmax v22.4s, v22.4s ,v0.4s - fmax v23.4s, v23.4s ,v0.4s - fmax v24.4s, v24.4s ,v0.4s - fmax v25.4s, v25.4s ,v0.4s - fmax v26.4s, v26.4s ,v0.4s - fmax v27.4s, v27.4s ,v0.4s - fmax v28.4s, v28.4s ,v0.4s - fmax v29.4s, v29.4s ,v0.4s - fmax v30.4s, v30.4s ,v0.4s - fmax v31.4s, v31.4s ,v0.4s + fmax v16.4s, v16.4s, v0.4s + fmax v17.4s, v17.4s, v0.4s + fmax v18.4s, v18.4s, v0.4s + fmax v19.4s, v19.4s, v0.4s + fmax v20.4s, v20.4s, v0.4s + fmax v21.4s, v21.4s, v0.4s + fmax v22.4s, v22.4s, v0.4s + fmax v23.4s, v23.4s, v0.4s + fmax v24.4s, v24.4s, v0.4s + fmax v25.4s, v25.4s, v0.4s + fmax v26.4s, v26.4s, v0.4s + fmax v27.4s, v27.4s, v0.4s + fmax v28.4s, v28.4s, v0.4s + fmax v29.4s, v29.4s, v0.4s + fmax v30.4s, v30.4s, v0.4s + fmax v31.4s, v31.4s, v0.4s WriteStart: cbnz x9, WriteC4 @@ -595,24 +595,24 @@ IndirectGemmStart: Relu6Half: movi v1.4s, #6 scvtf v1.4s, v1.4s - fmin v16.4s, v16.4s ,v1.4s - fmin v18.4s, v18.4s ,v1.4s - fmin v20.4s, v20.4s ,v1.4s - fmin v22.4s, v22.4s ,v1.4s - fmin v24.4s, v24.4s ,v1.4s - fmin v26.4s, v26.4s ,v1.4s - fmin v28.4s, v28.4s ,v1.4s - fmin v30.4s, v30.4s ,v1.4s + fmin v16.4s, v16.4s, v1.4s + fmin v18.4s, v18.4s, v1.4s + fmin v20.4s, v20.4s, v1.4s + fmin v22.4s, v22.4s, v1.4s + fmin v24.4s, v24.4s, v1.4s + fmin v26.4s, v26.4s, v1.4s + fmin v28.4s, v28.4s, v1.4s + fmin v30.4s, v30.4s, v1.4s ReluHalf: dup v0.4s, wzr - fmax v16.4s, v16.4s ,v0.4s - fmax v18.4s, v18.4s ,v0.4s - fmax v20.4s, v20.4s ,v0.4s - fmax v22.4s, v22.4s ,v0.4s - fmax v24.4s, v24.4s ,v0.4s - fmax v26.4s, v26.4s ,v0.4s - fmax v28.4s, v28.4s ,v0.4s - fmax v30.4s, v30.4s ,v0.4s + fmax v16.4s, v16.4s, v0.4s + fmax v18.4s, v18.4s, v0.4s + fmax v20.4s, v20.4s, v0.4s + fmax v22.4s, v22.4s, v0.4s + fmax v24.4s, v24.4s, v0.4s + fmax v26.4s, v26.4s, v0.4s + fmax v28.4s, v28.4s, v0.4s + fmax v30.4s, v30.4s, v0.4s WriteStartHalf: cbnz x9, Write4 diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/IndirectGemmInt8_4x4.S b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/IndirectGemmInt8_4x4.S index f70495e0e2..1843fed283 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/IndirectGemmInt8_4x4.S +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/IndirectGemmInt8_4x4.S @@ -87,14 +87,15 @@ IndirectGemmInt8_4x4: ld1 {v2.16b, v3.16b}, [x12], #32 smull v10.8h, v0.8b, v6.8b smull v11.8h, v0.8b, v7.8b + saddlp v16.4s, v8.8h smlal2 v10.8h, v0.16b, v6.16b smlal2 v11.8h, v0.16b, v7.16b - saddlp v16.4s, v8.8h + saddlp v17.4s, v9.8h smull v14.8h, v1.8b, v6.8b smull v15.8h, v1.8b, v7.8b + saddlp v18.4s, v10.8h smlal2 v14.8h, v1.16b, v6.16b smlal2 v15.8h, v1.16b, v7.16b - saddlp v17.4s, v9.8h subs x13, x5, #1 beq LoopIcEnd @@ -102,55 +103,55 @@ IndirectGemmInt8_4x4: LoopIc: // load input for output 1-8 ld1 {v0.16b, v1.16b}, [x12], #32 - sadalp v18.4s, v10.8h + sadalp v19.4s, v11.8h smull v8.8h, v2.8b, v4.8b smull v9.8h, v2.8b, v5.8b - sadalp v19.4s, v11.8h + sadalp v20.4s, v12.8h smlal2 v8.8h, v2.16b, v4.16b smlal2 v9.8h, v2.16b, v5.16b - sadalp v20.4s, v12.8h + sadalp v21.4s, v13.8h smull v10.8h, v2.8b, v6.8b smull v11.8h, v2.8b, v7.8b - sadalp v21.4s, v13.8h + sadalp v22.4s, v14.8h smlal2 v10.8h, v2.16b, v6.16b smlal2 v11.8h, v2.16b, v7.16b - sadalp v22.4s, v14.8h + sadalp v23.4s, v15.8h smull v12.8h, v3.8b, v4.8b smull v13.8h, v3.8b, v5.8b - sadalp v23.4s, v15.8h + sadalp v24.4s, v8.8h smlal2 v12.8h, v3.16b, v4.16b smlal2 v13.8h, v3.16b, v5.16b - sadalp v24.4s, v8.8h ld1 {v4.16b, v5.16b}, [x2], #32 + sadalp v25.4s, v9.8h smull v14.8h, v3.8b, v6.8b smull v15.8h, v3.8b, v7.8b - sadalp v25.4s, v9.8h + sadalp v26.4s, v10.8h smlal2 v14.8h, v3.16b, v6.16b smlal2 v15.8h, v3.16b, v7.16b - sadalp v26.4s, v10.8h ld1 {v6.16b, v7.16b}, [x2], #32 + sadalp v27.4s, v11.8h smull v8.8h, v0.8b, v4.8b smull v9.8h, v0.8b, v5.8b - sadalp v27.4s, v11.8h + sadalp v28.4s, v12.8h smlal2 v8.8h, v0.16b, v4.16b smlal2 v9.8h, v0.16b, v5.16b - sadalp v28.4s, v12.8h ld1 {v2.16b, v3.16b}, [x12], #32 + sadalp v29.4s, v13.8h smull v12.8h, v1.8b, v4.8b smull v13.8h, v1.8b, v5.8b - sadalp v29.4s, v13.8h + sadalp v30.4s, v14.8h smlal2 v12.8h, v1.16b, v4.16b smlal2 v13.8h, v1.16b, v5.16b - sadalp v30.4s, v14.8h + sadalp v31.4s, v15.8h smull v10.8h, v0.8b, v6.8b smull v11.8h, v0.8b, v7.8b - sadalp v31.4s, v15.8h + sadalp v16.4s, v8.8h smlal2 v10.8h, v0.16b, v6.16b smlal2 v11.8h, v0.16b, v7.16b - sadalp v16.4s, v8.8h + sadalp v17.4s, v9.8h smull v14.8h, v1.8b, v6.8b smull v15.8h, v1.8b, v7.8b - sadalp v17.4s, v9.8h + saddlp v18.4s, v10.8h smlal2 v14.8h, v1.16b, v6.16b smlal2 v15.8h, v1.16b, v7.16b @@ -158,33 +159,32 @@ IndirectGemmInt8_4x4: bne LoopIc LoopIcEnd: - sadalp v18.4s, v10.8h + sadalp v19.4s, v11.8h smull v8.8h, v2.8b, v4.8b smull v9.8h, v2.8b, v5.8b - sadalp v19.4s, v11.8h + sadalp v20.4s, v12.8h smlal2 v8.8h, v2.16b, v4.16b smlal2 v9.8h, v2.16b, v5.16b - sadalp v20.4s, v12.8h + sadalp v21.4s, v13.8h smull v10.8h, v2.8b, v6.8b smull v11.8h, v2.8b, v7.8b - sadalp v21.4s, v13.8h + sadalp v22.4s, v14.8h smlal2 v10.8h, v2.16b, v6.16b smlal2 v11.8h, v2.16b, v7.16b - sadalp v22.4s, v14.8h + sadalp v23.4s, v15.8h smull v12.8h, v3.8b, v4.8b smull v13.8h, v3.8b, v5.8b - sadalp v23.4s, v15.8h + sadalp v24.4s, v8.8h smlal2 v12.8h, v3.16b, v4.16b smlal2 v13.8h, v3.16b, v5.16b - sadalp v24.4s, v8.8h + sadalp v25.4s, v9.8h smull v14.8h, v3.8b, v6.8b smull v15.8h, v3.8b, v7.8b - sadalp v25.4s, v9.8h + sadalp v26.4s, v10.8h smlal2 v14.8h, v3.16b, v6.16b smlal2 v15.8h, v3.16b, v7.16b - sadalp v26.4s, v10.8h sadalp v27.4s, v11.8h - sadalp v28.4s ,v12.8h + sadalp v28.4s, v12.8h sadalp v29.4s, v13.8h sadalp v30.4s, v14.8h sadalp v31.4s, v15.8h @@ -204,6 +204,7 @@ IndirectGemmInt8_4x4: addp v26.4s, v26.4s, v27.4s addp v28.4s, v28.4s, v29.4s addp v30.4s, v30.4s, v31.4s + dup v12.4s, wzr cbz x3, NoReadBias ld1 {v12.4s}, [x3] NoReadBias: @@ -221,40 +222,40 @@ IndirectGemmInt8_4x4: add v28.4s, v28.4s, v12.4s dup v2.4s, w18 - sqshl v16.4s, v16.4s ,v2.4s - sqshl v20.4s, v20.4s ,v2.4s - sqshl v24.4s, v24.4s ,v2.4s - sqshl v28.4s, v28.4s ,v2.4s + sqshl v16.4s, v16.4s, v2.4s + sqshl v20.4s, v20.4s, v2.4s + sqshl v24.4s, v24.4s, v2.4s + sqshl v28.4s, v28.4s, v2.4s dup v3.4s, w17 - sqrdmulh v16.4s, v16.4s ,v3.4s - sqrdmulh v20.4s, v20.4s ,v3.4s - sqrdmulh v24.4s, v24.4s ,v3.4s - sqrdmulh v28.4s, v28.4s ,v3.4s + sqrdmulh v16.4s, v16.4s, v3.4s + sqrdmulh v20.4s, v20.4s, v3.4s + sqrdmulh v24.4s, v24.4s, v3.4s + sqrdmulh v28.4s, v28.4s, v3.4s dup v4.4s, w19 - sqrshl v16.4s, v16.4s ,v4.4s - sqrshl v20.4s, v20.4s ,v4.4s - sqrshl v24.4s, v24.4s ,v4.4s - sqrshl v28.4s, v28.4s ,v4.4s + sqrshl v16.4s, v16.4s, v4.4s + sqrshl v20.4s, v20.4s, v4.4s + sqrshl v24.4s, v24.4s, v4.4s + sqrshl v28.4s, v28.4s, v4.4s dup v5.4s, w16 - add v16.4s, v16.4s ,v5.4s - add v20.4s, v20.4s ,v5.4s - add v24.4s, v24.4s ,v5.4s - add v28.4s, v28.4s ,v5.4s + add v16.4s, v16.4s, v5.4s + add v20.4s, v20.4s, v5.4s + add v24.4s, v24.4s, v5.4s + add v28.4s, v28.4s, v5.4s dup v0.4s, w8 - smax v16.4s, v16.4s ,v0.4s - smax v20.4s, v20.4s ,v0.4s - smax v24.4s, v24.4s ,v0.4s - smax v28.4s, v28.4s ,v0.4s + smax v16.4s, v16.4s, v0.4s + smax v20.4s, v20.4s, v0.4s + smax v24.4s, v24.4s, v0.4s + smax v28.4s, v28.4s, v0.4s dup v1.4s, w9 - smin v16.4s, v16.4s ,v1.4s - smin v20.4s, v20.4s ,v1.4s - smin v24.4s, v24.4s ,v1.4s - smin v28.4s, v28.4s ,v1.4s + smin v16.4s, v16.4s, v1.4s + smin v20.4s, v20.4s, v1.4s + smin v24.4s, v24.4s, v1.4s + smin v28.4s, v28.4s, v1.4s sqxtn v13.4h, v16.4s sqxtn2 v13.8h, v20.4s diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/IndirectGemmInt8_24x4_dp.S b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/IndirectGemmInt8_24x4_dp.S index 278b4376b2..ca7c73e62e 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/IndirectGemmInt8_24x4_dp.S +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/IndirectGemmInt8_24x4_dp.S @@ -277,160 +277,160 @@ IndirectGemmInt8_24x4_dp: Quantization: dup v2.4s, w18 - sqshl v8.4s, v8.4s ,v2.4s - sqshl v9.4s, v9.4s ,v2.4s - sqshl v10.4s, v10.4s ,v2.4s - sqshl v11.4s, v11.4s ,v2.4s - sqshl v12.4s, v12.4s ,v2.4s - sqshl v13.4s, v13.4s ,v2.4s - sqshl v14.4s, v14.4s ,v2.4s - sqshl v15.4s, v15.4s ,v2.4s - sqshl v16.4s, v16.4s ,v2.4s - sqshl v17.4s, v17.4s ,v2.4s - sqshl v18.4s, v18.4s ,v2.4s - sqshl v19.4s, v19.4s ,v2.4s - sqshl v20.4s, v20.4s ,v2.4s - sqshl v21.4s, v21.4s ,v2.4s - sqshl v22.4s, v22.4s ,v2.4s - sqshl v23.4s, v23.4s ,v2.4s - sqshl v24.4s, v24.4s ,v2.4s - sqshl v25.4s, v25.4s ,v2.4s - sqshl v26.4s, v26.4s ,v2.4s - sqshl v27.4s, v27.4s ,v2.4s - sqshl v28.4s, v28.4s ,v2.4s - sqshl v29.4s, v29.4s ,v2.4s - sqshl v30.4s, v30.4s ,v2.4s - sqshl v31.4s, v31.4s ,v2.4s + sqshl v8.4s, v8.4s, v2.4s + sqshl v9.4s, v9.4s, v2.4s + sqshl v10.4s, v10.4s, v2.4s + sqshl v11.4s, v11.4s, v2.4s + sqshl v12.4s, v12.4s, v2.4s + sqshl v13.4s, v13.4s, v2.4s + sqshl v14.4s, v14.4s, v2.4s + sqshl v15.4s, v15.4s, v2.4s + sqshl v16.4s, v16.4s, v2.4s + sqshl v17.4s, v17.4s, v2.4s + sqshl v18.4s, v18.4s, v2.4s + sqshl v19.4s, v19.4s, v2.4s + sqshl v20.4s, v20.4s, v2.4s + sqshl v21.4s, v21.4s, v2.4s + sqshl v22.4s, v22.4s, v2.4s + sqshl v23.4s, v23.4s, v2.4s + sqshl v24.4s, v24.4s, v2.4s + sqshl v25.4s, v25.4s, v2.4s + sqshl v26.4s, v26.4s, v2.4s + sqshl v27.4s, v27.4s, v2.4s + sqshl v28.4s, v28.4s, v2.4s + sqshl v29.4s, v29.4s, v2.4s + sqshl v30.4s, v30.4s, v2.4s + sqshl v31.4s, v31.4s, v2.4s dup v3.4s, w17 - sqrdmulh v8.4s, v8.4s ,v3.4s - sqrdmulh v9.4s, v9.4s ,v3.4s - sqrdmulh v10.4s, v10.4s ,v3.4s - sqrdmulh v11.4s, v11.4s ,v3.4s - sqrdmulh v12.4s, v12.4s ,v3.4s - sqrdmulh v13.4s, v13.4s ,v3.4s - sqrdmulh v14.4s, v14.4s ,v3.4s - sqrdmulh v15.4s, v15.4s ,v3.4s - sqrdmulh v16.4s, v16.4s ,v3.4s - sqrdmulh v17.4s, v17.4s ,v3.4s - sqrdmulh v18.4s, v18.4s ,v3.4s - sqrdmulh v19.4s, v19.4s ,v3.4s - sqrdmulh v20.4s, v20.4s ,v3.4s - sqrdmulh v21.4s, v21.4s ,v3.4s - sqrdmulh v22.4s, v22.4s ,v3.4s - sqrdmulh v23.4s, v23.4s ,v3.4s - sqrdmulh v24.4s, v24.4s ,v3.4s - sqrdmulh v25.4s, v25.4s ,v3.4s - sqrdmulh v26.4s, v26.4s ,v3.4s - sqrdmulh v27.4s, v27.4s ,v3.4s - sqrdmulh v28.4s, v28.4s ,v3.4s - sqrdmulh v29.4s, v29.4s ,v3.4s - sqrdmulh v30.4s, v30.4s ,v3.4s - sqrdmulh v31.4s, v31.4s ,v3.4s + sqrdmulh v8.4s, v8.4s, v3.4s + sqrdmulh v9.4s, v9.4s, v3.4s + sqrdmulh v10.4s, v10.4s, v3.4s + sqrdmulh v11.4s, v11.4s, v3.4s + sqrdmulh v12.4s, v12.4s, v3.4s + sqrdmulh v13.4s, v13.4s, v3.4s + sqrdmulh v14.4s, v14.4s, v3.4s + sqrdmulh v15.4s, v15.4s, v3.4s + sqrdmulh v16.4s, v16.4s, v3.4s + sqrdmulh v17.4s, v17.4s, v3.4s + sqrdmulh v18.4s, v18.4s, v3.4s + sqrdmulh v19.4s, v19.4s, v3.4s + sqrdmulh v20.4s, v20.4s, v3.4s + sqrdmulh v21.4s, v21.4s, v3.4s + sqrdmulh v22.4s, v22.4s, v3.4s + sqrdmulh v23.4s, v23.4s, v3.4s + sqrdmulh v24.4s, v24.4s, v3.4s + sqrdmulh v25.4s, v25.4s, v3.4s + sqrdmulh v26.4s, v26.4s, v3.4s + sqrdmulh v27.4s, v27.4s, v3.4s + sqrdmulh v28.4s, v28.4s, v3.4s + sqrdmulh v29.4s, v29.4s, v3.4s + sqrdmulh v30.4s, v30.4s, v3.4s + sqrdmulh v31.4s, v31.4s, v3.4s dup v4.4s, w19 - sqrshl v8.4s, v8.4s ,v4.4s - sqrshl v9.4s, v9.4s ,v4.4s - sqrshl v10.4s, v10.4s ,v4.4s - sqrshl v11.4s, v11.4s ,v4.4s - sqrshl v12.4s, v12.4s ,v4.4s - sqrshl v13.4s, v13.4s ,v4.4s - sqrshl v14.4s, v14.4s ,v4.4s - sqrshl v15.4s, v15.4s ,v4.4s - sqrshl v16.4s, v16.4s ,v4.4s - sqrshl v17.4s, v17.4s ,v4.4s - sqrshl v18.4s, v18.4s ,v4.4s - sqrshl v19.4s, v19.4s ,v4.4s - sqrshl v20.4s, v20.4s ,v4.4s - sqrshl v21.4s, v21.4s ,v4.4s - sqrshl v22.4s, v22.4s ,v4.4s - sqrshl v23.4s, v23.4s ,v4.4s - sqrshl v24.4s, v24.4s ,v4.4s - sqrshl v25.4s, v25.4s ,v4.4s - sqrshl v26.4s, v26.4s ,v4.4s - sqrshl v27.4s, v27.4s ,v4.4s - sqrshl v28.4s, v28.4s ,v4.4s - sqrshl v29.4s, v29.4s ,v4.4s - sqrshl v30.4s, v30.4s ,v4.4s - sqrshl v31.4s, v31.4s ,v4.4s + sqrshl v8.4s, v8.4s, v4.4s + sqrshl v9.4s, v9.4s, v4.4s + sqrshl v10.4s, v10.4s, v4.4s + sqrshl v11.4s, v11.4s, v4.4s + sqrshl v12.4s, v12.4s, v4.4s + sqrshl v13.4s, v13.4s, v4.4s + sqrshl v14.4s, v14.4s, v4.4s + sqrshl v15.4s, v15.4s, v4.4s + sqrshl v16.4s, v16.4s, v4.4s + sqrshl v17.4s, v17.4s, v4.4s + sqrshl v18.4s, v18.4s, v4.4s + sqrshl v19.4s, v19.4s, v4.4s + sqrshl v20.4s, v20.4s, v4.4s + sqrshl v21.4s, v21.4s, v4.4s + sqrshl v22.4s, v22.4s, v4.4s + sqrshl v23.4s, v23.4s, v4.4s + sqrshl v24.4s, v24.4s, v4.4s + sqrshl v25.4s, v25.4s, v4.4s + sqrshl v26.4s, v26.4s, v4.4s + sqrshl v27.4s, v27.4s, v4.4s + sqrshl v28.4s, v28.4s, v4.4s + sqrshl v29.4s, v29.4s, v4.4s + sqrshl v30.4s, v30.4s, v4.4s + sqrshl v31.4s, v31.4s, v4.4s dup v5.4s, w16 - add v8.4s, v8.4s ,v5.4s - add v9.4s, v9.4s ,v5.4s - add v10.4s, v10.4s ,v5.4s - add v11.4s, v11.4s ,v5.4s - add v12.4s, v12.4s ,v5.4s - add v13.4s, v13.4s ,v5.4s - add v14.4s, v14.4s ,v5.4s - add v15.4s, v15.4s ,v5.4s - add v16.4s, v16.4s ,v5.4s - add v17.4s, v17.4s ,v5.4s - add v18.4s, v18.4s ,v5.4s - add v19.4s, v19.4s ,v5.4s - add v20.4s, v20.4s ,v5.4s - add v21.4s, v21.4s ,v5.4s - add v22.4s, v22.4s ,v5.4s - add v23.4s, v23.4s ,v5.4s - add v24.4s, v24.4s ,v5.4s - add v25.4s, v25.4s ,v5.4s - add v26.4s, v26.4s ,v5.4s - add v27.4s, v27.4s ,v5.4s - add v28.4s, v28.4s ,v5.4s - add v29.4s, v29.4s ,v5.4s - add v30.4s, v30.4s ,v5.4s - add v31.4s, v31.4s ,v5.4s + add v8.4s, v8.4s, v5.4s + add v9.4s, v9.4s, v5.4s + add v10.4s, v10.4s, v5.4s + add v11.4s, v11.4s, v5.4s + add v12.4s, v12.4s, v5.4s + add v13.4s, v13.4s, v5.4s + add v14.4s, v14.4s, v5.4s + add v15.4s, v15.4s, v5.4s + add v16.4s, v16.4s, v5.4s + add v17.4s, v17.4s, v5.4s + add v18.4s, v18.4s, v5.4s + add v19.4s, v19.4s, v5.4s + add v20.4s, v20.4s, v5.4s + add v21.4s, v21.4s, v5.4s + add v22.4s, v22.4s, v5.4s + add v23.4s, v23.4s, v5.4s + add v24.4s, v24.4s, v5.4s + add v25.4s, v25.4s, v5.4s + add v26.4s, v26.4s, v5.4s + add v27.4s, v27.4s, v5.4s + add v28.4s, v28.4s, v5.4s + add v29.4s, v29.4s, v5.4s + add v30.4s, v30.4s, v5.4s + add v31.4s, v31.4s, v5.4s dup v0.4s, w8 - smax v8.4s, v8.4s ,v0.4s - smax v9.4s, v9.4s ,v0.4s - smax v10.4s, v10.4s ,v0.4s - smax v11.4s, v11.4s ,v0.4s - smax v12.4s, v12.4s ,v0.4s - smax v13.4s, v13.4s ,v0.4s - smax v14.4s, v14.4s ,v0.4s - smax v15.4s, v15.4s ,v0.4s - smax v16.4s, v16.4s ,v0.4s - smax v17.4s, v17.4s ,v0.4s - smax v18.4s, v18.4s ,v0.4s - smax v19.4s, v19.4s ,v0.4s - smax v20.4s, v20.4s ,v0.4s - smax v21.4s, v21.4s ,v0.4s - smax v22.4s, v22.4s ,v0.4s - smax v23.4s, v23.4s ,v0.4s - smax v24.4s, v24.4s ,v0.4s - smax v25.4s, v25.4s ,v0.4s - smax v26.4s, v26.4s ,v0.4s - smax v27.4s, v27.4s ,v0.4s - smax v28.4s, v28.4s ,v0.4s - smax v29.4s, v29.4s ,v0.4s - smax v30.4s, v30.4s ,v0.4s - smax v31.4s, v31.4s ,v0.4s + smax v8.4s, v8.4s, v0.4s + smax v9.4s, v9.4s, v0.4s + smax v10.4s, v10.4s, v0.4s + smax v11.4s, v11.4s, v0.4s + smax v12.4s, v12.4s, v0.4s + smax v13.4s, v13.4s, v0.4s + smax v14.4s, v14.4s, v0.4s + smax v15.4s, v15.4s, v0.4s + smax v16.4s, v16.4s, v0.4s + smax v17.4s, v17.4s, v0.4s + smax v18.4s, v18.4s, v0.4s + smax v19.4s, v19.4s, v0.4s + smax v20.4s, v20.4s, v0.4s + smax v21.4s, v21.4s, v0.4s + smax v22.4s, v22.4s, v0.4s + smax v23.4s, v23.4s, v0.4s + smax v24.4s, v24.4s, v0.4s + smax v25.4s, v25.4s, v0.4s + smax v26.4s, v26.4s, v0.4s + smax v27.4s, v27.4s, v0.4s + smax v28.4s, v28.4s, v0.4s + smax v29.4s, v29.4s, v0.4s + smax v30.4s, v30.4s, v0.4s + smax v31.4s, v31.4s, v0.4s dup v1.4s, w9 - smin v8.4s, v8.4s ,v1.4s - smin v9.4s, v9.4s ,v1.4s - smin v10.4s, v10.4s ,v1.4s - smin v11.4s, v11.4s ,v1.4s - smin v12.4s, v12.4s ,v1.4s - smin v13.4s, v13.4s ,v1.4s - smin v14.4s, v14.4s ,v1.4s - smin v15.4s, v15.4s ,v1.4s - smin v16.4s, v16.4s ,v1.4s - smin v17.4s, v17.4s ,v1.4s - smin v18.4s, v18.4s ,v1.4s - smin v19.4s, v19.4s ,v1.4s - smin v20.4s, v20.4s ,v1.4s - smin v21.4s, v21.4s ,v1.4s - smin v22.4s, v22.4s ,v1.4s - smin v23.4s, v23.4s ,v1.4s - smin v24.4s, v24.4s ,v1.4s - smin v25.4s, v25.4s ,v1.4s - smin v26.4s, v26.4s ,v1.4s - smin v27.4s, v27.4s ,v1.4s - smin v28.4s, v28.4s ,v1.4s - smin v29.4s, v29.4s ,v1.4s - smin v30.4s, v30.4s ,v1.4s - smin v31.4s, v31.4s ,v1.4s + smin v8.4s, v8.4s, v1.4s + smin v9.4s, v9.4s, v1.4s + smin v10.4s, v10.4s, v1.4s + smin v11.4s, v11.4s, v1.4s + smin v12.4s, v12.4s, v1.4s + smin v13.4s, v13.4s, v1.4s + smin v14.4s, v14.4s, v1.4s + smin v15.4s, v15.4s, v1.4s + smin v16.4s, v16.4s, v1.4s + smin v17.4s, v17.4s, v1.4s + smin v18.4s, v18.4s, v1.4s + smin v19.4s, v19.4s, v1.4s + smin v20.4s, v20.4s, v1.4s + smin v21.4s, v21.4s, v1.4s + smin v22.4s, v22.4s, v1.4s + smin v23.4s, v23.4s, v1.4s + smin v24.4s, v24.4s, v1.4s + smin v25.4s, v25.4s, v1.4s + smin v26.4s, v26.4s, v1.4s + smin v27.4s, v27.4s, v1.4s + smin v28.4s, v28.4s, v1.4s + smin v29.4s, v29.4s, v1.4s + smin v30.4s, v30.4s, v1.4s + smin v31.4s, v31.4s, v1.4s sqxtn v6.4h, v8.4s sqxtn2 v6.8h, v9.4s diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/conv_int8.cc b/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/conv_int8.cc index 83b1d9bc5f..aa9379932b 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/conv_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/conv_int8.cc @@ -29,11 +29,13 @@ void IndirectGemmInt8(int8_t *dst, int32_t *tmp_dst, const int8_t *src, const in int32_t act_min = conv_param->conv_quant_arg_.out_act_min_[0]; int32_t act_max = conv_param->conv_quant_arg_.out_act_max_[0]; #ifdef __aarch64__ - IndirectGemmInt8_4x4(dst, src, weight, bias, kernel_plane, ic4, output_channel, output_channel * sizeof(int8_t), - input_sum, act_min, act_max, out_zp, out_multiplier, shift_before, shift_after); + IndirectGemmInt8_4x4(dst, src, weight, bias, UP_DIV(kernel_plane, C4NUM), ic4, output_channel, + output_channel * sizeof(int8_t), input_sum, act_min, act_max, out_zp, out_multiplier, + shift_before, shift_after); #elif defined(ENABLE_ARM32) - IndirectGemmInt8_2x4(dst, src, weight, bias, kernel_plane, ic4, output_channel, output_channel * sizeof(int8_t), - input_sum, act_min, act_max, out_zp, out_multiplier, shift_before, shift_after); + IndirectGemmInt8_2x4(dst, src, weight, bias, UP_DIV(kernel_plane, C4NUM), ic4, output_channel, + output_channel * sizeof(int8_t), input_sum, act_min, act_max, out_zp, out_multiplier, + shift_before, shift_after); #else int tile_num = conv_param->tile_num_; int plane_c4 = UP_DIV(kernel_plane, C4NUM);