|
-
- #ifdef __aarch64__
- .text
- .align 5
- //.p2align 5,,15
- .global PreSum4x16Int8Pert
- #ifndef __APPLE__
- .type PreSum4x16Int8Pert, %function
- #endif
-
- // void PreSum4x16Int8Pert(const int8_t *src, int32_t *dst, size_t row4, size_t col16, int32_t filter_zp);
-
- // x0 src
- // x1 dst
- // w2 row4
- // w3 col16
- // w4 filter_zp
-
- // PreSum4x16Int8Pert: sums of signed bytes, scaled by filter_zp.
- //
- // Each pass of CalLoop reads 64 bytes from src as four 16-byte vectors and
- // reduces every vector to one int32 sum; lane i of v16 accumulates the sum
- // of the i-th vector of every pass. Once w6 reaches col16 the four totals
- // are multiplied by filter_zp and stored to dst as four int32 values.
- // RowLoop repeats this until w5 reaches row4.
- //
- // Both loops exit on equality only, and only the low 32 bits of row4 and
- // col16 are examined (w2 / w3). row4 must therefore be a multiple of 4 and
- // col16 a multiple of 16; 0 is accepted for either.
- //
- // Registers:
- //   x0   src cursor, post-incremented by 64 per CalLoop pass
- //   x1   dst cursor, post-incremented by 16 per RowLoop pass
- //   w2   row4      w3  col16      w4  filter_zp
- //   w5   rows consumed so far (step 4)
- //   w6   columns consumed so far in the current row group (step 16)
- //   v16  running int32 sums    v17  filter_zp in all four lanes
- // Clobbers: x0, x1, w5, w6, v0-v7, v16, v17, condition flags.
- PreSum4x16Int8Pert:
-     dup     v17.4s, w4
-     mov     w5, #0
-
- RowLoop:
-     cmp     w5, w2
-     beq     End
-     add     w5, w5, #4
-     dup     v16.4s, wzr                 // clear the four running sums
-     mov     w6, #0
-
- CalLoop:
-     cmp     w6, w3
-     beq     Write
-     add     w6, w6, #16
-
-     // Four consecutive 16-byte vectors; ld1 does not de-interleave.
-     ld1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
-
-     // Widen while adding neighbours: 16 x s8 -> 8 x s16 -> 4 x s32.
-     saddlp  v4.8h, v0.16b
-     saddlp  v5.8h, v1.16b
-     saddlp  v6.8h, v2.16b
-     saddlp  v7.8h, v3.16b
-
-     saddlp  v0.4s, v4.8h
-     saddlp  v1.4s, v5.8h
-     saddlp  v2.4s, v6.8h
-     saddlp  v3.4s, v7.8h
-
-     // Pairwise-add tree: v0 = { sum(v0), sum(v1), sum(v2), sum(v3) }.
-     addp    v0.4s, v0.4s, v1.4s
-     addp    v2.4s, v2.4s, v3.4s
-     addp    v0.4s, v0.4s, v2.4s
-
-     add     v16.4s, v16.4s, v0.4s
-     b       CalLoop
-
- Write:
-     mul     v16.4s, v16.4s, v17.4s      // scale the sums by filter_zp
-     st1     {v16.4s}, [x1], #16
-     // Unconditional back-edge: must not depend on flags left over from the
-     // cmp in CalLoop.
-     b       RowLoop
-
- End:
-     ret
- #endif
|