| @@ -1,5 +1,6 @@ | |||||
| #ifdef __arm__ | #ifdef __arm__ | ||||
| #ifndef __aarch64__ | #ifndef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -17,7 +18,7 @@ | |||||
| // r0: dst, r1: src, r2: weight, r3: bias, r4: height, r5: width, r6: in_kh_step, r7: in_kw_step, | // r0: dst, r1: src, r2: weight, r3: bias, r4: height, r5: width, r6: in_kh_step, r7: in_kw_step, | ||||
| // r8: channel, r9: in_zp, r10: out_zp, r11: out_multiplier, r12: left_shift, r13: right_shift | // r8: channel, r9: in_zp, r10: out_zp, r11: out_multiplier, r12: left_shift, r13: right_shift | ||||
| // r14: acc_min, r15: acc_max | // r14: acc_min, r15: acc_max | ||||
| ConvDw3x3Int8BorderPixel: | |||||
| asm_function ConvDw3x3Int8BorderPixel | |||||
| // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr" | // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr" | ||||
| // according to https://stackoverflow.com/questions/53625807 | // according to https://stackoverflow.com/questions/53625807 | ||||
| // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway | // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef ENABLE_ARM32 | #ifdef ENABLE_ARM32 | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -11,7 +12,7 @@ | |||||
| // size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6) | // size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6) | ||||
| // r0: dst, r1: src, r2: weight, r3: bias, r4: height, r5: width, r6: in_kh_step, r7: in_kw_step, | // r0: dst, r1: src, r2: weight, r3: bias, r4: height, r5: width, r6: in_kh_step, r7: in_kw_step, | ||||
| // r8: kernel_w, r9: relu, r10: relu6 | // r8: kernel_w, r9: relu, r10: relu6 | ||||
| ConvDwFp32Border: | |||||
| asm_function ConvDwFp32Border | |||||
| // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf | // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf | ||||
| push {r4-r12, lr} | push {r4-r12, lr} | ||||
| vpush {q4-q7} | vpush {q4-q7} | ||||
| @@ -1,5 +1,6 @@ | |||||
| #ifdef __arm__ | #ifdef __arm__ | ||||
| #ifndef __aarch64__ | #ifndef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -14,7 +15,7 @@ | |||||
| // r0: dst, r1: src, r2: weight, r3: bias, #0: height, #4: width, #8: kernel_h, #12: kernel_w, | // r0: dst, r1: src, r2: weight, r3: bias, #0: height, #4: width, #8: kernel_h, #12: kernel_w, | ||||
| // #16: out_h_step, #20: block_channel, #24: in_sh_step, #28: in_sw_step, #32: in_kh_step,#36: in_kw_step | // #16: out_h_step, #20: block_channel, #24: in_sh_step, #28: in_sw_step, #32: in_kh_step,#36: in_kw_step | ||||
| // #40: relu, #44: relu6 | // #40: relu, #44: relu6 | ||||
| ConvDwFp32Center: | |||||
| asm_function ConvDwFp32Center | |||||
| // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr" | // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr" | ||||
| // according to https://stackoverflow.com/questions/53625807 | // according to https://stackoverflow.com/questions/53625807 | ||||
| // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway | // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef ENABLE_ARM32 | #ifdef ENABLE_ARM32 | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -11,7 +12,7 @@ | |||||
| // size_t num_pixels, size_t input_channel, size_t input_step) | // size_t num_pixels, size_t input_channel, size_t input_step) | ||||
| // r0: output_ptr, r1: input_ptr, r2: filter_ptr, r3: num_pixels, | // r0: output_ptr, r1: input_ptr, r2: filter_ptr, r3: num_pixels, | ||||
| // r4: input_channel, r5: input_step | // r4: input_channel, r5: input_step | ||||
| ConvDwFp32Row: | |||||
| asm_function ConvDwFp32Row | |||||
| // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf | // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf | ||||
| push {r4-r6, r8, r10, r11} | push {r4-r6, r8, r10, r11} | ||||
| @@ -1,5 +1,6 @@ | |||||
| #ifdef __arm__ | #ifdef __arm__ | ||||
| #ifndef __aarch64__ | #ifndef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -15,7 +16,7 @@ | |||||
| // #-48: dst, #-44: src, #-40: weight, #-36: bias, #0: height, #4: width, #8: kernel_h, #12: kernel_w, | // #-48: dst, #-44: src, #-40: weight, #-36: bias, #0: height, #4: width, #8: kernel_h, #12: kernel_w, | ||||
| // #16: out_h_step, #20: block_channel, #24: in_sh_step, #28: in_sw_step, #32: in_kh_step, #36: in_kw_step | // #16: out_h_step, #20: block_channel, #24: in_sh_step, #28: in_sw_step, #32: in_kh_step, #36: in_kw_step | ||||
| // #40: in_zp, #44: out_zp, #48: out_multiplier, #52: left_shift, #56: right_shift, #60:acc_min, #64: acc_max | // #40: in_zp, #44: out_zp, #48: out_multiplier, #52: left_shift, #56: right_shift, #60:acc_min, #64: acc_max | ||||
| ConvDwInt8Center: | |||||
| asm_function ConvDwInt8Center | |||||
| // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr" | // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr" | ||||
| // according to https://stackoverflow.com/questions/53625807 | // according to https://stackoverflow.com/questions/53625807 | ||||
| // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway | // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway | ||||
| @@ -1,5 +1,6 @@ | |||||
| #ifdef __arm__ | #ifdef __arm__ | ||||
| #ifndef __aarch64__ | #ifndef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -13,7 +14,7 @@ | |||||
| // r0: dst, r1: buffer, r2: num_pixels, r3: output_zp, r4: out_multiplier, | // r0: dst, r1: buffer, r2: num_pixels, r3: output_zp, r4: out_multiplier, | ||||
| // r5: left_shift, r6: right_shift, r7: acc_min, r8: acc_max | // r5: left_shift, r6: right_shift, r7: acc_min, r8: acc_max | ||||
| ConvDwInt8PostAlign4: | |||||
| asm_function ConvDwInt8PostAlign4 | |||||
| // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr" | // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr" | ||||
| // according to https://stackoverflow.com/questions/53625807 | // according to https://stackoverflow.com/questions/53625807 | ||||
| // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway | // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway | ||||
| @@ -1,5 +1,6 @@ | |||||
| #ifdef __arm__ | #ifdef __arm__ | ||||
| #ifndef __aarch64__ | #ifndef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -13,7 +14,7 @@ | |||||
| // r0: dst, r1: buffer, r2: num_pixels, r3: output_zp, r4: out_multiplier, | // r0: dst, r1: buffer, r2: num_pixels, r3: output_zp, r4: out_multiplier, | ||||
| // r5: left_shift, r6: right_shift, r7: acc_min, r8: acc_max | // r5: left_shift, r6: right_shift, r7: acc_min, r8: acc_max | ||||
| ConvDwInt8PostAlign4PerChannel: | |||||
| asm_function ConvDwInt8PostAlign4PerChannel | |||||
| // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr" | // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr" | ||||
| // according to https://stackoverflow.com/questions/53625807 | // according to https://stackoverflow.com/questions/53625807 | ||||
| // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway | // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway | ||||
| @@ -1,5 +1,6 @@ | |||||
| #ifdef __arm__ | #ifdef __arm__ | ||||
| #ifndef __aarch64__ | #ifndef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -13,7 +14,7 @@ | |||||
| // r0: output_ptr, r1: input_ptr, r2: weight_ptr, r3: num_pixels, | // r0: output_ptr, r1: input_ptr, r2: weight_ptr, r3: num_pixels, | ||||
| // r4: output_channel, r5: input_step, r6: input_zp, | // r4: output_channel, r5: input_step, r6: input_zp, | ||||
| ConvDwInt8Row: | |||||
| asm_function ConvDwInt8Row | |||||
| // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr" | // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr" | ||||
| // according to https://stackoverflow.com/questions/53625807 | // according to https://stackoverflow.com/questions/53625807 | ||||
| // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway | // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway | ||||
| @@ -1,5 +1,6 @@ | |||||
| #ifdef __arm__ | #ifdef __arm__ | ||||
| #ifndef __aarch64__ | #ifndef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -13,7 +14,7 @@ | |||||
| // size_t in_sw_step, size_t in_kh_step, size_t in_kw_step); | // size_t in_sw_step, size_t in_kh_step, size_t in_kw_step); | ||||
| // r0: dst, r1: src, r2: weight, r3: height, r4: width, #52: kernel_h, #56: kernel_w, #60: out_h_step | // r0: dst, r1: src, r2: weight, r3: height, r4: width, #52: kernel_h, #56: kernel_w, #60: out_h_step | ||||
| // #64: block_channel, #68: in_sh_step, #72: in_sw_step, #76: in_kh_step, #80: in_kw_step | // #64: block_channel, #68: in_sh_step, #72: in_sw_step, #76: in_kh_step, #80: in_kw_step | ||||
| DeconvDwFp32Center: | |||||
| asm_function DeconvDwFp32Center | |||||
| // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr" | // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr" | ||||
| // according to https://stackoverflow.com/questions/53625807 | // according to https://stackoverflow.com/questions/53625807 | ||||
| // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway | // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway | ||||
| @@ -1,5 +1,6 @@ | |||||
| #ifdef __arm__ | #ifdef __arm__ | ||||
| #ifndef __aarch64__ | #ifndef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -13,7 +14,7 @@ | |||||
| // size_t in_sw_step, size_t in_kh_step, size_t in_kw_step); | // size_t in_sw_step, size_t in_kh_step, size_t in_kw_step); | ||||
| // r0: dst, r1: src, r2: weight, r3: height, r4: width, #52: kernel_h, #56: kernel_w, #60: out_h_step | // r0: dst, r1: src, r2: weight, r3: height, r4: width, #52: kernel_h, #56: kernel_w, #60: out_h_step | ||||
| // #64: block_channel, #68: in_sh_step, #72: in_sw_step, #76: in_kh_step, #80: in_kw_step | // #64: block_channel, #68: in_sh_step, #72: in_sw_step, #76: in_kh_step, #80: in_kw_step | ||||
| DeconvDwInt8Center: | |||||
| asm_function DeconvDwInt8Center | |||||
| // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr" | // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr" | ||||
| // according to https://stackoverflow.com/questions/53625807 | // according to https://stackoverflow.com/questions/53625807 | ||||
| // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway | // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway | ||||
| @@ -1,5 +1,6 @@ | |||||
| #ifdef __arm__ | #ifdef __arm__ | ||||
| #ifndef __aarch64__ | #ifndef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -14,7 +15,7 @@ | |||||
| // r0: dst, r1: output_buffer, r2: bias, r3: block_channel, r4: pixel_nums, r5: out_multiplier, | // r0: dst, r1: output_buffer, r2: bias, r3: block_channel, r4: pixel_nums, r5: out_multiplier, | ||||
| // r6: left_shift, r7: right_shift, r8: out_zp, r9: acc_min, r10: acc_max | // r6: left_shift, r7: right_shift, r8: out_zp, r9: acc_min, r10: acc_max | ||||
| DeconvDwInt8Post: | |||||
| asm_function DeconvDwInt8Post | |||||
| // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr" | // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr" | ||||
| // according to https://stackoverflow.com/questions/53625807 | // according to https://stackoverflow.com/questions/53625807 | ||||
| // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway | // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef ENABLE_ARM32 | #ifdef ENABLE_ARM32 | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -9,7 +10,7 @@ | |||||
| // void IndirectGemmInt16to32_8x4(int *output, short *input, short *weight, size_t kszie, size_t ic8, size_t oc4, size_t offset); | // void IndirectGemmInt16to32_8x4(int *output, short *input, short *weight, size_t kszie, size_t ic8, size_t oc4, size_t offset); | ||||
| // r0: output, r1: input, r2: weight, r3: kszie, r4: ic8, r5: oc4, r6: offset | // r0: output, r1: input, r2: weight, r3: kszie, r4: ic8, r5: oc4, r6: offset | ||||
| IndirectGemmInt16to32_8x4: | |||||
| asm_function IndirectGemmInt16to32_8x4 | |||||
| .macro INIT_ZERO | .macro INIT_ZERO | ||||
| // we could also use "vmov.s32 q12, #0" to initialize q12 by 0 | // we could also use "vmov.s32 q12, #0" to initialize q12 by 0 | ||||
| @@ -1,5 +1,6 @@ | |||||
| #ifdef __arm__ | #ifdef __arm__ | ||||
| #ifndef __aarch64__ | #ifndef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -13,7 +14,7 @@ | |||||
| // int32_t *shift_before, int32_t *shift_after, size_t asymmetric, size_t per_channel, size_t per_channel_offset); | // int32_t *shift_before, int32_t *shift_after, size_t asymmetric, size_t per_channel, size_t per_channel_offset); | ||||
| // r0: output, r1: input, r2: weight, r3: bias, r4: kSize, r5: ic4, r6: oc, r7: offset | // r0: output, r1: input, r2: weight, r3: bias, r4: kSize, r5: ic4, r6: oc, r7: offset | ||||
| // r8: input_sum, r10: act_min, r11: act_max, r10: out_zp, r11: out_multiplier, r10: shift_before, r11: shift_after | // r8: input_sum, r10: act_min, r11: act_max, r10: out_zp, r11: out_multiplier, r10: shift_before, r11: shift_after | ||||
| IndirectGemmInt8_2x4: | |||||
| asm_function IndirectGemmInt8_2x4 | |||||
| .macro INIT_BIAS | .macro INIT_BIAS | ||||
| veor q10, q10, q10 | veor q10, q10, q10 | ||||
| @@ -221,7 +222,7 @@ IndirectGemmInt8_2x4: | |||||
| vqmovn.s32 d31, q12 | vqmovn.s32 d31, q12 | ||||
| vqmovn.s16 d0, q15 | vqmovn.s16 d0, q15 | ||||
| // prefetching is not prefered while writing results in spite of cache missings | |||||
| // prefetching is not preferred while writing results in spite of cache misses | |||||
| // you could try prfm pstl2strm | // you could try prfm pstl2strm | ||||
| WriteStart: | WriteStart: | ||||
| cmp r6, #1 | cmp r6, #1 | ||||
| @@ -1,5 +1,6 @@ | |||||
| #ifdef __arm__ | #ifdef __arm__ | ||||
| #ifndef __aarch64__ | #ifndef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -17,7 +18,7 @@ | |||||
| // r5: depth | // r5: depth | ||||
| // r6: col | // r6: col | ||||
| MatVecMulFp32: | |||||
| asm_function MatVecMulFp32 | |||||
| // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf | // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf | ||||
| push {r0-r8, r10, r11, lr} | push {r0-r8, r10, r11, lr} | ||||
| add sp, sp, #48 | add sp, sp, #48 | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef ENABLE_ARM32 | #ifdef ENABLE_ARM32 | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| .global MatmulFloatNeon32 | .global MatmulFloatNeon32 | ||||
| @@ -19,7 +20,7 @@ | |||||
| // r8: stride | // r8: stride | ||||
| // lr: writeNhwc/writeWino | // lr: writeNhwc/writeWino | ||||
| MatmulFloatNeon32: | |||||
| asm_function MatmulFloatNeon32 | |||||
| // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf | // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf | ||||
| push {r0-r8, r10, r11, lr} | push {r0-r8, r10, r11, lr} | ||||
| add sp, sp, #48 | add sp, sp, #48 | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef ENABLE_ARM32 | #ifdef ENABLE_ARM32 | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| .global MatmulFloatNeon32Opt | .global MatmulFloatNeon32Opt | ||||
| @@ -19,7 +20,7 @@ | |||||
| // r8: stride | // r8: stride | ||||
| // lr: writeNhwc/writeWino | // lr: writeNhwc/writeWino | ||||
| MatmulFloatNeon32Opt: | |||||
| asm_function MatmulFloatNeon32Opt | |||||
| // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf | // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf | ||||
| push {r0-r8, r10, r11, lr} | push {r0-r8, r10, r11, lr} | ||||
| add sp, sp, #48 | add sp, sp, #48 | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef ENABLE_ARM32 | #ifdef ENABLE_ARM32 | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| .global MatmulFloatNeon32Opt12x4 | .global MatmulFloatNeon32Opt12x4 | ||||
| @@ -19,7 +20,7 @@ | |||||
| // r8: stride | // r8: stride | ||||
| // lr: OutType_C8 = 0, OutType_Nhwc = 1, OutType_TileC8 = 2 | // lr: OutType_C8 = 0, OutType_Nhwc = 1, OutType_TileC8 = 2 | ||||
| MatmulFloatNeon32Opt12x4: | |||||
| asm_function MatmulFloatNeon32Opt12x4 | |||||
| // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf | // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf | ||||
| push {r0-r8, r10, r11, lr} | push {r0-r8, r10, r11, lr} | ||||
| vpush {q4-q7} | vpush {q4-q7} | ||||
| @@ -1,5 +1,6 @@ | |||||
| #ifdef __arm__ | #ifdef __arm__ | ||||
| #ifndef __aarch64__ | #ifndef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -15,7 +16,7 @@ | |||||
| // #0: col, #4: deep16, #8: input_sums, #12: weight_bias, #16: act_min, #20: act_max, #24: out_zp | // #0: col, #4: deep16, #8: input_sums, #12: weight_bias, #16: act_min, #20: act_max, #24: out_zp | ||||
| // #28: multiplier, #32: left_shift, #36: right_shift, #40: stride, #44: per_channel | // #28: multiplier, #32: left_shift, #36: right_shift, #40: stride, #44: per_channel | ||||
| MatmulInt8Neon32: | |||||
| asm_function MatmulInt8Neon32 | |||||
| push {r0-r11, lr} | push {r0-r11, lr} | ||||
| vpush {q4-q7} | vpush {q4-q7} | ||||
| add sp, sp, #116 | add sp, sp, #116 | ||||
| @@ -117,7 +118,7 @@ End3: | |||||
| bgt PerChannel | bgt PerChannel | ||||
| PerTensor: | PerTensor: | ||||
| // Substract input_sums | |||||
| // Subtract input_sums | |||||
| vld1.32 {d24, d25}, [r6]! | vld1.32 {d24, d25}, [r6]! | ||||
| vdup.32 d20, d24[0] | vdup.32 d20, d24[0] | ||||
| vdup.32 d21, d24[1] | vdup.32 d21, d24[1] | ||||
| @@ -157,7 +158,7 @@ PerTensor: | |||||
| b AddDstZP | b AddDstZP | ||||
| PerChannel: | PerChannel: | ||||
| // Substract input_sums | |||||
| // Subtract input_sums | |||||
| vld1.32 {d24, d25, d26, d27}, [r6]! | vld1.32 {d24, d25, d26, d27}, [r6]! | ||||
| vsub.s32 d28, d28, d24 | vsub.s32 d28, d28, d24 | ||||
| vsub.s32 d29, d29, d25 | vsub.s32 d29, d29, d25 | ||||
| @@ -1,5 +1,6 @@ | |||||
| #ifdef __arm__ | #ifdef __arm__ | ||||
| #ifndef __aarch64__ | #ifndef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -16,7 +17,7 @@ | |||||
| // #0: col, #4: deep16, #8: input_sums, #12: weight_bias, #16: act_min, #20: act_max, #24: out_zp | // #0: col, #4: deep16, #8: input_sums, #12: weight_bias, #16: act_min, #20: act_max, #24: out_zp | ||||
| // #28: multiplier, #32: left_shift, #36: right_shift, #40: stride, #44: per_channel, #48: filter_zp | // #28: multiplier, #32: left_shift, #36: right_shift, #40: stride, #44: per_channel, #48: filter_zp | ||||
| MatmulInt8Opt: | |||||
| asm_function MatmulInt8Opt | |||||
| push {r0-r8, r10, r11, lr} | push {r0-r8, r10, r11, lr} | ||||
| vpush {q4-q7} | vpush {q4-q7} | ||||
| add sp, sp, #112 | add sp, sp, #112 | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef ENABLE_ARM32 | #ifdef ENABLE_ARM32 | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -10,7 +11,7 @@ | |||||
| // MatrixMultiplyWinograd(float *matrix_a, float *matrix_b, float *matrix_c, int m, int k, int n, int in_channel, int c4_channel) | // MatrixMultiplyWinograd(float *matrix_a, float *matrix_b, float *matrix_c, int m, int k, int n, int in_channel, int c4_channel) | ||||
| // r0: matrix_a, r1: matrix_b, r2: matrix_c, r3: m, r4: k, r5: n, r6: in_channel, r7: c4_channel * 4 | // r0: matrix_a, r1: matrix_b, r2: matrix_c, r3: m, r4: k, r5: n, r6: in_channel, r7: c4_channel * 4 | ||||
| // #-56: matrix_a, #-52: matrix_b, #-48: matrix_c, #-44: m, #0: k, #4: n, #8: in_channel, #12: c4_channel * 4 | // #-56: matrix_a, #-52: matrix_b, #-48: matrix_c, #-44: m, #0: k, #4: n, #8: in_channel, #12: c4_channel * 4 | ||||
| MatrixMultiplyWinograd: | |||||
| asm_function MatrixMultiplyWinograd | |||||
| // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr" | // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr" | ||||
| // according to https://stackoverflow.com/questions/53625807 | // according to https://stackoverflow.com/questions/53625807 | ||||
| // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway | // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway | ||||
| @@ -1,3 +1,4 @@ | |||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -7,7 +8,7 @@ | |||||
| .type PostFuncBiasReluC4, %function | .type PostFuncBiasReluC4, %function | ||||
| #endif | #endif | ||||
| PostFuncBiasReluC4: | |||||
| asm_function PostFuncBiasReluC4 | |||||
| push {r4-r8, r10, r11, lr} | push {r4-r8, r10, r11, lr} | ||||
| add sp, sp, #32 | add sp, sp, #32 | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef ENABLE_ARM32 | #ifdef ENABLE_ARM32 | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -21,7 +22,7 @@ | |||||
| // lr oc8 loop control | // lr oc8 loop control | ||||
| // r8 hw loop control | // r8 hw loop control | ||||
| PostFuncBiasReluC8: | |||||
| asm_function PostFuncBiasReluC8 | |||||
| push {r4-r8, r10, r11, lr} | push {r4-r8, r10, r11, lr} | ||||
| add sp, sp, #32 | add sp, sp, #32 | ||||
| @@ -1,3 +1,4 @@ | |||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -19,7 +20,7 @@ | |||||
| // r6 oc_res2 | // r6 oc_res2 | ||||
| // r7 stride | // r7 stride | ||||
| PreSum4x16Int8Peroc: | |||||
| asm_function PreSum4x16Int8Peroc | |||||
| push {r4-r11, lr} | push {r4-r11, lr} | ||||
| vpush {q4-q7} | vpush {q4-q7} | ||||
| add sp, sp, #100 | add sp, sp, #100 | ||||
| @@ -1,3 +1,4 @@ | |||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -15,7 +16,7 @@ | |||||
| // r3 co16 | // r3 co16 | ||||
| // r4 filter_zp | // r4 filter_zp | ||||
| PreSum4x16Int8Pert: | |||||
| asm_function PreSum4x16Int8Pert | |||||
| push {r4-r8, r10, r11, lr} | push {r4-r8, r10, r11, lr} | ||||
| vpush {q4-q7} | vpush {q4-q7} | ||||
| add sp, sp, #96 | add sp, sp, #96 | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef ENABLE_ARM32 | #ifdef ENABLE_ARM32 | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| .global TiledC4MatmulFp32 | .global TiledC4MatmulFp32 | ||||
| @@ -6,7 +7,7 @@ | |||||
| .type TiledC4MatmulFp32, %function | .type TiledC4MatmulFp32, %function | ||||
| #endif | #endif | ||||
| TiledC4MatmulFp32: | |||||
| asm_function TiledC4MatmulFp32 | |||||
| //void TiledC4MatmulFp32(float* dst, const float* src, const float* weight, size_t cal_num, size_t ic4, size_t oc4) | //void TiledC4MatmulFp32(float* dst, const float* src, const float* weight, size_t cal_num, size_t ic4, size_t oc4) | ||||
| //x0: dst | //x0: dst | ||||
| //x1: src | //x1: src | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef ENABLE_ARM32 | #ifdef ENABLE_ARM32 | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -15,7 +16,7 @@ | |||||
| //x4: h | //x4: h | ||||
| //x5: k | //x5: k | ||||
| //x6: length | //x6: length | ||||
| WinogradTransLeft: | |||||
| asm_function WinogradTransLeft | |||||
| push {r4-r11, lr} | push {r4-r11, lr} | ||||
| ldr r4, [sp, #36] | ldr r4, [sp, #36] | ||||
| ldr r5, [sp, #40] | ldr r5, [sp, #40] | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef ENABLE_ARM32 | #ifdef ENABLE_ARM32 | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -15,7 +16,7 @@ | |||||
| //x4: h | //x4: h | ||||
| //x5: k | //x5: k | ||||
| //x6: length | //x6: length | ||||
| WinogradTransRight: | |||||
| asm_function WinogradTransRight | |||||
| push {r4-r11, lr} | push {r4-r11, lr} | ||||
| ldr r4, [sp, #36] | ldr r4, [sp, #36] | ||||
| ldr r5, [sp, #40] | ldr r5, [sp, #40] | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| .global AdderFloatNeon64 | .global AdderFloatNeon64 | ||||
| @@ -19,7 +20,7 @@ | |||||
| // x8: stride | // x8: stride | ||||
| // x9: writeMode | // x9: writeMode | ||||
| AdderFloatNeon64: | |||||
| asm_function AdderFloatNeon64 | |||||
| sub sp, sp, #144 | sub sp, sp, #144 | ||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -12,7 +13,7 @@ | |||||
| // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, x6: channel, x7: relu, x8: relu6 | // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, x6: channel, x7: relu, x8: relu6 | ||||
| ConvDw3x3Corner: | |||||
| asm_function ConvDw3x3Corner | |||||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | ||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should also be preserved | // x19 ~ x29 should also be preserved | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -12,7 +13,7 @@ | |||||
| // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, x6: channel, x7: relu, x8: relu6 | // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, x6: channel, x7: relu, x8: relu6 | ||||
| ConvDw3x3Horizontal: | |||||
| asm_function ConvDw3x3Horizontal | |||||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | ||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should also be preserved | // x19 ~ x29 should also be preserved | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -23,7 +24,7 @@ | |||||
| // w9: relu | // w9: relu | ||||
| // w10: relu6 | // w10: relu6 | ||||
| ConvDw3x3Stride1: | |||||
| asm_function ConvDw3x3Stride1 | |||||
| sub sp, sp, #128 | sub sp, sp, #128 | ||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -23,7 +24,7 @@ | |||||
| // w9: relu | // w9: relu | ||||
| // w10: relu6 | // w10: relu6 | ||||
| ConvDw3x3Stride2: | |||||
| asm_function ConvDw3x3Stride2 | |||||
| sub sp, sp, #128 | sub sp, sp, #128 | ||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -12,7 +13,7 @@ | |||||
| // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, x6: channel, x7: relu, x8: relu6 | // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, x6: channel, x7: relu, x8: relu6 | ||||
| ConvDw3x3Vertical: | |||||
| asm_function ConvDw3x3Vertical | |||||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | ||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should also be preserved | // x19 ~ x29 should also be preserved | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -31,7 +32,7 @@ | |||||
| // w15: acc_max | // w15: acc_max | ||||
| // w16: per_channel | // w16: per_channel | ||||
| ConvDw3x3Int8Neon64: | |||||
| asm_function ConvDw3x3Int8Neon64 | |||||
| sub sp, sp, #176 | sub sp, sp, #176 | ||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -14,7 +15,7 @@ | |||||
| // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, | // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, | ||||
| // x6: channel, x7: in_zp, x8: out_zp, x9: out_multiplier, x10: left_shift, x11: right_shift | // x6: channel, x7: in_zp, x8: out_zp, x9: out_multiplier, x10: left_shift, x11: right_shift | ||||
| // x12: acc_min, x13: acc_max, x14: per_channel | // x12: acc_min, x13: acc_max, x14: per_channel | ||||
| ConvDw3x3Int8Corner: | |||||
| asm_function ConvDw3x3Int8Corner | |||||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | ||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should be also preserved | // x19 ~ x29 should be also preserved | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -14,7 +15,7 @@ | |||||
| // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, | // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, | ||||
| // x6: channel, x7: in_zp, x8: out_zp, x9: out_multiplier, x10: left_shift, x11: right_shift | // x6: channel, x7: in_zp, x8: out_zp, x9: out_multiplier, x10: left_shift, x11: right_shift | ||||
| // x12: acc_min, x13: acc_max, x14: per_channel | // x12: acc_min, x13: acc_max, x14: per_channel | ||||
| ConvDw3x3Int8Horizontal: | |||||
| asm_function ConvDw3x3Int8Horizontal | |||||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | ||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should be also preserved | // x19 ~ x29 should be also preserved | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -31,7 +32,7 @@ | |||||
| // w15: acc_max | // w15: acc_max | ||||
| // w16: per_channel | // w16: per_channel | ||||
| ConvDw3x3Int8Stride2: | |||||
| asm_function ConvDw3x3Int8Stride2 | |||||
| sub sp, sp, #176 | sub sp, sp, #176 | ||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -14,7 +15,7 @@ | |||||
| // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, | // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, | ||||
| // x6: channel, x7: in_zp, x8: out_zp, x9: out_multiplier, x10: left_shift, x11: right_shift | // x6: channel, x7: in_zp, x8: out_zp, x9: out_multiplier, x10: left_shift, x11: right_shift | ||||
| // x12: acc_min, x13: acc_max, x14: per_channel | // x12: acc_min, x13: acc_max, x14: per_channel | ||||
| ConvDw3x3Int8Vertical: | |||||
| asm_function ConvDw3x3Int8Vertical | |||||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | ||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should be also preserved | // x19 ~ x29 should be also preserved | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -12,7 +13,7 @@ | |||||
| // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: in_kh_step, x7: in_kw_step, | // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: in_kh_step, x7: in_kw_step, | ||||
| // x8: kernel_w, x9: relu, x10: relu6 | // x8: kernel_w, x9: relu, x10: relu6 | ||||
| ConvDwFp32Border: | |||||
| asm_function ConvDwFp32Border | |||||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | ||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should be also preserved | // x19 ~ x29 should be also preserved | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -13,7 +14,7 @@ | |||||
| // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w, | // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w, | ||||
| // x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step | // x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step | ||||
| // x14: relu, x15: relu6 | // x14: relu, x15: relu6 | ||||
| ConvDwFp32Center: | |||||
| asm_function ConvDwFp32Center | |||||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | ||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should be also preserved | // x19 ~ x29 should be also preserved | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -11,7 +12,7 @@ | |||||
| // size_t input_stride, size_t relu, size_t relu6) | // size_t input_stride, size_t relu, size_t relu6) | ||||
| // x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6 | // x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6 | ||||
| ConvDwFp32Indirect3x3: | |||||
| asm_function ConvDwFp32Indirect3x3 | |||||
| sub sp, sp, #16 | sub sp, sp, #16 | ||||
| stp x19, x20, [sp], #16 | stp x19, x20, [sp], #16 | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -11,7 +12,7 @@ | |||||
| // size_t input_stride, size_t relu, size_t relu6) | // size_t input_stride, size_t relu, size_t relu6) | ||||
| // x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6 | // x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6 | ||||
| ConvDwFp32Indirect5x5: | |||||
| asm_function ConvDwFp32Indirect5x5 | |||||
| sub sp, sp, #160 | sub sp, sp, #160 | ||||
| stp x19, x20, [sp, #64] | stp x19, x20, [sp, #64] | ||||
| stp x21, x22, [sp, #80] | stp x21, x22, [sp, #80] | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -12,7 +13,7 @@ | |||||
| // x0: output_ptr, x1: input_ptr, x2: filter_ptr, x3: num_pixels, | // x0: output_ptr, x1: input_ptr, x2: filter_ptr, x3: num_pixels, | ||||
| // x4: input_channel, x5: input_step | // x4: input_channel, x5: input_step | ||||
| // | // | ||||
| ConvDwFp32Row: | |||||
| asm_function ConvDwFp32Row | |||||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | ||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should be also preserved | // x19 ~ x29 should be also preserved | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -16,7 +17,7 @@ | |||||
| // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w, | // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w, | ||||
| // x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step | // x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step | ||||
| // x14: in_zp, #56: out_zp, #64: out_multiplier, #72:left_shift, #80: right_shift, #88: acc_min, #96: acc_max | // x14: in_zp, #56: out_zp, #64: out_multiplier, #72:left_shift, #80: right_shift, #88: acc_min, #96: acc_max | ||||
| ConvDwInt8Center: | |||||
| asm_function ConvDwInt8Center | |||||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | ||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should be also preserved | // x19 ~ x29 should be also preserved | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -12,7 +13,7 @@ | |||||
| // x0: dst, x1: buffer, x2: num_pixels, x3: output_zp, x4: out_multiplier, | // x0: dst, x1: buffer, x2: num_pixels, x3: output_zp, x4: out_multiplier, | ||||
| // x5: left_shift, x6: right_shift, x7: acc_min, x8: acc_max | // x5: left_shift, x6: right_shift, x7: acc_min, x8: acc_max | ||||
| ConvDwInt8PostAlign4: | |||||
| asm_function ConvDwInt8PostAlign4 | |||||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | ||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should be also preserved | // x19 ~ x29 should be also preserved | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -12,7 +13,7 @@ | |||||
| // x0: dst, x1: buffer, x2: num_pixels, x3: output_zp, x4: out_multiplier, | // x0: dst, x1: buffer, x2: num_pixels, x3: output_zp, x4: out_multiplier, | ||||
| // x5: left_shift, x6: right_shift, x7: acc_min, x8: acc_max | // x5: left_shift, x6: right_shift, x7: acc_min, x8: acc_max | ||||
| ConvDwInt8PostAlign4PerChannel: | |||||
| asm_function ConvDwInt8PostAlign4PerChannel | |||||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | ||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should be also preserved | // x19 ~ x29 should be also preserved | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -12,7 +13,7 @@ | |||||
| // x0: output_ptr, x1: input_ptr, x2: weight_ptr, x3: num_pixels, | // x0: output_ptr, x1: input_ptr, x2: weight_ptr, x3: num_pixels, | ||||
| // x4: output_channel, x5: input_step, x6: input_zp | // x4: output_channel, x5: input_step, x6: input_zp | ||||
| // | // | ||||
| ConvDwInt8Row: | |||||
| asm_function ConvDwInt8Row | |||||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | ||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should be also preserved | // x19 ~ x29 should be also preserved | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -13,7 +14,7 @@ | |||||
| // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w, | // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w, | ||||
| // x8: out_h_step, x9: block_channel, x10: ic4, x11: in_sh_step, x12: in_sw_step, x13: in_kh_step, x14: in_kw_step | // x8: out_h_step, x9: block_channel, x10: ic4, x11: in_sh_step, x12: in_sw_step, x13: in_kh_step, x14: in_kw_step | ||||
| // x26: relu, x16: relu6 | // x26: relu, x16: relu6 | ||||
| ConvSwFp32Center: | |||||
| asm_function ConvSwFp32Center | |||||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | ||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should be also preserved | // x19 ~ x29 should be also preserved | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -11,7 +12,7 @@ | |||||
| // size_t in_kh_step, size_t in_kw_step, size_t kernel_w) | // size_t in_kh_step, size_t in_kw_step, size_t kernel_w) | ||||
| // x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: in_kh_step, x6: in_kw_step, x7: kernel_w | // x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: in_kh_step, x6: in_kw_step, x7: kernel_w | ||||
| DeconvDwFp32Border: | |||||
| asm_function DeconvDwFp32Border | |||||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | ||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should be also preserved | // x19 ~ x29 should be also preserved | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -12,7 +13,7 @@ | |||||
| // size_t in_kh_step, size_t in_kw_step); | // size_t in_kh_step, size_t in_kw_step); | ||||
| // x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: kernel_h, x6: kernel_w, x7: out_h_step | // x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: kernel_h, x6: kernel_w, x7: out_h_step | ||||
| // x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step | // x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step | ||||
| DeconvDwFp32Center: | |||||
| asm_function DeconvDwFp32Center | |||||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | ||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should be also preserved | // x19 ~ x29 should be also preserved | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -12,7 +13,7 @@ | |||||
| // size_t in_kh_step, size_t in_kw_step); | // size_t in_kh_step, size_t in_kw_step); | ||||
| // x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: kernel_h, x6: kernel_w, x7: out_h_step | // x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: kernel_h, x6: kernel_w, x7: out_h_step | ||||
| // x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step | // x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step | ||||
| DeconvDwInt8Center: | |||||
| asm_function DeconvDwInt8Center | |||||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | ||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should be also preserved | // x19 ~ x29 should be also preserved | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -13,7 +14,7 @@ | |||||
| // x0: dst, x1: output_buffer, x2: bias, x3: block_channel, x4: pixel_nums, x5: out_multiplier | // x0: dst, x1: output_buffer, x2: bias, x3: block_channel, x4: pixel_nums, x5: out_multiplier | ||||
| // x6: left_shift, x7: right_shift, x8: out_zp, x9: acc_min, x10: acc_max | // x6: left_shift, x7: right_shift, x8: out_zp, x9: acc_min, x10: acc_max | ||||
| DeconvDwInt8Post: | |||||
| asm_function DeconvDwInt8Post | |||||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | ||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should be also preserved | // x19 ~ x29 should be also preserved | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -9,7 +10,7 @@ | |||||
| // void IndirectGemmInt16to32_8x4(int *output, short *input, short *weight, size_t ksize, size_t ic8, size_t oc4, size_t offset); | // void IndirectGemmInt16to32_8x4(int *output, short *input, short *weight, size_t ksize, size_t ic8, size_t oc4, size_t offset); | ||||
| // x0: output, x1: input, x2: weight, x3: ksize, x4: ic8, x5: oc4, x6: offset | // x0: output, x1: input, x2: weight, x3: ksize, x4: ic8, x5: oc4, x6: offset | ||||
| IndirectGemmInt16to32_8x4: | |||||
| asm_function IndirectGemmInt16to32_8x4 | |||||
| .macro INIT_ZERO | .macro INIT_ZERO | ||||
| dup v28.4s, wzr | dup v28.4s, wzr | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| .global MatVecMulFp32 | .global MatVecMulFp32 | ||||
| @@ -15,7 +16,7 @@ | |||||
| // w5: depth | // w5: depth | ||||
| // w6: col | // w6: col | ||||
| MatVecMulFp32: | |||||
| asm_function MatVecMulFp32 | |||||
| sub sp, sp, #128 | sub sp, sp, #128 | ||||
| st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 | st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 | ||||
| st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 | st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| .global MatmulFloatNeon64 | .global MatmulFloatNeon64 | ||||
| @@ -19,7 +20,7 @@ | |||||
| // w17: stride | // w17: stride | ||||
| // w13: c8_nhwc_c4 | // w13: c8_nhwc_c4 | ||||
| MatmulFloatNeon64: | |||||
| asm_function MatmulFloatNeon64 | |||||
| sub sp, sp, #128 | sub sp, sp, #128 | ||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| .global MatmulFloatNeon64Opt | .global MatmulFloatNeon64Opt | ||||
| @@ -19,7 +20,7 @@ | |||||
| // x8: stride | // x8: stride | ||||
| // x9: writeMode | // x9: writeMode | ||||
| MatmulFloatNeon64Opt: | |||||
| asm_function MatmulFloatNeon64Opt | |||||
| sub sp, sp, #144 | sub sp, sp, #144 | ||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| .global MatmulInt8Neon64 | .global MatmulInt8Neon64 | ||||
| @@ -29,7 +30,7 @@ | |||||
| // w24: stride | // w24: stride | ||||
| // w27: filter_peroc | // w27: filter_peroc | ||||
| MatmulInt8Neon64: | |||||
| asm_function MatmulInt8Neon64 | |||||
| sub sp, sp, #208 | sub sp, sp, #208 | ||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| .global MatmulInt8Opt | .global MatmulInt8Opt | ||||
| @@ -28,7 +29,7 @@ | |||||
| // x15: filter_peroc | // x15: filter_peroc | ||||
| // x28: filter_zp | // x28: filter_zp | ||||
| MatmulInt8Opt: | |||||
| asm_function MatmulInt8Opt | |||||
| sub sp, sp, #208 | sub sp, sp, #208 | ||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| .global MatMulR4Int8Neon64 | .global MatMulR4Int8Neon64 | ||||
| @@ -18,7 +19,7 @@ | |||||
| // x6: a_sums | // x6: a_sums | ||||
| // x7: bias | // x7: bias | ||||
| MatMulR4Int8Neon64: | |||||
| asm_function MatMulR4Int8Neon64 | |||||
| sub sp, sp, #128 | sub sp, sp, #128 | ||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -9,7 +10,7 @@ | |||||
| // MatrixMultiplyWinograd(float *matrix_a, float *matrix_b, float *matrix_c, int m, int k, int n, int in_channel, int c4_channel) | // MatrixMultiplyWinograd(float *matrix_a, float *matrix_b, float *matrix_c, int m, int k, int n, int in_channel, int c4_channel) | ||||
| // x0: matrix_a, x1: matrix_b, x2: matrix_c, x3: m, x4: k, x5: n, x6: in_channel, x7: c4_channel | // x0: matrix_a, x1: matrix_b, x2: matrix_c, x3: m, x4: k, x5: n, x6: in_channel, x7: c4_channel | ||||
| MatrixMultiplyWinograd: | |||||
| asm_function MatrixMultiplyWinograd | |||||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | ||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should be also preserved | // x19 ~ x29 should be also preserved | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -23,7 +24,7 @@ | |||||
| // w13 hw loop control | // w13 hw loop control | ||||
| PostFuncBiasReluC4: | |||||
| asm_function PostFuncBiasReluC4 | |||||
| movi v26.4s, #6 | movi v26.4s, #6 | ||||
| scvtf v26.4s, v26.4s | scvtf v26.4s, v26.4s | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -21,7 +22,7 @@ | |||||
| // w10 oc8 loop control | // w10 oc8 loop control | ||||
| // w13 hw loop control | // w13 hw loop control | ||||
| PostFuncBiasReluC8: | |||||
| asm_function PostFuncBiasReluC8 | |||||
| sub sp, sp, #128 | sub sp, sp, #128 | ||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -42,7 +43,7 @@ | |||||
| // w15 oc4 loop control | // w15 oc4 loop control | ||||
| // w16 hw loop control | // w16 hw loop control | ||||
| PostFuncInt8C4Neon64: | |||||
| asm_function PostFuncInt8C4Neon64 | |||||
| ldr w8, [sp] | ldr w8, [sp] | ||||
| ldr w9, [sp, #8] | ldr w9, [sp, #8] | ||||
| @@ -1,5 +1,6 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| //.p2align 5,,15 | //.p2align 5,,15 | ||||
| @@ -20,7 +21,7 @@ | |||||
| // w6 oc_res4 | // w6 oc_res4 | ||||
| // w7 stride | // w7 stride | ||||
| PreSum4x16Int8Peroc: | |||||
| asm_function PreSum4x16Int8Peroc | |||||
| mov w8, #0 | mov w8, #0 | ||||
| RowLoop: | RowLoop: | ||||
| @@ -1,5 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| //.p2align 5,,15 | //.p2align 5,,15 | ||||
| @@ -16,7 +16,7 @@ | |||||
| // w3 co16 | // w3 co16 | ||||
| // w4 filter_zp | // w4 filter_zp | ||||
| PreSum4x16Int8Pert: | |||||
| asm_function PreSum4x16Int8Pert | |||||
| dup v17.4s, w4 | dup v17.4s, w4 | ||||
| mov w5, #0 | mov w5, #0 | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -7,7 +8,7 @@ | |||||
| .type TiledC4MatmulFp32, %function | .type TiledC4MatmulFp32, %function | ||||
| #endif | #endif | ||||
| TiledC4MatmulFp32: | |||||
| asm_function TiledC4MatmulFp32 | |||||
| //void TiledC4MatmulFp32(float* dst, const float* src, const float* weight, size_t ic4, size_t cal_num, size_t oc4) | //void TiledC4MatmulFp32(float* dst, const float* src, const float* weight, size_t ic4, size_t cal_num, size_t oc4) | ||||
| //x0: dst | //x0: dst | ||||
| //x1: src | //x1: src | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -7,7 +8,7 @@ | |||||
| .type WinogradTransLeft, %function | .type WinogradTransLeft, %function | ||||
| #endif | #endif | ||||
| WinogradTransLeft: | |||||
| asm_function WinogradTransLeft | |||||
| //void WinogradTransLeft(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length); | //void WinogradTransLeft(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length); | ||||
| //x0: S | //x0: S | ||||
| //x1: B | //x1: B | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -7,7 +8,7 @@ | |||||
| .type WinogradTransRight, %function | .type WinogradTransRight, %function | ||||
| #endif | #endif | ||||
| WinogradTransRight: | |||||
| asm_function WinogradTransRight | |||||
| //void WinogradTransRight(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length); | //void WinogradTransRight(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length); | ||||
| //x0: S | //x0: S | ||||
| //x1: B | //x1: B | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -13,7 +14,7 @@ | |||||
| // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: in_kh_step, x7: in_kw_step, | // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: in_kh_step, x7: in_kw_step, | ||||
| // x8: kernel_w, x9: relu, x10: relu6 | // x8: kernel_w, x9: relu, x10: relu6 | ||||
| ConvDwFp16Border: | |||||
| asm_function ConvDwFp16Border | |||||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | ||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should be also preserved | // x19 ~ x29 should be also preserved | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -13,7 +14,7 @@ | |||||
| // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w, | // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w, | ||||
| // x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step | // x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step | ||||
| // x14: relu, x15: relu6 | // x14: relu, x15: relu6 | ||||
| ConvDwFp16Center: | |||||
| asm_function ConvDwFp16Center | |||||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | ||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should be also preserved | // x19 ~ x29 should be also preserved | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -12,7 +13,7 @@ | |||||
| // x0: output_ptr, x1: input_ptr, x2: filter_ptr, x3: num_pixels, | // x0: output_ptr, x1: input_ptr, x2: filter_ptr, x3: num_pixels, | ||||
| // x4: input_channel, x5: input_step | // x4: input_channel, x5: input_step | ||||
| // | // | ||||
| ConvDwFp16Row: | |||||
| asm_function ConvDwFp16Row | |||||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | ||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should be also preserved | // x19 ~ x29 should be also preserved | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -11,7 +12,7 @@ | |||||
| // size_t in_kh_step, size_t in_kw_step, size_t kernel_w) | // size_t in_kh_step, size_t in_kw_step, size_t kernel_w) | ||||
| // x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: in_kh_step, x6: in_kw_step, x7: kernel_w | // x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: in_kh_step, x6: in_kw_step, x7: kernel_w | ||||
| DeconvDwFp16Border: | |||||
| asm_function DeconvDwFp16Border | |||||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | ||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should be also preserved | // x19 ~ x29 should be also preserved | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -12,7 +13,7 @@ | |||||
| // size_t in_kh_step, size_t in_kw_step); | // size_t in_kh_step, size_t in_kw_step); | ||||
| // x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: kernel_h, x6: kernel_w, x7: out_h_step | // x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: kernel_h, x6: kernel_w, x7: out_h_step | ||||
| // x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step | // x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step | ||||
| DeconvDwFp16Center: | |||||
| asm_function DeconvDwFp16Center | |||||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | ||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should be also preserved | // x19 ~ x29 should be also preserved | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -9,7 +10,7 @@ | |||||
| // void Float16ToFloat32(const float16_t *input, float *output, int number); | // void Float16ToFloat32(const float16_t *input, float *output, int number); | ||||
| // x0: input, x1: output, x2: number | // x0: input, x1: output, x2: number | ||||
| Float16ToFloat32: | |||||
| asm_function Float16ToFloat32 | |||||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | ||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should be also preserved | // x19 ~ x29 should be also preserved | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -9,7 +10,7 @@ | |||||
| // void Float32ToFloat16(const float *input, float16_t *output, int number); | // void Float32ToFloat16(const float *input, float16_t *output, int number); | ||||
| // x0: input, x1: output, x2: number | // x0: input, x1: output, x2: number | ||||
| Float32ToFloat16: | |||||
| asm_function Float32ToFloat16 | |||||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | ||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should be also preserved | // x19 ~ x29 should be also preserved | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -12,7 +13,7 @@ | |||||
| // x0: output, x1: input, x2: weight, x3: bias, x4: step, x5: ic4, x6: oc8, x7: offset, | // x0: output, x1: input, x2: weight, x3: bias, x4: step, x5: ic4, x6: oc8, x7: offset, | ||||
| // x8:mode, x9: writeC4, x10:relu, x11: relu6 | // x8:mode, x9: writeC4, x10:relu, x11: relu6 | ||||
| // compute 8 channel for 16 outputs | // compute 8 channel for 16 outputs | ||||
| IndirectGemmFp16_16x8: | |||||
| asm_function IndirectGemmFp16_16x8 | |||||
| .macro INIT_BIAS | .macro INIT_BIAS | ||||
| dup v16.4s, wzr | dup v16.4s, wzr | ||||
| @@ -41,7 +42,7 @@ IndirectGemmFp16_16x8: | |||||
| // x19 ~ x29 should be also preserved | // x19 ~ x29 should be also preserved | ||||
| // whereas our coding style do not permit such amount of parameters | // whereas our coding style do not permit such amount of parameters | ||||
| sub sp, sp, #128 | sub sp, sp, #128 | ||||
| // performance between storing 4 registers at the same time and seperatly storing them on in-order cores | |||||
| // performance between storing 4 registers at the same time and separately storing them on in-order cores | |||||
| // is not tested yet | // is not tested yet | ||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| @@ -86,7 +87,7 @@ IndirectGemmStart: | |||||
| fmla v19.8h, v9.8h, v1.h[5] | fmla v19.8h, v9.8h, v1.h[5] | ||||
| // load input for output 9-16 | // load input for output 9-16 | ||||
| // input cache should be refreshed after loading | // input cache should be refreshed after loading | ||||
| // ATTENTION: advance is prefered, but advancing too much may lead to invalid prefetching | |||||
| // ATTENTION: advance is preferred, but advancing too much may lead to invalid prefetching | |||||
| ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x12], #64 | ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x12], #64 | ||||
| // last 2 steps for output 1 and 3 | // last 2 steps for output 1 and 3 | ||||
| fmla v16.8h, v10.8h, v0.h[2] | fmla v16.8h, v10.8h, v0.h[2] | ||||
| @@ -295,7 +296,7 @@ IndirectGemmStart: | |||||
| cmp x6, #7 | cmp x6, #7 | ||||
| beq Write7 | beq Write7 | ||||
| b Write8 | b Write8 | ||||
| // prefetching is not prefered while writing results in spite of cache missings | |||||
| // prefetching is not preferred while writing results in spite of cache misses | |||||
| // you could try prfm pstl2strm | // you could try prfm pstl2strm | ||||
| // there are almost no benefits observed though | // there are almost no benefits observed though | ||||
| Write1: | Write1: | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| .global MatVecMulFp16Neon64 | .global MatVecMulFp16Neon64 | ||||
| @@ -15,7 +16,7 @@ | |||||
| // w5: depth | // w5: depth | ||||
| // w6: col | // w6: col | ||||
| MatVecMulFp16Neon64: | |||||
| asm_function MatVecMulFp16Neon64 | |||||
| sub sp, sp, #128 | sub sp, sp, #128 | ||||
| st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 | st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 | ||||
| st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 | st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| .global MatmulFp16Neon64 | .global MatmulFp16Neon64 | ||||
| @@ -19,7 +20,7 @@ | |||||
| // w17: stride | // w17: stride | ||||
| // w13: writeC8 | // w13: writeC8 | ||||
| MatmulFp16Neon64: | |||||
| asm_function MatmulFp16Neon64 | |||||
| sub sp, sp, #128 | sub sp, sp, #128 | ||||
| st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 | st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 | ||||
| st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 | st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| .global MatmulFp16Neon64Opt | .global MatmulFp16Neon64Opt | ||||
| @@ -19,7 +20,7 @@ | |||||
| // x8: stride | // x8: stride | ||||
| // x9: writeMode | // x9: writeMode | ||||
| MatmulFp16Neon64Opt: | |||||
| asm_function MatmulFp16Neon64Opt | |||||
| sub sp, sp, #80 | sub sp, sp, #80 | ||||
| st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 | st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 | ||||
| stp x19, x20, [sp], #16 | stp x19, x20, [sp], #16 | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -9,7 +10,7 @@ | |||||
| // MatrixMultiplyWinogradFp16(float16_t *matrix_a, float16_t *matrix_b, float16_t *matrix_c, int m, int k, int n, int in_channel) | // MatrixMultiplyWinogradFp16(float16_t *matrix_a, float16_t *matrix_b, float16_t *matrix_c, int m, int k, int n, int in_channel) | ||||
| // x0: matrix_a, x1: matrix_b, x2: matrix_c, x3: m, x4: k, x5: n, x6: in_channel | // x0: matrix_a, x1: matrix_b, x2: matrix_c, x3: m, x4: k, x5: n, x6: in_channel | ||||
| MatrixMultiplyWinogradFp16: | |||||
| asm_function MatrixMultiplyWinogradFp16 | |||||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | ||||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | ||||
| // x19 ~ x29 should be also preserved | // x19 ~ x29 should be also preserved | ||||
| @@ -1,3 +1,4 @@ | |||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -13,7 +14,7 @@ | |||||
| // w3 oc4div w4 oc4mod w5 plane_size | // w3 oc4div w4 oc4mod w5 plane_size | ||||
| // x6 plane_stride x7 relu_type | // x6 plane_stride x7 relu_type | ||||
| PostFuncBiasReluC4Fp16: | |||||
| asm_function PostFuncBiasReluC4Fp16 | |||||
| movi v26.4h, #6 | movi v26.4h, #6 | ||||
| scvtf v26.4h, v26.4h | scvtf v26.4h, v26.4h | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -21,7 +22,7 @@ | |||||
| // w10 oc8 loop control | // w10 oc8 loop control | ||||
| // w13 hw loop control | // w13 hw loop control | ||||
| PostFuncBiasReluC8Fp16: | |||||
| asm_function PostFuncBiasReluC8Fp16 | |||||
| movi v26.8h, #0x46, lsl #8 | movi v26.8h, #0x46, lsl #8 | ||||
| dup v27.8h, wzr | dup v27.8h, wzr | ||||
| mov w10, #0 | mov w10, #0 | ||||
| @@ -1,3 +1,4 @@ | |||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -6,7 +7,7 @@ | |||||
| .type TiledC4MatmulFp16, %function | .type TiledC4MatmulFp16, %function | ||||
| #endif | #endif | ||||
| TiledC4MatmulFp16: | |||||
| asm_function TiledC4MatmulFp16 | |||||
| sub sp, sp, #128 | sub sp, sp, #128 | ||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| @@ -1,3 +1,4 @@ | |||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -6,7 +7,7 @@ | |||||
| .type WinogradTransLeftFp16, %function | .type WinogradTransLeftFp16, %function | ||||
| #endif | #endif | ||||
| WinogradTransLeftFp16: | |||||
| asm_function WinogradTransLeftFp16 | |||||
| sub sp, sp, #32 | sub sp, sp, #32 | ||||
| stp x19, x20, [sp], #32 | stp x19, x20, [sp], #32 | ||||
| @@ -1,3 +1,4 @@ | |||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| @@ -6,7 +7,7 @@ | |||||
| .type WinogradTransRightFp16, %function | .type WinogradTransRightFp16, %function | ||||
| #endif | #endif | ||||
| WinogradTransRightFp16: | |||||
| asm_function WinogradTransRightFp16 | |||||
| mov x8, #8 // 4 * sizeof(float16) | mov x8, #8 // 4 * sizeof(float16) | ||||
| mul x8, x6, x8 | mul x8, x6, x8 | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| .global MatmulInt8DpNeon64 | .global MatmulInt8DpNeon64 | ||||
| @@ -29,7 +30,7 @@ | |||||
| // w24: stride | // w24: stride | ||||
| // w27: filter_peroc | // w27: filter_peroc | ||||
| MatmulInt8DpNeon64: | |||||
| asm_function MatmulInt8DpNeon64 | |||||
| sub sp, sp, #208 | sub sp, sp, #208 | ||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| .global MatmulInt8DpOpt | .global MatmulInt8DpOpt | ||||
| @@ -28,7 +29,7 @@ | |||||
| // x15: filter_peroc | // x15: filter_peroc | ||||
| // x28: filter_zp | // x28: filter_zp | ||||
| MatmulInt8DpOpt: | |||||
| asm_function MatmulInt8DpOpt | |||||
| sub sp, sp, #208 | sub sp, sp, #208 | ||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| @@ -1,4 +1,5 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| #include "nnacl/assembly_global.h" | |||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| .global MatMulOptR4Int8Neon64 | .global MatMulOptR4Int8Neon64 | ||||
| @@ -18,7 +19,7 @@ | |||||
| // x6: a_sums | // x6: a_sums | ||||
| // x7: bias | // x7: bias | ||||
| MatMulOptR4Int8Neon64: | |||||
| asm_function MatMulOptR4Int8Neon64 | |||||
| sub sp, sp, #128 | sub sp, sp, #128 | ||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | ||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | ||||
| @@ -0,0 +1,32 @@ | |||||
| /** | |||||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_LITE_NNACL_ASSEMBLY_GLOBAL_H | |||||
| #define MINDSPORE_LITE_NNACL_ASSEMBLY_GLOBAL_H | |||||
| // clang-format off | |||||
| // This block is assembler text, not C: every directive and every label must stay on its own line, | |||||
| // so automatic C formatting is switched off around it. | |||||
| // | |||||
| // asm_function fname | |||||
| //   Exports `fname` as a global symbol and places its entry label at the point of use. | |||||
| //   - __APPLE__ defined: the symbol and the label are emitted as `_fname` (leading underscore). | |||||
| //   - otherwise: the symbol and the label are emitted as `fname`; when __ELF__ is also defined the | |||||
| //     symbol is additionally marked .hidden and typed as %function. | |||||
| //   The macro emits no instructions; the function body follows the macro invocation. | |||||
| .macro asm_function fname | |||||
| #ifdef __APPLE__ | |||||
| .globl _\fname | |||||
| _\fname: | |||||
| #else | |||||
| .global \fname | |||||
| #ifdef __ELF__ | |||||
| .hidden \fname | |||||
| .type \fname, %function | |||||
| #endif | |||||
| \fname: | |||||
| #endif | |||||
| .endm | |||||
| // clang-format on | |||||
| #endif // MINDSPORE_LITE_NNACL_ASSEMBLY_GLOBAL_H | |||||