zzy34407230
/
mindspore2022

 
			
			   
				 
					
						
						
							
							#ifdef __aarch64__

.text
.align 5
.global DeconvDwFp16Center
#ifndef __APPLE__
.type DeconvDwFp16Center, %function
#endif

// void DeconvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width,
//                      size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,
//                      size_t in_kh_step, size_t in_kw_step);
// x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: kernel_h, x6: kernel_w, x7: out_h_step
// x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step
DeconvDwFp16Center:
    // For every (h, w) position: load one vector of 8 fp16 values from src, then for
    // every (kh, kw) kernel tap do   dst_tap[0..7] += src_vec[0..7] * weight_tap[0..7].
    // The weight pointer restarts at `weight` for every (h, w) position and advances
    // 16 bytes per tap. All *_step values and block_channel are added directly to
    // addresses, i.e. they are consumed as byte offsets.
    //
    // Arguments (AAPCS64):
    //   x0  dst            x1  src            x2  weight         x3  height
    //   x4  width          x5  kernel_h       x6  kernel_w       x7  out_h_step
    //   stack: block_channel, in_sh_step, in_sw_step, in_kh_step, in_kw_step
    //          (loaded into x8, x9, x10, x11, x12)
    //
    // Scratch register roles:
    //   x13  remaining kernel_w taps      x14  dst pointer for current kernel row
    //   x15  dst pointer for current w    x16  src pointer for current w
    //   x17  remaining width              x19  running weight pointer
    //   x20  remaining kernel_h rows      x21  dst pointer for current tap
    //   v0   dst accumulator   v1  src vector   v2  weight vector
    //
    // If any of height, width, kernel_h or kernel_w is zero, nothing is read or written.

    // x19 ~ x28 are callee-saved, so the ones used as scratch are spilled first.
    // The frame is allocated with a pre-decrement so the spilled values stay at or
    // above sp; memory below sp is not guaranteed to survive (e.g. a signal handler
    // may reuse it). 32 bytes keeps sp 16-byte aligned. x22 only fills the pair.
    stp x19, x20, [sp, #-32]!
    stp x21, x22, [sp, #16]

    // Stack-passed arguments sit directly above the 32-byte frame.
    ldr x8, [sp, #32]       // block_channel
    ldr x9, [sp, #40]       // in_sh_step
    ldr x10, [sp, #48]      // in_sw_step
    ldr x11, [sp, #56]      // in_kh_step
    ldr x12, [sp, #64]      // in_kw_step

    // The loops below are bottom-tested (subs/bne); a zero trip count would wrap
    // around instead of skipping the loop, so bail out before entering them.
    cbz x3, DeconvDwFp16CenterEnd
    cbz x4, DeconvDwFp16CenterEnd
    cbz x5, DeconvDwFp16CenterEnd
    cbz x6, DeconvDwFp16CenterEnd

    LoopH:
        mov x15, x0                     // dst cursor for this row
        mov x16, x1                     // src cursor for this row
        mov x17, x4                     // width countdown
        LoopW:
            // x14 is used here rather than x18: x18 is the platform register and
            // must not be touched on targets that reserve it (e.g. Apple).
            mov x14, x15                // dst cursor for kernel row 0
            mov x19, x2                 // rewind weights for this position
            mov x20, x5                 // kernel_h countdown
            ld1 {v1.8h}, [x16], x8      // src vector; src cursor += block_channel
            LoopKh:
                mov x21, x14            // dst cursor for kernel column 0
                mov x13, x6             // kernel_w countdown
                LoopKw:
                    ld1 {v0.8h}, [x21]              // current dst values
                    ld1 {v2.8h}, [x19], #16         // weights for this tap
                    fmla v0.8h, v1.8h, v2.8h        // dst += src * weight
                    st1 {v0.8h}, [x21], x12         // write back; dst += in_kw_step
                    subs x13, x13, #1
                    bne LoopKw
                add x14, x14, x11       // next kernel row: dst += in_kh_step
                subs x20, x20, #1
                bne LoopKh
            add x15, x15, x10           // next column: dst += in_sw_step
            subs x17, x17, #1
            bne LoopW
        add x0, x0, x9                  // next row: dst += in_sh_step
        add x1, x1, x7                  // next row: src += out_h_step
        subs x3, x3, #1
        bne LoopH

DeconvDwFp16CenterEnd:
    // Restore callee-saved registers and release the frame.
    ldp x21, x22, [sp, #16]
    ldp x19, x20, [sp], #32
    ret
#endif