// Huawei_Technology / mindspore

 
			
			   
				 
					
						
						
							
							#ifdef __aarch64__

// -----------------------------------------------------------------------------
// void ConvDwFp32Row(float *output_ptr, const float *input_ptr, const float *filter_ptr,
//                    size_t num_pixels, size_t input_channel, size_t input_step)
//
// Accumulates an element-wise product into the output, one pixel at a time:
//
//   for (p = 0; p < num_pixels; ++p)
//     for (c = 0; c < input_channel; ++c)
//       output_ptr[p * input_channel + c] += input_ptr[p * input_step + c] * filter_ptr[c];
//
// Arguments (AAPCS64):
//   x0  output_ptr     read-modify-write, advanced continuously across pixels
//   x1  input_ptr      advanced by input_step floats after every pixel
//   x2  filter_ptr     re-read from its start for every pixel
//   x3  num_pixels     a value of 0 returns immediately
//   x4  input_channel  number of floats processed per pixel
//   x5  input_step     distance between consecutive input pixels, in floats
//
// Working registers:
//   x0  output load cursor        x9  output store cursor
//   x6  input cursor              x7  filter cursor
//   x8  channels still to process for the current pixel (compared as signed)
//   v0-v7, v16-v19  data
//
// Only caller-saved registers (x0-x9, v0-v7, v16-v19) and the flags are
// written, so nothing has to be saved or restored and the stack is not used.
//
// The 16-wide and 4-wide paths use the fused fmla; the scalar tail uses a
// separate fmul followed by fadd.
// -----------------------------------------------------------------------------
.text
.align 5
.global ConvDwFp32Row
#ifndef __APPLE__
.type ConvDwFp32Row, %function
#endif

ConvDwFp32Row:
    cbz     x3, LRowDone                    // no pixels requested

    lsl     x5, x5, #2                      // input_step: floats -> bytes
    mov     x9, x0                          // stores trail the loads on the same buffer

LRowPixel:
    mov     x6, x1                          // input cursor for this pixel
    mov     x7, x2                          // filter always restarts at channel 0
    mov     x8, x4                          // channels left for this pixel

    // ---- 16 channels per step, software pipelined ---------------------------
    // The first 8 lanes of a 16-channel group are loaded one step ahead of
    // the arithmetic that consumes them.
    cmp     x8, #16
    b.lt    LRowDepth4Check
    sub     x8, x8, #16                     // account for the group being preloaded

    ld1     {v0.4s, v1.4s}, [x6], #32       // input   [0..7]
    ld1     {v2.4s, v3.4s}, [x7], #32       // filter  [0..7]
    ld1     {v16.4s, v17.4s}, [x0], #32     // output  [0..7]

    cmp     x8, #16
    b.lt    LRowDepth16Drain                // only one group: skip the steady state

LRowDepth16:
    // Finish lanes 0..7 of the current group.
    fmla    v16.4s, v0.4s, v2.4s
    fmla    v17.4s, v1.4s, v3.4s
    st1     {v16.4s, v17.4s}, [x9], #32

    // Lanes 8..15 of the current group.
    ld1     {v4.4s, v5.4s}, [x6], #32
    ld1     {v6.4s, v7.4s}, [x7], #32
    ld1     {v18.4s, v19.4s}, [x0], #32
    fmla    v18.4s, v4.4s, v6.4s
    fmla    v19.4s, v5.4s, v7.4s
    st1     {v18.4s, v19.4s}, [x9], #32

    // Preload lanes 0..7 of the next group (x8 >= 16 guarantees it exists).
    ld1     {v0.4s, v1.4s}, [x6], #32
    ld1     {v2.4s, v3.4s}, [x7], #32
    ld1     {v16.4s, v17.4s}, [x0], #32

    sub     x8, x8, #16
    cmp     x8, #16
    b.ge    LRowDepth16

LRowDepth16Drain:
    // Complete the group whose first half is already sitting in registers.
    fmla    v16.4s, v0.4s, v2.4s
    fmla    v17.4s, v1.4s, v3.4s
    st1     {v16.4s, v17.4s}, [x9], #32

    ld1     {v4.4s, v5.4s}, [x6], #32
    ld1     {v6.4s, v7.4s}, [x7], #32
    ld1     {v18.4s, v19.4s}, [x0], #32
    fmla    v18.4s, v4.4s, v6.4s
    fmla    v19.4s, v5.4s, v7.4s
    st1     {v18.4s, v19.4s}, [x9], #32

    // ---- 4 channels per step ------------------------------------------------
LRowDepth4Check:
    cmp     x8, #4
    b.lt    LRowTailCheck

LRowDepth4:
    ld1     {v0.4s}, [x6], #16
    ld1     {v2.4s}, [x7], #16
    ld1     {v16.4s}, [x0], #16
    fmla    v16.4s, v0.4s, v2.4s
    st1     {v16.4s}, [x9], #16
    sub     x8, x8, #4
    cmp     x8, #4
    b.ge    LRowDepth4

    // ---- remaining channels, one float at a time ----------------------------
LRowTailCheck:
    cbz     x8, LRowPixelNext

LRowTail:
    ldr     s0, [x6], #4
    ldr     s1, [x7], #4
    ldr     s2, [x0], #4
    fmul    s0, s0, s1                      // product rounded on its own ...
    fadd    s2, s2, s0                      // ... then added (not fused)
    str     s2, [x9], #4
    subs    x8, x8, #1
    b.ne    LRowTail

LRowPixelNext:
    add     x1, x1, x5                      // next input pixel; add leaves the flags alone
    subs    x3, x3, #1
    b.ne    LRowPixel

LRowDone:
    ret

#endif