/* zzy34407230/mindspore2022 */

 
			
							#ifdef __aarch64__

    .text
    .align 5
    //.p2align 5,,15
    .global PostFuncBiasReluC4
#ifndef __APPLE__
    .type PostFuncBiasReluC4, %function
#endif

// void PostFuncBiasReluC4(float *dst, const float *src, const float *bias, size_t oc4div, size_t oc4mod,
//                         size_t plane_size, size_t plane_stride, size_t relu_type);
//
// Adds a per-channel bias to src, optionally clamps the sum, and stores the result to dst.
// src is consumed strictly forward, one 4-float vector per row, one channel block after another.
// Each vector is stored at dst + channel * 4 bytes, and the write cursor then advances by
// (oc4div + oc4mod) * 4 bytes per row.
//
// Arguments (AAPCS64, all scalar arguments are full 64-bit size_t values):
//   x0  dst           output base pointer (advanced while the leftover-channel block is written)
//   x1  src           input pointer, advanced by 16 bytes per row
//   x2  bias          one 4-float vector is loaded per channel block, the leftover block included
//   x3  oc4div        channels handled as full blocks of 4; the block loop exits on equality only,
//                     so this is expected to be a multiple of 4
//   x4  oc4mod        leftover channels: 0 = none, 2 / 3 = store 2 / 3 lanes, any other value = 1 lane
//   x5  plane_size    number of rows processed per channel block
//   x6  plane_stride  added verbatim (i.e. in bytes) to the src pointer after each full block
//   x7  relu_type     3 = fmin with 6.0 then fmax with 0.0, 1 = fmax with 0.0, anything else = no clamp
//
// NOTE(review): the leftover block still loads a full 4-float bias vector and full 4-float src
// vectors, so both buffers must be readable for 4 lanes there -- confirm callers pad them.
//
// Working registers (all caller-saved, no stack is used):
//   x10  channel index of the current block (0, 4, 8, ...)
//   x12  dst row stride in bytes = (oc4div + oc4mod) * 4
//   x13  rows still to process in the current block
//   x15  dst write cursor
//   v0 ~ v7  data      v16  bias      v26  6.0 (relu6 upper bound)      v27  0.0 (relu lower bound)


PostFuncBiasReluC4:

  fmov v26.4s, #6.0
  movi v27.4s, #0

  add x12, x3, x4
  lsl x12, x12, #2                  // x12 = (oc4div + oc4mod) * sizeof(float)

  mov x10, xzr

// ---- full blocks of 4 channels -------------------------------------------------------------
Loop_C4:
  cmp x10, x3
  b.eq Loop_C1                      // every full block has been written
  add x15, x0, x10, lsl #2          // x15 = &dst[x10]
  add x10, x10, #4
  mov x13, x5
  ld1 {v16.4s}, [x2], #16           // bias for this block

// 8 rows per iteration while at least 8 rows remain.
Loop_8x4:
  cmp x13, #8
  b.lo Loop_4x4
  sub x13, x13, #8
  ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
  ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #64

  fadd v0.4s, v0.4s, v16.4s
  fadd v1.4s, v1.4s, v16.4s
  fadd v2.4s, v2.4s, v16.4s
  fadd v3.4s, v3.4s, v16.4s
  fadd v4.4s, v4.4s, v16.4s
  fadd v5.4s, v5.4s, v16.4s
  fadd v6.4s, v6.4s, v16.4s
  fadd v7.4s, v7.4s, v16.4s

  cmp x7, #3
  b.eq Relu6_8x4
  cmp x7, #1
  b.eq Relu_8x4
  b Write_8x4
Relu6_8x4:                          // upper clamp, then falls through into the lower clamp
  fmin v0.4s, v0.4s, v26.4s
  fmin v1.4s, v1.4s, v26.4s
  fmin v2.4s, v2.4s, v26.4s
  fmin v3.4s, v3.4s, v26.4s
  fmin v4.4s, v4.4s, v26.4s
  fmin v5.4s, v5.4s, v26.4s
  fmin v6.4s, v6.4s, v26.4s
  fmin v7.4s, v7.4s, v26.4s
Relu_8x4:
  fmax v0.4s, v0.4s, v27.4s
  fmax v1.4s, v1.4s, v27.4s
  fmax v2.4s, v2.4s, v27.4s
  fmax v3.4s, v3.4s, v27.4s
  fmax v4.4s, v4.4s, v27.4s
  fmax v5.4s, v5.4s, v27.4s
  fmax v6.4s, v6.4s, v27.4s
  fmax v7.4s, v7.4s, v27.4s
Write_8x4:
  st1 {v0.4s}, [x15], x12
  st1 {v1.4s}, [x15], x12
  st1 {v2.4s}, [x15], x12
  st1 {v3.4s}, [x15], x12
  st1 {v4.4s}, [x15], x12
  st1 {v5.4s}, [x15], x12
  st1 {v6.4s}, [x15], x12
  st1 {v7.4s}, [x15], x12
  b Loop_8x4

// At most one group of 4 rows: fewer than 8 rows remain here, and this path does not loop.
Loop_4x4:
  cmp x13, #4
  b.lo Loop_1x4
  sub x13, x13, #4
  ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
  fadd v0.4s, v0.4s, v16.4s
  fadd v1.4s, v1.4s, v16.4s
  fadd v2.4s, v2.4s, v16.4s
  fadd v3.4s, v3.4s, v16.4s
  cmp x7, #3
  b.eq Relu6_4x4
  cmp x7, #1
  b.eq Relu_4x4
  b Write_4x4
Relu6_4x4:
  fmin v0.4s, v0.4s, v26.4s
  fmin v1.4s, v1.4s, v26.4s
  fmin v2.4s, v2.4s, v26.4s
  fmin v3.4s, v3.4s, v26.4s
Relu_4x4:
  fmax v0.4s, v0.4s, v27.4s
  fmax v1.4s, v1.4s, v27.4s
  fmax v2.4s, v2.4s, v27.4s
  fmax v3.4s, v3.4s, v27.4s
Write_4x4:
  st1 {v0.4s}, [x15], x12
  st1 {v1.4s}, [x15], x12
  st1 {v2.4s}, [x15], x12
  st1 {v3.4s}, [x15], x12
  // falls through: finish the remaining rows one at a time

// Remaining rows, one per iteration. The activation is selected once, outside the row loop.
Loop_1x4:
  cmp x7, #3
  b.eq Relu6_1x4
  cmp x7, #1
  b.eq Relu_1x4
  b Write_1x4
Relu6_1x4:
  cbz x13, HW_Add
  sub x13, x13, #1
  ld1 {v0.4s}, [x1], #16
  fadd v0.4s, v0.4s, v16.4s
  fmin v0.4s, v0.4s, v26.4s
  fmax v0.4s, v0.4s, v27.4s
  st1 {v0.4s}, [x15], x12
  b Relu6_1x4
Relu_1x4:
  cbz x13, HW_Add
  sub x13, x13, #1
  ld1 {v0.4s}, [x1], #16
  fadd v0.4s, v0.4s, v16.4s
  fmax v0.4s, v0.4s, v27.4s
  st1 {v0.4s}, [x15], x12
  b Relu_1x4
Write_1x4:
  cbz x13, HW_Add
  sub x13, x13, #1
  ld1 {v0.4s}, [x1], #16
  fadd v0.4s, v0.4s, v16.4s
  st1 {v0.4s}, [x15], x12
  b Write_1x4

HW_Add:
  add x1, x1, x6                    // skip plane_stride bytes of src before the next block
  b Loop_C4

// ---- leftover block: oc4mod channels -------------------------------------------------------
// src is still read 4 floats per row; only the first 1, 2 or 3 lanes are stored.
Loop_C1:
  cbz x4, End
  mov x13, x5
  ld1 {v16.4s}, [x2], #16
  add x0, x0, x10, lsl #2           // x0 = &dst[oc4div]; x10 == oc4div once the block loop exits

  cmp x4, #2
  b.eq Loop_C1_2
  cmp x4, #3
  b.eq Loop_C1_3
  // falls through for oc4mod == 1 and for any value above 3

// One lane per row.
Loop_C1_1:
  cmp x7, #3
  b.eq Loop_C1_1_Relu6
  cmp x7, #1
  b.eq Loop_C1_1_Relu
  b Loop_C1_1_Write
Loop_C1_1_Relu6:
  cbz x13, End
  sub x13, x13, #1
  ld1 {v0.4s}, [x1], #16
  fadd v0.4s, v0.4s, v16.4s
  fmin v0.4s, v0.4s, v26.4s
  fmax v0.4s, v0.4s, v27.4s
  st1 {v0.s}[0], [x0], x12
  b Loop_C1_1_Relu6
Loop_C1_1_Relu:
  cbz x13, End
  sub x13, x13, #1
  ld1 {v0.4s}, [x1], #16
  fadd v0.4s, v0.4s, v16.4s
  fmax v0.4s, v0.4s, v27.4s
  st1 {v0.s}[0], [x0], x12
  b Loop_C1_1_Relu
Loop_C1_1_Write:
  cbz x13, End
  sub x13, x13, #1
  ld1 {v0.4s}, [x1], #16
  fadd v0.4s, v0.4s, v16.4s
  st1 {v0.s}[0], [x0], x12
  b Loop_C1_1_Write

// Two lanes per row: st1 {v0.2s} stores lanes 0 and 1 and post-increments by the row stride.
Loop_C1_2:
  cmp x7, #3
  b.eq Loop_C1_2_Relu6
  cmp x7, #1
  b.eq Loop_C1_2_Relu
  b Loop_C1_2_Write
Loop_C1_2_Relu6:
  cbz x13, End
  sub x13, x13, #1
  ld1 {v0.4s}, [x1], #16
  fadd v0.4s, v0.4s, v16.4s
  fmin v0.4s, v0.4s, v26.4s
  fmax v0.4s, v0.4s, v27.4s
  st1 {v0.2s}, [x0], x12
  b Loop_C1_2_Relu6
Loop_C1_2_Relu:
  cbz x13, End
  sub x13, x13, #1
  ld1 {v0.4s}, [x1], #16
  fadd v0.4s, v0.4s, v16.4s
  fmax v0.4s, v0.4s, v27.4s
  st1 {v0.2s}, [x0], x12
  b Loop_C1_2_Relu
Loop_C1_2_Write:
  cbz x13, End
  sub x13, x13, #1
  ld1 {v0.4s}, [x1], #16
  fadd v0.4s, v0.4s, v16.4s
  st1 {v0.2s}, [x0], x12
  b Loop_C1_2_Write

// Three lanes per row: lanes 0-1 go through x0, lane 2 through x15 = x0 + 8.
// Both cursors advance by the row stride after every row.
Loop_C1_3:
  add x15, x0, #8
  cmp x7, #3
  b.eq Loop_C1_3_Relu6
  cmp x7, #1
  b.eq Loop_C1_3_Relu
  b Loop_C1_3_Write
Loop_C1_3_Relu6:
  cbz x13, End
  sub x13, x13, #1
  ld1 {v0.4s}, [x1], #16
  fadd v0.4s, v0.4s, v16.4s
  fmin v0.4s, v0.4s, v26.4s
  fmax v0.4s, v0.4s, v27.4s
  st1 {v0.2s}, [x0], x12
  st1 {v0.s}[2], [x15], x12
  b Loop_C1_3_Relu6
Loop_C1_3_Relu:
  cbz x13, End
  sub x13, x13, #1
  ld1 {v0.4s}, [x1], #16
  fadd v0.4s, v0.4s, v16.4s
  fmax v0.4s, v0.4s, v27.4s
  st1 {v0.2s}, [x0], x12
  st1 {v0.s}[2], [x15], x12
  b Loop_C1_3_Relu
Loop_C1_3_Write:
  cbz x13, End
  sub x13, x13, #1
  ld1 {v0.4s}, [x1], #16
  fadd v0.4s, v0.4s, v16.4s
  st1 {v0.2s}, [x0], x12
  st1 {v0.s}[2], [x15], x12
  b Loop_C1_3_Write

End:
  ret
#endif