Browse Source

!16431 arm64 matmul loop

From: @zhaozhenlong
Reviewed-by: 
Signed-off-by:
tags/v1.3.0
mindspore-ci-bot Gitee 4 years ago
parent
commit
7924014114
8 changed files with 1818 additions and 7 deletions
  1. +774
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow12.S
  2. +399
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow4.S
  3. +586
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow8.S
  4. +40
    -7
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_common_fp32.c
  5. +5
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_common_fp32.h
  6. +8
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/matmul_fp32.h
  7. +3
    -0
      mindspore/lite/micro/coder/opcoders/nnacl/fp32/convolution_fp32_coder.cc
  8. +3
    -0
      mindspore/lite/micro/coder/opcoders/nnacl/fp32/matmul_fp32_base_coder.cc

+ 774
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow12.S View File

@@ -0,0 +1,774 @@
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5

// void MatmulFloatNeon64OptRow12(const float *a, const float *b, float *c, const float *bias, int act_type,
//                                int depth, int row, int col, size_t stride, size_t writeMode)
//
// fp32 matmul micro-kernel working on a 12-row x 8-column output tile.
// Per depth step it consumes 12 lhs floats (48 bytes) and 8 rhs floats (32 bytes).
// Accumulators: v8..v31, two registers per output row (even = cols 0-3, odd = cols 4-7).
// act_type: 3 -> clamp to [0, 6], 1 -> clamp to [0, +inf), anything else -> no activation.
// writeMode: 0 -> WriteC8, 2 -> WriteWino, anything else -> row-major stores using `stride`.
// NOTE(review): the int arguments are used through their 64-bit x-register views; this assumes the
// caller extends them to 64 bits - confirm against the C prototype.
//
// x0: a
// x1: b
// x2: c
// x3: bias
// x4: act_type
// x5: depth
// x6: row
// x7: col
// x8: stride      (stack argument)
// x9: writeMode   (stack argument)
// Scratch: x10 lhs cursor, x11 dst cursor, x12 bias cursor, x13 remaining cols, x14 rhs cursor,
//          x15/x16 write steps, x17 lhs block stride, x19/x20 temporaries, x21 multiplier temp.

asm_function MatmulFloatNeon64OptRow12
// Keep sp lowered for the whole function and store through a scratch pointer, so the saved
// callee-saved registers never sit below sp (AAPCS64 has no red zone).
sub sp, sp, #160
mov x10, sp
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x10], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x10], #64
stp x19, x20, [x10], #16
stp x21, x22, [x10], #16

// x10 now equals the caller's sp, where the 9th and 10th arguments were passed.
ldr x8, [x10]
ldr x9, [x10, #8]

mov x21, #48 // sizeof(float) * 12
mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth
cbnz x9, NoC8Steps
mov x11, x2
mov x21, #32
mul x16, x6, x21 // row * 8 * sizeof(float)
NoC8Steps:
cmp x9, #2
bne NoWinoSteps
mov x21, #4
mul x15, x7, x8
mul x15, x15, x21 // kernel_size * col *sizeof(float)
mov x21, #32
mul x16, x8, x21 // kernel_size * 8 * sizeof(float)
NoWinoSteps:
mov x21, #4
mul x8, x8, x21 // stride in bytes

LoopRow:
mov x14, x1 // reload rhs ptr
mov x13, x7 // reload rhs col
mov x12, x3 // reload bias

LoopCol:
cbz x9, NoReloadDst
mov x11, x2
NoReloadDst:
mov x10, x0 // reload lhs ptr
mov x19, x5 // reload depth

// With 4 or fewer columns left only the low half of each rhs block is accumulated.
cmp x13, #4
ble LoopDepthStartHalf

// First depth step uses fmul so the accumulators need no explicit zeroing.
LoopDepthStart:
ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48
ld1 {v3.4s, v4.4s}, [x14], #32
fmul v8.4s, v3.4s, v0.s[0]
fmul v10.4s, v3.4s, v0.s[1]
fmul v12.4s, v3.4s, v0.s[2]
fmul v14.4s, v3.4s, v0.s[3]
fmul v9.4s, v4.4s, v0.s[0]
fmul v11.4s, v4.4s, v0.s[1]
fmul v13.4s, v4.4s, v0.s[2]
fmul v15.4s, v4.4s, v0.s[3]
fmul v16.4s, v3.4s, v1.s[0]
fmul v18.4s, v3.4s, v1.s[1]
fmul v20.4s, v3.4s, v1.s[2]
fmul v22.4s, v3.4s, v1.s[3]
fmul v17.4s, v4.4s, v1.s[0]
fmul v19.4s, v4.4s, v1.s[1]
fmul v21.4s, v4.4s, v1.s[2]
fmul v23.4s, v4.4s, v1.s[3]
fmul v24.4s, v3.4s, v2.s[0]
fmul v26.4s, v3.4s, v2.s[1]
fmul v28.4s, v3.4s, v2.s[2]
fmul v30.4s, v3.4s, v2.s[3]
fmul v25.4s, v4.4s, v2.s[0]
fmul v27.4s, v4.4s, v2.s[1]
fmul v29.4s, v4.4s, v2.s[2]
fmul v31.4s, v4.4s, v2.s[3]

subs x19, x19, #1
beq Bias

LoopDepth:
ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48
ld1 {v3.4s, v4.4s}, [x14], #32
fmla v8.4s, v3.4s, v0.s[0]
fmla v10.4s, v3.4s, v0.s[1]
fmla v12.4s, v3.4s, v0.s[2]
fmla v14.4s, v3.4s, v0.s[3]
fmla v9.4s, v4.4s, v0.s[0]
fmla v11.4s, v4.4s, v0.s[1]
fmla v13.4s, v4.4s, v0.s[2]
fmla v15.4s, v4.4s, v0.s[3]
fmla v16.4s, v3.4s, v1.s[0]
fmla v18.4s, v3.4s, v1.s[1]
fmla v20.4s, v3.4s, v1.s[2]
fmla v22.4s, v3.4s, v1.s[3]
fmla v17.4s, v4.4s, v1.s[0]
fmla v19.4s, v4.4s, v1.s[1]
fmla v21.4s, v4.4s, v1.s[2]
fmla v23.4s, v4.4s, v1.s[3]
fmla v24.4s, v3.4s, v2.s[0]
fmla v26.4s, v3.4s, v2.s[1]
fmla v28.4s, v3.4s, v2.s[2]
fmla v30.4s, v3.4s, v2.s[3]
fmla v25.4s, v4.4s, v2.s[0]
fmla v27.4s, v4.4s, v2.s[1]
fmla v29.4s, v4.4s, v2.s[2]
fmla v31.4s, v4.4s, v2.s[3]

subs x19, x19, #1
bgt LoopDepth

// Optional bias: the same 8 bias values are added to every row of the tile.
Bias:
cbz x3, Activation
ld1 {v0.4s}, [x12], #16
ld1 {v1.4s}, [x12], #16
fadd v8.4s, v8.4s, v0.4s
fadd v9.4s, v9.4s, v1.4s
fadd v10.4s, v10.4s, v0.4s
fadd v11.4s, v11.4s, v1.4s
fadd v12.4s, v12.4s, v0.4s
fadd v13.4s, v13.4s, v1.4s
fadd v14.4s, v14.4s, v0.4s
fadd v15.4s, v15.4s, v1.4s
fadd v16.4s, v16.4s, v0.4s
fadd v17.4s, v17.4s, v1.4s
fadd v18.4s, v18.4s, v0.4s
fadd v19.4s, v19.4s, v1.4s
fadd v20.4s, v20.4s, v0.4s
fadd v21.4s, v21.4s, v1.4s
fadd v22.4s, v22.4s, v0.4s
fadd v23.4s, v23.4s, v1.4s
fadd v24.4s, v24.4s, v0.4s
fadd v25.4s, v25.4s, v1.4s
fadd v26.4s, v26.4s, v0.4s
fadd v27.4s, v27.4s, v1.4s
fadd v28.4s, v28.4s, v0.4s
fadd v29.4s, v29.4s, v1.4s
fadd v30.4s, v30.4s, v0.4s
fadd v31.4s, v31.4s, v1.4s

Activation:
cmp x4, #3
beq Relu6
cmp x4, #1
beq Relu
b Write

// Relu6 clamps to 6.0 from above, then falls through into Relu for the lower clamp at 0.
Relu6:
mov w19, #6
dup v2.4s, w19
scvtf v2.4s, v2.4s
fmin v8.4s, v8.4s, v2.4s
fmin v9.4s, v9.4s, v2.4s
fmin v10.4s, v10.4s, v2.4s
fmin v11.4s, v11.4s, v2.4s
fmin v12.4s, v12.4s, v2.4s
fmin v13.4s, v13.4s, v2.4s
fmin v14.4s, v14.4s, v2.4s
fmin v15.4s, v15.4s, v2.4s
fmin v16.4s, v16.4s, v2.4s
fmin v17.4s, v17.4s, v2.4s
fmin v18.4s, v18.4s, v2.4s
fmin v19.4s, v19.4s, v2.4s
fmin v20.4s, v20.4s, v2.4s
fmin v21.4s, v21.4s, v2.4s
fmin v22.4s, v22.4s, v2.4s
fmin v23.4s, v23.4s, v2.4s
fmin v24.4s, v24.4s, v2.4s
fmin v25.4s, v25.4s, v2.4s
fmin v26.4s, v26.4s, v2.4s
fmin v27.4s, v27.4s, v2.4s
fmin v28.4s, v28.4s, v2.4s
fmin v29.4s, v29.4s, v2.4s
fmin v30.4s, v30.4s, v2.4s
fmin v31.4s, v31.4s, v2.4s
Relu:
dup v3.4s, wzr
fmax v8.4s, v8.4s, v3.4s
fmax v9.4s, v9.4s, v3.4s
fmax v10.4s, v10.4s, v3.4s
fmax v11.4s, v11.4s, v3.4s
fmax v12.4s, v12.4s, v3.4s
fmax v13.4s, v13.4s, v3.4s
fmax v14.4s, v14.4s, v3.4s
fmax v15.4s, v15.4s, v3.4s
fmax v16.4s, v16.4s, v3.4s
fmax v17.4s, v17.4s, v3.4s
fmax v18.4s, v18.4s, v3.4s
fmax v19.4s, v19.4s, v3.4s
fmax v20.4s, v20.4s, v3.4s
fmax v21.4s, v21.4s, v3.4s
fmax v22.4s, v22.4s, v3.4s
fmax v23.4s, v23.4s, v3.4s
fmax v24.4s, v24.4s, v3.4s
fmax v25.4s, v25.4s, v3.4s
fmax v26.4s, v26.4s, v3.4s
fmax v27.4s, v27.4s, v3.4s
fmax v28.4s, v28.4s, v3.4s
fmax v29.4s, v29.4s, v3.4s
fmax v30.4s, v30.4s, v3.4s
fmax v31.4s, v31.4s, v3.4s
b Write

// Half-width path (<= 4 columns left): only the even accumulators are computed.
// v4 is still loaded so the rhs cursor advances by a full 8-float block.
LoopDepthStartHalf:
ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48
ld1 {v3.4s, v4.4s}, [x14], #32
fmul v8.4s, v3.4s, v0.s[0]
fmul v10.4s, v3.4s, v0.s[1]
fmul v12.4s, v3.4s, v0.s[2]
fmul v14.4s, v3.4s, v0.s[3]
fmul v16.4s, v3.4s, v1.s[0]
fmul v18.4s, v3.4s, v1.s[1]
fmul v20.4s, v3.4s, v1.s[2]
fmul v22.4s, v3.4s, v1.s[3]
fmul v24.4s, v3.4s, v2.s[0]
fmul v26.4s, v3.4s, v2.s[1]
fmul v28.4s, v3.4s, v2.s[2]
fmul v30.4s, v3.4s, v2.s[3]

subs x19, x19, #1
beq BiasHalf

LoopDepthHalf:
ld1 {v0.4s, v1.4s, v2.4s}, [x10], #48
ld1 {v3.4s, v4.4s}, [x14], #32
fmla v8.4s, v3.4s, v0.s[0]
fmla v10.4s, v3.4s, v0.s[1]
fmla v12.4s, v3.4s, v0.s[2]
fmla v14.4s, v3.4s, v0.s[3]
fmla v16.4s, v3.4s, v1.s[0]
fmla v18.4s, v3.4s, v1.s[1]
fmla v20.4s, v3.4s, v1.s[2]
fmla v22.4s, v3.4s, v1.s[3]
fmla v24.4s, v3.4s, v2.s[0]
fmla v26.4s, v3.4s, v2.s[1]
fmla v28.4s, v3.4s, v2.s[2]
fmla v30.4s, v3.4s, v2.s[3]

subs x19, x19, #1
bgt LoopDepthHalf

// v1 is loaded only to advance the bias cursor by a full 8-float block.
BiasHalf:
cbz x3, ActivationHalf
ld1 {v0.4s}, [x12], #16
ld1 {v1.4s}, [x12], #16
fadd v8.4s, v8.4s, v0.4s
fadd v10.4s, v10.4s, v0.4s
fadd v12.4s, v12.4s, v0.4s
fadd v14.4s, v14.4s, v0.4s
fadd v16.4s, v16.4s, v0.4s
fadd v18.4s, v18.4s, v0.4s
fadd v20.4s, v20.4s, v0.4s
fadd v22.4s, v22.4s, v0.4s
fadd v24.4s, v24.4s, v0.4s
fadd v26.4s, v26.4s, v0.4s
fadd v28.4s, v28.4s, v0.4s
fadd v30.4s, v30.4s, v0.4s

ActivationHalf:
cmp x4, #3
beq Relu6Half
cmp x4, #1
beq ReluHalf
b Write

Relu6Half:
mov w19, #6
dup v2.4s, w19
scvtf v2.4s, v2.4s
fmin v8.4s, v8.4s, v2.4s
fmin v10.4s, v10.4s, v2.4s
fmin v12.4s, v12.4s, v2.4s
fmin v14.4s, v14.4s, v2.4s
fmin v16.4s, v16.4s, v2.4s
fmin v18.4s, v18.4s, v2.4s
fmin v20.4s, v20.4s, v2.4s
fmin v22.4s, v22.4s, v2.4s
fmin v24.4s, v24.4s, v2.4s
fmin v26.4s, v26.4s, v2.4s
fmin v28.4s, v28.4s, v2.4s
fmin v30.4s, v30.4s, v2.4s

ReluHalf:
dup v3.4s, wzr
fmax v8.4s, v8.4s, v3.4s
fmax v10.4s, v10.4s, v3.4s
fmax v12.4s, v12.4s, v3.4s
fmax v14.4s, v14.4s, v3.4s
fmax v16.4s, v16.4s, v3.4s
fmax v18.4s, v18.4s, v3.4s
fmax v20.4s, v20.4s, v3.4s
fmax v22.4s, v22.4s, v3.4s
fmax v24.4s, v24.4s, v3.4s
fmax v26.4s, v26.4s, v3.4s
fmax v28.4s, v28.4s, v3.4s
fmax v30.4s, v30.4s, v3.4s

// Dispatch on writeMode first, then on the number of columns left in this column block.
// Each row-major WriteN variant stops as soon as `row` (x6) rows have been stored.
Write:
cmp x9, #2
beq WriteWino
cbz x9, WriteC8
cmp x13, #1
beq Write1
cmp x13, #2
beq Write2
cmp x13, #3
beq Write3
cmp x13, #4
beq Write4
cmp x13, #5
beq Write5
cmp x13, #6
beq Write6
cmp x13, #7
beq Write7
b Write8

Write1:
add x2, x2, #4
str s8, [x11]
cmp x6, #1
beq WriteEnd
add x11, x11, x8
str s10, [x11]
cmp x6, #2
beq WriteEnd
add x11, x11, x8
str s12, [x11]
cmp x6, #3
beq WriteEnd
add x11, x11, x8
str s14, [x11]
cmp x6, #4
beq WriteEnd
add x11, x11, x8
str s16, [x11]
cmp x6, #5
beq WriteEnd
add x11, x11, x8
str s18, [x11]
cmp x6, #6
beq WriteEnd
add x11, x11, x8
str s20, [x11]
cmp x6, #7
beq WriteEnd
add x11, x11, x8
str s22, [x11]
cmp x6, #8
beq WriteEnd
add x11, x11, x8
str s24, [x11]
cmp x6, #9
beq WriteEnd
add x11, x11, x8
str s26, [x11]
cmp x6, #10
beq WriteEnd
add x11, x11, x8
str s28, [x11]
cmp x6, #11
beq WriteEnd
add x11, x11, x8
str s30, [x11]
add x11, x11, x8
add x11, x11, #4
b WriteEnd
Write2:
add x2, x2, #8
st1 {v8.2s}, [x11], x8
cmp x6, #1
beq WriteEnd
st1 {v10.2s}, [x11], x8
cmp x6, #2
beq WriteEnd
st1 {v12.2s}, [x11], x8
cmp x6, #3
beq WriteEnd
st1 {v14.2s}, [x11], x8
cmp x6, #4
beq WriteEnd
st1 {v16.2s}, [x11], x8
cmp x6, #5
beq WriteEnd
st1 {v18.2s}, [x11], x8
cmp x6, #6
beq WriteEnd
st1 {v20.2s}, [x11], x8
cmp x6, #7
beq WriteEnd
st1 {v22.2s}, [x11], x8
cmp x6, #8
beq WriteEnd
st1 {v24.2s}, [x11], x8
cmp x6, #9
beq WriteEnd
st1 {v26.2s}, [x11], x8
cmp x6, #10
beq WriteEnd
st1 {v28.2s}, [x11], x8
cmp x6, #11
beq WriteEnd
st1 {v30.2s}, [x11], x8
add x11, x11, #8
b WriteEnd
Write3:
add x2, x2, #12
add x19, x11, #8 // x19 tracks the third column of each row
st1 {v8.2s}, [x11], x8
st1 {v8.s}[2], [x19], x8
cmp x6, #1
beq WriteEnd
st1 {v10.2s}, [x11], x8
st1 {v10.s}[2], [x19], x8
cmp x6, #2
beq WriteEnd
st1 {v12.2s}, [x11], x8
st1 {v12.s}[2], [x19], x8
cmp x6, #3
beq WriteEnd
st1 {v14.2s}, [x11], x8
st1 {v14.s}[2], [x19], x8
cmp x6, #4
beq WriteEnd
st1 {v16.2s}, [x11], x8
st1 {v16.s}[2], [x19], x8
cmp x6, #5
beq WriteEnd
st1 {v18.2s}, [x11], x8
st1 {v18.s}[2], [x19], x8
cmp x6, #6
beq WriteEnd
st1 {v20.2s}, [x11], x8
st1 {v20.s}[2], [x19], x8
cmp x6, #7
beq WriteEnd
st1 {v22.2s}, [x11], x8
st1 {v22.s}[2], [x19], x8
cmp x6, #8
beq WriteEnd
st1 {v24.2s}, [x11], x8
st1 {v24.s}[2], [x19], x8
cmp x6, #9
beq WriteEnd
st1 {v26.2s}, [x11], x8
st1 {v26.s}[2], [x19], x8
cmp x6, #10
beq WriteEnd
st1 {v28.2s}, [x11], x8
st1 {v28.s}[2], [x19], x8
cmp x6, #11
beq WriteEnd
st1 {v30.2s}, [x11], x8
st1 {v30.s}[2], [x19]
add x11, x11, #12
b WriteEnd
Write4:
add x2, x2, #16
st1 {v8.4s}, [x11], x8
cmp x6, #1
beq WriteEnd
st1 {v10.4s}, [x11], x8
cmp x6, #2
beq WriteEnd
st1 {v12.4s}, [x11], x8
cmp x6, #3
beq WriteEnd
st1 {v14.4s}, [x11], x8
cmp x6, #4
beq WriteEnd
st1 {v16.4s}, [x11], x8
cmp x6, #5
beq WriteEnd
st1 {v18.4s}, [x11], x8
cmp x6, #6
beq WriteEnd
st1 {v20.4s}, [x11], x8
cmp x6, #7
beq WriteEnd
st1 {v22.4s}, [x11], x8
cmp x6, #8
beq WriteEnd
st1 {v24.4s}, [x11], x8
cmp x6, #9
beq WriteEnd
st1 {v26.4s}, [x11], x8
cmp x6, #10
beq WriteEnd
st1 {v28.4s}, [x11], x8
cmp x6, #11
beq WriteEnd
st1 {v30.4s}, [x11], x8
add x11, x11, #16
b WriteEnd
Write5:
add x2, x2, #20
add x19, x11, #16 // x19 tracks the fifth column of each row
st1 {v8.4s}, [x11], x8
str s9, [x19]
cmp x6, #1
beq WriteEnd
add x19, x19, x8
st1 {v10.4s}, [x11], x8
str s11, [x19]
cmp x6, #2
beq WriteEnd
add x19, x19, x8
st1 {v12.4s}, [x11], x8
str s13, [x19]
cmp x6, #3
beq WriteEnd
add x19, x19, x8
st1 {v14.4s}, [x11], x8
str s15, [x19]
cmp x6, #4
beq WriteEnd
add x19, x19, x8
st1 {v16.4s}, [x11], x8
str s17, [x19]
cmp x6, #5
beq WriteEnd
add x19, x19, x8
st1 {v18.4s}, [x11], x8
str s19, [x19]
cmp x6, #6
beq WriteEnd
add x19, x19, x8
st1 {v20.4s}, [x11], x8
str s21, [x19]
cmp x6, #7
beq WriteEnd
add x19, x19, x8
st1 {v22.4s}, [x11], x8
str s23, [x19]
cmp x6, #8
beq WriteEnd
add x19, x19, x8
st1 {v24.4s}, [x11], x8
str s25, [x19]
cmp x6, #9
beq WriteEnd
add x19, x19, x8
st1 {v26.4s}, [x11], x8
str s27, [x19]
cmp x6, #10
beq WriteEnd
add x19, x19, x8
st1 {v28.4s}, [x11], x8
str s29, [x19]
cmp x6, #11
beq WriteEnd
add x19, x19, x8
st1 {v30.4s}, [x11], x8
str s31, [x19]
add x11, x11, #20
b WriteEnd
Write6:
add x2, x2, #24
add x19, x11, #16 // x19 tracks columns 4-5 of each row
st1 {v8.4s}, [x11], x8
st1 {v9.2s}, [x19], x8
cmp x6, #1
beq WriteEnd
st1 {v10.4s}, [x11], x8
st1 {v11.2s}, [x19], x8
cmp x6, #2
beq WriteEnd
st1 {v12.4s}, [x11], x8
st1 {v13.2s}, [x19], x8
cmp x6, #3
beq WriteEnd
st1 {v14.4s}, [x11], x8
st1 {v15.2s}, [x19], x8
cmp x6, #4
beq WriteEnd
st1 {v16.4s}, [x11], x8
st1 {v17.2s}, [x19], x8
cmp x6, #5
beq WriteEnd
st1 {v18.4s}, [x11], x8
st1 {v19.2s}, [x19], x8
cmp x6, #6
beq WriteEnd
st1 {v20.4s}, [x11], x8
st1 {v21.2s}, [x19], x8
cmp x6, #7
beq WriteEnd
st1 {v22.4s}, [x11], x8
st1 {v23.2s}, [x19], x8
cmp x6, #8
beq WriteEnd
st1 {v24.4s}, [x11], x8
st1 {v25.2s}, [x19], x8
cmp x6, #9
beq WriteEnd
st1 {v26.4s}, [x11], x8
st1 {v27.2s}, [x19], x8
cmp x6, #10
beq WriteEnd
st1 {v28.4s}, [x11], x8
st1 {v29.2s}, [x19], x8
cmp x6, #11
beq WriteEnd
st1 {v30.4s}, [x11], x8
st1 {v31.2s}, [x19]
add x11, x11, #24
b WriteEnd
Write7:
add x2, x2, #28
add x19, x11, #16 // x19 tracks columns 4-5 of each row
add x20, x11, #24 // x20 tracks column 6 of each row
st1 {v8.4s}, [x11], x8
st1 {v9.2s}, [x19], x8
st1 {v9.s}[2], [x20], x8
cmp x6, #1
beq WriteEnd
st1 {v10.4s}, [x11], x8
st1 {v11.2s}, [x19], x8
st1 {v11.s}[2], [x20], x8
cmp x6, #2
beq WriteEnd
st1 {v12.4s}, [x11], x8
st1 {v13.2s}, [x19], x8
st1 {v13.s}[2], [x20], x8
cmp x6, #3
beq WriteEnd
st1 {v14.4s}, [x11], x8
st1 {v15.2s}, [x19], x8
st1 {v15.s}[2], [x20], x8
cmp x6, #4
beq WriteEnd
st1 {v16.4s}, [x11], x8
st1 {v17.2s}, [x19], x8
st1 {v17.s}[2], [x20], x8
cmp x6, #5
beq WriteEnd
st1 {v18.4s}, [x11], x8
st1 {v19.2s}, [x19], x8
st1 {v19.s}[2], [x20], x8
cmp x6, #6
beq WriteEnd
st1 {v20.4s}, [x11], x8
st1 {v21.2s}, [x19], x8
st1 {v21.s}[2], [x20], x8
cmp x6, #7
beq WriteEnd
st1 {v22.4s}, [x11], x8
st1 {v23.2s}, [x19], x8
st1 {v23.s}[2], [x20], x8
cmp x6, #8
beq WriteEnd
st1 {v24.4s}, [x11], x8
st1 {v25.2s}, [x19], x8
st1 {v25.s}[2], [x20], x8
cmp x6, #9
beq WriteEnd
st1 {v26.4s}, [x11], x8
st1 {v27.2s}, [x19], x8
st1 {v27.s}[2], [x20], x8
cmp x6, #10
beq WriteEnd
st1 {v28.4s}, [x11], x8
st1 {v29.2s}, [x19], x8
st1 {v29.s}[2], [x20], x8
cmp x6, #11
beq WriteEnd
st1 {v30.4s}, [x11], x8
st1 {v31.2s}, [x19]
st1 {v31.s}[2], [x20]
add x11, x11, #28
b WriteEnd
// writeMode 0: store the whole 12x8 tile contiguously (384 bytes), then step x11 by x16.
WriteC8:
mov x19, x11
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x19], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x19], #64
st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x19], #64
st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x19], #64
st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x19], #64
st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x19], #64
add x11, x11, x16
b WriteEnd
// writeMode 2: each row's 8 values are stored x15 bytes apart; x2 moves on by x16.
WriteWino:
add x2, x11, x16
st1 {v8.4s, v9.4s}, [x11], x15
st1 {v10.4s, v11.4s}, [x11], x15
st1 {v12.4s, v13.4s}, [x11], x15
st1 {v14.4s, v15.4s}, [x11], x15
st1 {v16.4s, v17.4s}, [x11], x15
st1 {v18.4s, v19.4s}, [x11], x15
st1 {v20.4s, v21.4s}, [x11], x15
st1 {v22.4s, v23.4s}, [x11], x15
st1 {v24.4s, v25.4s}, [x11], x15
st1 {v26.4s, v27.4s}, [x11], x15
st1 {v28.4s, v29.4s}, [x11], x15
st1 {v30.4s, v31.4s}, [x11], x15
b WriteEnd
Write8:
add x2, x2, #32
st1 {v8.4s, v9.4s}, [x11], x8
cmp x6, #1
beq WriteEnd
st1 {v10.4s, v11.4s}, [x11], x8
cmp x6, #2
beq WriteEnd
st1 {v12.4s, v13.4s}, [x11], x8
cmp x6, #3
beq WriteEnd
st1 {v14.4s, v15.4s}, [x11], x8
cmp x6, #4
beq WriteEnd
st1 {v16.4s, v17.4s}, [x11], x8
cmp x6, #5
beq WriteEnd
st1 {v18.4s, v19.4s}, [x11], x8
cmp x6, #6
beq WriteEnd
st1 {v20.4s, v21.4s}, [x11], x8
cmp x6, #7
beq WriteEnd
st1 {v22.4s, v23.4s}, [x11], x8
cmp x6, #8
beq WriteEnd
st1 {v24.4s, v25.4s}, [x11], x8
cmp x6, #9
beq WriteEnd
st1 {v26.4s, v27.4s}, [x11], x8
cmp x6, #10
beq WriteEnd
st1 {v28.4s, v29.4s}, [x11], x8
cmp x6, #11
beq WriteEnd
st1 {v30.4s, v31.4s}, [x11], x8
add x11, x11, #32

WriteEnd:
subs x13, x13, #8 // rhs col - 8
bgt LoopCol

// Advance to the next 12-row block of the lhs and the matching destination position.
LoopColEnd:
add x0, x0, x17
cbz x9, C8DstStep
mov x21, #4
mul x21, x21, x7
sub x11, x11, x21 // rewind by col * sizeof(float)
mov x2, x11
b NoDstStep
C8DstStep:
add x2, x2, #384 // 12 rows * 8 cols * sizeof(float)
mov x11, x2
NoDstStep:
subs x6, x6, #12
bgt LoopRow

// sp still points at the save area; the post-indexed loads release it as they go.
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ret
#endif

+ 399
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow4.S View File

@@ -0,0 +1,399 @@
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5

// void MatmulFloatNeon64OptRow4(const float *a, const float *b, float *c, const float *bias, int act_type,
//                               int depth, int row, int col, size_t stride, size_t writeMode)
//
// fp32 matmul micro-kernel working on a 4-row x 8-column output tile.
// Per depth step it consumes 4 lhs floats (16 bytes) and 8 rhs floats (32 bytes).
// Accumulators: v8..v15, two registers per output row (even = cols 0-3, odd = cols 4-7).
// act_type: 3 -> clamp to [0, 6], 1 -> clamp to [0, +inf), anything else -> no activation.
// writeMode: 0 -> WriteC8, 2 -> WriteWino, anything else -> row-major stores using `stride`.
// NOTE(review): the int arguments are used through their 64-bit x-register views; this assumes the
// caller extends them to 64 bits - confirm against the C prototype.
//
// x0: a
// x1: b
// x2: c
// x3: bias
// x4: act_type
// x5: depth
// x6: row
// x7: col
// x8: stride      (stack argument)
// x9: writeMode   (stack argument)
// Scratch: x10 lhs cursor, x11 dst cursor, x12 bias cursor, x13 remaining cols, x14 rhs cursor,
//          x15/x16 write steps, x17 lhs block stride, x19/x20 temporaries, x21 multiplier temp.

asm_function MatmulFloatNeon64OptRow4
// Keep sp lowered for the whole function and store through a scratch pointer, so the saved
// callee-saved registers never sit below sp (AAPCS64 has no red zone).
sub sp, sp, #160
mov x10, sp
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x10], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x10], #64
stp x19, x20, [x10], #16
stp x21, x22, [x10], #16

// x10 now equals the caller's sp, where the 9th and 10th arguments were passed.
ldr x8, [x10]
ldr x9, [x10, #8]

// The depth loops below read 16 bytes of lhs per step, so one 4-row lhs block is 16 * depth bytes.
mov x21, #16 // sizeof(float) * 4
mul x17, x5, x21 // block stride of lhs: sizeof(float) * 4 * depth
cbnz x9, NoC8Steps
mov x11, x2
mov x21, #32
mul x16, x6, x21 // row * 8 * sizeof(float)
NoC8Steps:
cmp x9, #2
bne NoWinoSteps
mov x21, #4
mul x15, x7, x8
mul x15, x15, x21 // kernel_size * col *sizeof(float)
mov x21, #32
mul x16, x8, x21 // kernel_size * 8 * sizeof(float)
NoWinoSteps:
mov x21, #4
mul x8, x8, x21 // stride in bytes

LoopRow4:
mov x14, x1 // reload rhs ptr
mov x13, x7 // reload rhs col
mov x12, x3 // reload bias

LoopCol4:
cbz x9, NoReloadDst4
mov x11, x2
NoReloadDst4:
mov x10, x0 // reload lhs ptr
mov x19, x5 // reload depth

// With 4 or fewer columns left only the low half of each rhs block is accumulated.
cmp x13, #4
ble LoopDepthStartHalf4

// First depth step uses fmul so the accumulators need no explicit zeroing.
LoopDepthStart4:
ld1 {v0.4s}, [x10], #16
ld1 {v3.4s, v4.4s}, [x14], #32
fmul v8.4s, v3.4s, v0.s[0]
fmul v10.4s, v3.4s, v0.s[1]
fmul v12.4s, v3.4s, v0.s[2]
fmul v14.4s, v3.4s, v0.s[3]
fmul v9.4s, v4.4s, v0.s[0]
fmul v11.4s, v4.4s, v0.s[1]
fmul v13.4s, v4.4s, v0.s[2]
fmul v15.4s, v4.4s, v0.s[3]

subs x19, x19, #1
beq Bias4

LoopDepth4:
ld1 {v0.4s}, [x10], #16
ld1 {v3.4s, v4.4s}, [x14], #32
fmla v8.4s, v3.4s, v0.s[0]
fmla v10.4s, v3.4s, v0.s[1]
fmla v12.4s, v3.4s, v0.s[2]
fmla v14.4s, v3.4s, v0.s[3]
fmla v9.4s, v4.4s, v0.s[0]
fmla v11.4s, v4.4s, v0.s[1]
fmla v13.4s, v4.4s, v0.s[2]
fmla v15.4s, v4.4s, v0.s[3]

subs x19, x19, #1
bgt LoopDepth4

// Optional bias: the same 8 bias values are added to every row of the tile.
Bias4:
cbz x3, Activation4
ld1 {v0.4s}, [x12], #16
ld1 {v1.4s}, [x12], #16
fadd v8.4s, v8.4s, v0.4s
fadd v9.4s, v9.4s, v1.4s
fadd v10.4s, v10.4s, v0.4s
fadd v11.4s, v11.4s, v1.4s
fadd v12.4s, v12.4s, v0.4s
fadd v13.4s, v13.4s, v1.4s
fadd v14.4s, v14.4s, v0.4s
fadd v15.4s, v15.4s, v1.4s

Activation4:
cmp x4, #3
beq Relu64
cmp x4, #1
beq Relu4
b Write

// Relu6 clamps to 6.0 from above, then falls through into Relu for the lower clamp at 0.
Relu64:
mov w19, #6
dup v2.4s, w19
scvtf v2.4s, v2.4s
fmin v8.4s, v8.4s, v2.4s
fmin v9.4s, v9.4s, v2.4s
fmin v10.4s, v10.4s, v2.4s
fmin v11.4s, v11.4s, v2.4s
fmin v12.4s, v12.4s, v2.4s
fmin v13.4s, v13.4s, v2.4s
fmin v14.4s, v14.4s, v2.4s
fmin v15.4s, v15.4s, v2.4s

Relu4:
dup v3.4s, wzr
fmax v8.4s, v8.4s, v3.4s
fmax v9.4s, v9.4s, v3.4s
fmax v10.4s, v10.4s, v3.4s
fmax v11.4s, v11.4s, v3.4s
fmax v12.4s, v12.4s, v3.4s
fmax v13.4s, v13.4s, v3.4s
fmax v14.4s, v14.4s, v3.4s
fmax v15.4s, v15.4s, v3.4s
b Write

// Half-width path (<= 4 columns left): only the even accumulators are computed.
// v4 is still loaded so the rhs cursor advances by a full 8-float block.
LoopDepthStartHalf4:
ld1 {v0.4s}, [x10], #16
ld1 {v3.4s}, [x14], #16
ld1 {v4.4s}, [x14], #16
fmul v8.4s, v3.4s, v0.s[0]
fmul v10.4s, v3.4s, v0.s[1]
fmul v12.4s, v3.4s, v0.s[2]
fmul v14.4s, v3.4s, v0.s[3]

subs x19, x19, #1
beq BiasHalf4

LoopDepthHalf4:
ld1 {v0.4s}, [x10], #16
ld1 {v3.4s}, [x14], #16
ld1 {v4.4s}, [x14], #16
fmla v8.4s, v3.4s, v0.s[0]
fmla v10.4s, v3.4s, v0.s[1]
fmla v12.4s, v3.4s, v0.s[2]
fmla v14.4s, v3.4s, v0.s[3]

subs x19, x19, #1
bgt LoopDepthHalf4

// v1 is loaded only to advance the bias cursor by a full 8-float block.
BiasHalf4:
cbz x3, ActivationHalf4
ld1 {v0.4s}, [x12], #16
ld1 {v1.4s}, [x12], #16
fadd v8.4s, v8.4s, v0.4s
fadd v10.4s, v10.4s, v0.4s
fadd v12.4s, v12.4s, v0.4s
fadd v14.4s, v14.4s, v0.4s

ActivationHalf4:
cmp x4, #3
beq Relu6Half4
cmp x4, #1
beq ReluHalf4
b Write

Relu6Half4:
mov w19, #6
dup v2.4s, w19
scvtf v2.4s, v2.4s
fmin v8.4s, v8.4s, v2.4s
fmin v10.4s, v10.4s, v2.4s
fmin v12.4s, v12.4s, v2.4s
fmin v14.4s, v14.4s, v2.4s

ReluHalf4:
dup v3.4s, wzr
fmax v8.4s, v8.4s, v3.4s
fmax v10.4s, v10.4s, v3.4s
fmax v12.4s, v12.4s, v3.4s
fmax v14.4s, v14.4s, v3.4s

// Dispatch on writeMode first, then on the number of columns left in this column block.
// Each row-major WriteN variant stops as soon as `row` (x6) rows have been stored.
Write:
cmp x9, #2
beq WriteWino
cbz x9, WriteC8
cmp x13, #1
beq Write1
cmp x13, #2
beq Write2
cmp x13, #3
beq Write3
cmp x13, #4
beq Write4
cmp x13, #5
beq Write5
cmp x13, #6
beq Write6
cmp x13, #7
beq Write7
b Write8

Write1:
add x2, x2, #4
str s8, [x11]
cmp x6, #1
beq WriteEnd
add x11, x11, x8
str s10, [x11]
cmp x6, #2
beq WriteEnd
add x11, x11, x8
str s12, [x11]
cmp x6, #3
beq WriteEnd
add x11, x11, x8
str s14, [x11]
add x11, x11, x8
add x11, x11, #4
b WriteEnd
Write2:
add x2, x2, #8
st1 {v8.2s}, [x11], x8
cmp x6, #1
beq WriteEnd
st1 {v10.2s}, [x11], x8
cmp x6, #2
beq WriteEnd
st1 {v12.2s}, [x11], x8
cmp x6, #3
beq WriteEnd
st1 {v14.2s}, [x11], x8
add x11, x11, #8
b WriteEnd
Write3:
add x2, x2, #12
add x19, x11, #8 // x19 tracks the third column of each row
st1 {v8.2s}, [x11], x8
st1 {v8.s}[2], [x19], x8
cmp x6, #1
beq WriteEnd
st1 {v10.2s}, [x11], x8
st1 {v10.s}[2], [x19], x8
cmp x6, #2
beq WriteEnd
st1 {v12.2s}, [x11], x8
st1 {v12.s}[2], [x19], x8
cmp x6, #3
beq WriteEnd
st1 {v14.2s}, [x11], x8
st1 {v14.s}[2], [x19]
add x11, x11, #12
b WriteEnd
Write4:
add x2, x2, #16
st1 {v8.4s}, [x11], x8
cmp x6, #1
beq WriteEnd
st1 {v10.4s}, [x11], x8
cmp x6, #2
beq WriteEnd
st1 {v12.4s}, [x11], x8
cmp x6, #3
beq WriteEnd
st1 {v14.4s}, [x11], x8
add x11, x11, #16
b WriteEnd
Write5:
add x2, x2, #20
add x19, x11, #16 // x19 tracks the fifth column of each row
st1 {v8.4s}, [x11], x8
str s9, [x19]
cmp x6, #1
beq WriteEnd
add x19, x19, x8
st1 {v10.4s}, [x11], x8
str s11, [x19]
cmp x6, #2
beq WriteEnd
add x19, x19, x8
st1 {v12.4s}, [x11], x8
str s13, [x19]
cmp x6, #3
beq WriteEnd
add x19, x19, x8
st1 {v14.4s}, [x11], x8
str s15, [x19]
add x11, x11, #20
b WriteEnd
Write6:
add x2, x2, #24
add x19, x11, #16 // x19 tracks columns 4-5 of each row
st1 {v8.4s}, [x11], x8
st1 {v9.2s}, [x19], x8
cmp x6, #1
beq WriteEnd
st1 {v10.4s}, [x11], x8
st1 {v11.2s}, [x19], x8
cmp x6, #2
beq WriteEnd
st1 {v12.4s}, [x11], x8
st1 {v13.2s}, [x19], x8
cmp x6, #3
beq WriteEnd
st1 {v14.4s}, [x11], x8
st1 {v15.2s}, [x19], x8
add x11, x11, #24
b WriteEnd
Write7:
add x2, x2, #28
add x19, x11, #16 // x19 tracks columns 4-5 of each row
add x20, x11, #24 // x20 tracks column 6 of each row
st1 {v8.4s}, [x11], x8
st1 {v9.2s}, [x19], x8
st1 {v9.s}[2], [x20], x8
cmp x6, #1
beq WriteEnd
st1 {v10.4s}, [x11], x8
st1 {v11.2s}, [x19], x8
st1 {v11.s}[2], [x20], x8
cmp x6, #2
beq WriteEnd
st1 {v12.4s}, [x11], x8
st1 {v13.2s}, [x19], x8
st1 {v13.s}[2], [x20], x8
cmp x6, #3
beq WriteEnd
st1 {v14.4s}, [x11], x8
st1 {v15.2s}, [x19], x8
st1 {v15.s}[2], [x20], x8
add x11, x11, #28
b WriteEnd
// writeMode 0: store the whole 4x8 tile contiguously (128 bytes), then step x11 by x16.
WriteC8:
mov x19, x11
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x19], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x19], #64

add x11, x11, x16
b WriteEnd
// writeMode 2: each row's 8 values are stored x15 bytes apart; x2 moves on by x16.
WriteWino:
add x2, x11, x16
st1 {v8.4s, v9.4s}, [x11], x15
st1 {v10.4s, v11.4s}, [x11], x15
st1 {v12.4s, v13.4s}, [x11], x15
st1 {v14.4s, v15.4s}, [x11], x15

b WriteEnd
Write8:
add x2, x2, #32
st1 {v8.4s, v9.4s}, [x11], x8
cmp x6, #1
beq WriteEnd
st1 {v10.4s, v11.4s}, [x11], x8
cmp x6, #2
beq WriteEnd
st1 {v12.4s, v13.4s}, [x11], x8
cmp x6, #3
beq WriteEnd
st1 {v14.4s, v15.4s}, [x11], x8
add x11, x11, #32

WriteEnd:
subs x13, x13, #8 // rhs col - 8
bgt LoopCol4


// Advance to the next 4-row block of the lhs and the matching destination position.
// The tile constants match this kernel's 4-row tile; for row <= 4 the loop still runs exactly once.
LoopColEnd:
add x0, x0, x17
cbz x9, C8DstStep
mov x21, #4
mul x21, x21, x7
sub x11, x11, x21 // rewind by col * sizeof(float)
mov x2, x11
b NoDstStep
C8DstStep:
add x2, x2, #128 // 4 rows * 8 cols * sizeof(float), the size WriteC8 stores
mov x11, x2
NoDstStep:
subs x6, x6, #4
bgt LoopRow4

// sp still points at the save area; the post-indexed loads release it as they go.
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ret
#endif

+ 586
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow8.S View File

@@ -0,0 +1,586 @@
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5

// void MatmulFloatNeon64OptRow8(const float *a, const float *b, float *c, const float *bias, int act_type,
//                               int depth, int row, int col, size_t stride, size_t writeMode)
//
// fp32 matmul micro-kernel working on an 8-row x 8-column output tile.
// Per depth step it consumes 8 lhs floats (32 bytes) and 8 rhs floats (32 bytes).
// Accumulators: v8..v23, two registers per output row (even = cols 0-3, odd = cols 4-7).
// act_type: 3 -> clamp to [0, 6], 1 -> clamp to [0, +inf), anything else -> no activation.
// writeMode: 0 -> WriteC8, 2 -> WriteWino, anything else -> row-major stores using `stride`.
// NOTE(review): the int arguments are used through their 64-bit x-register views; this assumes the
// caller extends them to 64 bits - confirm against the C prototype.
//
// x0: a
// x1: b
// x2: c
// x3: bias
// x4: act_type
// x5: depth
// x6: row
// x7: col
// x8: stride      (stack argument)
// x9: writeMode   (stack argument)
// Scratch: x10 lhs cursor, x11 dst cursor, x12 bias cursor, x13 remaining cols, x14 rhs cursor,
//          x15/x16 write steps, x17 lhs block stride, x19/x20 temporaries, x21 multiplier temp.

asm_function MatmulFloatNeon64OptRow8
// Keep sp lowered for the whole function and store through a scratch pointer, so the saved
// callee-saved registers never sit below sp (AAPCS64 has no red zone).
sub sp, sp, #160
mov x10, sp
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x10], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x10], #64
stp x19, x20, [x10], #16
stp x21, x22, [x10], #16

// x10 now equals the caller's sp, where the 9th and 10th arguments were passed.
ldr x8, [x10]
ldr x9, [x10, #8]

// The depth loops below read 32 bytes of lhs per step, so one 8-row lhs block is 32 * depth bytes.
mov x21, #32 // sizeof(float) * 8
mul x17, x5, x21 // block stride of lhs: sizeof(float) * 8 * depth
cbnz x9, NoC8Steps
mov x11, x2
mov x21, #32
mul x16, x6, x21 // row * 8 * sizeof(float)
NoC8Steps:
cmp x9, #2
bne NoWinoSteps
mov x21, #4
mul x15, x7, x8
mul x15, x15, x21 // kernel_size * col *sizeof(float)
mov x21, #32
mul x16, x8, x21 // kernel_size * 8 * sizeof(float)
NoWinoSteps:
mov x21, #4
mul x8, x8, x21 // stride in bytes

LoopRow8:
mov x14, x1 // reload rhs ptr
mov x13, x7 // reload rhs col
mov x12, x3 // reload bias

LoopCol8:
cbz x9, NoReloadDst8
mov x11, x2
NoReloadDst8:
mov x10, x0 // reload lhs ptr
mov x19, x5 // reload depth

// With 4 or fewer columns left only the low half of each rhs block is accumulated.
cmp x13, #4
ble LoopDepthStartHalf8

// First depth step uses fmul so the accumulators need no explicit zeroing.
LoopDepthStart8:
ld1 {v0.4s, v1.4s}, [x10], #32
ld1 {v3.4s, v4.4s}, [x14], #32
fmul v8.4s, v3.4s, v0.s[0]
fmul v10.4s, v3.4s, v0.s[1]
fmul v12.4s, v3.4s, v0.s[2]
fmul v14.4s, v3.4s, v0.s[3]
fmul v9.4s, v4.4s, v0.s[0]
fmul v11.4s, v4.4s, v0.s[1]
fmul v13.4s, v4.4s, v0.s[2]
fmul v15.4s, v4.4s, v0.s[3]
fmul v16.4s, v3.4s, v1.s[0]
fmul v18.4s, v3.4s, v1.s[1]
fmul v20.4s, v3.4s, v1.s[2]
fmul v22.4s, v3.4s, v1.s[3]
fmul v17.4s, v4.4s, v1.s[0]
fmul v19.4s, v4.4s, v1.s[1]
fmul v21.4s, v4.4s, v1.s[2]
fmul v23.4s, v4.4s, v1.s[3]

subs x19, x19, #1
beq Bias8

LoopDepth8:
ld1 {v0.4s, v1.4s}, [x10], #32
ld1 {v3.4s, v4.4s}, [x14], #32
fmla v8.4s, v3.4s, v0.s[0]
fmla v10.4s, v3.4s, v0.s[1]
fmla v12.4s, v3.4s, v0.s[2]
fmla v14.4s, v3.4s, v0.s[3]
fmla v9.4s, v4.4s, v0.s[0]
fmla v11.4s, v4.4s, v0.s[1]
fmla v13.4s, v4.4s, v0.s[2]
fmla v15.4s, v4.4s, v0.s[3]
fmla v16.4s, v3.4s, v1.s[0]
fmla v18.4s, v3.4s, v1.s[1]
fmla v20.4s, v3.4s, v1.s[2]
fmla v22.4s, v3.4s, v1.s[3]
fmla v17.4s, v4.4s, v1.s[0]
fmla v19.4s, v4.4s, v1.s[1]
fmla v21.4s, v4.4s, v1.s[2]
fmla v23.4s, v4.4s, v1.s[3]

subs x19, x19, #1
bgt LoopDepth8

// Optional bias: the same 8 bias values are added to every row of the tile.
Bias8:
cbz x3, Activation8
ld1 {v0.4s}, [x12], #16
ld1 {v1.4s}, [x12], #16
fadd v8.4s, v8.4s, v0.4s
fadd v9.4s, v9.4s, v1.4s
fadd v10.4s, v10.4s, v0.4s
fadd v11.4s, v11.4s, v1.4s
fadd v12.4s, v12.4s, v0.4s
fadd v13.4s, v13.4s, v1.4s
fadd v14.4s, v14.4s, v0.4s
fadd v15.4s, v15.4s, v1.4s
fadd v16.4s, v16.4s, v0.4s
fadd v17.4s, v17.4s, v1.4s
fadd v18.4s, v18.4s, v0.4s
fadd v19.4s, v19.4s, v1.4s
fadd v20.4s, v20.4s, v0.4s
fadd v21.4s, v21.4s, v1.4s
fadd v22.4s, v22.4s, v0.4s
fadd v23.4s, v23.4s, v1.4s

Activation8:
cmp x4, #3
beq Relu68
cmp x4, #1
beq Relu8
b Write

// Relu6 clamps to 6.0 from above, then falls through into Relu for the lower clamp at 0.
Relu68:
mov w19, #6
dup v2.4s, w19
scvtf v2.4s, v2.4s
fmin v8.4s, v8.4s, v2.4s
fmin v9.4s, v9.4s, v2.4s
fmin v10.4s, v10.4s, v2.4s
fmin v11.4s, v11.4s, v2.4s
fmin v12.4s, v12.4s, v2.4s
fmin v13.4s, v13.4s, v2.4s
fmin v14.4s, v14.4s, v2.4s
fmin v15.4s, v15.4s, v2.4s
fmin v16.4s, v16.4s, v2.4s
fmin v17.4s, v17.4s, v2.4s
fmin v18.4s, v18.4s, v2.4s
fmin v19.4s, v19.4s, v2.4s
fmin v20.4s, v20.4s, v2.4s
fmin v21.4s, v21.4s, v2.4s
fmin v22.4s, v22.4s, v2.4s
fmin v23.4s, v23.4s, v2.4s

Relu8:
dup v3.4s, wzr
fmax v8.4s, v8.4s, v3.4s
fmax v9.4s, v9.4s, v3.4s
fmax v10.4s, v10.4s, v3.4s
fmax v11.4s, v11.4s, v3.4s
fmax v12.4s, v12.4s, v3.4s
fmax v13.4s, v13.4s, v3.4s
fmax v14.4s, v14.4s, v3.4s
fmax v15.4s, v15.4s, v3.4s
fmax v16.4s, v16.4s, v3.4s
fmax v17.4s, v17.4s, v3.4s
fmax v18.4s, v18.4s, v3.4s
fmax v19.4s, v19.4s, v3.4s
fmax v20.4s, v20.4s, v3.4s
fmax v21.4s, v21.4s, v3.4s
fmax v22.4s, v22.4s, v3.4s
fmax v23.4s, v23.4s, v3.4s
b Write

// Half-width path (<= 4 columns left): only the even accumulators are computed.
LoopDepthStartHalf8:
ld1 {v0.4s, v1.4s}, [x10], #32
ld1 {v3.4s}, [x14], #16
ld1 {v4.4s}, [x14], #16 // weight packed 8, only hold place
fmul v8.4s, v3.4s, v0.s[0]
fmul v10.4s, v3.4s, v0.s[1]
fmul v12.4s, v3.4s, v0.s[2]
fmul v14.4s, v3.4s, v0.s[3]
fmul v16.4s, v3.4s, v1.s[0]
fmul v18.4s, v3.4s, v1.s[1]
fmul v20.4s, v3.4s, v1.s[2]
fmul v22.4s, v3.4s, v1.s[3]

subs x19, x19, #1
beq BiasHalf8

LoopDepthHalf8:
ld1 {v0.4s, v1.4s}, [x10], #32
ld1 {v3.4s}, [x14], #16
ld1 {v4.4s}, [x14], #16 // only hold place
fmla v8.4s, v3.4s, v0.s[0]
fmla v10.4s, v3.4s, v0.s[1]
fmla v12.4s, v3.4s, v0.s[2]
fmla v14.4s, v3.4s, v0.s[3]
fmla v16.4s, v3.4s, v1.s[0]
fmla v18.4s, v3.4s, v1.s[1]
fmla v20.4s, v3.4s, v1.s[2]
fmla v22.4s, v3.4s, v1.s[3]

subs x19, x19, #1
bgt LoopDepthHalf8

// v1 is loaded only to advance the bias cursor by a full 8-float block.
BiasHalf8:
cbz x3, ActivationHalf8
ld1 {v0.4s}, [x12], #16
ld1 {v1.4s}, [x12], #16
fadd v8.4s, v8.4s, v0.4s
fadd v10.4s, v10.4s, v0.4s
fadd v12.4s, v12.4s, v0.4s
fadd v14.4s, v14.4s, v0.4s
fadd v16.4s, v16.4s, v0.4s
fadd v18.4s, v18.4s, v0.4s
fadd v20.4s, v20.4s, v0.4s
fadd v22.4s, v22.4s, v0.4s

ActivationHalf8:
cmp x4, #3
beq Relu6Half8
cmp x4, #1
beq ReluHalf8
b Write

Relu6Half8:
mov w19, #6
dup v2.4s, w19
scvtf v2.4s, v2.4s
fmin v8.4s, v8.4s, v2.4s
fmin v10.4s, v10.4s, v2.4s
fmin v12.4s, v12.4s, v2.4s
fmin v14.4s, v14.4s, v2.4s
fmin v16.4s, v16.4s, v2.4s
fmin v18.4s, v18.4s, v2.4s
fmin v20.4s, v20.4s, v2.4s
fmin v22.4s, v22.4s, v2.4s

ReluHalf8:
dup v3.4s, wzr
fmax v8.4s, v8.4s, v3.4s
fmax v10.4s, v10.4s, v3.4s
fmax v12.4s, v12.4s, v3.4s
fmax v14.4s, v14.4s, v3.4s
fmax v16.4s, v16.4s, v3.4s
fmax v18.4s, v18.4s, v3.4s
fmax v20.4s, v20.4s, v3.4s
fmax v22.4s, v22.4s, v3.4s

// Dispatch on writeMode first, then on the number of columns left in this column block.
// Each row-major WriteN variant stops as soon as `row` (x6) rows have been stored.
Write:
cmp x9, #2
beq WriteWino
cbz x9, WriteC8
cmp x13, #1
beq Write1
cmp x13, #2
beq Write2
cmp x13, #3
beq Write3
cmp x13, #4
beq Write4
cmp x13, #5
beq Write5
cmp x13, #6
beq Write6
cmp x13, #7
beq Write7
b Write8

Write1:
add x2, x2, #4
str s8, [x11]
cmp x6, #1
beq WriteEnd
add x11, x11, x8
str s10, [x11]
cmp x6, #2
beq WriteEnd
add x11, x11, x8
str s12, [x11]
cmp x6, #3
beq WriteEnd
add x11, x11, x8
str s14, [x11]
cmp x6, #4
beq WriteEnd
add x11, x11, x8
str s16, [x11]
cmp x6, #5
beq WriteEnd
add x11, x11, x8
str s18, [x11]
cmp x6, #6
beq WriteEnd
add x11, x11, x8
str s20, [x11]
cmp x6, #7
beq WriteEnd
add x11, x11, x8
str s22, [x11]
add x11, x11, x8
add x11, x11, #4
b WriteEnd
Write2:
add x2, x2, #8
st1 {v8.2s}, [x11], x8
cmp x6, #1
beq WriteEnd
st1 {v10.2s}, [x11], x8
cmp x6, #2
beq WriteEnd
st1 {v12.2s}, [x11], x8
cmp x6, #3
beq WriteEnd
st1 {v14.2s}, [x11], x8
cmp x6, #4
beq WriteEnd
st1 {v16.2s}, [x11], x8
cmp x6, #5
beq WriteEnd
st1 {v18.2s}, [x11], x8
cmp x6, #6
beq WriteEnd
st1 {v20.2s}, [x11], x8
cmp x6, #7
beq WriteEnd
st1 {v22.2s}, [x11], x8
add x11, x11, #8
b WriteEnd
Write3:
add x2, x2, #12
add x19, x11, #8 // x19 tracks the third column of each row
st1 {v8.2s}, [x11], x8
st1 {v8.s}[2], [x19], x8
cmp x6, #1
beq WriteEnd
st1 {v10.2s}, [x11], x8
st1 {v10.s}[2], [x19], x8
cmp x6, #2
beq WriteEnd
st1 {v12.2s}, [x11], x8
st1 {v12.s}[2], [x19], x8
cmp x6, #3
beq WriteEnd
st1 {v14.2s}, [x11], x8
st1 {v14.s}[2], [x19], x8
cmp x6, #4
beq WriteEnd
st1 {v16.2s}, [x11], x8
st1 {v16.s}[2], [x19], x8
cmp x6, #5
beq WriteEnd
st1 {v18.2s}, [x11], x8
st1 {v18.s}[2], [x19], x8
cmp x6, #6
beq WriteEnd
st1 {v20.2s}, [x11], x8
st1 {v20.s}[2], [x19], x8
cmp x6, #7
beq WriteEnd
st1 {v22.2s}, [x11], x8
st1 {v22.s}[2], [x19], x8
add x11, x11, #12
b WriteEnd
Write4:
add x2, x2, #16
st1 {v8.4s}, [x11], x8
cmp x6, #1
beq WriteEnd
st1 {v10.4s}, [x11], x8
cmp x6, #2
beq WriteEnd
st1 {v12.4s}, [x11], x8
cmp x6, #3
beq WriteEnd
st1 {v14.4s}, [x11], x8
cmp x6, #4
beq WriteEnd
st1 {v16.4s}, [x11], x8
cmp x6, #5
beq WriteEnd
st1 {v18.4s}, [x11], x8
cmp x6, #6
beq WriteEnd
st1 {v20.4s}, [x11], x8
cmp x6, #7
beq WriteEnd
st1 {v22.4s}, [x11], x8
add x11, x11, #16
b WriteEnd
Write5:
add x2, x2, #20
add x19, x11, #16 // x19 tracks the fifth column of each row
st1 {v8.4s}, [x11], x8
str s9, [x19]
cmp x6, #1
beq WriteEnd
add x19, x19, x8
st1 {v10.4s}, [x11], x8
str s11, [x19]
cmp x6, #2
beq WriteEnd
add x19, x19, x8
st1 {v12.4s}, [x11], x8
str s13, [x19]
cmp x6, #3
beq WriteEnd
add x19, x19, x8
st1 {v14.4s}, [x11], x8
str s15, [x19]
cmp x6, #4
beq WriteEnd
add x19, x19, x8
st1 {v16.4s}, [x11], x8
str s17, [x19]
cmp x6, #5
beq WriteEnd
add x19, x19, x8
st1 {v18.4s}, [x11], x8
str s19, [x19]
cmp x6, #6
beq WriteEnd
add x19, x19, x8
st1 {v20.4s}, [x11], x8
str s21, [x19]
cmp x6, #7
beq WriteEnd
add x19, x19, x8
st1 {v22.4s}, [x11], x8
str s23, [x19]
add x11, x11, #20
b WriteEnd
Write6:
add x2, x2, #24
add x19, x11, #16 // x19 tracks columns 4-5 of each row
st1 {v8.4s}, [x11], x8
st1 {v9.2s}, [x19], x8
cmp x6, #1
beq WriteEnd
st1 {v10.4s}, [x11], x8
st1 {v11.2s}, [x19], x8
cmp x6, #2
beq WriteEnd
st1 {v12.4s}, [x11], x8
st1 {v13.2s}, [x19], x8
cmp x6, #3
beq WriteEnd
st1 {v14.4s}, [x11], x8
st1 {v15.2s}, [x19], x8
cmp x6, #4
beq WriteEnd
st1 {v16.4s}, [x11], x8
st1 {v17.2s}, [x19], x8
cmp x6, #5
beq WriteEnd
st1 {v18.4s}, [x11], x8
st1 {v19.2s}, [x19], x8
cmp x6, #6
beq WriteEnd
st1 {v20.4s}, [x11], x8
st1 {v21.2s}, [x19], x8
cmp x6, #7
beq WriteEnd
st1 {v22.4s}, [x11], x8
st1 {v23.2s}, [x19], x8
add x11, x11, #24
b WriteEnd
Write7:
add x2, x2, #28
add x19, x11, #16 // x19 tracks columns 4-5 of each row
add x20, x11, #24 // x20 tracks column 6 of each row
st1 {v8.4s}, [x11], x8
st1 {v9.2s}, [x19], x8
st1 {v9.s}[2], [x20], x8
cmp x6, #1
beq WriteEnd
st1 {v10.4s}, [x11], x8
st1 {v11.2s}, [x19], x8
st1 {v11.s}[2], [x20], x8
cmp x6, #2
beq WriteEnd
st1 {v12.4s}, [x11], x8
st1 {v13.2s}, [x19], x8
st1 {v13.s}[2], [x20], x8
cmp x6, #3
beq WriteEnd
st1 {v14.4s}, [x11], x8
st1 {v15.2s}, [x19], x8
st1 {v15.s}[2], [x20], x8
cmp x6, #4
beq WriteEnd
st1 {v16.4s}, [x11], x8
st1 {v17.2s}, [x19], x8
st1 {v17.s}[2], [x20], x8
cmp x6, #5
beq WriteEnd
st1 {v18.4s}, [x11], x8
st1 {v19.2s}, [x19], x8
st1 {v19.s}[2], [x20], x8
cmp x6, #6
beq WriteEnd
st1 {v20.4s}, [x11], x8
st1 {v21.2s}, [x19], x8
st1 {v21.s}[2], [x20], x8
cmp x6, #7
beq WriteEnd
st1 {v22.4s}, [x11], x8
st1 {v23.2s}, [x19], x8
st1 {v23.s}[2], [x20], x8
add x11, x11, #28
b WriteEnd
// writeMode 0: store the whole 8x8 tile contiguously (256 bytes), then step x11 by x16.
WriteC8:
mov x19, x11
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x19], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x19], #64
st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x19], #64
st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x19], #64
add x11, x11, x16
b WriteEnd
// writeMode 2: each row's 8 values are stored x15 bytes apart; x2 moves on by x16.
WriteWino:
add x2, x11, x16
st1 {v8.4s, v9.4s}, [x11], x15
st1 {v10.4s, v11.4s}, [x11], x15
st1 {v12.4s, v13.4s}, [x11], x15
st1 {v14.4s, v15.4s}, [x11], x15
st1 {v16.4s, v17.4s}, [x11], x15
st1 {v18.4s, v19.4s}, [x11], x15
st1 {v20.4s, v21.4s}, [x11], x15
st1 {v22.4s, v23.4s}, [x11], x15
b WriteEnd
Write8:
add x2, x2, #32
st1 {v8.4s, v9.4s}, [x11], x8
cmp x6, #1
beq WriteEnd
st1 {v10.4s, v11.4s}, [x11], x8
cmp x6, #2
beq WriteEnd
st1 {v12.4s, v13.4s}, [x11], x8
cmp x6, #3
beq WriteEnd
st1 {v14.4s, v15.4s}, [x11], x8
cmp x6, #4
beq WriteEnd
st1 {v16.4s, v17.4s}, [x11], x8
cmp x6, #5
beq WriteEnd
st1 {v18.4s, v19.4s}, [x11], x8
cmp x6, #6
beq WriteEnd
st1 {v20.4s, v21.4s}, [x11], x8
cmp x6, #7
beq WriteEnd
st1 {v22.4s, v23.4s}, [x11], x8
add x11, x11, #32

WriteEnd:
subs x13, x13, #8 // rhs col - 8
bgt LoopCol8

// Advance to the next 8-row block of the lhs and the matching destination position.
// The tile constants match this kernel's 8-row tile; for row <= 8 the loop still runs exactly once.
LoopColEnd:
add x0, x0, x17
cbz x9, C8DstStep
mov x21, #4
mul x21, x21, x7
sub x11, x11, x21 // rewind by col * sizeof(float)
mov x2, x11
b NoDstStep
C8DstStep:
add x2, x2, #256 // 8 rows * 8 cols * sizeof(float), the size WriteC8 stores
mov x11, x2
NoDstStep:
subs x6, x6, #8
bgt LoopRow8 // back to the row loop head so rhs ptr, col count and bias are reloaded

// sp still points at the save area; the post-indexed loads release it as they go.
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ret
#endif

+ 40
- 7
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_common_fp32.c View File

@@ -27,12 +27,35 @@ void ConvFp32(const float *input_data, float *packed_input, const float *packed_
int out_channel = conv_param->output_channel_;
int deep = conv_param->kernel_h_ * conv_param->kernel_w_ * conv_param->input_channel_;
int output_count = conv_param->output_h_ * conv_param->output_w_;
Row2ColMajorFuncPtr Row2ColMajor = NULL;
#ifdef ENABLE_AVX
const int cal_num = C6NUM;
Row2ColMajor = RowMajor2Col6Major;
#elif defined(ENABLE_SSE)
const int cal_num = C4NUM;
Row2ColMajor = RowMajor2Col4Major;
#elif defined(ENABLE_ARM64)
int cal_num = 0;
MatmulFloatOptFuncPtr MatmulFloatOpt = NULL;
if (output_count <= C4NUM) {
cal_num = C4NUM;
Row2ColMajor = RowMajor2Col4Major;
MatmulFloatOpt = MatmulFloatNeon64OptRow4;
} else if (output_count <= C8NUM) {
cal_num = C8NUM;
Row2ColMajor = RowMajor2Col8Major;
MatmulFloatOpt = MatmulFloatNeon64OptRow8;
} else {
cal_num = C12NUM;
Row2ColMajor = RowMajor2Col12Major;
MatmulFloatOpt = MatmulFloatNeon64OptRow12;
}
#elif defined(ENABLE_ARM32)
const int cal_num = C12NUM;
Row2ColMajor = RowMajor2Col12Major;
#else
const int cal_num = C12NUM;
Row2ColMajor = RowMajor2Col12Major;
#endif
int output_tile_count = UP_DIV(output_count, cal_num);

@@ -54,15 +77,25 @@ void ConvFp32(const float *input_data, float *packed_input, const float *packed_

int out_offset = thread_id * cal_num * out_channel + out_batch_offset;
float *gemm_output = output_data + out_offset;
#ifdef ENABLE_AVX
RowMajor2Col6Major(gemm_input, col_major_gemm_input, cal_num, deep);
#elif defined(ENABLE_SSE)
RowMajor2Col4Major(gemm_input, col_major_gemm_input, cal_num, deep);

Row2ColMajor(gemm_input, col_major_gemm_input, cal_num, deep);
// x86 func param types are different
#if ENABLE_AVX
MatmulFloatAvxOpt(col_major_gemm_input, packed_weight, gemm_output, bias_data, (size_t)conv_param->act_type_,
deep, real_cal_num, out_channel, (size_t)out_channel, (size_t)OutType_Nhwc);
#elif ENABLE_SSE
MatmulFloatSse64Opt(col_major_gemm_input, packed_weight, gemm_output, bias_data, (int)conv_param->act_type_, deep,
real_cal_num, out_channel, (size_t)out_channel, (int)OutType_Nhwc);
#elif ENABLE_ARM32
MatmulFloatNeon32Opt12x4(col_major_gemm_input, packed_weight, gemm_output, bias_data, (int)conv_param->act_type_,
deep, real_cal_num, out_channel, out_channel, OutType_Nhwc);
#elif ENABLE_ARM64
MatmulFloatOpt(col_major_gemm_input, packed_weight, gemm_output, bias_data, conv_param->act_type_, deep,
real_cal_num, out_channel, out_channel, OutType_Nhwc);
#else
RowMajor2Col12Major(gemm_input, col_major_gemm_input, cal_num, deep);
MatMul12x8(col_major_gemm_input, packed_weight, gemm_output, bias_data, (int)conv_param->act_type_, deep,
real_cal_num, out_channel, out_channel, OutType_Nhwc);
#endif
MatMulOpt(col_major_gemm_input, packed_weight, gemm_output, bias_data, conv_param->act_type_, deep, real_cal_num,
out_channel, out_channel, OutType_Nhwc);
}
}
}


+ 5
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_common_fp32.h View File

@@ -25,6 +25,11 @@
#ifdef __cplusplus
extern "C" {
#endif
// Pointer type for the row-major -> column-major packing routine used by ConvFp32.
// ConvFp32 assigns RowMajor2Col4Major, RowMajor2Col6Major, RowMajor2Col8Major or
// RowMajor2Col12Major to a variable of this type, depending on the enabled backend.
typedef void (*Row2ColMajorFuncPtr)(const float *src_ptr, float *dst_ptr, size_t row, size_t col);
#ifdef ENABLE_ARM64
// Pointer type for the ARM64 matmul kernels. ConvFp32 stores one of
// MatmulFloatNeon64OptRow4 / MatmulFloatNeon64OptRow8 / MatmulFloatNeon64OptRow12 here,
// chosen from output_count, and calls it once per output tile.
typedef void (*MatmulFloatOptFuncPtr)(const float *a, const float *b, float *c, const float *bias, int act_type,
int depth, int row, int col, size_t stride, size_t write_mode);
#endif

// fp32 convolution common (im2col+gemm)
void ConvFp32(const float *input_data, float *packed_input, const float *packed_weight, const float *bias_data,


+ 8
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/matmul_fp32.h View File

@@ -57,6 +57,12 @@ void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bi
int col, size_t stride, size_t writeNhwc, size_t WriteWino);
void MatmulFloatNeon64Opt(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row,
int col, size_t stride, size_t write_mode);
// Row-tiled variants of MatmulFloatNeon64Opt; all three take the same parameter list as it does.
// ConvFp32 dispatches to Row4 when output_count <= C4NUM, to Row8 when output_count <= C8NUM,
// and to Row12 otherwise, pairing each with the matching RowMajor2Col{4,8,12}Major packing.
void MatmulFloatNeon64OptRow8(const float *a, const float *b, float *c, const float *bias, int act_type, int depth,
int row, int col, size_t stride, size_t write_mode);
void MatmulFloatNeon64OptRow4(const float *a, const float *b, float *c, const float *bias, int act_type, int depth,
int row, int col, size_t stride, size_t write_mode);
void MatmulFloatNeon64OptRow12(const float *a, const float *b, float *c, const float *bias, int act_type, int depth,
int row, int col, size_t stride, size_t write_mode);
#elif ENABLE_ARM32
void MatmulFloatNeon32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row,
int col, int stride, size_t writeNhwc, size_t WriteWino);
@@ -75,6 +81,8 @@ void MatmulFloatAvxOpt(const float *a, const float *b, float *c, const float *bi
size_t row, size_t col, size_t stride, size_t write_mode);
#endif
#endif
// Matmul entry point declared outside the per-architecture #if blocks above.
// ConvFp32 calls it in its final #else branch, i.e. when none of ENABLE_AVX, ENABLE_SSE,
// ENABLE_ARM32 or ENABLE_ARM64 selects a dedicated kernel.
// NOTE(review): the name suggests 12-row by 8-column tiling of a/b -- confirm against the definition.
void MatMul12x8(const float *a, const float *b, float *dst, const float *bias, ActType act_type, int deep, int row,
int col, int stride, int out_type);

#ifdef __cplusplus
}


+ 3
- 0
mindspore/lite/micro/coder/opcoders/nnacl/fp32/convolution_fp32_coder.cc View File

@@ -146,6 +146,9 @@ int ConvolutionFP32Coder::DoCode(CoderContext *const context) {
{
"MatmulFp32.S",
"MatmulFp32Opt.S",
"MatmulFp32OptRow4.S",
"MatmulFp32OptRow8.S",
"MatmulFp32OptRow12.S",
"PreSum4x16Int8Peroc.S",
"MatVecMulFp32.S",
"PreSum4x16Int8Peroc.S",


+ 3
- 0
mindspore/lite/micro/coder/opcoders/nnacl/fp32/matmul_fp32_base_coder.cc View File

@@ -149,6 +149,9 @@ int MatMulFP32BaseCoder::CollectFilesForTarget(CoderContext *const context) {
{
"MatmulFp32.S",
"MatmulFp32Opt.S",
"MatmulFp32OptRow4.S",
"MatmulFp32OptRow8.S",
"MatmulFp32OptRow12.S",
"MatVecMulFp32.S",
});
}


Loading…
Cancel
Save