Browse Source

!6820 [MSLITE][Develop] optimization for fp32 kernel on arm32

Merge pull request !6820 from lixian/master
tags/v1.1.0
mindspore-ci-bot Gitee 5 years ago
parent
commit
12e52aa418
3 changed files with 97 additions and 63 deletions
  1. +91
    -56
      mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S
  2. +3
    -4
      mindspore/lite/nnacl/fp32/matmul.c
  3. +3
    -3
      mindspore/lite/nnacl/fp32/matmul.h

+ 91
- 56
mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S View File

@@ -7,7 +7,7 @@
#endif

// void MatmulFloatNeon32Opt(const float *a, const float *b, float *c, const float *bias, int act_type, int depth
// int row, int col, size_t stride, size_t writeNhwc, size_t WriteWino)
// int row, int col, size_t stride, size_t writeMode)
// r0: a
// r1: b
// r2: c
@@ -25,14 +25,20 @@ MatmulFloatNeon32Opt:
add sp, sp, #48

ldr r5, [sp, #4]
ldr r6, [sp, #8]
ldr r7, [sp, #12]
ldr r8, [sp, #16]

mov lr, #32 // sizeof(float) * 8
mul r12, r5, lr // block stride of lhs/rhs: sizeof(float) * 8 * depth
ldr lr, [sp, #24]
mov lr, #16 // sizeof(float) * 4
mul r12, r5, lr // block stride of lhs/rhs: sizeof(float) * 4 * depth
ldr lr, [sp, #20]
cmp lr, #0
beq NoWinoSteps
bne NoC8Steps
mov lr, #32
mul r10, r6, lr
NoC8Steps:
cmp lr, #2
bne NoWinoSteps
mov lr, #4
mul r11, r7, r8 // stride * col * sizeof(float)
mul r11, r11, lr
@@ -42,22 +48,32 @@ NoWinoSteps:
mov lr, #4
mul r8, r8, lr // stride * sizeof(float)

LoopCol:
ldr r6, [sp, #8] // reload lhs row
ldr r0, [sp, #-48] // reload lhs ptr
ldr r2, [sp, #-40] // reload dst ptr
LoopRow:
ldr r1, [sp, #-44] // reload rhs ptr
ldr r7, [sp, #12] // reload rhs col
ldr r3, [sp, #-36] // reload bias ptr

LoopRow:
ldr r1, [sp, #-44] // reload rhs ptr
LoopCol:
ldr lr, [sp, #20]
cmp lr, #0
beq NoReloadDst
ldr r2, [sp, #-40] // reload dst ptr
NoReloadDst:
ldr r0, [sp, #-48] // reload lhs ptr
ldr r5, [sp, #4] // reload depth
veor q8, q8, q8
veor q9, q9, q9
veor q10, q10, q10
veor q11, q11, q11
veor q12, q12, q12
veor q13, q13, q13
veor q14, q14, q14
veor q15, q15, q15
vld1.32 {q0}, [r0]!
vld1.32 {q1, q2}, [r1]!
vmul.f32 q8, q1, d0[0]
vmul.f32 q9, q2, d0[0]
vmul.f32 q10, q1, d0[1]
vmul.f32 q11, q2, d0[1]
vmul.f32 q12, q1, d1[0]
vmul.f32 q13, q2, d1[0]
vmul.f32 q14, q1, d1[1]
vmul.f32 q15, q2, d1[1]

subs r5, r5, #1
beq Bias

LoopDepth:
vld1.32 {q0}, [r0]!
@@ -78,8 +94,7 @@ LoopCol:
cmp r3, #0
beq Activation
vld1.32 {q0}, [r3]!
vld1.32 {q1}, [r3]
sub r3, r3, #16
vld1.32 {q1}, [r3]!
vadd.f32 q8, q8, q0
vadd.f32 q9, q9, q1
vadd.f32 q10, q10, q0
@@ -121,10 +136,9 @@ LoopCol:
vmax.f32 q15, q15, q3

Write:
ldr lr, [sp, #24]
cmp lr, #0
bne WriteWino
ldr lr, [sp, #20]
cmp lr, #2
beq WriteWino
cmp lr, #0
beq WriteC8
cmp r7, #1
@@ -144,6 +158,8 @@ LoopCol:
b Write8

Write1:
add lr, r2, #4
str lr, [sp, #-40]
vst1.32 d16[0], [r2]
cmp r6, #1
beq WriteEnd
@@ -158,8 +174,11 @@ LoopCol:
add r2, r2, r8
vst1.32 d28[0], [r2]
add r2, r2, r8
add r2, r2, #4
b WriteEnd
Write2:
add lr, r2, #8
str lr, [sp, #-40]
vst1.32 d16, [r2]
cmp r6, #1
beq WriteEnd
@@ -174,8 +193,11 @@ LoopCol:
add r2, r2, r8
vst1.32 d28, [r2]
add r2, r2, r8
add r2, r2, #8
b WriteEnd
Write3:
add lr, r2, #12
str lr, [sp, #-40]
add r4, r2, #8
vst1.32 d16, [r2]
vst1.32 d17[0], [r4]
@@ -198,8 +220,11 @@ LoopCol:
vst1.32 d28, [r2]
vst1.32 d29[0], [r4]
add r2, r2, r8
add r2, r2, #12
b WriteEnd
Write4:
add lr, r2, #16
str lr, [sp, #-40]
vst1.32 q8, [r2]
cmp r6, #1
beq WriteEnd
@@ -214,8 +239,11 @@ LoopCol:
add r2, r2, r8
vst1.32 q14, [r2]
add r2, r2, r8
add r2, r2, #16
b WriteEnd
Write5:
add lr, r2, #20
str lr, [sp, #-40]
add r4, r2, #16
vst1.32 q8, [r2]
vst1.32 d18[0], [r4]
@@ -238,8 +266,11 @@ LoopCol:
vst1.32 q14, [r2]
vst1.32 d30[0], [r4]
add r2, r2, r8
add r2, r2, #20
b WriteEnd
Write6:
add lr, r2, #24
str lr, [sp, #-40]
add r4, r2, #16
vst1.32 q8, [r2]
vst1.32 d18, [r4]
@@ -262,8 +293,11 @@ LoopCol:
vst1.32 q14, [r2]
vst1.32 d30, [r4]
add r2, r2, r8
add r2, r2, #24
b WriteEnd
Write7:
add lr, r2, #28
str lr, [sp, #-40]
add lr, r2, #24
add r4, r2, #16
vst1.32 q8, [r2]
@@ -294,15 +328,18 @@ LoopCol:
vst1.32 d30, [r4]
vst1.32 d31[0], [lr]
add r2, r2, r8
add r2, r2, #28
b WriteEnd
WriteC8:
vst1.32 {q8, q9}, [r2]!
vst1.32 {q10, q11}, [r2]!
vst1.32 {q12, q13}, [r2]!
vst1.32 {q14, q15}, [r2]!
str r2, [sp, #-40]
mov lr, r2
vst1.32 {q8, q9}, [lr]!
vst1.32 {q10, q11}, [lr]!
vst1.32 {q12, q13}, [lr]!
vst1.32 {q14, q15}, [lr]!
add r2, r2, r10
b WriteEnd
WriteWino:
add lr, r2, r10
vst1.32 {q8, q9}, [r2]
add r2, r2, r11
vst1.32 {q10, q11}, [r2]
@@ -310,9 +347,11 @@ LoopCol:
vst1.32 {q12, q13}, [r2]
add r2, r2, r11
vst1.32 {q14, q15}, [r2]
add r2, r2, r11
str lr, [sp, #-40]
b WriteEnd
Write8:
add lr, r2, #32
str lr, [sp, #-40]
vst1.32 {q8, q9}, [r2]
cmp r6, #1
beq WriteEnd
@@ -327,42 +366,38 @@ LoopCol:
add r2, r2, r8
vst1.32 {q14, q15}, [r2]
add r2, r2, r8
add r2, r2, #32

WriteEnd:
cmp r6, #4
ble LoopRowEnd
sub r6, r6, #4 // lhs row - 4
b LoopRow
cmp r7, #8
ble LoopColEnd
sub r7, r7, #8 // rhs col - 8
b LoopCol

LoopRowEnd:
ldr r1, [sp, #-44]
add r1, r1, r12 // rhs ptr + stride
str r1, [sp, #-44]
cmp r3, #0
beq NoBiasStep
add r3, r3, #32 // bias ptr + stride
NoBiasStep:
ldr lr, [sp, #24]
cmp lr, #0
bne WinoDstStep
LoopColEnd:
ldr r0, [sp, #-48]
add r0, r0, r12 // lhs ptr + stride
str r0, [sp, #-48]
ldr lr, [sp, #20]
cmp lr, #0
beq NoDstStep
ldr r2, [sp, #-40]
add r2, r2, #32 // dst ptr + stride
beq C8DstStep
mov lr, #4
ldr r7, [sp, #12] // reload rhs col
mul lr, lr, r7
sub r2, r2, lr
str r2, [sp, #-40]
b NoDstStep
WinoDstStep:
ldr r2, [sp, #-40]
add r2, r2, r10
C8DstStep:
ldr lr, [sp, #-40]
add r2, lr, #128
str r2, [sp, #-40]
NoDstStep:
cmp r7, #8
ble LoopColEnd
sub r7, r7, #8 // rhs col - 8
b LoopCol
cmp r6, #4
ble LoopRowEnd
sub r6, r6, #4 // lhs row - 4
b LoopRow

LoopColEnd:
LoopRowEnd:
sub sp, sp, #48
pop {r0-r8, r10, r11, pc}
#endif

+ 3
- 4
mindspore/lite/nnacl/fp32/matmul.c View File

@@ -477,8 +477,7 @@ void MatMulOpt(const float *a, const float *b, float *c, const float *bias, ActT
(int)(out_type == OutType_TileC8));
}
#elif ENABLE_ARM32
MatmulFloatNeon32Opt(a, b, c, bias, (int)act_type, deep, row, col, stride, (int)(out_type == OutType_Nhwc),
(int)(out_type == OutType_TileC8));
MatmulFloatNeon32Opt(a, b, c, bias, (int)act_type, deep, row, col, stride, (int)(out_type));
#else
MatMul12x8(a, b, c, bias, act_type, deep, row, col, stride, out_type);
#endif
@@ -491,8 +490,8 @@ static void SwapDims(int *dims, int index1, int index2) {
dims[index2] = tmp;
}

int MatMulInferShape(int **in_shape, int in_num, size_t *dim_size, int *out_shape, int *in_format,
int *out_format, int *in_datatype, int *out_datatype, OpParameter *param) {
int MatMulInferShape(int **in_shape, int in_num, size_t *dim_size, int *out_shape, int *in_format, int *out_format,
int *in_datatype, int *out_datatype, OpParameter *param) {
*out_datatype = in_datatype[0];
*out_format = in_format[0];
if (dim_size[0] < 2 || dim_size[1] < 2) {


+ 3
- 3
mindspore/lite/nnacl/fp32/matmul.h View File

@@ -42,12 +42,12 @@ void MatmulFloatNeon64Opt(const float *a, const float *b, float *c, const float
void MatmulFloatNeon64OptRemain(const float *a, const float *b, float *c, int depth, int row, int col, size_t stride);
#elif ENABLE_ARM32
void MatmulFloatNeon32Opt(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row,
int col, size_t stride, size_t write_nhwc, size_t write_c4);
int col, int stride, int write_mode);
#endif

#ifdef ENABLE_NNACL_INFER_SHAPE
int MatMulInferShape(int **in_shape, int in_num, size_t *dim_size, int *out_shape, int *in_format,
int *out_format, int *in_datatype, int *out_datatype, OpParameter *param);
int MatMulInferShape(int **in_shape, int in_num, size_t *dim_size, int *out_shape, int *in_format, int *out_format,
int *in_datatype, int *out_datatype, OpParameter *param);
#endif
#ifdef __cplusplus
}


Loading…
Cancel
Save