Browse Source

fp16

pull/15195/head
lzk 4 years ago
parent
commit
afaae456b5
9 changed files with 808 additions and 20 deletions
  1. +222
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm82_aarch32_fp16/MatVecMulFp16.S
  2. +93
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm82_aarch32_fp16/TiledC4MatmulFp16.S
  3. +150
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm82_aarch32_fp16/WinogradTransLeft.S
  4. +148
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm82_aarch32_fp16/WinogradTransRight.S
  5. +8
    -4
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/common_func_fp16.c
  6. +124
    -9
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/deconv_winograd_fp16.c
  7. +42
    -3
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/matmul_fp16.c
  8. +7
    -3
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/matmul_fp16.h
  9. +14
    -1
      mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.cc

+ 222
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm82_aarch32_fp16/MatVecMulFp16.S View File

@@ -0,0 +1,222 @@
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5

// void MatVecMulA32NeonFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type,
// int depth, int col) {
// r0: a
// r1: b
// r2: c
// r3: bias
// r4: act_type
// r5: depth
// r6: col
//
// fp16 vector-matrix product: c[j] = act(dot(a, b_col_j) + bias[j]) for
// j = 0..col-1, where each column of b is `depth` contiguous fp16 values.
// Columns are processed four at a time (Col4Loop); the remainder is handled
// one column at a time (Col1Loop).  act_type: 1 = Relu, 3 = Relu6.

asm_function MatVecMulA32NeonFp16
// AAPCS32 (https://static.docs.arm.com/ihi0042/i/aapcs32.pdf) requires r4-r8
// and q4-q7 to be preserved.  q4-q7 (d8-d15) are never written below, so only
// core registers are pushed: 13 registers = 52 bytes.  Moving sp back up by
// 52 re-exposes the caller's stack arguments at [sp]..[sp, #8].
push {r0-r8, r9, r10, r11, lr}
add sp, sp, #52

ldr r4, [sp]     // act_type (5th argument, passed on the stack)
ldr r5, [sp, #4] // depth
ldr r6, [sp, #8] // col

add r10, r5, r5 // stride = depth * sizeof(float16_t), bytes per column of b
mov lr, #4
mul r11, r10, lr // stride x 4 : byte distance between groups of 4 columns

cmp r6, #4
blt Col1Loop // fewer than 4 columns left -> scalar column loop
Col4Loop:
mov r7, r0 // reload a(vector) ptr
mov r9, r1 // reload b(matrix) ptr
mov r8, r5 // reload depth value

// q9-q12 accumulate partial products for columns 0-3; d30 (q15) ends up
// holding the four reduced dot products.
veor q9, q9, q9
veor q10, q10, q10
veor q11, q11, q11
veor q12, q12, q12
veor q15, q15, q15

cmp r8, #8
bge Col4Depth8
cmp r8, #4
bge Col4Depth4
cmp r8, #1
bge Col4Depth1
b Col4End

Col4Depth8:
// Multiply-accumulate 8 depth elements for all 4 columns at once.
vld1.16 {q8}, [r7]!     // 8 elements of a
add lr, r9, r10         // lr walks columns 1..3 (r10 = one column stride)
vld1.16 {q0}, [r9]!     // column 0 (r9 advances through column 0 only)
vld1.16 {q1}, [lr], r10 // column 1
vld1.16 {q2}, [lr], r10 // column 2
vld1.16 {q3}, [lr]      // column 3

vmla.f16 q9, q8, q0
vmla.f16 q10, q8, q1
vmla.f16 q11, q8, q2
vmla.f16 q12, q8, q3
sub r8, r8, #8
cmp r8, #8
bge Col4Depth8
cmp r8, #4
bge Col4Depth4
b AddC4

Col4Depth4:
// Same pattern on 4 depth elements, using only the low d-register halves.
vld1.16 {d16}, [r7]!
add lr, r9, r10
vld1.16 {d0}, [r9]!
vld1.16 {d2}, [lr], r10
vld1.16 {d4}, [lr], r10
vld1.16 {d6}, [lr]

vmla.f16 d18, d16, d0
vmla.f16 d20, d16, d2
vmla.f16 d22, d16, d4
vmla.f16 d24, d16, d6
sub r8, r8, #4
cmp r8, #4
bge Col4Depth4

AddC4:
// Horizontal reduction of the four 8-lane accumulators; afterwards
// d30 = {dot(col0), dot(col1), dot(col2), dot(col3)}.
vpadd.f16 d0, d18, d19
vpadd.f16 d1, d20, d21
vpadd.f16 d2, d22, d23
vpadd.f16 d4, d24, d25
vpadd.f16 d30, d0, d1
vpadd.f16 d31, d2, d4
vpadd.f16 d30, d30, d31
cmp r8, #1
bge Col4Depth1
b Col4End

Col4Depth1:
// Depth tail: one element of a against one element of each column,
// accumulated straight into the per-column sums in d30.
vld1.16 {d0[0]}, [r7]!
add lr, r9, r10
vld1.16 {d2[0]}, [r9]!
vld1.16 {d2[1]}, [lr], r10
vld1.16 {d2[2]}, [lr], r10
vld1.16 {d2[3]}, [lr]

vmla.f16 d30, d2, d0[0]
subs r8, r8, #1
bne Col4Depth1

Col4End:
cmp r3, #0 // bias == NULL ?
beq Col4Activation
vld1.16 {d26}, [r3]! // add 4 bias values, advancing the bias pointer
vadd.f16 d30, d30, d26

Col4Activation:
cmp r4, #3 // ActType_Relu6
beq Col4Relu6
cmp r4, #1 // ActType_Relu
beq Col4Relu
b Col4Write

Col4Relu6:
// Materialize fp16 6.0 (integer 6 then int16->fp16 convert), clamp from
// above, then FALL THROUGH to Col4Relu for the lower clamp at 0.
vmov.i16 q12, #6
vcvt.f16.s16 q12, q12
vmin.f16 d30, d30, d24

Col4Relu:
veor q13, q13, q13 // d26 = 0
vmax.f16 d30, d30, d26

Col4Write:
vst1.16 {d30}, [r2]! // store 4 results
subs r6, r6, #4
beq End
add r1, r1, r11 // b += 4 columns
cmp r6, #4
bge Col4Loop

Col1Loop:
// Tail: one column (one dot product of length `depth`) per iteration.
mov r7, r0 // reload a(vector) ptr
mov r9, r1 // reload b(matrix) ptr
mov r8, r5 // reload depth value
veor q10, q10, q10
veor q15, q15, q15

cmp r8, #8
bge Col1Depth8
cmp r8, #4
bge Col1Depth4
cmp r8, #1
bge Col1Depth1
b Col1End

Col1Depth8:
vld1.16 {q0}, [r7]!
vld1.16 {q1}, [r9]!
vmla.f16 q10, q1, q0
sub r8, r8, #8
cmp r8, #8
bge Col1Depth8
cmp r8, #4
bge Col1Depth4
b AddC1

Col1Depth4:
vld1.16 {d0}, [r7]!
vld1.16 {d2}, [r9]!
vmla.f16 d20, d2, d0
sub r8, r8, #4
cmp r8, #4
bge Col1Depth4

AddC1:
// Pairwise-reduce the 8 accumulator lanes of q10.  After these three
// vpadds only lane 0 of d30 is meaningful (it holds the full sum); the
// other lanes are don't-care, and only d30[0] is stored below.
vpadd.f16 d30, d20, d21
vpadd.f16 d30, d30, d20
vpadd.f16 d30, d30, d20
cmp r8, #1
bge Col1Depth1
b Col1End

Col1Depth1:
// Scalar depth tail, accumulating into lane 0 of d30.
vld1.16 {d0[0]}, [r7]!
vld1.16 {d2[0]}, [r9]!
vmla.f16 d30, d2, d0[0]
subs r8, r8, #1
bne Col1Depth1

Col1End:
cmp r3, #0 // bias == NULL ?
beq Col1Activation
vld1.16 {d28[0]}, [r3]! // add 1 bias value
vadd.f16 d30, d30, d28

Col1Activation:
cmp r4, #3 // ActType_Relu6
beq Col1Relu6
cmp r4, #1 // ActType_Relu
beq Col1Relu
b Col1Write

Col1Relu6:
// Clamp at 6.0, then fall through for the lower clamp at 0.
vmov.i16 d26, #6
vcvt.f16.s16 d26, d26
vmin.f16 d30, d30, d26

Col1Relu:
veor d24, d24, d24
vmax.f16 d30, d30, d24

Col1Write:
vst1.16 {d30[0]}, [r2]! // store 1 result
subs r6, r6, #1
beq End
add r1, r1, r10 // b += 1 column
b Col1Loop

End:
sub sp, sp, #52 // drop back to the frame created by the push
pop {r0-r8, r9, r10, r11, pc}
#endif

+ 93
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm82_aarch32_fp16/TiledC4MatmulFp16.S View File

@@ -0,0 +1,93 @@
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5

asm_function TiledC4MatmulFp16
// void TiledC4MatmulFp16(float16_t *dst, const float16_t *src, const float16_t *weight, size_t cal_num, size_t ic4,
// size_t oc4);
// r0: dst
// r1: src
// r2: weight
// r3: cal_num
// r4(sp): ic4
// r5(sp + #4): oc4
//
// C4-tiled fp16 matmul.  For each of the oc4 output-channel blocks it
// accumulates, over ic4 input-channel blocks, one 4x4 weight block (q4/q5,
// rows in d8-d11) against 8 tiles of 4 source channels (q2/q3 then q0/q1),
// producing 8 tiles x 4 output channels in q12-q15 (d24-d31).  dst advances
// by cal_num elements per output block; src is re-walked for every block.
push {r4-r11, lr}
vpush {q4-q7}
// 9 core registers (36 bytes) + q4-q7 (64 bytes) = 100 bytes of frame;
// moving sp up by 100 exposes the stack arguments at [sp] / [sp, #4].
add sp, sp, #100
ldr r4, [sp]     // ic4
ldr r5, [sp, #4] // oc4
add r3, r3, r3   // cal_num -> bytes (x sizeof(float16_t))
mov r7, r1       // remember src start for rewinding per output block

cmp r5, #1
blt LoopOCEnd
// NOTE(review): if ic4 == 0 this falls into LoopICEnd and stores
// uninitialized q12-q15 for the first block — confirm ic4 >= 1 at call sites.
cmp r4, #1
blt LoopICEnd
LoopOC:
ldr r4, [sp] // reload ic4 for this output block
// q12-q15 = 8 output tiles x 4 channels, zeroed per output block.
veor q15, q15, q15
veor q14, q14, q14
veor q13, q13, q13
veor q12, q12, q12
LoopIC:
vld1.16 {q4, q5}, [r2]! // weight: one 4x4 block, row i in d8+i
vld1.16 {q2, q3}, [r1]! // 16 number src : tiles 0-3, 4 channels each
// tile t: out[t] += weight_row_i * src[t][i], i = 0..3
vmla.f16 d24, d8, d4[0]
vmla.f16 d24, d9, d4[1]
vmla.f16 d24, d10, d4[2]
vmla.f16 d24, d11, d4[3]
vmla.f16 d25, d8, d5[0]
vmla.f16 d25, d9, d5[1]
vmla.f16 d25, d10, d5[2]
vmla.f16 d25, d11, d5[3]

vmla.f16 d26, d8, d6[0]
vmla.f16 d26, d9, d6[1]
vmla.f16 d26, d10, d6[2]
vmla.f16 d26, d11, d6[3]

vmla.f16 d27, d8, d7[0]
vmla.f16 d27, d9, d7[1]
vmla.f16 d27, d10, d7[2]
vmla.f16 d27, d11, d7[3]

vld1.16 {q0, q1}, [r1]! // 16 number src : tiles 4-7
vmla.f16 d28, d8, d0[0]
vmla.f16 d28, d9, d0[1]
vmla.f16 d28, d10, d0[2]
vmla.f16 d28, d11, d0[3]
vmla.f16 d29, d8, d1[0]
vmla.f16 d29, d9, d1[1]
vmla.f16 d29, d10, d1[2]
vmla.f16 d29, d11, d1[3]

vmla.f16 d30, d8, d2[0]
vmla.f16 d30, d9, d2[1]
vmla.f16 d30, d10, d2[2]
vmla.f16 d30, d11, d2[3]

vmla.f16 d31, d8, d3[0]
vmla.f16 d31, d9, d3[1]
vmla.f16 d31, d10, d3[2]
vmla.f16 d31, d11, d3[3]

subs r4, r4, #1
bne LoopIC
b LoopICEnd // redundant (falls through) but kept as written
LoopICEnd:
mov lr, r0
vst1.16 {q12, q13}, [lr]! // store tiles 0-3
vst1.16 {q14, q15}, [lr]! // store tiles 4-7
add r0, r0, r3 // dst += cal_num
mov r1, r7 // rewind src for the next output-channel block
subs r5, r5, #1
bne LoopOC
LoopOCEnd:
sub sp, sp, #100
vpop {q4-q7}
pop {r4-r11, pc}
#endif

+ 150
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm82_aarch32_fp16/WinogradTransLeft.S View File

@@ -0,0 +1,150 @@
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5

// void WinogradTransLeftFp16(const float16_t *S, const float16_t *B, float16_t *M, size_t w, size_t h, size_t k,
// size_t length);
//r0: S
//r1: B
//r2: M
//r3: w
//r4: h
//r5: k
//r6: length
//
// Left-hand Winograd transform: for every (y, x) it writes one unit of
// 4*length fp16 values, M[y][x] = sum_{i<k} B[i*h + y] * S[i][x], processing
// each unit as `length` float16x4 vectors.  The k loop is unrolled by 4,
// with 3- and 1-wide tails.
asm_function WinogradTransLeftFp16
// 11 core registers (44 bytes) + q4-q7 (64 bytes) = 108 bytes of frame;
// moving sp up by 108 exposes the stack arguments h/k/length at
// [sp]..[sp, #8].  r0 (S) and r3 (w) are pushed first so they can be
// reloaded every LoopH iteration from [sp, #-44] / [sp, #-40].
push {r0, r3, r4-r11, lr}
vpush {q4-q7}
add sp, sp, #108
ldr r4, [sp]     // h
ldr r6, [sp, #8] // length

mov r8, #8 // 4 * sizeof(float16_t)
mul r8, r6, r8 // length * 4 * 2 : bytes per (y, x) unit
mul r7, r3, r8 // step for S : bytes per k row (w units)
add r10, r4, r4 // step for B : h * sizeof(float16_t) between B[i*h + y] entries

cmp r4, #1
blt LoopHEnd
cmp r3, #1
blt LoopHEnd
LoopH:
ldr r3, [sp, #-40] // w
ldr r0, [sp, #-44] // S : restart at the first x for this y
LoopW:
mov r11, r0 // S
mov lr, r1 // B_src : B + y, walked with stride r10 (= h elements)
// Zero the current output unit (length float16x4 stores), then rewind r2
// so the k loops can read-modify-write it.
veor q6, q6, q6
ldr r6, [sp, #8]
InitZero:
vst1.16 {d12}, [r2]!
subs r6, r6, #1
bne InitZero
sub r2, r2, r8

ldr r5, [sp, #4] // k
cmp r5, #4
bge LoopK4
cmp r5, #3
bge LoopK3
cmp r5, #1
bge LoopK1
b LoopKEnd

LoopK4:
// Four k values at once: the scalars B[i*h+y] go to lane 0 of d1/d3/d5/d7,
// and the four S rows are streamed via r11/r12/r14/r9 (row stride r7).
ldr r6, [sp, #8]
vld1.16 {d1[0]}, [lr], r10
vld1.16 {d3[0]}, [lr], r10
vld1.16 {d5[0]}, [lr], r10
vld1.16 {d7[0]}, [lr], r10

add r12, r11, r7
add r14, r12, r7
add r9, r14, r7
LoopK4L4:
vld1.16 {d12}, [r2]
vld1.16 {d0}, [r11]!
vld1.16 {d2}, [r12]!
vmla.f16 d12, d0, d1[0]
vld1.16 {d4}, [r14]!
vmla.f16 d12, d2, d3[0]
vld1.16 {d6}, [r9]!
vmla.f16 d12, d4, d5[0]
vmla.f16 d12, d6, d7[0]
vst1.16 {d12}, [r2]! // dst
subs r6, r6, #1 // length
bne LoopK4L4

subs r5, r5, #4 // k
beq LoopKEnd
sub r2, r2, r8 // dst - step : accumulate into the same unit again
sub r9, r9, r8 // rewind the last row pointer to its row start...
add r11, r9, r7 // ...then the next k group starts one S-row step later
cmp r5, #4
bge LoopK4
cmp r5, #3
bge LoopK3
b LoopK1

LoopK3:
// Same as LoopK4 for a 3-element k tail (rows r11/r12/r9).
ldr r6, [sp, #8]
vld1.16 {d1[0]}, [lr], r10
vld1.16 {d3[0]}, [lr], r10
vld1.16 {d5[0]}, [lr], r10

add r12, r11, r7
add r9, r12, r7
LoopK3L4:
vld1.16 {d12}, [r2]
vld1.16 {d0}, [r11]!
vld1.16 {d2}, [r12]!
vmla.f16 d12, d0, d1[0]
vld1.16 {d4}, [r9]!
vmla.f16 d12, d2, d3[0]
vmla.f16 d12, d4, d5[0]
vst1.16 {d12}, [r2]! // dst
subs r6, r6, #1 // length
bne LoopK3L4

subs r5, r5, #3 // k
beq LoopKEnd
sub r2, r2, r8 // dst - step
sub r9, r9, r8
add r11, r9, r7
cmp r5, #3
bge LoopK3
b LoopK1

LoopK1:
// Scalar k tail: one B coefficient per pass over the unit.
ldr r6, [sp, #8]
vld1.16 {d1[0]}, [lr], r10

LoopK1L4:
vld1.16 {d12}, [r2]
vld1.16 {d0}, [r11]!
vmla.f16 d12, d0, d1[0]
vst1.16 {d12}, [r2]! // dst
subs r6, r6, #1 // length
bne LoopK1L4

subs r5, r5, #1 // k
beq LoopKEnd
sub r2, r2, r8 // dst - step
sub r11, r11, r8 // rewind to the row start...
add r11, r11, r7 // ...then advance to the next k row
b LoopK1
LoopKEnd:
add r0, r0, r8 // S += unitstep : next x
subs r3, r3, #1
bne LoopW
LoopWEnd:
subs r4, r4, #1
beq LoopHEnd
add r1, r1, #2 // B += 1 : next y (sizeof(float16_t) bytes)
b LoopH
LoopHEnd:
sub sp, sp, #108
vpop {q4-q7}
pop {r0, r3, r4-r11, pc}
#endif

+ 148
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm82_aarch32_fp16/WinogradTransRight.S View File

@@ -0,0 +1,148 @@
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5

// void WinogradTransRightFp16(const float16_t *S, const float16_t *B, float16_t *M, size_t w, size_t h, size_t k,
// size_t length);
//r0: S
//r1: B
//r2: M
//r3: w
//r4: h
//r5: k
//r6: length
//
// Right-hand Winograd transform: for every (y, x) it writes one unit of
// 4*length fp16 values, M[y][x] = sum_{i<k} S[y][i] * B[i*h + x], processing
// each unit as `length` float16x4 vectors.  Unlike the Left variant, the k
// source units are contiguous within a y row (unit stride r8).  The k loop
// is unrolled by 4, with 3- and 1-wide tails.
asm_function WinogradTransRightFp16
// 11 core registers (44 bytes) + q4-q7 (64 bytes) = 108 bytes of frame;
// moving sp up by 108 exposes the stack arguments h/k/length at
// [sp]..[sp, #8].  r1 (B) and r3 (w) are pushed first so they can be
// reloaded every LoopH iteration from [sp, #-44] / [sp, #-40].
push {r1, r3, r4-r11, lr}
vpush {q4-q7}
add sp, sp, #108
ldr r4, [sp]     // h
ldr r5, [sp, #4] // k
ldr r6, [sp, #8] // length

mov r8, #8 // 4 * sizeof(float16_t)
mul r8, r6, r8 // length * 4 * 2 : bytes per unit
mul r7, r5, r8 // step for S = k * unitStep * 4 : bytes per y row (k units)
add r10, r4, r4 // step for B = 2 * h : bytes between B[i*h + x] entries

cmp r4, #1
blt LoopHEnd
cmp r3, #1
blt LoopHEnd
LoopH:
ldr r3, [sp, #-40] // w
ldr r1, [sp, #-44] // B : restart at the first x for this y
LoopW:
mov r11, r0 // S : start of this y row
mov lr, r1 // B_src : B + x, walked with stride r10 (= h elements)
// Zero the current output unit, then rewind r2 so the k loops can
// read-modify-write it.
veor q6, q6, q6
ldr r6, [sp, #8]
InitZero:
vst1.16 {d12}, [r2]!
subs r6, r6, #1
bne InitZero
sub r2, r2, r8

ldr r5, [sp, #4] // k
cmp r5, #4
bge LoopK4
cmp r5, #3
bge LoopK3
cmp r5, #1
bge LoopK1
b LoopKEnd

LoopK4:
// Four k values at once: the scalars B[i*h+x] go to lane 0 of d1/d3/d5/d7,
// and the four source units are streamed via r11/r12/r14/r9 (stride r8).
ldr r6, [sp, #8]
vld1.16 {d1[0]}, [lr], r10
vld1.16 {d3[0]}, [lr], r10
vld1.16 {d5[0]}, [lr], r10
vld1.16 {d7[0]}, [lr], r10

add r12, r11, r8
add r14, r12, r8
add r9, r14, r8
LoopK4L4:
vld1.16 {d12}, [r2]
vld1.16 {d0}, [r11]!
vld1.16 {d2}, [r12]!
vmla.f16 d12, d0, d1[0]
vld1.16 {d4}, [r14]!
vmla.f16 d12, d2, d3[0]
vld1.16 {d6}, [r9]!
vmla.f16 d12, d4, d5[0]
vmla.f16 d12, d6, d7[0]
vst1.16 {d12}, [r2]! // dst
subs r6, r6, #1 // length
bne LoopK4L4

subs r5, r5, #4 // k
beq LoopKEnd
sub r2, r2, r8 // dst - step : accumulate into the same unit again
mov r11, r9 // r9 has advanced past unit 3, i.e. to the next unprocessed unit
cmp r5, #4
bge LoopK4
cmp r5, #3
bge LoopK3
b LoopK1

LoopK3:
// Same as LoopK4 for a 3-element k tail (units r11/r12/r9).
ldr r6, [sp, #8]
vld1.16 {d1[0]}, [lr], r10
vld1.16 {d3[0]}, [lr], r10
vld1.16 {d5[0]}, [lr], r10

add r12, r11, r8
add r9, r12, r8
LoopK3L4:
vld1.16 {d12}, [r2]
vld1.16 {d0}, [r11]!
vld1.16 {d2}, [r12]!
vmla.f16 d12, d0, d1[0]
vld1.16 {d4}, [r9]!
vmla.f16 d12, d2, d3[0]
vmla.f16 d12, d4, d5[0]
vst1.16 {d12}, [r2]! // dst
subs r6, r6, #1 // length
bne LoopK3L4

subs r5, r5, #3 // k
beq LoopKEnd
sub r2, r2, r8 // dst - step
mov r11, r9
cmp r5, #3
bge LoopK3
b LoopK1

LoopK1:
// Scalar k tail: r11 already sits on the next unit after each pass.
ldr r6, [sp, #8]
vld1.16 {d1[0]}, [lr], r10

LoopK1L4:
vld1.16 {d12}, [r2]
vld1.16 {d0}, [r11]!
vmla.f16 d12, d0, d1[0]
vst1.16 {d12}, [r2]! // dst
subs r6, r6, #1 // length
bne LoopK1L4

subs r5, r5, #1 // k
beq LoopKEnd
sub r2, r2, r8 // dst - step
b LoopK1

LoopKEnd:
add r1, r1, #2 // B[x] : next column coefficient (sizeof(float16_t) bytes)
subs r3, r3, #1
bne LoopW
LoopWEnd:
add r0, r0, r7 // S += one y row (k units)
subs r4, r4, #1
beq LoopHEnd
b LoopH
LoopHEnd:
sub sp, sp, #108
vpop {q4-q7}
pop {r1, r3, r4-r11, pc}
#endif

+ 8
- 4
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/common_func_fp16.c View File

@@ -41,32 +41,36 @@ void PostConvFuncCommFp16(float16_t *out_ptr, const float16_t *src_ptr_, const f


void PostConvFuncFp16C8(const float16_t *c8_out, float16_t *nhwc_out, const float16_t *bias, size_t oc, size_t plane, void PostConvFuncFp16C8(const float16_t *c8_out, float16_t *nhwc_out, const float16_t *bias, size_t oc, size_t plane,
size_t oc_stride, ActType act_type) { size_t oc_stride, ActType act_type) {
#ifdef ENABLE_ARM64
size_t oc8mod = oc % C8NUM; size_t oc8mod = oc % C8NUM;
size_t oc8div = oc - oc8mod; size_t oc8div = oc - oc8mod;
size_t stride_size = oc_stride * sizeof(float16_t); size_t stride_size = oc_stride * sizeof(float16_t);
PostFuncBiasReluC8Fp16(nhwc_out, c8_out, bias, oc8div, oc8mod, plane, stride_size, act_type); PostFuncBiasReluC8Fp16(nhwc_out, c8_out, bias, oc8div, oc8mod, plane, stride_size, act_type);
return;
#else
PostConvFuncCommFp16(nhwc_out, c8_out, bias, oc, plane, oc_stride, plane, act_type, C8NUM);
#endif
} }


void PostConvFuncFp16C4(const float16_t *c4_out, float16_t *nhwc_out, const float16_t *bias, size_t oc, size_t plane, void PostConvFuncFp16C4(const float16_t *c4_out, float16_t *nhwc_out, const float16_t *bias, size_t oc, size_t plane,
size_t plane_stride, ActType act_type) { size_t plane_stride, ActType act_type) {
#ifdef ENABLE_ARM64
size_t oc4mod = oc % C4NUM; size_t oc4mod = oc % C4NUM;
size_t oc4div = oc - oc4mod; size_t oc4div = oc - oc4mod;
size_t stride_size = (plane_stride - plane) * C4NUM * sizeof(float16_t); size_t stride_size = (plane_stride - plane) * C4NUM * sizeof(float16_t);
PostFuncBiasReluC4Fp16(nhwc_out, c4_out, bias, oc4div, oc4mod, plane, stride_size, act_type); PostFuncBiasReluC4Fp16(nhwc_out, c4_out, bias, oc4div, oc4mod, plane, stride_size, act_type);
return;
#else
PostConvFuncCommFp16(nhwc_out, c4_out, bias, oc, plane, oc, plane_stride, act_type, C4NUM);
#endif
} }


#ifdef ENABLE_ARM82_A32 #ifdef ENABLE_ARM82_A32
void PostFuncBiasReluC4Fp16(float16_t *dst, const float16_t *src, const float16_t *bias, size_t oc4div, size_t oc4mod, void PostFuncBiasReluC4Fp16(float16_t *dst, const float16_t *src, const float16_t *bias, size_t oc4div, size_t oc4mod,
size_t plane_size, size_t plane_stride, size_t relu_type) { size_t plane_size, size_t plane_stride, size_t relu_type) {
// TODO(fun): function // TODO(fun): function
return;
} }


void PostFuncBiasReluC8Fp16(float16_t *dst, const float16_t *src, const float16_t *bias, size_t oc8div, size_t oc8mod, void PostFuncBiasReluC8Fp16(float16_t *dst, const float16_t *src, const float16_t *bias, size_t oc8div, size_t oc8mod,
size_t plane_size, size_t stride, size_t relu_type) { size_t plane_size, size_t stride, size_t relu_type) {
// TODO(fun): function // TODO(fun): function
return;
} }
#endif #endif

+ 124
- 9
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/deconv_winograd_fp16.c View File

@@ -41,6 +41,55 @@ void DeConvWgInputPackFp16(float16_t *src_ptr, float16_t *dst_ptr, int channel,
return; return;
} }


#ifdef ENABLE_ARM82_A32
void DeconvWgMergeFp16A32Fun(const float16_t *src_ptr, float16_t *dst_ptr, size_t src_step, size_t dst_step) {
asm volatile(
"mov r7, %[src_ptr]\n"
"mov r8, %[dst_ptr]\n"
"mov r10, r8\n"

"vld1.16 {d0}, [r7], %[src_step]\n"
"vld1.16 {d2}, [r8], %[dst_step]\n"
"vld1.16 {d4}, [r7], %[src_step]\n"
"vld1.16 {d6}, [r8], %[dst_step]\n"
"vadd.f16 d0, d0, d2\n"
"vld1.16 {d8}, [r7], %[src_step]\n"
"vadd.f16 d4, d4, d6\n"
"vst1.16 {d0}, [r10], %[dst_step]\n"
"vst1.16 {d4}, [r10], %[dst_step]\n"

"vld1.16 {d10}, [r8], %[dst_step]\n"
"vld1.16 {d12}, [r7], %[src_step]\n"
"vadd.f16 d8, d8, d10\n"
"vld1.16 {d14}, [r8], %[dst_step]\n"
"vadd.f16 d12, d12, d14\n"
"vld1.16 {d0}, [r7], %[src_step]\n"
"vst1.16 {d8}, [r10], %[dst_step]\n"
"vst1.16 {d12}, [r10], %[dst_step]\n"

"vld1.16 {d2}, [r8], %[dst_step]\n"
"vld1.16 {d4}, [r7], %[src_step]\n"
"vld1.16 {d6}, [r8], %[dst_step]\n"
"vadd.f16 d0, d0, d2\n"
"vadd.f16 d4, d4, d6\n"
"vst1.16 {d0}, [r10], %[dst_step]\n"
"vst1.16 {d4}, [r10], %[dst_step]\n"

"vld1.16 {d8}, [r7], %[src_step]\n"
"vld1.16 {d10}, [r8], %[dst_step]\n"
"vld1.16 {d12}, [r7], %[src_step]\n"
"vld1.16 {d14}, [r8], %[dst_step]\n"
"vadd.f16 d8, d8, d10\n"
"vadd.f16 d12, d12, d14\n"
"vst1.16 {d8}, [r10], %[dst_step]\n"
"vst1.16 {d12}, [r10], %[dst_step]\n"

:
: [ src_ptr ] "r"(src_ptr), [ dst_ptr ] "r"(dst_ptr), [ src_step ] "r"(src_step), [ dst_step ] "r"(dst_step)
: "r7", "r8", "r10", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
}
#endif

void DeConvWgMergeFp16(const float16_t *src, float16_t *dst, size_t src_stride, size_t dst_stride, size_t count) { void DeConvWgMergeFp16(const float16_t *src, float16_t *dst, size_t src_stride, size_t dst_stride, size_t count) {
const float16_t *src_ptr = src; const float16_t *src_ptr = src;
float16_t *dst_ptr = dst; float16_t *dst_ptr = dst;
@@ -94,8 +143,18 @@ void DeConvWgMergeFp16(const float16_t *src, float16_t *dst, size_t src_stride,
: :
: [ src_ptr ] "r"(src_ptr), [ dst_ptr ] "r"(dst_ptr), [ src_step ] "r"(src_step), [ dst_step ] "r"(dst_step) : [ src_ptr ] "r"(src_ptr), [ dst_ptr ] "r"(dst_ptr), [ src_step ] "r"(src_step), [ dst_step ] "r"(dst_step)
: "x7", "x8", "x10", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); : "x7", "x8", "x10", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#elif defined(ENABLE_ARM82_A32)
size_t src_step = src_stride * sizeof(float16_t);
size_t dst_step = dst_stride * sizeof(float16_t);
DeconvWgMergeFp16A32Fun(src_ptr, dst_ptr, src_step, dst_step);
#else #else
// TODO(fun): arm32
for (int j = 0; j < 8; j++) {
const float16_t *s = src_ptr + j * src_stride;
float16_t *d = dst_ptr + j * dst_stride;
for (int k = 0; k < 4; k++) {
d[k] += s[k];
}
}
#endif #endif
src_ptr += C8NUM * src_stride; src_ptr += C8NUM * src_stride;
dst_ptr += C8NUM * dst_stride; dst_ptr += C8NUM * dst_stride;
@@ -377,22 +436,78 @@ void DeconvWgPostFp16(float16_t *tile_out, float16_t *nc4hw4_output, ConvParamet
return; return;
} }


#ifdef ENABLE_ARM82_A32
#ifndef ENABLE_ARM
void WinogradTransLeftFp16(const float16_t *S, const float16_t *B, float16_t *M, size_t w, size_t h, size_t k, void WinogradTransLeftFp16(const float16_t *S, const float16_t *B, float16_t *M, size_t w, size_t h, size_t k,
size_t length) { size_t length) {
// TODO(fun): function
return;
const int unitStep = 4 * length;
for (int y = 0; y < h; ++y) {
float16_t *dstY = M + y * w * unitStep;
for (int x = 0; x < w; ++x) {
float16_t *dstX = dstY + x * unitStep;
const float16_t *srcX = S + x * unitStep;
memset(dstX, 0, unitStep * sizeof(float16_t));
for (int i = 0; i < k; ++i) {
float16_t b = B[i * h + y];
const float16_t *srcY = srcX + i * w * unitStep;
if (0.0f == b) {
continue;
}
for (int j = 0; j < unitStep; ++j) {
dstX[j] += srcY[j] * b;
}
}
}
}
} }


void WinogradTransRightFp16(const float16_t *S, const float16_t *B, float16_t *M, size_t w, size_t h, size_t k, void WinogradTransRightFp16(const float16_t *S, const float16_t *B, float16_t *M, size_t w, size_t h, size_t k,
size_t length) { size_t length) {
// TODO(fun): function
return;
const int unitStep = 4 * length;
for (int y = 0; y < h; ++y) {
float16_t *dstY = M + y * w * unitStep;
const float16_t *srcY = S + y * k * unitStep;

for (int x = 0; x < w; ++x) {
float16_t *dstX = dstY + x * unitStep;
memset(dstX, 0, unitStep * sizeof(float16_t));
for (int i = 0; i < k; ++i) {
const float16_t *srcX = srcY + i * unitStep;
float16_t b = B[i * h + x];
if (0.0f == b) {
continue;
}
for (int j = 0; j < unitStep; ++j) {
dstX[j] += srcX[j] * b;
}
}
}
}
} }


void TiledC4MatmulFp16(float16_t *dst, const float16_t *src, const float16_t *weight, size_t ic4, size_t cal_num,
void TiledC4MatmulFp16(float16_t *dst, const float16_t *src, const float16_t *weight, size_t cal_num, size_t ic4,
size_t oc4) { size_t oc4) {
// TODO(fun): function
return;
int dx, sz, dz;
int src_depth_step = 4 * DECONV_WINOGRAD_DEFAULT_TILE;
for (dz = 0; dz < oc4; ++dz) {
float16_t *dst_z = dst + dz * cal_num;
const float16_t *weight_dz = weight + dz * ic4 * 16;
for (dx = 0; dx < DECONV_WINOGRAD_DEFAULT_TILE; ++dx) {
float16_t *dst_x = dst_z + dx * 4;
dst_x[0] = 0.0f;
dst_x[1] = 0.0f;
dst_x[2] = 0.0f;
dst_x[3] = 0.0f;
const float16_t *src_dx = src + 4 * dx;
for (sz = 0; sz < ic4; ++sz) {
const float16_t *src_z = src_dx + sz * src_depth_step;
const float16_t *weight_z = weight_dz + sz * 16;
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 4; ++j) {
dst_x[j] += src_z[i] * weight_z[4 * i + j];
}
}
}
}
}
} }
#endif #endif

+ 42
- 3
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/matmul_fp16.c View File

@@ -371,8 +371,16 @@ void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const floa
#ifdef ENABLE_ARM82_A32 #ifdef ENABLE_ARM82_A32
void MatVecMulA32Fp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, void MatVecMulA32Fp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type,
int depth, int col) { int depth, int col) {
// TODO(fun): function
return;
for (int ci = 0; ci < col; ci++) {
float value = 0;
for (int di = 0; di < depth; di++) {
value += a[di] * b[ci * depth + di];
}
if (bias != NULL) value += bias[ci];
if (act_type == ActType_Relu6) value = MSMIN(6.0f, value);
if (act_type == ActType_Relu || act_type == ActType_Relu6) value = MSMAX(0.0f, value);
c[ci] = value;
}
} }
#endif #endif


@@ -381,7 +389,7 @@ void MatVecMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const f
#ifdef ENABLE_ARM64 #ifdef ENABLE_ARM64
MatVecMulFp16Neon64(a, b, c, bias, (int)act_type, depth, col); MatVecMulFp16Neon64(a, b, c, bias, (int)act_type, depth, col);
#else #else
MatVecMulA32Fp16(a, b, c, bias, (int)act_type, depth, col);
MatVecMulA32NeonFp16(a, b, c, bias, (int)act_type, depth, col);
#endif #endif
} }


@@ -609,6 +617,23 @@ void RowMajor2Col16MajorFp16(const void *src, float16_t *dst, int row, int col,
return; return;
} }


void RowMajor2Col12MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) {
if (is_fp32_src) {
const float *fp32_src = (const float *)src;
for (int r = 0; r < row; r++) {
for (int c = 0; c < col; c++) {
int r_div12 = r / 12;
int r_mod12 = r % 12;
dst[r_div12 * 12 * col + c * 12 + r_mod12] = (float16_t)(fp32_src[r * col + c]);
}
}
} else {
const float16_t *fp16_src = (const float16_t *)src;
RowMajor2Col12MajorFp16Opt(fp16_src, dst, row, col);
}
return;
}

void RowMajor2Row16MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) { void RowMajor2Row16MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) {
for (int r = 0; r < row; r++) { for (int r = 0; r < row; r++) {
for (int c = 0; c < col; c++) { for (int c = 0; c < col; c++) {
@@ -623,6 +648,20 @@ void RowMajor2Row16MajorFp16(const void *src, float16_t *dst, int row, int col,
} }
} }


void RowMajor2Row12MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) {
for (int r = 0; r < row; r++) {
for (int c = 0; c < col; c++) {
int c_div12 = c / 12;
int c_mod12 = c % 12;
if (is_fp32_src) {
dst[c_div12 * 12 * row + r * 12 + c_mod12] = (float16_t)(((const float *)src)[r * col + c]);
} else {
dst[c_div12 * 12 * row + r * 12 + c_mod12] = ((const float16_t *)src)[r * col + c];
}
}
}
}

void RowMajor2Row8MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) { void RowMajor2Row8MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) {
for (int r = 0; r < row; r++) { for (int r = 0; r < row; r++) {
for (int c = 0; c < col; c++) { for (int c = 0; c < col; c++) {


+ 7
- 3
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/matmul_fp16.h View File

@@ -19,9 +19,6 @@


#include <float.h> #include <float.h>
#include <string.h> #include <string.h>
#ifdef ENABLE_ARM64
#include <arm_neon.h>
#endif
#include "nnacl/errorcode.h" #include "nnacl/errorcode.h"
#include "nnacl/matmul_parameter.h" #include "nnacl/matmul_parameter.h"
#include "nnacl/op_base.h" #include "nnacl/op_base.h"
@@ -63,6 +60,9 @@ void MatMul12x8A32Fp16(const float16_t *a, const float16_t *b, float16_t *dst, c


void MatVecMulA32Fp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, void MatVecMulA32Fp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type,
int depth, int col); int depth, int col);

void MatVecMulA32NeonFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type,
int depth, int col);
#endif #endif


void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type, void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type,
@@ -79,8 +79,12 @@ void RowMajor2Col12MajorFp16Opt(const float16_t *src_ptr, float16_t *dst_ptr, si


void RowMajor2Col16MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src); void RowMajor2Col16MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src);


void RowMajor2Col12MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src);

void RowMajor2Row16MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src); void RowMajor2Row16MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src);


void RowMajor2Row12MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src);

void RowMajor2Row8MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src); void RowMajor2Row8MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src);


void RowMajor2Col8MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src); void RowMajor2Col8MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src);


+ 14
- 1
mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.cc View File

@@ -114,7 +114,12 @@ void MatmulBaseFP16CPUKernel::ResizeParameter() {
params_->row_align_ = 1; params_->row_align_ = 1;
params_->col_align_ = params_->col_; params_->col_align_ = params_->col_;
} else { } else {
params_->row_align_ = UP_ROUND(params_->row_, C16NUM);
#ifdef ENABLE_ARM64
int row_tile = C16NUM;
#else
int row_tile = C12NUM;
#endif
params_->row_align_ = UP_ROUND(params_->row_, row_tile);
params_->col_align_ = UP_ROUND(params_->col_, C8NUM); params_->col_align_ = UP_ROUND(params_->col_, C8NUM);
} }
return; return;
@@ -163,9 +168,17 @@ void MatmulBaseFP16CPUKernel::InitMatrixA(void *src_ptr) {
int8_t *src = int8_src + i * params_->deep_ * params_->row_ * lite::DataTypeSize(src_data_type); int8_t *src = int8_src + i * params_->deep_ * params_->row_ * lite::DataTypeSize(src_data_type);
float16_t *dst = a_pack_ptr_ + i * params_->deep_ * params_->row_align_; float16_t *dst = a_pack_ptr_ + i * params_->deep_ * params_->row_align_;
if (params_->a_transpose_) { if (params_->a_transpose_) {
#ifdef ENABLE_ARM64
RowMajor2Row16MajorFp16(src, dst, params_->deep_, params_->row_, src_data_type == kNumberTypeFloat32); RowMajor2Row16MajorFp16(src, dst, params_->deep_, params_->row_, src_data_type == kNumberTypeFloat32);
#else
RowMajor2Row12MajorFp16(src, dst, params_->deep_, params_->row_, src_data_type == kNumberTypeFloat32);
#endif
} else { } else {
#ifdef ENABLE_ARM64
RowMajor2Col16MajorFp16(src, dst, params_->row_, params_->deep_, src_data_type == kNumberTypeFloat32); RowMajor2Col16MajorFp16(src, dst, params_->row_, params_->deep_, src_data_type == kNumberTypeFloat32);
#else
RowMajor2Col12MajorFp16(src, dst, params_->row_, params_->deep_, src_data_type == kNumberTypeFloat32);
#endif
} }
} }
return; return;


Loading…
Cancel
Save