Browse Source

!7527 [MS][LITE][Develop]add fp32 deconv kernels

Merge pull request !7527 from lixian/master
tags/v1.1.0
mindspore-ci-bot Gitee 5 years ago
parent
commit
9b2b062642
12 changed files with 1211 additions and 31 deletions
  1. +198
    -0
      mindspore/lite/nnacl/assembly/arm32/TiledC4MatmulFp32.S
  2. +218
    -0
      mindspore/lite/nnacl/assembly/arm32/WinogradTransLeft.S
  3. +208
    -0
      mindspore/lite/nnacl/assembly/arm32/WinogradTransRight.S
  4. +267
    -0
      mindspore/lite/nnacl/assembly/arm64/TiledC4MatmulFp32.S
  5. +147
    -0
      mindspore/lite/nnacl/assembly/arm64/WinogradTransLeft.S
  6. +144
    -0
      mindspore/lite/nnacl/assembly/arm64/WinogradTransRight.S
  7. +4
    -2
      mindspore/lite/nnacl/fp32/common_func.c
  8. +2
    -2
      mindspore/lite/nnacl/fp32/common_func.h
  9. +20
    -25
      mindspore/lite/nnacl/fp32/deconv_winograd.c
  10. +1
    -0
      mindspore/lite/nnacl/fp32/deconv_winograd.h
  11. +1
    -1
      mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.cc
  12. +1
    -1
      mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_winograd.cc

+ 198
- 0
mindspore/lite/nnacl/assembly/arm32/TiledC4MatmulFp32.S View File

@@ -0,0 +1,198 @@
#ifdef ENABLE_ARM32
.text
.align 5
.global TiledC4MatmulFp32
#ifndef __APPLE__
.type TiledC4MatmulFp32, %function
#endif

TiledC4MatmulFp32:
//void TiledC4MatmulFp32(float* dst, const float* src, const float* weight, size_t cal_num, size_t ic4, size_t oc4)
// Per oc4 block: accumulates, over ic4 steps, eight 4-float source vectors
// against a 4x4 weight block and stores eight 4-float results (128 bytes).
// Argument locations on entry (32-bit ARM):
//r0: dst
//r1: src
//r2: weight
//r3: cal_num (dst stride between oc4 blocks, in floats)
//r4: ic4 (5th argument, loaded from the stack below)
//r5: oc4 (6th argument, loaded from the stack below)
// NOTE(review): there is no guard for ic4 == 0 or oc4 == 0 - confirm callers never pass 0.

// six core registers are pushed (24 bytes), so the stack arguments sit at sp+24 / sp+28
push {r4-r8, lr}
ldr r4, [sp, #24]
ldr r5, [sp, #28]
//convert the dst step from floats to bytes (multiply by sizeof(float))
mov r8, #4
mul r3, r8, r3

// q4-q7 are used below for weights, so they are saved here and restored before return
vpush {q4-q7}

LoopOc:
// r6 = src start (restored to r1 after each oc4 block), r8 = dst start of this block
mov r6, r1
mov r8, r0
// r7 = ic4 - 1; the flags set here are consumed by the beq LoopIcEnd further down
subs r7, r4, #1
// q0-q3: source vectors 0-3, q4-q7: the four rows of the first 4x4 weight block
vld1.32 {q0, q1}, [r1]!
vld1.32 {q2, q3}, [r1]!
vld1.32 {q4, q5}, [r2]!
vld1.32 {q6, q7}, [r2]!

// first ic4 step initialises the accumulators with vmul: q8-q11 <- source vectors 0-3
vmul.f32 q8, q4, d0[0]
vmul.f32 q9, q4, d2[0]
vmul.f32 q10, q4, d4[0]
vmul.f32 q11, q4, d6[0]

vmla.f32 q8, q5, d0[1]
vmla.f32 q9, q5, d2[1]
vmla.f32 q10, q5, d4[1]
vmla.f32 q11, q5, d6[1]

vmla.f32 q8, q6, d1[0]
vmla.f32 q9, q6, d3[0]
vmla.f32 q10, q6, d5[0]
vmla.f32 q11, q6, d7[0]

vmla.f32 q8, q7, d1[1]
vmla.f32 q9, q7, d3[1]
vmla.f32 q10, q7, d5[1]
vmla.f32 q11, q7, d7[1]

// source vectors 4-7 of the same ic4 step
vld1.32 {q0, q1}, [r1]!
vld1.32 {q2, q3}, [r1]!

// q12-q15 <- source vectors 4-7
vmul.f32 q12, q4, d0[0]
vmul.f32 q13, q4, d2[0]
vmul.f32 q14, q4, d4[0]
vmul.f32 q15, q4, d6[0]

vmla.f32 q12, q5, d0[1]
vmla.f32 q13, q5, d2[1]
vmla.f32 q14, q5, d4[1]
vmla.f32 q15, q5, d6[1]

vmla.f32 q12, q6, d1[0]
vmla.f32 q13, q6, d3[0]
vmla.f32 q14, q6, d5[0]
vmla.f32 q15, q6, d7[0]

vmla.f32 q12, q7, d1[1]
vmla.f32 q13, q7, d3[1]
vmla.f32 q14, q7, d5[1]
vmla.f32 q15, q7, d7[1]
// flags still come from "subs r7, r4, #1": ic4 == 1 means nothing is left to accumulate
beq LoopIcEnd

subs r7, r7, #1

// pre-load the first half of the next weight block and source vectors 0-3,
// and start the next ic4 step before entering (or skipping) the steady-state loop
vld1.32 {q4, q5}, [r2]!
vld1.32 {q0, q1}, [r1]!
vld1.32 {q2, q3}, [r1]!

vmla.f32 q8, q4, d0[0]
vmla.f32 q9, q4, d2[0]
// flags from the subs above: exactly one ic4 step left, finish it in LoopIcEndHalf
beq LoopIcEndHalf

LoopIc:
// steady state: finishes the current ic4 step and begins the next one (loads are interleaved)
vmla.f32 q10, q4, d4[0]
vmla.f32 q11, q4, d6[0]

vmla.f32 q8, q5, d0[1]
vmla.f32 q9, q5, d2[1]
vld1.32 {q6, q7}, [r2]!
vmla.f32 q10, q5, d4[1]
vmla.f32 q11, q5, d6[1]

vmla.f32 q8, q6, d1[0]
vmla.f32 q9, q6, d3[0]
vmla.f32 q10, q6, d5[0]
vmla.f32 q11, q6, d7[0]

vmla.f32 q8, q7, d1[1]
vmla.f32 q9, q7, d3[1]
vmla.f32 q10, q7, d5[1]
vld1.32 {q0, q1}, [r1]!
vmla.f32 q11, q7, d7[1]

vld1.32 {q2, q3}, [r1]!

vmla.f32 q12, q4, d0[0]
vmla.f32 q13, q4, d2[0]
vmla.f32 q14, q4, d4[0]
vmla.f32 q15, q4, d6[0]

vmla.f32 q12, q5, d0[1]
vmla.f32 q13, q5, d2[1]
vmla.f32 q14, q5, d4[1]
vmla.f32 q15, q5, d6[1]

vmla.f32 q12, q6, d1[0]
vmla.f32 q13, q6, d3[0]
vmla.f32 q14, q6, d5[0]
vld1.32 {q4, q5}, [r2]!
vmla.f32 q15, q6, d7[0]

vmla.f32 q12, q7, d1[1]
vmla.f32 q13, q7, d3[1]
vmla.f32 q14, q7, d5[1]
vld1.32 {q0, q1}, [r1]!
vmla.f32 q15, q7, d7[1]

vld1.32 {q2, q3}, [r1]!

vmla.f32 q8, q4, d0[0]
vmla.f32 q9, q4, d2[0]

subs r7, r7, #1
bne LoopIc
LoopIcEndHalf:
// drain: completes the ic4 step that was already started, without loading another one
vmla.f32 q10, q4, d4[0]
vmla.f32 q11, q4, d6[0]

vmla.f32 q8, q5, d0[1]
vmla.f32 q9, q5, d2[1]
vld1.32 {q6, q7}, [r2]!
vmla.f32 q10, q5, d4[1]
vmla.f32 q11, q5, d6[1]

vmla.f32 q8, q6, d1[0]
vmla.f32 q9, q6, d3[0]
vmla.f32 q10, q6, d5[0]
vmla.f32 q11, q6, d7[0]

vmla.f32 q8, q7, d1[1]
vmla.f32 q9, q7, d3[1]
vmla.f32 q10, q7, d5[1]
vld1.32 {q0, q1}, [r1]!
vmla.f32 q11, q7, d7[1]

vld1.32 {q2, q3}, [r1]!

vmla.f32 q12, q4, d0[0]
vmla.f32 q13, q4, d2[0]
vmla.f32 q14, q4, d4[0]
vmla.f32 q15, q4, d6[0]

vmla.f32 q12, q5, d0[1]
vmla.f32 q13, q5, d2[1]
vmla.f32 q14, q5, d4[1]
vmla.f32 q15, q5, d6[1]

vmla.f32 q12, q6, d1[0]
vmla.f32 q13, q6, d3[0]
vmla.f32 q14, q6, d5[0]
vmla.f32 q15, q6, d7[0]

vmla.f32 q12, q7, d1[1]
vmla.f32 q13, q7, d3[1]
vmla.f32 q14, q7, d5[1]
vmla.f32 q15, q7, d7[1]
LoopIcEnd:
// write the eight accumulated 4-float results of this oc4 block (128 bytes)
vst1.32 {q8, q9}, [r0]!
vst1.32 {q10, q11}, [r0]!
vst1.32 {q12, q13}, [r0]!
vst1.32 {q14, q15}, [r0]!
// rewind src for the next oc4 block; r2 (weight) keeps advancing
mov r1, r6

subs r5, r5, #1
// next dst block = start of this block + cal_num * sizeof(float); add leaves the flags alone
add r0, r8, r3
bne LoopOc

vpop {q4-q7}
pop {r4-r8, pc}

#endif

+ 218
- 0
mindspore/lite/nnacl/assembly/arm32/WinogradTransLeft.S View File

@@ -0,0 +1,218 @@
#ifdef ENABLE_ARM32

.text
.align 5
.global WinogradTransLeft
#ifndef __APPLE__
.type WinogradTransLeft, %function
#endif

//void WinogradTransLeft(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length);
// For every output row y (h of them) and column x (w of them):
//   M[y][x][0 .. 4*length) = sum over i < k of B[i*h + y] * S[i][x][0 .. 4*length)
// Argument locations on entry (32-bit ARM):
//r0: S
//r1: B
//r2: M
//r3: w
//r4: h      (5th argument, loaded from the stack below)
//r5: k      (6th argument, loaded from the stack below)
//r6: length (7th argument, loaded from the stack below)
// Scratch NEON registers: q0, q1, q8, q9, q12-q15 (q15 also parks core registers).
// NOTE(review): length == 0, k == 0, w == 0 or h == 0 are not guarded - confirm callers never pass 0.
WinogradTransLeft:
// nine core registers are pushed (36 bytes), so the stack arguments start at sp+36
push {r4-r11, lr}
ldr r4, [sp, #36]
ldr r5, [sp, #40]
ldr r6, [sp, #44]

mov r8, #16 // 4 * sizeof(float)
mul r8, r6, r8 // r8 = bytes in one unit (4 * length floats)
mul r9, r3, r8
sub r9, r9, r8 // r9 = (w - 1) * unit bytes: rest of an S row after one unit was consumed
add r7, r9, r8 // step for S
mov r10, #4
mul r10, r4, r10 // step for B

LoopH:
push {r0, r3}
LoopW:
push {r0, r1}
// clear the destination unit, then rewind r2 to its start
vmov.i32 q14, #0
mov r11, r6
InitZero:
vst1.32 {q14}, [r2]!
subs r11, r11, #1
bne InitZero

sub r2, r2, r8
mov r12, r5 // r12 = number of k terms still to accumulate

LoopKStart7:
cmp r12, #7
blt LoopKStart4
LoopK7:
// r3-r7 are reused as row pointers below, which destroys w, h, k, length (r6)
// and the S row step (r7). They are saved and restored on EVERY pass so that a
// second pass (k >= 14) still sees the real length and step.
push {r3-r7}
vld1.32 {d0[0]}, [r1], r10
vld1.32 {d0[1]}, [r1], r10
vld1.32 {d1[0]}, [r1], r10
vld1.32 {d1[1]}, [r1], r10
vld1.32 {d2[0]}, [r1], r10
vld1.32 {d2[1]}, [r1], r10
vld1.32 {d3[0]}, [r1], r10
mov r11, r6
vmov.32 d30[0], r1 // park the B pointer, r1 becomes a row pointer

// seven consecutive S rows: r0, r1, r3, r4, r5, r6, r7
add r1, r0, r7
add r3, r1, r7
add r4, r3, r7
add r5, r4, r7
add r6, r5, r7
add r7, r6, r7

LoopLength7:
vld1.32 {q8}, [r2]
vld1.32 {q12}, [r0]!
vmla.f32 q8, q12, d0[0]
vld1.32 {q13}, [r1]!
vmul.f32 q9, q13, d0[1]
vld1.32 {q12}, [r3]!
vmla.f32 q8, q12, d1[0]
vld1.32 {q13}, [r4]!
vmla.f32 q9, q13, d1[1]
vld1.32 {q12}, [r5]!
vmla.f32 q8, q12, d2[0]
vld1.32 {q13}, [r6]!
vmla.f32 q9, q13, d2[1]
vld1.32 {q12}, [r7]!
vmla.f32 q8, q12, d3[0]

vadd.f32 q9, q8, q9
vst1.32 {q9}, [r2]!
subs r11, r11, #1
bne LoopLength7

sub r2, r2, r8
sub r12, r12, #7
add r0, r7, r9 // r7 is one unit past row 6, so + r9 lands on row 7
vmov.32 r1, d30[0]
pop {r3-r7} // pop does not touch the flags and r0/r1 are not in the list
cmp r12, #7
bge LoopK7

LoopKStart4:
cmp r12, #4
blt LoopKStart3
// r3 (w counter) and r4 (h counter) become row pointers, park them in q15
vmov.32 d30[1], r3
vmov.32 d31[0], r4
LoopK4:
vld1.32 {d0[0]}, [r1], r10
vld1.32 {d0[1]}, [r1], r10
vld1.32 {d1[0]}, [r1], r10
vld1.32 {d1[1]}, [r1], r10
mov r11, r6
vmov.32 d30[0], r1

add r1, r0, r7
add r3, r1, r7
add r4, r3, r7

LoopLength4:
vld1.32 {q8}, [r2]
vld1.32 {q12}, [r0]!
vmla.f32 q8, q12, d0[0]
vld1.32 {q13}, [r1]!
vmul.f32 q9, q13, d0[1]
vld1.32 {q12}, [r3]!
vmla.f32 q8, q12, d1[0]
vld1.32 {q13}, [r4]!
vmla.f32 q9, q13, d1[1]

vadd.f32 q9, q8, q9
vst1.32 {q9}, [r2]!
subs r11, r11, #1
bne LoopLength4

sub r2, r2, r8
sub r12, r12, #4
add r0, r4, r9
vmov.32 r1, d30[0]
cmp r12, #4
bge LoopK4

vmov.32 r3, d30[1]
vmov.32 r4, d31[0]

LoopKStart3:
cmp r12, #3
blt LoopKStart
vmov.32 d30[1], r3
vmov.32 d31[0], r4
LoopK3:
vld1.32 {d0[0]}, [r1], r10
vld1.32 {d0[1]}, [r1], r10
vld1.32 {d1[0]}, [r1], r10
mov r11, r6
vmov.32 d30[0], r1

add r1, r0, r7
add r3, r1, r7

LoopLength3:
vld1.32 {q8}, [r2]
vld1.32 {q12}, [r0]!
vmla.f32 q8, q12, d0[0]
vld1.32 {q13}, [r1]!
vmul.f32 q9, q13, d0[1]
vld1.32 {q12}, [r3]!
vmla.f32 q8, q12, d1[0]

vadd.f32 q9, q8, q9
vst1.32 {q9}, [r2]!
subs r11, r11, #1
bne LoopLength3

sub r2, r2, r8
sub r12, r12, #3
add r0, r3, r9
vmov.32 r1, d30[0]
cmp r12, #3
bge LoopK3

vmov.32 r3, d30[1]
vmov.32 r4, d31[0]

LoopKStart:
cmp r12, #0
beq LoopKEnd

LoopK:
// remaining terms one at a time: broadcast one B coefficient across q15
vld1.32 {d30[0]}, [r1], r10

vdup.32 q15, d30[0]
mov r11, r6
LoopLength:
vld1.32 {q0}, [r2]
vld1.32 {q1}, [r0]!
vmla.f32 q0, q1, q15

vst1.32 {q0}, [r2]!
subs r11, r11, #1
bne LoopLength
subs r12, r12, #1

// sub/add without the s suffix keep the flags of the subs above for the bne
sub r2, r2, r8
add r0, r0, r9
bne LoopK

LoopKEnd:
pop {r0, r1}
subs r3, r3, #1
add r0, r0, r8 // next S column
add r2, r2, r8 // next M unit
bne LoopW

pop {r0, r3}
add r1, r1, #4 //sizeof(float)
subs r4, r4, #1
bne LoopH

pop {r4-r11, pc}

#endif

+ 208
- 0
mindspore/lite/nnacl/assembly/arm32/WinogradTransRight.S View File

@@ -0,0 +1,208 @@
#ifdef ENABLE_ARM32

.text
.align 5
.global WinogradTransRight
#ifndef __APPLE__
.type WinogradTransRight, %function
#endif

//void WinogradTransRight(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length);
// For every output row y (h of them) and column x (w of them):
//   M[y][x][0 .. 4*length) = sum over i < k of B[i*h + x] * S[y][i][0 .. 4*length)
// Argument locations on entry (32-bit ARM):
//r0: S
//r1: B
//r2: M
//r3: w
//r4: h      (5th argument, loaded from the stack below)
//r5: k      (6th argument, loaded from the stack below)
//r6: length (7th argument, loaded from the stack below)
// Scratch NEON registers: q0, q1, q8, q9, q12-q15 (q15 also parks core registers).
// NOTE(review): length == 0, k == 0, w == 0 or h == 0 are not guarded - confirm callers never pass 0.
WinogradTransRight:
// nine core registers are pushed (36 bytes), so the stack arguments start at sp+36
push {r4-r11, lr}
ldr r4, [sp, #36]
ldr r5, [sp, #40]
ldr r6, [sp, #44]

mov r8, #16 // 4 * sizeof(float)
mul r8, r6, r8 // r8 = bytes in one unit (4 * length floats)
mul r9, r5, r8 // step for S
mov r10, #4
mul r10, r4, r10 // step for B

LoopH:
push {r1, r3}
LoopW:
push {r0, r1}
// clear the destination unit, then rewind r2 to its start
vmov.i32 q14, #0
mov r11, r6
InitZero:
vst1.32 {q14}, [r2]!
subs r11, r11, #1
bne InitZero

sub r2, r2, r8
mov r12, r5 // r12 = number of k terms still to accumulate
LoopKStart7:
cmp r12, #7
blt LoopKStart4
LoopK7:
// r3-r7 are reused as unit pointers below, which destroys w, h, k and length (r6).
// They are saved and restored on EVERY pass so that a second pass (k >= 14)
// still reads the real length into r11.
push {r3-r7}
vld1.32 {d0[0]}, [r1], r10
vld1.32 {d0[1]}, [r1], r10
vld1.32 {d1[0]}, [r1], r10
vld1.32 {d1[1]}, [r1], r10
vld1.32 {d2[0]}, [r1], r10
vld1.32 {d2[1]}, [r1], r10
vld1.32 {d3[0]}, [r1], r10
mov r11, r6
vmov.32 d30[0], r1 // park the B pointer, r1 becomes a unit pointer

// seven consecutive S units: r0, r1, r3, r4, r5, r6, r7
add r1, r0, r8
add r3, r1, r8
add r4, r3, r8
add r5, r4, r8
add r6, r5, r8
add r7, r6, r8
LoopLength7:
vld1.32 {q8}, [r2]
vld1.32 {q12}, [r0]!
vmla.f32 q8, q12, d0[0]
vld1.32 {q13}, [r1]!
vmul.f32 q9, q13, d0[1]
vld1.32 {q12}, [r3]!
vmla.f32 q8, q12, d1[0]
vld1.32 {q13}, [r4]!
vmla.f32 q9, q13, d1[1]
vld1.32 {q12}, [r5]!
vmla.f32 q8, q12, d2[0]
vld1.32 {q13}, [r6]!
vmla.f32 q9, q13, d2[1]
vld1.32 {q12}, [r7]!
vmla.f32 q8, q12, d3[0]

vadd.f32 q9, q8, q9
vst1.32 {q9}, [r2]!
subs r11, r11, #1
bne LoopLength7

sub r2, r2, r8
sub r12, r12, #7
mov r0, r7 // r7 has advanced one unit, so it already points at unit 7
vmov.32 r1, d30[0]
pop {r3-r7} // pop does not touch the flags and r0/r1 are not in the list
cmp r12, #7
bge LoopK7

LoopKStart4:
cmp r12, #4
blt LoopKStart3
// r3 (w counter) and r4 (h counter) become unit pointers, park them in q15
vmov.32 d30[1], r3
vmov.32 d31[0], r4
LoopK4:
vld1.32 {d0[0]}, [r1], r10
vld1.32 {d0[1]}, [r1], r10
vld1.32 {d1[0]}, [r1], r10
vld1.32 {d1[1]}, [r1], r10
mov r11, r6
vmov.32 d30[0], r1

add r1, r0, r8
add r3, r1, r8
add r4, r3, r8

LoopLength4:
vld1.32 {q8}, [r2]
vld1.32 {q12}, [r0]!
vmla.f32 q8, q12, d0[0]
vld1.32 {q13}, [r1]!
vmul.f32 q9, q13, d0[1]
vld1.32 {q12}, [r3]!
vmla.f32 q8, q12, d1[0]
vld1.32 {q13}, [r4]!
vmla.f32 q9, q13, d1[1]

vadd.f32 q9, q8, q9
vst1.32 {q9}, [r2]!
subs r11, r11, #1
bne LoopLength4

sub r2, r2, r8
sub r12, r12, #4
mov r0, r4
vmov.32 r1, d30[0]
cmp r12, #4
bge LoopK4

vmov.32 r3, d30[1]
vmov.32 r4, d31[0]

LoopKStart3:
cmp r12, #3
blt LoopKStart
vmov.32 d30[1], r3
LoopK3:
vld1.32 {d0[0]}, [r1], r10
vld1.32 {d0[1]}, [r1], r10
vld1.32 {d1[0]}, [r1], r10
mov r11, r6
vmov.32 d30[0], r1

add r1, r0, r8
add r3, r1, r8

LoopLength3:
vld1.32 {q8}, [r2]
vld1.32 {q12}, [r0]!
vmla.f32 q8, q12, d0[0]
vld1.32 {q13}, [r1]!
vmul.f32 q9, q13, d0[1]
vld1.32 {q12}, [r3]!
vmla.f32 q8, q12, d1[0]

vadd.f32 q9, q8, q9
vst1.32 {q9}, [r2]!
subs r11, r11, #1
bne LoopLength3

sub r2, r2, r8
sub r12, r12, #3
mov r0, r3
vmov.32 r1, d30[0]
cmp r12, #3
bge LoopK3

vmov.32 r3, d30[1]

LoopKStart:
cmp r12, #0
beq LoopKEnd
LoopK:
// remaining terms one at a time: broadcast one B coefficient across q15
vld1.32 {d30[0]}, [r1], r10
vdup.32 q15, d30[0]
mov r11, r6
LoopLength:
vld1.32 {q0}, [r2]
vld1.32 {q1}, [r0]!
vmla.f32 q0, q1, q15

vst1.32 {q0}, [r2]!
subs r11, r11, #1
bne LoopLength

subs r12, r12, #1
sub r2, r2, r8 // no s suffix: keeps the flags of the subs above for the bne
bne LoopK
LoopKEnd:
pop {r0, r1}
subs r3, r3, #1
add r2, r2, r8 // next M unit
add r1, r1, #4 //sizeof(float)
bne LoopW

pop {r1, r3}
add r0, r0, r9 // next S row
subs r4, r4, #1
bne LoopH

pop {r4-r11, pc}

#endif

+ 267
- 0
mindspore/lite/nnacl/assembly/arm64/TiledC4MatmulFp32.S View File

@@ -0,0 +1,267 @@
#ifdef __aarch64__

.text
.align 5
.global TiledC4MatmulFp32
#ifndef __APPLE__
.type TiledC4MatmulFp32, %function
#endif

TiledC4MatmulFp32:
//void TiledC4MatmulFp32(float* dst, const float* src, const float* weight, size_t cal_num, size_t ic4, size_t oc4)
// Per oc4 block: accumulates, over ic4 steps, eight 4-float source vectors
// against a 4x4 weight block and stores eight 4-float results (128 bytes).
// Two oc4 blocks are processed per pass while at least two remain; a single
// leftover block goes through LoopOcHalf.
//x0: dst
//x1: src
//x2: weight
//x3: cal_num (dst stride between oc4 blocks, in floats)
//x4: ic4
//x5: oc4
// NOTE(review): ic4 == 0 and oc4 == 0 are not guarded - confirm callers never pass 0.

// v8-v15 are callee-saved. Keep sp lowered for the whole function so the spill
// area stays inside the live stack frame (data below sp may be overwritten
// asynchronously, e.g. by a signal frame).
sub sp, sp, #128
mov x7, sp
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x7], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x7]

mov x7, #4 //sizeof(float)
mul x3, x3, x7 // x3 = dst stride in bytes
mov x7, #64
mul x10, x4, x7 // x10 = weight bytes of one oc4 block (ic4 * 16 floats)

cmp x5, #2
blt LoopOcHalf
LoopOc:
mov x8, x1 // x8 walks src, x1 keeps the start for the next pass
subs x9, x4, #1 // flags are consumed by the beq LoopIcEnd below

add x6, x2, x10 // x6 = weights of the second oc4 block of this pair
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x2], #64
// first ic4 step initialises the accumulators with fmul:
// v16-v23 = first oc4 block, v24-v31 = second oc4 block
fmul v16.4s, v8.4s, v0.s[0]
fmul v17.4s, v8.4s, v1.s[0]
ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x8], #64
fmul v18.4s, v8.4s, v2.s[0]
fmul v19.4s, v8.4s, v3.s[0]
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x6], #64
fmul v20.4s, v8.4s, v4.s[0]
fmul v21.4s, v8.4s, v5.s[0]
fmul v22.4s, v8.4s, v6.s[0]
fmul v23.4s, v8.4s, v7.s[0]
fmul v24.4s, v12.4s, v0.s[0]
fmul v25.4s, v12.4s, v1.s[0]
fmul v26.4s, v12.4s, v2.s[0]
fmul v27.4s, v12.4s, v3.s[0]
fmul v28.4s, v12.4s, v4.s[0]
fmul v29.4s, v12.4s, v5.s[0]
fmul v30.4s, v12.4s, v6.s[0]
fmul v31.4s, v12.4s, v7.s[0]

beq LoopIcEnd
LoopIc:
// prefetch upcoming weights of both oc4 blocks and upcoming source data
add x2, x2, #128
prfm pldl1keep, [x2]
prfm pldl1keep, [x2, x10]
sub x2, x2, #128
prfm pldl1keep, [x8, #128]
prfm pldl1keep, [x8, #192]

fmla v16.4s, v9.4s, v0.s[1]
fmla v17.4s, v9.4s, v1.s[1]
fmla v18.4s, v9.4s, v2.s[1]
fmla v19.4s, v9.4s, v3.s[1]
fmla v20.4s, v9.4s, v4.s[1]
fmla v21.4s, v9.4s, v5.s[1]
fmla v22.4s, v9.4s, v6.s[1]
fmla v23.4s, v9.4s, v7.s[1]
fmla v24.4s, v13.4s, v0.s[1]
fmla v25.4s, v13.4s, v1.s[1]
fmla v26.4s, v13.4s, v2.s[1]
fmla v27.4s, v13.4s, v3.s[1]
fmla v28.4s, v13.4s, v4.s[1]
fmla v29.4s, v13.4s, v5.s[1]
fmla v30.4s, v13.4s, v6.s[1]
fmla v31.4s, v13.4s, v7.s[1]

fmla v16.4s, v10.4s, v0.s[2]
fmla v17.4s, v10.4s, v1.s[2]
fmla v18.4s, v10.4s, v2.s[2]
fmla v19.4s, v10.4s, v3.s[2]
fmla v20.4s, v10.4s, v4.s[2]
fmla v21.4s, v10.4s, v5.s[2]
fmla v22.4s, v10.4s, v6.s[2]
fmla v23.4s, v10.4s, v7.s[2]
fmla v24.4s, v14.4s, v0.s[2]
fmla v25.4s, v14.4s, v1.s[2]
fmla v26.4s, v14.4s, v2.s[2]
fmla v27.4s, v14.4s, v3.s[2]
fmla v28.4s, v14.4s, v4.s[2]
fmla v29.4s, v14.4s, v5.s[2]
fmla v30.4s, v14.4s, v6.s[2]
fmla v31.4s, v14.4s, v7.s[2]

fmla v16.4s, v11.4s, v0.s[3]
fmla v17.4s, v11.4s, v1.s[3]
fmla v18.4s, v11.4s, v2.s[3]
fmla v19.4s, v11.4s, v3.s[3]
fmla v20.4s, v11.4s, v4.s[3]
fmla v21.4s, v11.4s, v5.s[3]
fmla v22.4s, v11.4s, v6.s[3]
fmla v23.4s, v11.4s, v7.s[3]
fmla v24.4s, v15.4s, v0.s[3]
fmla v25.4s, v15.4s, v1.s[3]
fmla v26.4s, v15.4s, v2.s[3]
fmla v27.4s, v15.4s, v3.s[3]
fmla v28.4s, v15.4s, v4.s[3]
fmla v29.4s, v15.4s, v5.s[3]
fmla v30.4s, v15.4s, v6.s[3]
fmla v31.4s, v15.4s, v7.s[3]

// load the next ic4 step and apply its first weight row
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x2], #64
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64
fmla v16.4s, v8.4s, v0.s[0]
fmla v17.4s, v8.4s, v1.s[0]
ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x8], #64
fmla v18.4s, v8.4s, v2.s[0]
fmla v19.4s, v8.4s, v3.s[0]
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x6], #64
fmla v20.4s, v8.4s, v4.s[0]
fmla v21.4s, v8.4s, v5.s[0]
fmla v22.4s, v8.4s, v6.s[0]
fmla v23.4s, v8.4s, v7.s[0]
fmla v24.4s, v12.4s, v0.s[0]
fmla v25.4s, v12.4s, v1.s[0]
fmla v26.4s, v12.4s, v2.s[0]
fmla v27.4s, v12.4s, v3.s[0]
fmla v28.4s, v12.4s, v4.s[0]
fmla v29.4s, v12.4s, v5.s[0]
fmla v30.4s, v12.4s, v6.s[0]
fmla v31.4s, v12.4s, v7.s[0]

subs x9, x9, #1
bne LoopIc

LoopIcEnd:
// drain: weight rows 1-3 of the last loaded ic4 step, interleaved with the stores
fmla v16.4s, v9.4s, v0.s[1]
fmla v17.4s, v9.4s, v1.s[1]
fmla v18.4s, v9.4s, v2.s[1]
fmla v19.4s, v9.4s, v3.s[1]
fmla v20.4s, v9.4s, v4.s[1]
fmla v21.4s, v9.4s, v5.s[1]
fmla v22.4s, v9.4s, v6.s[1]
fmla v23.4s, v9.4s, v7.s[1]
fmla v24.4s, v13.4s, v0.s[1]
fmla v25.4s, v13.4s, v1.s[1]
fmla v26.4s, v13.4s, v2.s[1]
fmla v27.4s, v13.4s, v3.s[1]
fmla v28.4s, v13.4s, v4.s[1]
fmla v29.4s, v13.4s, v5.s[1]
fmla v30.4s, v13.4s, v6.s[1]
fmla v31.4s, v13.4s, v7.s[1]

fmla v16.4s, v10.4s, v0.s[2]
fmla v17.4s, v10.4s, v1.s[2]
fmla v18.4s, v10.4s, v2.s[2]
fmla v19.4s, v10.4s, v3.s[2]
fmla v20.4s, v10.4s, v4.s[2]
fmla v21.4s, v10.4s, v5.s[2]
fmla v22.4s, v10.4s, v6.s[2]
fmla v23.4s, v10.4s, v7.s[2]
fmla v24.4s, v14.4s, v0.s[2]
fmla v25.4s, v14.4s, v1.s[2]
fmla v26.4s, v14.4s, v2.s[2]
fmla v27.4s, v14.4s, v3.s[2]
fmla v28.4s, v14.4s, v4.s[2]
fmla v29.4s, v14.4s, v5.s[2]
fmla v30.4s, v14.4s, v6.s[2]
fmla v31.4s, v14.4s, v7.s[2]

add x7, x0, #64 // x7 = second 64-byte half of each 128-byte result block

fmla v16.4s, v11.4s, v0.s[3]
fmla v17.4s, v11.4s, v1.s[3]
fmla v18.4s, v11.4s, v2.s[3]
fmla v19.4s, v11.4s, v3.s[3]
fmla v20.4s, v11.4s, v4.s[3]
fmla v21.4s, v11.4s, v5.s[3]
fmla v22.4s, v11.4s, v6.s[3]
fmla v23.4s, v11.4s, v7.s[3]
fmla v24.4s, v15.4s, v0.s[3]
fmla v25.4s, v15.4s, v1.s[3]
fmla v26.4s, v15.4s, v2.s[3]
fmla v27.4s, v15.4s, v3.s[3]
fmla v28.4s, v15.4s, v4.s[3]
st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], x3
fmla v29.4s, v15.4s, v5.s[3]
st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x7], x3
fmla v30.4s, v15.4s, v6.s[3]
st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], x3
mov x2, x6 // weights of the next pair start where the second block ended
fmla v31.4s, v15.4s, v7.s[3]
st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x7]

subs x5, x5, #2
beq LoopOcEnd
cmp x5, #2
bge LoopOc

LoopOcHalf:
// single remaining oc4 block: zero-initialised accumulators, no pipelining
mov x8, x1
mov x9, x4
dup v16.4s, wzr
dup v17.4s, wzr
dup v18.4s, wzr
dup v19.4s, wzr
dup v20.4s, wzr
dup v21.4s, wzr
dup v22.4s, wzr
dup v23.4s, wzr

LoopIcHalf:
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x2], #64
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64
fmla v16.4s, v8.4s, v0.s[0]
fmla v17.4s, v8.4s, v1.s[0]
ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x8], #64
fmla v18.4s, v8.4s, v2.s[0]
fmla v19.4s, v8.4s, v3.s[0]
fmla v20.4s, v8.4s, v4.s[0]
fmla v21.4s, v8.4s, v5.s[0]
fmla v22.4s, v8.4s, v6.s[0]
fmla v23.4s, v8.4s, v7.s[0]

fmla v16.4s, v9.4s, v0.s[1]
fmla v17.4s, v9.4s, v1.s[1]
fmla v18.4s, v9.4s, v2.s[1]
fmla v19.4s, v9.4s, v3.s[1]
fmla v20.4s, v9.4s, v4.s[1]
fmla v21.4s, v9.4s, v5.s[1]
fmla v22.4s, v9.4s, v6.s[1]
fmla v23.4s, v9.4s, v7.s[1]

fmla v16.4s, v10.4s, v0.s[2]
fmla v17.4s, v10.4s, v1.s[2]
fmla v18.4s, v10.4s, v2.s[2]
fmla v19.4s, v10.4s, v3.s[2]
fmla v20.4s, v10.4s, v4.s[2]
fmla v21.4s, v10.4s, v5.s[2]
fmla v22.4s, v10.4s, v6.s[2]
fmla v23.4s, v10.4s, v7.s[2]

fmla v16.4s, v11.4s, v0.s[3]
fmla v17.4s, v11.4s, v1.s[3]
fmla v18.4s, v11.4s, v2.s[3]
fmla v19.4s, v11.4s, v3.s[3]
fmla v20.4s, v11.4s, v4.s[3]
fmla v21.4s, v11.4s, v5.s[3]
fmla v22.4s, v11.4s, v6.s[3]
fmla v23.4s, v11.4s, v7.s[3]

subs x9, x9, #1
bne LoopIcHalf

st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64

LoopOcEnd:
// sp still points at the spill area; the two post-increments release it
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ret
#endif

+ 147
- 0
mindspore/lite/nnacl/assembly/arm64/WinogradTransLeft.S View File

@@ -0,0 +1,147 @@
#ifdef __aarch64__

.text
.align 5
.global WinogradTransLeft
#ifndef __APPLE__
.type WinogradTransLeft, %function
#endif

WinogradTransLeft:
//void WinogradTransLeft(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length);
// For every output row y (h of them) and column x (w of them):
//   M[y][x][0 .. 4*length) = sum over i < k of B[i*h + y] * S[i][x][0 .. 4*length)
//x0: S
//x1: B
//x2: M
//x3: w
//x4: h
//x5: k
//x6:length
// NOTE(review): length == 0, k == 0, w == 0 or h == 0 are not guarded - confirm callers never pass 0.

// x19/x20 are callee-saved and used as row pointers below. Keep sp lowered for
// the whole function so the saved pair stays inside the live stack frame.
// x20 takes the role x18 had: x18 is the platform register and must not be
// used as scratch in portable code.
stp x19, x20, [sp, #-16]!

mov x8, #16 // 4 * sizeof(float)
mul x8, x6, x8 // x8 = bytes in one unit (4 * length floats)
mul x9, x3, x8
sub x9, x9, x8 // x9 = (w - 1) * unit bytes: rest of an S row after one unit was consumed
add x7, x9, x8 // step for S
mov x10, #4
mul x10, x4, x10 // step for B

LoopH:
mov x13, x0 // x13 = S column base for this output row
mov x15, x3 // x15 = columns left
LoopW:
mov x14, x13 // x14 = S row 0 pointer for this column
mov x17, x1 // x17 walks B with stride x10
// clear the destination unit, then rewind x2 to its start
dup v30.4s, wzr
mov x11, x6
InitZero:
st1 {v30.4s}, [x2], #16
subs x11, x11, #1
bne InitZero

sub x2, x2, x8
mov x12, x5 // x12 = number of k terms still to accumulate
LoopKStart4:
cmp x12, #4
blt LoopKStart3
LoopK4:
ld1 {v0.s}[0], [x17], x10
ld1 {v0.s}[1], [x17], x10
ld1 {v0.s}[2], [x17], x10
ld1 {v0.s}[3], [x17], x10
mov x11, x6
// four consecutive S rows: x14, x20, x16, x19
add x20, x14, x7
add x16, x20, x7
add x19, x16, x7

LoopLength4:
ld1 {v16.4s}, [x2]
ld1 {v20.4s}, [x14], #16
fmla v16.4s, v20.4s, v0.s[0]
ld1 {v21.4s}, [x20], #16
fmul v17.4s, v21.4s, v0.s[1]
ld1 {v20.4s}, [x16], #16
fmla v16.4s, v20.4s, v0.s[2]
ld1 {v21.4s}, [x19], #16
fmla v17.4s, v21.4s, v0.s[3]
fadd v17.4s, v16.4s, v17.4s
st1 {v17.4s}, [x2], #16
subs x11, x11, #1
bne LoopLength4

sub x2, x2, x8
sub x12, x12, #4
add x14, x19, x9 // x19 is one unit past row 3, so + x9 lands on row 4
cmp x12, #4
bge LoopK4

LoopKStart3:
cmp x12, #3
blt LoopKStart
LoopK3:
ld1 {v0.s}[0], [x17], x10
ld1 {v0.s}[1], [x17], x10
ld1 {v0.s}[2], [x17], x10
mov x11, x6
// three consecutive S rows: x14, x20, x16
add x20, x14, x7
add x16, x20, x7
LoopLength3:
ld1 {v16.4s}, [x2]
ld1 {v20.4s}, [x14], #16
fmla v16.4s, v20.4s, v0.s[0]
ld1 {v21.4s}, [x20], #16
fmul v17.4s, v21.4s, v0.s[1]
ld1 {v20.4s}, [x16], #16
fmla v16.4s, v20.4s, v0.s[2]
fadd v17.4s, v16.4s, v17.4s
st1 {v17.4s}, [x2], #16
subs x11, x11, #1
bne LoopLength3

sub x2, x2, x8
sub x12, x12, #3
add x14, x16, x9
cmp x12, #3
bge LoopK3

LoopKStart:
cmp x12, #0
beq LKEnd
LoopK:
// remaining terms one at a time: ld1r broadcasts one B coefficient
ld1r {v31.4s}, [x17], x10
mov x11, x6
LoopLength:
ld1 {v0.4s}, [x2]
ld1 {v1.4s}, [x14], #16
fmla v0.4s, v1.4s, v31.4s
st1 {v0.4s}, [x2], #16
subs x11, x11, #1
bne LoopLength

subs x12, x12, #1
// sub/add (no s suffix) keep the flags of the subs above for the bne
sub x2, x2, x8
add x14, x14, x9
bne LoopK

LKEnd:
subs x15, x15, #1
add x13, x13, x8 // next S column
add x2, x2, x8 // next M unit
bne LoopW

add x1, x1, #4 //sizeof(float)
subs x4, x4, #1
bne LoopH

ldp x19, x20, [sp], #16
ret

#endif

+ 144
- 0
mindspore/lite/nnacl/assembly/arm64/WinogradTransRight.S View File

@@ -0,0 +1,144 @@
#ifdef __aarch64__

.text
.align 5
.global WinogradTransRight
#ifndef __APPLE__
.type WinogradTransRight, %function
#endif

WinogradTransRight:
//void WinogradTransRight(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length);
// For every output row y (h of them) and column x (w of them):
//   M[y][x][0 .. 4*length) = sum over i < k of B[i*h + x] * S[y][i][0 .. 4*length)
//x0: S
//x1: B
//x2: M
//x3: w
//x4: h
//x5: k
//x6: length
// NOTE(review): length == 0, k == 0, w == 0 or h == 0 are not guarded - confirm callers never pass 0.

// x19 takes the role x18 had: x18 is the platform register and must not be
// used as scratch in portable code. x19 is callee-saved, so it is spilled here
// (16 bytes keep sp 16-byte aligned) and restored before ret.
str x19, [sp, #-16]!

mov x8, #16 // 4 * sizeof(float)
mul x8, x6, x8 // x8 = bytes in one unit (4 * length floats)
mul x9, x5, x8 // step for S
mov x10, #4
mul x10, x4, x10 // step for B

LoopH:
mov x7, x1 // x7 = B column base, advanced by one float per output column
mov x15, x3 // x15 = columns left
LoopW:
mov x17, x0 // x17 = first S unit of this row
mov x13, x7 // x13 walks B with stride x10
// clear the destination unit, then rewind x2 to its start
dup v30.4s, wzr
mov x11, x6
InitZero:
st1 {v30.4s}, [x2], #16
subs x11, x11, #1
bne InitZero
sub x2, x2, x8
mov x12, x5 // x12 = number of k terms still to accumulate

LoopKStart4:
cmp x12, #4
blt LoopKStart3
LoopK4:
ld1 {v0.s}[0], [x13], x10
ld1 {v0.s}[1], [x13], x10
ld1 {v0.s}[2], [x13], x10
ld1 {v0.s}[3], [x13], x10
mov x11, x6

// four consecutive S units: x17, x14, x16, x19
add x14, x17, x8
add x16, x14, x8
add x19, x16, x8

LoopLength4:
ld1 {v16.4s}, [x2]
ld1 {v20.4s}, [x17], #16
fmla v16.4s, v20.4s, v0.s[0]
ld1 {v21.4s}, [x14], #16
fmul v17.4s, v21.4s, v0.s[1]
ld1 {v20.4s}, [x16], #16
fmla v16.4s, v20.4s, v0.s[2]
ld1 {v21.4s}, [x19], #16
fmla v17.4s, v21.4s, v0.s[3]

fadd v17.4s, v16.4s, v17.4s
st1 {v17.4s}, [x2], #16
subs x11, x11, #1
bne LoopLength4
sub x2, x2, x8
sub x12, x12, #4
mov x17, x19 // x19 has advanced one unit, so it already points at unit 4

cmp x12, #4
bge LoopK4

LoopKStart3:
cmp x12, #3
blt LoopKStart
LoopK3:
ld1 {v0.s}[0], [x13], x10
ld1 {v0.s}[1], [x13], x10
ld1 {v0.s}[2], [x13], x10
mov x11, x6

// three consecutive S units: x17, x14, x16
add x14, x17, x8
add x16, x14, x8

LoopLength3:
ld1 {v16.4s}, [x2]
ld1 {v20.4s}, [x17], #16
fmla v16.4s, v20.4s, v0.s[0]
ld1 {v21.4s}, [x14], #16
fmul v17.4s, v21.4s, v0.s[1]
ld1 {v20.4s}, [x16], #16
fmla v16.4s, v20.4s, v0.s[2]

fadd v17.4s, v16.4s, v17.4s
st1 {v17.4s}, [x2], #16
subs x11, x11, #1
bne LoopLength3
sub x2, x2, x8
sub x12, x12, #3
// the last unit pointer of this path is x16 (the 4-wide path uses a fourth one)
mov x17, x16
cmp x12, #3
bge LoopK3

LoopKStart:
cmp x12, #0
beq LoopKEnd

LoopK:
// remaining terms one at a time: ld1r broadcasts one B coefficient
ld1r {v31.4s}, [x13], x10

mov x11, x6
LoopLength:
ld1 {v0.4s}, [x2]
ld1 {v1.4s}, [x17], #16
fmla v0.4s, v1.4s, v31.4s

st1 {v0.4s}, [x2], #16
subs x11, x11, #1
bne LoopLength
subs x12, x12, #1

sub x2, x2, x8 // no s suffix: keeps the flags of the subs above for the bne
bne LoopK
LoopKEnd:
subs x15, x15, #1
add x2, x2, x8 // next M unit
add x7, x7, #4 //sizeof(float)
bne LoopW

add x0, x0, x9 // next S row
subs x4, x4, #1
bne LoopH

ldr x19, [sp], #16
ret
#endif

+ 4
- 2
mindspore/lite/nnacl/fp32/common_func.c View File

@@ -68,7 +68,8 @@ void PostConvFuncFp32C4(const float *c4_out_ptr, float *out_ptr, const float *bi
return;
}

void WinogradMatrixProductLeft(const float *S, const float *B, float *M, size_t w, size_t h, size_t k, size_t length) {
#ifndef ENABLE_ARM
void WinogradTransLeft(const float *S, const float *B, float *M, size_t w, size_t h, size_t k, size_t length) {
int unitStep = 4 * length;
for (int y = 0; y < h; ++y) {
float *dstY = M + y * w * unitStep;
@@ -91,7 +92,7 @@ void WinogradMatrixProductLeft(const float *S, const float *B, float *M, size_t
}

// M = S * B , M = w*h * l, S = k*h * l, B = w*k
void WinogradMatrixProductRight(const float *S, const float *B, float *M, size_t w, size_t h, size_t k, size_t length) {
void WinogradTransRight(const float *S, const float *B, float *M, size_t w, size_t h, size_t k, size_t length) {
int unitStep = 4 * length;
for (int y = 0; y < h; ++y) {
float *dstY = M + y * w * unitStep;
@@ -113,6 +114,7 @@ void WinogradMatrixProductRight(const float *S, const float *B, float *M, size_t
}
}
}
#endif

union float32_bits {
unsigned int u;


+ 2
- 2
mindspore/lite/nnacl/fp32/common_func.h View File

@@ -32,8 +32,8 @@ void PostConvFuncFp32C8(const float *c8_out_ptr, float *out_ptr, const float *bi
void PostConvFuncFp32C4(const float *c4_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel,
size_t plane_size, size_t plane_stride, size_t relu_type);

void WinogradMatrixProductLeft(const float *S, const float *B, float *M, size_t w, size_t h, size_t k, size_t length);
void WinogradMatrixProductRight(const float *S, const float *B, float *M, size_t w, size_t h, size_t k, size_t length);
void WinogradTransLeft(const float *S, const float *B, float *M, size_t w, size_t h, size_t k, size_t length);
void WinogradTransRight(const float *S, const float *B, float *M, size_t w, size_t h, size_t k, size_t length);

float ShortToFloat32(uint16_t src_value);



+ 20
- 25
mindspore/lite/nnacl/fp32/deconv_winograd.c View File

@@ -130,21 +130,21 @@ void DeConvWgInputPack(float *src_ptr, float *dst_ptr, int channel, int stride)
return;
}

void MSGemmFloatCommon_4(float *dst, const float *src, const float *weight, size_t src_depth_quad, size_t dst_step,
size_t dst_depth_quad, size_t width, size_t weight_depth_offset) {
#ifndef ENABLE_ARM
void TiledC4MatmulFp32(float *dst, const float *src, const float *weight, size_t cal_num, size_t ic4, size_t oc4) {
int dx, sz, dz;
int src_depth_step = 4 * width;
for (dz = 0; dz < dst_depth_quad; ++dz) {
float *dst_z = dst + dz * dst_step;
const float *weight_dz = weight + dz * (src_depth_quad * 16 + weight_depth_offset);
for (dx = 0; dx < width; ++dx) {
int src_depth_step = 4 * DECONV_WINOGRAD_DEFAULT_TILE;
for (dz = 0; dz < oc4; ++dz) {
float *dst_z = dst + dz * cal_num;
const float *weight_dz = weight + dz * ic4 * 16;
for (dx = 0; dx < DECONV_WINOGRAD_DEFAULT_TILE; ++dx) {
float *dst_x = dst_z + dx * 4;
dst_x[0] = 0.0f;
dst_x[1] = 0.0f;
dst_x[2] = 0.0f;
dst_x[3] = 0.0f;
const float *src_dx = src + 4 * dx;
for (sz = 0; sz < src_depth_quad; ++sz) {
for (sz = 0; sz < ic4; ++sz) {
const float *src_z = src_dx + sz * src_depth_step;
const float *weight_z = weight_dz + sz * 16;
for (int i = 0; i < 4; ++i) {
@@ -156,12 +156,7 @@ void MSGemmFloatCommon_4(float *dst, const float *src, const float *weight, size
}
}
}

void MSGemmFloatUnit_4(float *dstOrigin, const float *src, const float *weight, size_t src_depth_quad, size_t dst_step,
size_t dst_depth_quad, size_t weight_depth_offset) {
MSGemmFloatCommon_4(dstOrigin, src, weight, src_depth_quad, dst_step, dst_depth_quad, DECONV_WINOGRAD_DEFAULT_TILE,
weight_depth_offset);
}
#endif

void DeConvWgMerge(const float *src, float *dst, size_t src_stride, size_t dst_stride, size_t count) {
for (int i = 0; i < count; ++i) {
@@ -179,10 +174,10 @@ void _deConvWinograd(float *tile_in, float *tile_out, float *weight_buf, float *
int unit_size, int w_start, int h_start, ConvParameter *conv_param, DeConvParam *deconv_param) {
int winograd_plane = unit_size * unit_size;
if (!transfered[unit_size]) {
WinogradMatrixProductLeft(tile_in, at_buf, a_mid_buf, DECONV_WINOGRAD_DEFAULT_UNIT, unit_size,
DECONV_WINOGRAD_DEFAULT_UNIT, deconv_param->ic_div4_ * DECONV_WINOGRAD_DEFAULT_TILE);
WinogradMatrixProductRight(a_mid_buf, at_buf, trans_a_buf, unit_size, unit_size, DECONV_WINOGRAD_DEFAULT_UNIT,
deconv_param->ic_div4_ * DECONV_WINOGRAD_DEFAULT_TILE);
WinogradTransLeft(tile_in, at_buf, a_mid_buf, DECONV_WINOGRAD_DEFAULT_UNIT, unit_size, DECONV_WINOGRAD_DEFAULT_UNIT,
deconv_param->ic_div4_ * DECONV_WINOGRAD_DEFAULT_TILE);
WinogradTransRight(a_mid_buf, at_buf, trans_a_buf, unit_size, unit_size, DECONV_WINOGRAD_DEFAULT_UNIT,
deconv_param->ic_div4_ * DECONV_WINOGRAD_DEFAULT_TILE);
transfered[unit_size] = true;
}

@@ -190,14 +185,14 @@ void _deConvWinograd(float *tile_in, float *tile_out, float *weight_buf, float *
float *src = trans_a_buf + index * DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->ic_up4_;
float *dst = tmp_buf + index * deconv_param->oc_up4_ * DECONV_WINOGRAD_DEFAULT_TILE;
float *weight = weight_buf + index * deconv_param->ic_up4_ * deconv_param->oc_up4_;
MSGemmFloatUnit_4(dst, src, weight, deconv_param->ic_div4_, DECONV_WINOGRAD_DEFAULT_TILE * C4NUM,
deconv_param->oc_div4_, 0);
TiledC4MatmulFp32(dst, src, weight, DECONV_WINOGRAD_DEFAULT_TILE * C4NUM, deconv_param->ic_div4_,
deconv_param->oc_div4_);
}

WinogradMatrixProductLeft(tmp_buf, bt_buf, b_tmp_buf, unit_size, unit_size, unit_size,
deconv_param->oc_div4_ * DECONV_WINOGRAD_DEFAULT_TILE);
WinogradMatrixProductRight(b_tmp_buf, bt_buf, tmp_buf, unit_size, unit_size, unit_size,
deconv_param->oc_div4_ * DECONV_WINOGRAD_DEFAULT_TILE);
WinogradTransLeft(tmp_buf, bt_buf, b_tmp_buf, unit_size, unit_size, unit_size,
deconv_param->oc_div4_ * DECONV_WINOGRAD_DEFAULT_TILE);
WinogradTransRight(b_tmp_buf, bt_buf, tmp_buf, unit_size, unit_size, unit_size,
deconv_param->oc_div4_ * DECONV_WINOGRAD_DEFAULT_TILE);

// Add to dest
for (int uhi = 0; uhi < unit_size; uhi++) {
@@ -223,7 +218,7 @@ void _deConvCommon(float *tile_in, float *tile_out, float *weight, float *tmp_bu
for (int hi = 0; hi < DECONV_WINOGRAD_DEFAULT_UNIT; hi++) {
for (int wi = 0; wi < DECONV_WINOGRAD_DEFAULT_UNIT; wi++) {
float *src_in = tile_in + (wi + hi * DECONV_WINOGRAD_DEFAULT_UNIT) * in_stride;
MSGemmFloatUnit_4(tmp_buf, src_in, weight, deconv_param->ic_div4_, DECONV_WINOGRAD_DEFAULT_TILE * 4, count, 0);
TiledC4MatmulFp32(tmp_buf, src_in, weight, DECONV_WINOGRAD_DEFAULT_TILE * 4, deconv_param->ic_div4_, count);

for (int uhi = 0; uhi < h_size; uhi++) {
for (int uwi = 0; uwi < w_size; uwi++) {


+ 1
- 0
mindspore/lite/nnacl/fp32/deconv_winograd.h View File

@@ -34,6 +34,7 @@ void DeconvWg(float *nhwc_input_, float *tile_in, float *tile_out, int start_ind
ConvParameter *conv_param, DeConvParam *deconv_param, int task_id);
void DeconvWgPost(float *tile_out, float *nc4hw4_output, ConvParameter *conv_param, DeConvParam *deconv_param,
int calculate_count, int tile_index);
/* Parameter order matches the definition and every call site: the 4th argument is the
 * dst stride between oc4 blocks (cal_num, in floats), the 5th is ic4, the 6th is oc4. */
void TiledC4MatmulFp32(float *dst, const float *src, const float *weight, size_t cal_num, size_t ic4, size_t oc4);

#ifdef __cplusplus
}


+ 1
- 1
mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.cc View File

@@ -249,7 +249,7 @@ kernel::LiteKernel *CpuDeConvFp32KernelCreator(const std::vector<lite::Tensor *>
if ((conv_param->stride_h_ != 1 || conv_param->stride_w_ != 1) &&
(conv_param->dilation_w_ == 1 && conv_param->dilation_h_ == 1)) {
/* DeConvolutionWinogradCPUKernel */
kernel = new (std::nothrow) kernel::DeConvolutionCPUKernel(opParameter, inputs, outputs, ctx, primitive);
kernel = new (std::nothrow) kernel::DeConvolutionWinogradCPUKernel(opParameter, inputs, outputs, ctx, primitive);
} else {
kernel = new (std::nothrow) kernel::DeConvolutionCPUKernel(opParameter, inputs, outputs, ctx, primitive);
}


+ 1
- 1
mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_winograd.cc View File

@@ -258,10 +258,10 @@ int DeConvolutionWinogradCPUKernel::InitDataParam() {
}

/* bias */
auto bias_tensor = in_tensors_.at(kBiasIndex);
bias_data_ = malloc(deconv_param_->oc_up4_ * sizeof(float));
memset(bias_data_, 0, deconv_param_->oc_up4_ * sizeof(float));
if (in_tensors_.size() == 3) {
auto bias_tensor = in_tensors_.at(kBiasIndex);
memcpy(bias_data_, bias_tensor->data_c(), conv_param_->output_channel_ * sizeof(float));
}
return RET_OK;


Loading…
Cancel
Save