Browse Source

!4120 fix depthwise convolution compilation bugs on arm32

Merge pull request !4120 from lixian/master
tags/v0.7.0-beta
mindspore-ci-bot Gitee 5 years ago
parent
commit
a7185d7e3f
2 changed files with 11 additions and 8 deletions
  1. +4
    -4
      mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm32/ConvDwFp32Center.S
  2. +7
    -4
      mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm32/ConvDwInt8Center.S

+ 4
- 4
mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm32/ConvDwFp32Center.S View File

@@ -21,7 +21,7 @@ ConvDwFp32Center:
// clang's rule seems more simple, though there are no subroutine calls here // clang's rule seems more simple, though there are no subroutine calls here
// r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
push {r0-r8, r10, r11, lr} push {r0-r8, r10, r11, lr}
vpush {v4-v7}
vpush {q4-q7}
add sp, sp, #112 add sp, sp, #112


ldr r4, [sp, #48] ldr r4, [sp, #48]
@@ -38,7 +38,7 @@ ConvDwFp32Center:
cmp r5, #4 cmp r5, #4
blt LoopW blt LoopW
LoopW4: LoopW4:
mov r11, [sp, #76] // in_sw_step
ldr r11, [sp, #76] // in_sw_step
mov r8, r1 // src_kh mov r8, r1 // src_kh
ldr r2, [sp, #8] // weight_kh ldr r2, [sp, #8] // weight_kh
ldr r6, [sp, #56] // kernel_h ldr r6, [sp, #56] // kernel_h
@@ -100,7 +100,7 @@ ConvDwFp32Center:
mul r11, r11, r12 mul r11, r11, r12
add r1, r1, r11 add r1, r1, r11
sub r5, r5, #4 sub r5, r5, #4
cmp r5, r5, #0
cmp r5, #0
ble LoopWEnd ble LoopWEnd
cmp r5, #4 cmp r5, #4
bge LoopW bge LoopW
@@ -155,7 +155,7 @@ ConvDwFp32Center:
bne LoopH bne LoopH
LoopWEnd: LoopWEnd:
sub sp, sp, #112 sub sp, sp, #112
vpop {v4-v7}
vpop {q4-q7}
pop {r0-r8, r10, r11, pc} pop {r0-r8, r10, r11, pc}
#endif #endif
#endif #endif

+ 7
- 4
mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm32/ConvDwInt8Center.S View File

@@ -52,7 +52,7 @@ ConvDwInt8Center:
ldr r5, [sp, #52] // width ldr r5, [sp, #52] // width
ldr r0, [sp] // dst_w ldr r0, [sp] // dst_w
LoopW4: LoopW4:
mov r11, [sp, #76] // in_sw_step
ldr r11, [sp, #76] // in_sw_step
mov r8, r1 // src_kh mov r8, r1 // src_kh
ldr r2, [sp, #8] // weight_kh ldr r2, [sp, #8] // weight_kh
ldr r6, [sp, #56] // kernel_h ldr r6, [sp, #56] // kernel_h
@@ -145,8 +145,11 @@ ConvDwInt8Center:
mov r12, #4 mov r12, #4
mul r11, r11, r12 mul r11, r11, r12
add r1, r1, r11 add r1, r1, r11
subs r5, r5, #1
bne LoopW4
sub r5, r5, #4
cmp r5, #0
ble LoopWEnd
cmp r5, #4
bge LoopW4
LoopW: LoopW:
mov r8, r1 // src_kh mov r8, r1 // src_kh
ldr r2, [sp, #8] // weight_kh ldr r2, [sp, #8] // weight_kh
@@ -199,7 +202,7 @@ ConvDwInt8Center:
str r12, [sp, #4] str r12, [sp, #4]
subs r4, r4, #1 subs r4, r4, #1
bne LoopH bne LoopH
LoopWEnd:
sub sp, sp, #112 sub sp, sp, #112
vpop {q4-q7} vpop {q4-q7}
pop {r0-r8, r10, r11, pc} pop {r0-r8, r10, r11, pc}


Loading…
Cancel
Save