From 52945ce8268b5e7652cffd2c6730f23c698f343e Mon Sep 17 00:00:00 2001 From: yefeng Date: Thu, 11 Mar 2021 18:25:50 +0800 Subject: [PATCH] fix_assembly_for_ios_5 --- .../assembly/arm32/ConvDw3x3Int8BorderPixel.S | 3 +- .../nnacl/assembly/arm32/ConvDwFp32Border.S | 3 +- .../nnacl/assembly/arm32/ConvDwFp32Center.S | 3 +- .../lite/nnacl/assembly/arm32/ConvDwFp32Row.S | 3 +- .../nnacl/assembly/arm32/ConvDwInt8Center.S | 3 +- .../assembly/arm32/ConvDwInt8PostAlign4.S | 3 +- .../arm32/ConvDwInt8PostAlign4PerChannel.S | 3 +- .../lite/nnacl/assembly/arm32/ConvDwInt8Row.S | 3 +- .../nnacl/assembly/arm32/DeconvDwFp32Center.S | 3 +- .../nnacl/assembly/arm32/DeconvDwInt8Center.S | 3 +- .../nnacl/assembly/arm32/DeconvDwInt8Post.S | 3 +- .../arm32/IndirectGemmInt16to32_8x4.S | 3 +- .../assembly/arm32/IndirectGemmInt8_2x4.S | 5 +-- .../lite/nnacl/assembly/arm32/MatVecMulFp32.S | 3 +- .../lite/nnacl/assembly/arm32/MatmulFp32.S | 3 +- .../lite/nnacl/assembly/arm32/MatmulFp32Opt.S | 3 +- .../nnacl/assembly/arm32/MatmulFp32Opt12x4.S | 3 +- .../lite/nnacl/assembly/arm32/MatmulInt8.S | 7 ++-- .../lite/nnacl/assembly/arm32/MatmulInt8Opt.S | 3 +- .../nnacl/assembly/arm32/MatmulWinogradFp32.S | 3 +- .../nnacl/assembly/arm32/PostFuncBiasReluC4.S | 3 +- .../nnacl/assembly/arm32/PostFuncBiasReluC8.S | 3 +- .../assembly/arm32/PreSum4x16Int8Peroc.S | 3 +- .../nnacl/assembly/arm32/PreSum4x16Int8Pert.S | 3 +- .../nnacl/assembly/arm32/TiledC4MatmulFp32.S | 3 +- .../nnacl/assembly/arm32/WinogradTransLeft.S | 3 +- .../nnacl/assembly/arm32/WinogradTransRight.S | 3 +- .../lite/nnacl/assembly/arm64/AdderFp32.S | 3 +- .../assembly/arm64/ConvDw3x3Fp32Corner.S | 3 +- .../assembly/arm64/ConvDw3x3Fp32Horizontal.S | 3 +- .../assembly/arm64/ConvDw3x3Fp32Stride1.S | 3 +- .../assembly/arm64/ConvDw3x3Fp32Stride2.S | 3 +- .../assembly/arm64/ConvDw3x3Fp32Vertical.S | 3 +- .../lite/nnacl/assembly/arm64/ConvDw3x3Int8.S | 3 +- .../assembly/arm64/ConvDw3x3Int8Corner.S | 3 +- 
.../assembly/arm64/ConvDw3x3Int8Horizontal.S | 3 +- .../assembly/arm64/ConvDw3x3Int8Stride2.S | 3 +- .../assembly/arm64/ConvDw3x3Int8Vertical.S | 3 +- .../nnacl/assembly/arm64/ConvDwFp32Border.S | 3 +- .../nnacl/assembly/arm64/ConvDwFp32Center.S | 3 +- .../assembly/arm64/ConvDwFp32Indirect3x3.S | 3 +- .../assembly/arm64/ConvDwFp32Indirect5x5.S | 3 +- .../lite/nnacl/assembly/arm64/ConvDwFp32Row.S | 3 +- .../nnacl/assembly/arm64/ConvDwInt8Center.S | 3 +- .../assembly/arm64/ConvDwInt8PostAlign4.S | 3 +- .../arm64/ConvDwInt8PostAlign4PerChannel.S | 3 +- .../lite/nnacl/assembly/arm64/ConvDwInt8Row.S | 3 +- .../nnacl/assembly/arm64/ConvFp32Center.S | 3 +- .../nnacl/assembly/arm64/DeconvDwFp32Border.S | 3 +- .../nnacl/assembly/arm64/DeconvDwFp32Center.S | 3 +- .../nnacl/assembly/arm64/DeconvDwInt8Center.S | 3 +- .../nnacl/assembly/arm64/DeconvDwInt8Post.S | 3 +- .../arm64/IndirectGemmInt16to32_8x4.S | 3 +- .../lite/nnacl/assembly/arm64/MatVecMulFp32.S | 3 +- .../lite/nnacl/assembly/arm64/MatmulFp32.S | 3 +- .../lite/nnacl/assembly/arm64/MatmulFp32Opt.S | 3 +- .../lite/nnacl/assembly/arm64/MatmulInt8.S | 3 +- .../lite/nnacl/assembly/arm64/MatmulInt8Opt.S | 3 +- .../lite/nnacl/assembly/arm64/MatmulR4Int8.S | 3 +- .../nnacl/assembly/arm64/MatmulWinogradFp32.S | 3 +- .../nnacl/assembly/arm64/PostFuncBiasReluC4.S | 3 +- .../nnacl/assembly/arm64/PostFuncBiasReluC8.S | 3 +- .../assembly/arm64/PostFuncInt8C4Neon64.S | 3 +- .../assembly/arm64/PreSum4x16Int8Peroc.S | 5 +-- .../nnacl/assembly/arm64/PreSum4x16Int8Pert.S | 4 +-- .../nnacl/assembly/arm64/TiledC4MatmulFp32.S | 3 +- .../nnacl/assembly/arm64/WinogradTransLeft.S | 3 +- .../nnacl/assembly/arm64/WinogradTransRight.S | 3 +- .../nnacl/assembly/fp16/ConvDwFp16Border.S | 3 +- .../nnacl/assembly/fp16/ConvDwFp16Center.S | 3 +- .../lite/nnacl/assembly/fp16/ConvDwFp16Row.S | 3 +- .../nnacl/assembly/fp16/DeconvDwFp16Border.S | 3 +- .../nnacl/assembly/fp16/DeconvDwFp16Center.S | 3 +- .../nnacl/assembly/fp16/Float16ToFloat32.S | 3 +- 
.../nnacl/assembly/fp16/Float32ToFloat16.S | 3 +- .../assembly/fp16/IndirectGemmFp16_16x8.S | 9 +++--- .../lite/nnacl/assembly/fp16/MatVecMulFp16.S | 3 +- .../lite/nnacl/assembly/fp16/MatmulFp16.S | 3 +- .../lite/nnacl/assembly/fp16/MatmulFp16Opt.S | 3 +- .../nnacl/assembly/fp16/MatmulWinogradFp16.S | 3 +- .../assembly/fp16/PostFuncBiasReluC4Fp16.S | 3 +- .../assembly/fp16/PostFuncBiasReluC8Fp16.S | 3 +- .../nnacl/assembly/fp16/TiledC4MatmulFp16.S | 3 +- .../assembly/fp16/WinogradTransLeftFp16.S | 3 +- .../assembly/fp16/WinogradTransRightFp16.S | 3 +- .../lite/nnacl/assembly/opt/MatmulDpInt8.S | 3 +- .../lite/nnacl/assembly/opt/MatmulDpInt8Opt.S | 3 +- .../lite/nnacl/assembly/opt/MatmulOptR4Int8.S | 3 +- mindspore/lite/nnacl/assembly_global.h | 32 +++++++++++++++++++ 89 files changed, 215 insertions(+), 96 deletions(-) create mode 100644 mindspore/lite/nnacl/assembly_global.h diff --git a/mindspore/lite/nnacl/assembly/arm32/ConvDw3x3Int8BorderPixel.S b/mindspore/lite/nnacl/assembly/arm32/ConvDw3x3Int8BorderPixel.S index 3ce8b9fb35..c5732590d8 100644 --- a/mindspore/lite/nnacl/assembly/arm32/ConvDw3x3Int8BorderPixel.S +++ b/mindspore/lite/nnacl/assembly/arm32/ConvDw3x3Int8BorderPixel.S @@ -1,5 +1,6 @@ #ifdef __arm__ #ifndef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -17,7 +18,7 @@ // r0: dst, r1: src, r2: weight, r3: bias, r4: height, r5: width, r6: in_kh_step, r7: in_kw_step, // r8: channel, r9: in_zp, r10: out_zp, r11: out_multiplier, r12: left_shift, r13: right_shift // r14: acc_min, r15: acc_max -ConvDw3x3Int8BorderPixel: +asm_function ConvDw3x3Int8BorderPixel // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr" // according to https://stackoverflow.com/questions/53625807 // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway diff --git a/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Border.S b/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Border.S index 
5bf15c59e9..9f9d53cfc3 100644 --- a/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Border.S +++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Border.S @@ -1,4 +1,5 @@ #ifdef ENABLE_ARM32 +#include "nnacl/assembly_global.h" .text .align 5 @@ -11,7 +12,7 @@ // size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6) // r0: dst, r1: src, r2: weight, r3: bias, r4: height, r5: width, r6: in_kh_step, r7: in_kw_step, // r8: kernel_w, r9: relu, r10: relu6 -ConvDwFp32Border: +asm_function ConvDwFp32Border // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf push {r4-r12, lr} vpush {q4-q7} diff --git a/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Center.S b/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Center.S index a90d2fa014..ffcee6f380 100644 --- a/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Center.S +++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Center.S @@ -1,5 +1,6 @@ #ifdef __arm__ #ifndef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -14,7 +15,7 @@ // r0: dst, r1: src, r2: weight, r3: bias, #0: height, #4: width, #8: kernel_h, #12: kernel_w, // #16: out_h_step, #20: block_channel, #24: in_sh_step, #28: in_sw_step, #32: in_kh_step,#36: in_kw_step // #40: relu, #44: relu6 -ConvDwFp32Center: +asm_function ConvDwFp32Center // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr" // according to https://stackoverflow.com/questions/53625807 // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway diff --git a/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Row.S b/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Row.S index f77c64b773..30a8693dcb 100644 --- a/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Row.S +++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Row.S @@ -1,4 +1,5 @@ #ifdef ENABLE_ARM32 +#include "nnacl/assembly_global.h" .text .align 5 @@ -11,7 +12,7 @@ // size_t num_pixels, size_t 
input_channel, size_t input_step) // r0: output_ptr, r1: input_ptr, r2: filter_ptr, r3: num_pixels, // r4: input_channel, r5: input_step -ConvDwFp32Row: +asm_function ConvDwFp32Row // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf push {r4-r6, r8, r10, r11} diff --git a/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Center.S b/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Center.S index 66045743d7..73d43abb45 100644 --- a/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Center.S +++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Center.S @@ -1,5 +1,6 @@ #ifdef __arm__ #ifndef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -15,7 +16,7 @@ // #-48: dst, #-44: src, #-40: weight, #-36: bias, #0: height, #4: width, #8: kernel_h, #12: kernel_w, // #16: out_h_step, #20: block_channel, #24: in_sh_step, #28: in_sw_step, #32: in_kh_step, #36: in_kw_step // #40: in_zp, #44: out_zp, #48: out_multiplier, #52: left_shift, #56: right_shift, #60:acc_min, #64: acc_max -ConvDwInt8Center: +asm_function ConvDwInt8Center // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr" // according to https://stackoverflow.com/questions/53625807 // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway diff --git a/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4.S b/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4.S index b9d0e9b92a..3367ab390f 100644 --- a/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4.S +++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4.S @@ -1,5 +1,6 @@ #ifdef __arm__ #ifndef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -13,7 +14,7 @@ // r0: dst, r1: buffer, r2: num_pixels, r3: output_zp, r4: out_multiplier, // r5: left_shift, r6: right_shift, r7: acc_min, r8: acc_max -ConvDwInt8PostAlign4: +asm_function ConvDwInt8PostAlign4 // at return, clang generates "push {lr}, pop {pc}"" 
while gcc will generate "bx lr" // according to https://stackoverflow.com/questions/53625807 // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway diff --git a/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4PerChannel.S b/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4PerChannel.S index d6740355f4..270c959ee8 100644 --- a/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4PerChannel.S +++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4PerChannel.S @@ -1,5 +1,6 @@ #ifdef __arm__ #ifndef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -13,7 +14,7 @@ // r0: dst, r1: buffer, r2: num_pixels, r3: output_zp, r4: out_multiplier, // r5: left_shift, r6: right_shift, r7: acc_min, r8: acc_max -ConvDwInt8PostAlign4PerChannel: +asm_function ConvDwInt8PostAlign4PerChannel // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr" // according to https://stackoverflow.com/questions/53625807 // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway diff --git a/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Row.S b/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Row.S index 9b5bfa1242..48ddccfc4f 100644 --- a/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Row.S +++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Row.S @@ -1,5 +1,6 @@ #ifdef __arm__ #ifndef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -13,7 +14,7 @@ // r0: output_ptr, r1: input_ptr, r2: weight_ptr, r3: num_pixels, // r4: output_channel, r5: input_step, r6: input_zp, -ConvDwInt8Row: +asm_function ConvDwInt8Row // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr" // according to https://stackoverflow.com/questions/53625807 // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway diff --git 
a/mindspore/lite/nnacl/assembly/arm32/DeconvDwFp32Center.S b/mindspore/lite/nnacl/assembly/arm32/DeconvDwFp32Center.S index 06c38740a5..d0244746bc 100644 --- a/mindspore/lite/nnacl/assembly/arm32/DeconvDwFp32Center.S +++ b/mindspore/lite/nnacl/assembly/arm32/DeconvDwFp32Center.S @@ -1,5 +1,6 @@ #ifdef __arm__ #ifndef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -13,7 +14,7 @@ // size_t in_sw_step, size_t in_kh_step, size_t in_kw_step); // r0: dst, r1: src, r2: weight, r3: height, r4: width, #52: kernel_h, #56: kernel_w, #60: out_h_step // #64: block_channel, #68: in_sh_step, #72: in_sw_step, #76: in_kh_step, #80: in_kw_step -DeconvDwFp32Center: +asm_function DeconvDwFp32Center // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr" // according to https://stackoverflow.com/questions/53625807 // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway diff --git a/mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Center.S b/mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Center.S index 68b23a01e5..5db46b7a35 100644 --- a/mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Center.S +++ b/mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Center.S @@ -1,5 +1,6 @@ #ifdef __arm__ #ifndef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -13,7 +14,7 @@ // size_t in_sw_step, size_t in_kh_step, size_t in_kw_step); // r0: dst, r1: src, r2: weight, r3: height, r4: width, #52: kernel_h, #56: kernel_w, #60: out_h_step // #64: block_channel, #68: in_sh_step, #72: in_sw_step, #76: in_kh_step, #80: in_kw_step -DeconvDwInt8Center: +asm_function DeconvDwInt8Center // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr" // according to https://stackoverflow.com/questions/53625807 // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway diff --git 
a/mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Post.S b/mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Post.S index 86a3cd29cd..3722126b9a 100644 --- a/mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Post.S +++ b/mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Post.S @@ -1,5 +1,6 @@ #ifdef __arm__ #ifndef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -14,7 +15,7 @@ // r0: dst, r1: output_buffer, r2: bias, r3: block_channel, r4: pixel_nums, r5: out_multiplier, // r6: left_shift, r7: right_shift, r8: out_zp, r9: acc_min, r10: acc_max -DeconvDwInt8Post: +asm_function DeconvDwInt8Post // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr" // according to https://stackoverflow.com/questions/53625807 // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway diff --git a/mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt16to32_8x4.S b/mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt16to32_8x4.S index eaf11da242..f8abe1c7a7 100644 --- a/mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt16to32_8x4.S +++ b/mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt16to32_8x4.S @@ -1,4 +1,5 @@ #ifdef ENABLE_ARM32 +#include "nnacl/assembly_global.h" .text .align 5 @@ -9,7 +10,7 @@ // void IndirectGemmInt16to32_8x4(int *output, short *input, short *weight, size_t kszie, size_t ic8, size_t oc4, size_t offset); // r0: output, r1: input, r2: weight, r3: kszie, r4: ic8, r5: oc4, r6: offset -IndirectGemmInt16to32_8x4: +asm_function IndirectGemmInt16to32_8x4 .macro INIT_ZERO // we could also use "vmov.s32 q12, #0" to initialize q12 by 0 diff --git a/mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt8_2x4.S b/mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt8_2x4.S index c3cf470ab0..caea16f738 100644 --- a/mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt8_2x4.S +++ b/mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt8_2x4.S @@ -1,5 +1,6 @@ #ifdef __arm__ #ifndef __aarch64__ 
+#include "nnacl/assembly_global.h" .text .align 5 @@ -13,7 +14,7 @@ // int32_t *shift_before, int32_t *shift_after, size_t asymmetric, size_t per_channel, size_t per_channel_offset); // r0: output, r1: input, r2: weight, r3: bias, r4: kSize, r5: ic4, r6: oc, r7: offset // r8: input_sum, r10: act_min, r11: act_max, r10: out_zp, r11: out_multiplier, r10: shift_before, r11: shift_after -IndirectGemmInt8_2x4: +asm_function IndirectGemmInt8_2x4 .macro INIT_BIAS veor q10, q10, q10 @@ -221,7 +222,7 @@ IndirectGemmInt8_2x4: vqmovn.s32 d31, q12 vqmovn.s16 d0, q15 - // prefetching is not prefered while writing results in spite of cache missings + // prefetching is not preferred while writing results in spite of cache missing // you could try prfm pstl2strm WriteStart: cmp r6, #1 diff --git a/mindspore/lite/nnacl/assembly/arm32/MatVecMulFp32.S b/mindspore/lite/nnacl/assembly/arm32/MatVecMulFp32.S index c06301c941..4569c9599d 100644 --- a/mindspore/lite/nnacl/assembly/arm32/MatVecMulFp32.S +++ b/mindspore/lite/nnacl/assembly/arm32/MatVecMulFp32.S @@ -1,5 +1,6 @@ #ifdef __arm__ #ifndef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -17,7 +18,7 @@ // r5: depth // r6: col -MatVecMulFp32: +asm_function MatVecMulFp32 // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf push {r0-r8, r10, r11, lr} add sp, sp, #48 diff --git a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32.S b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32.S index 8ae2f10ef6..7ad42d5df8 100644 --- a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32.S +++ b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32.S @@ -1,4 +1,5 @@ #ifdef ENABLE_ARM32 +#include "nnacl/assembly_global.h" .text .align 5 .global MatmulFloatNeon32 @@ -19,7 +20,7 @@ // r8: stride // lr: writeNhwc/writeWino -MatmulFloatNeon32: +asm_function MatmulFloatNeon32 // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf push {r0-r8, r10, r11, lr} add sp, sp, 
#48 diff --git a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S index 20cfa58a8c..4a13bc92aa 100644 --- a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S +++ b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S @@ -1,4 +1,5 @@ #ifdef ENABLE_ARM32 +#include "nnacl/assembly_global.h" .text .align 5 .global MatmulFloatNeon32Opt @@ -19,7 +20,7 @@ // r8: stride // lr: writeNhwc/writeWino -MatmulFloatNeon32Opt: +asm_function MatmulFloatNeon32Opt // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf push {r0-r8, r10, r11, lr} add sp, sp, #48 diff --git a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt12x4.S b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt12x4.S index bb765a7534..fc6a2225a1 100644 --- a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt12x4.S +++ b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt12x4.S @@ -1,4 +1,5 @@ #ifdef ENABLE_ARM32 +#include "nnacl/assembly_global.h" .text .align 5 .global MatmulFloatNeon32Opt12x4 @@ -19,7 +20,7 @@ // r8: stride // lr: OutType_C8 = 0, OutType_Nhwc = 1, OutType_TileC8 = 2 -MatmulFloatNeon32Opt12x4: +asm_function MatmulFloatNeon32Opt12x4 // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf push {r0-r8, r10, r11, lr} vpush {q4-q7} diff --git a/mindspore/lite/nnacl/assembly/arm32/MatmulInt8.S b/mindspore/lite/nnacl/assembly/arm32/MatmulInt8.S index 5756ff5bf5..5d3e20fc29 100644 --- a/mindspore/lite/nnacl/assembly/arm32/MatmulInt8.S +++ b/mindspore/lite/nnacl/assembly/arm32/MatmulInt8.S @@ -1,5 +1,6 @@ #ifdef __arm__ #ifndef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -15,7 +16,7 @@ // #0: col, #4: deep16, #8: input_sums, #12: weight_bias, #16: act_min, #20: act_max, #24: out_zp // #28: multiplier, #32: left_shift, #36: right_shift, #40: stride, #44: per_channel -MatmulInt8Neon32: +asm_function MatmulInt8Neon32 push {r0-r11, lr} vpush {q4-q7} add 
sp, sp, #116 @@ -117,7 +118,7 @@ End3: bgt PerChannel PerTensor: - // Substract input_sums + // Subtract input_sums vld1.32 {d24, d25}, [r6]! vdup.32 d20, d24[0] vdup.32 d21, d24[1] @@ -157,7 +158,7 @@ PerTensor: b AddDstZP PerChannel: - // Substract input_sums + // Subtract input_sums vld1.32 {d24, d25, d26, d27}, [r6]! vsub.s32 d28, d28, d24 vsub.s32 d29, d29, d25 diff --git a/mindspore/lite/nnacl/assembly/arm32/MatmulInt8Opt.S b/mindspore/lite/nnacl/assembly/arm32/MatmulInt8Opt.S index 5fa70921a9..03c45a17d7 100644 --- a/mindspore/lite/nnacl/assembly/arm32/MatmulInt8Opt.S +++ b/mindspore/lite/nnacl/assembly/arm32/MatmulInt8Opt.S @@ -1,5 +1,6 @@ #ifdef __arm__ #ifndef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -16,7 +17,7 @@ // #0: col, #4: deep16, #8: input_sums, #12: weight_bias, #16: act_min, #20: act_max, #24: out_zp // #28: multiplier, #32: left_shift, #36: right_shift, #40: stride, #44: per_channel, #48: filter_zp -MatmulInt8Opt: +asm_function MatmulInt8Opt push {r0-r8, r10, r11, lr} vpush {q4-q7} add sp, sp, #112 diff --git a/mindspore/lite/nnacl/assembly/arm32/MatmulWinogradFp32.S b/mindspore/lite/nnacl/assembly/arm32/MatmulWinogradFp32.S index 4300db884e..8bc5533b9e 100644 --- a/mindspore/lite/nnacl/assembly/arm32/MatmulWinogradFp32.S +++ b/mindspore/lite/nnacl/assembly/arm32/MatmulWinogradFp32.S @@ -1,4 +1,5 @@ #ifdef ENABLE_ARM32 +#include "nnacl/assembly_global.h" .text .align 5 @@ -10,7 +11,7 @@ // MatrixMultiplyWinograd(float *matix_a, float *matrix_b, float *matrix_c, int m, int k, int n, int in_channel, int c4_channel) // r0: matrix_a, r1: matrix_b, r2: matrix_c, r3: m, r4: k, r5: n, r6: in_channel, r7: c4_channel * 4 // #-56: matrix_a, #-52: matrix_b, #-48: matrix_c, #-44: m, #0: k, #4: n, #8: in_channel, #12: c4_channel * 4 -MatrixMultiplyWinograd: +asm_function MatrixMultiplyWinograd // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr" // according to 
https://stackoverflow.com/questions/53625807 // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway diff --git a/mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC4.S b/mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC4.S index f2aff94866..da9ea71f95 100644 --- a/mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC4.S +++ b/mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC4.S @@ -1,3 +1,4 @@ +#include "nnacl/assembly_global.h" .text .align 5 @@ -7,7 +8,7 @@ .type PostFuncBiasReluC4, %function #endif -PostFuncBiasReluC4: +asm_function PostFuncBiasReluC4 push {r4-r8, r10, r11, lr} add sp, sp, #32 diff --git a/mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC8.S b/mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC8.S index ae20ead629..6716129c0e 100644 --- a/mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC8.S +++ b/mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC8.S @@ -1,4 +1,5 @@ #ifdef ENABLE_ARM32 +#include "nnacl/assembly_global.h" .text .align 5 @@ -21,7 +22,7 @@ // lr oc8 loop control // r8 hw loop control -PostFuncBiasReluC8: +asm_function PostFuncBiasReluC8 push {r4-r8, r10, r11, lr} add sp, sp, #32 diff --git a/mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Peroc.S b/mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Peroc.S index 439000be86..e5f0629ed6 100644 --- a/mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Peroc.S +++ b/mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Peroc.S @@ -1,3 +1,4 @@ +#include "nnacl/assembly_global.h" .text .align 5 @@ -19,7 +20,7 @@ // r6 oc_res2 // r7 stride -PreSum4x16Int8Peroc: +asm_function PreSum4x16Int8Peroc push {r4-r11, lr} vpush {q4-q7} add sp, sp, #100 diff --git a/mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Pert.S b/mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Pert.S index 052931fa2f..15ebaa139d 100644 --- a/mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Pert.S +++ 
b/mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Pert.S @@ -1,3 +1,4 @@ +#include "nnacl/assembly_global.h" .text .align 5 @@ -15,7 +16,7 @@ // r3 co16 // r4 filter_zp -PreSum4x16Int8Pert: +asm_function PreSum4x16Int8Pert push {r4-r8, r10, r11, lr} vpush {q4-q7} add sp, sp, #96 diff --git a/mindspore/lite/nnacl/assembly/arm32/TiledC4MatmulFp32.S b/mindspore/lite/nnacl/assembly/arm32/TiledC4MatmulFp32.S index 239ef022bb..e7961e37d3 100644 --- a/mindspore/lite/nnacl/assembly/arm32/TiledC4MatmulFp32.S +++ b/mindspore/lite/nnacl/assembly/arm32/TiledC4MatmulFp32.S @@ -1,4 +1,5 @@ #ifdef ENABLE_ARM32 +#include "nnacl/assembly_global.h" .text .align 5 .global TiledC4MatmulFp32 @@ -6,7 +7,7 @@ .type TiledC4MatmulFp32, %function #endif -TiledC4MatmulFp32: +asm_function TiledC4MatmulFp32 //void TiledC4MatmulFp32(float* dst, const float* src, const float* weight, size_t cal_num, size_t ic4, size_t oc4) //x0: dst //x1: src diff --git a/mindspore/lite/nnacl/assembly/arm32/WinogradTransLeft.S b/mindspore/lite/nnacl/assembly/arm32/WinogradTransLeft.S index 3ca05a5583..8ea2bc70d1 100644 --- a/mindspore/lite/nnacl/assembly/arm32/WinogradTransLeft.S +++ b/mindspore/lite/nnacl/assembly/arm32/WinogradTransLeft.S @@ -1,4 +1,5 @@ #ifdef ENABLE_ARM32 +#include "nnacl/assembly_global.h" .text .align 5 @@ -15,7 +16,7 @@ //x4: h //x5: k //x6: length -WinogradTransLeft: +asm_function WinogradTransLeft push {r4-r11, lr} ldr r4, [sp, #36] ldr r5, [sp, #40] diff --git a/mindspore/lite/nnacl/assembly/arm32/WinogradTransRight.S b/mindspore/lite/nnacl/assembly/arm32/WinogradTransRight.S index 4d1d172911..0b1c8f9a12 100644 --- a/mindspore/lite/nnacl/assembly/arm32/WinogradTransRight.S +++ b/mindspore/lite/nnacl/assembly/arm32/WinogradTransRight.S @@ -1,4 +1,5 @@ #ifdef ENABLE_ARM32 +#include "nnacl/assembly_global.h" .text .align 5 @@ -15,7 +16,7 @@ //x4: h //x5: k //x6: length -WinogradTransRight: +asm_function WinogradTransRight push {r4-r11, lr} ldr r4, [sp, #36] ldr r5, [sp, #40] diff --git 
a/mindspore/lite/nnacl/assembly/arm64/AdderFp32.S b/mindspore/lite/nnacl/assembly/arm64/AdderFp32.S index 13fb0ace0d..985074fed6 100644 --- a/mindspore/lite/nnacl/assembly/arm64/AdderFp32.S +++ b/mindspore/lite/nnacl/assembly/arm64/AdderFp32.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 .global AdderFloatNeon64 @@ -19,7 +20,7 @@ // x8: stride // x9: writeMode -AdderFloatNeon64: +asm_function AdderFloatNeon64 sub sp, sp, #144 st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Corner.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Corner.S index 9e3d3ddd29..d7b04b15bf 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Corner.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Corner.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -12,7 +13,7 @@ // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, x6: channel, x7: relu, x8: relu6 -ConvDw3x3Corner: +asm_function ConvDw3x3Corner // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Horizontal.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Horizontal.S index b21ba18082..b28b7ab557 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Horizontal.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Horizontal.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -12,7 +13,7 @@ // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, x6: channel, x7: relu, x8: relu6 -ConvDw3x3Horizontal: +asm_function ConvDw3x3Horizontal // registers v8 ~ v15 must be preserved by a callee across 
subroutine calls, according to // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S index 527a7deb8a..b28fc16704 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -23,7 +24,7 @@ // w9: relu // w10: relu6 -ConvDw3x3Stride1: +asm_function ConvDw3x3Stride1 sub sp, sp, #128 st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S index 51c4390b5e..e77f60fd09 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -23,7 +24,7 @@ // w9: relu // w10: relu6 -ConvDw3x3Stride2: +asm_function ConvDw3x3Stride2 sub sp, sp, #128 st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Vertical.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Vertical.S index 95197a916b..b1f8de19f7 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Vertical.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Vertical.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -12,7 +13,7 @@ // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, x6: channel, x7: relu, x8: relu6 -ConvDw3x3Vertical: +asm_function ConvDw3x3Vertical // registers v8 ~ v15 must be preserved by a callee across subroutine 
calls, according to // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S index 5087d94dfb..3b46f4d810 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -31,7 +32,7 @@ // w15: acc_max // w16: per_channel -ConvDw3x3Int8Neon64: +asm_function ConvDw3x3Int8Neon64 sub sp, sp, #176 st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S index fce898a286..7ffdf0fd6f 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -14,7 +15,7 @@ // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, // x6: channel, x7: in_zp, x8: out_zp, x9: out_multiplier, x10: left_shift, x11: right_shift // x12: acc_min, x13: acc_max, x14: per_channel -ConvDw3x3Int8Corner: +asm_function ConvDw3x3Int8Corner // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S index 339ea05b77..5c1b11c919 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ 
+#include "nnacl/assembly_global.h" .text .align 5 @@ -14,7 +15,7 @@ // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, // x6: channel, x7: in_zp, x8: out_zp, x9: out_multiplier, x10: left_shift, x11: right_shift // x12: acc_min, x13: acc_max, x14: per_channel -ConvDw3x3Int8Horizontal: +asm_function ConvDw3x3Int8Horizontal // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S index 28c51d0f6e..8f843192db 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -31,7 +32,7 @@ // w15: acc_max // w16: per_channel -ConvDw3x3Int8Stride2: +asm_function ConvDw3x3Int8Stride2 sub sp, sp, #176 st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S index d1b0f02732..825aa583d8 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -14,7 +15,7 @@ // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, // x6: channel, x7: in_zp, x8: out_zp, x9: out_multiplier, x10: left_shift, x11: right_shift // x12: acc_min, x13: acc_max, x14: per_channel -ConvDw3x3Int8Vertical: +asm_function ConvDw3x3Int8Vertical // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to // 
https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Border.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Border.S index 151d054ad0..f3ce920f5d 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Border.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Border.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -12,7 +13,7 @@ // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: in_kh_step, x7: in_kw_step, // x8: kernel_w, x9: relu, x10: relu6 -ConvDwFp32Border: +asm_function ConvDwFp32Border // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Center.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Center.S index 3a59d08da0..c43932f5ec 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Center.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Center.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -13,7 +14,7 @@ // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w, // x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step // x14: relu, x15: relu6 -ConvDwFp32Center: +asm_function ConvDwFp32Center // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S index 
52891efa98..5be857a793 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -11,7 +12,7 @@ // size_t input_stride, size_t relu, size_t relu6) // x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6 -ConvDwFp32Indirect3x3: +asm_function ConvDwFp32Indirect3x3 sub sp, sp, #16 stp x19, x20, [sp], #16 diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect5x5.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect5x5.S index eb1c74a206..2ffb4a041a 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect5x5.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect5x5.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -11,7 +12,7 @@ // size_t input_stride, size_t relu, size_t relu6) // x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6 -ConvDwFp32Indirect5x5: +asm_function ConvDwFp32Indirect5x5 sub sp, sp, #160 stp x19, x20, [sp, #64] stp x21, x22, [sp, #80] diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Row.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Row.S index 3ca68cd60e..1f5c76df3d 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Row.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Row.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -12,7 +13,7 @@ // x0: output_ptr, x1: input_ptr, x2: filter_ptr, x3: num_pixels, // x4: input_channel, x5: input_step // -ConvDwFp32Row: +asm_function ConvDwFp32Row // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved 
diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Center.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Center.S index 424e1a82ae..03fd8afe0c 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Center.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Center.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -16,7 +17,7 @@ // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: weight, x6: kernel_h, x7: kernel_w, // x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step // x14: in_zp, #56: out_zp, #64: out_multiplier, #72:left_shift, #80: right_shift, #88: acc_min, #96: acc_max -ConvDwInt8Center: +asm_function ConvDwInt8Center // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4.S index d78589dbe1..2f8ee9d1dc 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -12,7 +13,7 @@ // x0: dst, x1: buffer, x2: num_pixels, x3: output_zp, x4: out_multiplier, // x5: left_shift, x6: right_shift, x7: acc_min, x8: acc_max -ConvDwInt8PostAlign4: +asm_function ConvDwInt8PostAlign4 // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4PerChannel.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4PerChannel.S index 35c2eb7dd8..b56fd6a34b 
100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4PerChannel.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4PerChannel.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -12,7 +13,7 @@ // x0: dst, x1: buffer, x2: num_pixels, x3: output_zp, x4: out_multiplier, // x5: left_shift, x6: right_shift, x7: acc_min, x8: acc_max -ConvDwInt8PostAlign4PerChannel: +asm_function ConvDwInt8PostAlign4PerChannel // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Row.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Row.S index 34749bc4a0..c15d860863 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Row.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Row.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -12,7 +13,7 @@ // x0: output_ptr, x1: input_ptr, x2: weight_ptr, x3: num_pixels, // x4: output_channel, x5: input_step, x6: input_zp // -ConvDwInt8Row: +asm_function ConvDwInt8Row // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvFp32Center.S b/mindspore/lite/nnacl/assembly/arm64/ConvFp32Center.S index 27d1201e9a..ff4ac86616 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvFp32Center.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvFp32Center.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -13,7 +14,7 @@ // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w, // x8: out_h_step, x9: block_channel, 
x10: ic4, x11: in_sh_step, x12: in_sw_step, x13: in_kh_step, x14: in_kw_step // x26: relu, x16: relu6 -ConvSwFp32Center: +asm_function ConvSwFp32Center // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved diff --git a/mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Border.S b/mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Border.S index 88e2e84ae3..31b186b8d2 100644 --- a/mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Border.S +++ b/mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Border.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -11,7 +12,7 @@ // size_t in_kh_step, size_t in_kw_step, size_t kernel_w) // x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: in_kh_step, x6: in_kw_step, x7: kernel_w -DeconvDwFp32Border: +asm_function DeconvDwFp32Border // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved diff --git a/mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Center.S b/mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Center.S index 07cd1a5cea..19601f5779 100644 --- a/mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Center.S +++ b/mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Center.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -12,7 +13,7 @@ // size_t in_kh_step, size_t in_kw_step); // x0: dst, x1: src, x2: weight, x3: height, x4: weight, x5: kernel_h, x6: kernel_w, x7: out_h_step // x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step -DeconvDwFp32Center: +asm_function DeconvDwFp32Center // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according 
to // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved diff --git a/mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Center.S b/mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Center.S index 32d402d025..8a69813657 100644 --- a/mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Center.S +++ b/mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Center.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -12,7 +13,7 @@ // size_t in_kh_step, size_t in_kw_step); // x0: dst, x1: src, x2: weight, x3: height, x4: weight, x5: kernel_h, x6: kernel_w, x7: out_h_step // x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step -DeconvDwInt8Center: +asm_function DeconvDwInt8Center // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved diff --git a/mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Post.S b/mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Post.S index e56262474d..ad3ba50ca5 100644 --- a/mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Post.S +++ b/mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Post.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -13,7 +14,7 @@ // x0: dst, x1: output_buffer, x2: bias, x3: block_channel, x4: pixel_nums, x5: out_multiplier // x6: left_shift, x7: right_shift, x8: out_zp, x9: acc_min, x10: acc_max -DeconvDwInt8Post: +asm_function DeconvDwInt8Post // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved diff --git a/mindspore/lite/nnacl/assembly/arm64/IndirectGemmInt16to32_8x4.S 
b/mindspore/lite/nnacl/assembly/arm64/IndirectGemmInt16to32_8x4.S index bfad61a362..5e63493241 100644 --- a/mindspore/lite/nnacl/assembly/arm64/IndirectGemmInt16to32_8x4.S +++ b/mindspore/lite/nnacl/assembly/arm64/IndirectGemmInt16to32_8x4.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -9,7 +10,7 @@ // void IndirectGemmInt16to32_8x4(int *output, short *input, short *weight, size_t ksize, size_t ic8, size_t oc4, size_t offset); // x0: output, x1: input, x2: weight, x3: ksize, x4: ic8, x5: oc4, x6: offset -IndirectGemmInt16to32_8x4: +asm_function IndirectGemmInt16to32_8x4 .macro INIT_ZERO dup v28.4s, wzr diff --git a/mindspore/lite/nnacl/assembly/arm64/MatVecMulFp32.S b/mindspore/lite/nnacl/assembly/arm64/MatVecMulFp32.S index 36383dfb30..88824e8aed 100644 --- a/mindspore/lite/nnacl/assembly/arm64/MatVecMulFp32.S +++ b/mindspore/lite/nnacl/assembly/arm64/MatVecMulFp32.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 .global MatVecMulFp32 @@ -15,7 +16,7 @@ // w5: depth // w6: col -MatVecMulFp32: +asm_function MatVecMulFp32 sub sp, sp, #128 st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32.S b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32.S index a7d39105ab..5c7024ea94 100644 --- a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32.S +++ b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 .global MatmulFloatNeon64 @@ -19,7 +20,7 @@ // w17: stride // w13: c8_nhwc_c4 -MatmulFloatNeon64: +asm_function MatmulFloatNeon64 sub sp, sp, #128 st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S index 3d85651687..7a103239b5 100644 --- 
a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S +++ b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 .global MatmulFloatNeon64Opt @@ -19,7 +20,7 @@ // x8: stride // x9: writeMode -MatmulFloatNeon64Opt: +asm_function MatmulFloatNeon64Opt sub sp, sp, #144 st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulInt8.S b/mindspore/lite/nnacl/assembly/arm64/MatmulInt8.S index 9974e5c771..883d07fb09 100644 --- a/mindspore/lite/nnacl/assembly/arm64/MatmulInt8.S +++ b/mindspore/lite/nnacl/assembly/arm64/MatmulInt8.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 .global MatmulInt8Neon64 @@ -29,7 +30,7 @@ // w24: stride // w27: filter_peroc -MatmulInt8Neon64: +asm_function MatmulInt8Neon64 sub sp, sp, #208 st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulInt8Opt.S b/mindspore/lite/nnacl/assembly/arm64/MatmulInt8Opt.S index 90da4924ac..c08607df9e 100644 --- a/mindspore/lite/nnacl/assembly/arm64/MatmulInt8Opt.S +++ b/mindspore/lite/nnacl/assembly/arm64/MatmulInt8Opt.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 .global MatmulInt8Opt @@ -28,7 +29,7 @@ // x15: filter_peroc // x28: filter_zp -MatmulInt8Opt: +asm_function MatmulInt8Opt sub sp, sp, #208 st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulR4Int8.S b/mindspore/lite/nnacl/assembly/arm64/MatmulR4Int8.S index 3ae66901b8..3f6cf4644b 100644 --- a/mindspore/lite/nnacl/assembly/arm64/MatmulR4Int8.S +++ b/mindspore/lite/nnacl/assembly/arm64/MatmulR4Int8.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 .global 
MatMulR4Int8Neon64 @@ -18,7 +19,7 @@ // x6: a_sums // x7: bias -MatMulR4Int8Neon64: +asm_function MatMulR4Int8Neon64 sub sp, sp, #128 st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulWinogradFp32.S b/mindspore/lite/nnacl/assembly/arm64/MatmulWinogradFp32.S index e0437210d9..a378f1527e 100644 --- a/mindspore/lite/nnacl/assembly/arm64/MatmulWinogradFp32.S +++ b/mindspore/lite/nnacl/assembly/arm64/MatmulWinogradFp32.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -9,7 +10,7 @@ // MatrixMultiplyWinograd(float *matix_a, float *matrix_b, float *matrix_c, int m, int k, int n, int in_channel, int c4_channel) // x0: matrix_a, x1: matrix_b, x2: matrix_c, x3: m, x4: k, x5: n, x6: in_channel, x7: c4_channel -MatrixMultiplyWinograd: +asm_function MatrixMultiplyWinograd // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved diff --git a/mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC4.S b/mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC4.S index 3ba57222fa..63794dd4d1 100644 --- a/mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC4.S +++ b/mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC4.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -23,7 +24,7 @@ // w13 hw loop control -PostFuncBiasReluC4: +asm_function PostFuncBiasReluC4 movi v26.4s, #6 scvtf v26.4s, v26.4s diff --git a/mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC8.S b/mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC8.S index 02c125de07..05bde14ccf 100644 --- a/mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC8.S +++ b/mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC8.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include 
"nnacl/assembly_global.h" .text .align 5 @@ -21,7 +22,7 @@ // w10 oc8 loop control // w13 hw loop control -PostFuncBiasReluC8: +asm_function PostFuncBiasReluC8 sub sp, sp, #128 st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 diff --git a/mindspore/lite/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S b/mindspore/lite/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S index cb0256fe27..270c1aefc1 100644 --- a/mindspore/lite/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S +++ b/mindspore/lite/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -42,7 +43,7 @@ // w15 oc4 loop control // w16 hw loop control -PostFuncInt8C4Neon64: +asm_function PostFuncInt8C4Neon64 ldr w8, [sp] ldr w9, [sp, #8] diff --git a/mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Peroc.S b/mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Peroc.S index a55d1d46c0..374c5d60de 100644 --- a/mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Peroc.S +++ b/mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Peroc.S @@ -1,5 +1,6 @@ - #ifdef __aarch64__ +#include "nnacl/assembly_global.h" + .text .align 5 //.p2align 5,,15 @@ -20,7 +21,7 @@ // w6 oc_res4 // w7 stride -PreSum4x16Int8Peroc: +asm_function PreSum4x16Int8Peroc mov w8, #0 RowLoop: diff --git a/mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Pert.S b/mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Pert.S index d4c61a2242..af9d4b4061 100644 --- a/mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Pert.S +++ b/mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Pert.S @@ -1,5 +1,5 @@ - #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 //.p2align 5,,15 @@ -16,7 +16,7 @@ // w3 co16 // w4 filter_zp -PreSum4x16Int8Pert: +asm_function PreSum4x16Int8Pert dup v17.4s, w4 mov w5, #0 diff --git a/mindspore/lite/nnacl/assembly/arm64/TiledC4MatmulFp32.S b/mindspore/lite/nnacl/assembly/arm64/TiledC4MatmulFp32.S index 
c964366975..5e931e73b1 100644 --- a/mindspore/lite/nnacl/assembly/arm64/TiledC4MatmulFp32.S +++ b/mindspore/lite/nnacl/assembly/arm64/TiledC4MatmulFp32.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -7,7 +8,7 @@ .type TiledC4MatmulFp32, %function #endif -TiledC4MatmulFp32: +asm_function TiledC4MatmulFp32 //void TiledC4MatmulFp32(float* dst, const float* src, const float* weight, size_t ic4, size_t cal_num, size_t oc4) //x0: dst //x1: src diff --git a/mindspore/lite/nnacl/assembly/arm64/WinogradTransLeft.S b/mindspore/lite/nnacl/assembly/arm64/WinogradTransLeft.S index ec3a30e7c1..84a0ed9ab4 100644 --- a/mindspore/lite/nnacl/assembly/arm64/WinogradTransLeft.S +++ b/mindspore/lite/nnacl/assembly/arm64/WinogradTransLeft.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -7,7 +8,7 @@ .type WinogradTransLeft, %function #endif -WinogradTransLeft: +asm_function WinogradTransLeft //void WinogradTransLeft(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length); //x0: S //x1: B diff --git a/mindspore/lite/nnacl/assembly/arm64/WinogradTransRight.S b/mindspore/lite/nnacl/assembly/arm64/WinogradTransRight.S index ff65ef0122..7b96ed500e 100644 --- a/mindspore/lite/nnacl/assembly/arm64/WinogradTransRight.S +++ b/mindspore/lite/nnacl/assembly/arm64/WinogradTransRight.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -7,7 +8,7 @@ .type WinogradTransRight, %function #endif -WinogradTransRight: +asm_function WinogradTransRight //void WinogradTransRight(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length); //x0: S //x1: B diff --git a/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Border.S b/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Border.S index b4558e2262..dc0e98bad1 100644 --- a/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Border.S +++ b/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Border.S @@ 
-1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -13,7 +14,7 @@ // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: in_kh_step, x7: in_kw_step, // x8: kernel_w, x9: relu, x10: relu6 -ConvDwFp16Border: +asm_function ConvDwFp16Border // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved diff --git a/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Center.S b/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Center.S index 7d98767ba3..74cc4c4bf7 100644 --- a/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Center.S +++ b/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Center.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -13,7 +14,7 @@ // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: weight, x6: kernel_h, x7: kernel_w, // x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step // x14: relu, x15: relu6 -ConvDwFp16Center: +asm_function ConvDwFp16Center // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved diff --git a/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Row.S b/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Row.S index 6cc0a2cf40..324f0303ae 100644 --- a/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Row.S +++ b/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Row.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -12,7 +13,7 @@ // x0: output_ptr, x1: input_ptr, x2: filter_ptr, x3: num_pixels, // x4: input_channel, x5: input_step // -ConvDwFp16Row: +asm_function ConvDwFp16Row // registers v8 ~ v15 must be preserved by a callee across 
subroutine calls, according to // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved diff --git a/mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Border.S b/mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Border.S index 73d5232233..a807b5300a 100644 --- a/mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Border.S +++ b/mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Border.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -11,7 +12,7 @@ // size_t in_kh_step, size_t in_kw_step, size_t kernel_w) // x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: in_kh_step, x6: in_kw_step, x7: kernel_w -DeconvDwFp16Border: +asm_function DeconvDwFp16Border // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved diff --git a/mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Center.S b/mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Center.S index 1087856cb5..c0ec1a6bbe 100644 --- a/mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Center.S +++ b/mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Center.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -12,7 +13,7 @@ // size_t in_kh_step, size_t in_kw_step); // x0: dst, x1: src, x2: weight, x3: height, x4: weight, x5: kernel_h, x6: kernel_w, x7: out_h_step // x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step -DeconvDwFp16Center: +asm_function DeconvDwFp16Center // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved diff --git a/mindspore/lite/nnacl/assembly/fp16/Float16ToFloat32.S 
b/mindspore/lite/nnacl/assembly/fp16/Float16ToFloat32.S index 2cb3219589..650caa89fa 100644 --- a/mindspore/lite/nnacl/assembly/fp16/Float16ToFloat32.S +++ b/mindspore/lite/nnacl/assembly/fp16/Float16ToFloat32.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -9,7 +10,7 @@ // void Float16ToFloat32(const float16_t *input, float *output, int number); // x0: input, x1: output, x2: number -Float16ToFloat32: +asm_function Float16ToFloat32 // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved diff --git a/mindspore/lite/nnacl/assembly/fp16/Float32ToFloat16.S b/mindspore/lite/nnacl/assembly/fp16/Float32ToFloat16.S index a321b16a34..7a9c794838 100644 --- a/mindspore/lite/nnacl/assembly/fp16/Float32ToFloat16.S +++ b/mindspore/lite/nnacl/assembly/fp16/Float32ToFloat16.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -9,7 +10,7 @@ // void Float32ToFloat16(const float *input, float16_t output, int number); // x0: input, x1: output, x2: number -Float32ToFloat16: +asm_function Float32ToFloat16 // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved diff --git a/mindspore/lite/nnacl/assembly/fp16/IndirectGemmFp16_16x8.S b/mindspore/lite/nnacl/assembly/fp16/IndirectGemmFp16_16x8.S index 3c50aa362c..e1f2498278 100644 --- a/mindspore/lite/nnacl/assembly/fp16/IndirectGemmFp16_16x8.S +++ b/mindspore/lite/nnacl/assembly/fp16/IndirectGemmFp16_16x8.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -12,7 +13,7 @@ // x0: output, x1: input, x2: weight, x3: bias, x4: step, x5: ic4, x6: oc8, x7: offset, // x8:mode, 
x9: writeC4, x10:relu, x11: relu6 // compute 8 channel for 16 outputs -IndirectGemmFp16_16x8: +asm_function IndirectGemmFp16_16x8 .macro INIT_BIAS dup v16.4s, wzr @@ -41,7 +42,7 @@ IndirectGemmFp16_16x8: // x19 ~ r29 should be also preserved // whereas our coding style do not permit such amount of parameters sub sp, sp, #128 - // performance between storing 4 registers at the same time and seperatly storing them on in-order cores + // performance between storing 4 registers at the same time and separately storing them on in-order cores // is not tested yet st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 @@ -86,7 +87,7 @@ IndirectGemmStart: fmla v19.8h, v9.8h, v1.h[5] // load input for output 9-16 // input cache should be refreshed after loading - // ATTENTION: advance is prefered, but advancing too much may lead to invalid prefetching + // ATTENTION: advance is preferred, but advancing too much may lead to invalid prefetching ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x12], #64 // last 2 steps for output 1 and 3 fmla v16.8h, v10.8h, v0.h[2] @@ -295,7 +296,7 @@ IndirectGemmStart: cmp x6, #7 beq Write7 b Write8 - // prefetching is not prefered while writing results in spite of cache missings + // prefetching is not preferred while writing results in spite of cache missing // you could try prfm pstl2strm // there are almost no benefits observed though Write1: diff --git a/mindspore/lite/nnacl/assembly/fp16/MatVecMulFp16.S b/mindspore/lite/nnacl/assembly/fp16/MatVecMulFp16.S index 9ba601a797..5a7adbb76b 100644 --- a/mindspore/lite/nnacl/assembly/fp16/MatVecMulFp16.S +++ b/mindspore/lite/nnacl/assembly/fp16/MatVecMulFp16.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 .global MatVecMulFp16Neon64 @@ -15,7 +16,7 @@ // w5: depth // w6: col -MatVecMulFp16Neon64: +asm_function MatVecMulFp16Neon64 sub sp, sp, #128 st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 st1 {v12.8h, v13.8h, v14.8h, v15.8h}, 
[sp], #64 diff --git a/mindspore/lite/nnacl/assembly/fp16/MatmulFp16.S b/mindspore/lite/nnacl/assembly/fp16/MatmulFp16.S index fd2622d210..bc3644ad21 100644 --- a/mindspore/lite/nnacl/assembly/fp16/MatmulFp16.S +++ b/mindspore/lite/nnacl/assembly/fp16/MatmulFp16.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 .global MatmulFp16Neon64 @@ -19,7 +20,7 @@ // w17: stride // w13: writeC8 -MatmulFp16Neon64: +asm_function MatmulFp16Neon64 sub sp, sp, #128 st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 diff --git a/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S b/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S index 20285677fe..503a0f6f23 100644 --- a/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S +++ b/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 .global MatmulFp16Neon64Opt @@ -19,7 +20,7 @@ // x8: stride // x9: writeMode -MatmulFp16Neon64Opt: +asm_function MatmulFp16Neon64Opt sub sp, sp, #80 st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 stp x19, x20, [sp], #16 diff --git a/mindspore/lite/nnacl/assembly/fp16/MatmulWinogradFp16.S b/mindspore/lite/nnacl/assembly/fp16/MatmulWinogradFp16.S index 38f869c8ee..daaed9163a 100644 --- a/mindspore/lite/nnacl/assembly/fp16/MatmulWinogradFp16.S +++ b/mindspore/lite/nnacl/assembly/fp16/MatmulWinogradFp16.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -9,7 +10,7 @@ // MatrixMultiplyWinogradFp16(float16_t *matix_a, float16_t *matrix_b, float16_t *matrix_c, int m, int k, int n, int in_channel) // x0: matrix_a, x1: matrix_b, x2: matrix_c, x3: m, x4: k, x5: n, x6: in_channel -MatrixMultiplyWinogradFp16: +asm_function MatrixMultiplyWinogradFp16 // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to // 
https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers // x19 ~ x29 should be also preserved diff --git a/mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC4Fp16.S b/mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC4Fp16.S index e8bc7f9fd0..2bf2f786b4 100644 --- a/mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC4Fp16.S +++ b/mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC4Fp16.S @@ -1,3 +1,4 @@ +#include "nnacl/assembly_global.h" .text .align 5 @@ -13,7 +14,7 @@ // w3 oc4div w4 oc4mod w5 plane_size // x6 plane_stride x7 relu_type -PostFuncBiasReluC4Fp16: +asm_function PostFuncBiasReluC4Fp16 movi v26.4h, #6 scvtf v26.4h, v26.4h diff --git a/mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC8Fp16.S b/mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC8Fp16.S index 6127435102..dad91b9332 100644 --- a/mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC8Fp16.S +++ b/mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC8Fp16.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 @@ -21,7 +22,7 @@ // w10 oc8 loop control // w13 hw loop control -PostFuncBiasReluC8Fp16: +asm_function PostFuncBiasReluC8Fp16 movi v26.8h, #0x46, lsl #8 dup v27.8h, wzr mov w10, #0 diff --git a/mindspore/lite/nnacl/assembly/fp16/TiledC4MatmulFp16.S b/mindspore/lite/nnacl/assembly/fp16/TiledC4MatmulFp16.S index af23543225..720ee3e1ac 100644 --- a/mindspore/lite/nnacl/assembly/fp16/TiledC4MatmulFp16.S +++ b/mindspore/lite/nnacl/assembly/fp16/TiledC4MatmulFp16.S @@ -1,3 +1,4 @@ +#include "nnacl/assembly_global.h" .text .align 5 @@ -6,7 +7,7 @@ .type TiledC4MatmulFp16, %function #endif -TiledC4MatmulFp16: +asm_function TiledC4MatmulFp16 sub sp, sp, #128 st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 diff --git a/mindspore/lite/nnacl/assembly/fp16/WinogradTransLeftFp16.S b/mindspore/lite/nnacl/assembly/fp16/WinogradTransLeftFp16.S index ca13c5a7e3..df1d88750e 100644 --- 
a/mindspore/lite/nnacl/assembly/fp16/WinogradTransLeftFp16.S +++ b/mindspore/lite/nnacl/assembly/fp16/WinogradTransLeftFp16.S @@ -1,3 +1,4 @@ +#include "nnacl/assembly_global.h" .text .align 5 @@ -6,7 +7,7 @@ .type WinogradTransLeftFp16, %function #endif -WinogradTransLeftFp16: +asm_function WinogradTransLeftFp16 sub sp, sp, #32 stp x19, x20, [sp], #32 diff --git a/mindspore/lite/nnacl/assembly/fp16/WinogradTransRightFp16.S b/mindspore/lite/nnacl/assembly/fp16/WinogradTransRightFp16.S index d3f5860c33..c889803691 100644 --- a/mindspore/lite/nnacl/assembly/fp16/WinogradTransRightFp16.S +++ b/mindspore/lite/nnacl/assembly/fp16/WinogradTransRightFp16.S @@ -1,3 +1,4 @@ +#include "nnacl/assembly_global.h" .text .align 5 @@ -6,7 +7,7 @@ .type WinogradTransRightFp16, %function #endif -WinogradTransRightFp16: +asm_function WinogradTransRightFp16 mov x8, #8 // 4 * sizeof(float16) mul x8, x6, x8 diff --git a/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S b/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S index 077131ba99..38a38433b1 100644 --- a/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S +++ b/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 .global MatmulInt8DpNeon64 @@ -29,7 +30,7 @@ // w24: stride // w27: filter_peroc -MatmulInt8DpNeon64: +asm_function MatmulInt8DpNeon64 sub sp, sp, #208 st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 diff --git a/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8Opt.S b/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8Opt.S index ee276f01bc..fc3ef28b86 100644 --- a/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8Opt.S +++ b/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8Opt.S @@ -1,4 +1,5 @@ #ifdef __aarch64__ +#include "nnacl/assembly_global.h" .text .align 5 .global MatmulInt8DpOpt @@ -28,7 +29,7 @@ // x15: filter_peroc // x28: filter_zp -MatmulInt8DpOpt: +asm_function MatmulInt8DpOpt sub sp, sp, #208 st1 
{v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
     st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
diff --git a/mindspore/lite/nnacl/assembly/opt/MatmulOptR4Int8.S b/mindspore/lite/nnacl/assembly/opt/MatmulOptR4Int8.S
index be158bd9ed..03342a3986 100644
--- a/mindspore/lite/nnacl/assembly/opt/MatmulOptR4Int8.S
+++ b/mindspore/lite/nnacl/assembly/opt/MatmulOptR4Int8.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
     .text
     .align 5
     .global MatMulOptR4Int8Neon64
@@ -18,7 +19,7 @@
 // x6: a_sums
 // x7: bias
 
-MatMulOptR4Int8Neon64:
+asm_function MatMulOptR4Int8Neon64
     sub sp, sp, #128
     st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
     st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
diff --git a/mindspore/lite/nnacl/assembly_global.h b/mindspore/lite/nnacl/assembly_global.h
new file mode 100644
index 0000000000..d739f2240b
--- /dev/null
+++ b/mindspore/lite/nnacl/assembly_global.h
@@ -0,0 +1,42 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_NNACL_ASSEMBLY_GLOBAL_H
+#define MINDSPORE_LITE_NNACL_ASSEMBLY_GLOBAL_H
+
+// clang-format off
+// Assembler directives are line-oriented: keep one directive or label per line.
+//
+// asm_function fname
+//   Emits the global entry label for the function fname.
+//   - __APPLE__: the symbol is exported with a leading underscore.
+//   - otherwise: the symbol is exported unchanged; when __ELF__ is defined it
+//     is additionally marked hidden and typed as a function.
+.macro asm_function fname
+#ifdef __APPLE__
+.globl _\fname
+_\fname:
+#else
+.global \fname
+#ifdef __ELF__
+.hidden \fname
+.type \fname, %function
+#endif
+\fname:
+#endif
+.endm
+// clang-format on
+
+#endif  // MINDSPORE_LITE_NNACL_ASSEMBLY_GLOBAL_H