From 52945ce8268b5e7652cffd2c6730f23c698f343e Mon Sep 17 00:00:00 2001
From: yefeng <yefeng24@huawei.com>
Date: Thu, 11 Mar 2021 18:25:50 +0800
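Subject: [PATCH] fix_assembly_for_ios_5

Make the hand-written NNACL assembly kernels assemble for iOS.

Each assembly source listed below (arm32, arm64, fp16 and opt) now
includes the new header nnacl/assembly_global.h, and its entry point is
declared with "asm_function <Name>" in place of a bare "<Name>:" label.
The header is created by this patch; it is expected to provide the
asm_function macro so that the exported symbol can be emitted in the
form the target toolchain requires (to be confirmed against the header
itself, which is added at the end of this patch).

The existing ".global" / ".type" directives in the touched files are
left as they are; only the label line and the include are changed.

A few comment typos in the touched files are corrected as well
("prefered" to "preferred", "Substract" to "Subtract").
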
---
 .../assembly/arm32/ConvDw3x3Int8BorderPixel.S |  3 +-
 .../nnacl/assembly/arm32/ConvDwFp32Border.S   |  3 +-
 .../nnacl/assembly/arm32/ConvDwFp32Center.S   |  3 +-
 .../lite/nnacl/assembly/arm32/ConvDwFp32Row.S |  3 +-
 .../nnacl/assembly/arm32/ConvDwInt8Center.S   |  3 +-
 .../assembly/arm32/ConvDwInt8PostAlign4.S     |  3 +-
 .../arm32/ConvDwInt8PostAlign4PerChannel.S    |  3 +-
 .../lite/nnacl/assembly/arm32/ConvDwInt8Row.S |  3 +-
 .../nnacl/assembly/arm32/DeconvDwFp32Center.S |  3 +-
 .../nnacl/assembly/arm32/DeconvDwInt8Center.S |  3 +-
 .../nnacl/assembly/arm32/DeconvDwInt8Post.S   |  3 +-
 .../arm32/IndirectGemmInt16to32_8x4.S         |  3 +-
 .../assembly/arm32/IndirectGemmInt8_2x4.S     |  5 +--
 .../lite/nnacl/assembly/arm32/MatVecMulFp32.S |  3 +-
 .../lite/nnacl/assembly/arm32/MatmulFp32.S    |  3 +-
 .../lite/nnacl/assembly/arm32/MatmulFp32Opt.S |  3 +-
 .../nnacl/assembly/arm32/MatmulFp32Opt12x4.S  |  3 +-
 .../lite/nnacl/assembly/arm32/MatmulInt8.S    |  7 ++--
 .../lite/nnacl/assembly/arm32/MatmulInt8Opt.S |  3 +-
 .../nnacl/assembly/arm32/MatmulWinogradFp32.S |  3 +-
 .../nnacl/assembly/arm32/PostFuncBiasReluC4.S |  3 +-
 .../nnacl/assembly/arm32/PostFuncBiasReluC8.S |  3 +-
 .../assembly/arm32/PreSum4x16Int8Peroc.S      |  3 +-
 .../nnacl/assembly/arm32/PreSum4x16Int8Pert.S |  3 +-
 .../nnacl/assembly/arm32/TiledC4MatmulFp32.S  |  3 +-
 .../nnacl/assembly/arm32/WinogradTransLeft.S  |  3 +-
 .../nnacl/assembly/arm32/WinogradTransRight.S |  3 +-
 .../lite/nnacl/assembly/arm64/AdderFp32.S     |  3 +-
 .../assembly/arm64/ConvDw3x3Fp32Corner.S      |  3 +-
 .../assembly/arm64/ConvDw3x3Fp32Horizontal.S  |  3 +-
 .../assembly/arm64/ConvDw3x3Fp32Stride1.S     |  3 +-
 .../assembly/arm64/ConvDw3x3Fp32Stride2.S     |  3 +-
 .../assembly/arm64/ConvDw3x3Fp32Vertical.S    |  3 +-
 .../lite/nnacl/assembly/arm64/ConvDw3x3Int8.S |  3 +-
 .../assembly/arm64/ConvDw3x3Int8Corner.S      |  3 +-
 .../assembly/arm64/ConvDw3x3Int8Horizontal.S  |  3 +-
 .../assembly/arm64/ConvDw3x3Int8Stride2.S     |  3 +-
 .../assembly/arm64/ConvDw3x3Int8Vertical.S    |  3 +-
 .../nnacl/assembly/arm64/ConvDwFp32Border.S   |  3 +-
 .../nnacl/assembly/arm64/ConvDwFp32Center.S   |  3 +-
 .../assembly/arm64/ConvDwFp32Indirect3x3.S    |  3 +-
 .../assembly/arm64/ConvDwFp32Indirect5x5.S    |  3 +-
 .../lite/nnacl/assembly/arm64/ConvDwFp32Row.S |  3 +-
 .../nnacl/assembly/arm64/ConvDwInt8Center.S   |  3 +-
 .../assembly/arm64/ConvDwInt8PostAlign4.S     |  3 +-
 .../arm64/ConvDwInt8PostAlign4PerChannel.S    |  3 +-
 .../lite/nnacl/assembly/arm64/ConvDwInt8Row.S |  3 +-
 .../nnacl/assembly/arm64/ConvFp32Center.S     |  3 +-
 .../nnacl/assembly/arm64/DeconvDwFp32Border.S |  3 +-
 .../nnacl/assembly/arm64/DeconvDwFp32Center.S |  3 +-
 .../nnacl/assembly/arm64/DeconvDwInt8Center.S |  3 +-
 .../nnacl/assembly/arm64/DeconvDwInt8Post.S   |  3 +-
 .../arm64/IndirectGemmInt16to32_8x4.S         |  3 +-
 .../lite/nnacl/assembly/arm64/MatVecMulFp32.S |  3 +-
 .../lite/nnacl/assembly/arm64/MatmulFp32.S    |  3 +-
 .../lite/nnacl/assembly/arm64/MatmulFp32Opt.S |  3 +-
 .../lite/nnacl/assembly/arm64/MatmulInt8.S    |  3 +-
 .../lite/nnacl/assembly/arm64/MatmulInt8Opt.S |  3 +-
 .../lite/nnacl/assembly/arm64/MatmulR4Int8.S  |  3 +-
 .../nnacl/assembly/arm64/MatmulWinogradFp32.S |  3 +-
 .../nnacl/assembly/arm64/PostFuncBiasReluC4.S |  3 +-
 .../nnacl/assembly/arm64/PostFuncBiasReluC8.S |  3 +-
 .../assembly/arm64/PostFuncInt8C4Neon64.S     |  3 +-
 .../assembly/arm64/PreSum4x16Int8Peroc.S      |  5 +--
 .../nnacl/assembly/arm64/PreSum4x16Int8Pert.S |  4 +--
 .../nnacl/assembly/arm64/TiledC4MatmulFp32.S  |  3 +-
 .../nnacl/assembly/arm64/WinogradTransLeft.S  |  3 +-
 .../nnacl/assembly/arm64/WinogradTransRight.S |  3 +-
 .../nnacl/assembly/fp16/ConvDwFp16Border.S    |  3 +-
 .../nnacl/assembly/fp16/ConvDwFp16Center.S    |  3 +-
 .../lite/nnacl/assembly/fp16/ConvDwFp16Row.S  |  3 +-
 .../nnacl/assembly/fp16/DeconvDwFp16Border.S  |  3 +-
 .../nnacl/assembly/fp16/DeconvDwFp16Center.S  |  3 +-
 .../nnacl/assembly/fp16/Float16ToFloat32.S    |  3 +-
 .../nnacl/assembly/fp16/Float32ToFloat16.S    |  3 +-
 .../assembly/fp16/IndirectGemmFp16_16x8.S     |  9 +++---
 .../lite/nnacl/assembly/fp16/MatVecMulFp16.S  |  3 +-
 .../lite/nnacl/assembly/fp16/MatmulFp16.S     |  3 +-
 .../lite/nnacl/assembly/fp16/MatmulFp16Opt.S  |  3 +-
 .../nnacl/assembly/fp16/MatmulWinogradFp16.S  |  3 +-
 .../assembly/fp16/PostFuncBiasReluC4Fp16.S    |  3 +-
 .../assembly/fp16/PostFuncBiasReluC8Fp16.S    |  3 +-
 .../nnacl/assembly/fp16/TiledC4MatmulFp16.S   |  3 +-
 .../assembly/fp16/WinogradTransLeftFp16.S     |  3 +-
 .../assembly/fp16/WinogradTransRightFp16.S    |  3 +-
 .../lite/nnacl/assembly/opt/MatmulDpInt8.S    |  3 +-
 .../lite/nnacl/assembly/opt/MatmulDpInt8Opt.S |  3 +-
 .../lite/nnacl/assembly/opt/MatmulOptR4Int8.S |  3 +-
 mindspore/lite/nnacl/assembly_global.h        | 32 +++++++++++++++++++
 89 files changed, 215 insertions(+), 96 deletions(-)
 create mode 100644 mindspore/lite/nnacl/assembly_global.h

diff --git a/mindspore/lite/nnacl/assembly/arm32/ConvDw3x3Int8BorderPixel.S b/mindspore/lite/nnacl/assembly/arm32/ConvDw3x3Int8BorderPixel.S
index 3ce8b9fb35..c5732590d8 100644
--- a/mindspore/lite/nnacl/assembly/arm32/ConvDw3x3Int8BorderPixel.S
+++ b/mindspore/lite/nnacl/assembly/arm32/ConvDw3x3Int8BorderPixel.S
@@ -1,5 +1,6 @@
 #ifdef __arm__
 #ifndef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -17,7 +18,7 @@
 // r0: dst, r1: src, r2: weight, r3: bias, r4: height, r5: width, r6: in_kh_step, r7: in_kw_step,
 // r8: channel, r9: in_zp,  r10: out_zp, r11: out_multiplier, r12: left_shift, r13: right_shift
 // r14: acc_min, r15: acc_max
-ConvDw3x3Int8BorderPixel:
+asm_function ConvDw3x3Int8BorderPixel
     // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr"
     // according to https://stackoverflow.com/questions/53625807
     // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway
diff --git a/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Border.S b/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Border.S
index 5bf15c59e9..9f9d53cfc3 100644
--- a/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Border.S
+++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Border.S
@@ -1,4 +1,5 @@
 #ifdef ENABLE_ARM32
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -11,7 +12,7 @@
 //                       size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6)
 // r0: dst, r1: src, r2: weight, r3: bias, r4: height, r5: width, r6: in_kh_step, r7: in_kw_step,
 // r8: kernel_w, r9: relu, r10: relu6
-ConvDwFp32Border:
+asm_function ConvDwFp32Border
     // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
     push {r4-r12, lr}
     vpush {q4-q7}
diff --git a/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Center.S b/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Center.S
index a90d2fa014..ffcee6f380 100644
--- a/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Center.S
+++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Center.S
@@ -1,5 +1,6 @@
 #ifdef __arm__
 #ifndef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -14,7 +15,7 @@
 // r0: dst, r1: src, r2: weight, r3: bias, #0: height, #4: width, #8: kernel_h, #12: kernel_w,
 // #16: out_h_step, #20: block_channel, #24: in_sh_step, #28: in_sw_step, #32: in_kh_step,#36: in_kw_step
 // #40: relu, #44: relu6
-ConvDwFp32Center:
+asm_function ConvDwFp32Center
     // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr"
     // according to https://stackoverflow.com/questions/53625807
     // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway
diff --git a/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Row.S b/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Row.S
index f77c64b773..30a8693dcb 100644
--- a/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Row.S
+++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Row.S
@@ -1,4 +1,5 @@
 #ifdef ENABLE_ARM32
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -11,7 +12,7 @@
 //                   size_t num_pixels, size_t input_channel, size_t input_step)
 // r0: output_ptr, r1: input_ptr, r2: filter_ptr, r3: num_pixels,
 // r4: input_channel, r5: input_step
-ConvDwFp32Row:
+asm_function ConvDwFp32Row
     // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
 
     push {r4-r6, r8, r10, r11}
diff --git a/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Center.S b/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Center.S
index 66045743d7..73d43abb45 100644
--- a/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Center.S
+++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Center.S
@@ -1,5 +1,6 @@
 #ifdef __arm__
 #ifndef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -15,7 +16,7 @@
 // #-48: dst, #-44: src, #-40: weight, #-36: bias, #0: height, #4: width, #8: kernel_h, #12: kernel_w,
 // #16: out_h_step, #20: block_channel, #24: in_sh_step, #28: in_sw_step, #32: in_kh_step, #36: in_kw_step
 // #40: in_zp, #44: out_zp, #48: out_multiplier, #52: left_shift, #56: right_shift, #60:acc_min, #64: acc_max
-ConvDwInt8Center:
+asm_function ConvDwInt8Center
 // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr"
 // according to https://stackoverflow.com/questions/53625807
 // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway
diff --git a/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4.S b/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4.S
index b9d0e9b92a..3367ab390f 100644
--- a/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4.S
+++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4.S
@@ -1,5 +1,6 @@
 #ifdef __arm__
 #ifndef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -13,7 +14,7 @@
 // r0: dst, r1: buffer, r2: num_pixels, r3: output_zp, r4: out_multiplier,
 // r5: left_shift, r6: right_shift, r7: acc_min, r8: acc_max
 
-ConvDwInt8PostAlign4:
+asm_function ConvDwInt8PostAlign4
     // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr"
     // according to https://stackoverflow.com/questions/53625807
     // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway
diff --git a/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4PerChannel.S b/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4PerChannel.S
index d6740355f4..270c959ee8 100644
--- a/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4PerChannel.S
+++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4PerChannel.S
@@ -1,5 +1,6 @@
 #ifdef __arm__
 #ifndef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -13,7 +14,7 @@
 // r0: dst, r1: buffer, r2: num_pixels, r3: output_zp, r4: out_multiplier,
 // r5: left_shift, r6: right_shift, r7: acc_min, r8: acc_max
 
-ConvDwInt8PostAlign4PerChannel:
+asm_function ConvDwInt8PostAlign4PerChannel
     // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr"
     // according to https://stackoverflow.com/questions/53625807
     // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway
diff --git a/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Row.S b/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Row.S
index 9b5bfa1242..48ddccfc4f 100644
--- a/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Row.S
+++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Row.S
@@ -1,5 +1,6 @@
 #ifdef __arm__
 #ifndef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -13,7 +14,7 @@
 // r0: output_ptr, r1: input_ptr, r2: weight_ptr, r3: num_pixels,
 // r4: output_channel, r5: input_step, r6: input_zp,
 
-ConvDwInt8Row:
+asm_function ConvDwInt8Row
     // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr"
     // according to https://stackoverflow.com/questions/53625807
     // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway
diff --git a/mindspore/lite/nnacl/assembly/arm32/DeconvDwFp32Center.S b/mindspore/lite/nnacl/assembly/arm32/DeconvDwFp32Center.S
index 06c38740a5..d0244746bc 100644
--- a/mindspore/lite/nnacl/assembly/arm32/DeconvDwFp32Center.S
+++ b/mindspore/lite/nnacl/assembly/arm32/DeconvDwFp32Center.S
@@ -1,5 +1,6 @@
 #ifdef __arm__
 #ifndef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -13,7 +14,7 @@
 //                      size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
 // r0: dst, r1: src, r2: weight, r3: height, r4: width, #52: kernel_h, #56: kernel_w, #60: out_h_step
 // #64: block_channel, #68: in_sh_step, #72: in_sw_step, #76: in_kh_step, #80: in_kw_step
-DeconvDwFp32Center:
+asm_function DeconvDwFp32Center
     // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr"
     // according to https://stackoverflow.com/questions/53625807
     // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway
diff --git a/mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Center.S b/mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Center.S
index 68b23a01e5..5db46b7a35 100644
--- a/mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Center.S
+++ b/mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Center.S
@@ -1,5 +1,6 @@
 #ifdef __arm__
 #ifndef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -13,7 +14,7 @@
 //                         size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
 // r0: dst, r1: src, r2: weight, r3: height, r4: width, #52: kernel_h, #56: kernel_w, #60: out_h_step
 // #64: block_channel, #68: in_sh_step, #72: in_sw_step, #76: in_kh_step, #80: in_kw_step
-DeconvDwInt8Center:
+asm_function DeconvDwInt8Center
     // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr"
     // according to https://stackoverflow.com/questions/53625807
     // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway
diff --git a/mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Post.S b/mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Post.S
index 86a3cd29cd..3722126b9a 100644
--- a/mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Post.S
+++ b/mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Post.S
@@ -1,5 +1,6 @@
 #ifdef __arm__
 #ifndef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -14,7 +15,7 @@
 // r0: dst, r1: output_buffer, r2: bias, r3: block_channel, r4: pixel_nums, r5: out_multiplier,
 // r6: left_shift, r7: right_shift, r8: out_zp, r9: acc_min, r10: acc_max
 
-DeconvDwInt8Post:
+asm_function DeconvDwInt8Post
     // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr"
     // according to https://stackoverflow.com/questions/53625807
     // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway
diff --git a/mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt16to32_8x4.S b/mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt16to32_8x4.S
index eaf11da242..f8abe1c7a7 100644
--- a/mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt16to32_8x4.S
+++ b/mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt16to32_8x4.S
@@ -1,4 +1,5 @@
 #ifdef ENABLE_ARM32
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -9,7 +10,7 @@
 
 // void IndirectGemmInt16to32_8x4(int *output, short *input, short *weight, size_t kszie, size_t ic8, size_t oc4, size_t offset);
 // r0: output, r1: input, r2: weight, r3: kszie, r4: ic8, r5: oc4, r6: offset
-IndirectGemmInt16to32_8x4:
+asm_function IndirectGemmInt16to32_8x4
 
     .macro INIT_ZERO
         // we could also use "vmov.s32 q12, #0" to initialize q12 by 0
diff --git a/mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt8_2x4.S b/mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt8_2x4.S
index c3cf470ab0..caea16f738 100644
--- a/mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt8_2x4.S
+++ b/mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt8_2x4.S
@@ -1,5 +1,6 @@
 #ifdef __arm__
 #ifndef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -13,7 +14,7 @@
 // int32_t *shift_before, int32_t *shift_after, size_t asymmetric, size_t per_channel, size_t per_channel_offset);
 // r0: output, r1: input, r2: weight, r3: bias, r4: kSize, r5: ic4, r6: oc, r7: offset
 // r8: input_sum, r10: act_min, r11: act_max, r10: out_zp, r11: out_multiplier, r10: shift_before, r11: shift_after
-IndirectGemmInt8_2x4:
+asm_function IndirectGemmInt8_2x4
 
     .macro INIT_BIAS
         veor q10, q10, q10
@@ -221,7 +222,7 @@ IndirectGemmInt8_2x4:
                 vqmovn.s32 d31, q12
                 vqmovn.s16 d0, q15
 
-            // prefetching is not prefered while writing results in spite of cache missings
+            // prefetching is not preferred while writing results in spite of cache missing
             // you could try prfm pstl2strm
             WriteStart:
                 cmp r6, #1
diff --git a/mindspore/lite/nnacl/assembly/arm32/MatVecMulFp32.S b/mindspore/lite/nnacl/assembly/arm32/MatVecMulFp32.S
index c06301c941..4569c9599d 100644
--- a/mindspore/lite/nnacl/assembly/arm32/MatVecMulFp32.S
+++ b/mindspore/lite/nnacl/assembly/arm32/MatVecMulFp32.S
@@ -1,5 +1,6 @@
 #ifdef __arm__
 #ifndef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -17,7 +18,7 @@
 // r5: depth
 // r6: col
 
-MatVecMulFp32:
+asm_function MatVecMulFp32
   // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
   push {r0-r8, r10, r11, lr}
   add sp, sp, #48
diff --git a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32.S b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32.S
index 8ae2f10ef6..7ad42d5df8 100644
--- a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32.S
+++ b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32.S
@@ -1,4 +1,5 @@
 #ifdef ENABLE_ARM32
+#include "nnacl/assembly_global.h"
     .text
     .align 5
     .global MatmulFloatNeon32
@@ -19,7 +20,7 @@
 // r8: stride
 // lr: writeNhwc/writeWino
 
-MatmulFloatNeon32:
+asm_function MatmulFloatNeon32
     // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
     push {r0-r8, r10, r11, lr}
     add sp, sp, #48
diff --git a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S
index 20cfa58a8c..4a13bc92aa 100644
--- a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S
+++ b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S
@@ -1,4 +1,5 @@
 #ifdef ENABLE_ARM32
+#include "nnacl/assembly_global.h"
     .text
     .align 5
     .global MatmulFloatNeon32Opt
@@ -19,7 +20,7 @@
 // r8: stride
 // lr: writeNhwc/writeWino
 
-MatmulFloatNeon32Opt:
+asm_function MatmulFloatNeon32Opt
     // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
     push {r0-r8, r10, r11, lr}
     add sp, sp, #48
diff --git a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt12x4.S b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt12x4.S
index bb765a7534..fc6a2225a1 100644
--- a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt12x4.S
+++ b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt12x4.S
@@ -1,4 +1,5 @@
 #ifdef ENABLE_ARM32
+#include "nnacl/assembly_global.h"
     .text
     .align 5
     .global MatmulFloatNeon32Opt12x4
@@ -19,7 +20,7 @@
 // r8: stride
 // lr: OutType_C8 = 0, OutType_Nhwc = 1, OutType_TileC8 = 2
 
-MatmulFloatNeon32Opt12x4:
+asm_function MatmulFloatNeon32Opt12x4
     // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
     push {r0-r8, r10, r11, lr}
     vpush {q4-q7}
diff --git a/mindspore/lite/nnacl/assembly/arm32/MatmulInt8.S b/mindspore/lite/nnacl/assembly/arm32/MatmulInt8.S
index 5756ff5bf5..5d3e20fc29 100644
--- a/mindspore/lite/nnacl/assembly/arm32/MatmulInt8.S
+++ b/mindspore/lite/nnacl/assembly/arm32/MatmulInt8.S
@@ -1,5 +1,6 @@
 #ifdef __arm__
 #ifndef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -15,7 +16,7 @@
 // #0: col, #4: deep16, #8: input_sums, #12: weight_bias, #16: act_min, #20: act_max, #24: out_zp
 // #28: multiplier, #32: left_shift, #36: right_shift, #40: stride, #44: per_channel
 
-MatmulInt8Neon32:
+asm_function MatmulInt8Neon32
   push {r0-r11, lr}
   vpush {q4-q7}
   add sp, sp, #116
@@ -117,7 +118,7 @@ End3:
   bgt PerChannel 
 
 PerTensor:
-  // Substract input_sums
+  // Subtract input_sums
   vld1.32 {d24, d25}, [r6]!
   vdup.32 d20, d24[0]
   vdup.32 d21, d24[1]
@@ -157,7 +158,7 @@ PerTensor:
   b AddDstZP
 
 PerChannel:
-  // Substract input_sums
+  // Subtract input_sums
   vld1.32 {d24, d25, d26, d27}, [r6]!
   vsub.s32 d28, d28, d24
   vsub.s32 d29, d29, d25
diff --git a/mindspore/lite/nnacl/assembly/arm32/MatmulInt8Opt.S b/mindspore/lite/nnacl/assembly/arm32/MatmulInt8Opt.S
index 5fa70921a9..03c45a17d7 100644
--- a/mindspore/lite/nnacl/assembly/arm32/MatmulInt8Opt.S
+++ b/mindspore/lite/nnacl/assembly/arm32/MatmulInt8Opt.S
@@ -1,5 +1,6 @@
 #ifdef __arm__
 #ifndef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -16,7 +17,7 @@
 // #0: col, #4: deep16, #8: input_sums, #12: weight_bias, #16: act_min, #20: act_max, #24: out_zp
 // #28: multiplier, #32: left_shift, #36: right_shift, #40: stride, #44: per_channel, #48: filter_zp
 
-MatmulInt8Opt:
+asm_function MatmulInt8Opt
     push {r0-r8, r10, r11, lr}
     vpush {q4-q7}
     add sp, sp, #112
diff --git a/mindspore/lite/nnacl/assembly/arm32/MatmulWinogradFp32.S b/mindspore/lite/nnacl/assembly/arm32/MatmulWinogradFp32.S
index 4300db884e..8bc5533b9e 100644
--- a/mindspore/lite/nnacl/assembly/arm32/MatmulWinogradFp32.S
+++ b/mindspore/lite/nnacl/assembly/arm32/MatmulWinogradFp32.S
@@ -1,4 +1,5 @@
 #ifdef ENABLE_ARM32
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -10,7 +11,7 @@
 // MatrixMultiplyWinograd(float *matix_a, float *matrix_b, float *matrix_c, int m, int k, int n, int in_channel, int c4_channel)
     // r0: matrix_a, r1: matrix_b, r2: matrix_c, r3: m, r4: k, r5: n, r6: in_channel, r7: c4_channel * 4
     // #-56: matrix_a, #-52: matrix_b, #-48: matrix_c, #-44: m, #0: k, #4: n, #8: in_channel, #12: c4_channel * 4
-MatrixMultiplyWinograd:
+asm_function MatrixMultiplyWinograd
     // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr"
     // according to https://stackoverflow.com/questions/53625807
     // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway
diff --git a/mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC4.S b/mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC4.S
index f2aff94866..da9ea71f95 100644
--- a/mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC4.S
+++ b/mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC4.S
@@ -1,3 +1,4 @@
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -7,7 +8,7 @@
 .type PostFuncBiasReluC4, %function
 #endif
 
-PostFuncBiasReluC4:
+asm_function PostFuncBiasReluC4
   push {r4-r8, r10, r11, lr}
   add sp, sp, #32
 
diff --git a/mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC8.S b/mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC8.S
index ae20ead629..6716129c0e 100644
--- a/mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC8.S
+++ b/mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC8.S
@@ -1,4 +1,5 @@
 #ifdef ENABLE_ARM32
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -21,7 +22,7 @@
 // lr  oc8 loop control
 // r8  hw  loop control
 
-PostFuncBiasReluC8:
+asm_function PostFuncBiasReluC8
   push {r4-r8, r10, r11, lr}
   add sp, sp, #32
 
diff --git a/mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Peroc.S b/mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Peroc.S
index 439000be86..e5f0629ed6 100644
--- a/mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Peroc.S
+++ b/mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Peroc.S
@@ -1,3 +1,4 @@
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -19,7 +20,7 @@
 // r6 oc_res2
 // r7 stride
 
-PreSum4x16Int8Peroc:
+asm_function PreSum4x16Int8Peroc
   push {r4-r11, lr}
   vpush {q4-q7}
   add sp, sp, #100
diff --git a/mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Pert.S b/mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Pert.S
index 052931fa2f..15ebaa139d 100644
--- a/mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Pert.S
+++ b/mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Pert.S
@@ -1,3 +1,4 @@
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -15,7 +16,7 @@
 // r3 co16
 // r4 filter_zp
 
-PreSum4x16Int8Pert:
+asm_function PreSum4x16Int8Pert
   push {r4-r8, r10, r11, lr}
   vpush {q4-q7}
   add sp, sp, #96
diff --git a/mindspore/lite/nnacl/assembly/arm32/TiledC4MatmulFp32.S b/mindspore/lite/nnacl/assembly/arm32/TiledC4MatmulFp32.S
index 239ef022bb..e7961e37d3 100644
--- a/mindspore/lite/nnacl/assembly/arm32/TiledC4MatmulFp32.S
+++ b/mindspore/lite/nnacl/assembly/arm32/TiledC4MatmulFp32.S
@@ -1,4 +1,5 @@
 #ifdef ENABLE_ARM32
+#include "nnacl/assembly_global.h"
     .text
     .align 5
     .global TiledC4MatmulFp32
@@ -6,7 +7,7 @@
     .type TiledC4MatmulFp32, %function
 #endif
 
-TiledC4MatmulFp32:
+asm_function TiledC4MatmulFp32
 //void TiledC4MatmulFp32(float* dst, const float* src, const float* weight, size_t cal_num, size_t ic4, size_t oc4)
 //x0: dst
 //x1: src
diff --git a/mindspore/lite/nnacl/assembly/arm32/WinogradTransLeft.S b/mindspore/lite/nnacl/assembly/arm32/WinogradTransLeft.S
index 3ca05a5583..8ea2bc70d1 100644
--- a/mindspore/lite/nnacl/assembly/arm32/WinogradTransLeft.S
+++ b/mindspore/lite/nnacl/assembly/arm32/WinogradTransLeft.S
@@ -1,4 +1,5 @@
 #ifdef ENABLE_ARM32
+#include "nnacl/assembly_global.h"
 
     .text
     .align 5
@@ -15,7 +16,7 @@
 //x4: h
 //x5: k
 //x6: length
-WinogradTransLeft:
+asm_function WinogradTransLeft
     push {r4-r11, lr}
     ldr r4, [sp, #36]
     ldr r5, [sp, #40]
diff --git a/mindspore/lite/nnacl/assembly/arm32/WinogradTransRight.S b/mindspore/lite/nnacl/assembly/arm32/WinogradTransRight.S
index 4d1d172911..0b1c8f9a12 100644
--- a/mindspore/lite/nnacl/assembly/arm32/WinogradTransRight.S
+++ b/mindspore/lite/nnacl/assembly/arm32/WinogradTransRight.S
@@ -1,4 +1,5 @@
 #ifdef ENABLE_ARM32
+#include "nnacl/assembly_global.h"
 
     .text
     .align 5
@@ -15,7 +16,7 @@
 //x4: h
 //x5: k
 //x6: length
-WinogradTransRight:
+asm_function WinogradTransRight
     push {r4-r11, lr}
     ldr r4, [sp, #36]
     ldr r5, [sp, #40]
diff --git a/mindspore/lite/nnacl/assembly/arm64/AdderFp32.S b/mindspore/lite/nnacl/assembly/arm64/AdderFp32.S
index 13fb0ace0d..985074fed6 100644
--- a/mindspore/lite/nnacl/assembly/arm64/AdderFp32.S
+++ b/mindspore/lite/nnacl/assembly/arm64/AdderFp32.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
     .text
     .align 5
     .global AdderFloatNeon64
@@ -19,7 +20,7 @@
 // x8: stride
 // x9: writeMode
 
-AdderFloatNeon64:
+asm_function AdderFloatNeon64
     sub sp, sp, #144
     st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
     st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Corner.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Corner.S
index 9e3d3ddd29..d7b04b15bf 100644
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Corner.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Corner.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -12,7 +13,7 @@
                      
 // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, x6: channel, x7: relu,  x8: relu6
 
-ConvDw3x3Corner:
+asm_function ConvDw3x3Corner
     // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Horizontal.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Horizontal.S
index b21ba18082..b28b7ab557 100644
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Horizontal.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Horizontal.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -12,7 +13,7 @@
                      
 // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, x6: channel, x7: relu,  x8: relu6
 
-ConvDw3x3Horizontal:
+asm_function ConvDw3x3Horizontal
     // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S
index 527a7deb8a..b28fc16704 100644
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -23,7 +24,7 @@
 // w9: relu
 // w10: relu6
 
-ConvDw3x3Stride1:
+asm_function ConvDw3x3Stride1
     sub sp, sp, #128
     st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
     st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S
index 51c4390b5e..e77f60fd09 100644
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -23,7 +24,7 @@
 // w9: relu
 // w10: relu6
 
-ConvDw3x3Stride2:
+asm_function ConvDw3x3Stride2
     sub sp, sp, #128
     st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
     st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Vertical.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Vertical.S
index 95197a916b..b1f8de19f7 100644
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Vertical.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Vertical.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -12,7 +13,7 @@
                      
 // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, x6: channel, x7: relu,  x8: relu6
 
-ConvDw3x3Vertical:
+asm_function ConvDw3x3Vertical
     // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S
index 5087d94dfb..3b46f4d810 100644
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -31,7 +32,7 @@
 // w15: acc_max
 // w16: per_channel
 
-ConvDw3x3Int8Neon64:
+asm_function ConvDw3x3Int8Neon64
   sub sp, sp, #176
   st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
   st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S
index fce898a286..7ffdf0fd6f 100644
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -14,7 +15,7 @@
 // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step,
 // x6: channel, x7: in_zp,  x8: out_zp, x9: out_multiplier, x10: left_shift, x11: right_shift
 // x12: acc_min, x13: acc_max, x14: per_channel
-ConvDw3x3Int8Corner:
+asm_function ConvDw3x3Int8Corner
     // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S
index 339ea05b77..5c1b11c919 100644
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -14,7 +15,7 @@
 // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step,
 // x6: channel, x7: in_zp,  x8: out_zp, x9: out_multiplier, x10: left_shift, x11: right_shift
 // x12: acc_min, x13: acc_max, x14: per_channel
-ConvDw3x3Int8Horizontal:
+asm_function ConvDw3x3Int8Horizontal
     // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S
index 28c51d0f6e..8f843192db 100644
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -31,7 +32,7 @@
 // w15: acc_max
 // w16: per_channel
 
-ConvDw3x3Int8Stride2:
+asm_function ConvDw3x3Int8Stride2
     sub sp, sp, #176
     st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
     st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S
index d1b0f02732..825aa583d8 100644
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -14,7 +15,7 @@
 // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step,
 // x6: channel, x7: in_zp,  x8: out_zp, x9: out_multiplier, x10: left_shift, x11: right_shift
 // x12: acc_min, x13: acc_max, x14: per_channel
-ConvDw3x3Int8Vertical:
+asm_function ConvDw3x3Int8Vertical
     // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Border.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Border.S
index 151d054ad0..f3ce920f5d 100644
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Border.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Border.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -12,7 +13,7 @@
 
 // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: in_kh_step, x7: in_kw_step,
 // x8: kernel_w, x9: relu, x10: relu6
-ConvDwFp32Border:
+asm_function ConvDwFp32Border
     // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Center.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Center.S
index 3a59d08da0..c43932f5ec 100644
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Center.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Center.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -13,7 +14,7 @@
 // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w,
 // x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step
 // x14: relu, x15: relu6
-ConvDwFp32Center:
+asm_function ConvDwFp32Center
     // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S
index 52891efa98..5be857a793 100644
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -11,7 +12,7 @@
 //                            size_t input_stride, size_t relu, size_t relu6)
 // x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6
 
-ConvDwFp32Indirect3x3:
+asm_function ConvDwFp32Indirect3x3
     sub sp, sp, #16
     stp x19, x20, [sp], #16
 
diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect5x5.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect5x5.S
index eb1c74a206..2ffb4a041a 100644
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect5x5.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect5x5.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -11,7 +12,7 @@
 //                            size_t input_stride, size_t relu, size_t relu6)
 // x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6
 
-ConvDwFp32Indirect5x5:
+asm_function ConvDwFp32Indirect5x5
     sub sp, sp, #160
     stp x19, x20, [sp, #64]
     stp x21, x22, [sp, #80]
diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Row.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Row.S
index 3ca68cd60e..1f5c76df3d 100644
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Row.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Row.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -12,7 +13,7 @@
 // x0: output_ptr, x1: input_ptr, x2: filter_ptr, x3: num_pixels,
 // x4: input_channel, x5: input_step
 //
-ConvDwFp32Row:
+asm_function ConvDwFp32Row
     // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Center.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Center.S
index 424e1a82ae..03fd8afe0c 100644
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Center.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Center.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -16,7 +17,7 @@
 // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: weight, x6: kernel_h, x7: kernel_w, 
 // x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step
 // x14: in_zp, #56: out_zp, #64: out_multiplier, #72:left_shift, #80: right_shift, #88: acc_min, #96: acc_max
-ConvDwInt8Center:
+asm_function ConvDwInt8Center
     // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4.S
index d78589dbe1..2f8ee9d1dc 100644
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -12,7 +13,7 @@
 // x0: dst, x1: buffer, x2: num_pixels, x3: output_zp, x4: out_multiplier,
 // x5: left_shift, x6: right_shift, x7: acc_min, x8: acc_max
 
-ConvDwInt8PostAlign4:
+asm_function ConvDwInt8PostAlign4
     // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4PerChannel.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4PerChannel.S
index 35c2eb7dd8..b56fd6a34b 100644
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4PerChannel.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4PerChannel.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -12,7 +13,7 @@
 // x0: dst, x1: buffer, x2: num_pixels, x3: output_zp, x4: out_multiplier,
 // x5: left_shift, x6: right_shift, x7: acc_min, x8: acc_max
 
-ConvDwInt8PostAlign4PerChannel:
+asm_function ConvDwInt8PostAlign4PerChannel
     // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Row.S b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Row.S
index 34749bc4a0..c15d860863 100644
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Row.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Row.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -12,7 +13,7 @@
 // x0: output_ptr, x1: input_ptr, x2: weight_ptr, x3: num_pixels,
 // x4: output_channel, x5: input_step, x6: input_zp
 //
-ConvDwInt8Row:
+asm_function ConvDwInt8Row
     // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvFp32Center.S b/mindspore/lite/nnacl/assembly/arm64/ConvFp32Center.S
index 27d1201e9a..ff4ac86616 100644
--- a/mindspore/lite/nnacl/assembly/arm64/ConvFp32Center.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvFp32Center.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -13,7 +14,7 @@
 // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w,
 // x8: out_h_step, x9: block_channel, x10: ic4, x11: in_sh_step, x12: in_sw_step, x13: in_kh_step, x14: in_kw_step
 // x26: relu, x16: relu6
-ConvSwFp32Center:
+asm_function ConvSwFp32Center
     // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
diff --git a/mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Border.S b/mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Border.S
index 88e2e84ae3..31b186b8d2 100644
--- a/mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Border.S
+++ b/mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Border.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -11,7 +12,7 @@
 //                         size_t in_kh_step, size_t in_kw_step, size_t kernel_w)
 
 // x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: in_kh_step, x6: in_kw_step, x7: kernel_w
-DeconvDwFp32Border:
+asm_function DeconvDwFp32Border
     // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
diff --git a/mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Center.S b/mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Center.S
index 07cd1a5cea..19601f5779 100644
--- a/mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Center.S
+++ b/mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Center.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -12,7 +13,7 @@
 //                      size_t in_kh_step, size_t in_kw_step);
 // x0: dst, x1: src, x2: weight, x3: height, x4: weight, x5: kernel_h, x6: kernel_w, x7: out_h_step
 // x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step
-DeconvDwFp32Center:
+asm_function DeconvDwFp32Center
     // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
diff --git a/mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Center.S b/mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Center.S
index 32d402d025..8a69813657 100644
--- a/mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Center.S
+++ b/mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Center.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -12,7 +13,7 @@
 //                      size_t in_kh_step, size_t in_kw_step);
 // x0: dst, x1: src, x2: weight, x3: height, x4: weight, x5: kernel_h, x6: kernel_w, x7: out_h_step
 // x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step
-DeconvDwInt8Center:
+asm_function DeconvDwInt8Center
     // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
diff --git a/mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Post.S b/mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Post.S
index e56262474d..ad3ba50ca5 100644
--- a/mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Post.S
+++ b/mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Post.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -13,7 +14,7 @@
 // x0: dst, x1: output_buffer, x2: bias, x3: block_channel, x4: pixel_nums, x5: out_multiplier
 // x6: left_shift, x7: right_shift, x8: out_zp, x9: acc_min, x10: acc_max
 
-DeconvDwInt8Post:
+asm_function DeconvDwInt8Post
     // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
diff --git a/mindspore/lite/nnacl/assembly/arm64/IndirectGemmInt16to32_8x4.S b/mindspore/lite/nnacl/assembly/arm64/IndirectGemmInt16to32_8x4.S
index bfad61a362..5e63493241 100644
--- a/mindspore/lite/nnacl/assembly/arm64/IndirectGemmInt16to32_8x4.S
+++ b/mindspore/lite/nnacl/assembly/arm64/IndirectGemmInt16to32_8x4.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -9,7 +10,7 @@
 
 // void IndirectGemmInt16to32_8x4(int *output, short *input, short *weight, size_t ksize, size_t ic8, size_t oc4, size_t offset);
 // x0: output, x1: input, x2: weight, x3: ksize, x4: ic8, x5: oc4, x6: offset
-IndirectGemmInt16to32_8x4:
+asm_function IndirectGemmInt16to32_8x4
 
     .macro INIT_ZERO
         dup v28.4s, wzr
diff --git a/mindspore/lite/nnacl/assembly/arm64/MatVecMulFp32.S b/mindspore/lite/nnacl/assembly/arm64/MatVecMulFp32.S
index 36383dfb30..88824e8aed 100644
--- a/mindspore/lite/nnacl/assembly/arm64/MatVecMulFp32.S
+++ b/mindspore/lite/nnacl/assembly/arm64/MatVecMulFp32.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
     .text
     .align 5
     .global MatVecMulFp32
@@ -15,7 +16,7 @@
 // w5: depth
 // w6: col
 
-MatVecMulFp32:
+asm_function MatVecMulFp32
   sub sp, sp, #128
   st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
   st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32.S b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32.S
index a7d39105ab..5c7024ea94 100644
--- a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32.S
+++ b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
     .text
     .align 5
     .global MatmulFloatNeon64
@@ -19,7 +20,7 @@
 // w17: stride
 // w13: c8_nhwc_c4
 
-MatmulFloatNeon64:
+asm_function MatmulFloatNeon64
   sub sp, sp, #128
   st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
   st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S
index 3d85651687..7a103239b5 100644
--- a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S
+++ b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
     .text
     .align 5
     .global MatmulFloatNeon64Opt
@@ -19,7 +20,7 @@
 // x8: stride
 // x9: writeMode
 
-MatmulFloatNeon64Opt:
+asm_function MatmulFloatNeon64Opt
     sub sp, sp, #144
     st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
     st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulInt8.S b/mindspore/lite/nnacl/assembly/arm64/MatmulInt8.S
index 9974e5c771..883d07fb09 100644
--- a/mindspore/lite/nnacl/assembly/arm64/MatmulInt8.S
+++ b/mindspore/lite/nnacl/assembly/arm64/MatmulInt8.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
     .text
     .align 5
     .global MatmulInt8Neon64
@@ -29,7 +30,7 @@
 // w24: stride
 // w27: filter_peroc
 
-MatmulInt8Neon64:
+asm_function MatmulInt8Neon64
   sub sp, sp, #208
   st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
   st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulInt8Opt.S b/mindspore/lite/nnacl/assembly/arm64/MatmulInt8Opt.S
index 90da4924ac..c08607df9e 100644
--- a/mindspore/lite/nnacl/assembly/arm64/MatmulInt8Opt.S
+++ b/mindspore/lite/nnacl/assembly/arm64/MatmulInt8Opt.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
     .text
     .align 5
     .global MatmulInt8Opt
@@ -28,7 +29,7 @@
 // x15: filter_peroc
 // x28: filter_zp
 
-MatmulInt8Opt:
+asm_function MatmulInt8Opt
     sub sp, sp, #208
     st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
     st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulR4Int8.S b/mindspore/lite/nnacl/assembly/arm64/MatmulR4Int8.S
index 3ae66901b8..3f6cf4644b 100644
--- a/mindspore/lite/nnacl/assembly/arm64/MatmulR4Int8.S
+++ b/mindspore/lite/nnacl/assembly/arm64/MatmulR4Int8.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
     .text
     .align 5
     .global MatMulR4Int8Neon64
@@ -18,7 +19,7 @@
 // x6: a_sums
 // x7: bias
 
-MatMulR4Int8Neon64:
+asm_function MatMulR4Int8Neon64
   sub sp, sp, #128
   st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
   st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
diff --git a/mindspore/lite/nnacl/assembly/arm64/MatmulWinogradFp32.S b/mindspore/lite/nnacl/assembly/arm64/MatmulWinogradFp32.S
index e0437210d9..a378f1527e 100644
--- a/mindspore/lite/nnacl/assembly/arm64/MatmulWinogradFp32.S
+++ b/mindspore/lite/nnacl/assembly/arm64/MatmulWinogradFp32.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -9,7 +10,7 @@
 
 // MatrixMultiplyWinograd(float *matix_a, float *matrix_b, float *matrix_c, int m, int k, int n, int in_channel, int c4_channel)
                // x0: matrix_a, x1: matrix_b, x2: matrix_c, x3: m, x4: k, x5: n, x6: in_channel, x7: c4_channel
-MatrixMultiplyWinograd:
+asm_function MatrixMultiplyWinograd
     // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
diff --git a/mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC4.S b/mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC4.S
index 3ba57222fa..63794dd4d1 100644
--- a/mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC4.S
+++ b/mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC4.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
     .text
     .align 5
@@ -23,7 +24,7 @@
 // w13  hw  loop control
 
 
-PostFuncBiasReluC4:
+asm_function PostFuncBiasReluC4
 
   movi v26.4s, #6
   scvtf v26.4s, v26.4s
diff --git a/mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC8.S b/mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC8.S
index 02c125de07..05bde14ccf 100644
--- a/mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC8.S
+++ b/mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC8.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
     .text
     .align 5
@@ -21,7 +22,7 @@
 // w10  oc8 loop control
 // w13  hw  loop control
 
-PostFuncBiasReluC8:
+asm_function PostFuncBiasReluC8
   sub sp, sp, #128
   st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
   st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
diff --git a/mindspore/lite/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S b/mindspore/lite/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S
index cb0256fe27..270c1aefc1 100644
--- a/mindspore/lite/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S
+++ b/mindspore/lite/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
     .text
     .align 5
@@ -42,7 +43,7 @@
 // w15  oc4 loop control
 // w16  hw  loop control
 
-PostFuncInt8C4Neon64:
+asm_function PostFuncInt8C4Neon64
 
   ldr w8, [sp]
   ldr w9, [sp, #8]
diff --git a/mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Peroc.S b/mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Peroc.S
index a55d1d46c0..374c5d60de 100644
--- a/mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Peroc.S
+++ b/mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Peroc.S
@@ -1,5 +1,6 @@
-
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
+
     .text
     .align 5
     //.p2align 5,,15
@@ -20,7 +21,7 @@
 // w6 oc_res4
 // w7 stride
 
-PreSum4x16Int8Peroc:
+asm_function PreSum4x16Int8Peroc
  mov w8, #0
 
 RowLoop:
diff --git a/mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Pert.S b/mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Pert.S
index d4c61a2242..af9d4b4061 100644
--- a/mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Pert.S
+++ b/mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Pert.S
@@ -1,5 +1,5 @@
-
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
     .text
     .align 5
     //.p2align 5,,15
@@ -16,7 +16,7 @@
 // w3 co16
 // w4 filter_zp
 
-PreSum4x16Int8Pert:
+asm_function PreSum4x16Int8Pert
   dup v17.4s, w4
   mov w5, #0
 
diff --git a/mindspore/lite/nnacl/assembly/arm64/TiledC4MatmulFp32.S b/mindspore/lite/nnacl/assembly/arm64/TiledC4MatmulFp32.S
index c964366975..5e931e73b1 100644
--- a/mindspore/lite/nnacl/assembly/arm64/TiledC4MatmulFp32.S
+++ b/mindspore/lite/nnacl/assembly/arm64/TiledC4MatmulFp32.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
     .text
     .align 5
@@ -7,7 +8,7 @@
     .type TiledC4MatmulFp32, %function
 #endif
 
-TiledC4MatmulFp32:
+asm_function TiledC4MatmulFp32
 //void TiledC4MatmulFp32(float* dst, const float* src, const float* weight, size_t ic4, size_t cal_num, size_t oc4)
 //x0: dst
 //x1: src
diff --git a/mindspore/lite/nnacl/assembly/arm64/WinogradTransLeft.S b/mindspore/lite/nnacl/assembly/arm64/WinogradTransLeft.S
index ec3a30e7c1..84a0ed9ab4 100644
--- a/mindspore/lite/nnacl/assembly/arm64/WinogradTransLeft.S
+++ b/mindspore/lite/nnacl/assembly/arm64/WinogradTransLeft.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
     .text
     .align 5
@@ -7,7 +8,7 @@
     .type WinogradTransLeft, %function
 #endif
 
-WinogradTransLeft:
+asm_function WinogradTransLeft
 //void WinogradTransLeft(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length);
 //x0: S
 //x1: B
diff --git a/mindspore/lite/nnacl/assembly/arm64/WinogradTransRight.S b/mindspore/lite/nnacl/assembly/arm64/WinogradTransRight.S
index ff65ef0122..7b96ed500e 100644
--- a/mindspore/lite/nnacl/assembly/arm64/WinogradTransRight.S
+++ b/mindspore/lite/nnacl/assembly/arm64/WinogradTransRight.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
     .text
     .align 5
@@ -7,7 +8,7 @@
     .type WinogradTransRight, %function
 #endif
 
-WinogradTransRight:
+asm_function WinogradTransRight
 //void WinogradTransRight(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length);
 //x0: S
 //x1: B
diff --git a/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Border.S b/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Border.S
index b4558e2262..dc0e98bad1 100644
--- a/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Border.S
+++ b/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Border.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -13,7 +14,7 @@
 
 // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: in_kh_step, x7: in_kw_step,
 // x8: kernel_w, x9: relu, x10: relu6
-ConvDwFp16Border:
+asm_function ConvDwFp16Border
     // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
diff --git a/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Center.S b/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Center.S
index 7d98767ba3..74cc4c4bf7 100644
--- a/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Center.S
+++ b/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Center.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -13,7 +14,7 @@
 // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: weight, x6: kernel_h, x7: kernel_w, 
 // x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step
 // x14: relu, x15: relu6
-ConvDwFp16Center:
+asm_function ConvDwFp16Center
     // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
diff --git a/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Row.S b/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Row.S
index 6cc0a2cf40..324f0303ae 100644
--- a/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Row.S
+++ b/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Row.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -12,7 +13,7 @@
 // x0: output_ptr, x1: input_ptr, x2: filter_ptr, x3: num_pixels,
 // x4: input_channel, x5: input_step
 //
-ConvDwFp16Row:
+asm_function ConvDwFp16Row
     // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
diff --git a/mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Border.S b/mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Border.S
index 73d5232233..a807b5300a 100644
--- a/mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Border.S
+++ b/mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Border.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -11,7 +12,7 @@
 //                         size_t in_kh_step, size_t in_kw_step, size_t kernel_w)
 
 // x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: in_kh_step, x6: in_kw_step, x7: kernel_w
-DeconvDwFp16Border:
+asm_function DeconvDwFp16Border
     // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
diff --git a/mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Center.S b/mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Center.S
index 1087856cb5..c0ec1a6bbe 100644
--- a/mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Center.S
+++ b/mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Center.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -12,7 +13,7 @@
 //                      size_t in_kh_step, size_t in_kw_step);
 // x0: dst, x1: src, x2: weight, x3: height, x4: weight, x5: kernel_h, x6: kernel_w, x7: out_h_step
 // x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step
-DeconvDwFp16Center:
+asm_function DeconvDwFp16Center
     // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
diff --git a/mindspore/lite/nnacl/assembly/fp16/Float16ToFloat32.S b/mindspore/lite/nnacl/assembly/fp16/Float16ToFloat32.S
index 2cb3219589..650caa89fa 100644
--- a/mindspore/lite/nnacl/assembly/fp16/Float16ToFloat32.S
+++ b/mindspore/lite/nnacl/assembly/fp16/Float16ToFloat32.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -9,7 +10,7 @@
 
 // void Float16ToFloat32(const float16_t *input, float *output, int number);
 // x0: input, x1: output, x2: number
-Float16ToFloat32:
+asm_function Float16ToFloat32
     // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
diff --git a/mindspore/lite/nnacl/assembly/fp16/Float32ToFloat16.S b/mindspore/lite/nnacl/assembly/fp16/Float32ToFloat16.S
index a321b16a34..7a9c794838 100644
--- a/mindspore/lite/nnacl/assembly/fp16/Float32ToFloat16.S
+++ b/mindspore/lite/nnacl/assembly/fp16/Float32ToFloat16.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -9,7 +10,7 @@
 
 // void Float32ToFloat16(const float *input, float16_t output, int number);
 // x0: input, x1: output, x2: number
-Float32ToFloat16:
+asm_function Float32ToFloat16
     // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
diff --git a/mindspore/lite/nnacl/assembly/fp16/IndirectGemmFp16_16x8.S b/mindspore/lite/nnacl/assembly/fp16/IndirectGemmFp16_16x8.S
index 3c50aa362c..e1f2498278 100644
--- a/mindspore/lite/nnacl/assembly/fp16/IndirectGemmFp16_16x8.S
+++ b/mindspore/lite/nnacl/assembly/fp16/IndirectGemmFp16_16x8.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -12,7 +13,7 @@
 // x0: output, x1: input, x2: weight, x3: bias, x4: step, x5: ic4, x6: oc8, x7: offset, 
 // x8:mode, x9: writeC4, x10:relu, x11: relu6
 // compute 8 channel for 16 outputs
-IndirectGemmFp16_16x8:
+asm_function IndirectGemmFp16_16x8
 
     .macro INIT_BIAS
         dup v16.4s, wzr
@@ -41,7 +42,7 @@ IndirectGemmFp16_16x8:
     // x19 ~ r29 should be also preserved
     // whereas our coding style do not permit such amount of parameters
     sub sp, sp, #128
-    // performance between storing 4 registers at the same time and seperatly storing them on in-order cores
+    // performance between storing 4 registers at the same time and separately storing them on in-order cores
     // is not tested yet
     st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
     st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
@@ -86,7 +87,7 @@ IndirectGemmStart:
             fmla v19.8h, v9.8h, v1.h[5]
             // load input  for output 9-16
             // input cache should be refreshed after loading
-            // ATTENTION: advance is prefered, but advancing too much may lead to invalid prefetching 
+            // ATTENTION: advance is preferred, but advancing too much may lead to invalid prefetching
             ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x12], #64
             //  last 2 steps for output 1 and 3
             fmla v16.8h, v10.8h, v0.h[2]
@@ -295,7 +296,7 @@ IndirectGemmStart:
                 cmp x6, #7
                 beq Write7
                 b Write8
-                // prefetching is not prefered while writing results in spite of cache missings
+                // prefetching is not preferred while writing results in spite of cache misses
                 // you could try prfm pstl2strm
                 // there are almost no benefits observed though
             Write1:
diff --git a/mindspore/lite/nnacl/assembly/fp16/MatVecMulFp16.S b/mindspore/lite/nnacl/assembly/fp16/MatVecMulFp16.S
index 9ba601a797..5a7adbb76b 100644
--- a/mindspore/lite/nnacl/assembly/fp16/MatVecMulFp16.S
+++ b/mindspore/lite/nnacl/assembly/fp16/MatVecMulFp16.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
     .text
     .align 5
     .global MatVecMulFp16Neon64
@@ -15,7 +16,7 @@
 // w5: depth
 // w6: col
 
-MatVecMulFp16Neon64:
+asm_function MatVecMulFp16Neon64
   sub sp, sp, #128
   st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
   st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
diff --git a/mindspore/lite/nnacl/assembly/fp16/MatmulFp16.S b/mindspore/lite/nnacl/assembly/fp16/MatmulFp16.S
index fd2622d210..bc3644ad21 100644
--- a/mindspore/lite/nnacl/assembly/fp16/MatmulFp16.S
+++ b/mindspore/lite/nnacl/assembly/fp16/MatmulFp16.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
     .text
     .align 5
     .global MatmulFp16Neon64
@@ -19,7 +20,7 @@
 // w17: stride
 // w13: writeC8
 
-MatmulFp16Neon64:
+asm_function MatmulFp16Neon64
   sub sp, sp, #128
   st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
   st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
diff --git a/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S b/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S
index 20285677fe..503a0f6f23 100644
--- a/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S
+++ b/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
     .text
     .align 5
     .global MatmulFp16Neon64Opt
@@ -19,7 +20,7 @@
 // x8: stride
 // x9: writeMode
 
-MatmulFp16Neon64Opt:
+asm_function MatmulFp16Neon64Opt
     sub sp, sp, #80
     st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
     stp x19, x20, [sp], #16
diff --git a/mindspore/lite/nnacl/assembly/fp16/MatmulWinogradFp16.S b/mindspore/lite/nnacl/assembly/fp16/MatmulWinogradFp16.S
index 38f869c8ee..daaed9163a 100644
--- a/mindspore/lite/nnacl/assembly/fp16/MatmulWinogradFp16.S
+++ b/mindspore/lite/nnacl/assembly/fp16/MatmulWinogradFp16.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -9,7 +10,7 @@
 
 // MatrixMultiplyWinogradFp16(float16_t *matix_a, float16_t *matrix_b, float16_t *matrix_c, int m, int k, int n, int in_channel)
     // x0: matrix_a, x1: matrix_b, x2: matrix_c, x3: m, x4: k, x5: n, x6: in_channel
-MatrixMultiplyWinogradFp16:
+asm_function MatrixMultiplyWinogradFp16
     // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
     // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
     // x19 ~ x29 should be also preserved
diff --git a/mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC4Fp16.S b/mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC4Fp16.S
index e8bc7f9fd0..2bf2f786b4 100644
--- a/mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC4Fp16.S
+++ b/mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC4Fp16.S
@@ -1,3 +1,4 @@
+#include "nnacl/assembly_global.h"
 
     .text
     .align 5
@@ -13,7 +14,7 @@
 // w3 oc4div        w4 oc4mod        w5 plane_size
 // x6 plane_stride  x7 relu_type
 
-PostFuncBiasReluC4Fp16:
+asm_function PostFuncBiasReluC4Fp16
 
   movi v26.4h, #6
   scvtf v26.4h, v26.4h
diff --git a/mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC8Fp16.S b/mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC8Fp16.S
index 6127435102..dad91b9332 100644
--- a/mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC8Fp16.S
+++ b/mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC8Fp16.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
 
     .text
     .align 5
@@ -21,7 +22,7 @@
 // w10  oc8 loop control
 // w13  hw  loop control
 
-PostFuncBiasReluC8Fp16:
+asm_function PostFuncBiasReluC8Fp16
   movi v26.8h, #0x46, lsl #8
   dup v27.8h, wzr
   mov w10, #0
diff --git a/mindspore/lite/nnacl/assembly/fp16/TiledC4MatmulFp16.S b/mindspore/lite/nnacl/assembly/fp16/TiledC4MatmulFp16.S
index af23543225..720ee3e1ac 100644
--- a/mindspore/lite/nnacl/assembly/fp16/TiledC4MatmulFp16.S
+++ b/mindspore/lite/nnacl/assembly/fp16/TiledC4MatmulFp16.S
@@ -1,3 +1,4 @@
+#include "nnacl/assembly_global.h"
 
 .text
 .align 5
@@ -6,7 +7,7 @@
 .type TiledC4MatmulFp16, %function
 #endif
 
-TiledC4MatmulFp16:
+asm_function TiledC4MatmulFp16
 
 sub sp, sp, #128
 st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
diff --git a/mindspore/lite/nnacl/assembly/fp16/WinogradTransLeftFp16.S b/mindspore/lite/nnacl/assembly/fp16/WinogradTransLeftFp16.S
index ca13c5a7e3..df1d88750e 100644
--- a/mindspore/lite/nnacl/assembly/fp16/WinogradTransLeftFp16.S
+++ b/mindspore/lite/nnacl/assembly/fp16/WinogradTransLeftFp16.S
@@ -1,3 +1,4 @@
+#include "nnacl/assembly_global.h"
 
   .text
   .align 5
@@ -6,7 +7,7 @@
   .type WinogradTransLeftFp16, %function
 #endif
 
-WinogradTransLeftFp16:
+asm_function WinogradTransLeftFp16
 
 sub sp, sp, #32
 stp x19, x20, [sp], #32
diff --git a/mindspore/lite/nnacl/assembly/fp16/WinogradTransRightFp16.S b/mindspore/lite/nnacl/assembly/fp16/WinogradTransRightFp16.S
index d3f5860c33..c889803691 100644
--- a/mindspore/lite/nnacl/assembly/fp16/WinogradTransRightFp16.S
+++ b/mindspore/lite/nnacl/assembly/fp16/WinogradTransRightFp16.S
@@ -1,3 +1,4 @@
+#include "nnacl/assembly_global.h"
 
   .text
   .align 5
@@ -6,7 +7,7 @@
   .type WinogradTransRightFp16, %function
 #endif
 
-WinogradTransRightFp16:
+asm_function WinogradTransRightFp16
 
 mov x8, #8 // 4 * sizeof(float16)
 mul x8, x6, x8
diff --git a/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S b/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S
index 077131ba99..38a38433b1 100644
--- a/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S
+++ b/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
     .text
     .align 5
     .global MatmulInt8DpNeon64
@@ -29,7 +30,7 @@
 // w24: stride
 // w27: filter_peroc
 
-MatmulInt8DpNeon64:
+asm_function MatmulInt8DpNeon64
   sub sp, sp, #208
   st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
   st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
diff --git a/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8Opt.S b/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8Opt.S
index ee276f01bc..fc3ef28b86 100644
--- a/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8Opt.S
+++ b/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8Opt.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
     .text
     .align 5
     .global MatmulInt8DpOpt
@@ -28,7 +29,7 @@
 // x15: filter_peroc
 // x28: filter_zp
 
-MatmulInt8DpOpt:
+asm_function MatmulInt8DpOpt
   sub sp, sp, #208
   st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
   st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
diff --git a/mindspore/lite/nnacl/assembly/opt/MatmulOptR4Int8.S b/mindspore/lite/nnacl/assembly/opt/MatmulOptR4Int8.S
index be158bd9ed..03342a3986 100644
--- a/mindspore/lite/nnacl/assembly/opt/MatmulOptR4Int8.S
+++ b/mindspore/lite/nnacl/assembly/opt/MatmulOptR4Int8.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
+#include "nnacl/assembly_global.h"
     .text
     .align 5
     .global MatMulOptR4Int8Neon64
@@ -18,7 +19,7 @@
 // x6: a_sums
 // x7: bias
 
-MatMulOptR4Int8Neon64:
+asm_function MatMulOptR4Int8Neon64
   sub sp, sp, #128
   st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
   st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
diff --git a/mindspore/lite/nnacl/assembly_global.h b/mindspore/lite/nnacl/assembly_global.h
new file mode 100644
index 0000000000..d739f2240b
--- /dev/null
+++ b/mindspore/lite/nnacl/assembly_global.h
@@ -0,0 +1,50 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_NNACL_ASSEMBLY_GLOBAL_H
+#define MINDSPORE_LITE_NNACL_ASSEMBLY_GLOBAL_H
+
+/*
+ * asm_function fname
+ *
+ * Exports fname as a global symbol and emits its entry label, so the code
+ * that follows the macro invocation becomes the body of fname. The including
+ * file must be run through the C preprocessor, because the platform branch
+ * is selected with predefined compiler macros.
+ *
+ *   __APPLE__ : global symbol and label both carry a leading underscore
+ *               (the Mach-O naming of C symbols).
+ *   __ELF__   : the symbol is additionally marked hidden and typed as a
+ *               function. Hidden symbols are not exported from the linked
+ *               image; callers must be linked into the same image.
+ *
+ * Every directive and label has to stay on its own line, so automatic
+ * formatting is switched off around the macro body.
+ */
+/* clang-format off */
+.macro asm_function fname
+#ifdef __APPLE__
+  .globl _\fname
+_\fname:
+#else
+  .global \fname
+#ifdef __ELF__
+  .hidden \fname
+  .type \fname, %function
+#endif
+\fname:
+#endif
+.endm
+/* clang-format on */
+
+#endif  // MINDSPORE_LITE_NNACL_ASSEMBLY_GLOBAL_H