fix_assembly_for_ios_5

4 years ago · 52945ce826
--- a/mindspore/lite/nnacl/assembly/arm32/ConvDw3x3Int8BorderPixel.S
+++ b/mindspore/lite/nnacl/assembly/arm32/ConvDw3x3Int8BorderPixel.S
@@ -1,5 +1,6 @@
 #ifdef __arm__
 #ifndef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -17,7 +18,7 @@
 // r0: dst, r1: src, r2: weight, r3: bias, r4: height, r5: width, r6: in_kh_step, r7: in_kw_step,
 // r8: channel, r9: in_zp,  r10: out_zp, r11: out_multiplier, r12: left_shift, r13: right_shift
 // r14: acc_min, r15: acc_max
 ConvDw3x3Int8BorderPixel:
 asm_function ConvDw3x3Int8BorderPixel
    // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr"
    // according to https://stackoverflow.com/questions/53625807
    // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway
--- a/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Border.S
+++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Border.S
@@ -1,4 +1,5 @@
 #ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -11,7 +12,7 @@
 //                       size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6)
 // r0: dst, r1: src, r2: weight, r3: bias, r4: height, r5: width, r6: in_kh_step, r7: in_kw_step,
 // r8: kernel_w, r9: relu, r10: relu6
 ConvDwFp32Border:
 asm_function ConvDwFp32Border
    // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
    push {r4-r12, lr}
    vpush {q4-q7}
--- a/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Center.S
+++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Center.S
@@ -1,5 +1,6 @@
 #ifdef __arm__
 #ifndef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -14,7 +15,7 @@
 // r0: dst, r1: src, r2: weight, r3: bias, #0: height, #4: width, #8: kernel_h, #12: kernel_w,
 // #16: out_h_step, #20: block_channel, #24: in_sh_step, #28: in_sw_step, #32: in_kh_step,#36: in_kw_step
 // #40: relu, #44: relu6
 ConvDwFp32Center:
 asm_function ConvDwFp32Center
    // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr"
    // according to https://stackoverflow.com/questions/53625807
    // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway
--- a/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Row.S
+++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Row.S
@@ -1,4 +1,5 @@
 #ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -11,7 +12,7 @@
 //                   size_t num_pixels, size_t input_channel, size_t input_step)
 // r0: output_ptr, r1: input_ptr, r2: filter_ptr, r3: num_pixels,
 // r4: input_channel, r5: input_step
 ConvDwFp32Row:
 asm_function ConvDwFp32Row
    // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf

    push {r4-r6, r8, r10, r11}
--- a/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Center.S
+++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Center.S
@@ -1,5 +1,6 @@
 #ifdef __arm__
 #ifndef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -15,7 +16,7 @@
 // #-48: dst, #-44: src, #-40: weight, #-36: bias, #0: height, #4: width, #8: kernel_h, #12: kernel_w,
 // #16: out_h_step, #20: block_channel, #24: in_sh_step, #28: in_sw_step, #32: in_kh_step, #36: in_kw_step
 // #40: in_zp, #44: out_zp, #48: out_multiplier, #52: left_shift, #56: right_shift, #60:acc_min, #64: acc_max
 ConvDwInt8Center:
 asm_function ConvDwInt8Center
 // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr"
 // according to https://stackoverflow.com/questions/53625807
 // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway
--- a/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4.S
+++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4.S
@@ -1,5 +1,6 @@
 #ifdef __arm__
 #ifndef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -13,7 +14,7 @@
 // r0: dst, r1: buffer, r2: num_pixels, r3: output_zp, r4: out_multiplier,
 // r5: left_shift, r6: right_shift, r7: acc_min, r8: acc_max

 ConvDwInt8PostAlign4:
 asm_function ConvDwInt8PostAlign4
    // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr"
    // according to https://stackoverflow.com/questions/53625807
    // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway
--- a/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4PerChannel.S
+++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4PerChannel.S
@@ -1,5 +1,6 @@
 #ifdef __arm__
 #ifndef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -13,7 +14,7 @@
 // r0: dst, r1: buffer, r2: num_pixels, r3: output_zp, r4: out_multiplier,
 // r5: left_shift, r6: right_shift, r7: acc_min, r8: acc_max

 ConvDwInt8PostAlign4PerChannel:
 asm_function ConvDwInt8PostAlign4PerChannel
    // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr"
    // according to https://stackoverflow.com/questions/53625807
    // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway
--- a/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Row.S
+++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Row.S
@@ -1,5 +1,6 @@
 #ifdef __arm__
 #ifndef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -13,7 +14,7 @@
 // r0: output_ptr, r1: input_ptr, r2: weight_ptr, r3: num_pixels,
 // r4: output_channel, r5: input_step, r6: input_zp,

 ConvDwInt8Row:
 asm_function ConvDwInt8Row
    // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr"
    // according to https://stackoverflow.com/questions/53625807
    // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway
--- a/mindspore/lite/nnacl/assembly/arm32/DeconvDwFp32Center.S
+++ b/mindspore/lite/nnacl/assembly/arm32/DeconvDwFp32Center.S
@@ -1,5 +1,6 @@
 #ifdef __arm__
 #ifndef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -13,7 +14,7 @@
 //                      size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
 // r0: dst, r1: src, r2: weight, r3: height, r4: width, #52: kernel_h, #56: kernel_w, #60: out_h_step
 // #64: block_channel, #68: in_sh_step, #72: in_sw_step, #76: in_kh_step, #80: in_kw_step
 DeconvDwFp32Center:
 asm_function DeconvDwFp32Center
    // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr"
    // according to https://stackoverflow.com/questions/53625807
    // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway
--- a/mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Center.S
+++ b/mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Center.S
@@ -1,5 +1,6 @@
 #ifdef __arm__
 #ifndef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -13,7 +14,7 @@
 //                         size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
 // r0: dst, r1: src, r2: weight, r3: height, r4: width, #52: kernel_h, #56: kernel_w, #60: out_h_step
 // #64: block_channel, #68: in_sh_step, #72: in_sw_step, #76: in_kh_step, #80: in_kw_step
 DeconvDwInt8Center:
 asm_function DeconvDwInt8Center
    // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr"
    // according to https://stackoverflow.com/questions/53625807
    // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway
--- a/mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Post.S
+++ b/mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Post.S
@@ -1,5 +1,6 @@
 #ifdef __arm__
 #ifndef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -14,7 +15,7 @@
 // r0: dst, r1: output_buffer, r2: bias, r3: block_channel, r4: pixel_nums, r5: out_multiplier,
 // r6: left_shift, r7: right_shift, r8: out_zp, r9: acc_min, r10: acc_max

 DeconvDwInt8Post:
 asm_function DeconvDwInt8Post
    // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr"
    // according to https://stackoverflow.com/questions/53625807
    // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway
--- a/mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt16to32_8x4.S
+++ b/mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt16to32_8x4.S
@@ -1,4 +1,5 @@
 #ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -9,7 +10,7 @@

 // void IndirectGemmInt16to32_8x4(int *output, short *input, short *weight, size_t ksize, size_t ic8, size_t oc4, size_t offset);
 // r0: output, r1: input, r2: weight, r3: ksize, r4: ic8, r5: oc4, r6: offset
 IndirectGemmInt16to32_8x4:
 asm_function IndirectGemmInt16to32_8x4

    .macro INIT_ZERO
        // we could also use "vmov.s32 q12, #0" to initialize q12 by 0
--- a/mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt8_2x4.S
+++ b/mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt8_2x4.S
@@ -1,5 +1,6 @@
 #ifdef __arm__
 #ifndef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -13,7 +14,7 @@
 // int32_t *shift_before, int32_t *shift_after, size_t asymmetric, size_t per_channel, size_t per_channel_offset);
 // r0: output, r1: input, r2: weight, r3: bias, r4: kSize, r5: ic4, r6: oc, r7: offset
 // r8: input_sum, r10: act_min, r11: act_max, r10: out_zp, r11: out_multiplier, r10: shift_before, r11: shift_after
 IndirectGemmInt8_2x4:
 asm_function IndirectGemmInt8_2x4

    .macro INIT_BIAS
        veor q10, q10, q10
@@ -221,7 +222,7 @@ IndirectGemmInt8_2x4:
                vqmovn.s32 d31, q12
                vqmovn.s16 d0, q15

            // prefetching is not prefered while writing results in spite of cache missings
            // prefetching is not preferred while writing results in spite of cache missing
            // you could try prfm pstl2strm
            WriteStart:
                cmp r6, #1
--- a/mindspore/lite/nnacl/assembly/arm32/MatVecMulFp32.S
+++ b/mindspore/lite/nnacl/assembly/arm32/MatVecMulFp32.S
@@ -1,5 +1,6 @@
 #ifdef __arm__
 #ifndef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -17,7 +18,7 @@
 // r5: depth
 // r6: col

 MatVecMulFp32:
 asm_function MatVecMulFp32
  // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
  push {r0-r8, r10, r11, lr}
  add sp, sp, #48
--- a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32.S
+++ b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32.S
@@ -1,4 +1,5 @@
 #ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"
    .text
    .align 5
    .global MatmulFloatNeon32
@@ -19,7 +20,7 @@
 // r8: stride
 // lr: writeNhwc/writeWino

 MatmulFloatNeon32:
 asm_function MatmulFloatNeon32
    // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
    push {r0-r8, r10, r11, lr}
    add sp, sp, #48
--- a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S
+++ b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S
@@ -1,4 +1,5 @@
 #ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"
    .text
    .align 5
    .global MatmulFloatNeon32Opt
@@ -19,7 +20,7 @@
 // r8: stride
 // lr: writeNhwc/writeWino

 MatmulFloatNeon32Opt:
 asm_function MatmulFloatNeon32Opt
    // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
    push {r0-r8, r10, r11, lr}
    add sp, sp, #48
--- a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt12x4.S
+++ b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt12x4.S
@@ -1,4 +1,5 @@
 #ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"
    .text
    .align 5
    .global MatmulFloatNeon32Opt12x4
@@ -19,7 +20,7 @@
 // r8: stride
 // lr: OutType_C8 = 0, OutType_Nhwc = 1, OutType_TileC8 = 2

 MatmulFloatNeon32Opt12x4:
 asm_function MatmulFloatNeon32Opt12x4
    // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
    push {r0-r8, r10, r11, lr}
    vpush {q4-q7}
--- a/mindspore/lite/nnacl/assembly/arm32/MatmulInt8.S
+++ b/mindspore/lite/nnacl/assembly/arm32/MatmulInt8.S
@@ -1,5 +1,6 @@
 #ifdef __arm__
 #ifndef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -15,7 +16,7 @@
 // #0: col, #4: deep16, #8: input_sums, #12: weight_bias, #16: act_min, #20: act_max, #24: out_zp
 // #28: multiplier, #32: left_shift, #36: right_shift, #40: stride, #44: per_channel

 MatmulInt8Neon32:
 asm_function MatmulInt8Neon32
  push {r0-r11, lr}
  vpush {q4-q7}
  add sp, sp, #116
@@ -117,7 +118,7 @@ End3:
  bgt PerChannel 

 PerTensor:
  // Substract input_sums
  // Subtract input_sums
  vld1.32 {d24, d25}, [r6]!
  vdup.32 d20, d24[0]
  vdup.32 d21, d24[1]
@@ -157,7 +158,7 @@ PerTensor:
  b AddDstZP

 PerChannel:
  // Substract input_sums
  // Subtract input_sums
  vld1.32 {d24, d25, d26, d27}, [r6]!
  vsub.s32 d28, d28, d24
  vsub.s32 d29, d29, d25
--- a/mindspore/lite/nnacl/assembly/arm32/MatmulInt8Opt.S
+++ b/mindspore/lite/nnacl/assembly/arm32/MatmulInt8Opt.S
@@ -1,5 +1,6 @@
 #ifdef __arm__
 #ifndef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -16,7 +17,7 @@
 // #0: col, #4: deep16, #8: input_sums, #12: weight_bias, #16: act_min, #20: act_max, #24: out_zp
 // #28: multiplier, #32: left_shift, #36: right_shift, #40: stride, #44: per_channel, #48: filter_zp

 MatmulInt8Opt:
 asm_function MatmulInt8Opt
    push {r0-r8, r10, r11, lr}
    vpush {q4-q7}
    add sp, sp, #112
--- a/mindspore/lite/nnacl/assembly/arm32/MatmulWinogradFp32.S
+++ b/mindspore/lite/nnacl/assembly/arm32/MatmulWinogradFp32.S
@@ -1,4 +1,5 @@
 #ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -10,7 +11,7 @@
 // MatrixMultiplyWinograd(float *matrix_a, float *matrix_b, float *matrix_c, int m, int k, int n, int in_channel, int c4_channel)
    // r0: matrix_a, r1: matrix_b, r2: matrix_c, r3: m, r4: k, r5: n, r6: in_channel, r7: c4_channel * 4
    // #-56: matrix_a, #-52: matrix_b, #-48: matrix_c, #-44: m, #0: k, #4: n, #8: in_channel, #12: c4_channel * 4
 MatrixMultiplyWinograd:
 asm_function MatrixMultiplyWinograd
    // at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr"
    // according to https://stackoverflow.com/questions/53625807
    // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway
--- a/mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC4.S
+++ b/mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC4.S
@@ -1,3 +1,4 @@
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -7,7 +8,7 @@
 .type PostFuncBiasReluC4, %function
 #endif

 PostFuncBiasReluC4:
 asm_function PostFuncBiasReluC4
  push {r4-r8, r10, r11, lr}
  add sp, sp, #32

--- a/mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC8.S
+++ b/mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC8.S
@@ -1,4 +1,5 @@
 #ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -21,7 +22,7 @@
 // lr  oc8 loop control
 // r8  hw  loop control

 PostFuncBiasReluC8:
 asm_function PostFuncBiasReluC8
  push {r4-r8, r10, r11, lr}
  add sp, sp, #32

--- a/mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Peroc.S
+++ b/mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Peroc.S
@@ -1,3 +1,4 @@
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -19,7 +20,7 @@
 // r6 oc_res2
 // r7 stride

 PreSum4x16Int8Peroc:
 asm_function PreSum4x16Int8Peroc
  push {r4-r11, lr}
  vpush {q4-q7}
  add sp, sp, #100
--- a/mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Pert.S
+++ b/mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Pert.S
@@ -1,3 +1,4 @@
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -15,7 +16,7 @@
 // r3 co16
 // r4 filter_zp

 PreSum4x16Int8Pert:
 asm_function PreSum4x16Int8Pert
  push {r4-r8, r10, r11, lr}
  vpush {q4-q7}
  add sp, sp, #96
--- a/mindspore/lite/nnacl/assembly/arm32/TiledC4MatmulFp32.S
+++ b/mindspore/lite/nnacl/assembly/arm32/TiledC4MatmulFp32.S
@@ -1,4 +1,5 @@
 #ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"
    .text
    .align 5
    .global TiledC4MatmulFp32
@@ -6,7 +7,7 @@
    .type TiledC4MatmulFp32, %function
 #endif

 TiledC4MatmulFp32:
 asm_function TiledC4MatmulFp32
 //void TiledC4MatmulFp32(float* dst, const float* src, const float* weight, size_t cal_num, size_t ic4, size_t oc4)
 //x0: dst
 //x1: src
--- a/mindspore/lite/nnacl/assembly/arm32/WinogradTransLeft.S
+++ b/mindspore/lite/nnacl/assembly/arm32/WinogradTransLeft.S
@@ -1,4 +1,5 @@
 #ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"

    .text
    .align 5
@@ -15,7 +16,7 @@
 //x4: h
 //x5: k
 //x6: length
 WinogradTransLeft:
 asm_function WinogradTransLeft
    push {r4-r11, lr}
    ldr r4, [sp, #36]
    ldr r5, [sp, #40]
--- a/mindspore/lite/nnacl/assembly/arm32/WinogradTransRight.S
+++ b/mindspore/lite/nnacl/assembly/arm32/WinogradTransRight.S
@@ -1,4 +1,5 @@
 #ifdef ENABLE_ARM32
 #include "nnacl/assembly_global.h"

    .text
    .align 5
@@ -15,7 +16,7 @@
 //x4: h
 //x5: k
 //x6: length
 WinogradTransRight:
 asm_function WinogradTransRight
    push {r4-r11, lr}
    ldr r4, [sp, #36]
    ldr r5, [sp, #40]
--- a/mindspore/lite/nnacl/assembly/arm64/AdderFp32.S
+++ b/mindspore/lite/nnacl/assembly/arm64/AdderFp32.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"
    .text
    .align 5
    .global AdderFloatNeon64
@@ -19,7 +20,7 @@
 // x8: stride
 // x9: writeMode

 AdderFloatNeon64:
 asm_function AdderFloatNeon64
    sub sp, sp, #144
    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Corner.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Corner.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -12,7 +13,7 @@
                     
 // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, x6: channel, x7: relu,  x8: relu6

 ConvDw3x3Corner:
 asm_function ConvDw3x3Corner
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Horizontal.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Horizontal.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -12,7 +13,7 @@
                     
 // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, x6: channel, x7: relu,  x8: relu6

 ConvDw3x3Horizontal:
 asm_function ConvDw3x3Horizontal
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -23,7 +24,7 @@
 // w9: relu
 // w10: relu6

 ConvDw3x3Stride1:
 asm_function ConvDw3x3Stride1
    sub sp, sp, #128
    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -23,7 +24,7 @@
 // w9: relu
 // w10: relu6

 ConvDw3x3Stride2:
 asm_function ConvDw3x3Stride2
    sub sp, sp, #128
    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Vertical.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Vertical.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -12,7 +13,7 @@
                     
 // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, x6: channel, x7: relu,  x8: relu6

 ConvDw3x3Vertical:
 asm_function ConvDw3x3Vertical
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -31,7 +32,7 @@
 // w15: acc_max
 // w16: per_channel

 ConvDw3x3Int8Neon64:
 asm_function ConvDw3x3Int8Neon64
  sub sp, sp, #176
  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -14,7 +15,7 @@
 // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step,
 // x6: channel, x7: in_zp,  x8: out_zp, x9: out_multiplier, x10: left_shift, x11: right_shift
 // x12: acc_min, x13: acc_max, x14: per_channel
 ConvDw3x3Int8Corner:
 asm_function ConvDw3x3Int8Corner
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -14,7 +15,7 @@
 // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step,
 // x6: channel, x7: in_zp,  x8: out_zp, x9: out_multiplier, x10: left_shift, x11: right_shift
 // x12: acc_min, x13: acc_max, x14: per_channel
 ConvDw3x3Int8Horizontal:
 asm_function ConvDw3x3Int8Horizontal
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -31,7 +32,7 @@
 // w15: acc_max
 // w16: per_channel

 ConvDw3x3Int8Stride2:
 asm_function ConvDw3x3Int8Stride2
    sub sp, sp, #176
    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -14,7 +15,7 @@
 // x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step,
 // x6: channel, x7: in_zp,  x8: out_zp, x9: out_multiplier, x10: left_shift, x11: right_shift
 // x12: acc_min, x13: acc_max, x14: per_channel
 ConvDw3x3Int8Vertical:
 asm_function ConvDw3x3Int8Vertical
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Border.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Border.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -12,7 +13,7 @@

 // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: in_kh_step, x7: in_kw_step,
 // x8: kernel_w, x9: relu, x10: relu6
 ConvDwFp32Border:
 asm_function ConvDwFp32Border
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Center.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Center.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -13,7 +14,7 @@
 // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w,
 // x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step
 // x14: relu, x15: relu6
 ConvDwFp32Center:
 asm_function ConvDwFp32Center
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -11,7 +12,7 @@
 //                            size_t input_stride, size_t relu, size_t relu6)
 // x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6

 ConvDwFp32Indirect3x3:
 asm_function ConvDwFp32Indirect3x3
    sub sp, sp, #16
    stp x19, x20, [sp], #16

--- a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect5x5.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect5x5.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -11,7 +12,7 @@
 //                            size_t input_stride, size_t relu, size_t relu6)
 // x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6

 ConvDwFp32Indirect5x5:
 asm_function ConvDwFp32Indirect5x5
    sub sp, sp, #160
    stp x19, x20, [sp, #64]
    stp x21, x22, [sp, #80]
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Row.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Row.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -12,7 +13,7 @@
 // x0: output_ptr, x1: input_ptr, x2: filter_ptr, x3: num_pixels,
 // x4: input_channel, x5: input_step
 //
 ConvDwFp32Row:
 asm_function ConvDwFp32Row
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Center.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Center.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -16,7 +17,7 @@
 // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w,
 // x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step
 // x14: in_zp, #56: out_zp, #64: out_multiplier, #72:left_shift, #80: right_shift, #88: acc_min, #96: acc_max
 ConvDwInt8Center:
 asm_function ConvDwInt8Center
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -12,7 +13,7 @@
 // x0: dst, x1: buffer, x2: num_pixels, x3: output_zp, x4: out_multiplier,
 // x5: left_shift, x6: right_shift, x7: acc_min, x8: acc_max

 ConvDwInt8PostAlign4:
 asm_function ConvDwInt8PostAlign4
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4PerChannel.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4PerChannel.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -12,7 +13,7 @@
 // x0: dst, x1: buffer, x2: num_pixels, x3: output_zp, x4: out_multiplier,
 // x5: left_shift, x6: right_shift, x7: acc_min, x8: acc_max

 ConvDwInt8PostAlign4PerChannel:
 asm_function ConvDwInt8PostAlign4PerChannel
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Row.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Row.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -12,7 +13,7 @@
 // x0: output_ptr, x1: input_ptr, x2: weight_ptr, x3: num_pixels,
 // x4: output_channel, x5: input_step, x6: input_zp
 //
 ConvDwInt8Row:
 asm_function ConvDwInt8Row
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
--- a/mindspore/lite/nnacl/assembly/arm64/ConvFp32Center.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvFp32Center.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -13,7 +14,7 @@
 // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w,
 // x8: out_h_step, x9: block_channel, x10: ic4, x11: in_sh_step, x12: in_sw_step, x13: in_kh_step, x14: in_kw_step
 // x26: relu, x16: relu6
 ConvSwFp32Center:
 asm_function ConvSwFp32Center
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
--- a/mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Border.S
+++ b/mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Border.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -11,7 +12,7 @@
 //                         size_t in_kh_step, size_t in_kw_step, size_t kernel_w)

 // x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: in_kh_step, x6: in_kw_step, x7: kernel_w
 DeconvDwFp32Border:
 asm_function DeconvDwFp32Border
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
--- a/mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Center.S
+++ b/mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Center.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -12,7 +13,7 @@
 //                      size_t in_kh_step, size_t in_kw_step);
 // x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: kernel_h, x6: kernel_w, x7: out_h_step
 // x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step
 DeconvDwFp32Center:
 asm_function DeconvDwFp32Center
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
--- a/mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Center.S
+++ b/mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Center.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -12,7 +13,7 @@
 //                      size_t in_kh_step, size_t in_kw_step);
 // x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: kernel_h, x6: kernel_w, x7: out_h_step
 // x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step
 DeconvDwInt8Center:
 asm_function DeconvDwInt8Center
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
--- a/mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Post.S
+++ b/mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Post.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -13,7 +14,7 @@
 // x0: dst, x1: output_buffer, x2: bias, x3: block_channel, x4: pixel_nums, x5: out_multiplier
 // x6: left_shift, x7: right_shift, x8: out_zp, x9: acc_min, x10: acc_max

 DeconvDwInt8Post:
 asm_function DeconvDwInt8Post
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
--- a/mindspore/lite/nnacl/assembly/arm64/IndirectGemmInt16to32_8x4.S
+++ b/mindspore/lite/nnacl/assembly/arm64/IndirectGemmInt16to32_8x4.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -9,7 +10,7 @@

 // void IndirectGemmInt16to32_8x4(int *output, short *input, short *weight, size_t ksize, size_t ic8, size_t oc4, size_t offset);
 // x0: output, x1: input, x2: weight, x3: ksize, x4: ic8, x5: oc4, x6: offset
 IndirectGemmInt16to32_8x4:
 asm_function IndirectGemmInt16to32_8x4

    .macro INIT_ZERO
        dup v28.4s, wzr
--- a/mindspore/lite/nnacl/assembly/arm64/MatVecMulFp32.S
+++ b/mindspore/lite/nnacl/assembly/arm64/MatVecMulFp32.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"
    .text
    .align 5
    .global MatVecMulFp32
@@ -15,7 +16,7 @@
 // w5: depth
 // w6: col

 MatVecMulFp32:
 asm_function MatVecMulFp32
  sub sp, sp, #128
  st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
  st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
--- a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32.S
+++ b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"
    .text
    .align 5
    .global MatmulFloatNeon64
@@ -19,7 +20,7 @@
 // w17: stride
 // w13: c8_nhwc_c4

 MatmulFloatNeon64:
 asm_function MatmulFloatNeon64
  sub sp, sp, #128
  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
--- a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S
+++ b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"
    .text
    .align 5
    .global MatmulFloatNeon64Opt
@@ -19,7 +20,7 @@
 // x8: stride
 // x9: writeMode

 MatmulFloatNeon64Opt:
 asm_function MatmulFloatNeon64Opt
    sub sp, sp, #144
    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
--- a/mindspore/lite/nnacl/assembly/arm64/MatmulInt8.S
+++ b/mindspore/lite/nnacl/assembly/arm64/MatmulInt8.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"
    .text
    .align 5
    .global MatmulInt8Neon64
@@ -29,7 +30,7 @@
 // w24: stride
 // w27: filter_peroc

 MatmulInt8Neon64:
 asm_function MatmulInt8Neon64
  sub sp, sp, #208
  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
--- a/mindspore/lite/nnacl/assembly/arm64/MatmulInt8Opt.S
+++ b/mindspore/lite/nnacl/assembly/arm64/MatmulInt8Opt.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"
    .text
    .align 5
    .global MatmulInt8Opt
@@ -28,7 +29,7 @@
 // x15: filter_peroc
 // x28: filter_zp

 MatmulInt8Opt:
 asm_function MatmulInt8Opt
    sub sp, sp, #208
    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
--- a/mindspore/lite/nnacl/assembly/arm64/MatmulR4Int8.S
+++ b/mindspore/lite/nnacl/assembly/arm64/MatmulR4Int8.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"
    .text
    .align 5
    .global MatMulR4Int8Neon64
@@ -18,7 +19,7 @@
 // x6: a_sums
 // x7: bias

 MatMulR4Int8Neon64:
 asm_function MatMulR4Int8Neon64
  sub sp, sp, #128
  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
--- a/mindspore/lite/nnacl/assembly/arm64/MatmulWinogradFp32.S
+++ b/mindspore/lite/nnacl/assembly/arm64/MatmulWinogradFp32.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -9,7 +10,7 @@

 // MatrixMultiplyWinograd(float *matrix_a, float *matrix_b, float *matrix_c, int m, int k, int n, int in_channel, int c4_channel)
               // x0: matrix_a, x1: matrix_b, x2: matrix_c, x3: m, x4: k, x5: n, x6: in_channel, x7: c4_channel
 MatrixMultiplyWinograd:
 asm_function MatrixMultiplyWinograd
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
--- a/mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC4.S
+++ b/mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC4.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

    .text
    .align 5
@@ -23,7 +24,7 @@
 // w13  hw  loop control


 PostFuncBiasReluC4:
 asm_function PostFuncBiasReluC4

  movi v26.4s, #6
  scvtf v26.4s, v26.4s
--- a/mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC8.S
+++ b/mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC8.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

    .text
    .align 5
@@ -21,7 +22,7 @@
 // w10  oc8 loop control
 // w13  hw  loop control

 PostFuncBiasReluC8:
 asm_function PostFuncBiasReluC8
  sub sp, sp, #128
  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
--- a/mindspore/lite/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S
+++ b/mindspore/lite/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

    .text
    .align 5
@@ -42,7 +43,7 @@
 // w15  oc4 loop control
 // w16  hw  loop control

 PostFuncInt8C4Neon64:
 asm_function PostFuncInt8C4Neon64

  ldr w8, [sp]
  ldr w9, [sp, #8]
--- a/mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Peroc.S
+++ b/mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Peroc.S
@@ -1,5 +1,6 @@

 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

    .text
    .align 5
    //.p2align 5,,15
@@ -20,7 +21,7 @@
 // w6 oc_res4
 // w7 stride

 PreSum4x16Int8Peroc:
 asm_function PreSum4x16Int8Peroc
 mov w8, #0

 RowLoop:
--- a/mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Pert.S
+++ b/mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Pert.S
@@ -1,5 +1,5 @@

 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"
    .text
    .align 5
    //.p2align 5,,15
@@ -16,7 +16,7 @@
 // w3 co16
 // w4 filter_zp

 PreSum4x16Int8Pert:
 asm_function PreSum4x16Int8Pert
  dup v17.4s, w4
  mov w5, #0

--- a/mindspore/lite/nnacl/assembly/arm64/TiledC4MatmulFp32.S
+++ b/mindspore/lite/nnacl/assembly/arm64/TiledC4MatmulFp32.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

    .text
    .align 5
@@ -7,7 +8,7 @@
    .type TiledC4MatmulFp32, %function
 #endif

 TiledC4MatmulFp32:
 asm_function TiledC4MatmulFp32
 //void TiledC4MatmulFp32(float* dst, const float* src, const float* weight, size_t ic4, size_t cal_num, size_t oc4)
 //x0: dst
 //x1: src
--- a/mindspore/lite/nnacl/assembly/arm64/WinogradTransLeft.S
+++ b/mindspore/lite/nnacl/assembly/arm64/WinogradTransLeft.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

    .text
    .align 5
@@ -7,7 +8,7 @@
    .type WinogradTransLeft, %function
 #endif

 WinogradTransLeft:
 asm_function WinogradTransLeft
 //void WinogradTransLeft(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length);
 //x0: S
 //x1: B
--- a/mindspore/lite/nnacl/assembly/arm64/WinogradTransRight.S
+++ b/mindspore/lite/nnacl/assembly/arm64/WinogradTransRight.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

    .text
    .align 5
@@ -7,7 +8,7 @@
    .type WinogradTransRight, %function
 #endif

 WinogradTransRight:
 asm_function WinogradTransRight
 //void WinogradTransRight(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length);
 //x0: S
 //x1: B
--- a/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Border.S
+++ b/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Border.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -13,7 +14,7 @@

 // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: in_kh_step, x7: in_kw_step,
 // x8: kernel_w, x9: relu, x10: relu6
 ConvDwFp16Border:
 asm_function ConvDwFp16Border
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
--- a/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Center.S
+++ b/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Center.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -13,7 +14,7 @@
 // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w,
 // x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step
 // x14: relu, x15: relu6
 ConvDwFp16Center:
 asm_function ConvDwFp16Center
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
--- a/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Row.S
+++ b/mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Row.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -12,7 +13,7 @@
 // x0: output_ptr, x1: input_ptr, x2: filter_ptr, x3: num_pixels,
 // x4: input_channel, x5: input_step
 //
 ConvDwFp16Row:
 asm_function ConvDwFp16Row
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
--- a/mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Border.S
+++ b/mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Border.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -11,7 +12,7 @@
 //                         size_t in_kh_step, size_t in_kw_step, size_t kernel_w)

 // x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: in_kh_step, x6: in_kw_step, x7: kernel_w
 DeconvDwFp16Border:
 asm_function DeconvDwFp16Border
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
--- a/mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Center.S
+++ b/mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Center.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -12,7 +13,7 @@
 //                      size_t in_kh_step, size_t in_kw_step);
 // x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: kernel_h, x6: kernel_w, x7: out_h_step
 // x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step
 DeconvDwFp16Center:
 asm_function DeconvDwFp16Center
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
--- a/mindspore/lite/nnacl/assembly/fp16/Float16ToFloat32.S
+++ b/mindspore/lite/nnacl/assembly/fp16/Float16ToFloat32.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -9,7 +10,7 @@

 // void Float16ToFloat32(const float16_t *input, float *output, int number);
 // x0: input, x1: output, x2: number
 Float16ToFloat32:
 asm_function Float16ToFloat32
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
--- a/mindspore/lite/nnacl/assembly/fp16/Float32ToFloat16.S
+++ b/mindspore/lite/nnacl/assembly/fp16/Float32ToFloat16.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -9,7 +10,7 @@

 // void Float32ToFloat16(const float *input, float16_t *output, int number);
 // x0: input, x1: output, x2: number
 Float32ToFloat16:
 asm_function Float32ToFloat16
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
--- a/mindspore/lite/nnacl/assembly/fp16/IndirectGemmFp16_16x8.S
+++ b/mindspore/lite/nnacl/assembly/fp16/IndirectGemmFp16_16x8.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -12,7 +13,7 @@
 // x0: output, x1: input, x2: weight, x3: bias, x4: step, x5: ic4, x6: oc8, x7: offset, 
 // x8:mode, x9: writeC4, x10:relu, x11: relu6
 // compute 8 channel for 16 outputs
 IndirectGemmFp16_16x8:
 asm_function IndirectGemmFp16_16x8

    .macro INIT_BIAS
        dup v16.4s, wzr
@@ -41,7 +42,7 @@ IndirectGemmFp16_16x8:
    // x19 ~ x29 should be also preserved
    // whereas our coding style do not permit such amount of parameters
    sub sp, sp, #128
    // performance between storing 4 registers at the same time and seperatly storing them on in-order cores
    // performance between storing 4 registers at the same time and separately storing them on in-order cores
    // is not tested yet
    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
@@ -86,7 +87,7 @@ IndirectGemmStart:
            fmla v19.8h, v9.8h, v1.h[5]
            // load input  for output 9-16
            // input cache should be refreshed after loading
            // ATTENTION: advance is prefered, but advancing too much may lead to invalid prefetching 
            // ATTENTION: advance is preferred, but advancing too much may lead to invalid prefetching
            ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x12], #64
            //  last 2 steps for output 1 and 3
            fmla v16.8h, v10.8h, v0.h[2]
@@ -295,7 +296,7 @@ IndirectGemmStart:
                cmp x6, #7
                beq Write7
                b Write8
                // prefetching is not prefered while writing results in spite of cache missings
                // prefetching is not preferred while writing results in spite of cache missing
                // you could try prfm pstl2strm
                // there are almost no benefits observed though
            Write1:
--- a/mindspore/lite/nnacl/assembly/fp16/MatVecMulFp16.S
+++ b/mindspore/lite/nnacl/assembly/fp16/MatVecMulFp16.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"
    .text
    .align 5
    .global MatVecMulFp16Neon64
@@ -15,7 +16,7 @@
 // w5: depth
 // w6: col

 MatVecMulFp16Neon64:
 asm_function MatVecMulFp16Neon64
  sub sp, sp, #128
  st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
  st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
--- a/mindspore/lite/nnacl/assembly/fp16/MatmulFp16.S
+++ b/mindspore/lite/nnacl/assembly/fp16/MatmulFp16.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"
    .text
    .align 5
    .global MatmulFp16Neon64
@@ -19,7 +20,7 @@
 // w17: stride
 // w13: writeC8

 MatmulFp16Neon64:
 asm_function MatmulFp16Neon64
  sub sp, sp, #128
  st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
  st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
--- a/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S
+++ b/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"
    .text
    .align 5
    .global MatmulFp16Neon64Opt
@@ -19,7 +20,7 @@
 // x8: stride
 // x9: writeMode

 MatmulFp16Neon64Opt:
 asm_function MatmulFp16Neon64Opt
    sub sp, sp, #80
    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
    stp x19, x20, [sp], #16
--- a/mindspore/lite/nnacl/assembly/fp16/MatmulWinogradFp16.S
+++ b/mindspore/lite/nnacl/assembly/fp16/MatmulWinogradFp16.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -9,7 +10,7 @@

 // MatrixMultiplyWinogradFp16(float16_t *matrix_a, float16_t *matrix_b, float16_t *matrix_c, int m, int k, int n, int in_channel)
    // x0: matrix_a, x1: matrix_b, x2: matrix_c, x3: m, x4: k, x5: n, x6: in_channel
 MatrixMultiplyWinogradFp16:
 asm_function MatrixMultiplyWinogradFp16
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
--- a/mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC4Fp16.S
+++ b/mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC4Fp16.S
@@ -1,3 +1,4 @@
 #include "nnacl/assembly_global.h"

    .text
    .align 5
@@ -13,7 +14,7 @@
 // w3 oc4div        w4 oc4mod        w5 plane_size
 // x6 plane_stride  x7 relu_type

 PostFuncBiasReluC4Fp16:
 asm_function PostFuncBiasReluC4Fp16

  movi v26.4h, #6
  scvtf v26.4h, v26.4h
--- a/mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC8Fp16.S
+++ b/mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC8Fp16.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"

    .text
    .align 5
@@ -21,7 +22,7 @@
 // w10  oc8 loop control
 // w13  hw  loop control

 PostFuncBiasReluC8Fp16:
 asm_function PostFuncBiasReluC8Fp16
  movi v26.8h, #0x46, lsl #8
  dup v27.8h, wzr
  mov w10, #0
--- a/mindspore/lite/nnacl/assembly/fp16/TiledC4MatmulFp16.S
+++ b/mindspore/lite/nnacl/assembly/fp16/TiledC4MatmulFp16.S
@@ -1,3 +1,4 @@
 #include "nnacl/assembly_global.h"

 .text
 .align 5
@@ -6,7 +7,7 @@
 .type TiledC4MatmulFp16, %function
 #endif

 TiledC4MatmulFp16:
 asm_function TiledC4MatmulFp16

 sub sp, sp, #128
 st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
--- a/mindspore/lite/nnacl/assembly/fp16/WinogradTransLeftFp16.S
+++ b/mindspore/lite/nnacl/assembly/fp16/WinogradTransLeftFp16.S
@@ -1,3 +1,4 @@
 #include "nnacl/assembly_global.h"

  .text
  .align 5
@@ -6,7 +7,7 @@
  .type WinogradTransLeftFp16, %function
 #endif

 WinogradTransLeftFp16:
 asm_function WinogradTransLeftFp16

 sub sp, sp, #32
 stp x19, x20, [sp], #32
--- a/mindspore/lite/nnacl/assembly/fp16/WinogradTransRightFp16.S
+++ b/mindspore/lite/nnacl/assembly/fp16/WinogradTransRightFp16.S
@@ -1,3 +1,4 @@
 #include "nnacl/assembly_global.h"

  .text
  .align 5
@@ -6,7 +7,7 @@
  .type WinogradTransRightFp16, %function
 #endif

 WinogradTransRightFp16:
 asm_function WinogradTransRightFp16

 mov x8, #8 // 4 * sizeof(float16)
 mul x8, x6, x8
--- a/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S
+++ b/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"
    .text
    .align 5
    .global MatmulInt8DpNeon64
@@ -29,7 +30,7 @@
 // w24: stride
 // w27: filter_peroc

 MatmulInt8DpNeon64:
 asm_function MatmulInt8DpNeon64
  sub sp, sp, #208
  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
--- a/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8Opt.S
+++ b/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8Opt.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"
    .text
    .align 5
    .global MatmulInt8DpOpt
@@ -28,7 +29,7 @@
 // x15: filter_peroc
 // x28: filter_zp

 MatmulInt8DpOpt:
 asm_function MatmulInt8DpOpt
  sub sp, sp, #208
  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
--- a/mindspore/lite/nnacl/assembly/opt/MatmulOptR4Int8.S
+++ b/mindspore/lite/nnacl/assembly/opt/MatmulOptR4Int8.S
@@ -1,4 +1,5 @@
 #ifdef __aarch64__
 #include "nnacl/assembly_global.h"
    .text
    .align 5
    .global MatMulOptR4Int8Neon64
@@ -18,7 +19,7 @@
 // x6: a_sums
 // x7: bias

 MatMulOptR4Int8Neon64:
 asm_function MatMulOptR4Int8Neon64
  sub sp, sp, #128
  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
--- a/mindspore/lite/nnacl/assembly_global.h
+++ b/mindspore/lite/nnacl/assembly_global.h
@@ -0,0 +1,32 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_LITE_NNACL_ASSEMBLY_GLOBAL_H
 #define MINDSPORE_LITE_NNACL_ASSEMBLY_GLOBAL_H

 // asm_function fname
 //
 // Declares and opens the exported entry point of a hand-written routine.
 // The macro emits the symbol-visibility directives followed by the entry
 // label itself, so a caller writes "asm_function Foo" in place of "Foo:".
 //
 //   __APPLE__ : the symbol is exported and defined with a leading
 //               underscore (_fname).
 //   otherwise : the symbol is exported as fname; when the toolchain
 //               predefines __ELF__ it is additionally marked .hidden and
 //               typed as a function.
 //
 // Every directive and the label sit on their own line: the assembler
 // treats a line as one statement, so fusing ".globl" with the label or
 // ".hidden" with ".type" yields a malformed statement.
 .macro asm_function fname
 #ifdef __APPLE__
 .globl _\fname
 _\fname:
 #else
 .global \fname
 #ifdef __ELF__
 .hidden \fname
 .type \fname, %function
 #endif
 \fname:
 #endif
 .endm

 #endif  // MINDSPORE_LITE_NNACL_ASSEMBLY_GLOBAL_H