Browse Source

fix_assembly_for_ios_5

tags/v1.2.0-rc1
yefeng 4 years ago
parent
commit
52945ce826
89 changed files with 215 additions and 96 deletions
  1. +2
    -1
      mindspore/lite/nnacl/assembly/arm32/ConvDw3x3Int8BorderPixel.S
  2. +2
    -1
      mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Border.S
  3. +2
    -1
      mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Center.S
  4. +2
    -1
      mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Row.S
  5. +2
    -1
      mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Center.S
  6. +2
    -1
      mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4.S
  7. +2
    -1
      mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4PerChannel.S
  8. +2
    -1
      mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Row.S
  9. +2
    -1
      mindspore/lite/nnacl/assembly/arm32/DeconvDwFp32Center.S
  10. +2
    -1
      mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Center.S
  11. +2
    -1
      mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Post.S
  12. +2
    -1
      mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt16to32_8x4.S
  13. +3
    -2
      mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt8_2x4.S
  14. +2
    -1
      mindspore/lite/nnacl/assembly/arm32/MatVecMulFp32.S
  15. +2
    -1
      mindspore/lite/nnacl/assembly/arm32/MatmulFp32.S
  16. +2
    -1
      mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S
  17. +2
    -1
      mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt12x4.S
  18. +4
    -3
      mindspore/lite/nnacl/assembly/arm32/MatmulInt8.S
  19. +2
    -1
      mindspore/lite/nnacl/assembly/arm32/MatmulInt8Opt.S
  20. +2
    -1
      mindspore/lite/nnacl/assembly/arm32/MatmulWinogradFp32.S
  21. +2
    -1
      mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC4.S
  22. +2
    -1
      mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC8.S
  23. +2
    -1
      mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Peroc.S
  24. +2
    -1
      mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Pert.S
  25. +2
    -1
      mindspore/lite/nnacl/assembly/arm32/TiledC4MatmulFp32.S
  26. +2
    -1
      mindspore/lite/nnacl/assembly/arm32/WinogradTransLeft.S
  27. +2
    -1
      mindspore/lite/nnacl/assembly/arm32/WinogradTransRight.S
  28. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/AdderFp32.S
  29. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Corner.S
  30. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Horizontal.S
  31. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S
  32. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S
  33. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Vertical.S
  34. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S
  35. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S
  36. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S
  37. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S
  38. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S
  39. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Border.S
  40. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Center.S
  41. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S
  42. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect5x5.S
  43. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Row.S
  44. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Center.S
  45. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4.S
  46. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4PerChannel.S
  47. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Row.S
  48. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/ConvFp32Center.S
  49. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Border.S
  50. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Center.S
  51. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Center.S
  52. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Post.S
  53. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/IndirectGemmInt16to32_8x4.S
  54. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/MatVecMulFp32.S
  55. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/MatmulFp32.S
  56. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S
  57. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/MatmulInt8.S
  58. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/MatmulInt8Opt.S
  59. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/MatmulR4Int8.S
  60. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/MatmulWinogradFp32.S
  61. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC4.S
  62. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC8.S
  63. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S
  64. +3
    -2
      mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Peroc.S
  65. +2
    -2
      mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Pert.S
  66. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/TiledC4MatmulFp32.S
  67. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/WinogradTransLeft.S
  68. +2
    -1
      mindspore/lite/nnacl/assembly/arm64/WinogradTransRight.S
  69. +2
    -1
      mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Border.S
  70. +2
    -1
      mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Center.S
  71. +2
    -1
      mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Row.S
  72. +2
    -1
      mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Border.S
  73. +2
    -1
      mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Center.S
  74. +2
    -1
      mindspore/lite/nnacl/assembly/fp16/Float16ToFloat32.S
  75. +2
    -1
      mindspore/lite/nnacl/assembly/fp16/Float32ToFloat16.S
  76. +5
    -4
      mindspore/lite/nnacl/assembly/fp16/IndirectGemmFp16_16x8.S
  77. +2
    -1
      mindspore/lite/nnacl/assembly/fp16/MatVecMulFp16.S
  78. +2
    -1
      mindspore/lite/nnacl/assembly/fp16/MatmulFp16.S
  79. +2
    -1
      mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S
  80. +2
    -1
      mindspore/lite/nnacl/assembly/fp16/MatmulWinogradFp16.S
  81. +2
    -1
      mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC4Fp16.S
  82. +2
    -1
      mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC8Fp16.S
  83. +2
    -1
      mindspore/lite/nnacl/assembly/fp16/TiledC4MatmulFp16.S
  84. +2
    -1
      mindspore/lite/nnacl/assembly/fp16/WinogradTransLeftFp16.S
  85. +2
    -1
      mindspore/lite/nnacl/assembly/fp16/WinogradTransRightFp16.S
  86. +2
    -1
      mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S
  87. +2
    -1
      mindspore/lite/nnacl/assembly/opt/MatmulDpInt8Opt.S
  88. +2
    -1
      mindspore/lite/nnacl/assembly/opt/MatmulOptR4Int8.S
  89. +32
    -0
      mindspore/lite/nnacl/assembly_global.h

+ 2
- 1
mindspore/lite/nnacl/assembly/arm32/ConvDw3x3Int8BorderPixel.S View File

@@ -1,5 +1,6 @@
#ifdef __arm__
#ifndef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -17,7 +18,7 @@
// r0: dst, r1: src, r2: weight, r3: bias, r4: height, r5: width, r6: in_kh_step, r7: in_kw_step,
// r8: channel, r9: in_zp, r10: out_zp, r11: out_multiplier, r12: left_shift, r13: right_shift
// r14: acc_min, r15: acc_max
ConvDw3x3Int8BorderPixel:
asm_function ConvDw3x3Int8BorderPixel
// at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr"
// according to https://stackoverflow.com/questions/53625807
// even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway


+ 2
- 1
mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Border.S View File

@@ -1,4 +1,5 @@
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -11,7 +12,7 @@
// size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6)
// r0: dst, r1: src, r2: weight, r3: bias, r4: height, r5: width, r6: in_kh_step, r7: in_kw_step,
// r8: kernel_w, r9: relu, r10: relu6
ConvDwFp32Border:
asm_function ConvDwFp32Border
// r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
push {r4-r12, lr}
vpush {q4-q7}


+ 2
- 1
mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Center.S View File

@@ -1,5 +1,6 @@
#ifdef __arm__
#ifndef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -14,7 +15,7 @@
// r0: dst, r1: src, r2: weight, r3: bias, #0: height, #4: width, #8: kernel_h, #12: kernel_w,
// #16: out_h_step, #20: block_channel, #24: in_sh_step, #28: in_sw_step, #32: in_kh_step,#36: in_kw_step
// #40: relu, #44: relu6
ConvDwFp32Center:
asm_function ConvDwFp32Center
// at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr"
// according to https://stackoverflow.com/questions/53625807
// even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway


+ 2
- 1
mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Row.S View File

@@ -1,4 +1,5 @@
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -11,7 +12,7 @@
// size_t num_pixels, size_t input_channel, size_t input_step)
// r0: output_ptr, r1: input_ptr, r2: filter_ptr, r3: num_pixels,
// r4: input_channel, r5: input_step
ConvDwFp32Row:
asm_function ConvDwFp32Row
// r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf

push {r4-r6, r8, r10, r11}


+ 2
- 1
mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Center.S View File

@@ -1,5 +1,6 @@
#ifdef __arm__
#ifndef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -15,7 +16,7 @@
// #-48: dst, #-44: src, #-40: weight, #-36: bias, #0: height, #4: width, #8: kernel_h, #12: kernel_w,
// #16: out_h_step, #20: block_channel, #24: in_sh_step, #28: in_sw_step, #32: in_kh_step, #36: in_kw_step
// #40: in_zp, #44: out_zp, #48: out_multiplier, #52: left_shift, #56: right_shift, #60:acc_min, #64: acc_max
ConvDwInt8Center:
asm_function ConvDwInt8Center
// at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr"
// according to https://stackoverflow.com/questions/53625807
// even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway


+ 2
- 1
mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4.S View File

@@ -1,5 +1,6 @@
#ifdef __arm__
#ifndef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -13,7 +14,7 @@
// r0: dst, r1: buffer, r2: num_pixels, r3: output_zp, r4: out_multiplier,
// r5: left_shift, r6: right_shift, r7: acc_min, r8: acc_max

ConvDwInt8PostAlign4:
asm_function ConvDwInt8PostAlign4
// at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr"
// according to https://stackoverflow.com/questions/53625807
// even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway


+ 2
- 1
mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4PerChannel.S View File

@@ -1,5 +1,6 @@
#ifdef __arm__
#ifndef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -13,7 +14,7 @@
// r0: dst, r1: buffer, r2: num_pixels, r3: output_zp, r4: out_multiplier,
// r5: left_shift, r6: right_shift, r7: acc_min, r8: acc_max

ConvDwInt8PostAlign4PerChannel:
asm_function ConvDwInt8PostAlign4PerChannel
// at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr"
// according to https://stackoverflow.com/questions/53625807
// even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway


+ 2
- 1
mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Row.S View File

@@ -1,5 +1,6 @@
#ifdef __arm__
#ifndef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -13,7 +14,7 @@
// r0: output_ptr, r1: input_ptr, r2: weight_ptr, r3: num_pixels,
// r4: output_channel, r5: input_step, r6: input_zp,

ConvDwInt8Row:
asm_function ConvDwInt8Row
// at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr"
// according to https://stackoverflow.com/questions/53625807
// even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway


+ 2
- 1
mindspore/lite/nnacl/assembly/arm32/DeconvDwFp32Center.S View File

@@ -1,5 +1,6 @@
#ifdef __arm__
#ifndef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -13,7 +14,7 @@
// size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
// r0: dst, r1: src, r2: weight, r3: height, r4: width, #52: kernel_h, #56: kernel_w, #60: out_h_step
// #64: block_channel, #68: in_sh_step, #72: in_sw_step, #76: in_kh_step, #80: in_kw_step
DeconvDwFp32Center:
asm_function DeconvDwFp32Center
// at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr"
// according to https://stackoverflow.com/questions/53625807
// even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway


+ 2
- 1
mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Center.S View File

@@ -1,5 +1,6 @@
#ifdef __arm__
#ifndef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -13,7 +14,7 @@
// size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
// r0: dst, r1: src, r2: weight, r3: height, r4: width, #52: kernel_h, #56: kernel_w, #60: out_h_step
// #64: block_channel, #68: in_sh_step, #72: in_sw_step, #76: in_kh_step, #80: in_kw_step
DeconvDwInt8Center:
asm_function DeconvDwInt8Center
// at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr"
// according to https://stackoverflow.com/questions/53625807
// even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway


+ 2
- 1
mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Post.S View File

@@ -1,5 +1,6 @@
#ifdef __arm__
#ifndef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -14,7 +15,7 @@
// r0: dst, r1: output_buffer, r2: bias, r3: block_channel, r4: pixel_nums, r5: out_multiplier,
// r6: left_shift, r7: right_shift, r8: out_zp, r9: acc_min, r10: acc_max

DeconvDwInt8Post:
asm_function DeconvDwInt8Post
// at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr"
// according to https://stackoverflow.com/questions/53625807
// even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway


+ 2
- 1
mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt16to32_8x4.S View File

@@ -1,4 +1,5 @@
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -9,7 +10,7 @@

// void IndirectGemmInt16to32_8x4(int *output, short *input, short *weight, size_t ksize, size_t ic8, size_t oc4, size_t offset);
// r0: output, r1: input, r2: weight, r3: ksize, r4: ic8, r5: oc4, r6: offset
IndirectGemmInt16to32_8x4:
asm_function IndirectGemmInt16to32_8x4

.macro INIT_ZERO
// we could also use "vmov.s32 q12, #0" to initialize q12 by 0


+ 3
- 2
mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt8_2x4.S View File

@@ -1,5 +1,6 @@
#ifdef __arm__
#ifndef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -13,7 +14,7 @@
// int32_t *shift_before, int32_t *shift_after, size_t asymmetric, size_t per_channel, size_t per_channel_offset);
// r0: output, r1: input, r2: weight, r3: bias, r4: kSize, r5: ic4, r6: oc, r7: offset
// r8: input_sum, r10: act_min, r11: act_max, r10: out_zp, r11: out_multiplier, r10: shift_before, r11: shift_after
IndirectGemmInt8_2x4:
asm_function IndirectGemmInt8_2x4

.macro INIT_BIAS
veor q10, q10, q10
@@ -221,7 +222,7 @@ IndirectGemmInt8_2x4:
vqmovn.s32 d31, q12
vqmovn.s16 d0, q15

// prefetching is not prefered while writing results in spite of cache missings
// prefetching is not preferred while writing results in spite of cache missing
// you could try prfm pstl2strm
WriteStart:
cmp r6, #1


+ 2
- 1
mindspore/lite/nnacl/assembly/arm32/MatVecMulFp32.S View File

@@ -1,5 +1,6 @@
#ifdef __arm__
#ifndef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -17,7 +18,7 @@
// r5: depth
// r6: col

MatVecMulFp32:
asm_function MatVecMulFp32
// r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
push {r0-r8, r10, r11, lr}
add sp, sp, #48


+ 2
- 1
mindspore/lite/nnacl/assembly/arm32/MatmulFp32.S View File

@@ -1,4 +1,5 @@
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatmulFloatNeon32
@@ -19,7 +20,7 @@
// r8: stride
// lr: writeNhwc/writeWino

MatmulFloatNeon32:
asm_function MatmulFloatNeon32
// r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
push {r0-r8, r10, r11, lr}
add sp, sp, #48


+ 2
- 1
mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S View File

@@ -1,4 +1,5 @@
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatmulFloatNeon32Opt
@@ -19,7 +20,7 @@
// r8: stride
// lr: writeNhwc/writeWino

MatmulFloatNeon32Opt:
asm_function MatmulFloatNeon32Opt
// r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
push {r0-r8, r10, r11, lr}
add sp, sp, #48


+ 2
- 1
mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt12x4.S View File

@@ -1,4 +1,5 @@
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatmulFloatNeon32Opt12x4
@@ -19,7 +20,7 @@
// r8: stride
// lr: OutType_C8 = 0, OutType_Nhwc = 1, OutType_TileC8 = 2

MatmulFloatNeon32Opt12x4:
asm_function MatmulFloatNeon32Opt12x4
// r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
push {r0-r8, r10, r11, lr}
vpush {q4-q7}


+ 4
- 3
mindspore/lite/nnacl/assembly/arm32/MatmulInt8.S View File

@@ -1,5 +1,6 @@
#ifdef __arm__
#ifndef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -15,7 +16,7 @@
// #0: col, #4: deep16, #8: input_sums, #12: weight_bias, #16: act_min, #20: act_max, #24: out_zp
// #28: multiplier, #32: left_shift, #36: right_shift, #40: stride, #44: per_channel

MatmulInt8Neon32:
asm_function MatmulInt8Neon32
push {r0-r11, lr}
vpush {q4-q7}
add sp, sp, #116
@@ -117,7 +118,7 @@ End3:
bgt PerChannel

PerTensor:
// Substract input_sums
// Subtract input_sums
vld1.32 {d24, d25}, [r6]!
vdup.32 d20, d24[0]
vdup.32 d21, d24[1]
@@ -157,7 +158,7 @@ PerTensor:
b AddDstZP

PerChannel:
// Substract input_sums
// Subtract input_sums
vld1.32 {d24, d25, d26, d27}, [r6]!
vsub.s32 d28, d28, d24
vsub.s32 d29, d29, d25


+ 2
- 1
mindspore/lite/nnacl/assembly/arm32/MatmulInt8Opt.S View File

@@ -1,5 +1,6 @@
#ifdef __arm__
#ifndef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -16,7 +17,7 @@
// #0: col, #4: deep16, #8: input_sums, #12: weight_bias, #16: act_min, #20: act_max, #24: out_zp
// #28: multiplier, #32: left_shift, #36: right_shift, #40: stride, #44: per_channel, #48: filter_zp

MatmulInt8Opt:
asm_function MatmulInt8Opt
push {r0-r8, r10, r11, lr}
vpush {q4-q7}
add sp, sp, #112


+ 2
- 1
mindspore/lite/nnacl/assembly/arm32/MatmulWinogradFp32.S View File

@@ -1,4 +1,5 @@
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -10,7 +11,7 @@
// MatrixMultiplyWinograd(float *matrix_a, float *matrix_b, float *matrix_c, int m, int k, int n, int in_channel, int c4_channel)
// r0: matrix_a, r1: matrix_b, r2: matrix_c, r3: m, r4: k, r5: n, r6: in_channel, r7: c4_channel * 4
// #-56: matrix_a, #-52: matrix_b, #-48: matrix_c, #-44: m, #0: k, #4: n, #8: in_channel, #12: c4_channel * 4
MatrixMultiplyWinograd:
asm_function MatrixMultiplyWinograd
// at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr"
// according to https://stackoverflow.com/questions/53625807
// even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway


+ 2
- 1
mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC4.S View File

@@ -1,3 +1,4 @@
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -7,7 +8,7 @@
.type PostFuncBiasReluC4, %function
#endif

PostFuncBiasReluC4:
asm_function PostFuncBiasReluC4
push {r4-r8, r10, r11, lr}
add sp, sp, #32



+ 2
- 1
mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC8.S View File

@@ -1,4 +1,5 @@
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -21,7 +22,7 @@
// lr oc8 loop control
// r8 hw loop control

PostFuncBiasReluC8:
asm_function PostFuncBiasReluC8
push {r4-r8, r10, r11, lr}
add sp, sp, #32



+ 2
- 1
mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Peroc.S View File

@@ -1,3 +1,4 @@
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -19,7 +20,7 @@
// r6 oc_res2
// r7 stride

PreSum4x16Int8Peroc:
asm_function PreSum4x16Int8Peroc
push {r4-r11, lr}
vpush {q4-q7}
add sp, sp, #100


+ 2
- 1
mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Pert.S View File

@@ -1,3 +1,4 @@
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -15,7 +16,7 @@
// r3 co16
// r4 filter_zp

PreSum4x16Int8Pert:
asm_function PreSum4x16Int8Pert
push {r4-r8, r10, r11, lr}
vpush {q4-q7}
add sp, sp, #96


+ 2
- 1
mindspore/lite/nnacl/assembly/arm32/TiledC4MatmulFp32.S View File

@@ -1,4 +1,5 @@
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"
.text
.align 5
.global TiledC4MatmulFp32
@@ -6,7 +7,7 @@
.type TiledC4MatmulFp32, %function
#endif

TiledC4MatmulFp32:
asm_function TiledC4MatmulFp32
//void TiledC4MatmulFp32(float* dst, const float* src, const float* weight, size_t cal_num, size_t ic4, size_t oc4)
//x0: dst
//x1: src


+ 2
- 1
mindspore/lite/nnacl/assembly/arm32/WinogradTransLeft.S View File

@@ -1,4 +1,5 @@
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -15,7 +16,7 @@
//x4: h
//x5: k
//x6: length
WinogradTransLeft:
asm_function WinogradTransLeft
push {r4-r11, lr}
ldr r4, [sp, #36]
ldr r5, [sp, #40]


+ 2
- 1
mindspore/lite/nnacl/assembly/arm32/WinogradTransRight.S View File

@@ -1,4 +1,5 @@
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -15,7 +16,7 @@
//x4: h
//x5: k
//x6: length
WinogradTransRight:
asm_function WinogradTransRight
push {r4-r11, lr}
ldr r4, [sp, #36]
ldr r5, [sp, #40]


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/AdderFp32.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"
.text
.align 5
.global AdderFloatNeon64
@@ -19,7 +20,7 @@
// x8: stride
// x9: writeMode

AdderFloatNeon64:
asm_function AdderFloatNeon64
sub sp, sp, #144
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Corner.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -12,7 +13,7 @@
// x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, x6: channel, x7: relu, x8: relu6

ConvDw3x3Corner:
asm_function ConvDw3x3Corner
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Horizontal.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -12,7 +13,7 @@
// x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, x6: channel, x7: relu, x8: relu6

ConvDw3x3Horizontal:
asm_function ConvDw3x3Horizontal
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -23,7 +24,7 @@
// w9: relu
// w10: relu6

ConvDw3x3Stride1:
asm_function ConvDw3x3Stride1
sub sp, sp, #128
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -23,7 +24,7 @@
// w9: relu
// w10: relu6

ConvDw3x3Stride2:
asm_function ConvDw3x3Stride2
sub sp, sp, #128
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Vertical.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -12,7 +13,7 @@
// x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step, x6: channel, x7: relu, x8: relu6

ConvDw3x3Vertical:
asm_function ConvDw3x3Vertical
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -31,7 +32,7 @@
// w15: acc_max
// w16: per_channel

ConvDw3x3Int8Neon64:
asm_function ConvDw3x3Int8Neon64
sub sp, sp, #176
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -14,7 +15,7 @@
// x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step,
// x6: channel, x7: in_zp, x8: out_zp, x9: out_multiplier, x10: left_shift, x11: right_shift
// x12: acc_min, x13: acc_max, x14: per_channel
ConvDw3x3Int8Corner:
asm_function ConvDw3x3Int8Corner
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -14,7 +15,7 @@
// x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step,
// x6: channel, x7: in_zp, x8: out_zp, x9: out_multiplier, x10: left_shift, x11: right_shift
// x12: acc_min, x13: acc_max, x14: per_channel
ConvDw3x3Int8Horizontal:
asm_function ConvDw3x3Int8Horizontal
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -31,7 +32,7 @@
// w15: acc_max
// w16: per_channel

ConvDw3x3Int8Stride2:
asm_function ConvDw3x3Int8Stride2
sub sp, sp, #176
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -14,7 +15,7 @@
// x0: dst, x1: src, x2: weight, x3: bias, x4: in_kh_step, x5: in_kw_step,
// x6: channel, x7: in_zp, x8: out_zp, x9: out_multiplier, x10: left_shift, x11: right_shift
// x12: acc_min, x13: acc_max, x14: per_channel
ConvDw3x3Int8Vertical:
asm_function ConvDw3x3Int8Vertical
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Border.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -12,7 +13,7 @@

// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: in_kh_step, x7: in_kw_step,
// x8: kernel_w, x9: relu, x10: relu6
ConvDwFp32Border:
asm_function ConvDwFp32Border
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Center.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -13,7 +14,7 @@
// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w,
// x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step
// x14: relu, x15: relu6
ConvDwFp32Center:
asm_function ConvDwFp32Center
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -11,7 +12,7 @@
// size_t input_stride, size_t relu, size_t relu6)
// x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6

ConvDwFp32Indirect3x3:
asm_function ConvDwFp32Indirect3x3
sub sp, sp, #16
stp x19, x20, [sp], #16



+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect5x5.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -11,7 +12,7 @@
// size_t input_stride, size_t relu, size_t relu6)
// x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6

ConvDwFp32Indirect5x5:
asm_function ConvDwFp32Indirect5x5
sub sp, sp, #160
stp x19, x20, [sp, #64]
stp x21, x22, [sp, #80]


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Row.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -12,7 +13,7 @@
// x0: output_ptr, x1: input_ptr, x2: filter_ptr, x3: num_pixels,
// x4: input_channel, x5: input_step
//
ConvDwFp32Row:
asm_function ConvDwFp32Row
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Center.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -16,7 +17,7 @@
// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w,
// x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step
// x14: in_zp, #56: out_zp, #64: out_multiplier, #72:left_shift, #80: right_shift, #88: acc_min, #96: acc_max
ConvDwInt8Center:
asm_function ConvDwInt8Center
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -12,7 +13,7 @@
// x0: dst, x1: buffer, x2: num_pixels, x3: output_zp, x4: out_multiplier,
// x5: left_shift, x6: right_shift, x7: acc_min, x8: acc_max

ConvDwInt8PostAlign4:
asm_function ConvDwInt8PostAlign4
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4PerChannel.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -12,7 +13,7 @@
// x0: dst, x1: buffer, x2: num_pixels, x3: output_zp, x4: out_multiplier,
// x5: left_shift, x6: right_shift, x7: acc_min, x8: acc_max

ConvDwInt8PostAlign4PerChannel:
asm_function ConvDwInt8PostAlign4PerChannel
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Row.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -12,7 +13,7 @@
// x0: output_ptr, x1: input_ptr, x2: weight_ptr, x3: num_pixels,
// x4: output_channel, x5: input_step, x6: input_zp
//
ConvDwInt8Row:
asm_function ConvDwInt8Row
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/ConvFp32Center.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -13,7 +14,7 @@
// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w,
// x8: out_h_step, x9: block_channel, x10: ic4, x11: in_sh_step, x12: in_sw_step, x13: in_kh_step, x14: in_kw_step
// x26: relu, x16: relu6
ConvSwFp32Center:
asm_function ConvSwFp32Center
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Border.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -11,7 +12,7 @@
// size_t in_kh_step, size_t in_kw_step, size_t kernel_w)

// x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: in_kh_step, x6: in_kw_step, x7: kernel_w
DeconvDwFp32Border:
asm_function DeconvDwFp32Border
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Center.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -12,7 +13,7 @@
// size_t in_kh_step, size_t in_kw_step);
// x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: kernel_h, x6: kernel_w, x7: out_h_step
// x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step
DeconvDwFp32Center:
asm_function DeconvDwFp32Center
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Center.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -12,7 +13,7 @@
// size_t in_kh_step, size_t in_kw_step);
// x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: kernel_h, x6: kernel_w, x7: out_h_step
// x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step
DeconvDwInt8Center:
asm_function DeconvDwInt8Center
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Post.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -13,7 +14,7 @@
// x0: dst, x1: output_buffer, x2: bias, x3: block_channel, x4: pixel_nums, x5: out_multiplier
// x6: left_shift, x7: right_shift, x8: out_zp, x9: acc_min, x10: acc_max

DeconvDwInt8Post:
asm_function DeconvDwInt8Post
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/IndirectGemmInt16to32_8x4.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -9,7 +10,7 @@

// void IndirectGemmInt16to32_8x4(int *output, short *input, short *weight, size_t ksize, size_t ic8, size_t oc4, size_t offset);
// x0: output, x1: input, x2: weight, x3: ksize, x4: ic8, x5: oc4, x6: offset
IndirectGemmInt16to32_8x4:
asm_function IndirectGemmInt16to32_8x4

.macro INIT_ZERO
dup v28.4s, wzr


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/MatVecMulFp32.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatVecMulFp32
@@ -15,7 +16,7 @@
// w5: depth
// w6: col

MatVecMulFp32:
asm_function MatVecMulFp32
sub sp, sp, #128
st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/MatmulFp32.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatmulFloatNeon64
@@ -19,7 +20,7 @@
// w17: stride
// w13: c8_nhwc_c4

MatmulFloatNeon64:
asm_function MatmulFloatNeon64
sub sp, sp, #128
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatmulFloatNeon64Opt
@@ -19,7 +20,7 @@
// x8: stride
// x9: writeMode

MatmulFloatNeon64Opt:
asm_function MatmulFloatNeon64Opt
sub sp, sp, #144
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/MatmulInt8.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatmulInt8Neon64
@@ -29,7 +30,7 @@
// w24: stride
// w27: filter_peroc

MatmulInt8Neon64:
asm_function MatmulInt8Neon64
sub sp, sp, #208
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/MatmulInt8Opt.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatmulInt8Opt
@@ -28,7 +29,7 @@
// x15: filter_peroc
// x28: filter_zp

MatmulInt8Opt:
asm_function MatmulInt8Opt
sub sp, sp, #208
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/MatmulR4Int8.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatMulR4Int8Neon64
@@ -18,7 +19,7 @@
// x6: a_sums
// x7: bias

MatMulR4Int8Neon64:
asm_function MatMulR4Int8Neon64
sub sp, sp, #128
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/MatmulWinogradFp32.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -9,7 +10,7 @@

// MatrixMultiplyWinograd(float *matrix_a, float *matrix_b, float *matrix_c, int m, int k, int n, int in_channel, int c4_channel)
// x0: matrix_a, x1: matrix_b, x2: matrix_c, x3: m, x4: k, x5: n, x6: in_channel, x7: c4_channel
MatrixMultiplyWinograd:
asm_function MatrixMultiplyWinograd
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC4.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -23,7 +24,7 @@
// w13 hw loop control


PostFuncBiasReluC4:
asm_function PostFuncBiasReluC4

movi v26.4s, #6
scvtf v26.4s, v26.4s


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC8.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -21,7 +22,7 @@
// w10 oc8 loop control
// w13 hw loop control

PostFuncBiasReluC8:
asm_function PostFuncBiasReluC8
sub sp, sp, #128
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -42,7 +43,7 @@
// w15 oc4 loop control
// w16 hw loop control

PostFuncInt8C4Neon64:
asm_function PostFuncInt8C4Neon64

ldr w8, [sp]
ldr w9, [sp, #8]


+ 3
- 2
mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Peroc.S View File

@@ -1,5 +1,6 @@

#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
//.p2align 5,,15
@@ -20,7 +21,7 @@
// w6 oc_res4
// w7 stride

PreSum4x16Int8Peroc:
asm_function PreSum4x16Int8Peroc
mov w8, #0

RowLoop:


+ 2
- 2
mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Pert.S View File

@@ -1,5 +1,5 @@

#ifdef __aarch64__
#include "nnacl/assembly_global.h"
.text
.align 5
//.p2align 5,,15
@@ -16,7 +16,7 @@
// w3 co16
// w4 filter_zp

PreSum4x16Int8Pert:
asm_function PreSum4x16Int8Pert
dup v17.4s, w4
mov w5, #0



+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/TiledC4MatmulFp32.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -7,7 +8,7 @@
.type TiledC4MatmulFp32, %function
#endif

TiledC4MatmulFp32:
asm_function TiledC4MatmulFp32
//void TiledC4MatmulFp32(float* dst, const float* src, const float* weight, size_t ic4, size_t cal_num, size_t oc4)
//x0: dst
//x1: src


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/WinogradTransLeft.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -7,7 +8,7 @@
.type WinogradTransLeft, %function
#endif

WinogradTransLeft:
asm_function WinogradTransLeft
//void WinogradTransLeft(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length);
//x0: S
//x1: B


+ 2
- 1
mindspore/lite/nnacl/assembly/arm64/WinogradTransRight.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -7,7 +8,7 @@
.type WinogradTransRight, %function
#endif

WinogradTransRight:
asm_function WinogradTransRight
//void WinogradTransRight(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length);
//x0: S
//x1: B


+ 2
- 1
mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Border.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -13,7 +14,7 @@

// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: in_kh_step, x7: in_kw_step,
// x8: kernel_w, x9: relu, x10: relu6
ConvDwFp16Border:
asm_function ConvDwFp16Border
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved


+ 2
- 1
mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Center.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -13,7 +14,7 @@
// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w,
// x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step
// x14: relu, x15: relu6
ConvDwFp16Center:
asm_function ConvDwFp16Center
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved


+ 2
- 1
mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Row.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -12,7 +13,7 @@
// x0: output_ptr, x1: input_ptr, x2: filter_ptr, x3: num_pixels,
// x4: input_channel, x5: input_step
//
ConvDwFp16Row:
asm_function ConvDwFp16Row
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved


+ 2
- 1
mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Border.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -11,7 +12,7 @@
// size_t in_kh_step, size_t in_kw_step, size_t kernel_w)

// x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: in_kh_step, x6: in_kw_step, x7: kernel_w
DeconvDwFp16Border:
asm_function DeconvDwFp16Border
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved


+ 2
- 1
mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Center.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -12,7 +13,7 @@
// size_t in_kh_step, size_t in_kw_step);
// x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: kernel_h, x6: kernel_w, x7: out_h_step
// x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step
DeconvDwFp16Center:
asm_function DeconvDwFp16Center
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved


+ 2
- 1
mindspore/lite/nnacl/assembly/fp16/Float16ToFloat32.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -9,7 +10,7 @@

// void Float16ToFloat32(const float16_t *input, float *output, int number);
// x0: input, x1: output, x2: number
Float16ToFloat32:
asm_function Float16ToFloat32
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved


+ 2
- 1
mindspore/lite/nnacl/assembly/fp16/Float32ToFloat16.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -9,7 +10,7 @@

// void Float32ToFloat16(const float *input, float16_t *output, int number);
// x0: input, x1: output, x2: number
Float32ToFloat16:
asm_function Float32ToFloat16
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved


+ 5
- 4
mindspore/lite/nnacl/assembly/fp16/IndirectGemmFp16_16x8.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -12,7 +13,7 @@
// x0: output, x1: input, x2: weight, x3: bias, x4: step, x5: ic4, x6: oc8, x7: offset,
// x8:mode, x9: writeC4, x10:relu, x11: relu6
// compute 8 channel for 16 outputs
IndirectGemmFp16_16x8:
asm_function IndirectGemmFp16_16x8

.macro INIT_BIAS
dup v16.4s, wzr
@@ -41,7 +42,7 @@ IndirectGemmFp16_16x8:
// x19 ~ x29 should be also preserved
// whereas our coding style do not permit such amount of parameters
sub sp, sp, #128
// performance between storing 4 registers at the same time and seperatly storing them on in-order cores
// performance between storing 4 registers at the same time and separately storing them on in-order cores
// is not tested yet
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
@@ -86,7 +87,7 @@ IndirectGemmStart:
fmla v19.8h, v9.8h, v1.h[5]
// load input for output 9-16
// input cache should be refreshed after loading
// ATTENTION: advance is prefered, but advancing too much may lead to invalid prefetching
// ATTENTION: advance is preferred, but advancing too much may lead to invalid prefetching
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x12], #64
// last 2 steps for output 1 and 3
fmla v16.8h, v10.8h, v0.h[2]
@@ -295,7 +296,7 @@ IndirectGemmStart:
cmp x6, #7
beq Write7
b Write8
// prefetching is not prefered while writing results in spite of cache missings
// prefetching is not preferred while writing results in spite of cache missing
// you could try prfm pstl2strm
// there are almost no benefits observed though
Write1:


+ 2
- 1
mindspore/lite/nnacl/assembly/fp16/MatVecMulFp16.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatVecMulFp16Neon64
@@ -15,7 +16,7 @@
// w5: depth
// w6: col

MatVecMulFp16Neon64:
asm_function MatVecMulFp16Neon64
sub sp, sp, #128
st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64


+ 2
- 1
mindspore/lite/nnacl/assembly/fp16/MatmulFp16.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatmulFp16Neon64
@@ -19,7 +20,7 @@
// w17: stride
// w13: writeC8

MatmulFp16Neon64:
asm_function MatmulFp16Neon64
sub sp, sp, #128
st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64


+ 2
- 1
mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatmulFp16Neon64Opt
@@ -19,7 +20,7 @@
// x8: stride
// x9: writeMode

MatmulFp16Neon64Opt:
asm_function MatmulFp16Neon64Opt
sub sp, sp, #80
st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
stp x19, x20, [sp], #16


+ 2
- 1
mindspore/lite/nnacl/assembly/fp16/MatmulWinogradFp16.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -9,7 +10,7 @@

// MatrixMultiplyWinogradFp16(float16_t *matrix_a, float16_t *matrix_b, float16_t *matrix_c, int m, int k, int n, int in_channel)
// x0: matrix_a, x1: matrix_b, x2: matrix_c, x3: m, x4: k, x5: n, x6: in_channel
MatrixMultiplyWinogradFp16:
asm_function MatrixMultiplyWinogradFp16
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved


+ 2
- 1
mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC4Fp16.S View File

@@ -1,3 +1,4 @@
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -13,7 +14,7 @@
// w3 oc4div w4 oc4mod w5 plane_size
// x6 plane_stride x7 relu_type

PostFuncBiasReluC4Fp16:
asm_function PostFuncBiasReluC4Fp16

movi v26.4h, #6
scvtf v26.4h, v26.4h


+ 2
- 1
mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC8Fp16.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -21,7 +22,7 @@
// w10 oc8 loop control
// w13 hw loop control

PostFuncBiasReluC8Fp16:
asm_function PostFuncBiasReluC8Fp16
movi v26.8h, #0x46, lsl #8
dup v27.8h, wzr
mov w10, #0


+ 2
- 1
mindspore/lite/nnacl/assembly/fp16/TiledC4MatmulFp16.S View File

@@ -1,3 +1,4 @@
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -6,7 +7,7 @@
.type TiledC4MatmulFp16, %function
#endif

TiledC4MatmulFp16:
asm_function TiledC4MatmulFp16

sub sp, sp, #128
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64


+ 2
- 1
mindspore/lite/nnacl/assembly/fp16/WinogradTransLeftFp16.S View File

@@ -1,3 +1,4 @@
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -6,7 +7,7 @@
.type WinogradTransLeftFp16, %function
#endif

WinogradTransLeftFp16:
asm_function WinogradTransLeftFp16

sub sp, sp, #32
stp x19, x20, [sp], #32


+ 2
- 1
mindspore/lite/nnacl/assembly/fp16/WinogradTransRightFp16.S View File

@@ -1,3 +1,4 @@
#include "nnacl/assembly_global.h"

.text
.align 5
@@ -6,7 +7,7 @@
.type WinogradTransRightFp16, %function
#endif

WinogradTransRightFp16:
asm_function WinogradTransRightFp16

mov x8, #8 // 4 * sizeof(float16)
mul x8, x6, x8


+ 2
- 1
mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatmulInt8DpNeon64
@@ -29,7 +30,7 @@
// w24: stride
// w27: filter_peroc

MatmulInt8DpNeon64:
asm_function MatmulInt8DpNeon64
sub sp, sp, #208
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64


+ 2
- 1
mindspore/lite/nnacl/assembly/opt/MatmulDpInt8Opt.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatmulInt8DpOpt
@@ -28,7 +29,7 @@
// x15: filter_peroc
// x28: filter_zp

MatmulInt8DpOpt:
asm_function MatmulInt8DpOpt
sub sp, sp, #208
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64


+ 2
- 1
mindspore/lite/nnacl/assembly/opt/MatmulOptR4Int8.S View File

@@ -1,4 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatMulOptR4Int8Neon64
@@ -18,7 +19,7 @@
// x6: a_sums
// x7: bias

MatMulOptR4Int8Neon64:
asm_function MatMulOptR4Int8Neon64
sub sp, sp, #128
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64


+ 32
- 0
mindspore/lite/nnacl/assembly_global.h View File

@@ -0,0 +1,32 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_NNACL_ASSEMBLY_GLOBAL_H
#define MINDSPORE_LITE_NNACL_ASSEMBLY_GLOBAL_H

/*
 * asm_function fname
 *
 * Emits the symbol declaration and the entry label for a hand-written
 * assembly routine, so every .S file can open a function with one line:
 *
 *     asm_function MyKernel
 *
 * Apple targets (__APPLE__):
 *   The symbol is exported and defined as `_fname`, i.e. with the leading
 *   underscore that Mach-O expects for C-callable symbols.
 *
 * All other targets:
 *   The symbol is exported and defined as `fname`. When the object format is
 *   ELF (__ELF__), it is additionally marked `.hidden` and typed as a
 *   function via `.type`; both directives are ELF-specific, which is why they
 *   are guarded separately.
 *
 * NOTE(review): with the guard spelled correctly, `.hidden` now takes effect
 * on ELF builds, so these routines are no longer visible outside the shared
 * object that contains them - confirm nothing resolves them across library
 * boundaries (e.g. via dlsym).
 *
 * Every directive and the label must stay on its own line. Do not let a C
 * source formatter reflow this macro: it joins directives together and splits
 * `%function` into `% function`, which the assembler cannot parse.
 */
.macro asm_function fname
#ifdef __APPLE__
.globl _\fname
_\fname:
#else
.global \fname
#ifdef __ELF__
.hidden \fname
.type \fname, %function
#endif
\fname:
#endif
.endm

#endif // MINDSPORE_LITE_NNACL_ASSEMBLY_GLOBAL_H

Loading…
Cancel
Save