Browse Source

!14741 [MS][LITE][Develop]fix assembly headers and declaration style

From: @lx0095
Reviewed-by: @zhanghaibo5,@zhang_xue_tong
Signed-off-by: @zhang_xue_tong
pull/14741/MERGE
mindspore-ci-bot Gitee 5 years ago
parent
commit
f871ce803d
93 changed files with 191 additions and 567 deletions
  1. +1
    -7
      mindspore/lite/nnacl/assembly/arm32/ConvDw3x3Int8BorderPixel.S
  2. +0
    -4
      mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Border.S
  3. +1
    -7
      mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Center.S
  4. +0
    -4
      mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Row.S
  5. +2
    -7
      mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Center.S
  6. +1
    -7
      mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4.S
  7. +1
    -7
      mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4PerChannel.S
  8. +1
    -7
      mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Row.S
  9. +1
    -7
      mindspore/lite/nnacl/assembly/arm32/DeconvDwFp32Center.S
  10. +1
    -7
      mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Center.S
  11. +1
    -7
      mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Post.S
  12. +0
    -4
      mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt16to32_8x4.S
  13. +1
    -7
      mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt8_2x4.S
  14. +1
    -7
      mindspore/lite/nnacl/assembly/arm32/MatVecMulFp32.S
  15. +3
    -6
      mindspore/lite/nnacl/assembly/arm32/MatmulFp32.S
  16. +3
    -6
      mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S
  17. +3
    -6
      mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt12x4.S
  18. +1
    -7
      mindspore/lite/nnacl/assembly/arm32/MatmulInt8.S
  19. +1
    -7
      mindspore/lite/nnacl/assembly/arm32/MatmulInt8Opt.S
  20. +0
    -4
      mindspore/lite/nnacl/assembly/arm32/MatmulWinogradFp32.S
  21. +2
    -5
      mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC4.S
  22. +0
    -5
      mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC8.S
  23. +2
    -5
      mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Peroc.S
  24. +2
    -5
      mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Pert.S
  25. +3
    -6
      mindspore/lite/nnacl/assembly/arm32/TiledC4MatmulFp32.S
  26. +2
    -6
      mindspore/lite/nnacl/assembly/arm32/WinogradTransLeft.S
  27. +2
    -6
      mindspore/lite/nnacl/assembly/arm32/WinogradTransRight.S
  28. +4
    -7
      mindspore/lite/nnacl/assembly/arm64/AdderFp32.S
  29. +1
    -5
      mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Corner.S
  30. +1
    -5
      mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Horizontal.S
  31. +1
    -6
      mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S
  32. +1
    -6
      mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S
  33. +1
    -5
      mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Vertical.S
  34. +1
    -6
      mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S
  35. +1
    -5
      mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S
  36. +1
    -5
      mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S
  37. +1
    -6
      mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S
  38. +1
    -5
      mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S
  39. +1
    -5
      mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Border.S
  40. +1
    -5
      mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Center.S
  41. +1
    -5
      mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S
  42. +1
    -5
      mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect5x5.S
  43. +1
    -5
      mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Row.S
  44. +1
    -5
      mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Center.S
  45. +1
    -5
      mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4.S
  46. +1
    -5
      mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4PerChannel.S
  47. +1
    -5
      mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Row.S
  48. +1
    -5
      mindspore/lite/nnacl/assembly/arm64/ConvFp32Center.S
  49. +1
    -5
      mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Border.S
  50. +1
    -5
      mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Center.S
  51. +1
    -5
      mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Center.S
  52. +1
    -5
      mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Post.S
  53. +1
    -5
      mindspore/lite/nnacl/assembly/arm64/IndirectGemmInt16to32_8x4.S
  54. +4
    -7
      mindspore/lite/nnacl/assembly/arm64/MatVecMulFp32.S
  55. +4
    -7
      mindspore/lite/nnacl/assembly/arm64/MatmulFp32.S
  56. +4
    -7
      mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S
  57. +4
    -7
      mindspore/lite/nnacl/assembly/arm64/MatmulInt8.S
  58. +4
    -7
      mindspore/lite/nnacl/assembly/arm64/MatmulInt8Opt.S
  59. +4
    -7
      mindspore/lite/nnacl/assembly/arm64/MatmulR4Int8.S
  60. +1
    -5
      mindspore/lite/nnacl/assembly/arm64/MatmulWinogradFp32.S
  61. +3
    -8
      mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC4.S
  62. +3
    -8
      mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC8.S
  63. +3
    -9
      mindspore/lite/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S
  64. +4
    -8
      mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Peroc.S
  65. +4
    -8
      mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Pert.S
  66. +3
    -7
      mindspore/lite/nnacl/assembly/arm64/TiledC4MatmulFp32.S
  67. +3
    -7
      mindspore/lite/nnacl/assembly/arm64/WinogradTransLeft.S
  68. +3
    -7
      mindspore/lite/nnacl/assembly/arm64/WinogradTransRight.S
  69. +0
    -6
      mindspore/lite/nnacl/assembly/avx/ConvDwFp32Avx3x3.S
  70. +5
    -9
      mindspore/lite/nnacl/assembly/avx/ConvDwFp32BorderAvx.S
  71. +5
    -9
      mindspore/lite/nnacl/assembly/avx/ConvDwFp32RowAvx.S
  72. +3
    -8
      mindspore/lite/nnacl/assembly/avx/MatmulAvx.S
  73. +1
    -5
      mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Border.S
  74. +1
    -5
      mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Center.S
  75. +1
    -5
      mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Row.S
  76. +1
    -5
      mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Border.S
  77. +1
    -5
      mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Center.S
  78. +1
    -5
      mindspore/lite/nnacl/assembly/fp16/Float16ToFloat32.S
  79. +1
    -5
      mindspore/lite/nnacl/assembly/fp16/Float32ToFloat16.S
  80. +1
    -5
      mindspore/lite/nnacl/assembly/fp16/IndirectGemmFp16_16x8.S
  81. +4
    -7
      mindspore/lite/nnacl/assembly/fp16/MatVecMulFp16.S
  82. +4
    -7
      mindspore/lite/nnacl/assembly/fp16/MatmulFp16.S
  83. +4
    -7
      mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S
  84. +1
    -5
      mindspore/lite/nnacl/assembly/fp16/MatmulWinogradFp16.S
  85. +5
    -7
      mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC4Fp16.S
  86. +3
    -8
      mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC8Fp16.S
  87. +2
    -4
      mindspore/lite/nnacl/assembly/fp16/TiledC4MatmulFp16.S
  88. +5
    -6
      mindspore/lite/nnacl/assembly/fp16/WinogradTransLeftFp16.S
  89. +6
    -7
      mindspore/lite/nnacl/assembly/fp16/WinogradTransRightFp16.S
  90. +3
    -7
      mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S
  91. +3
    -7
      mindspore/lite/nnacl/assembly/opt/MatmulDpInt8Opt.S
  92. +3
    -7
      mindspore/lite/nnacl/assembly/opt/MatmulOptR4Int8.S
  93. +11
    -8
      mindspore/lite/nnacl/assembly_global.h

+ 1
- 7
mindspore/lite/nnacl/assembly/arm32/ConvDw3x3Int8BorderPixel.S View File

@@ -1,13 +1,8 @@
#ifdef __arm__
#ifndef __aarch64__
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5
.global ConvDw3x3Int8BorderPixel
#ifndef __APPLE__
.type ConvDw3x3Int8BorderPixel, %function
#endif

// void ConvDw3x3Int8BorderPixel(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, size_t height,
// size_t width, size_t in_kh_step, size_t in_kw_step, size_t channel, size_t in_zp, size_t out_zp,
@@ -116,4 +111,3 @@ asm_function ConvDw3x3Int8BorderPixel
vpop {q4-q7}
pop {r4-r8, r9-r12, pc}
#endif
#endif

+ 0
- 4
mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Border.S View File

@@ -3,10 +3,6 @@

.text
.align 5
.global ConvDwFp32Border
#ifndef __APPLE__
.type ConvDwFp32Border, %function
#endif

// void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
// size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6)


+ 1
- 7
mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Center.S View File

@@ -1,13 +1,8 @@
#ifdef __arm__
#ifndef __aarch64__
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5
.global ConvDwFp32Center
#ifndef __APPLE__
.type ConvDwFp32Center, %function
#endif

// void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,
@@ -164,4 +159,3 @@ LoopWEnd:
vpop {q4-q7}
pop {r0-r8, r10, r11, pc}
#endif
#endif

+ 0
- 4
mindspore/lite/nnacl/assembly/arm32/ConvDwFp32Row.S View File

@@ -3,10 +3,6 @@

.text
.align 5
.global ConvDwFp32Row
#ifndef __APPLE__
.type ConvDwFp32Row, %function
#endif

// void ConvDwFp32Row(float* output_ptr, const float* input_ptr, const float* filter_ptr,
// size_t num_pixels, size_t input_channel, size_t input_step)


+ 2
- 7
mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Center.S View File

@@ -1,13 +1,9 @@
#ifdef __arm__
#ifndef __aarch64__
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5
.global ConvDwInt8Center
#ifndef __APPLE__
.type ConvDwInt8Center, %function
#endif

// void ConvDwInt8Center(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, int height,
// int width, int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step,
// int in_sw_step, int in_kh_step, int in_kw_step, int8_t *in_zp, int32_t *out_zp,
@@ -277,4 +273,3 @@ asm_function ConvDwInt8Center
vpop {q4-q7}
pop {r0-r8, r10, r11, pc}
#endif
#endif

+ 1
- 7
mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4.S View File

@@ -1,13 +1,8 @@
#ifdef __arm__
#ifndef __aarch64__
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5
.global ConvDwInt8PostAlign4
#ifndef __APPLE__
.type ConvDwInt8PostAlign4, %function
#endif

// void ConvDwInt8PostAlign4(int8_t *dst, int32_t *buffer, int num_pixels, int32_t output_zp, int32_t out_multiplier,
// int32_t left_shift, int32_t right_shift, int32_t acc_min, int32_t acc_max);
@@ -108,4 +103,3 @@ asm_function ConvDwInt8PostAlign4
bx lr

#endif
#endif

+ 1
- 7
mindspore/lite/nnacl/assembly/arm32/ConvDwInt8PostAlign4PerChannel.S View File

@@ -1,13 +1,8 @@
#ifdef __arm__
#ifndef __aarch64__
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5
.global ConvDwInt8PostAlign4PerChannel
#ifndef __APPLE__
.type ConvDwInt8PostAlign4PerChannel, %function
#endif

// void ConvDwInt8PostAlign4PerChannel(int8_t *dst, int32_t *buffer, int channel4, int32_t output_zp, int32_t *out_multiplier,
// int32_t *left_shift, int32_t *right_shift, int32_t acc_min, int32_t acc_max);
@@ -111,4 +106,3 @@ asm_function ConvDwInt8PostAlign4PerChannel
bx lr

#endif
#endif

+ 1
- 7
mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Row.S View File

@@ -1,13 +1,8 @@
#ifdef __arm__
#ifndef __aarch64__
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5
.global ConvDwInt8Row
#ifndef __APPLE__
.type ConvDwInt8Row, %function
#endif

// void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels,
// int output_channel, int input_step, int8_t input_zp)
@@ -132,4 +127,3 @@ asm_function ConvDwInt8Row
vpop {q4-q7}
pop {r4-r8, r9-r12, pc}
#endif
#endif

+ 1
- 7
mindspore/lite/nnacl/assembly/arm32/DeconvDwFp32Center.S View File

@@ -1,13 +1,8 @@
#ifdef __arm__
#ifndef __aarch64__
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5
.global DeconvDwFp32Center
#ifndef __APPLE__
.type DeconvDwFp32Center, %function
#endif

// void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width,
// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
@@ -67,4 +62,3 @@ asm_function DeconvDwFp32Center

pop {r0-r8, r10, r11, pc}
#endif
#endif

+ 1
- 7
mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Center.S View File

@@ -1,13 +1,8 @@
#ifdef __arm__
#ifndef __aarch64__
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5
.global DeconvDwInt8Center
#ifndef __APPLE__
.type DeconvDwInt8Center, %function
#endif

// void DeconvDwInt8Center(int32_t *dst, const int16_t *src, const int16_t *weight, size_t height, size_t width,
// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
@@ -67,4 +62,3 @@ asm_function DeconvDwInt8Center

pop {r0-r8, r10, r11, pc}
#endif
#endif

+ 1
- 7
mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Post.S View File

@@ -1,13 +1,8 @@
#ifdef __arm__
#ifndef __aarch64__
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5
.global DeconvDwInt8Post
#ifndef __APPLE__
.type DeconvDwInt8Post, %function
#endif

// void DeconvDwInt8Post(int8_t *dst, int32_t *output_buffer, const int32_t *bias, int block_channel, int pixel_nums,
// int out_multiplier, int left_shift, int right_shift, int32_t out_zp, int32_t acc_min,
@@ -72,4 +67,3 @@ asm_function DeconvDwInt8Post
bx lr

#endif
#endif

+ 0
- 4
mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt16to32_8x4.S View File

@@ -3,10 +3,6 @@

.text
.align 5
.global IndirectGemmInt16to32_8x4
#ifndef __APPLE__
.type IndirectGemmInt16to32_8x4, %function
#endif

// void IndirectGemmInt16to32_8x4(int *output, short *input, short *weight, size_t ksize, size_t ic8, size_t oc4, size_t offset);
// r0: output, r1: input, r2: weight, r3: ksize, r4: ic8, r5: oc4, r6: offset


+ 1
- 7
mindspore/lite/nnacl/assembly/arm32/IndirectGemmInt8_2x4.S View File

@@ -1,13 +1,8 @@
#ifdef __arm__
#ifndef __aarch64__
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5
.global IndirectGemmInt8_2x4
#ifndef __APPLE__
.type IndirectGemmInt8_2x4, %function
#endif

// void IndirectGemmInt8_2x4(int8_t *output, int8_t *input, int8_t *weight, int32_t *bias, size_t ksize, size_t ic4,
// size_t oc, size_t offset, int32_t *input_sum, size_t act_min, size_t act_max, size_t out_zp, int32_t *out_multiplier,
@@ -294,4 +289,3 @@ LoopOcEnd:
vpop {q4-q7}
pop {r4-r8, r10, r11, pc}
#endif
#endif

+ 1
- 7
mindspore/lite/nnacl/assembly/arm32/MatVecMulFp32.S View File

@@ -1,13 +1,8 @@
#ifdef __arm__
#ifndef __aarch64__
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5
.global MatVecMulFp32
#ifndef __APPLE__
.type MatVecMulFp32, %function
#endif

// void MatVecMulFp32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col)
// r0: a
@@ -183,4 +178,3 @@ End:
sub sp, sp, #52
pop {r0-r8, r9, r10, r11, pc}
#endif
#endif

+ 3
- 6
mindspore/lite/nnacl/assembly/arm32/MatmulFp32.S View File

@@ -1,11 +1,8 @@
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatmulFloatNeon32
#ifndef __APPLE__
.type MatmulFloatNeon32, %function
#endif

.text
.align 5

// void MatmulFloatNeon32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth,
// int row, int col, size_t stride, size_t writeNhwc, size_t WriteWino)


+ 3
- 6
mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S View File

@@ -1,11 +1,8 @@
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatmulFloatNeon32Opt
#ifndef __APPLE__
.type MatmulFloatNeon32Opt, %function
#endif

.text
.align 5

// void MatmulFloatNeon32Opt(const float *a, const float *b, float *c, const float *bias, int act_type, int depth,
// int row, int col, size_t stride, size_t writeMode)


+ 3
- 6
mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt12x4.S View File

@@ -1,11 +1,8 @@
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatmulFloatNeon32Opt12x4
#ifndef __APPLE__
.type MatmulFloatNeon32Opt12x4, %function
#endif

.text
.align 5

// void MatmulFloatNeon32Opt12x4(const float *a, const float *b, float *c, const float *bias, int act_type, int depth,
// int row, int col, size_t stride, size_t writeMode)


+ 1
- 7
mindspore/lite/nnacl/assembly/arm32/MatmulInt8.S View File

@@ -1,13 +1,8 @@
#ifdef __arm__
#ifndef __aarch64__
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5
.global MatmulInt8Neon32
#ifndef __APPLE__
.type MatmulInt8Neon32, %function
#endif

//void MatmulInt8Neon32(const int8_t *a, const int8_t *b, int8_t *dst, int row, int col, int deep16,
// const int *input_sums, const int *weight_bias, int act_min, int act_max, int out_zp,
@@ -286,4 +281,3 @@ End1:
vpop {q4-q7}
pop {r0-r11, pc}
#endif
#endif

+ 1
- 7
mindspore/lite/nnacl/assembly/arm32/MatmulInt8Opt.S View File

@@ -1,13 +1,8 @@
#ifdef __arm__
#ifndef __aarch64__
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5
.global MatmulInt8Opt
#ifndef __APPLE__
.type MatmulInt8Opt, %function
#endif

//void MatmulInt8Neon32Opt(const int8_t *a, const int8_t *b, int8_t *dst, int row, int col, int deep16,
// const int *input_sums, const int *weight_bias, int act_min, int act_max, int out_zp,
@@ -288,4 +283,3 @@ LoopRowEnd:
vpop {q4-q7}
pop {r0-r8, r10, r11, pc}
#endif
#endif

+ 0
- 4
mindspore/lite/nnacl/assembly/arm32/MatmulWinogradFp32.S View File

@@ -3,10 +3,6 @@

.text
.align 5
.global MatrixMultiplyWinograd
#ifndef __APPLE__
.type MatrixMultiplyWinograd, %function
#endif

// MatrixMultiplyWinograd(float *matrix_a, float *matrix_b, float *matrix_c, int m, int k, int n, int in_channel, int c4_channel)
// r0: matrix_a, r1: matrix_b, r2: matrix_c, r3: m, r4: k, r5: n, r6: in_channel, r7: c4_channel * 4


+ 2
- 5
mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC4.S View File

@@ -1,12 +1,8 @@
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5
//.p2align 5,,15
.global PostFuncBiasReluC4
#ifndef __APPLE__
.type PostFuncBiasReluC4, %function
#endif

asm_function PostFuncBiasReluC4
push {r4-r8, r10, r11, lr}
@@ -234,3 +230,4 @@ Loop_C1_3_Write:
End:
sub sp, sp, #32
pop {r4-r8, r10, r11, pc}
#endif

+ 0
- 5
mindspore/lite/nnacl/assembly/arm32/PostFuncBiasReluC8.S View File

@@ -3,11 +3,6 @@

.text
.align 5
//.p2align 5,,15
.global PostFuncBiasReluC8
#ifndef __APPLE__
.type PostFuncBiasReluC8, %function
#endif

//void PostFuncBiasReluC8(float *dst, const float *src, const float *bias, size_t oc8div, size_t oc8mod,
// size_t plane_size, size_t stride, int relu_type);


+ 2
- 5
mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Peroc.S View File

@@ -1,12 +1,8 @@
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5
.global PreSum4x16Int8Peroc
#ifndef __APPLE__
.type PreSum4x16Int8Peroc, %function
#endif


//void PreSum4x16Int8Peroc(const int8_t *src, int32_t *sum, int32_t *zp, size_t hw4, size_t ic16, int32_t oc_div2,
// size_t oc_res2, size_t stride);
@@ -129,3 +125,4 @@ End:
sub sp, sp, #100
vpop {q4-q7}
pop {r4-r11, pc}
#endif

+ 2
- 5
mindspore/lite/nnacl/assembly/arm32/PreSum4x16Int8Pert.S View File

@@ -1,12 +1,8 @@
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5
.global PreSum4x16Int8Pert
#ifndef __APPLE__
.type PreSum4x16Int8Pert, %function
#endif


// void PreSum4x16Int8Pert(const int8_t *src, int32_t *sum, size_t row4, size_t col16, int32_t filter_zp);

@@ -80,3 +76,4 @@ End:
sub sp, sp, #96
vpop {q4-q7}
pop {r4-r8, r10, r11, pc}
#endif

+ 3
- 6
mindspore/lite/nnacl/assembly/arm32/TiledC4MatmulFp32.S View File

@@ -1,11 +1,8 @@
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"
.text
.align 5
.global TiledC4MatmulFp32
#ifndef __APPLE__
.type TiledC4MatmulFp32, %function
#endif

.text
.align 5

asm_function TiledC4MatmulFp32
//void TiledC4MatmulFp32(float* dst, const float* src, const float* weight, size_t cal_num, size_t ic4, size_t oc4)


+ 2
- 6
mindspore/lite/nnacl/assembly/arm32/WinogradTransLeft.S View File

@@ -1,12 +1,8 @@
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5
.global WinogradTransLeft
#ifndef __APPLE__
.type WinogradTransLeft, %function
#endif
.text
.align 5

//void WinogradTransLeft(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length);
//r0: S


+ 2
- 6
mindspore/lite/nnacl/assembly/arm32/WinogradTransRight.S View File

@@ -1,12 +1,8 @@
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5
.global WinogradTransRight
#ifndef __APPLE__
.type WinogradTransRight, %function
#endif
.text
.align 5

//void WinogradTransRight(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length);
//r0: S


+ 4
- 7
mindspore/lite/nnacl/assembly/arm64/AdderFp32.S View File

@@ -1,11 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"
.text
.align 5
.global AdderFloatNeon64
#ifndef __APPLE__
.type AdderFloatNeon64, %function
#endif

.text
.align 5

// void AdderFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth,
// int row, int col, size_t stride)


+ 1
- 5
mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Corner.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global ConvDw3x3Corner
#ifndef __APPLE__
.type ConvDw3x3Corner, %function
#endif

// void ConvDw3x3Corner(float *dst, const float *src, const float *weight, const float *bias, int in_kh_step,
// int in_kw_step, int channel, size_t relu, size_t relu6)


+ 1
- 5
mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Horizontal.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global ConvDw3x3Horizontal
#ifndef __APPLE__
.type ConvDw3x3Horizontal, %function
#endif

// void ConvDw3x3Horizontal(float *dst, const float *src, const float *weight, const float *bias, int in_kh_step,
// int in_kw_step, int channel, size_t relu, size_t relu6)


+ 1
- 6
mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Stride1.S View File

@@ -1,13 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global ConvDw3x3Stride1
#ifndef __APPLE__
.type ConvDw3x3Stride1, %function
#endif


// void ConvDw3x3Stride1(float *output, const float *buffer, const float *weight, const float *bias, int col_size,
// int row_size, int channel, int output_h, int output_w, size_t relu, size_t relu6)


+ 1
- 6
mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Stride2.S View File

@@ -1,13 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global ConvDw3x3Stride2
#ifndef __APPLE__
.type ConvDw3x3Stride2, %function
#endif


// void ConvDw3x3Stride2(float *output, const float *buffer, const float *weight, const float *bias, int col_size,
// int row_size, int channel, int output_h, int output_w, size_t relu, size_t relu6)


+ 1
- 5
mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Fp32Vertical.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global ConvDw3x3Vertical
#ifndef __APPLE__
.type ConvDw3x3Vertical, %function
#endif

// void ConvDw3x3Vertical(float *dst, const float *src, const float *weight, const float *bias, int in_kh_step,
// int in_kw_step, int channel, size_t relu, size_t relu6)


+ 1
- 6
mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S View File

@@ -1,13 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global ConvDw3x3Int8Neon64
#ifndef __APPLE__
.type ConvDw3x3Int8Neon64, %function
#endif


// void ConvDw3x3Int8Neon64(int8_t *output, const int8_t *input, const int16_t *weight, const int32_t *bias, int input_col_size,
// int input_row_size, int channel, int output_h, int output_w, int8_t in_zp, int32_t out_zp,


+ 1
- 5
mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global ConvDw3x3Int8Corner
#ifndef __APPLE__
.type ConvDw3x3Int8Corner, %function
#endif

// void ConvDw3x3Int8Corner(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, size_t in_kh_step,
// size_t in_kw_step, size_t channel, size_t in_zp, size_t out_zp, int32_t *out_multiplier,


+ 1
- 5
mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global ConvDw3x3Int8Horizontal
#ifndef __APPLE__
.type ConvDw3x3Int8Horizontal, %function
#endif

// void ConvDw3x3Int8Horizontal(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, size_t in_kh_step,
// size_t in_kw_step, size_t channel, size_t in_zp, size_t out_zp, int32_t *out_multiplier,


+ 1
- 6
mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Stride2.S View File

@@ -1,13 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global ConvDw3x3Int8Stride2
#ifndef __APPLE__
.type ConvDw3x3Int8Stride2, %function
#endif


// void ConvDw3x3Int8Stride2(int8_t *output, const int8_t *input, const int16_t *weight, const int32_t *bias, int input_col_size,
// int input_row_size, int channel, int output_h, int output_w, int8_t in_zp, int32_t out_zp,


+ 1
- 5
mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global ConvDw3x3Int8Vertical
#ifndef __APPLE__
.type ConvDw3x3Int8Vertical, %function
#endif

// void ConvDw3x3Int8Vertical(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, size_t in_kh_step,
// size_t in_kw_step, size_t channel, size_t in_zp, size_t out_zp, int32_t *out_multiplier,


+ 1
- 5
mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Border.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global ConvDwFp32Border
#ifndef __APPLE__
.type ConvDwFp32Border, %function
#endif

// void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
// size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6)


+ 1
- 5
mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Center.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global ConvDwFp32Center
#ifndef __APPLE__
.type ConvDwFp32Center, %function
#endif

// void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,


+ 1
- 5
mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global ConvDwFp32Indirect3x3
#ifndef __APPLE__
.type ConvDwFp32Indirect3x3, %function
#endif

// void ConvDwFp32Indirect3x3(float *output, float **input, const float *weights, const float *bias, int channels, int output_width,
// size_t input_stride, size_t relu, size_t relu6)


+ 1
- 5
mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect5x5.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global ConvDwFp32Indirect5x5
#ifndef __APPLE__
.type ConvDwFp32Indirect5x5, %function
#endif

// void ConvDwFp32Indirect5x5(float *output, float **input, const float *weights, const float *bias, int channels, int output_width,
// size_t input_stride, size_t relu, size_t relu6)


+ 1
- 5
mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Row.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global ConvDwFp32Row
#ifndef __APPLE__
.type ConvDwFp32Row, %function
#endif

// void ConvDwFp32Row(float* output_ptr, const float* input_ptr, const float* filter_ptr,
// size_t num_pixels, size_t input_channel, size_t input_step)


+ 1
- 5
mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Center.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global ConvDwInt8Center
#ifndef __APPLE__
.type ConvDwInt8Center, %function
#endif

// void ConvDwInt8Center(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, size_t height,
// size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,


+ 1
- 5
mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global ConvDwInt8PostAlign4
#ifndef __APPLE__
.type ConvDwInt8PostAlign4, %function
#endif

// void ConvDwInt8PostAlign4(int8_t *dst, int32_t *buffer, int num_pixels, int32_t output_zp, int32_t out_multiplier,
// int32_t left_shift, int32_t right_shift, int32_t acc_min, int32_t acc_max);


+ 1
- 5
mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4PerChannel.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global ConvDwInt8PostAlign4PerChannel
#ifndef __APPLE__
.type ConvDwInt8PostAlign4PerChannel, %function
#endif

// void ConvDwInt8PostAlign4PerChannel(int8_t *dst, int32_t *buffer, int channel4, int32_t output_zp, int32_t *out_multiplier,
// int32_t *left_shift, int32_t *right_shift, int32_t acc_min, int32_t acc_max);


+ 1
- 5
mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Row.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global ConvDwInt8Row
#ifndef __APPLE__
.type ConvDwInt8Row, %function
#endif

// void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels,
// int output_channel, int input_step, int8_t input_zp)


+ 1
- 5
mindspore/lite/nnacl/assembly/arm64/ConvFp32Center.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global ConvSwFp32Center
#ifndef __APPLE__
.type ConvSwFp32Center, %function
#endif

// void ConvSwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t ic4, size_t in_sh_step,


+ 1
- 5
mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Border.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global DeconvDwFp32Border
#ifndef __APPLE__
.type DeconvDwFp32Border, %function
#endif

// void DeconvDwFp32Border(float *dst, const float *src, const float *weight, size_t height, size_t width,
// size_t in_kh_step, size_t in_kw_step, size_t kernel_w)


+ 1
- 5
mindspore/lite/nnacl/assembly/arm64/DeconvDwFp32Center.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global DeconvDwFp32Center
#ifndef __APPLE__
.type DeconvDwFp32Center, %function
#endif

// void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width,
// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,


+ 1
- 5
mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Center.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global DeconvDwInt8Center
#ifndef __APPLE__
.type DeconvDwInt8Center, %function
#endif

// void DeconvDwInt8Center(int32_t *dst, const int16_t *src, const int16_t *weight, size_t height, size_t width,
// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,


+ 1
- 5
mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Post.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global DeconvDwInt8Post
#ifndef __APPLE__
.type DeconvDwInt8Post, %function
#endif

// void DeconvDwInt8Post(int8_t *dst, int32_t *output_buffer, const int32_t *bias, int block_channel, int pixel_nums,
// int out_multiplier, int left_shift, int right_shift, int32_t out_zp, int32_t acc_min,


+ 1
- 5
mindspore/lite/nnacl/assembly/arm64/IndirectGemmInt16to32_8x4.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global IndirectGemmInt16to32_8x4
#ifndef __APPLE__
.type IndirectGemmInt16to32_8x4, %function
#endif

// void IndirectGemmInt16to32_8x4(int *output, short *input, short *weight, size_t ksize, size_t ic8, size_t oc4, size_t offset);
// x0: output, x1: input, x2: weight, x3: ksize, x4: ic8, x5: oc4, x6: offset


+ 4
- 7
mindspore/lite/nnacl/assembly/arm64/MatVecMulFp32.S View File

@@ -1,11 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatVecMulFp32
#ifndef __APPLE__
.type MatVecMulFp32, %function
#endif
.text
.align 5

// void MatVecMulFp32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col)
// x0: a


+ 4
- 7
mindspore/lite/nnacl/assembly/arm64/MatmulFp32.S View File

@@ -1,11 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatmulFloatNeon64
#ifndef __APPLE__
.type MatmulFloatNeon64, %function
#endif

.text
.align 5

// void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth,
// int row, int col, size_t stride, size_t writeNhwc, size_t WriteWino)


+ 4
- 7
mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S View File

@@ -1,11 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatmulFloatNeon64Opt
#ifndef __APPLE__
.type MatmulFloatNeon64Opt, %function
#endif

.text
.align 5

// void MatmulFloatNeon64Opt(const float *a, const float *b, float *c, const float *bias, int act_type, int depth,
// int row, int col, size_t stride, size_t writeMode)


+ 4
- 7
mindspore/lite/nnacl/assembly/arm64/MatmulInt8.S View File

@@ -1,11 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatmulInt8Neon64
#ifndef __APPLE__
.type MatmulInt8Neon64, %function
#endif

.text
.align 5

//void MatmulInt8Neon64(const int8_t *a, const int8_t *b, int8_t *dst, int row4, int col4, int deep16, const int *a_sums,
// const int *bias, int act_min, int act_max, int out_zp, int32_t *multiplier, int32_t *left_shift,


+ 4
- 7
mindspore/lite/nnacl/assembly/arm64/MatmulInt8Opt.S View File

@@ -1,11 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatmulInt8Opt
#ifndef __APPLE__
.type MatmulInt8Opt, %function
#endif

.text
.align 5

//void MatmulInt8Opt(const int8_t *a, const int8_t *b, int8_t *dst, int row, int col, int deep16, const int *a_sums,
// const int *bias, int act_min, int act_max, int out_zp, int32_t *multiplier, int32_t *left_shift,


+ 4
- 7
mindspore/lite/nnacl/assembly/arm64/MatmulR4Int8.S View File

@@ -1,11 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatMulR4Int8Neon64
#ifndef __APPLE__
.type MatMulR4Int8Neon64, %function
#endif

.text
.align 5

//void MatMulR4Int8Neon64(const int8_t *a, const int8_t *b, int32_t *dst, int row4, int col4, int deep16,
// const int *input_sum, const int *bias)


+ 1
- 5
mindspore/lite/nnacl/assembly/arm64/MatmulWinogradFp32.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global MatrixMultiplyWinograd
#ifndef __APPLE__
.type MatrixMultiplyWinograd, %function
#endif

// MatrixMultiplyWinograd(float *matrix_a, float *matrix_b, float *matrix_c, int m, int k, int n, int in_channel, int c4_channel)
// x0: matrix_a, x1: matrix_b, x2: matrix_c, x3: m, x4: k, x5: n, x6: in_channel, x7: c4_channel


+ 3
- 8
mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC4.S View File

@@ -1,13 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
//.p2align 5,,15
.global PostFuncBiasReluC4
#ifndef __APPLE__
.type PostFuncBiasReluC4, %function
#endif
.text
.align 5

//void PostFuncBiasReluC4(float *dst, const float *src, const float *bias, size_t oc4div, size_t oc4mod,
// size_t plane_size, size_t plane_stride, size_t relu_type);


+ 3
- 8
mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC8.S View File

@@ -1,13 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
//.p2align 5,,15
.global PostFuncBiasReluC8
#ifndef __APPLE__
.type PostFuncBiasReluC8, %function
#endif
.text
.align 5

//void PostFuncBiasReluC8(float *dst, const float *src, const float *bias, size_t oc8div, size_t oc8mod,
// size_t plane_size, size_t stride, int relu_type);


+ 3
- 9
mindspore/lite/nnacl/assembly/arm64/PostFuncInt8C4Neon64.S View File

@@ -1,14 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
//.p2align 5,,15
.global PostFuncInt8C4Neon64
#ifndef __APPLE__
.type PostFuncInt8C4Neon64, %function
#endif

.text
.align 5

//void PostFuncInt8C4Neon64(const int32_t *in, const int32_t *bias, int8_t *out, size_t oc4div, size_t oc4res,
// size_t plane, size_t stride, int32_t multiplier, int32_t left_shift, int32_t right_shift,


+ 4
- 8
mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Peroc.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"
.text
.align 5
//.p2align 5,,15
.global PreSum4x16Int8Peroc
#ifndef __APPLE__
.type PreSum4x16Int8Peroc, %function
#endif

.text
.align 5

//void PreSum4x16Int8Peroc(const int8_t *src, int32_t *sum, int32_t *zp, size_t hw4, size_t ic16, int32_t oc_div4,
// size_t oc_res4, size_t stride);


+ 4
- 8
mindspore/lite/nnacl/assembly/arm64/PreSum4x16Int8Pert.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"
.text
.align 5
//.p2align 5,,15
.global PreSum4x16Int8Pert
#ifndef __APPLE__
.type PreSum4x16Int8Pert, %function
#endif

.text
.align 5

// void PreSum4x16Int8Pert(const int8_t *src, int32_t *dst, size_t row4, size_t col16, int32_t filter_zp);



+ 3
- 7
mindspore/lite/nnacl/assembly/arm64/TiledC4MatmulFp32.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global TiledC4MatmulFp32
#ifndef __APPLE__
.type TiledC4MatmulFp32, %function
#endif
.text
.align 5

asm_function TiledC4MatmulFp32
//void TiledC4MatmulFp32(float* dst, const float* src, const float* weight, size_t ic4, size_t cal_num, size_t oc4)


+ 3
- 7
mindspore/lite/nnacl/assembly/arm64/WinogradTransLeft.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global WinogradTransLeft
#ifndef __APPLE__
.type WinogradTransLeft, %function
#endif
.text
.align 5

asm_function WinogradTransLeft
//void WinogradTransLeft(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length);


+ 3
- 7
mindspore/lite/nnacl/assembly/arm64/WinogradTransRight.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global WinogradTransRight
#ifndef __APPLE__
.type WinogradTransRight, %function
#endif
.text
.align 5

asm_function WinogradTransRight
//void WinogradTransRight(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length);


+ 0
- 6
mindspore/lite/nnacl/assembly/avx/ConvDwFp32Avx3x3.S View File

@@ -2,12 +2,6 @@
#include "nnacl/assembly_global.h"
.text
.align 4
.global ConvDwFp32Avx3x3
#ifndef __APPLE__
#ifndef WIN32
.type ConvDwFp32Avx3x3, %function
#endif
#endif

// void ConvDwFp32Avx3x3(float *output, float **input, const float *weights, const float *bias, size_t channels, size_t output_width,
// size_t input_stride, size_t relum, size_t relu6)


+ 5
- 9
mindspore/lite/nnacl/assembly/avx/ConvDwFp32BorderAvx.S View File

@@ -1,18 +1,14 @@
#ifdef ENABLE_AVX
.text
.align 4
.global ConvDwFp32Border
#ifndef __APPLE__
#ifndef WIN32
.type ConvDwFp32Border, %function
#endif
#endif
#include "nnacl/assembly_global.h"

.text
.align 4

// void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height,
// size_t width, size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu,
// size_t relu6);

ConvDwFp32Border:
asm_function ConvDwFp32Border
pushq %r15
pushq %r14
pushq %r13


+ 5
- 9
mindspore/lite/nnacl/assembly/avx/ConvDwFp32RowAvx.S View File

@@ -1,12 +1,8 @@
#ifdef ENABLE_AVX
.text
.align 4
.global ConvDwFp32Row
#ifndef __APPLE__
#ifndef WIN32
.type ConvDwFp32Row, %function
#endif
#endif
#include "nnacl/assembly_global.h"

.text
.align 4

// void ConvDwFp32Row(float *output_ptr, const float *input_tmp, const float *weight_ptr, size_t num_pixels,
// size_t output_channel, size_t input_step);
@@ -26,7 +22,7 @@
// 40: output_channel
// 48: input_step

ConvDwFp32Row:
asm_function ConvDwFp32Row
pushq %r15
pushq %r14
pushq %r13


+ 3
- 8
mindspore/lite/nnacl/assembly/avx/MatmulAvx.S View File

@@ -1,13 +1,8 @@
#ifdef ENABLE_AVX
#include "nnacl/assembly_global.h"
.text
.align 4
.global MatmulFloatAvxOpt
#ifndef __APPLE__
#ifndef WIN32
.type MatmulFloatAvxOpt, %function
#endif
#endif

.text
.align 4

// void MatmulFloatAvxOpt(const float *a, const float *b, float *c, const float *bias, int act_type, int depth,
// int row, int col, size_t stride, size_t writeMode)


+ 1
- 5
mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Border.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global ConvDwFp16Border
#ifndef __APPLE__
.type ConvDwFp16Border, %function
#endif

// void ConvDwFp16Border(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
// size_t height, size_t width, size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu,


+ 1
- 5
mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Center.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global ConvDwFp16Center
#ifndef __APPLE__
.type ConvDwFp16Center, %function
#endif

// void ConvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, size_t height, size_t width,
// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,


+ 1
- 5
mindspore/lite/nnacl/assembly/fp16/ConvDwFp16Row.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global ConvDwFp16Row
#ifndef __APPLE__
.type ConvDwFp16Row, %function
#endif

// void ConvDwFp16Row(float16_t* output_ptr, const float16_t* input_ptr,const float16_t* filter_ptr,
// size_t num_pixels, size_t input_channel, size_t input_step)


+ 1
- 5
mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Border.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global DeconvDwFp16Border
#ifndef __APPLE__
.type DeconvDwFp16Border, %function
#endif

// void DeconvDwFp16Border(float *dst, const float *src, const float *weight, size_t height, size_t width,
// size_t in_kh_step, size_t in_kw_step, size_t kernel_w)


+ 1
- 5
mindspore/lite/nnacl/assembly/fp16/DeconvDwFp16Center.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global DeconvDwFp16Center
#ifndef __APPLE__
.type DeconvDwFp16Center, %function
#endif

// void DeconvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width,
// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,


+ 1
- 5
mindspore/lite/nnacl/assembly/fp16/Float16ToFloat32.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global Float16ToFloat32
#ifndef __APPLE__
.type Float16ToFloat32, %function
#endif

// void Float16ToFloat32(const float16_t *input, float *output, int number);
// x0: input, x1: output, x2: number


+ 1
- 5
mindspore/lite/nnacl/assembly/fp16/Float32ToFloat16.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global Float32ToFloat16
#ifndef __APPLE__
.type Float32ToFloat16, %function
#endif

// void Float32ToFloat16(const float *input, float16_t *output, int number);
// x0: input, x1: output, x2: number


+ 1
- 5
mindspore/lite/nnacl/assembly/fp16/IndirectGemmFp16_16x8.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global IndirectGemmFp16_16x8
#ifndef __APPLE__
.type IndirectGemmFp16_16x8, %function
#endif

// void IndirectGemmFp16_16x8(float16_t *output, float16_t *input, float16_t *weight, float16_t *bias,
// size_t step, size_t ic4, size_t oc8, size_t offset, size_t mode, size_t writeC4, size_t relu, size_t relu6);


+ 4
- 7
mindspore/lite/nnacl/assembly/fp16/MatVecMulFp16.S View File

@@ -1,11 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatVecMulFp16Neon64
#ifndef __APPLE__
.type MatVecMulFp16Neon64, %function
#endif

.text
.align 5

// void MatVecMulFp16Neon64(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, int depth, int col)
// x0: a


+ 4
- 7
mindspore/lite/nnacl/assembly/fp16/MatmulFp16.S View File

@@ -1,11 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatmulFp16Neon64
#ifndef __APPLE__
.type MatmulFp16Neon64, %function
#endif

.text
.align 5

// void MatmulFp16Neon64(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type,
// int depth, int row, int col, int stride, bool write_nhwc)


+ 4
- 7
mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S View File

@@ -1,11 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatmulFp16Neon64Opt
#ifndef __APPLE__
.type MatmulFp16Neon64Opt, %function
#endif

.text
.align 5

// void MatmulFp16Neon64Opt(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type,
// int depth, int row, int col, size_t stride, size_t writeMode)


+ 1
- 5
mindspore/lite/nnacl/assembly/fp16/MatmulWinogradFp16.S View File

@@ -1,12 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global MatrixMultiplyWinogradFp16
#ifndef __APPLE__
.type MatrixMultiplyWinogradFp16, %function
#endif

// MatrixMultiplyWinogradFp16(float16_t *matrix_a, float16_t *matrix_b, float16_t *matrix_c, int m, int k, int n, int in_channel)
// x0: matrix_a, x1: matrix_b, x2: matrix_c, x3: m, x4: k, x5: n, x6: in_channel


+ 5
- 7
mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC4Fp16.S View File

@@ -1,12 +1,8 @@
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
//.p2align 5,,15
.global PostFuncBiasReluC4Fp16
#ifndef __APPLE__
.type PostFuncBiasReluC4Fp16, %function
#endif
.text
.align 5

//void PostFuncBiasReluC4Fp16(float16_t *dst, const float16_t *src, const float16_t *bias, size_t oc4div, size_t oc4mod,
// size_t plane_size, size_t plane_stride, size_t relu_type);
@@ -278,3 +274,5 @@ Loop_C1_3_Write:

End:
ret

#endif

+ 3
- 8
mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC8Fp16.S View File

@@ -1,13 +1,8 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
//.p2align 5,,15
.global PostFuncBiasReluC8Fp16
#ifndef __APPLE__
.type PostFuncBiasReluC8Fp16, %function
#endif
.text
.align 5

//void PostFuncBiasReluC8Fp16(float *dst, const float *src, const float *bias, size_t oc8div, size_t oc8mod,
// size_t plane_size, size_t stride, int relu_type);


+ 2
- 4
mindspore/lite/nnacl/assembly/fp16/TiledC4MatmulFp16.S View File

@@ -1,11 +1,8 @@
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global TiledC4MatmulFp16
#ifndef __APPLE__
.type TiledC4MatmulFp16, %function
#endif

asm_function TiledC4MatmulFp16

@@ -258,3 +255,4 @@ LoopOcEnd:
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ret

#endif

+ 5
- 6
mindspore/lite/nnacl/assembly/fp16/WinogradTransLeftFp16.S View File

@@ -1,11 +1,8 @@
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global WinogradTransLeftFp16
#ifndef __APPLE__
.type WinogradTransLeftFp16, %function
#endif
.text
.align 5

asm_function WinogradTransLeftFp16

@@ -135,3 +132,5 @@ LoopH:
sub sp, sp, #16
ldp x19, x20, [sp], #16
ret

#endif

+ 6
- 7
mindspore/lite/nnacl/assembly/fp16/WinogradTransRightFp16.S View File

@@ -1,11 +1,8 @@
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5
.global WinogradTransRightFp16
#ifndef __APPLE__
.type WinogradTransRightFp16, %function
#endif
.text
.align 5

asm_function WinogradTransRightFp16

@@ -138,4 +135,6 @@ LoopH:
sub sp, sp, #16
ldp x19, x20, [sp], #16

ret
ret

#endif

+ 3
- 7
mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S View File

@@ -1,11 +1,7 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatmulInt8DpNeon64
#ifndef __APPLE__
.type MatmulInt8DpNeon64, %function
#endif
.text
.align 5

//void MatmulInt8DpNeon64(const int8_t *a, const int8_t *b, int8_t *dst, int row8, int col8, int deep4,
// const int *a_sums, const int *bias, int act_min, int act_max, int out_zp,


+ 3
- 7
mindspore/lite/nnacl/assembly/opt/MatmulDpInt8Opt.S View File

@@ -1,11 +1,7 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatmulInt8DpOpt
#ifndef __APPLE__
.type MatmulInt8DpOpt, %function
#endif
.text
.align 5

//void MatmulInt8DpOpt(const int8_t *a, const int8_t *b, int8_t *dst, int row, int col, int deep4, const int *a_sums,
// const int *bias, int act_min, int act_max, int out_zp, int32_t *multiplier, int32_t *left_shift,


+ 3
- 7
mindspore/lite/nnacl/assembly/opt/MatmulOptR4Int8.S View File

@@ -1,11 +1,7 @@
#ifdef __aarch64__
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"
.text
.align 5
.global MatMulOptR4Int8Neon64
#ifndef __APPLE__
.type MatMulOptR4Int8Neon64, %function
#endif
.text
.align 5

//void MatMulOptR4Int8Neon64(const int8_t *a, const int8_t *b, int *dst, int row4, int col4, int deep16,
// const int *input_sum, const int *bias)


+ 11
- 8
mindspore/lite/nnacl/assembly_global.h View File

@@ -16,18 +16,21 @@
#ifndef MINDSPORE_LITE_NNACL_ASSEMBLY_GLOBAL_H
#define MINDSPORE_LITE_NNACL_ASSEMBLY_GLOBAL_H

// clang-format off
.macro asm_function fname
#ifdef __APPLE__
.globl _\fname;
_\fname :
.globl _\fname
_\fname:
#else
.global \fname;
#ifdef __ELE__
.hidden \fname;
.type \fname, % function;
.global \fname
#ifdef __ELF__
.hidden \fname
.type \fname, %function
#endif
\fname :
\fname:
#endif
.endm
.endm

// clang-format on

#endif // MINDSPORE_LITE_NNACL_ASSEMBLY_GLOBAL_H

Loading…
Cancel
Save