From 0cb2b89a4c7c27f544777f2e48fe091f5dc516df Mon Sep 17 00:00:00 2001
From: lixian <179220644@qq.com>
Date: Sun, 16 Aug 2020 23:52:42 +0800
Subject: [PATCH] add fast cast algorithm

---
 .../arm/nnacl/assembly/opt/Float16ToFloat32.S | 54 +++++++++++++++++++
 .../arm/nnacl/assembly/opt/Float32ToFloat16.S | 54 +++++++++++++++++++
 .../runtime/kernel/arm/nnacl/fp16/cast_fp16.c |  3 +-
 3 files changed, 110 insertions(+), 1 deletion(-)
 create mode 100644 mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/Float16ToFloat32.S
 create mode 100644 mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/Float32ToFloat16.S

diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/Float16ToFloat32.S b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/Float16ToFloat32.S
new file mode 100644
index 0000000000..6bbc60a82c
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/Float16ToFloat32.S
@@ -0,0 +1,62 @@
+#ifdef __aarch64__
+
+.text
+.align 5
+.global Float16ToFloat32
+#ifndef __APPLE__
+.type Float16ToFloat32, %function
+#endif
+// NOTE(review): Mach-O C symbols usually carry a leading underscore; confirm the Apple build links to this label.
+
+// void Float16ToFloat32(const float16_t *input, float *output, int number);
+// Converts number half-precision values read from input into single-precision values written to output.
+// A number of zero or less converts nothing.
+// x0: input, x1: output, w2: number
+Float16ToFloat32:
+    // Only v0 ~ v7 and v16 ~ v31 are written here; none of them are callee-saved according to
+    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
+    // so nothing has to be saved or restored.
+    // number is a 32-bit int, so it is read through w2: the caller is not required to
+    // extend it into the upper half of x2.
+    cmp w2, #0
+    ble LoopEnd
+    cmp w2, #64
+    blt Loop
+    // Bulk path: 64 elements per iteration (128 bytes loaded, 256 bytes stored).
+    Loop64:
+        ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        fcvtl v16.4s, v0.4h
+        fcvtl2 v17.4s, v0.8h
+        fcvtl v18.4s, v1.4h
+        fcvtl2 v19.4s, v1.8h
+        fcvtl v20.4s, v2.4h
+        fcvtl2 v21.4s, v2.8h
+        fcvtl v22.4s, v3.4h
+        fcvtl2 v23.4s, v3.8h
+        fcvtl v24.4s, v4.4h
+        fcvtl2 v25.4s, v4.8h
+        fcvtl v26.4s, v5.4h
+        fcvtl2 v27.4s, v5.8h
+        fcvtl v28.4s, v6.4h
+        fcvtl2 v29.4s, v6.8h
+        fcvtl v30.4s, v7.4h
+        fcvtl2 v31.4s, v7.8h
+        st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x1], #64
+        st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x1], #64
+        st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x1], #64
+        st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x1], #64
+        subs w2, w2, #64
+        ble LoopEnd
+        cmp w2, #64
+        bge Loop64
+    // Tail path: one element per iteration; w2 is between 1 and 63 when this is entered.
+    Loop:
+        ldr h0, [x0], #2
+        fcvt s0, h0
+        str s0, [x1], #4
+        subs w2, w2, #1
+        bgt Loop
+    LoopEnd:
+        ret
+#endif
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/Float32ToFloat16.S b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/Float32ToFloat16.S
new file mode 100644
index 0000000000..10e029d246
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/Float32ToFloat16.S
@@ -0,0 +1,62 @@
+#ifdef __aarch64__
+
+.text
+.align 5
+.global Float32ToFloat16
+#ifndef __APPLE__
+.type Float32ToFloat16, %function
+#endif
+// NOTE(review): Mach-O C symbols usually carry a leading underscore; confirm the Apple build links to this label.
+
+// void Float32ToFloat16(const float *input, float16_t *output, int number);
+// Converts number single-precision values read from input into half-precision values written to output.
+// A number of zero or less converts nothing.
+// x0: input, x1: output, w2: number
+Float32ToFloat16:
+    // Only v0 ~ v7 and v16 ~ v31 are written here; none of them are callee-saved according to
+    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
+    // so nothing has to be saved or restored.
+    // number is a 32-bit int, so it is read through w2: the caller is not required to
+    // extend it into the upper half of x2.
+    cmp w2, #0
+    ble LoopEnd
+    cmp w2, #64
+    blt Loop
+    // Bulk path: 64 elements per iteration (256 bytes loaded, 128 bytes stored).
+    Loop64:
+        ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
+        ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64
+        ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], #64
+        ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x0], #64
+        fcvtn v0.4h, v16.4s
+        fcvtn2 v0.8h, v17.4s
+        fcvtn v1.4h, v18.4s
+        fcvtn2 v1.8h, v19.4s
+        fcvtn v2.4h, v20.4s
+        fcvtn2 v2.8h, v21.4s
+        fcvtn v3.4h, v22.4s
+        fcvtn2 v3.8h, v23.4s
+        fcvtn v4.4h, v24.4s
+        fcvtn2 v4.8h, v25.4s
+        fcvtn v5.4h, v26.4s
+        fcvtn2 v5.8h, v27.4s
+        fcvtn v6.4h, v28.4s
+        fcvtn2 v6.8h, v29.4s
+        fcvtn v7.4h, v30.4s
+        fcvtn2 v7.8h, v31.4s
+        st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64
+        st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x1], #64
+        subs w2, w2, #64
+        ble LoopEnd
+        cmp w2, #64
+        bge Loop64
+    // Tail path: one element per iteration; w2 is between 1 and 63 when this is entered.
+    Loop:
+        ldr s0, [x0], #4
+        fcvt h0, s0
+        str h0, [x1], #2
+        subs w2, w2, #1
+        bgt Loop
+    LoopEnd:
+        ret
+#endif
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/cast_fp16.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/cast_fp16.c
index 75770dbd9e..ee870324f8 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/cast_fp16.c
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/cast_fp16.c
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 #include "nnacl/fp16/cast_fp16.h"
-
+#ifndef ENABLE_ARM64
 void Float32ToFloat16(const float *input, float16_t *output, int number) {
   for (int i = 0; i < number; ++i) {
     output[i] = (float16_t)input[i];
@@ -26,3 +26,4 @@ void Float16ToFloat32(const float16_t *input, float *output, int number) {
     output[i] = (float)input[i];
   }
 }
+#endif