From 0cb2b89a4c7c27f544777f2e48fe091f5dc516df Mon Sep 17 00:00:00 2001
From: lixian <179220644@qq.com>
Date: Sun, 16 Aug 2020 23:52:42 +0800
Subject: [PATCH] add fast cast algorithm

---
 .../arm/nnacl/assembly/opt/Float16ToFloat32.S | 63 +++++++++++++++++++
 .../arm/nnacl/assembly/opt/Float32ToFloat16.S | 63 +++++++++++++++++++
 .../runtime/kernel/arm/nnacl/fp16/cast_fp16.c |  3 +-
 3 files changed, 128 insertions(+), 1 deletion(-)
 create mode 100644 mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/Float16ToFloat32.S
 create mode 100644 mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/Float32ToFloat16.S

diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/Float16ToFloat32.S b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/Float16ToFloat32.S
new file mode 100644
index 0000000000..6bbc60a82c
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/Float16ToFloat32.S
@@ -0,0 +1,63 @@
+#ifdef __aarch64__
+
+.text
+.align 5
+.global Float16ToFloat32
+#ifndef __APPLE__
+.type Float16ToFloat32, %function
+#endif
+
+// void Float16ToFloat32(const float16_t *input, float *output, int number);
+// x0: input, x1: output, w2: number
+//
+// Converts `number` fp16 values at input to fp32 values at output.
+// Full groups of 64 elements use the NEON loop, the rest go one at a time.
+// Nothing is read or written when number <= 0, like the C fallback in nnacl/fp16/cast_fp16.c.
+Float16ToFloat32:
+    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
+    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
+    // x19 ~ x29 must also be preserved
+    // only x0 ~ x2, v0 ~ v7 and v16 ~ v31 are touched here, so nothing needs to be saved
+
+    // number is a 32-bit int, so the upper half of x2 is unspecified on entry:
+    // the counter is only ever accessed as w2
+    cmp w2, #0
+    ble LoopEnd                 // number <= 0: no element may be touched
+    cmp w2, #64
+    blt Loop                    // fewer than 64 elements: scalar loop only
+    Loop64:
+        ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64    // 64 fp16 values = 128 bytes
+        ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        fcvtl v16.4s, v0.4h     // fcvtl widens the low 4 lanes, fcvtl2 the high 4 lanes
+        fcvtl2 v17.4s, v0.8h
+        fcvtl v18.4s, v1.4h
+        fcvtl2 v19.4s, v1.8h
+        fcvtl v20.4s, v2.4h
+        fcvtl2 v21.4s, v2.8h
+        fcvtl v22.4s, v3.4h
+        fcvtl2 v23.4s, v3.8h
+        fcvtl v24.4s, v4.4h
+        fcvtl2 v25.4s, v4.8h
+        fcvtl v26.4s, v5.4h
+        fcvtl2 v27.4s, v5.8h
+        fcvtl v28.4s, v6.4h
+        fcvtl2 v29.4s, v6.8h
+        fcvtl v30.4s, v7.4h
+        fcvtl2 v31.4s, v7.8h
+        st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x1], #64    // 64 fp32 values = 256 bytes
+        st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x1], #64
+        st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x1], #64
+        st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x1], #64
+        subs w2, w2, #64
+        ble LoopEnd             // nothing left
+        cmp w2, #64
+        bge Loop64              // at least one more full group
+    Loop:
+        ldr h0, [x0], #2        // here w2 >= 1, so one element is always valid
+        fcvt s0, h0
+        str s0, [x1], #4
+        subs w2, w2, #1
+        bgt Loop
+    LoopEnd:
+        ret
+#endif
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/Float32ToFloat16.S b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/Float32ToFloat16.S
new file mode 100644
index 0000000000..10e029d246
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/Float32ToFloat16.S
@@ -0,0 +1,63 @@
+#ifdef __aarch64__
+
+.text
+.align 5
+.global Float32ToFloat16
+#ifndef __APPLE__
+.type Float32ToFloat16, %function
+#endif
+
+// void Float32ToFloat16(const float *input, float16_t *output, int number);
+// x0: input, x1: output, w2: number
+//
+// Converts `number` fp32 values at input to fp16 values at output.
+// Full groups of 64 elements use the NEON loop, the rest go one at a time.
+// Nothing is read or written when number <= 0, like the C fallback in nnacl/fp16/cast_fp16.c.
+Float32ToFloat16:
+    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
+    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
+    // x19 ~ x29 must also be preserved
+    // only x0 ~ x2, v0 ~ v7 and v16 ~ v31 are touched here, so nothing needs to be saved
+
+    // number is a 32-bit int, so the upper half of x2 is unspecified on entry:
+    // the counter is only ever accessed as w2
+    cmp w2, #0
+    ble LoopEnd                 // number <= 0: no element may be touched
+    cmp w2, #64
+    blt Loop                    // fewer than 64 elements: scalar loop only
+    Loop64:
+        ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64    // 64 fp32 values = 256 bytes
+        ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64
+        ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], #64
+        ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x0], #64
+        fcvtn v0.4h, v16.4s     // fcvtn fills the low 4 lanes, fcvtn2 the high 4 lanes
+        fcvtn2 v0.8h, v17.4s
+        fcvtn v1.4h, v18.4s
+        fcvtn2 v1.8h, v19.4s
+        fcvtn v2.4h, v20.4s
+        fcvtn2 v2.8h, v21.4s
+        fcvtn v3.4h, v22.4s
+        fcvtn2 v3.8h, v23.4s
+        fcvtn v4.4h, v24.4s
+        fcvtn2 v4.8h, v25.4s
+        fcvtn v5.4h, v26.4s
+        fcvtn2 v5.8h, v27.4s
+        fcvtn v6.4h, v28.4s
+        fcvtn2 v6.8h, v29.4s
+        fcvtn v7.4h, v30.4s
+        fcvtn2 v7.8h, v31.4s
+        st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64    // 64 fp16 values = 128 bytes
+        st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x1], #64
+        subs w2, w2, #64
+        ble LoopEnd             // nothing left
+        cmp w2, #64
+        bge Loop64              // at least one more full group
+    Loop:
+        ldr s0, [x0], #4        // here w2 >= 1, so one element is always valid
+        fcvt h0, s0
+        str h0, [x1], #2
+        subs w2, w2, #1
+        bgt Loop
+    LoopEnd:
+        ret
+#endif
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/cast_fp16.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/cast_fp16.c
index 75770dbd9e..ee870324f8 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/cast_fp16.c
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/cast_fp16.c
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 #include "nnacl/fp16/cast_fp16.h"
-
+#ifndef ENABLE_ARM64
 void Float32ToFloat16(const float *input, float16_t *output, int number) {
   for (int i = 0; i < number; ++i) {
     output[i] = (float16_t)input[i];
@@ -26,3 +26,4 @@ void Float16ToFloat32(const float16_t *input, float *output, int number) {
     output[i] = (float)input[i];
   }
 }
+#endif