| @@ -0,0 +1,54 @@ | |||
| #ifdef __aarch64__ | |||
| .text | |||
| .align 5 | |||
| .global Float16ToFloat32 | |||
| #ifndef __APPLE__ | |||
| .type Float16ToFloat32, %function | |||
| #endif | |||
| // void Float16ToFloat32(const float16_t *input, float *output, int number); | |||
| // Widens `number` half-precision values read from `input` to single precision in `output`. | |||
| // x0: input, x1: output, w2: number | |||
| // `number` is a 32-bit int, so only w2 is meaningful: AAPCS64 leaves the upper half of x2 | |||
| // unspecified on entry. All count arithmetic below therefore uses w2. | |||
| Float16ToFloat32: | |||
| // Only v0 ~ v7 and v16 ~ v31 are touched. Under AAPCS64 just v8 ~ v15 are callee-saved, see | |||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | |||
| // so nothing has to be saved or restored here. | |||
| // Nothing to do for number <= 0; without this guard the scalar loop would still | |||
| // convert one element because it tests its counter only after the store. | |||
| cmp w2, #0 | |||
| ble LoopEnd | |||
| cmp w2, #64 | |||
| blt Loop | |||
| // Main loop: 64 elements per iteration (128 bytes in, 256 bytes out). | |||
| Loop64: | |||
| ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 | |||
| ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 | |||
| // fcvtl widens the low four halves of a source register, fcvtl2 the high four. | |||
| fcvtl v16.4s, v0.4h | |||
| fcvtl2 v17.4s, v0.8h | |||
| fcvtl v18.4s, v1.4h | |||
| fcvtl2 v19.4s, v1.8h | |||
| fcvtl v20.4s, v2.4h | |||
| fcvtl2 v21.4s, v2.8h | |||
| fcvtl v22.4s, v3.4h | |||
| fcvtl2 v23.4s, v3.8h | |||
| fcvtl v24.4s, v4.4h | |||
| fcvtl2 v25.4s, v4.8h | |||
| fcvtl v26.4s, v5.4h | |||
| fcvtl2 v27.4s, v5.8h | |||
| fcvtl v28.4s, v6.4h | |||
| fcvtl2 v29.4s, v6.8h | |||
| fcvtl v30.4s, v7.4h | |||
| fcvtl2 v31.4s, v7.8h | |||
| st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x1], #64 | |||
| st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x1], #64 | |||
| st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x1], #64 | |||
| st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x1], #64 | |||
| subs w2, w2, #64 | |||
| ble LoopEnd | |||
| cmp w2, #64 | |||
| bge Loop64 | |||
| // Scalar tail: one element per iteration; entered only with w2 >= 1. | |||
| Loop: | |||
| ldr h0, [x0], #2 | |||
| fcvt s0, h0 | |||
| str s0, [x1], #4 | |||
| subs w2, w2, #1 | |||
| bgt Loop | |||
| LoopEnd: | |||
| ret | |||
| #endif | |||
| @@ -0,0 +1,54 @@ | |||
| #ifdef __aarch64__ | |||
| .text | |||
| .align 5 | |||
| .global Float32ToFloat16 | |||
| #ifndef __APPLE__ | |||
| .type Float32ToFloat16, %function | |||
| #endif | |||
| // void Float32ToFloat16(const float *input, float16_t *output, int number); | |||
| // Narrows `number` single-precision values read from `input` to half precision in `output`. | |||
| // x0: input, x1: output, w2: number | |||
| // `number` is a 32-bit int, so only w2 is meaningful: AAPCS64 leaves the upper half of x2 | |||
| // unspecified on entry. All count arithmetic below therefore uses w2. | |||
| Float32ToFloat16: | |||
| // Only v0 ~ v7 and v16 ~ v31 are touched. Under AAPCS64 just v8 ~ v15 are callee-saved, see | |||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | |||
| // so nothing has to be saved or restored here. | |||
| // Nothing to do for number <= 0; without this guard the scalar loop would still | |||
| // convert one element because it tests its counter only after the store. | |||
| cmp w2, #0 | |||
| ble LoopEnd | |||
| cmp w2, #64 | |||
| blt Loop | |||
| // Main loop: 64 elements per iteration (256 bytes in, 128 bytes out). | |||
| Loop64: | |||
| ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64 | |||
| ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 | |||
| ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], #64 | |||
| ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x0], #64 | |||
| // fcvtn fills the low four halves of a destination register, fcvtn2 the high four. | |||
| fcvtn v0.4h, v16.4s | |||
| fcvtn2 v0.8h, v17.4s | |||
| fcvtn v1.4h, v18.4s | |||
| fcvtn2 v1.8h, v19.4s | |||
| fcvtn v2.4h, v20.4s | |||
| fcvtn2 v2.8h, v21.4s | |||
| fcvtn v3.4h, v22.4s | |||
| fcvtn2 v3.8h, v23.4s | |||
| fcvtn v4.4h, v24.4s | |||
| fcvtn2 v4.8h, v25.4s | |||
| fcvtn v5.4h, v26.4s | |||
| fcvtn2 v5.8h, v27.4s | |||
| fcvtn v6.4h, v28.4s | |||
| fcvtn2 v6.8h, v29.4s | |||
| fcvtn v7.4h, v30.4s | |||
| fcvtn2 v7.8h, v31.4s | |||
| st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64 | |||
| st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x1], #64 | |||
| subs w2, w2, #64 | |||
| ble LoopEnd | |||
| cmp w2, #64 | |||
| bge Loop64 | |||
| // Scalar tail: one element per iteration; entered only with w2 >= 1. | |||
| Loop: | |||
| ldr s0, [x0], #4 | |||
| fcvt h0, s0 | |||
| str h0, [x1], #2 | |||
| subs w2, w2, #1 | |||
| bgt Loop | |||
| LoopEnd: | |||
| ret | |||
| #endif | |||
| @@ -14,7 +14,7 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #include "nnacl/fp16/cast_fp16.h" | |||
| #ifndef ENABLE_ARM64 | |||
| void Float32ToFloat16(const float *input, float16_t *output, int number) { | |||
| for (int i = 0; i < number; ++i) { | |||
| output[i] = (float16_t)input[i]; | |||
| @@ -26,3 +26,4 @@ void Float16ToFloat32(const float16_t *input, float *output, int number) { | |||
| output[i] = (float)input[i]; | |||
| } | |||
| } | |||
| #endif | |||