add fast cast algorithm

5 years ago · 0cb2b89a4c
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/Float16ToFloat32.S
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/Float16ToFloat32.S
@@ -0,0 +1,54 @@
 #ifdef __aarch64__

 .text
 .align 5
 .global Float16ToFloat32
 #ifndef __APPLE__
 .type Float16ToFloat32, %function
 #endif

 // void Float16ToFloat32(const float16_t *input, float *output, int number);
 // x0: input, x1: output, x2: number
 Float16ToFloat32:
    // Widens `number` half-precision values read from `input` into single-precision
    // values written to `output`.
    //   x0: input  (const float16_t *), post-incremented as elements are consumed
    //   x1: output (float *), post-incremented as elements are produced
    //   w2: number (int), element count; nothing is read or written when number <= 0
    //
    // Only v0 ~ v7 and v16 ~ v31 are used. They are caller-saved under AAPCS64, so the
    // callee-saved v8 ~ v15 (and x19 ~ x29) are never touched and nothing is spilled:
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    //
    // `number` is a 32-bit int, and AAPCS64 leaves bits [63:32] of x2 unspecified.
    // Sign-extend it before any 64-bit compare or subtract uses the counter.
    sxtw x2, w2
    // Empty or negative count: return without touching memory, matching the
    // portable C loop (`for (i = 0; i < number; ++i)`).
    cmp x2, #0
    ble LoopEnd
    cmp x2, #64
    blt Loop
    // Bulk path: 64 elements per iteration (128 bytes in, 256 bytes out).
    Loop64:
        ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        // fcvtl widens the low four halves of a source register, fcvtl2 the high four.
        fcvtl v16.4s, v0.4h
        fcvtl2 v17.4s, v0.8h
        fcvtl v18.4s, v1.4h
        fcvtl2 v19.4s, v1.8h
        fcvtl v20.4s, v2.4h
        fcvtl2 v21.4s, v2.8h
        fcvtl v22.4s, v3.4h
        fcvtl2 v23.4s, v3.8h
        fcvtl v24.4s, v4.4h
        fcvtl2 v25.4s, v4.8h
        fcvtl v26.4s, v5.4h
        fcvtl2 v27.4s, v5.8h
        fcvtl v28.4s, v6.4h
        fcvtl2 v29.4s, v6.8h
        fcvtl v30.4s, v7.4h
        fcvtl2 v31.4s, v7.8h
        st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x1], #64
        st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x1], #64
        st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x1], #64
        st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x1], #64
        // x2 = elements still to convert; finished when it reaches zero.
        subs x2, x2, #64
        ble LoopEnd
        // Stay on the bulk path while a full group of 64 remains.
        cmp x2, #64
        bge Loop64
    // Scalar tail: x2 is in [1, 63] on entry, one element per iteration.
    Loop:
        ldr h0, [x0], #2
        fcvt s0, h0
        str s0, [x1], #4
        subs x2, x2, #1
        bgt Loop
    LoopEnd:
        ret
 #endif
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/Float32ToFloat16.S
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/Float32ToFloat16.S
@@ -0,0 +1,54 @@
 #ifdef __aarch64__

 .text
 .align 5
 .global Float32ToFloat16
 #ifndef __APPLE__
 .type Float32ToFloat16, %function
 #endif

 // void Float32ToFloat16(const float *input, float16_t *output, int number);
 // x0: input, x1: output, x2: number
 Float32ToFloat16:
    // Narrows `number` single-precision values read from `input` into half-precision
    // values written to `output`.
    //   x0: input  (const float *), post-incremented as elements are consumed
    //   x1: output (float16_t *), post-incremented as elements are produced
    //   w2: number (int), element count; nothing is read or written when number <= 0
    //
    // Only v0 ~ v7 and v16 ~ v31 are used. They are caller-saved under AAPCS64, so the
    // callee-saved v8 ~ v15 (and x19 ~ x29) are never touched and nothing is spilled:
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    //
    // `number` is a 32-bit int, and AAPCS64 leaves bits [63:32] of x2 unspecified.
    // Sign-extend it before any 64-bit compare or subtract uses the counter.
    sxtw x2, w2
    // Empty or negative count: return without touching memory, matching the
    // portable C loop (`for (i = 0; i < number; ++i)`).
    cmp x2, #0
    ble LoopEnd
    cmp x2, #64
    blt Loop
    // Bulk path: 64 elements per iteration (256 bytes in, 128 bytes out).
    Loop64:
        ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
        ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64
        ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], #64
        ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x0], #64
        // fcvtn fills the low four halves of the destination, fcvtn2 the high four,
        // so each pair packs two source registers into one.
        fcvtn v0.4h, v16.4s
        fcvtn2 v0.8h, v17.4s
        fcvtn v1.4h, v18.4s
        fcvtn2 v1.8h, v19.4s
        fcvtn v2.4h, v20.4s
        fcvtn2 v2.8h, v21.4s
        fcvtn v3.4h, v22.4s
        fcvtn2 v3.8h, v23.4s
        fcvtn v4.4h, v24.4s
        fcvtn2 v4.8h, v25.4s
        fcvtn v5.4h, v26.4s
        fcvtn2 v5.8h, v27.4s
        fcvtn v6.4h, v28.4s
        fcvtn2 v6.8h, v29.4s
        fcvtn v7.4h, v30.4s
        fcvtn2 v7.8h, v31.4s
        st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64
        st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x1], #64
        // x2 = elements still to convert; finished when it reaches zero.
        subs x2, x2, #64
        ble LoopEnd
        // Stay on the bulk path while a full group of 64 remains.
        cmp x2, #64
        bge Loop64
    // Scalar tail: x2 is in [1, 63] on entry, one element per iteration.
    Loop:
        ldr s0, [x0], #4
        fcvt h0, s0
        str h0, [x1], #2
        subs x2, x2, #1
        bgt Loop
    LoopEnd:
        ret
 #endif
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/cast_fp16.c
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/cast_fp16.c
@@ -14,7 +14,7 @@
 * limitations under the License.
 */
 #include "nnacl/fp16/cast_fp16.h"

 #ifndef ENABLE_ARM64
 void Float32ToFloat16(const float *input, float16_t *output, int number) {
  for (int i = 0; i < number; ++i) {
    output[i] = (float16_t)input[i];
@@ -26,3 +26,4 @@ void Float16ToFloat32(const float16_t *input, float *output, int number) {
    output[i] = (float)input[i];
  }
 }
 #endif