From 3e2b3fa04d13a940add6bf1aa1e758c8c2e0569a Mon Sep 17 00:00:00 2001
From: nihui <shuizhuyuanluo@126.com>
Date: Mon, 10 Oct 2022 14:44:27 +0800
Subject: [PATCH] more stricter armv7 fp16 and armv84 bf16 compiler check, fix
 #4147 fix #4222 (#4247)

---
 CMakeLists.txt                          |   2 +-
 src/layer/arm/cast_bf16.h               |   2 +-
 src/layer/arm/cast_fp16.h               | 171 +++++++++++++++++-------
 src/layer/arm/innerproduct_fp16s.h      |  28 ++--
 src/layer/arm/innerproduct_gemm_fp16s.h |  20 +--
 src/layer/arm/neon_mathfun_fp16s.h      |  16 +--
 6 files changed, 153 insertions(+), 86 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4ece7ada7..291c5a6d1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -171,7 +171,7 @@ if((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm")
         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; float16x8_t _a, _b; _s = vfmlalq_low_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML)
 
         set(CMAKE_REQUIRED_FLAGS "-march=armv8.4-a+bf16")
-        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; bfloat16x8_t _a, _b; _s = vbfmmlaq_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_BF16)
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; bfloat16x8_t _a, _b; _s = vcvt_f32_bf16(vcvt_bf16_f32(vbfmmlaq_f32(_s, _a, _b))); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_BF16)
 
         set(CMAKE_REQUIRED_FLAGS "-march=armv8.4-a+i8mm")
         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vmmlaq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_I8MM)
diff --git a/src/layer/arm/cast_bf16.h b/src/layer/arm/cast_bf16.h
index 468e5eca3..aa8223d73 100644
--- a/src/layer/arm/cast_bf16.h
+++ b/src/layer/arm/cast_bf16.h
@@ -150,7 +150,7 @@ static void cast_fp32_to_bf16_neon(const Mat& bottom_blob, Mat& top_blob, const
 
 static void cast_bf16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
 {
-#if NCNN_ARM84BF16 && __aarch64__ && !__ARM_FEATURE_BF16_VECTOR_ARITHMETIC
+#if NCNN_RUNTIME_CPU && NCNN_ARM84BF16 && __aarch64__ && !__ARM_FEATURE_BF16_VECTOR_ARITHMETIC
     if (ncnn::cpu_support_arm_bf16())
     {
         cast_bf16_to_fp32_neon_bf16(bottom_blob, top_blob, opt);
diff --git a/src/layer/arm/cast_fp16.h b/src/layer/arm/cast_fp16.h
index bb326a970..7e6db748a 100644
--- a/src/layer/arm/cast_fp16.h
+++ b/src/layer/arm/cast_fp16.h
@@ -47,12 +47,12 @@ static void cast_fp32_to_fp16_neon(const Mat& bottom_blob, Mat& top_blob, const
         {
 #if __aarch64__
             asm volatile(
-                "prfm   pldl1keep, [%0, #512]   \n"
+                "prfm   pldl1keep, [%0, #512]       \n"
                 "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
-                "fcvtn  v0.4h, v0.4s            \n"
-                "fcvtn  v1.4h, v1.4s            \n"
-                "fcvtn  v2.4h, v2.4s            \n"
-                "fcvtn  v3.4h, v3.4s            \n"
+                "fcvtn  v0.4h, v0.4s                \n"
+                "fcvtn  v1.4h, v1.4s                \n"
+                "fcvtn  v2.4h, v2.4s                \n"
+                "fcvtn  v3.4h, v3.4s                \n"
                 "st1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%1], #32 \n"
                 : "=r"(ptr),   // %0
                 "=r"(outptr) // %1
@@ -61,12 +61,12 @@ static void cast_fp32_to_fp16_neon(const Mat& bottom_blob, Mat& top_blob, const
                 : "memory", "v0", "v1", "v2", "v3");
 #else  // __aarch64__
             asm volatile(
-                "pld        [%0, #512]      \n"
-                "vldm       %0!, {d0-d7}    \n"
-                "vcvt.f16.f32 d0, q0        \n"
-                "vcvt.f16.f32 d1, q1        \n"
-                "vcvt.f16.f32 d2, q2        \n"
-                "vcvt.f16.f32 d3, q3        \n"
+                "pld        [%0, #512]          \n"
+                "vldm       %0!, {d0-d7}        \n"
+                "vcvt.f16.f32 d0, q0            \n"
+                "vcvt.f16.f32 d1, q1            \n"
+                "vcvt.f16.f32 d2, q2            \n"
+                "vcvt.f16.f32 d3, q3            \n"
                 "vst1.u16   {d0-d3}, [%1 :128]! \n"
                 : "=r"(ptr),   // %0
                 "=r"(outptr) // %1
@@ -77,24 +77,61 @@ static void cast_fp32_to_fp16_neon(const Mat& bottom_blob, Mat& top_blob, const
         }
         for (; i + 7 < size; i += 8)
         {
-            float32x4_t _p0_fp32 = vld1q_f32(ptr);
-            float32x4_t _p1_fp32 = vld1q_f32(ptr + 4);
-            float16x4_t _p0_fp16 = vcvt_f16_f32(_p0_fp32);
-            float16x4_t _p1_fp16 = vcvt_f16_f32(_p1_fp32);
-            uint16x8_t _p_fp16 = vcombine_u16(vreinterpret_u16_f16(_p0_fp16), vreinterpret_u16_f16(_p1_fp16));
-            vst1q_u16(outptr, _p_fp16);
-            ptr += 8;
-            outptr += 8;
+            // This is originally implemented with neon fp16 intrinsics.
+            // In the new version of gcc, __ARM_FP16_FORMAT_IEEE or __ARM_FP16_FORMAT_ALTERNATIVE needs to be defined to use the float16x4_t type.
+            // That leads to compiler error when compiled with -mfpu=neon-vfpv4 but without -mfp16-format=ieee flag.
+            // We could add more macro conditions to differentiate between old and new versions, but that's pretty ugly!
+            // Just use all inline assembly here ~
+            //          --- nihui
+#if __aarch64__
+            asm volatile(
+                "ld1    {v0.4s, v1.4s}, [%0], #32   \n"
+                "fcvtn  v0.4h, v0.4s                \n"
+                "fcvtn  v1.4h, v1.4s                \n"
+                "st1    {v0.4h, v1.4h}, [%1], #16   \n"
+                : "=r"(ptr),   // %0
+                "=r"(outptr) // %1
+                : "0"(ptr),
+                "1"(outptr)
+                : "memory", "v0", "v1");
+#else  // __aarch64__
+            asm volatile(
+                "vld1.f32   {d0-d3}, [%0]!  \n"
+                "vcvt.f16.f32 d0, q0        \n"
+                "vcvt.f16.f32 d1, q1        \n"
+                "vst1.u16   {d0-d1}, [%1]!  \n"
+                : "=r"(ptr),   // %0
+                "=r"(outptr) // %1
+                : "0"(ptr),
+                "1"(outptr)
+                : "memory", "q0", "q1");
+#endif // __aarch64__
         }
         for (; i + 3 < size; i += 4)
         {
-            float32x4_t _p_fp32 = vld1q_f32(ptr);
-            float16x4_t _p_fp16 = vcvt_f16_f32(_p_fp32);
-            vst1_u16(outptr, vreinterpret_u16_f16(_p_fp16));
-            ptr += 4;
-            outptr += 4;
+#if __aarch64__
+            asm volatile(
+                "ld1    {v0.4s}, [%0], #16  \n"
+                "fcvtn  v0.4h, v0.4s        \n"
+                "st1    {v0.4h}, [%1], #8   \n"
+                : "=r"(ptr),   // %0
+                "=r"(outptr) // %1
+                : "0"(ptr),
+                "1"(outptr)
+                : "memory", "v0");
+#else  // __aarch64__
+            asm volatile(
+                "vld1.f32   {d0-d1}, [%0]!  \n"
+                "vcvt.f16.f32 d0, q0        \n"
+                "vst1.u16   {d0}, [%1]!     \n"
+                : "=r"(ptr),   // %0
+                "=r"(outptr) // %1
+                : "0"(ptr),
+                "1"(outptr)
+                : "memory", "q0");
+#endif // __aarch64__
         }
-#endif
+#endif // (__ARM_FP & 2)
         for (; i < size; i++)
         {
             *outptr++ = float32_to_float16(*ptr++);
@@ -104,7 +141,7 @@ static void cast_fp32_to_fp16_neon(const Mat& bottom_blob, Mat& top_blob, const
 
 static void cast_fp16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
 {
-#if NCNN_VFPV4 && __ARM_NEON && !(__ARM_FP & 2)
+#if NCNN_RUNTIME_CPU && NCNN_VFPV4 && __ARM_NEON && !(__ARM_FP & 2)
     if (ncnn::cpu_support_arm_vfpv4())
     {
         cast_fp16_to_fp32_neon_vfpv4(bottom_blob, top_blob, opt);
@@ -132,12 +169,12 @@ static void cast_fp16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const
         {
 #if __aarch64__
             asm volatile(
-                "prfm   pldl1keep, [%0, #256]   \n"
+                "prfm   pldl1keep, [%0, #256]       \n"
                 "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%0], #32 \n"
-                "fcvtl  v0.4s, v0.4h            \n"
-                "fcvtl  v1.4s, v1.4h            \n"
-                "fcvtl  v2.4s, v2.4h            \n"
-                "fcvtl  v3.4s, v3.4h            \n"
+                "fcvtl  v0.4s, v0.4h                \n"
+                "fcvtl  v1.4s, v1.4h                \n"
+                "fcvtl  v2.4s, v2.4h                \n"
+                "fcvtl  v3.4s, v3.4h                \n"
                 "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n"
                 : "=r"(ptr),   // %0
                 "=r"(outptr) // %1
@@ -146,13 +183,13 @@ static void cast_fp16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const
                 : "memory", "v0", "v1", "v2", "v3");
 #else  // __aarch64__
             asm volatile(
-                "pld        [%0, #256]      \n"
+                "pld        [%0, #256]          \n"
                 "vld1.u16   {d4-d7}, [%0 :128]! \n"
-                "vcvt.f32.f16 q0, d4        \n"
-                "vcvt.f32.f16 q1, d5        \n"
-                "vcvt.f32.f16 q2, d6        \n"
-                "vcvt.f32.f16 q3, d7        \n"
-                "vstm       %1!, {d0-d7}    \n"
+                "vcvt.f32.f16 q0, d4            \n"
+                "vcvt.f32.f16 q1, d5            \n"
+                "vcvt.f32.f16 q2, d6            \n"
+                "vcvt.f32.f16 q3, d7            \n"
+                "vstm       %1!, {d0-d7}        \n"
                 : "=r"(ptr),   // %0
                 "=r"(outptr) // %1
                 : "0"(ptr),
@@ -162,25 +199,55 @@ static void cast_fp16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const
         }
         for (; i + 7 < size; i += 8)
         {
-            uint16x8_t _p_fp16 = vld1q_u16(ptr);
-            float16x4_t _p0_fp16 = vreinterpret_f16_u16(vget_low_u16(_p_fp16));
-            float16x4_t _p1_fp16 = vreinterpret_f16_u16(vget_high_u16(_p_fp16));
-            float32x4_t _p0_fp32 = vcvt_f32_f16(_p0_fp16);
-            float32x4_t _p1_fp32 = vcvt_f32_f16(_p1_fp16);
-            vst1q_f32(outptr, _p0_fp32);
-            vst1q_f32(outptr + 4, _p1_fp32);
-            ptr += 8;
-            outptr += 8;
+#if __aarch64__
+            asm volatile(
+                "ld1    {v0.4h, v1.4h}, [%0], #16   \n"
+                "fcvtl  v0.4s, v0.4h                \n"
+                "fcvtl  v1.4s, v1.4h                \n"
+                "st1    {v0.4s, v1.4s}, [%1], #32   \n"
+                : "=r"(ptr),   // %0
+                "=r"(outptr) // %1
+                : "0"(ptr),
+                "1"(outptr)
+                : "memory", "v0", "v1");
+#else  // __aarch64__
+            asm volatile(
+                "vld1.u16   {d4-d5}, [%0]!  \n"
+                "vcvt.f32.f16 q0, d4        \n"
+                "vcvt.f32.f16 q1, d5        \n"
+                "vst1.f32   {d0-d3}, [%1]!  \n"
+                : "=r"(ptr),   // %0
+                "=r"(outptr) // %1
+                : "0"(ptr),
+                "1"(outptr)
+                : "memory", "q0", "q1", "q2");
+#endif // __aarch64__
         }
         for (; i + 3 < size; i += 4)
         {
-            float16x4_t _p_fp16 = vreinterpret_f16_u16(vld1_u16(ptr));
-            float32x4_t _p_fp32 = vcvt_f32_f16(_p_fp16);
-            vst1q_f32(outptr, _p_fp32);
-            ptr += 4;
-            outptr += 4;
+#if __aarch64__
+            asm volatile(
+                "ld1    {v0.4h}, [%0], #8   \n"
+                "fcvtl  v0.4s, v0.4h        \n"
+                "st1    {v0.4s}, [%1], #16  \n"
+                : "=r"(ptr),   // %0
+                "=r"(outptr) // %1
+                : "0"(ptr),
+                "1"(outptr)
+                : "memory", "v0");
+#else  // __aarch64__
+            asm volatile(
+                "vld1.u16   {d2}, [%0]!     \n"
+                "vcvt.f32.f16 q0, d2        \n"
+                "vst1.f32   {d0-d1}, [%1]!  \n"
+                : "=r"(ptr),   // %0
+                "=r"(outptr) // %1
+                : "0"(ptr),
+                "1"(outptr)
+                : "memory", "q0", "q1");
+#endif // __aarch64__
         }
-#endif
+#endif // (__ARM_FP & 2)
         for (; i < size; i++)
         {
             *outptr++ = float16_to_float32(*ptr++);
diff --git a/src/layer/arm/innerproduct_fp16s.h b/src/layer/arm/innerproduct_fp16s.h
index 18214bc91..31edd9ed6 100644
--- a/src/layer/arm/innerproduct_fp16s.h
+++ b/src/layer/arm/innerproduct_fp16s.h
@@ -253,10 +253,10 @@ static void innerproduct_pack4_fp16s_neon(const Mat& bottom_blob, Mat& top_blob,
             float32x4_t _val = vld1q_f32(sptr);
             uint16x8_t _w01 = vld1q_u16(kptr);
             uint16x8_t _w23 = vld1q_u16(kptr + 8);
-            float32x4_t _w0 = vcvt_f32_f16(vreinterpret_f16_u16(vget_low_u16(_w01)));
-            float32x4_t _w1 = vcvt_f32_f16(vreinterpret_f16_u16(vget_high_u16(_w01)));
-            float32x4_t _w2 = vcvt_f32_f16(vreinterpret_f16_u16(vget_low_u16(_w23)));
-            float32x4_t _w3 = vcvt_f32_f16(vreinterpret_f16_u16(vget_high_u16(_w23)));
+            float32x4_t _w0 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w01)));
+            float32x4_t _w1 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w01)));
+            float32x4_t _w2 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w23)));
+            float32x4_t _w3 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w23)));
 #endif
 
 #if __aarch64__
@@ -281,7 +281,7 @@ static void innerproduct_pack4_fp16s_neon(const Mat& bottom_blob, Mat& top_blob,
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr));
 #else
-            float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr)));
+            float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr)));
 #endif
             _sum0 = vfmaq_f32(_sum0, _val, _w);
 
@@ -410,10 +410,10 @@ static void innerproduct_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const
             float32x4_t _w3 = vcvt_f32_f16(vld1_f16(kptr3));
 #else
             float32x4_t _val = vld1q_f32(sptr);
-            float32x4_t _w0 = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr0)));
-            float32x4_t _w1 = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr1)));
-            float32x4_t _w2 = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr2)));
-            float32x4_t _w3 = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr3)));
+            float32x4_t _w0 = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr0)));
+            float32x4_t _w1 = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr1)));
+            float32x4_t _w2 = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr2)));
+            float32x4_t _w3 = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr3)));
 #endif
 
             _sum0 = vfmaq_f32(_sum0, _val, _w0);
@@ -507,7 +507,7 @@ static void innerproduct_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const
             float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr));
 #else
             float32x4_t _val = vld1q_f32(sptr);
-            float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr)));
+            float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr)));
 #endif
             _sum = vfmaq_f32(_sum, _val, _w);
 
@@ -713,10 +713,10 @@ static void innerproduct_transform_kernel_fp16s_neon(const Mat& weight_data, Mat
             {
                 // transpose 4x4
                 uint16x4x4_t _p;
-                _p.val[0] = vreinterpret_u16_f16(vcvt_f16_f32(vld1q_f32(k0)));
-                _p.val[1] = vreinterpret_u16_f16(vcvt_f16_f32(vld1q_f32(k1)));
-                _p.val[2] = vreinterpret_u16_f16(vcvt_f16_f32(vld1q_f32(k2)));
-                _p.val[3] = vreinterpret_u16_f16(vcvt_f16_f32(vld1q_f32(k3)));
+                _p.val[0] = (uint16x4_t)(vcvt_f16_f32(vld1q_f32(k0)));
+                _p.val[1] = (uint16x4_t)(vcvt_f16_f32(vld1q_f32(k1)));
+                _p.val[2] = (uint16x4_t)(vcvt_f16_f32(vld1q_f32(k2)));
+                _p.val[3] = (uint16x4_t)(vcvt_f16_f32(vld1q_f32(k3)));
                 vst4_u16(g0, _p);
 
                 k0 += 4;
diff --git a/src/layer/arm/innerproduct_gemm_fp16s.h b/src/layer/arm/innerproduct_gemm_fp16s.h
index 8e6731dc8..f7daa17da 100644
--- a/src/layer/arm/innerproduct_gemm_fp16s.h
+++ b/src/layer/arm/innerproduct_gemm_fp16s.h
@@ -120,7 +120,7 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob,
                     float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr));
 #else
                     float32x4_t _val = vld1q_f32(m);
-                    float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr)));
+                    float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr)));
 #endif
 
 #if __aarch64__
@@ -214,10 +214,10 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob,
                     float32x4_t _val = vld1q_f32(m);
                     uint16x8_t _w01 = vld1q_u16(kptr);
                     uint16x8_t _w23 = vld1q_u16(kptr + 8);
-                    float32x4_t _w0 = vcvt_f32_f16(vreinterpret_f16_u16(vget_low_u16(_w01)));
-                    float32x4_t _w1 = vcvt_f32_f16(vreinterpret_f16_u16(vget_high_u16(_w01)));
-                    float32x4_t _w2 = vcvt_f32_f16(vreinterpret_f16_u16(vget_low_u16(_w23)));
-                    float32x4_t _w3 = vcvt_f32_f16(vreinterpret_f16_u16(vget_high_u16(_w23)));
+                    float32x4_t _w0 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w01)));
+                    float32x4_t _w1 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w01)));
+                    float32x4_t _w2 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w23)));
+                    float32x4_t _w3 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w23)));
 #endif
 
 #if __aarch64__
@@ -242,7 +242,7 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob,
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                     float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr));
 #else
-                    float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr)));
+                    float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr)));
 #endif
                     _sum0 = vfmaq_f32(_sum0, _val, _w);
 
@@ -317,7 +317,7 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob,
                     float32x4_t _val1 = vld1q_f32(m + 4);
                     float32x4_t _val2 = vld1q_f32(m + 8);
                     float32x4_t _val3 = vld1q_f32(m + 12);
-                    float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr)));
+                    float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr)));
 #endif
 
 #if __aarch64__
@@ -414,8 +414,8 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob,
                     float32x4_t _val0 = vld1q_f32(m);
                     float32x4_t _val1 = vld1q_f32(m + 4);
                     uint16x8_t _w01 = vld1q_u16(kptr);
-                    float32x4_t _w0 = vcvt_f32_f16(vreinterpret_f16_u16(vget_low_u16(_w01)));
-                    float32x4_t _w1 = vcvt_f32_f16(vreinterpret_f16_u16(vget_high_u16(_w01)));
+                    float32x4_t _w0 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w01)));
+                    float32x4_t _w1 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w01)));
 #endif
 
                     _sum0 = vfmaq_f32(_sum0, _val0, _w0);
@@ -433,7 +433,7 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob,
                     float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr));
 #else
                     float32x4_t _val = vld1q_f32(m);
-                    float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr)));
+                    float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr)));
 #endif
 
                     _sum0 = vfmaq_f32(_sum0, _val, _w);
diff --git a/src/layer/arm/neon_mathfun_fp16s.h b/src/layer/arm/neon_mathfun_fp16s.h
index f1759f518..074681809 100644
--- a/src/layer/arm/neon_mathfun_fp16s.h
+++ b/src/layer/arm/neon_mathfun_fp16s.h
@@ -89,9 +89,9 @@ static inline float16x4_t log_ps(float16x4_t x)
      *     } else { x = x - 1.0; }
      */
     uint16x4_t mask = vclt_f16(x, vdup_n_f16(c_cephes_SQRTHF));
-    float16x4_t tmp = vreinterpret_f16_u16(vand_u16(vreinterpret_u16_f16(x), mask));
+    float16x4_t tmp = (float16x4_t)(vand_u16((uint16x4_t)(x), mask));
     x = vsub_f16(x, one);
-    e = vsub_f16(e, vreinterpret_f16_u16(vand_u16(vreinterpret_u16_f16(one), mask)));
+    e = vsub_f16(e, (float16x4_t)(vand_u16((uint16x4_t)(one), mask)));
     x = vadd_f16(x, tmp);
 
     float16x4_t z = vmul_f16(x, x);
@@ -115,7 +115,7 @@ static inline float16x4_t log_ps(float16x4_t x)
 
     x = vadd_f16(x, y);
     x = vfma_f16(x, e, vdup_n_f16(c_cephes_log_q2));
-    x = vreinterpret_f16_u16(vorr_u16(vreinterpret_u16_f16(x), invalid_mask)); // negative arg will be NAN
+    x = (float16x4_t)(vorr_u16((uint16x4_t)(x), invalid_mask)); // negative arg will be NAN
     return x;
 }
 
@@ -208,9 +208,9 @@ static inline float16x4_t exp_ps(float16x4_t x)
 
     /* if greater, substract 1 */
     uint16x4_t mask = vcgt_f16(tmp, fx);
-    mask = vand_u16(mask, vreinterpret_u16_f16(one));
+    mask = vand_u16(mask, (uint16x4_t)(one));
 
-    fx = vsub_f16(tmp, vreinterpret_f16_u16(mask));
+    fx = vsub_f16(tmp, (float16x4_t)(mask));
 
     tmp = vmul_f16(fx, vdup_n_f16(c_cephes_exp_C1));
     float16x4_t z = vmul_f16(fx, vdup_n_f16(c_cephes_exp_C2));
@@ -489,7 +489,7 @@ static inline float16x4_t tanh_ps(float16x4_t x)
 
     // clamp the inputs to the range [-9, 9] since anything outside
     // this range is -/+1.0f in single-precision.
-    x2 = vreinterpret_f16_u16(vbsl_u16(vcge_f16(vdup_n_f16(c_tanh_hi), x2), vreinterpret_u16_f16(x2), vreinterpret_u16_f16(vdup_n_f16(c_tanh_hi))));
+    x2 = (float16x4_t)(vbsl_u16(vcge_f16(vdup_n_f16(c_tanh_hi), x2), (uint16x4_t)(x2), (uint16x4_t)(vdup_n_f16(c_tanh_hi))));
 
     // since the polynomials are odd/even, we need x**2.
     float16x4_t z = vmul_f16(x2, x2);
@@ -514,10 +514,10 @@ static inline float16x4_t tanh_ps(float16x4_t x)
     y = vdiv_f16(y, w);
 
     // reinstate the sign.
-    y = vreinterpret_f16_u16(vbsl_u16(vdup_n_u16(1u << 15), vreinterpret_u16_f16(x), vreinterpret_u16_f16(y)));
+    y = (float16x4_t)(vbsl_u16(vdup_n_u16(1u << 15), (uint16x4_t)(x), (uint16x4_t)(y)));
 
     // when the argument is very small in magnitude it's more accurate to just return it.
-    y = vreinterpret_f16_u16(vbsl_u16(tiny_mask, vreinterpret_u16_f16(y), vreinterpret_u16_f16(x)));
+    y = (float16x4_t)(vbsl_u16(tiny_mask, (uint16x4_t)(y), (uint16x4_t)(x)));
 
     return y;
 }