| @@ -171,7 +171,7 @@ if((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm") | |||
| check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; float16x8_t _a, _b; _s = vfmlalq_low_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML) | |||
| set(CMAKE_REQUIRED_FLAGS "-march=armv8.4-a+bf16") | |||
| check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; bfloat16x8_t _a, _b; _s = vbfmmlaq_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_BF16) | |||
| check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; bfloat16x8_t _a, _b; _s = vcvt_f32_bf16(vcvt_bf16_f32(vbfmmlaq_f32(_s, _a, _b))); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_BF16) | |||
| set(CMAKE_REQUIRED_FLAGS "-march=armv8.4-a+i8mm") | |||
| check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vmmlaq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_I8MM) | |||
| @@ -150,7 +150,7 @@ static void cast_fp32_to_bf16_neon(const Mat& bottom_blob, Mat& top_blob, const | |||
| static void cast_bf16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt) | |||
| { | |||
| #if NCNN_ARM84BF16 && __aarch64__ && !__ARM_FEATURE_BF16_VECTOR_ARITHMETIC | |||
| #if NCNN_RUNTIME_CPU && NCNN_ARM84BF16 && __aarch64__ && !__ARM_FEATURE_BF16_VECTOR_ARITHMETIC | |||
| if (ncnn::cpu_support_arm_bf16()) | |||
| { | |||
| cast_bf16_to_fp32_neon_bf16(bottom_blob, top_blob, opt); | |||
| @@ -47,12 +47,12 @@ static void cast_fp32_to_fp16_neon(const Mat& bottom_blob, Mat& top_blob, const | |||
| { | |||
| #if __aarch64__ | |||
| asm volatile( | |||
| "prfm pldl1keep, [%0, #512] \n" | |||
| "prfm pldl1keep, [%0, #512] \n" | |||
| "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n" | |||
| "fcvtn v0.4h, v0.4s \n" | |||
| "fcvtn v1.4h, v1.4s \n" | |||
| "fcvtn v2.4h, v2.4s \n" | |||
| "fcvtn v3.4h, v3.4s \n" | |||
| "fcvtn v0.4h, v0.4s \n" | |||
| "fcvtn v1.4h, v1.4s \n" | |||
| "fcvtn v2.4h, v2.4s \n" | |||
| "fcvtn v3.4h, v3.4s \n" | |||
| "st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [%1], #32 \n" | |||
| : "=r"(ptr), // %0 | |||
| "=r"(outptr) // %1 | |||
| @@ -61,12 +61,12 @@ static void cast_fp32_to_fp16_neon(const Mat& bottom_blob, Mat& top_blob, const | |||
| : "memory", "v0", "v1", "v2", "v3"); | |||
| #else // __aarch64__ | |||
| asm volatile( | |||
| "pld [%0, #512] \n" | |||
| "vldm %0!, {d0-d7} \n" | |||
| "vcvt.f16.f32 d0, q0 \n" | |||
| "vcvt.f16.f32 d1, q1 \n" | |||
| "vcvt.f16.f32 d2, q2 \n" | |||
| "vcvt.f16.f32 d3, q3 \n" | |||
| "pld [%0, #512] \n" | |||
| "vldm %0!, {d0-d7} \n" | |||
| "vcvt.f16.f32 d0, q0 \n" | |||
| "vcvt.f16.f32 d1, q1 \n" | |||
| "vcvt.f16.f32 d2, q2 \n" | |||
| "vcvt.f16.f32 d3, q3 \n" | |||
| "vst1.u16 {d0-d3}, [%1 :128]! \n" | |||
| : "=r"(ptr), // %0 | |||
| "=r"(outptr) // %1 | |||
| @@ -77,24 +77,61 @@ static void cast_fp32_to_fp16_neon(const Mat& bottom_blob, Mat& top_blob, const | |||
| } | |||
| for (; i + 7 < size; i += 8) | |||
| { | |||
| float32x4_t _p0_fp32 = vld1q_f32(ptr); | |||
| float32x4_t _p1_fp32 = vld1q_f32(ptr + 4); | |||
| float16x4_t _p0_fp16 = vcvt_f16_f32(_p0_fp32); | |||
| float16x4_t _p1_fp16 = vcvt_f16_f32(_p1_fp32); | |||
| uint16x8_t _p_fp16 = vcombine_u16(vreinterpret_u16_f16(_p0_fp16), vreinterpret_u16_f16(_p1_fp16)); | |||
| vst1q_u16(outptr, _p_fp16); | |||
| ptr += 8; | |||
| outptr += 8; | |||
| // This is originally implemented with neon fp16 intrinsics. | |||
| // In the new version of gcc, __ARM_FP16_FORMAT_IEEE or __ARM_FP16_FORMAT_ALTERNATIVE needs to be defined to use the float16x4_t type. | |||
| // That leads to compiler error when compiled with -mfpu=neon-vfpv4 but without -mfp16-format=ieee flag. | |||
| // We could add more macro conditions to differentiate between old and new versions, but that's pretty ugly! | |||
| // Just use all inline assembly here ~ | |||
| // --- nihui | |||
| #if __aarch64__ | |||
| asm volatile( | |||
| "ld1 {v0.4s, v1.4s}, [%0], #32 \n" | |||
| "fcvtn v0.4h, v0.4s \n" | |||
| "fcvtn v1.4h, v1.4s \n" | |||
| "st1 {v0.4h, v1.4h}, [%1], #16 \n" | |||
| : "=r"(ptr), // %0 | |||
| "=r"(outptr) // %1 | |||
| : "0"(ptr), | |||
| "1"(outptr) | |||
| : "memory", "v0", "v1"); | |||
| #else // __aarch64__ | |||
| asm volatile( | |||
| "vld1.f32 {d0-d3}, [%0]! \n" | |||
| "vcvt.f16.f32 d0, q0 \n" | |||
| "vcvt.f16.f32 d1, q1 \n" | |||
| "vst1.u16 {d0-d1}, [%1]! \n" | |||
| : "=r"(ptr), // %0 | |||
| "=r"(outptr) // %1 | |||
| : "0"(ptr), | |||
| "1"(outptr) | |||
| : "memory", "q0", "q1"); | |||
| #endif // __aarch64__ | |||
| } | |||
| for (; i + 3 < size; i += 4) | |||
| { | |||
| float32x4_t _p_fp32 = vld1q_f32(ptr); | |||
| float16x4_t _p_fp16 = vcvt_f16_f32(_p_fp32); | |||
| vst1_u16(outptr, vreinterpret_u16_f16(_p_fp16)); | |||
| ptr += 4; | |||
| outptr += 4; | |||
| #if __aarch64__ | |||
| asm volatile( | |||
| "ld1 {v0.4s}, [%0], #16 \n" | |||
| "fcvtn v0.4h, v0.4s \n" | |||
| "st1 {v0.4h}, [%1], #8 \n" | |||
| : "=r"(ptr), // %0 | |||
| "=r"(outptr) // %1 | |||
| : "0"(ptr), | |||
| "1"(outptr) | |||
| : "memory", "v0"); | |||
| #else // __aarch64__ | |||
| asm volatile( | |||
| "vld1.f32 {d0-d1}, [%0]! \n" | |||
| "vcvt.f16.f32 d0, q0 \n" | |||
| "vst1.u16 {d0}, [%1]! \n" | |||
| : "=r"(ptr), // %0 | |||
| "=r"(outptr) // %1 | |||
| : "0"(ptr), | |||
| "1"(outptr) | |||
| : "memory", "q0"); | |||
| #endif // __aarch64__ | |||
| } | |||
| #endif | |||
| #endif // (__ARM_FP & 2) | |||
| for (; i < size; i++) | |||
| { | |||
| *outptr++ = float32_to_float16(*ptr++); | |||
| @@ -104,7 +141,7 @@ static void cast_fp32_to_fp16_neon(const Mat& bottom_blob, Mat& top_blob, const | |||
| static void cast_fp16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt) | |||
| { | |||
| #if NCNN_VFPV4 && __ARM_NEON && !(__ARM_FP & 2) | |||
| #if NCNN_RUNTIME_CPU && NCNN_VFPV4 && __ARM_NEON && !(__ARM_FP & 2) | |||
| if (ncnn::cpu_support_arm_vfpv4()) | |||
| { | |||
| cast_fp16_to_fp32_neon_vfpv4(bottom_blob, top_blob, opt); | |||
| @@ -132,12 +169,12 @@ static void cast_fp16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const | |||
| { | |||
| #if __aarch64__ | |||
| asm volatile( | |||
| "prfm pldl1keep, [%0, #256] \n" | |||
| "prfm pldl1keep, [%0, #256] \n" | |||
| "ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [%0], #32 \n" | |||
| "fcvtl v0.4s, v0.4h \n" | |||
| "fcvtl v1.4s, v1.4h \n" | |||
| "fcvtl v2.4s, v2.4h \n" | |||
| "fcvtl v3.4s, v3.4h \n" | |||
| "fcvtl v0.4s, v0.4h \n" | |||
| "fcvtl v1.4s, v1.4h \n" | |||
| "fcvtl v2.4s, v2.4h \n" | |||
| "fcvtl v3.4s, v3.4h \n" | |||
| "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n" | |||
| : "=r"(ptr), // %0 | |||
| "=r"(outptr) // %1 | |||
| @@ -146,13 +183,13 @@ static void cast_fp16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const | |||
| : "memory", "v0", "v1", "v2", "v3"); | |||
| #else // __aarch64__ | |||
| asm volatile( | |||
| "pld [%0, #256] \n" | |||
| "pld [%0, #256] \n" | |||
| "vld1.u16 {d4-d7}, [%0 :128]! \n" | |||
| "vcvt.f32.f16 q0, d4 \n" | |||
| "vcvt.f32.f16 q1, d5 \n" | |||
| "vcvt.f32.f16 q2, d6 \n" | |||
| "vcvt.f32.f16 q3, d7 \n" | |||
| "vstm %1!, {d0-d7} \n" | |||
| "vcvt.f32.f16 q0, d4 \n" | |||
| "vcvt.f32.f16 q1, d5 \n" | |||
| "vcvt.f32.f16 q2, d6 \n" | |||
| "vcvt.f32.f16 q3, d7 \n" | |||
| "vstm %1!, {d0-d7} \n" | |||
| : "=r"(ptr), // %0 | |||
| "=r"(outptr) // %1 | |||
| : "0"(ptr), | |||
| @@ -162,25 +199,55 @@ static void cast_fp16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const | |||
| } | |||
| for (; i + 7 < size; i += 8) | |||
| { | |||
| uint16x8_t _p_fp16 = vld1q_u16(ptr); | |||
| float16x4_t _p0_fp16 = vreinterpret_f16_u16(vget_low_u16(_p_fp16)); | |||
| float16x4_t _p1_fp16 = vreinterpret_f16_u16(vget_high_u16(_p_fp16)); | |||
| float32x4_t _p0_fp32 = vcvt_f32_f16(_p0_fp16); | |||
| float32x4_t _p1_fp32 = vcvt_f32_f16(_p1_fp16); | |||
| vst1q_f32(outptr, _p0_fp32); | |||
| vst1q_f32(outptr + 4, _p1_fp32); | |||
| ptr += 8; | |||
| outptr += 8; | |||
| #if __aarch64__ | |||
| asm volatile( | |||
| "ld1 {v0.4h, v1.4h}, [%0], #16 \n" | |||
| "fcvtl v0.4s, v0.4h \n" | |||
| "fcvtl v1.4s, v1.4h \n" | |||
| "st1 {v0.4s, v1.4s}, [%1], #32 \n" | |||
| : "=r"(ptr), // %0 | |||
| "=r"(outptr) // %1 | |||
| : "0"(ptr), | |||
| "1"(outptr) | |||
| : "memory", "v0", "v1"); | |||
| #else // __aarch64__ | |||
| asm volatile( | |||
| "vld1.u16 {d4-d5}, [%0]! \n" | |||
| "vcvt.f32.f16 q0, d4 \n" | |||
| "vcvt.f32.f16 q1, d5 \n" | |||
| "vst1.f32 {d0-d3}, [%1]! \n" | |||
| : "=r"(ptr), // %0 | |||
| "=r"(outptr) // %1 | |||
| : "0"(ptr), | |||
| "1"(outptr) | |||
| : "memory", "q0", "q1", "q2"); | |||
| #endif // __aarch64__ | |||
| } | |||
| for (; i + 3 < size; i += 4) | |||
| { | |||
| float16x4_t _p_fp16 = vreinterpret_f16_u16(vld1_u16(ptr)); | |||
| float32x4_t _p_fp32 = vcvt_f32_f16(_p_fp16); | |||
| vst1q_f32(outptr, _p_fp32); | |||
| ptr += 4; | |||
| outptr += 4; | |||
| #if __aarch64__ | |||
| asm volatile( | |||
| "ld1 {v0.4h}, [%0], #8 \n" | |||
| "fcvtl v0.4s, v0.4h \n" | |||
| "st1 {v0.4s}, [%1], #16 \n" | |||
| : "=r"(ptr), // %0 | |||
| "=r"(outptr) // %1 | |||
| : "0"(ptr), | |||
| "1"(outptr) | |||
| : "memory", "v0"); | |||
| #else // __aarch64__ | |||
| asm volatile( | |||
| "vld1.u16 {d2}, [%0]! \n" | |||
| "vcvt.f32.f16 q0, d2 \n" | |||
| "vst1.f32 {d0-d1}, [%1]! \n" | |||
| : "=r"(ptr), // %0 | |||
| "=r"(outptr) // %1 | |||
| : "0"(ptr), | |||
| "1"(outptr) | |||
| : "memory", "q0", "q1"); | |||
| #endif // __aarch64__ | |||
| } | |||
| #endif | |||
| #endif // (__ARM_FP & 2) | |||
| for (; i < size; i++) | |||
| { | |||
| *outptr++ = float16_to_float32(*ptr++); | |||
| @@ -253,10 +253,10 @@ static void innerproduct_pack4_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, | |||
| float32x4_t _val = vld1q_f32(sptr); | |||
| uint16x8_t _w01 = vld1q_u16(kptr); | |||
| uint16x8_t _w23 = vld1q_u16(kptr + 8); | |||
| float32x4_t _w0 = vcvt_f32_f16(vreinterpret_f16_u16(vget_low_u16(_w01))); | |||
| float32x4_t _w1 = vcvt_f32_f16(vreinterpret_f16_u16(vget_high_u16(_w01))); | |||
| float32x4_t _w2 = vcvt_f32_f16(vreinterpret_f16_u16(vget_low_u16(_w23))); | |||
| float32x4_t _w3 = vcvt_f32_f16(vreinterpret_f16_u16(vget_high_u16(_w23))); | |||
| float32x4_t _w0 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w01))); | |||
| float32x4_t _w1 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w01))); | |||
| float32x4_t _w2 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w23))); | |||
| float32x4_t _w3 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w23))); | |||
| #endif | |||
| #if __aarch64__ | |||
| @@ -281,7 +281,7 @@ static void innerproduct_pack4_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr)); | |||
| #else | |||
| float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr))); | |||
| float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr))); | |||
| #endif | |||
| _sum0 = vfmaq_f32(_sum0, _val, _w); | |||
| @@ -410,10 +410,10 @@ static void innerproduct_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const | |||
| float32x4_t _w3 = vcvt_f32_f16(vld1_f16(kptr3)); | |||
| #else | |||
| float32x4_t _val = vld1q_f32(sptr); | |||
| float32x4_t _w0 = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr0))); | |||
| float32x4_t _w1 = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr1))); | |||
| float32x4_t _w2 = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr2))); | |||
| float32x4_t _w3 = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr3))); | |||
| float32x4_t _w0 = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr0))); | |||
| float32x4_t _w1 = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr1))); | |||
| float32x4_t _w2 = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr2))); | |||
| float32x4_t _w3 = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr3))); | |||
| #endif | |||
| _sum0 = vfmaq_f32(_sum0, _val, _w0); | |||
| @@ -507,7 +507,7 @@ static void innerproduct_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const | |||
| float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr)); | |||
| #else | |||
| float32x4_t _val = vld1q_f32(sptr); | |||
| float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr))); | |||
| float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr))); | |||
| #endif | |||
| _sum = vfmaq_f32(_sum, _val, _w); | |||
| @@ -713,10 +713,10 @@ static void innerproduct_transform_kernel_fp16s_neon(const Mat& weight_data, Mat | |||
| { | |||
| // transpose 4x4 | |||
| uint16x4x4_t _p; | |||
| _p.val[0] = vreinterpret_u16_f16(vcvt_f16_f32(vld1q_f32(k0))); | |||
| _p.val[1] = vreinterpret_u16_f16(vcvt_f16_f32(vld1q_f32(k1))); | |||
| _p.val[2] = vreinterpret_u16_f16(vcvt_f16_f32(vld1q_f32(k2))); | |||
| _p.val[3] = vreinterpret_u16_f16(vcvt_f16_f32(vld1q_f32(k3))); | |||
| _p.val[0] = (uint16x4_t)(vcvt_f16_f32(vld1q_f32(k0))); | |||
| _p.val[1] = (uint16x4_t)(vcvt_f16_f32(vld1q_f32(k1))); | |||
| _p.val[2] = (uint16x4_t)(vcvt_f16_f32(vld1q_f32(k2))); | |||
| _p.val[3] = (uint16x4_t)(vcvt_f16_f32(vld1q_f32(k3))); | |||
| vst4_u16(g0, _p); | |||
| k0 += 4; | |||
| @@ -120,7 +120,7 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, | |||
| float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr)); | |||
| #else | |||
| float32x4_t _val = vld1q_f32(m); | |||
| float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr))); | |||
| float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr))); | |||
| #endif | |||
| #if __aarch64__ | |||
| @@ -214,10 +214,10 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, | |||
| float32x4_t _val = vld1q_f32(m); | |||
| uint16x8_t _w01 = vld1q_u16(kptr); | |||
| uint16x8_t _w23 = vld1q_u16(kptr + 8); | |||
| float32x4_t _w0 = vcvt_f32_f16(vreinterpret_f16_u16(vget_low_u16(_w01))); | |||
| float32x4_t _w1 = vcvt_f32_f16(vreinterpret_f16_u16(vget_high_u16(_w01))); | |||
| float32x4_t _w2 = vcvt_f32_f16(vreinterpret_f16_u16(vget_low_u16(_w23))); | |||
| float32x4_t _w3 = vcvt_f32_f16(vreinterpret_f16_u16(vget_high_u16(_w23))); | |||
| float32x4_t _w0 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w01))); | |||
| float32x4_t _w1 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w01))); | |||
| float32x4_t _w2 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w23))); | |||
| float32x4_t _w3 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w23))); | |||
| #endif | |||
| #if __aarch64__ | |||
| @@ -242,7 +242,7 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr)); | |||
| #else | |||
| float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr))); | |||
| float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr))); | |||
| #endif | |||
| _sum0 = vfmaq_f32(_sum0, _val, _w); | |||
| @@ -317,7 +317,7 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, | |||
| float32x4_t _val1 = vld1q_f32(m + 4); | |||
| float32x4_t _val2 = vld1q_f32(m + 8); | |||
| float32x4_t _val3 = vld1q_f32(m + 12); | |||
| float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr))); | |||
| float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr))); | |||
| #endif | |||
| #if __aarch64__ | |||
| @@ -414,8 +414,8 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, | |||
| float32x4_t _val0 = vld1q_f32(m); | |||
| float32x4_t _val1 = vld1q_f32(m + 4); | |||
| uint16x8_t _w01 = vld1q_u16(kptr); | |||
| float32x4_t _w0 = vcvt_f32_f16(vreinterpret_f16_u16(vget_low_u16(_w01))); | |||
| float32x4_t _w1 = vcvt_f32_f16(vreinterpret_f16_u16(vget_high_u16(_w01))); | |||
| float32x4_t _w0 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w01))); | |||
| float32x4_t _w1 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w01))); | |||
| #endif | |||
| _sum0 = vfmaq_f32(_sum0, _val0, _w0); | |||
| @@ -433,7 +433,7 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, | |||
| float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr)); | |||
| #else | |||
| float32x4_t _val = vld1q_f32(m); | |||
| float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr))); | |||
| float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr))); | |||
| #endif | |||
| _sum0 = vfmaq_f32(_sum0, _val, _w); | |||
| @@ -89,9 +89,9 @@ static inline float16x4_t log_ps(float16x4_t x) | |||
| * } else { x = x - 1.0; } | |||
| */ | |||
| uint16x4_t mask = vclt_f16(x, vdup_n_f16(c_cephes_SQRTHF)); | |||
| float16x4_t tmp = vreinterpret_f16_u16(vand_u16(vreinterpret_u16_f16(x), mask)); | |||
| float16x4_t tmp = (float16x4_t)(vand_u16((uint16x4_t)(x), mask)); | |||
| x = vsub_f16(x, one); | |||
| e = vsub_f16(e, vreinterpret_f16_u16(vand_u16(vreinterpret_u16_f16(one), mask))); | |||
| e = vsub_f16(e, (float16x4_t)(vand_u16((uint16x4_t)(one), mask))); | |||
| x = vadd_f16(x, tmp); | |||
| float16x4_t z = vmul_f16(x, x); | |||
| @@ -115,7 +115,7 @@ static inline float16x4_t log_ps(float16x4_t x) | |||
| x = vadd_f16(x, y); | |||
| x = vfma_f16(x, e, vdup_n_f16(c_cephes_log_q2)); | |||
| x = vreinterpret_f16_u16(vorr_u16(vreinterpret_u16_f16(x), invalid_mask)); // negative arg will be NAN | |||
| x = (float16x4_t)(vorr_u16((uint16x4_t)(x), invalid_mask)); // negative arg will be NAN | |||
| return x; | |||
| } | |||
| @@ -208,9 +208,9 @@ static inline float16x4_t exp_ps(float16x4_t x) | |||
| /* if greater, substract 1 */ | |||
| uint16x4_t mask = vcgt_f16(tmp, fx); | |||
| mask = vand_u16(mask, vreinterpret_u16_f16(one)); | |||
| mask = vand_u16(mask, (uint16x4_t)(one)); | |||
| fx = vsub_f16(tmp, vreinterpret_f16_u16(mask)); | |||
| fx = vsub_f16(tmp, (float16x4_t)(mask)); | |||
| tmp = vmul_f16(fx, vdup_n_f16(c_cephes_exp_C1)); | |||
| float16x4_t z = vmul_f16(fx, vdup_n_f16(c_cephes_exp_C2)); | |||
| @@ -489,7 +489,7 @@ static inline float16x4_t tanh_ps(float16x4_t x) | |||
| // clamp the inputs to the range [-9, 9] since anything outside | |||
| // this range is -/+1.0f in single-precision. | |||
| x2 = vreinterpret_f16_u16(vbsl_u16(vcge_f16(vdup_n_f16(c_tanh_hi), x2), vreinterpret_u16_f16(x2), vreinterpret_u16_f16(vdup_n_f16(c_tanh_hi)))); | |||
| x2 = (float16x4_t)(vbsl_u16(vcge_f16(vdup_n_f16(c_tanh_hi), x2), (uint16x4_t)(x2), (uint16x4_t)(vdup_n_f16(c_tanh_hi)))); | |||
| // since the polynomials are odd/even, we need x**2. | |||
| float16x4_t z = vmul_f16(x2, x2); | |||
| @@ -514,10 +514,10 @@ static inline float16x4_t tanh_ps(float16x4_t x) | |||
| y = vdiv_f16(y, w); | |||
| // reinstate the sign. | |||
| y = vreinterpret_f16_u16(vbsl_u16(vdup_n_u16(1u << 15), vreinterpret_u16_f16(x), vreinterpret_u16_f16(y))); | |||
| y = (float16x4_t)(vbsl_u16(vdup_n_u16(1u << 15), (uint16x4_t)(x), (uint16x4_t)(y))); | |||
| // when the argument is very small in magnitude it's more accurate to just return it. | |||
| y = vreinterpret_f16_u16(vbsl_u16(tiny_mask, vreinterpret_u16_f16(y), vreinterpret_u16_f16(x))); | |||
| y = (float16x4_t)(vbsl_u16(tiny_mask, (uint16x4_t)(y), (uint16x4_t)(x))); | |||
| return y; | |||
| } | |||