@@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,8 +13,9 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 #include "nnacl/fp16/activation_fp16.h"
+#include "nnacl/fp32/exp_fp32.h"
 #include "nnacl/errorcode.h"
 
 int ReluFp16(const float16_t *src, float16_t *dst, int ele_num) {
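(The new include brings in simd_exp and single_exp, the fp32 exponential kernels that the vectorized SigmoidFp16 below uses in place of the previous per-element libm exp() call.)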
@@ -60,8 +60,19 @@ int LReluFp16(const float16_t *src, float16_t *dst, int ele_num, float16_t alpha
 }
 
 int SigmoidFp16(const float16_t *src, float16_t *dst, int ele_num) {
-  for (int i = 0; i < ele_num; ++i) {
-    dst[i] = (float16_t)1.0f / (float16_t)(1.0f + exp(-src[i]));
+  int i = 0;
+#ifdef ENABLE_ARM64
+  int count = (ele_num / C4NUM) * C4NUM;
+  for (; i < count; i += C4NUM) {
+    float32x4_t tmp;
+    simd_exp(vnegq_f32(vcvt_f32_f16(vld1_f16(src + i))), (float *)&tmp);
+    vst1_f16(dst + i, vcvt_f16_f32(vdivq_f32(vdupq_n_f32(1.0f), vaddq_f32(vdupq_n_f32(1.0f), tmp))));
+  }
+#endif
+  for (; i < ele_num; ++i) {
+    float temp;
+    single_exp(-src[i], &temp);
+    dst[i] = (float16_t)1.0f / ((float16_t)1.0f + temp);
   }
   return NNACL_OK;
 }
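Both the ARM64 path and the scalar tail compute the same thing: widen fp16 to fp32, form e^-x (simd_exp handles four lanes at a time, single_exp the leftover elements), evaluate 1/(1 + e^-x), and narrow back to fp16. A minimal scalar reference for cross-checking outputs, with libm's expf standing in for the nnacl exp kernels (which approximate it):

    #include <arm_neon.h> /* float16_t; assumes an AArch64 toolchain */
    #include <math.h>

    /* Reference sigmoid for one element: same widen/narrow points as the
     * vectorized path above, with expf in place of simd_exp/single_exp. */
    static float16_t SigmoidRefFp16(float16_t x) {
      float e = expf(-(float)x);              /* exponential computed in fp32 */
      return (float16_t)(1.0f / (1.0f + e)); /* narrow the result back to fp16 */
    }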
@@ -80,8 +91,33 @@ float16_t TanhOptFp16(float16_t src) {
 }
 
 int TanhFp16(const float16_t *src, float16_t *dst, int ele_num) {
-  for (int i = 0; i < ele_num; ++i) {
-    dst[i] = TanhOptFp16(src[i]);
+  int i = 0;
+#ifdef ENABLE_ARM64
+  static float32x4_t paramv[] = {{378.0f, 378.0f, 378.0f, 378.0f},
+                                 {17325.0f, 17325.0f, 17325.0f, 17325.0f},
+                                 {135135.0f, 135135.0f, 135135.0f, 135135.0f},
+                                 {28.0f, 28.0f, 28.0f, 28.0f},
+                                 {3150.0f, 3150.0f, 3150.0f, 3150.0f},
+                                 {62370.0f, 62370.0f, 62370.0f, 62370.0f}};
+  int count = (ele_num / C4NUM) * C4NUM;
+  for (; i < count; i += C4NUM) {
+    float32x4_t input = vcvt_f32_f16(vld1_f16(src + i));
+    float32x4_t square = vmulq_f32(input, input);
+    float32x4_t a = vmulq_f32(
+      vaddq_f32(vmulq_f32(vaddq_f32(vmulq_f32(vaddq_f32(square, paramv[0]), square), paramv[1]), square), paramv[2]),
+      input);
+    float32x4_t b = vaddq_f32(
+      vmulq_f32(vaddq_f32(vmulq_f32(vaddq_f32(vmulq_f32(paramv[3], square), paramv[4]), square), paramv[5]), square),
+      paramv[2]);
+    vst1_f16(dst + i, vcvt_f16_f32(vdivq_f32(a, b)));
+  }
+#endif
+  for (; i < ele_num; ++i) {
+    float input = src[i];
+    float square = input * input;
+    float a = (((square + 378.0f) * square + 17325.0f) * square + 135135.0f) * input;
+    float b = ((28.0f * square + 3150.0f) * square + 62370.0f) * square + 135135.0f;
+    dst[i] = a / b;
   }
   return NNACL_OK;
 }
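The constants in both the NEON path and the scalar tail are the coefficients of the [7/6] Pade approximant of tanh at the origin (equivalently, the fourth convergent of Lambert's continued fraction), with the numerator a and denominator b evaluated in Horner form:

    \tanh(x) \approx \frac{x\,(x^{6} + 378x^{4} + 17325x^{2} + 135135)}{28x^{6} + 3150x^{4} + 62370x^{2} + 135135}

One caveat worth flagging in review: this rational function tends to x/28 rather than to +-1 as |x| grows, so it approximates tanh well only on a bounded interval around zero, and nothing in this hunk clamps the input. The old per-element TanhOptFp16 call is dropped here, so any saturation handling must come from the caller or elsewhere in the file.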