diff --git a/src/layer/arm/neon_mathfun.h b/src/layer/arm/neon_mathfun.h index daffae56e..5d85028ee 100644 --- a/src/layer/arm/neon_mathfun.h +++ b/src/layer/arm/neon_mathfun.h @@ -310,6 +310,14 @@ static inline float32x4_t div_ps(float32x4_t a, float32x4_t b) #endif } +static inline float32x4_t tan_ps(float32x4_t x) +{ + float32x4_t ysin, ycos; + sincos_ps(x, &ysin, &ycos); + float32x4_t ytan = div_ps(ysin, ycos); + return ytan; +} + static inline float32x4_t pow_ps(float32x4_t a, float32x4_t b) { // pow(x, m) = exp(m * log(x)) diff --git a/src/layer/arm/unaryop_arm.cpp b/src/layer/arm/unaryop_arm.cpp index e427b7284..c637ab1bc 100644 --- a/src/layer/arm/unaryop_arm.cpp +++ b/src/layer/arm/unaryop_arm.cpp @@ -185,14 +185,7 @@ struct unary_op_tan_pack4 { float32x4_t operator()(const float32x4_t& x) const { - // TODO neon optimize - float tmp[4]; - vst1q_f32(tmp, x); - tmp[0] = tan(tmp[0]); - tmp[1] = tan(tmp[1]); - tmp[2] = tan(tmp[2]); - tmp[3] = tan(tmp[3]); - return vld1q_f32(tmp); + return tan_ps(x); } };