diff --git a/src/layer/arm/neon_mathfun.h b/src/layer/arm/neon_mathfun.h
index daffae56e..5d85028ee 100644
--- a/src/layer/arm/neon_mathfun.h
+++ b/src/layer/arm/neon_mathfun.h
@@ -310,6 +310,14 @@ static inline float32x4_t div_ps(float32x4_t a, float32x4_t b)
 #endif
 }
 
+static inline float32x4_t tan_ps(float32x4_t x)
+{
+    // tan(x) = sin(x) / cos(x)
+    float32x4_t s, c;
+    sincos_ps(x, &s, &c);
+    return div_ps(s, c);
+}
+
 static inline float32x4_t pow_ps(float32x4_t a, float32x4_t b)
 {
     // pow(x, m) = exp(m * log(x))
diff --git a/src/layer/arm/unaryop_arm.cpp b/src/layer/arm/unaryop_arm.cpp
index e427b7284..c637ab1bc 100644
--- a/src/layer/arm/unaryop_arm.cpp
+++ b/src/layer/arm/unaryop_arm.cpp
@@ -185,14 +185,7 @@ struct unary_op_tan_pack4
 {
     float32x4_t operator()(const float32x4_t& x) const
     {
-        // TODO neon optimize
-        float tmp[4];
-        vst1q_f32(tmp, x);
-        tmp[0] = tan(tmp[0]);
-        tmp[1] = tan(tmp[1]);
-        tmp[2] = tan(tmp[2]);
-        tmp[3] = tan(tmp[3]);
-        return vld1q_f32(tmp);
+        return tan_ps(x); // NEON tan on all four lanes; tan_ps is defined in neon_mathfun.h
     }
 };