Browse Source

Reduce loop unrolling in the ARM unaryop kernels; fix a compiler warning in ARM padding

tags/20220701
nihui 4 years ago
parent
commit
9376ba71c1
3 changed files with 8 additions and 34 deletions
  1. +8 -0 src/layer/arm/padding_arm.cpp
  2. +0 -18 src/layer/arm/unaryop_arm.cpp
  3. +0 -16 src/layer/arm/unaryop_arm_asimdhp.cpp

+8 -0 src/layer/arm/padding_arm.cpp View File

@@ -419,6 +419,8 @@ int Padding_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons
else
#endif
{
// shall never reach here
pad_value = vdup_n_u16(0);
}
// *INDENT-ON*
// clang-format on
@@ -464,6 +466,8 @@ int Padding_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons
else
#endif
{
// shall never reach here
pad_value = vdup_n_u16(0);
}
// *INDENT-ON*
// clang-format on
@@ -516,6 +520,8 @@ int Padding_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons
else
#endif
{
// shall never reach here
pad_value = vdup_n_u16(0);
}
// *INDENT-ON*
// clang-format on
@@ -574,6 +580,8 @@ int Padding_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons
else
#endif
{
// shall never reach here
pad_value = vdup_n_u16(0);
}
// *INDENT-ON*
// clang-format on


+0 -18 src/layer/arm/unaryop_arm.cpp View File

@@ -58,24 +58,6 @@ static int unary_op_inplace(Mat& a, const Option& opt)

int i = 0;
#if __ARM_NEON
#if __aarch64__
for (; i + 15 < size; i += 16)
{
float32x4_t _p0 = vld1q_f32(ptr);
float32x4_t _p1 = vld1q_f32(ptr + 4);
float32x4_t _p2 = vld1q_f32(ptr + 8);
float32x4_t _p3 = vld1q_f32(ptr + 12);
_p0 = op.func_pack4(_p0);
_p1 = op.func_pack4(_p1);
_p2 = op.func_pack4(_p2);
_p3 = op.func_pack4(_p3);
vst1q_f32(ptr, _p0);
vst1q_f32(ptr + 4, _p1);
vst1q_f32(ptr + 8, _p2);
vst1q_f32(ptr + 12, _p3);
ptr += 16;
}
#endif // __aarch64__
for (; i + 7 < size; i += 8)
{
float32x4_t _p0 = vld1q_f32(ptr);


+0 -16 src/layer/arm/unaryop_arm_asimdhp.cpp View File

@@ -44,22 +44,6 @@ static int unary_op_inplace_fp16s(Mat& a, const Option& opt)
__fp16* ptr = a.channel(q);

int i = 0;
for (; i + 31 < size; i += 32)
{
float16x8_t _p0 = vld1q_f16(ptr);
float16x8_t _p1 = vld1q_f16(ptr + 8);
float16x8_t _p2 = vld1q_f16(ptr + 16);
float16x8_t _p3 = vld1q_f16(ptr + 24);
_p0 = op.func_pack8(_p0);
_p1 = op.func_pack8(_p1);
_p2 = op.func_pack8(_p2);
_p3 = op.func_pack8(_p3);
vst1q_f16(ptr, _p0);
vst1q_f16(ptr + 8, _p1);
vst1q_f16(ptr + 16, _p2);
vst1q_f16(ptr + 24, _p3);
ptr += 32;
}
for (; i + 15 < size; i += 16)
{
float16x8_t _p0 = vld1q_f16(ptr);


Loading…
Cancel
Save