Browse Source

fix padding arm pack4 per_channel_pad

tags/20200226
nihui 6 years ago
parent
commit
4cd80f6636
1 changed file with 14 additions and 14 deletions
  1. +14
    -14
      src/layer/arm/padding_arm.cpp

+ 14
- 14
src/layer/arm/padding_arm.cpp View File

@@ -30,7 +30,7 @@ Padding_arm::Padding_arm()
}

#if __ARM_NEON
static void padding_constant_pack4_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right, float v)
static void padding_constant_pack4_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right, float32x4_t v)
{
const float* ptr = src;
float* outptr = dst;
@@ -43,10 +43,10 @@ static void padding_constant_pack4_neon(const Mat& src, Mat& dst, int top, int b

#if __aarch64__
asm volatile(
"dup v0.4s, %w10 \n"
"dup v1.4s, %w10 \n"
"dup v2.4s, %w10 \n"
"dup v3.4s, %w10 \n"
"mov v0.4s, %10.4s \n"
"mov v1.4s, %10.4s \n"
"mov v2.4s, %10.4s \n"
"mov v3.4s, %10.4s \n"

// fill top
"lsr w4, %w8, #3 \n"// w4 = nn = top_size >> 3
@@ -198,15 +198,15 @@ static void padding_constant_pack4_neon(const Mat& src, Mat& dst, int top, int b
"r"(right), // %7
"r"(top_size), // %8
"r"(bottom_size), // %9
"r"(v) // %10
"w"(v) // %10
: "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
);
#else // __aarch64__
asm volatile(
"vdup.f32 q0, %10 \n"
"vdup.f32 q1, %10 \n"
"vdup.f32 q2, %10 \n"
"vdup.f32 q3, %10 \n"
"vmov.f32 q0, %q10 \n"
"vmov.f32 q1, %q10 \n"
"vmov.f32 q2, %q10 \n"
"vmov.f32 q3, %q10 \n"

// fill top
"lsr r4, %8, #3 \n"// r4 = nn = top_size >> 3
@@ -358,7 +358,7 @@ static void padding_constant_pack4_neon(const Mat& src, Mat& dst, int top, int b
"r"(right), // %7
"r"(top_size), // %8
"r"(bottom_size), // %9
"r"(v) // %10
"w"(v) // %10
: "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
#endif // __aarch64__
@@ -553,7 +553,7 @@ int Padding_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op
return -100;

if (type == 0)
padding_constant_pack4_neon(bottom_blob, top_blob, 0, 0, left, right, value);
padding_constant_pack4_neon(bottom_blob, top_blob, 0, 0, left, right, vdupq_n_f32(value));
else if (type == 1)
padding_replicate_pack4_neon(bottom_blob, top_blob, 0, 0, left, right);
else // if (type == 2)
@@ -571,7 +571,7 @@ int Padding_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op
return -100;

if (type == 0)
padding_constant_pack4_neon(bottom_blob, top_blob, top, bottom, left, right, value);
padding_constant_pack4_neon(bottom_blob, top_blob, top, bottom, left, right, vdupq_n_f32(value));
else if (type == 1)
padding_replicate_pack4_neon(bottom_blob, top_blob, top, bottom, left, right);
else // if (type == 2)
@@ -592,7 +592,7 @@ int Padding_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op
const Mat m = bottom_blob.channel(q);
Mat borderm = top_blob.channel(q);

float pad_value = per_channel_pad_data_size ? per_channel_pad_data[q] : value;
float32x4_t pad_value = per_channel_pad_data_size ? vld1q_f32((const float*)per_channel_pad_data + q * 4) : vdupq_n_f32(value);

if (type == 0)
padding_constant_pack4_neon(m, borderm, top, bottom, left, right, pad_value);


Loading…
Cancel
Save