diff --git a/src/layer/arm/padding_arm.cpp b/src/layer/arm/padding_arm.cpp index 4f02c40dd..7f6bb4df1 100644 --- a/src/layer/arm/padding_arm.cpp +++ b/src/layer/arm/padding_arm.cpp @@ -30,7 +30,7 @@ Padding_arm::Padding_arm() } #if __ARM_NEON -static void padding_constant_pack4_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right, float v) +static void padding_constant_pack4_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right, float32x4_t v) { const float* ptr = src; float* outptr = dst; @@ -43,10 +43,10 @@ static void padding_constant_pack4_neon(const Mat& src, Mat& dst, int top, int b #if __aarch64__ asm volatile( - "dup v0.4s, %w10 \n" - "dup v1.4s, %w10 \n" - "dup v2.4s, %w10 \n" - "dup v3.4s, %w10 \n" + "mov v0.16b, %10.16b \n" + "mov v1.16b, %10.16b \n" + "mov v2.16b, %10.16b \n" + "mov v3.16b, %10.16b \n" // fill top "lsr w4, %w8, #3 \n"// w4 = nn = top_size >> 3 @@ -198,15 +198,15 @@ static void padding_constant_pack4_neon(const Mat& src, Mat& dst, int top, int b "r"(right), // %7 "r"(top_size), // %8 "r"(bottom_size), // %9 - "r"(v) // %10 + "w"(v) // %10 : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" ); #else // __aarch64__ asm volatile( - "vdup.f32 q0, %10 \n" - "vdup.f32 q1, %10 \n" - "vdup.f32 q2, %10 \n" - "vdup.f32 q3, %10 \n" + "vmov.f32 q0, %q10 \n" + "vmov.f32 q1, %q10 \n" + "vmov.f32 q2, %q10 \n" + "vmov.f32 q3, %q10 \n" // fill top "lsr r4, %8, #3 \n"// r4 = nn = top_size >> 3 @@ -358,7 +358,7 @@ static void padding_constant_pack4_neon(const Mat& src, Mat& dst, int top, int b "r"(right), // %7 "r"(top_size), // %8 "r"(bottom_size), // %9 - "r"(v) // %10 + "w"(v) // %10 : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); #endif // __aarch64__ @@ -553,7 +553,7 @@ int Padding_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op return -100; if (type == 0) - 
padding_constant_pack4_neon(bottom_blob, top_blob, 0, 0, left, right, value); + padding_constant_pack4_neon(bottom_blob, top_blob, 0, 0, left, right, vdupq_n_f32(value)); else if (type == 1) padding_replicate_pack4_neon(bottom_blob, top_blob, 0, 0, left, right); else // if (type == 2) @@ -571,7 +571,7 @@ int Padding_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op return -100; if (type == 0) - padding_constant_pack4_neon(bottom_blob, top_blob, top, bottom, left, right, value); + padding_constant_pack4_neon(bottom_blob, top_blob, top, bottom, left, right, vdupq_n_f32(value)); else if (type == 1) padding_replicate_pack4_neon(bottom_blob, top_blob, top, bottom, left, right); else // if (type == 2) @@ -592,7 +592,7 @@ int Padding_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op const Mat m = bottom_blob.channel(q); Mat borderm = top_blob.channel(q); - float pad_value = per_channel_pad_data_size ? per_channel_pad_data[q] : value; + float32x4_t pad_value = per_channel_pad_data_size ? vld1q_f32((const float*)per_channel_pad_data + q * 4) : vdupq_n_f32(value); if (type == 0) padding_constant_pack4_neon(m, borderm, top, bottom, left, right, pad_value);