| @@ -28,8 +28,634 @@ static void conv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| const float* kernel = _kernel; | |||
| const float* bias = _bias; | |||
| int nn_outch = outch >> 1; | |||
| int remain_outch_start = nn_outch << 1; | |||
| #pragma omp parallel for | |||
| for (int p=0; p<outch; p++) | |||
| for (int pp=0; pp<nn_outch; pp++) | |||
| { | |||
| int p = pp * 2; | |||
| Mat out0 = top_blob.channel(p); | |||
| Mat out1 = top_blob.channel(p+1); | |||
| const float bias0 = bias ? bias[p] : 0.f; | |||
| const float bias1 = bias ? bias[p+1] : 0.f; | |||
| out0.fill(bias0); | |||
| out1.fill(bias1); | |||
| const float* k0 = kernel + p*inch*9; | |||
| const float* k1 = kernel + (p+1)*inch*9; | |||
| for (int q=0; q<inch; q++) | |||
| { | |||
| float* outptr0 = out0; | |||
| float* outptr1 = out1; | |||
| float* outptr0n = outptr0 + outw; | |||
| float* outptr1n = outptr1 + outw; | |||
| const float* img0 = bottom_blob.channel(q); | |||
| const float* r0 = img0; | |||
| const float* r1 = img0 + w; | |||
| const float* r2 = img0 + w*2; | |||
| const float* r3 = img0 + w*3; | |||
| #if __ARM_NEON | |||
| float32x4_t _k00 = vld1q_f32(k0); | |||
| float32x4_t _k03 = vld1q_f32(k0+3); | |||
| float32x4_t _k06 = vld1q_f32(k0+6); | |||
| float32x4_t _k10 = vld1q_f32(k1); | |||
| float32x4_t _k13 = vld1q_f32(k1+3); | |||
| float32x4_t _k16 = vld1q_f32(k1+6); | |||
| #endif // __ARM_NEON | |||
| int i = 0; | |||
| #if 1 | |||
| for (; i+1 < outh; i+=2) | |||
| { | |||
| #if __ARM_NEON | |||
| int nn = outw >> 2; | |||
| int remain = outw & 3; | |||
| #else | |||
| int remain = outw; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _sum0 = vld1q_f32(outptr0); | |||
| float32x4_t _sum1 = vld1q_f32(outptr1); | |||
| float32x4_t _sum0n = vld1q_f32(outptr0n); | |||
| float32x4_t _sum1n = vld1q_f32(outptr1n); | |||
| float32x4_t _r00 = vld1q_f32(r0); | |||
| float32x4_t _r00n = vld1q_f32(r0 + 4); | |||
| float32x4_t _r01 = vextq_f32(_r00, _r00n, 1); | |||
| float32x4_t _r02 = vextq_f32(_r00, _r00n, 2); | |||
| float32x4_t _r10 = vld1q_f32(r1); | |||
| float32x4_t _r10n = vld1q_f32(r1 + 4); | |||
| float32x4_t _r11 = vextq_f32(_r10, _r10n, 1); | |||
| float32x4_t _r12 = vextq_f32(_r10, _r10n, 2); | |||
| float32x4_t _r20 = vld1q_f32(r2); | |||
| float32x4_t _r20n = vld1q_f32(r2 + 4); | |||
| float32x4_t _r21 = vextq_f32(_r20, _r20n, 1); | |||
| float32x4_t _r22 = vextq_f32(_r20, _r20n, 2); | |||
| float32x4_t _r30 = vld1q_f32(r3); | |||
| float32x4_t _r30n = vld1q_f32(r3 + 4); | |||
| float32x4_t _r31 = vextq_f32(_r30, _r30n, 1); | |||
| float32x4_t _r32 = vextq_f32(_r30, _r30n, 2); | |||
| _sum0 = vfmaq_laneq_f32(_sum0, _r00, _k00, 0); | |||
| _sum0 = vfmaq_laneq_f32(_sum0, _r01, _k00, 1); | |||
| _sum0 = vfmaq_laneq_f32(_sum0, _r02, _k00, 2); | |||
| _sum0 = vfmaq_laneq_f32(_sum0, _r10, _k03, 0); | |||
| _sum0 = vfmaq_laneq_f32(_sum0, _r11, _k03, 1); | |||
| _sum0 = vfmaq_laneq_f32(_sum0, _r12, _k03, 2); | |||
| _sum0 = vfmaq_laneq_f32(_sum0, _r20, _k06, 0); | |||
| _sum0 = vfmaq_laneq_f32(_sum0, _r21, _k06, 1); | |||
| _sum0 = vfmaq_laneq_f32(_sum0, _r22, _k06, 2); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r00, _k10, 0); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r01, _k10, 1); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r02, _k10, 2); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r10, _k13, 0); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r11, _k13, 1); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r12, _k13, 2); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r20, _k16, 0); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r21, _k16, 1); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r22, _k16, 2); | |||
| _sum0n = vfmaq_laneq_f32(_sum0n, _r10, _k00, 0); | |||
| _sum0n = vfmaq_laneq_f32(_sum0n, _r11, _k00, 1); | |||
| _sum0n = vfmaq_laneq_f32(_sum0n, _r12, _k00, 2); | |||
| _sum0n = vfmaq_laneq_f32(_sum0n, _r20, _k03, 0); | |||
| _sum0n = vfmaq_laneq_f32(_sum0n, _r21, _k03, 1); | |||
| _sum0n = vfmaq_laneq_f32(_sum0n, _r22, _k03, 2); | |||
| _sum0n = vfmaq_laneq_f32(_sum0n, _r30, _k06, 0); | |||
| _sum0n = vfmaq_laneq_f32(_sum0n, _r31, _k06, 1); | |||
| _sum0n = vfmaq_laneq_f32(_sum0n, _r32, _k06, 2); | |||
| _sum1n = vfmaq_laneq_f32(_sum1n, _r10, _k10, 0); | |||
| _sum1n = vfmaq_laneq_f32(_sum1n, _r11, _k10, 1); | |||
| _sum1n = vfmaq_laneq_f32(_sum1n, _r12, _k10, 2); | |||
| _sum1n = vfmaq_laneq_f32(_sum1n, _r20, _k13, 0); | |||
| _sum1n = vfmaq_laneq_f32(_sum1n, _r21, _k13, 1); | |||
| _sum1n = vfmaq_laneq_f32(_sum1n, _r22, _k13, 2); | |||
| _sum1n = vfmaq_laneq_f32(_sum1n, _r30, _k16, 0); | |||
| _sum1n = vfmaq_laneq_f32(_sum1n, _r31, _k16, 1); | |||
| _sum1n = vfmaq_laneq_f32(_sum1n, _r32, _k16, 2); | |||
| vst1q_f32(outptr0, _sum0); | |||
| vst1q_f32(outptr1, _sum1); | |||
| vst1q_f32(outptr0n, _sum0n); | |||
| vst1q_f32(outptr1n, _sum1n); | |||
| r0 += 4; | |||
| r1 += 4; | |||
| r2 += 4; | |||
| r3 += 4; | |||
| outptr0 += 4; | |||
| outptr1 += 4; | |||
| outptr0n += 4; | |||
| outptr1n += 4; | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| "pld [%5, #192] \n" | |||
| "vld1.f32 {d16-d18}, [%5 :64] \n"// r0 | |||
| "add %5, #16 \n" | |||
| "pld [%8, #192] \n" | |||
| "vld1.f32 {d28-d31}, [%8] \n"// r3 | |||
| "add %8, #16 \n" | |||
| "vext.32 q10, q8, q9, #1 \n" | |||
| "vext.32 q11, q14, q15, #2 \n" | |||
| "0: \n" | |||
| "pld [%1, #128] \n" | |||
| "vld1.f32 {d12-d13}, [%1 :64] \n"// _sum0 | |||
| "pld [%2, #128] \n" | |||
| "vld1.f32 {d14-d15}, [%2 :64] \n"// _sum1 | |||
| "vmla.f32 q6, q8, %e18[0] \n" | |||
| "vmla.f32 q7, q8, %e21[0] \n" | |||
| "pld [%3, #128] \n" | |||
| "vld1.f32 {d24-d25}, [%3] \n"// _sum0n | |||
| "pld [%4, #128] \n" | |||
| "vld1.f32 {d26-d27}, [%4] \n"// _sum1n | |||
| "vmla.f32 q12, q14, %e20[0] \n" | |||
| "vmla.f32 q13, q14, %e23[0] \n" | |||
| "vext.32 q8, q8, q9, #2 \n" | |||
| "vext.32 q9, q14, q15, #1 \n" | |||
| "vmla.f32 q6, q10, %e18[1] \n" | |||
| "vmla.f32 q7, q10, %e21[1] \n" | |||
| "vmla.f32 q12, q11, %f20[0] \n" | |||
| "vmla.f32 q13, q11, %f23[0] \n" | |||
| "pld [%6, #192] \n" | |||
| "vld1.f32 {d28-d31}, [%6] \n"// r1 | |||
| "add %6, #16 \n" | |||
| "vmla.f32 q6, q8, %f18[0] \n" | |||
| "vmla.f32 q7, q8, %f21[0] \n" | |||
| "vmla.f32 q12, q9, %e20[1] \n" | |||
| "vmla.f32 q13, q9, %e23[1] \n" | |||
| "vext.32 q10, q14, q15, #1 \n" | |||
| "vmla.f32 q6, q14, %e19[0] \n" | |||
| "vmla.f32 q7, q14, %e22[0] \n" | |||
| "vmla.f32 q12, q14, %e18[0] \n" | |||
| "vmla.f32 q13, q14, %e21[0] \n" | |||
| "vext.32 q11, q14, q15, #2 \n" | |||
| "vmla.f32 q6, q10, %e19[1] \n" | |||
| "vmla.f32 q7, q10, %e22[1] \n" | |||
| "vmla.f32 q12, q10, %e18[1] \n" | |||
| "vmla.f32 q13, q10, %e21[1] \n" | |||
| "pld [%7, #192] \n" | |||
| "vld1.f32 {d16-d18}, [%7 :64] \n"// r2 | |||
| "add %7, #16 \n" | |||
| "vmla.f32 q6, q11, %f19[0] \n" | |||
| "vmla.f32 q7, q11, %f22[0] \n" | |||
| "vmla.f32 q12, q11, %f18[0] \n" | |||
| "vmla.f32 q13, q11, %f21[0] \n" | |||
| "vext.32 q10, q8, q9, #1 \n" | |||
| "vmla.f32 q6, q8, %e20[0] \n" | |||
| "vmla.f32 q7, q8, %e23[0] \n" | |||
| "vmla.f32 q12, q8, %e19[0] \n" | |||
| "vmla.f32 q13, q8, %e22[0] \n" | |||
| "vext.32 q11, q8, q9, #2 \n" | |||
| "vmla.f32 q6, q10, %e20[1] \n" | |||
| "vmla.f32 q7, q10, %e23[1] \n" | |||
| "vmla.f32 q12, q10, %e19[1] \n" | |||
| "vmla.f32 q13, q10, %e22[1] \n" | |||
| "pld [%5, #192] \n" | |||
| "vld1.f32 {d16-d18}, [%5 :64] \n"// r0 | |||
| "add %5, #16 \n" | |||
| "vmla.f32 q6, q11, %f20[0] \n" | |||
| "vmla.f32 q7, q11, %f23[0] \n" | |||
| "vmla.f32 q12, q11, %f19[0] \n" | |||
| "vmla.f32 q13, q11, %f22[0] \n" | |||
| "pld [%8, #192] \n" | |||
| "vld1.f32 {d28-d31}, [%8] \n"// r3 | |||
| "add %8, #16 \n" | |||
| "vext.32 q10, q8, q9, #1 \n" | |||
| "vst1.f32 {d12-d13}, [%1 : 64]!\n" | |||
| "vst1.f32 {d14-d15}, [%2 : 64]!\n" | |||
| "vext.32 q11, q14, q15, #2 \n" | |||
| "vst1.f32 {d24-d25}, [%3]! \n" | |||
| "vst1.f32 {d26-d27}, [%4]! \n" | |||
| "subs %0, #1 \n" | |||
| "bne 0b \n" | |||
| "sub %5, #16 \n" | |||
| "sub %8, #16 \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(outptr0), // %1 | |||
| "=r"(outptr1), // %2 | |||
| "=r"(outptr0n), // %3 | |||
| "=r"(outptr1n), // %4 | |||
| "=r"(r0), // %5 | |||
| "=r"(r1), // %6 | |||
| "=r"(r2), // %7 | |||
| "=r"(r3) // %8 | |||
| : "0"(nn), | |||
| "1"(outptr0), | |||
| "2"(outptr1), | |||
| "3"(outptr0n), | |||
| "4"(outptr1n), | |||
| "5"(r0), | |||
| "6"(r1), | |||
| "7"(r2), | |||
| "8"(r3), | |||
| "w"(_k00), // %18 | |||
| "w"(_k03), // %19 | |||
| "w"(_k06), // %20 | |||
| "w"(_k10), // %21 | |||
| "w"(_k13), // %22 | |||
| "w"(_k16) // %23 | |||
| : "cc", "memory", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" | |||
| ); | |||
| } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| #if __ARM_NEON | |||
| float32x4_t _r00 = vld1q_f32(r0); | |||
| float32x4_t _r10 = vld1q_f32(r1); | |||
| float32x4_t _r20 = vld1q_f32(r2); | |||
| float32x4_t _r30 = vld1q_f32(r3); | |||
| float32x4_t _sum0 = vmulq_f32(_r00, _k00); | |||
| float32x4_t _sum1 = vmulq_f32(_r00, _k10); | |||
| _sum0 = vmlaq_f32(_sum0, _r10, _k03); | |||
| _sum1 = vmlaq_f32(_sum1, _r10, _k13); | |||
| _sum0 = vmlaq_f32(_sum0, _r20, _k06); | |||
| _sum1 = vmlaq_f32(_sum1, _r20, _k16); | |||
| float32x4_t _sum0n = vmulq_f32(_r10, _k00); | |||
| float32x4_t _sum1n = vmulq_f32(_r10, _k10); | |||
| _sum0n = vmlaq_f32(_sum0n, _r20, _k03); | |||
| _sum1n = vmlaq_f32(_sum1n, _r20, _k13); | |||
| _sum0n = vmlaq_f32(_sum0n, _r30, _k06); | |||
| _sum1n = vmlaq_f32(_sum1n, _r30, _k16); | |||
| _sum0 = vsetq_lane_f32(*outptr0, _sum0, 3); | |||
| _sum1 = vsetq_lane_f32(*outptr1, _sum1, 3); | |||
| _sum0n = vsetq_lane_f32(*outptr0n, _sum0n, 3); | |||
| _sum1n = vsetq_lane_f32(*outptr1n, _sum1n, 3); | |||
| #if __aarch64__ | |||
| *outptr0 = vaddvq_f32(_sum0); | |||
| *outptr1 = vaddvq_f32(_sum1); | |||
| *outptr0n = vaddvq_f32(_sum0n); | |||
| *outptr1n = vaddvq_f32(_sum1n); | |||
| #else | |||
| float32x2_t _ss0 = vadd_f32(vget_low_f32(_sum0), vget_high_f32(_sum0)); | |||
| float32x2_t _ss1 = vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); | |||
| float32x2_t _ss0n = vadd_f32(vget_low_f32(_sum0n), vget_high_f32(_sum0n)); | |||
| float32x2_t _ss1n = vadd_f32(vget_low_f32(_sum1n), vget_high_f32(_sum1n)); | |||
| float32x2_t _ss01 = vpadd_f32(_ss0, _ss1); | |||
| float32x2_t _ss01n = vpadd_f32(_ss0n, _ss1n); | |||
| *outptr0 = vget_lane_f32(_ss01, 0); | |||
| *outptr1 = vget_lane_f32(_ss01, 1); | |||
| *outptr0n = vget_lane_f32(_ss01n, 0); | |||
| *outptr1n = vget_lane_f32(_ss01n, 1); | |||
| #endif // __aarch64__ | |||
| #else | |||
| float sum0 = 0.f; | |||
| float sum0n = 0.f; | |||
| float sum1 = 0.f; | |||
| float sum1n = 0.f; | |||
| sum0 += r0[0] * k0[0]; | |||
| sum0 += r0[1] * k0[1]; | |||
| sum0 += r0[2] * k0[2]; | |||
| sum0 += r1[0] * k0[3]; | |||
| sum0 += r1[1] * k0[4]; | |||
| sum0 += r1[2] * k0[5]; | |||
| sum0 += r2[0] * k0[6]; | |||
| sum0 += r2[1] * k0[7]; | |||
| sum0 += r2[2] * k0[8]; | |||
| sum1 += r0[0] * k1[0]; | |||
| sum1 += r0[1] * k1[1]; | |||
| sum1 += r0[2] * k1[2]; | |||
| sum1 += r1[0] * k1[3]; | |||
| sum1 += r1[1] * k1[4]; | |||
| sum1 += r1[2] * k1[5]; | |||
| sum1 += r2[0] * k1[6]; | |||
| sum1 += r2[1] * k1[7]; | |||
| sum1 += r2[2] * k1[8]; | |||
| sum0n += r1[0] * k0[0]; | |||
| sum0n += r1[1] * k0[1]; | |||
| sum0n += r1[2] * k0[2]; | |||
| sum0n += r2[0] * k0[3]; | |||
| sum0n += r2[1] * k0[4]; | |||
| sum0n += r2[2] * k0[5]; | |||
| sum0n += r3[0] * k0[6]; | |||
| sum0n += r3[1] * k0[7]; | |||
| sum0n += r3[2] * k0[8]; | |||
| sum1n += r1[0] * k1[0]; | |||
| sum1n += r1[1] * k1[1]; | |||
| sum1n += r1[2] * k1[2]; | |||
| sum1n += r2[0] * k1[3]; | |||
| sum1n += r2[1] * k1[4]; | |||
| sum1n += r2[2] * k1[5]; | |||
| sum1n += r3[0] * k1[6]; | |||
| sum1n += r3[1] * k1[7]; | |||
| sum1n += r3[2] * k1[8]; | |||
| *outptr0 += sum0; | |||
| *outptr1 += sum1; | |||
| *outptr0n += sum0n; | |||
| *outptr1n += sum1n; | |||
| #endif // __ARM_NEON | |||
| r0++; | |||
| r1++; | |||
| r2++; | |||
| r3++; | |||
| outptr0++; | |||
| outptr1++; | |||
| outptr0n++; | |||
| outptr1n++; | |||
| } | |||
| r0 += 2 + w; | |||
| r1 += 2 + w; | |||
| r2 += 2 + w; | |||
| r3 += 2 + w; | |||
| outptr0 += outw; | |||
| outptr1 += outw; | |||
| outptr0n += outw; | |||
| outptr1n += outw; | |||
| } | |||
| #endif | |||
| for (; i < outh; i++) | |||
| { | |||
| #if __ARM_NEON | |||
| int nn = outw >> 2; | |||
| int remain = outw & 3; | |||
| #else | |||
| int remain = outw; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _sum0 = vld1q_f32(outptr0); | |||
| float32x4_t _sum1 = vld1q_f32(outptr1); | |||
| float32x4_t _r00 = vld1q_f32(r0); | |||
| float32x4_t _r00n = vld1q_f32(r0 + 4); | |||
| float32x4_t _r01 = vextq_f32(_r00, _r00n, 1); | |||
| float32x4_t _r02 = vextq_f32(_r00, _r00n, 2); | |||
| float32x4_t _r10 = vld1q_f32(r1); | |||
| float32x4_t _r10n = vld1q_f32(r1 + 4); | |||
| float32x4_t _r11 = vextq_f32(_r10, _r10n, 1); | |||
| float32x4_t _r12 = vextq_f32(_r10, _r10n, 2); | |||
| float32x4_t _r20 = vld1q_f32(r2); | |||
| float32x4_t _r20n = vld1q_f32(r2 + 4); | |||
| float32x4_t _r21 = vextq_f32(_r20, _r20n, 1); | |||
| float32x4_t _r22 = vextq_f32(_r20, _r20n, 2); | |||
| _sum0 = vfmaq_laneq_f32(_sum0, _r00, _k00, 0); | |||
| _sum0 = vfmaq_laneq_f32(_sum0, _r01, _k00, 1); | |||
| _sum0 = vfmaq_laneq_f32(_sum0, _r02, _k00, 2); | |||
| _sum0 = vfmaq_laneq_f32(_sum0, _r10, _k03, 0); | |||
| _sum0 = vfmaq_laneq_f32(_sum0, _r11, _k03, 1); | |||
| _sum0 = vfmaq_laneq_f32(_sum0, _r12, _k03, 2); | |||
| _sum0 = vfmaq_laneq_f32(_sum0, _r20, _k06, 0); | |||
| _sum0 = vfmaq_laneq_f32(_sum0, _r21, _k06, 1); | |||
| _sum0 = vfmaq_laneq_f32(_sum0, _r22, _k06, 2); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r00, _k10, 0); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r01, _k10, 1); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r02, _k10, 2); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r10, _k13, 0); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r11, _k13, 1); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r12, _k13, 2); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r20, _k16, 0); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r21, _k16, 1); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r22, _k16, 2); | |||
| vst1q_f32(outptr0, _sum0); | |||
| vst1q_f32(outptr1, _sum1); | |||
| r0 += 4; | |||
| r1 += 4; | |||
| r2 += 4; | |||
| outptr0 += 4; | |||
| outptr1 += 4; | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| "0: \n" | |||
| "pld [%3, #192] \n" | |||
| "vld1.f32 {d16-d18}, [%3] \n"// r0 | |||
| "add %3, #16 \n" | |||
| "pld [%1, #128] \n" | |||
| "vld1.f32 {d12-d13}, [%1] \n"// _sum0 | |||
| "pld [%2, #128] \n" | |||
| "vld1.f32 {d14-d15}, [%2] \n"// _sum1 | |||
| "vmul.f32 q14, q8, %e12[0] \n" | |||
| "vmul.f32 q15, q8, %e15[0] \n" | |||
| "vext.32 q10, q8, q9, #1 \n" | |||
| "vext.32 q11, q8, q9, #2 \n" | |||
| "vmla.f32 q6, q10, %e12[1] \n" | |||
| "vmla.f32 q7, q10, %e15[1] \n" | |||
| "pld [%4, #192] \n" | |||
| "vld1.f32 {d16-d18}, [%4] \n"// r1 | |||
| "add %4, #16 \n" | |||
| "vmla.f32 q14, q11, %f12[0] \n" | |||
| "vmla.f32 q15, q11, %f15[0] \n" | |||
| "vmla.f32 q6, q8, %e13[0] \n" | |||
| "vmla.f32 q7, q8, %e16[0] \n" | |||
| "vext.32 q10, q8, q9, #1 \n" | |||
| "vext.32 q11, q8, q9, #2 \n" | |||
| "vmla.f32 q14, q10, %e13[1] \n" | |||
| "vmla.f32 q15, q10, %e16[1] \n" | |||
| "pld [%5, #192] \n" | |||
| "vld1.f32 {d16-d18}, [%5] \n"// r2 | |||
| "add %5, #16 \n" | |||
| "vmla.f32 q6, q11, %f13[0] \n" | |||
| "vmla.f32 q7, q11, %f16[0] \n" | |||
| "vmla.f32 q14, q8, %e14[0] \n" | |||
| "vmla.f32 q15, q8, %e17[0] \n" | |||
| "vext.32 q10, q8, q9, #1 \n" | |||
| "vext.32 q11, q8, q9, #2 \n" | |||
| "vmla.f32 q6, q10, %e14[1] \n" | |||
| "vmla.f32 q7, q10, %e17[1] \n" | |||
| "vmla.f32 q14, q11, %f14[0] \n" | |||
| "vmla.f32 q15, q11, %f17[0] \n" | |||
| "vadd.f32 q6, q6, q14 \n" | |||
| "vadd.f32 q7, q7, q15 \n" | |||
| "vst1.f32 {d12-d13}, [%1]! \n" | |||
| "vst1.f32 {d14-d15}, [%2]! \n" | |||
| "subs %0, #1 \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(outptr0), // %1 | |||
| "=r"(outptr1), // %2 | |||
| "=r"(r0), // %3 | |||
| "=r"(r1), // %4 | |||
| "=r"(r2) // %5 | |||
| : "0"(nn), | |||
| "1"(outptr0), | |||
| "2"(outptr1), | |||
| "3"(r0), | |||
| "4"(r1), | |||
| "5"(r2), | |||
| "w"(_k00), // %12 | |||
| "w"(_k03), // %13 | |||
| "w"(_k06), // %14 | |||
| "w"(_k10), // %15 | |||
| "w"(_k13), // %16 | |||
| "w"(_k16) // %17 | |||
| : "cc", "memory", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" | |||
| ); | |||
| } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| #if __ARM_NEON | |||
| float32x4_t _r00 = vld1q_f32(r0); | |||
| float32x4_t _r10 = vld1q_f32(r1); | |||
| float32x4_t _r20 = vld1q_f32(r2); | |||
| float32x4_t _sum0 = vmulq_f32(_r00, _k00); | |||
| float32x4_t _sum1 = vmulq_f32(_r00, _k10); | |||
| _sum0 = vmlaq_f32(_sum0, _r10, _k03); | |||
| _sum1 = vmlaq_f32(_sum1, _r10, _k13); | |||
| _sum0 = vmlaq_f32(_sum0, _r20, _k06); | |||
| _sum1 = vmlaq_f32(_sum1, _r20, _k16); | |||
| _sum0 = vsetq_lane_f32(*outptr0, _sum0, 3); | |||
| _sum1 = vsetq_lane_f32(*outptr1, _sum1, 3); | |||
| #if __aarch64__ | |||
| *outptr0 = vaddvq_f32(_sum0); | |||
| *outptr1 = vaddvq_f32(_sum1); | |||
| #else | |||
| float32x2_t _ss0 = vadd_f32(vget_low_f32(_sum0), vget_high_f32(_sum0)); | |||
| float32x2_t _ss1 = vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); | |||
| float32x2_t _ss01 = vpadd_f32(_ss0, _ss1); | |||
| *outptr0 = vget_lane_f32(_ss01, 0); | |||
| *outptr1 = vget_lane_f32(_ss01, 1); | |||
| #endif // __aarch64__ | |||
| #else | |||
| float sum0 = 0.f; | |||
| float sum1 = 0.f; | |||
| sum0 += r0[0] * k0[0]; | |||
| sum0 += r0[1] * k0[1]; | |||
| sum0 += r0[2] * k0[2]; | |||
| sum0 += r1[0] * k0[3]; | |||
| sum0 += r1[1] * k0[4]; | |||
| sum0 += r1[2] * k0[5]; | |||
| sum0 += r2[0] * k0[6]; | |||
| sum0 += r2[1] * k0[7]; | |||
| sum0 += r2[2] * k0[8]; | |||
| sum1 += r0[0] * k1[0]; | |||
| sum1 += r0[1] * k1[1]; | |||
| sum1 += r0[2] * k1[2]; | |||
| sum1 += r1[0] * k1[3]; | |||
| sum1 += r1[1] * k1[4]; | |||
| sum1 += r1[2] * k1[5]; | |||
| sum1 += r2[0] * k1[6]; | |||
| sum1 += r2[1] * k1[7]; | |||
| sum1 += r2[2] * k1[8]; | |||
| *outptr0 += sum0; | |||
| *outptr1 += sum1; | |||
| #endif // __ARM_NEON | |||
| r0++; | |||
| r1++; | |||
| r2++; | |||
| outptr0++; | |||
| outptr1++; | |||
| } | |||
| r0 += 2; | |||
| r1 += 2; | |||
| r2 += 2; | |||
| } | |||
| k0 += 9; | |||
| k1 += 9; | |||
| } | |||
| } | |||
| #pragma omp parallel for | |||
| for (int p=remain_outch_start; p<outch; p++) | |||
| { | |||
| Mat out = top_blob.channel(p); | |||