diff --git a/src/layer/arm/convolution_3x3_pack4.h b/src/layer/arm/convolution_3x3_pack4.h index 10682bec2..92af139ca 100644 --- a/src/layer/arm/convolution_3x3_pack4.h +++ b/src/layer/arm/convolution_3x3_pack4.h @@ -67,16 +67,130 @@ static void conv3x3s1_winograd64_transform_kernel_pack4_neon(const Mat& kernel, // interleave // src = 64-inch-outch // dst = 4b-4a-inch/4a-64-outch/4b; +#if __aarch64__ + kernel_tm_pack4.create(2 * inch/4, 64, (outch/4)/2 + (outch/4)%2, (size_t)4u*16, 16); +#else kernel_tm_pack4.create(inch/4, 64, outch/4, (size_t)4u*16, 16); +#endif + + int q=0; +#if __aarch64__ + for (; q+7 0 @@ -677,11 +789,8 @@ static void conv3x3s1_winograd64_pack4_neon(const Mat& bottom_blob, Mat& top_blo "prfm pldl1keep, [%3, #512] \n" "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n" - "prfm pldl1keep, [%4, #128] \n" - "ld1 {v4.4s, v5.4s}, [%4], #32 \n"// w0123_0 - - "prfm pldl1keep, [%5, #128] \n" - "ld1 {v6.4s, v7.4s}, [%5], #32 \n"// w0123_1 + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%4], #64 \n"// w0011_01 "fmla v8.4s, v4.4s, v0.s[0] \n" "fmla v9.4s, v4.4s, v0.s[1] \n" @@ -696,23 +805,23 @@ static void conv3x3s1_winograd64_pack4_neon(const Mat& bottom_blob, Mat& top_blo "fmla v18.4s, v4.4s, v2.s[2] \n" "fmla v19.4s, v4.4s, v2.s[3] \n" - "fmla v20.4s, v6.4s, v0.s[0] \n" - "fmla v21.4s, v6.4s, v0.s[1] \n" - "fmla v22.4s, v6.4s, v0.s[2] \n" - "fmla v23.4s, v6.4s, v0.s[3] \n" - "fmla v24.4s, v6.4s, v1.s[0] \n" - "fmla v25.4s, v6.4s, v1.s[1] \n" - "fmla v26.4s, v6.4s, v1.s[2] \n" - "fmla v27.4s, v6.4s, v1.s[3] \n" - "fmla v28.4s, v6.4s, v2.s[0] \n" - "fmla v29.4s, v6.4s, v2.s[1] \n" - "fmla v30.4s, v6.4s, v2.s[2] \n" - "fmla v31.4s, v6.4s, v2.s[3] \n" - - "fmla v8.4s, v5.4s, v3.s[0] \n" - "fmla v9.4s, v5.4s, v3.s[1] \n" - "fmla v10.4s, v5.4s, v3.s[2] \n" - "fmla v11.4s, v5.4s, v3.s[3] \n" + "fmla v20.4s, v5.4s, v0.s[0] \n" + "fmla v21.4s, v5.4s, v0.s[1] \n" + "fmla v22.4s, v5.4s, v0.s[2] \n" + "fmla v23.4s, v5.4s, v0.s[3] \n" + "fmla v24.4s, v5.4s, v1.s[0] \n" + "fmla v25.4s, v5.4s, v1.s[1] \n" + "fmla v26.4s, v5.4s, v1.s[2] \n" + "fmla v27.4s, v5.4s, v1.s[3] \n" + "fmla v28.4s, v5.4s, v2.s[0] \n" + "fmla v29.4s, v5.4s, v2.s[1] \n" + "fmla v30.4s, v5.4s, v2.s[2] \n" + "fmla v31.4s, v5.4s, v2.s[3] \n" + + "fmla v8.4s, v6.4s, v3.s[0] \n" + "fmla v9.4s, v6.4s, v3.s[1] \n" + "fmla v10.4s, v6.4s, v3.s[2] \n" + "fmla v11.4s, v6.4s, v3.s[3] \n" "fmla v20.4s, v7.4s, v3.s[0] \n" "fmla v21.4s, v7.4s, v3.s[1] \n" @@ -722,14 +831,14 @@ static void conv3x3s1_winograd64_pack4_neon(const Mat& bottom_blob, Mat& top_blo "prfm pldl1keep, [%3, #512] \n" "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n" - "fmla v12.4s, v5.4s, v0.s[0] \n" - "fmla v13.4s, v5.4s, v0.s[1] \n" - "fmla v14.4s, v5.4s, v0.s[2] \n" - "fmla v15.4s, v5.4s, v0.s[3] \n" - "fmla v16.4s, v5.4s, v1.s[0] \n" - "fmla v17.4s, v5.4s, v1.s[1] \n" - "fmla v18.4s, v5.4s, v1.s[2] \n" - "fmla v19.4s, v5.4s, v1.s[3] \n" + "fmla v12.4s, v6.4s, v0.s[0] \n" + "fmla v13.4s, v6.4s, v0.s[1] \n" + "fmla v14.4s, v6.4s, v0.s[2] \n" + "fmla v15.4s, v6.4s, v0.s[3] \n" + "fmla v16.4s, v6.4s, v1.s[0] \n" + "fmla v17.4s, v6.4s, v1.s[1] \n" + "fmla v18.4s, v6.4s, v1.s[2] \n" + "fmla v19.4s, v6.4s, v1.s[3] \n" "fmla v24.4s, v7.4s, v0.s[0] \n" "fmla v25.4s, v7.4s, v0.s[1] \n" @@ -740,11 +849,8 @@ static void conv3x3s1_winograd64_pack4_neon(const Mat& bottom_blob, Mat& top_blo "fmla v30.4s, v7.4s, v1.s[2] \n" "fmla v31.4s, v7.4s, v1.s[3] \n" - "prfm pldl1keep, [%4, #128] \n" - "ld1 {v4.4s, v5.4s}, [%4], #32 \n"// w0123_0 - - "prfm pldl1keep, [%5, #128] \n" - "ld1 {v6.4s, v7.4s}, [%5], #32 \n"// w0123_1 + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%4], #64 \n"// w2233_01 "fmla v8.4s, v4.4s, v2.s[0] \n" "fmla v9.4s, v4.4s, v2.s[1] \n" @@ -755,14 +861,14 @@ static void conv3x3s1_winograd64_pack4_neon(const Mat& bottom_blob, Mat& top_blo "fmla v14.4s, v4.4s, v3.s[2] \n" "fmla v15.4s, v4.4s, v3.s[3] \n" - "fmla v20.4s, v6.4s, v2.s[0] \n" - "fmla v21.4s, v6.4s, v2.s[1] \n" - "fmla v22.4s, v6.4s, v2.s[2] \n" - "fmla v23.4s, v6.4s, v2.s[3] \n" - "fmla v24.4s, v6.4s, v3.s[0] \n" - "fmla v25.4s, v6.4s, v3.s[1] \n" - "fmla v26.4s, v6.4s, v3.s[2] \n" - "fmla v27.4s, v6.4s, v3.s[3] \n" + "fmla v20.4s, v5.4s, v2.s[0] \n" + "fmla v21.4s, v5.4s, v2.s[1] \n" + "fmla v22.4s, v5.4s, v2.s[2] \n" + "fmla v23.4s, v5.4s, v2.s[3] \n" + "fmla v24.4s, v5.4s, v3.s[0] \n" + "fmla v25.4s, v5.4s, v3.s[1] \n" + "fmla v26.4s, v5.4s, v3.s[2] \n" + "fmla v27.4s, v5.4s, v3.s[3] \n" "prfm pldl1keep, [%3, #512] \n" "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n" @@ -772,23 +878,23 @@ static void conv3x3s1_winograd64_pack4_neon(const Mat& bottom_blob, Mat& top_blo "fmla v18.4s, v4.4s, v0.s[2] \n" "fmla v19.4s, v4.4s, v0.s[3] \n" - "fmla v28.4s, v6.4s, v0.s[0] \n" - "fmla v29.4s, v6.4s, v0.s[1] \n" - "fmla v30.4s, v6.4s, v0.s[2] \n" - "fmla v31.4s, v6.4s, v0.s[3] \n" - - "fmla v8.4s, v5.4s, v1.s[0] \n" - "fmla v9.4s, v5.4s, v1.s[1] \n" - "fmla v10.4s, v5.4s, v1.s[2] \n" - "fmla v11.4s, v5.4s, v1.s[3] \n" - "fmla v12.4s, v5.4s, v2.s[0] \n" - "fmla v13.4s, v5.4s, v2.s[1] \n" - "fmla v14.4s, v5.4s, v2.s[2] \n" - "fmla v15.4s, v5.4s, v2.s[3] \n" - "fmla v16.4s, v5.4s, v3.s[0] \n" - "fmla v17.4s, v5.4s, v3.s[1] \n" - "fmla v18.4s, v5.4s, v3.s[2] \n" - "fmla v19.4s, v5.4s, v3.s[3] \n" + "fmla v28.4s, v5.4s, v0.s[0] \n" + "fmla v29.4s, v5.4s, v0.s[1] \n" + "fmla v30.4s, v5.4s, v0.s[2] \n" + "fmla v31.4s, v5.4s, v0.s[3] \n" + + "fmla v8.4s, v6.4s, v1.s[0] \n" + "fmla v9.4s, v6.4s, v1.s[1] \n" + "fmla v10.4s, v6.4s, v1.s[2] \n" + "fmla v11.4s, v6.4s, v1.s[3] \n" + "fmla v12.4s, v6.4s, v2.s[0] \n" + "fmla v13.4s, v6.4s, v2.s[1] \n" + "fmla v14.4s, v6.4s, v2.s[2] \n" + "fmla v15.4s, v6.4s, v2.s[3] \n" + "fmla v16.4s, v6.4s, v3.s[0] \n" + "fmla v17.4s, v6.4s, v3.s[1] \n" + "fmla v18.4s, v6.4s, v3.s[2] \n" + "fmla v19.4s, v6.4s, v3.s[3] \n" "subs %w0, %w0, #1 \n" @@ -818,14 +924,12 @@ static void conv3x3s1_winograd64_pack4_neon(const Mat& bottom_blob, Mat& top_blo "=r"(output0_tm), // %1 "=r"(output1_tm), // %2 "=r"(r0), // %3 - "=r"(k0), // %4 - "=r"(k1) // %5 + "=r"(k01) // %4 : "0"(nn), "1"(output0_tm), "2"(output1_tm), "3"(r0), - "4"(k0), - "5"(k1) + "4"(k01) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" ); } @@ -833,8 +937,7 @@ static void conv3x3s1_winograd64_pack4_neon(const Mat& bottom_blob, Mat& top_blo { const float* r0 = bb2.row(i/12 + (i%12)/8); - const float* k0 = kernel0_tm.row(r); - const float* k1 = kernel1_tm.row(r); + const float* k01 = kernel01_tm.row(r); int nn = inch;// inch always > 0 @@ -862,77 +965,76 @@ static void conv3x3s1_winograd64_pack4_neon(const Mat& bottom_blob, Mat& top_blo "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n"// r0 r1 r2 r3 "prfm pldl1keep, [%4, #512] \n" - "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n"// w0123_0 + "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n"// w0011_01 + + "prfm pldl1keep, [%3, #512] \n" + "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%3], #64 \n"// r4 r5 r6 r7 "fmla v16.4s, v8.4s, v0.s[0] \n" "fmla v17.4s, v8.4s, v1.s[0] \n" "fmla v18.4s, v8.4s, v2.s[0] \n" "fmla v19.4s, v8.4s, v3.s[0] \n" - - "prfm pldl1keep, [%3, #512] \n" - "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%3], #64 \n"// r4 r5 r6 r7 - "fmla v20.4s, v8.4s, v4.s[0] \n" "fmla v21.4s, v8.4s, v5.s[0] \n" "fmla v22.4s, v8.4s, v6.s[0] \n" "fmla v23.4s, v8.4s, v7.s[0] \n" - "fmla v16.4s, v9.4s, v0.s[1] \n" - "fmla v17.4s, v9.4s, v1.s[1] \n" - "fmla v18.4s, v9.4s, v2.s[1] \n" - "fmla v19.4s, v9.4s, v3.s[1] \n" - "fmla v20.4s, v9.4s, v4.s[1] \n" - "fmla v21.4s, v9.4s, v5.s[1] \n" - "fmla v22.4s, v9.4s, v6.s[1] \n" - "fmla v23.4s, v9.4s, v7.s[1] \n" - - "fmla v16.4s, v10.4s, v0.s[2] \n" - "fmla v17.4s, v10.4s, v1.s[2] \n" - "fmla v18.4s, v10.4s, v2.s[2] \n" - "fmla v19.4s, v10.4s, v3.s[2] \n" - "fmla v20.4s, v10.4s, v4.s[2] \n" - "fmla v21.4s, v10.4s, v5.s[2] \n" - "fmla v22.4s, v10.4s, v6.s[2] \n" - "fmla v23.4s, v10.4s, v7.s[2] \n" - - "prfm pldl1keep, [%5, #512] \n" - "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%5], #64 \n"// w0123_1 - - "fmla v16.4s, v11.4s, v0.s[3] \n" - "fmla v17.4s, v11.4s, v1.s[3] \n" - "fmla v18.4s, v11.4s, v2.s[3] \n" - "fmla v19.4s, v11.4s, v3.s[3] \n" - "fmla v20.4s, v11.4s, v4.s[3] \n" - "fmla v21.4s, v11.4s, v5.s[3] \n" - "fmla v22.4s, v11.4s, v6.s[3] \n" - "fmla v23.4s, v11.4s, v7.s[3] \n" + "fmla v24.4s, v9.4s, v0.s[0] \n" + "fmla v25.4s, v9.4s, v1.s[0] \n" + "fmla v26.4s, v9.4s, v2.s[0] \n" + "fmla v27.4s, v9.4s, v3.s[0] \n" + "fmla v28.4s, v9.4s, v4.s[0] \n" + "fmla v29.4s, v9.4s, v5.s[0] \n" + "fmla v30.4s, v9.4s, v6.s[0] \n" + "fmla v31.4s, v9.4s, v7.s[0] \n" - "fmla v24.4s, v12.4s, v0.s[0] \n" - "fmla v25.4s, v12.4s, v1.s[0] \n" - "fmla v26.4s, v12.4s, v2.s[0] \n" - "fmla v27.4s, v12.4s, v3.s[0] \n" - "fmla v28.4s, v12.4s, v4.s[0] \n" - "fmla v29.4s, v12.4s, v5.s[0] \n" - "fmla v30.4s, v12.4s, v6.s[0] \n" - "fmla v31.4s, v12.4s, v7.s[0] \n" - - "fmla v24.4s, v13.4s, v0.s[1] \n" - "fmla v25.4s, v13.4s, v1.s[1] \n" - "fmla v26.4s, v13.4s, v2.s[1] \n" - "fmla v27.4s, v13.4s, v3.s[1] \n" - "fmla v28.4s, v13.4s, v4.s[1] \n" - "fmla v29.4s, v13.4s, v5.s[1] \n" - "fmla v30.4s, v13.4s, v6.s[1] \n" - "fmla v31.4s, v13.4s, v7.s[1] \n" - - "fmla v24.4s, v14.4s, v0.s[2] \n" - "fmla v25.4s, v14.4s, v1.s[2] \n" - "fmla v26.4s, v14.4s, v2.s[2] \n" - "fmla v27.4s, v14.4s, v3.s[2] \n" - "fmla v28.4s, v14.4s, v4.s[2] \n" - "fmla v29.4s, v14.4s, v5.s[2] \n" - "fmla v30.4s, v14.4s, v6.s[2] \n" - "fmla v31.4s, v14.4s, v7.s[2] \n" + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%4], #64 \n"// w2233_01 + + "fmla v16.4s, v10.4s, v0.s[1] \n" + "fmla v17.4s, v10.4s, v1.s[1] \n" + "fmla v18.4s, v10.4s, v2.s[1] \n" + "fmla v19.4s, v10.4s, v3.s[1] \n" + "fmla v20.4s, v10.4s, v4.s[1] \n" + "fmla v21.4s, v10.4s, v5.s[1] \n" + "fmla v22.4s, v10.4s, v6.s[1] \n" + "fmla v23.4s, v10.4s, v7.s[1] \n" + + "fmla v24.4s, v11.4s, v0.s[1] \n" + "fmla v25.4s, v11.4s, v1.s[1] \n" + "fmla v26.4s, v11.4s, v2.s[1] \n" + "fmla v27.4s, v11.4s, v3.s[1] \n" + "fmla v28.4s, v11.4s, v4.s[1] \n" + "fmla v29.4s, v11.4s, v5.s[1] \n" + "fmla v30.4s, v11.4s, v6.s[1] \n" + "fmla v31.4s, v11.4s, v7.s[1] \n" + + "fmla v16.4s, v12.4s, v0.s[2] \n" + "fmla v17.4s, v12.4s, v1.s[2] \n" + "fmla v18.4s, v12.4s, v2.s[2] \n" + "fmla v19.4s, v12.4s, v3.s[2] \n" + "fmla v20.4s, v12.4s, v4.s[2] \n" + "fmla v21.4s, v12.4s, v5.s[2] \n" + "fmla v22.4s, v12.4s, v6.s[2] \n" + "fmla v23.4s, v12.4s, v7.s[2] \n" + + "fmla v24.4s, v13.4s, v0.s[2] \n" + "fmla v25.4s, v13.4s, v1.s[2] \n" + "fmla v26.4s, v13.4s, v2.s[2] \n" + "fmla v27.4s, v13.4s, v3.s[2] \n" + "fmla v28.4s, v13.4s, v4.s[2] \n" + "fmla v29.4s, v13.4s, v5.s[2] \n" + "fmla v30.4s, v13.4s, v6.s[2] \n" + "fmla v31.4s, v13.4s, v7.s[2] \n" + + "fmla v16.4s, v14.4s, v0.s[3] \n" + "fmla v17.4s, v14.4s, v1.s[3] \n" + "fmla v18.4s, v14.4s, v2.s[3] \n" + "fmla v19.4s, v14.4s, v3.s[3] \n" + "fmla v20.4s, v14.4s, v4.s[3] \n" + "fmla v21.4s, v14.4s, v5.s[3] \n" + "fmla v22.4s, v14.4s, v6.s[3] \n" + "fmla v23.4s, v14.4s, v7.s[3] \n" "subs %w0, %w0, #1 \n" @@ -956,14 +1058,12 @@ static void conv3x3s1_winograd64_pack4_neon(const Mat& bottom_blob, Mat& top_blo "=r"(output0_tm), // %1 "=r"(output1_tm), // %2 "=r"(r0), // %3 - "=r"(k0), // %4 - "=r"(k1) // %5 + "=r"(k01) // %4 : "0"(nn), "1"(output0_tm), "2"(output1_tm), "3"(r0), - "4"(k0), - "5"(k1) + "4"(k01) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" ); } @@ -971,8 +1071,7 @@ static void conv3x3s1_winograd64_pack4_neon(const Mat& bottom_blob, Mat& top_blo { const float* r0 = bb2.row(i/12 + (i%12)/8 + (i%8)/4); - const float* k0 = kernel0_tm.row(r); - const float* k1 = kernel1_tm.row(r); + const float* k01 = kernel01_tm.row(r); int nn = inch;// inch always > 0 @@ -992,47 +1091,47 @@ static void conv3x3s1_winograd64_pack4_neon(const Mat& bottom_blob, Mat& top_blo "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n"// r0 r1 r2 r3 "prfm pldl1keep, [%4, #512] \n" - "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n"// w0123_0 + "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n"// w0011_01 "fmla v16.4s, v8.4s, v0.s[0] \n" "fmla v17.4s, v8.4s, v1.s[0] \n" "fmla v18.4s, v8.4s, v2.s[0] \n" "fmla v19.4s, v8.4s, v3.s[0] \n" - "prfm pldl1keep, [%5, #512] \n" - "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%5], #64 \n"// w0123_1 + "fmla v20.4s, v9.4s, v0.s[0] \n" + "fmla v21.4s, v9.4s, v1.s[0] \n" + "fmla v22.4s, v9.4s, v2.s[0] \n" + "fmla v23.4s, v9.4s, v3.s[0] \n" - "fmla v20.4s, v12.4s, v0.s[0] \n" - "fmla v21.4s, v12.4s, v1.s[0] \n" - "fmla v22.4s, v12.4s, v2.s[0] \n" - "fmla v23.4s, v12.4s, v3.s[0] \n" + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%4], #64 \n"// w2233_01 - "fmla v16.4s, v9.4s, v0.s[1] \n" - "fmla v17.4s, v9.4s, v1.s[1] \n" - "fmla v18.4s, v9.4s, v2.s[1] \n" - "fmla v19.4s, v9.4s, v3.s[1] \n" + "fmla v16.4s, v10.4s, v0.s[1] \n" + "fmla v17.4s, v10.4s, v1.s[1] \n" + "fmla v18.4s, v10.4s, v2.s[1] \n" + "fmla v19.4s, v10.4s, v3.s[1] \n" - "fmla v20.4s, v13.4s, v0.s[1] \n" - "fmla v21.4s, v13.4s, v1.s[1] \n" - "fmla v22.4s, v13.4s, v2.s[1] \n" - "fmla v23.4s, v13.4s, v3.s[1] \n" + "fmla v20.4s, v11.4s, v0.s[1] \n" + "fmla v21.4s, v11.4s, v1.s[1] \n" + "fmla v22.4s, v11.4s, v2.s[1] \n" + "fmla v23.4s, v11.4s, v3.s[1] \n" - "fmla v16.4s, v10.4s, v0.s[2] \n" - "fmla v17.4s, v10.4s, v1.s[2] \n" - "fmla v18.4s, v10.4s, v2.s[2] \n" - "fmla v19.4s, v10.4s, v3.s[2] \n" + "fmla v16.4s, v12.4s, v0.s[2] \n" + "fmla v17.4s, v12.4s, v1.s[2] \n" + "fmla v18.4s, v12.4s, v2.s[2] \n" + "fmla v19.4s, v12.4s, v3.s[2] \n" - "fmla v20.4s, v14.4s, v0.s[2] \n" - "fmla v21.4s, v14.4s, v1.s[2] \n" - "fmla v22.4s, v14.4s, v2.s[2] \n" - "fmla v23.4s, v14.4s, v3.s[2] \n" + "fmla v20.4s, v13.4s, v0.s[2] \n" + "fmla v21.4s, v13.4s, v1.s[2] \n" + "fmla v22.4s, v13.4s, v2.s[2] \n" + "fmla v23.4s, v13.4s, v3.s[2] \n" "subs %w0, %w0, #1 \n" - "fmla v16.4s, v11.4s, v0.s[3] \n" - "fmla v17.4s, v11.4s, v1.s[3] \n" - "fmla v18.4s, v11.4s, v2.s[3] \n" - "fmla v19.4s, v11.4s, v3.s[3] \n" + "fmla v16.4s, v14.4s, v0.s[3] \n" + "fmla v17.4s, v14.4s, v1.s[3] \n" + "fmla v18.4s, v14.4s, v2.s[3] \n" + "fmla v19.4s, v14.4s, v3.s[3] \n" "fmla v20.4s, v15.4s, v0.s[3] \n" "fmla v21.4s, v15.4s, v1.s[3] \n" @@ -1048,14 +1147,12 @@ static void conv3x3s1_winograd64_pack4_neon(const Mat& bottom_blob, Mat& top_blo "=r"(output0_tm), // %1 "=r"(output1_tm), // %2 "=r"(r0), // %3 - "=r"(k0), // %4 - "=r"(k1) // %5 + "=r"(k01) // %4 : "0"(nn), "1"(output0_tm), "2"(output1_tm), "3"(r0), - "4"(k0), - "5"(k1) + "4"(k01) : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" ); } @@ -1063,8 +1160,7 @@ static void conv3x3s1_winograd64_pack4_neon(const Mat& bottom_blob, Mat& top_blo { const float* r0 = bb2.row(i/12 + (i%12)/8 + (i%8)/4 + (i%4)/2); - const float* k0 = kernel0_tm.row(r); - const float* k1 = kernel1_tm.row(r); + const float* k01 = kernel01_tm.row(r); int nn = inch;// inch always > 0 @@ -1080,31 +1176,30 @@ static void conv3x3s1_winograd64_pack4_neon(const Mat& bottom_blob, Mat& top_blo "ld1 {v0.4s, v1.4s}, [%3], #32 \n"// r0 r1 "prfm pldl1keep, [%4, #512] \n" - "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n"// w0123_0 + "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n"// w0011_01 "fmla v16.4s, v8.4s, v0.s[0] \n" "fmla v17.4s, v8.4s, v1.s[0] \n" + "fmla v18.4s, v9.4s, v0.s[0] \n" + "fmla v19.4s, v9.4s, v1.s[0] \n" - "prfm pldl1keep, [%5, #512] \n" - "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%5], #64 \n"// w0123_1 - - "fmla v18.4s, v12.4s, v0.s[0] \n" - "fmla v19.4s, v12.4s, v1.s[0] \n" + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%4], #64 \n"// w2233_01 - "fmla v16.4s, v9.4s, v0.s[1] \n" - "fmla v17.4s, v9.4s, v1.s[1] \n" - "fmla v18.4s, v13.4s, v0.s[1] \n" - "fmla v19.4s, v13.4s, v1.s[1] \n" + "fmla v16.4s, v10.4s, v0.s[1] \n" + "fmla v17.4s, v10.4s, v1.s[1] \n" + "fmla v18.4s, v11.4s, v0.s[1] \n" + "fmla v19.4s, v11.4s, v1.s[1] \n" - "fmla v16.4s, v10.4s, v0.s[2] \n" - "fmla v17.4s, v10.4s, v1.s[2] \n" - "fmla v18.4s, v14.4s, v0.s[2] \n" - "fmla v19.4s, v14.4s, v1.s[2] \n" + "fmla v16.4s, v12.4s, v0.s[2] \n" + "fmla v17.4s, v12.4s, v1.s[2] \n" + "fmla v18.4s, v13.4s, v0.s[2] \n" + "fmla v19.4s, v13.4s, v1.s[2] \n" "subs %w0, %w0, #1 \n" - "fmla v16.4s, v11.4s, v0.s[3] \n" - "fmla v17.4s, v11.4s, v1.s[3] \n" + "fmla v16.4s, v14.4s, v0.s[3] \n" + "fmla v17.4s, v14.4s, v1.s[3] \n" "fmla v18.4s, v15.4s, v0.s[3] \n" "fmla v19.4s, v15.4s, v1.s[3] \n" @@ -1117,14 +1212,12 @@ static void conv3x3s1_winograd64_pack4_neon(const Mat& bottom_blob, Mat& top_blo "=r"(output0_tm), // %1 "=r"(output1_tm), // %2 "=r"(r0), // %3 - "=r"(k0), // %4 - "=r"(k1) // %5 + "=r"(k01) // %4 : "0"(nn), "1"(output0_tm), "2"(output1_tm), "3"(r0), - "4"(k0), - "5"(k1) + "4"(k01) : "cc", "memory", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19" ); } @@ -1132,8 +1225,7 @@ static void conv3x3s1_winograd64_pack4_neon(const Mat& bottom_blob, Mat& top_blo { const float* r0 = bb2.row(i/12 + (i%12)/8 + (i%8)/4 + (i%4)/2 + i%2); - const float* k0 = kernel0_tm.row(r); - const float* k1 = kernel1_tm.row(r); + const float* k01 = kernel01_tm.row(r); int nn = inch;// inch always > 0 @@ -1147,24 +1239,23 @@ static void conv3x3s1_winograd64_pack4_neon(const Mat& bottom_blob, Mat& top_blo "ld1 {v0.4s}, [%3], #16 \n"// r0 "prfm pldl1keep, [%4, #512] \n" - "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n"// w0123_0 + "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n"// w0011_01 "fmla v16.4s, v8.4s, v0.s[0] \n" + "fmla v17.4s, v9.4s, v0.s[0] \n" - "prfm pldl1keep, [%5, #512] \n" - "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%5], #64 \n"// w0123_1 - - "fmla v17.4s, v12.4s, v0.s[0] \n" + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%4], #64 \n"// w2233_01 - "fmla v16.4s, v9.4s, v0.s[1] \n" - "fmla v17.4s, v13.4s, v0.s[1] \n" + "fmla v16.4s, v10.4s, v0.s[1] \n" + "fmla v17.4s, v11.4s, v0.s[1] \n" - "fmla v16.4s, v10.4s, v0.s[2] \n" - "fmla v17.4s, v14.4s, v0.s[2] \n" + "fmla v16.4s, v12.4s, v0.s[2] \n" + "fmla v17.4s, v13.4s, v0.s[2] \n" "subs %w0, %w0, #1 \n" - "fmla v16.4s, v11.4s, v0.s[3] \n" + "fmla v16.4s, v14.4s, v0.s[3] \n" "fmla v17.4s, v15.4s, v0.s[3] \n" "bne 0b \n" @@ -1176,14 +1267,12 @@ static void conv3x3s1_winograd64_pack4_neon(const Mat& bottom_blob, Mat& top_blo "=r"(output0_tm), // %1 "=r"(output1_tm), // %2 "=r"(r0), // %3 - "=r"(k0), // %4 - "=r"(k1) // %5 + "=r"(k01) // %4 : "0"(nn), "1"(output0_tm), "2"(output1_tm), "3"(r0), - "4"(k0), - "5"(k1) + "4"(k01) : "cc", "memory", "v0", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17" ); } @@ -1197,7 +1286,11 @@ static void conv3x3s1_winograd64_pack4_neon(const Mat& bottom_blob, Mat& top_blo { float* output0_tm = top_blob_tm.channel(p); +#if __aarch64__ + const Mat kernel0_tm = kernel_tm.channel(p/2+p%2); +#else const Mat kernel0_tm = kernel_tm.channel(p); +#endif for (int r=0; r<64; r++) {