diff --git a/src/layer/arm/convolution_1x1_pack8_fp16s.h b/src/layer/arm/convolution_1x1_pack8_fp16s.h index 25202150d..7cb2f6ada 100644 --- a/src/layer/arm/convolution_1x1_pack8_fp16s.h +++ b/src/layer/arm/convolution_1x1_pack8_fp16s.h @@ -105,24 +105,30 @@ static void conv1x1s1_sgemm_pack8_fp16sa_neon(const Mat& bottom_blob, Mat& top_b for (int q = 0; q < inch; q++) { - float16x8_t _v0 = vld1q_f16(img0); - float16x8_t _v1 = vld1q_f16(img0 + 8); - float16x8_t _v2 = vld1q_f16(img0 + 16); - float16x8_t _v3 = vld1q_f16(img0 + 24); - float16x8_t _v4 = vld1q_f16(img0 + 32); - float16x8_t _v5 = vld1q_f16(img0 + 40); - float16x8_t _v6 = vld1q_f16(img0 + 48); - float16x8_t _v7 = vld1q_f16(img0 + 56); - vst1q_f16(tmpptr, _v0); - vst1q_f16(tmpptr + 8, _v1); - vst1q_f16(tmpptr + 16, _v2); - vst1q_f16(tmpptr + 24, _v3); - vst1q_f16(tmpptr + 32, _v4); - vst1q_f16(tmpptr + 40, _v5); - vst1q_f16(tmpptr + 48, _v6); - vst1q_f16(tmpptr + 56, _v7); - - tmpptr += 64; + // transpose 8x8 + asm volatile( + "prfm pldl1keep, [%0, #512] \n" + "ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%0], #64 \n" + "ld4 {v4.8h, v5.8h, v6.8h, v7.8h}, [%0] \n" + "sub %0, %0, #64 \n" + + "uzp1 v16.8h, v0.8h, v4.8h \n" + "uzp2 v20.8h, v0.8h, v4.8h \n" + "uzp1 v17.8h, v1.8h, v5.8h \n" + "uzp2 v21.8h, v1.8h, v5.8h \n" + "uzp1 v18.8h, v2.8h, v6.8h \n" + "uzp2 v22.8h, v2.8h, v6.8h \n" + "uzp1 v19.8h, v3.8h, v7.8h \n" + "uzp2 v23.8h, v3.8h, v7.8h \n" + + "st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%1], #64 \n" + "st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%1], #64 \n" + : "=r"(img0), // %0 + "=r"(tmpptr) // %1 + : "0"(img0), + "1"(tmpptr) + : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); + img0 += bottom_blob.cstep * 8; } } @@ -142,16 +148,16 @@ static void conv1x1s1_sgemm_pack8_fp16sa_neon(const Mat& bottom_blob, Mat& top_b for (int q = 0; q < inch; q++) { - float16x8_t _v0 = vld1q_f16(img0); - float16x8_t _v1 = vld1q_f16(img0 + 8); - float16x8_t _v2 = vld1q_f16(img0 + 16); - float16x8_t _v3 = vld1q_f16(img0 + 24); - vst1q_f16(tmpptr, _v0); - vst1q_f16(tmpptr + 8, _v1); - vst1q_f16(tmpptr + 16, _v2); - vst1q_f16(tmpptr + 24, _v3); - - tmpptr += 32; + asm volatile( + "prfm pldl1keep, [%0, #512] \n" + "ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [%0] \n" + "st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [%1], #64 \n" + : "=r"(img0), // %0 + "=r"(tmpptr) // %1 + : "0"(img0), + "1"(tmpptr) + : "memory", "v0", "v1", "v2", "v3"); + img0 += bottom_blob.cstep * 8; } } @@ -171,12 +177,16 @@ static void conv1x1s1_sgemm_pack8_fp16sa_neon(const Mat& bottom_blob, Mat& top_b for (int q = 0; q < inch; q++) { - float16x8_t _v0 = vld1q_f16(img0); - float16x8_t _v1 = vld1q_f16(img0 + 8); - vst1q_f16(tmpptr, _v0); - vst1q_f16(tmpptr + 8, _v1); + asm volatile( + "prfm pldl1keep, [%0, #256] \n" + "ld1 {v0.8h, v1.8h}, [%0] \n" + "st1 {v0.8h, v1.8h}, [%1], #32 \n" + : "=r"(img0), // %0 + "=r"(tmpptr) // %1 + : "0"(img0), + "1"(tmpptr) + : "memory", "v0", "v1"); - tmpptr += 16; img0 += bottom_blob.cstep * 8; } } @@ -193,10 +203,16 @@ static void conv1x1s1_sgemm_pack8_fp16sa_neon(const Mat& bottom_blob, Mat& top_b for (int q = 0; q < inch; q++) { - float16x8_t _v = vld1q_f16(img0); - vst1q_f16(tmpptr, _v); + asm volatile( + "prfm pldl1keep, [%0, #128] \n" + "ld1 {v0.8h}, [%0] \n" + "st1 {v0.8h}, [%1], #16 \n" + : "=r"(img0), // %0 + "=r"(tmpptr) // %1 + : "0"(img0), + "1"(tmpptr) + : "memory", "v0"); - tmpptr += 8; img0 += bottom_blob.cstep * 8; } } @@ -216,280 +232,315 @@ static void conv1x1s1_sgemm_pack8_fp16sa_neon(const Mat& bottom_blob, Mat& top_b __fp16* tmpptr = tmp.channel(i / 8); const __fp16* kptr0 = kernel.channel(p); - float16x8_t _sum0 = vld1q_f16(biasptr); - float16x8_t _sum1 = vld1q_f16(biasptr); - float16x8_t _sum2 = vld1q_f16(biasptr); - float16x8_t _sum3 = vld1q_f16(biasptr); - float16x8_t _sum4 = vld1q_f16(biasptr); - float16x8_t _sum5 = vld1q_f16(biasptr); - float16x8_t _sum6 = vld1q_f16(biasptr); - float16x8_t _sum7 = vld1q_f16(biasptr); - - for (int q=0; q 0 + + asm volatile( + "ld1 {v16.8h}, [%8] \n" + "mov v17.16b, v16.16b \n" + "mov v18.16b, v16.16b \n" + "mov v19.16b, v16.16b \n" + "mov v20.16b, v16.16b \n" + "mov v21.16b, v16.16b \n" + "mov v22.16b, v16.16b \n" + "mov v23.16b, v16.16b \n" + + "0: \n" + + "prfm pldl1keep, [%2, #512] \n" + "ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [%2], #64 \n" // r0123 + + "prfm pldl1keep, [%3, #512] \n" + "ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [%3], #64 \n" // w0123 + + "fmla v16.8h, v8.8h, v0.h[0] \n" + "fmla v17.8h, v8.8h, v0.h[1] \n" + "fmla v18.8h, v8.8h, v0.h[2] \n" + "fmla v19.8h, v8.8h, v0.h[3] \n" + "fmla v20.8h, v8.8h, v0.h[4] \n" + "fmla v21.8h, v8.8h, v0.h[5] \n" + "fmla v22.8h, v8.8h, v0.h[6] \n" + "fmla v23.8h, v8.8h, v0.h[7] \n" + + "fmla v16.8h, v9.8h, v1.h[0] \n" + "fmla v17.8h, v9.8h, v1.h[1] \n" + "fmla v18.8h, v9.8h, v1.h[2] \n" + "fmla v19.8h, v9.8h, v1.h[3] \n" + "fmla v20.8h, v9.8h, v1.h[4] \n" + "fmla v21.8h, v9.8h, v1.h[5] \n" + "fmla v22.8h, v9.8h, v1.h[6] \n" + "fmla v23.8h, v9.8h, v1.h[7] \n" + + "prfm pldl1keep, [%2, #512] \n" + "ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [%2], #64 \n" // r4567 + + "fmla v16.8h, v10.8h, v2.h[0] \n" + "fmla v17.8h, v10.8h, v2.h[1] \n" + "fmla v18.8h, v10.8h, v2.h[2] \n" + "fmla v19.8h, v10.8h, v2.h[3] \n" + "fmla v20.8h, v10.8h, v2.h[4] \n" + "fmla v21.8h, v10.8h, v2.h[5] \n" + "fmla v22.8h, v10.8h, v2.h[6] \n" + "fmla v23.8h, v10.8h, v2.h[7] \n" + + "prfm pldl1keep, [%3, #512] \n" + "ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [%3], #64 \n" // w4567 + + "fmla v16.8h, v11.8h, v3.h[0] \n" + "fmla v17.8h, v11.8h, v3.h[1] \n" + "fmla v18.8h, v11.8h, v3.h[2] \n" + "fmla v19.8h, v11.8h, v3.h[3] \n" + "fmla v20.8h, v11.8h, v3.h[4] \n" + "fmla v21.8h, v11.8h, v3.h[5] \n" + "fmla v22.8h, v11.8h, v3.h[6] \n" + "fmla v23.8h, v11.8h, v3.h[7] \n" + + "fmla v16.8h, v12.8h, v4.h[0] \n" + "fmla v17.8h, v12.8h, v4.h[1] \n" + "fmla v18.8h, v12.8h, v4.h[2] \n" + "fmla v19.8h, v12.8h, v4.h[3] \n" + "fmla v20.8h, v12.8h, v4.h[4] \n" + "fmla v21.8h, v12.8h, v4.h[5] \n" + "fmla v22.8h, v12.8h, v4.h[6] \n" + "fmla v23.8h, v12.8h, v4.h[7] \n" + + "fmla v16.8h, v13.8h, v5.h[0] \n" + "fmla v17.8h, v13.8h, v5.h[1] \n" + "fmla v18.8h, v13.8h, v5.h[2] \n" + "fmla v19.8h, v13.8h, v5.h[3] \n" + "fmla v20.8h, v13.8h, v5.h[4] \n" + "fmla v21.8h, v13.8h, v5.h[5] \n" + "fmla v22.8h, v13.8h, v5.h[6] \n" + "fmla v23.8h, v13.8h, v5.h[7] \n" + + "fmla v16.8h, v14.8h, v6.h[0] \n" + "fmla v17.8h, v14.8h, v6.h[1] \n" + "fmla v18.8h, v14.8h, v6.h[2] \n" + "fmla v19.8h, v14.8h, v6.h[3] \n" + "fmla v20.8h, v14.8h, v6.h[4] \n" + "fmla v21.8h, v14.8h, v6.h[5] \n" + "fmla v22.8h, v14.8h, v6.h[6] \n" + "fmla v23.8h, v14.8h, v6.h[7] \n" + + "subs %w0, %w0, #1 \n" + + "fmla v16.8h, v15.8h, v7.h[0] \n" + "fmla v17.8h, v15.8h, v7.h[1] \n" + "fmla v18.8h, v15.8h, v7.h[2] \n" + "fmla v19.8h, v15.8h, v7.h[3] \n" + "fmla v20.8h, v15.8h, v7.h[4] \n" + "fmla v21.8h, v15.8h, v7.h[5] \n" + "fmla v22.8h, v15.8h, v7.h[6] \n" + "fmla v23.8h, v15.8h, v7.h[7] \n" + + "bne 0b \n" + + "st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%1], #64 \n" + "st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%1], #64 \n" + + : "=r"(nn), // %0 + "=r"(outptr0), // %1 + "=r"(tmpptr), // %2 + "=r"(kptr0) // %3 + : "0"(nn), + "1"(outptr0), + "2"(tmpptr), + "3"(kptr0), + "r"(biasptr) // %8 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); } for (; i + 3 < size; i += 4) { __fp16* tmpptr = tmp.channel(i / 8 + (i % 8) / 4); const __fp16* kptr0 = kernel.channel(p); - float16x8_t _sum0 = vld1q_f16(biasptr); - float16x8_t _sum1 = vld1q_f16(biasptr); - float16x8_t _sum2 = vld1q_f16(biasptr); - float16x8_t _sum3 = vld1q_f16(biasptr); - - for (int q=0; q 0 + + asm volatile( + "ld1 {v16.8h}, [%8] \n" + "mov v17.16b, v16.16b \n" + "mov v18.16b, v16.16b \n" + "mov v19.16b, v16.16b \n" + + "0: \n" + + "prfm pldl1keep, [%2, #512] \n" + "ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [%2], #64 \n" // r0123 + + "prfm pldl1keep, [%3, #512] \n" + "ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [%3], #64 \n" // w0123 + + "fmla v16.8h, v8.8h, v0.h[0] \n" + "fmla v17.8h, v8.8h, v1.h[0] \n" + "fmla v18.8h, v8.8h, v2.h[0] \n" + "fmla v19.8h, v8.8h, v3.h[0] \n" + + "fmla v16.8h, v9.8h, v0.h[1] \n" + "fmla v17.8h, v9.8h, v1.h[1] \n" + "fmla v18.8h, v9.8h, v2.h[1] \n" + "fmla v19.8h, v9.8h, v3.h[1] \n" + + "prfm pldl1keep, [%3, #512] \n" + "ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [%3], #64 \n" // w4567 + + "fmla v16.8h, v10.8h, v0.h[2] \n" + "fmla v17.8h, v10.8h, v1.h[2] \n" + "fmla v18.8h, v10.8h, v2.h[2] \n" + "fmla v19.8h, v10.8h, v3.h[2] \n" + + "fmla v16.8h, v11.8h, v0.h[3] \n" + "fmla v17.8h, v11.8h, v1.h[3] \n" + "fmla v18.8h, v11.8h, v2.h[3] \n" + "fmla v19.8h, v11.8h, v3.h[3] \n" + + "fmla v16.8h, v12.8h, v0.h[4] \n" + "fmla v17.8h, v12.8h, v1.h[4] \n" + "fmla v18.8h, v12.8h, v2.h[4] \n" + "fmla v19.8h, v12.8h, v3.h[4] \n" + + "fmla v16.8h, v13.8h, v0.h[5] \n" + "fmla v17.8h, v13.8h, v1.h[5] \n" + "fmla v18.8h, v13.8h, v2.h[5] \n" + "fmla v19.8h, v13.8h, v3.h[5] \n" + + "fmla v16.8h, v14.8h, v0.h[6] \n" + "fmla v17.8h, v14.8h, v1.h[6] \n" + "fmla v18.8h, v14.8h, v2.h[6] \n" + "fmla v19.8h, v14.8h, v3.h[6] \n" + + "subs %w0, %w0, #1 \n" + + "fmla v16.8h, v15.8h, v0.h[7] \n" + "fmla v17.8h, v15.8h, v1.h[7] \n" + "fmla v18.8h, v15.8h, v2.h[7] \n" + "fmla v19.8h, v15.8h, v3.h[7] \n" + + "bne 0b \n" + + "st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%1], #64 \n" + + : "=r"(nn), // %0 + "=r"(outptr0), // %1 + "=r"(tmpptr), // %2 + "=r"(kptr0) // %3 + : "0"(nn), + "1"(outptr0), + "2"(tmpptr), + "3"(kptr0), + "r"(biasptr) // %8 + : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19"); } for (; i + 1 < size; i += 2) { __fp16* tmpptr = tmp.channel(i / 8 + (i % 8) / 4 + (i % 4) / 2); const __fp16* kptr0 = kernel.channel(p); - float16x8_t _sum0 = vld1q_f16(biasptr); - float16x8_t _sum1 = vld1q_f16(biasptr); + int nn = inch; // inch always > 0 - for (int q=0; q 0 - for (int q=0; q(i / 8); const __fp16* k0 = kernel0_tm.row(r); - float16x8_t _sum0 = vdupq_n_f16((__fp16)0.f); - float16x8_t _sum1 = vdupq_n_f16((__fp16)0.f); - float16x8_t _sum2 = vdupq_n_f16((__fp16)0.f); - float16x8_t _sum3 = vdupq_n_f16((__fp16)0.f); - float16x8_t _sum4 = vdupq_n_f16((__fp16)0.f); - float16x8_t _sum5 = vdupq_n_f16((__fp16)0.f); - float16x8_t _sum6 = vdupq_n_f16((__fp16)0.f); - float16x8_t _sum7 = vdupq_n_f16((__fp16)0.f); - - for (int q=0; q 0 + + asm volatile( + "eor v16.16b, v16.16b, v16.16b \n" + "eor v17.16b, v17.16b, v17.16b \n" + "eor v18.16b, v18.16b, v18.16b \n" + "eor v19.16b, v19.16b, v19.16b \n" + "eor v20.16b, v20.16b, v20.16b \n" + "eor v21.16b, v21.16b, v21.16b \n" + "eor v22.16b, v22.16b, v22.16b \n" + "eor v23.16b, v23.16b, v23.16b \n" + + "0: \n" + + "prfm pldl1keep, [%2, #512] \n" + "ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [%2], #64 \n" // r0123 + + "prfm pldl1keep, [%3, #512] \n" + "ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [%3], #64 \n" // w0123 + + "fmla v16.8h, v8.8h, v0.h[0] \n" + "fmla v17.8h, v8.8h, v0.h[1] \n" + "fmla v18.8h, v8.8h, v0.h[2] \n" + "fmla v19.8h, v8.8h, v0.h[3] \n" + "fmla v20.8h, v8.8h, v0.h[4] \n" + "fmla v21.8h, v8.8h, v0.h[5] \n" + "fmla v22.8h, v8.8h, v0.h[6] \n" + "fmla v23.8h, v8.8h, v0.h[7] \n" + + "fmla v16.8h, v9.8h, v1.h[0] \n" + "fmla v17.8h, v9.8h, v1.h[1] \n" + "fmla v18.8h, v9.8h, v1.h[2] \n" + "fmla v19.8h, v9.8h, v1.h[3] \n" + "fmla v20.8h, v9.8h, v1.h[4] \n" + "fmla v21.8h, v9.8h, v1.h[5] \n" + "fmla v22.8h, v9.8h, v1.h[6] \n" + "fmla v23.8h, v9.8h, v1.h[7] \n" + + "prfm pldl1keep, [%2, #512] \n" + "ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [%2], #64 \n" // r4567 + + "fmla v16.8h, v10.8h, v2.h[0] \n" + "fmla v17.8h, v10.8h, v2.h[1] \n" + "fmla v18.8h, v10.8h, v2.h[2] \n" + "fmla v19.8h, v10.8h, v2.h[3] \n" + "fmla v20.8h, v10.8h, v2.h[4] \n" + "fmla v21.8h, v10.8h, v2.h[5] \n" + "fmla v22.8h, v10.8h, v2.h[6] \n" + "fmla v23.8h, v10.8h, v2.h[7] \n" + + "prfm pldl1keep, [%3, #512] \n" + "ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [%3], #64 \n" // w4567 + + "fmla v16.8h, v11.8h, v3.h[0] \n" + "fmla v17.8h, v11.8h, v3.h[1] \n" + "fmla v18.8h, v11.8h, v3.h[2] \n" + "fmla v19.8h, v11.8h, v3.h[3] \n" + "fmla v20.8h, v11.8h, v3.h[4] \n" + "fmla v21.8h, v11.8h, v3.h[5] \n" + "fmla v22.8h, v11.8h, v3.h[6] \n" + "fmla v23.8h, v11.8h, v3.h[7] \n" + + "fmla v16.8h, v12.8h, v4.h[0] \n" + "fmla v17.8h, v12.8h, v4.h[1] \n" + "fmla v18.8h, v12.8h, v4.h[2] \n" + "fmla v19.8h, v12.8h, v4.h[3] \n" + "fmla v20.8h, v12.8h, v4.h[4] \n" + "fmla v21.8h, v12.8h, v4.h[5] \n" + "fmla v22.8h, v12.8h, v4.h[6] \n" + "fmla v23.8h, v12.8h, v4.h[7] \n" + + "fmla v16.8h, v13.8h, v5.h[0] \n" + "fmla v17.8h, v13.8h, v5.h[1] \n" + "fmla v18.8h, v13.8h, v5.h[2] \n" + "fmla v19.8h, v13.8h, v5.h[3] \n" + "fmla v20.8h, v13.8h, v5.h[4] \n" + "fmla v21.8h, v13.8h, v5.h[5] \n" + "fmla v22.8h, v13.8h, v5.h[6] \n" + "fmla v23.8h, v13.8h, v5.h[7] \n" + + "fmla v16.8h, v14.8h, v6.h[0] \n" + "fmla v17.8h, v14.8h, v6.h[1] \n" + "fmla v18.8h, v14.8h, v6.h[2] \n" + "fmla v19.8h, v14.8h, v6.h[3] \n" + "fmla v20.8h, v14.8h, v6.h[4] \n" + "fmla v21.8h, v14.8h, v6.h[5] \n" + "fmla v22.8h, v14.8h, v6.h[6] \n" + "fmla v23.8h, v14.8h, v6.h[7] \n" + + "subs %w0, %w0, #1 \n" + + "fmla v16.8h, v15.8h, v7.h[0] \n" + "fmla v17.8h, v15.8h, v7.h[1] \n" + "fmla v18.8h, v15.8h, v7.h[2] \n" + "fmla v19.8h, v15.8h, v7.h[3] \n" + "fmla v20.8h, v15.8h, v7.h[4] \n" + "fmla v21.8h, v15.8h, v7.h[5] \n" + "fmla v22.8h, v15.8h, v7.h[6] \n" + "fmla v23.8h, v15.8h, v7.h[7] \n" + + "bne 0b \n" + + "st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%1], #64 \n" + + "st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%1], #64 \n" + + : "=r"(nn), // %0 + "=r"(output0_tm), // %1 + "=r"(r0), // %2 + "=r"(k0) // %3 + : "0"(nn), + "1"(output0_tm), + "2"(r0), + "3"(k0) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); } for (; i + 3 < tiles; i += 4) { const __fp16* r0 = bb2.row(i / 8 + (i % 8) / 4); const __fp16* k0 = kernel0_tm.row(r); - float16x8_t _sum0 = vdupq_n_f16((__fp16)0.f); - float16x8_t _sum1 = vdupq_n_f16((__fp16)0.f); - float16x8_t _sum2 = vdupq_n_f16((__fp16)0.f); - float16x8_t _sum3 = vdupq_n_f16((__fp16)0.f); + int nn = inch; // inch always > 0 - for (int q=0; q(i / 8 + (i % 8) / 4 + (i % 4) / 2); const __fp16* k0 = kernel0_tm.row(r); - float16x8_t _sum0 = vdupq_n_f16((__fp16)0.f); - float16x8_t _sum1 = vdupq_n_f16((__fp16)0.f); + int nn = inch; // inch always > 0 - for (int q=0; q(i / 8 + (i % 8) / 4 + (i % 4) / 2 + i % 2); const __fp16* k0 = kernel0_tm.row(r); - float16x8_t _sum = vdupq_n_f16((__fp16)0.f); + int nn = inch; // inch always > 0 - for (int q=0; q