diff --git a/src/layer/riscv/convolution_sgemm_packn.h b/src/layer/riscv/convolution_sgemm_packn.h index a970504c9..66335b273 100644 --- a/src/layer/riscv/convolution_sgemm_packn.h +++ b/src/layer/riscv/convolution_sgemm_packn.h @@ -54,6 +54,22 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons for (int k = 0; k < maxk; k++) { +#if RVV_SPEC_0_7 + for (int l = 0; l < packn; l++) + { + tmpptr[0] = img0[l]; + tmpptr[1] = img0[l + packn]; + tmpptr[2] = img0[l + packn * 2]; + tmpptr[3] = img0[l + packn * 3]; + tmpptr[4] = img0[l + packn * 4]; + tmpptr[5] = img0[l + packn * 5]; + tmpptr[6] = img0[l + packn * 6]; + tmpptr[7] = img0[l + packn * 7]; + tmpptr += 8; + } + + img0 += size * packn; +#else vfloat32m1_t _val0 = vle32_v_f32m1(img0, vl); vfloat32m1_t _val1 = vle32_v_f32m1(img0 + packn, vl); vfloat32m1_t _val2 = vle32_v_f32m1(img0 + packn * 2, vl); @@ -66,6 +82,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons img0 += size * packn; tmpptr += packn * 8; +#endif } } } @@ -86,6 +103,18 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons for (int k = 0; k < maxk; k++) { +#if RVV_SPEC_0_7 + for (int l = 0; l < packn; l++) + { + tmpptr[0] = img0[l]; + tmpptr[1] = img0[l + packn]; + tmpptr[2] = img0[l + packn * 2]; + tmpptr[3] = img0[l + packn * 3]; + tmpptr += 4; + } + + img0 += size * packn; +#else vfloat32m1_t _val0 = vle32_v_f32m1(img0, vl); vfloat32m1_t _val1 = vle32_v_f32m1(img0 + packn, vl); vfloat32m1_t _val2 = vle32_v_f32m1(img0 + packn * 2, vl); @@ -94,6 +123,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons img0 += size * packn; tmpptr += packn * 4; +#endif } } } @@ -114,12 +144,23 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons for (int k = 0; k < maxk; k++) { +#if RVV_SPEC_0_7 + for (int l = 0; l < packn; l++) + { + tmpptr[0] = img0[l]; + tmpptr[1] = img0[l + packn]; + tmpptr += 2; + } + + img0 += size * packn; +#else vfloat32m1_t _val0 = vle32_v_f32m1(img0, vl); vfloat32m1_t _val1 = vle32_v_f32m1(img0 + packn, vl); vsseg2e32_v_f32m1x2(tmpptr, vcreate_f32m1x2(_val0, _val1), vl); img0 += size * packn; tmpptr += packn * 2; +#endif } } } diff --git a/src/layer/riscv/convolution_sgemm_packn_fp16s.h b/src/layer/riscv/convolution_sgemm_packn_fp16s.h index 6dbcf64ca..3a8ef2c39 100644 --- a/src/layer/riscv/convolution_sgemm_packn_fp16s.h +++ b/src/layer/riscv/convolution_sgemm_packn_fp16s.h @@ -54,6 +54,22 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo for (int k = 0; k < maxk; k++) { +#if RVV_SPEC_0_7 + for (int l = 0; l < packn; l++) + { + tmpptr[0] = img0[l]; + tmpptr[1] = img0[l + packn]; + tmpptr[2] = img0[l + packn * 2]; + tmpptr[3] = img0[l + packn * 3]; + tmpptr[4] = img0[l + packn * 4]; + tmpptr[5] = img0[l + packn * 5]; + tmpptr[6] = img0[l + packn * 6]; + tmpptr[7] = img0[l + packn * 7]; + tmpptr += 8; + } + + img0 += size * packn; +#else vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl); vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl); vfloat16m1_t _val2 = vle16_v_f16m1(img0 + packn * 2, vl); @@ -66,6 +82,7 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo img0 += size * packn; tmpptr += packn * 8; +#endif } } } @@ -86,6 +103,18 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo for (int k = 0; k < maxk; k++) { +#if RVV_SPEC_0_7 + for (int l = 0; l < packn; l++) + { + tmpptr[0] = img0[l]; + tmpptr[1] = img0[l + packn]; + tmpptr[2] = img0[l + packn * 2]; + tmpptr[3] = img0[l + packn * 3]; + tmpptr += 4; + } + + img0 += size * packn; +#else vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl); vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl); vfloat16m1_t _val2 = vle16_v_f16m1(img0 + packn * 2, vl); @@ -94,11 +123,13 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo img0 += size * packn; tmpptr += packn * 4; +#endif } } } remain_size_start += nn_size << 2; + nn_size = (size - remain_size_start) >> 1; #pragma omp parallel for num_threads(opt.num_threads) @@ -114,12 +145,23 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo for (int k = 0; k < maxk; k++) { +#if RVV_SPEC_0_7 + for (int l = 0; l < packn; l++) + { + tmpptr[0] = img0[l]; + tmpptr[1] = img0[l + packn]; + tmpptr += 2; + } + + img0 += size * packn; +#else vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl); vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl); vsseg2e16_v_f16m1x2(tmpptr, vcreate_f16m1x2(_val0, _val1), vl); img0 += size * packn; tmpptr += packn * 2; +#endif } } } diff --git a/src/layer/riscv/riscv_v_071_fix.h b/src/layer/riscv/riscv_v_071_fix.h index af69599ae..591fed383 100644 --- a/src/layer/riscv/riscv_v_071_fix.h +++ b/src/layer/riscv/riscv_v_071_fix.h @@ -35,17 +35,20 @@ typedef float32xm2_t vfloat32m2_t; typedef float32xm4_t vfloat32m4_t; typedef float32xm8_t vfloat32m8_t; +typedef float32x2xm1_t vfloat32m1x2_t; typedef float32x4xm1_t vfloat32m1x4_t; +typedef float32x8xm1_t vfloat32m1x8_t; typedef float32x4xm2_t vfloat32m2x4_t; typedef float32x2xm4_t vfloat32m4x2_t; -typedef float32x8xm1_t vfloat32m1x8_t; typedef float16xm1_t vfloat16m1_t; typedef float16xm2_t vfloat16m2_t; typedef float16xm4_t vfloat16m4_t; typedef float16xm8_t vfloat16m8_t; +typedef float16x2xm1_t vfloat16m1x2_t; typedef float16x4xm1_t vfloat16m1x4_t; +typedef float16x8xm1_t vfloat16m1x8_t; typedef float16x4xm2_t vfloat16m2x4_t; typedef int32xm1_t vint32m1_t; @@ -129,29 +132,41 @@ typedef uint16x8xm1_t vuint16m1x8_t; #define vsse32_v_f32m4 vssev_float32xm4 #define vsse32_v_f32m8 vssev_float32xm8 -#define vlseg4e32_v_f32m2x4 vlseg4ev_float32x4xm2 -#define vsseg4e32_v_f32m2x4 vsseg4ev_float32x4xm2 +#define vlseg2e32_v_f32m1x2 vlseg2ev_float32x2xm1 +#define vsseg2e32_v_f32m1x2 vsseg2ev_float32x2xm1 #define vlseg4e32_v_f32m1x4 vlseg4ev_float32x4xm1 #define vsseg4e32_v_f32m1x4 vsseg4ev_float32x4xm1 -#define vlseg2e32_v_f32m4x2 vlseg2ev_float32x2xm4 -#define vsseg2e32_v_f32m4x2 vsseg2ev_float32x2xm4 - #define vlseg8e32_v_f32m1x8 vlseg8ev_float32x8xm1 #define vsseg8e32_v_f32m1x8 vsseg8ev_float32x8xm1 +#define vlseg4e32_v_f32m2x4 vlseg4ev_float32x4xm2 +#define vsseg4e32_v_f32m2x4 vsseg4ev_float32x4xm2 + +#define vlseg2e32_v_f32m4x2 vlseg2ev_float32x2xm4 +#define vsseg2e32_v_f32m4x2 vsseg2ev_float32x2xm4 + #define vloxseg2ei32_v_f32m4x2(a, i, vl) vlxseg2ev_float32x2xm4(a, reinterpret_cast(i), vl) +#define vset_f32m1x2 vseg_element_set_float32x2xm1 #define vset_f32m1x4 vseg_element_set_float32x4xm1 +#define vset_f32m1x8 vseg_element_set_float32x8xm1 #define vset_f32m2x4 vseg_element_set_float32x4xm2 #define vset_f32m4x2 vseg_element_set_float32x2xm4 +#define vget_f32m1x2_f32m1 vseg_element_get_float32x2xm1 #define vget_f32m1x4_f32m1 vseg_element_get_float32x4xm1 +#define vget_f32m1x8_f32m1 vseg_element_get_float32x8xm1 #define vget_f32m2x4_f32m2 vseg_element_get_float32x4xm2 #define vget_f32m4x2_f32m4 vseg_element_get_float32x2xm4 -#define vset_f32m1x8 vseg_element_set_float32x8xm1 -#define vget_f32m1x8_f32m1 vseg_element_get_float32x8xm1 +static inline vfloat32m1x2_t vcreate_f32m1x2(vfloat32m1_t v0, vfloat32m1_t v1) +{ + vfloat32m1x2_t p; + p = vset_f32m1x2(p, 0, v0); + p = vset_f32m1x2(p, 1, v1); + return p; +} static inline vfloat32m1x4_t vcreate_f32m1x4(vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3) { @@ -162,6 +177,7 @@ static inline vfloat32m1x4_t vcreate_f32m1x4(vfloat32m1_t v0, vfloat32m1_t v1, v p = vset_f32m1x4(p, 3, v3); return p; } + static inline vfloat32m2x4_t vcreate_f32m2x4(vfloat32m2_t v0, vfloat32m2_t v1, vfloat32m2_t v2, vfloat32m2_t v3) { vfloat32m2x4_t p; @@ -171,6 +187,7 @@ static inline vfloat32m2x4_t vcreate_f32m2x4(vfloat32m2_t v0, vfloat32m2_t v1, v p = vset_f32m2x4(p, 3, v3); return p; } + static inline vfloat32m1x8_t vcreate_f32m1x8(vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, vfloat32m1_t v4, vfloat32m1_t v5, vfloat32m1_t v6, vfloat32m1_t v7) { vfloat32m1x8_t p; @@ -549,14 +566,56 @@ static inline vfloat32m1_t vfredsum_vs_f32m8_f32m1(vfloat32m1_t dst, vfloat32m8_ #define vse16_v_f16m4 vsev_float16xm4 #define vse16_v_f16m8 vsev_float16xm8 +#define vlseg2e16_v_f16m1x2 vlseg2ev_float16x2xm1 +#define vsseg2e16_v_f16m1x2 vsseg2ev_float16x2xm1 + +#define vlseg4e16_v_f16m1x4 vlseg4ev_float16x4xm1 +#define vsseg4e16_v_f16m1x4 vsseg4ev_float16x4xm1 + #define vlseg8e16_v_f16m1x8 vlseg8ev_float16x8xm1 #define vsseg8e16_v_f16m1x8 vsseg8ev_float16x8xm1 #define vlseg4e16_v_f16m2x4 vlseg4ev_float16x4xm2 #define vsseg4e16_v_f16m2x4 vsseg4ev_float16x4xm2 -#define vlseg4e16_v_f16m1x4 vlseg4ev_float16x4xm1 -#define vsseg4e16_v_f16m1x4 vsseg4ev_float16x4xm1 +#define vset_f16m1x2 vseg_element_set_float16x2xm1 +#define vset_f16m1x4 vseg_element_set_float16x4xm1 +#define vset_f16m1x8 vseg_element_set_float16x8xm1 +#define vget_f16m1x2_f16m1 vseg_element_get_float16x2xm1 +#define vget_f16m1x4_f16m1 vseg_element_get_float16x4xm1 +#define vget_f16m1x8_f16m1 vseg_element_get_float16x8xm1 + +static inline vfloat16m1x2_t vcreate_f16m1x2(vfloat16m1_t v0, vfloat16m1_t v1) +{ + vfloat16m1x2_t p; + p = vset_f16m1x2(p, 0, v0); + p = vset_f16m1x2(p, 1, v1); + return p; +} + +static inline vfloat16m1x4_t vcreate_f16m1x4(vfloat16m1_t v0, vfloat16m1_t v1, vfloat16m1_t v2, vfloat16m1_t v3) +{ + vfloat16m1x4_t p; + p = vset_f16m1x4(p, 0, v0); + p = vset_f16m1x4(p, 1, v1); + p = vset_f16m1x4(p, 2, v2); + p = vset_f16m1x4(p, 3, v3); + return p; +} + +static inline vfloat16m1x8_t vcreate_f16m1x8(vfloat16m1_t v0, vfloat16m1_t v1, vfloat16m1_t v2, vfloat16m1_t v3, vfloat16m1_t v4, vfloat16m1_t v5, vfloat16m1_t v6, vfloat16m1_t v7) +{ + vfloat16m1x8_t p; + p = vset_f16m1x8(p, 0, v0); + p = vset_f16m1x8(p, 1, v1); + p = vset_f16m1x8(p, 2, v2); + p = vset_f16m1x8(p, 3, v3); + p = vset_f16m1x8(p, 4, v4); + p = vset_f16m1x8(p, 5, v5); + p = vset_f16m1x8(p, 6, v6); + p = vset_f16m1x8(p, 7, v7); + return p; +} #define vfmv_s_f_f16m1(a, b, vl) vfmvsf_float16xm1(b, vl) #define vfmv_s_f_f16m2(a, b, vl) vfmvsf_float16xm2(b, vl)