Browse Source

fix build on c906

tags/20210720
nihui 4 years ago
parent
commit
2cbece80ad
3 changed files with 152 additions and 10 deletions
  1. +41
    -0
      src/layer/riscv/convolution_sgemm_packn.h
  2. +42
    -0
      src/layer/riscv/convolution_sgemm_packn_fp16s.h
  3. +69
    -10
      src/layer/riscv/riscv_v_071_fix.h

+ 41
- 0
src/layer/riscv/convolution_sgemm_packn.h View File

@@ -54,6 +54,22 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons

for (int k = 0; k < maxk; k++)
{
#if RVV_SPEC_0_7
for (int l = 0; l < packn; l++)
{
tmpptr[0] = img0[l];
tmpptr[1] = img0[l + packn];
tmpptr[2] = img0[l + packn * 2];
tmpptr[3] = img0[l + packn * 3];
tmpptr[4] = img0[l + packn * 4];
tmpptr[5] = img0[l + packn * 5];
tmpptr[6] = img0[l + packn * 6];
tmpptr[7] = img0[l + packn * 7];
tmpptr += 8;
}

img0 += size * packn;
#else
vfloat32m1_t _val0 = vle32_v_f32m1(img0, vl);
vfloat32m1_t _val1 = vle32_v_f32m1(img0 + packn, vl);
vfloat32m1_t _val2 = vle32_v_f32m1(img0 + packn * 2, vl);
@@ -66,6 +82,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons

img0 += size * packn;
tmpptr += packn * 8;
#endif
}
}
}
@@ -86,6 +103,18 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons

for (int k = 0; k < maxk; k++)
{
#if RVV_SPEC_0_7
for (int l = 0; l < packn; l++)
{
tmpptr[0] = img0[l];
tmpptr[1] = img0[l + packn];
tmpptr[2] = img0[l + packn * 2];
tmpptr[3] = img0[l + packn * 3];
tmpptr += 4;
}

img0 += size * packn;
#else
vfloat32m1_t _val0 = vle32_v_f32m1(img0, vl);
vfloat32m1_t _val1 = vle32_v_f32m1(img0 + packn, vl);
vfloat32m1_t _val2 = vle32_v_f32m1(img0 + packn * 2, vl);
@@ -94,6 +123,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons

img0 += size * packn;
tmpptr += packn * 4;
#endif
}
}
}
@@ -114,12 +144,23 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons

for (int k = 0; k < maxk; k++)
{
#if RVV_SPEC_0_7
for (int l = 0; l < packn; l++)
{
tmpptr[0] = img0[l];
tmpptr[1] = img0[l + packn];
tmpptr += 2;
}

img0 += size * packn;
#else
vfloat32m1_t _val0 = vle32_v_f32m1(img0, vl);
vfloat32m1_t _val1 = vle32_v_f32m1(img0 + packn, vl);
vsseg2e32_v_f32m1x2(tmpptr, vcreate_f32m1x2(_val0, _val1), vl);

img0 += size * packn;
tmpptr += packn * 2;
#endif
}
}
}


+ 42
- 0
src/layer/riscv/convolution_sgemm_packn_fp16s.h View File

@@ -54,6 +54,22 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo

for (int k = 0; k < maxk; k++)
{
#if RVV_SPEC_0_7
for (int l = 0; l < packn; l++)
{
tmpptr[0] = img0[l];
tmpptr[1] = img0[l + packn];
tmpptr[2] = img0[l + packn * 2];
tmpptr[3] = img0[l + packn * 3];
tmpptr[4] = img0[l + packn * 4];
tmpptr[5] = img0[l + packn * 5];
tmpptr[6] = img0[l + packn * 6];
tmpptr[7] = img0[l + packn * 7];
tmpptr += 8;
}

img0 += size * packn;
#else
vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl);
vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl);
vfloat16m1_t _val2 = vle16_v_f16m1(img0 + packn * 2, vl);
@@ -66,6 +82,7 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo

img0 += size * packn;
tmpptr += packn * 8;
#endif
}
}
}
@@ -86,6 +103,18 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo

for (int k = 0; k < maxk; k++)
{
#if RVV_SPEC_0_7
for (int l = 0; l < packn; l++)
{
tmpptr[0] = img0[l];
tmpptr[1] = img0[l + packn];
tmpptr[2] = img0[l + packn * 2];
tmpptr[3] = img0[l + packn * 3];
tmpptr += 4;
}

img0 += size * packn;
#else
vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl);
vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl);
vfloat16m1_t _val2 = vle16_v_f16m1(img0 + packn * 2, vl);
@@ -94,11 +123,13 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo

img0 += size * packn;
tmpptr += packn * 4;
#endif
}
}
}

remain_size_start += nn_size << 2;

nn_size = (size - remain_size_start) >> 1;

#pragma omp parallel for num_threads(opt.num_threads)
@@ -114,12 +145,23 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo

for (int k = 0; k < maxk; k++)
{
#if RVV_SPEC_0_7
for (int l = 0; l < packn; l++)
{
tmpptr[0] = img0[l];
tmpptr[1] = img0[l + packn];
tmpptr += 2;
}

img0 += size * packn;
#else
vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl);
vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl);
vsseg2e16_v_f16m1x2(tmpptr, vcreate_f16m1x2(_val0, _val1), vl);

img0 += size * packn;
tmpptr += packn * 2;
#endif
}
}
}


+ 69
- 10
src/layer/riscv/riscv_v_071_fix.h View File

@@ -35,17 +35,20 @@ typedef float32xm2_t vfloat32m2_t;
typedef float32xm4_t vfloat32m4_t;
typedef float32xm8_t vfloat32m8_t;

typedef float32x2xm1_t vfloat32m1x2_t;
typedef float32x4xm1_t vfloat32m1x4_t;
typedef float32x8xm1_t vfloat32m1x8_t;
typedef float32x4xm2_t vfloat32m2x4_t;
typedef float32x2xm4_t vfloat32m4x2_t;
typedef float32x8xm1_t vfloat32m1x8_t;

typedef float16xm1_t vfloat16m1_t;
typedef float16xm2_t vfloat16m2_t;
typedef float16xm4_t vfloat16m4_t;
typedef float16xm8_t vfloat16m8_t;

typedef float16x2xm1_t vfloat16m1x2_t;
typedef float16x4xm1_t vfloat16m1x4_t;
typedef float16x8xm1_t vfloat16m1x8_t;
typedef float16x4xm2_t vfloat16m2x4_t;

typedef int32xm1_t vint32m1_t;
@@ -129,29 +132,41 @@ typedef uint16x8xm1_t vuint16m1x8_t;
#define vsse32_v_f32m4 vssev_float32xm4
#define vsse32_v_f32m8 vssev_float32xm8

#define vlseg4e32_v_f32m2x4 vlseg4ev_float32x4xm2
#define vsseg4e32_v_f32m2x4 vsseg4ev_float32x4xm2
#define vlseg2e32_v_f32m1x2 vlseg2ev_float32x2xm1
#define vsseg2e32_v_f32m1x2 vsseg2ev_float32x2xm1

#define vlseg4e32_v_f32m1x4 vlseg4ev_float32x4xm1
#define vsseg4e32_v_f32m1x4 vsseg4ev_float32x4xm1

#define vlseg2e32_v_f32m4x2 vlseg2ev_float32x2xm4
#define vsseg2e32_v_f32m4x2 vsseg2ev_float32x2xm4

#define vlseg8e32_v_f32m1x8 vlseg8ev_float32x8xm1
#define vsseg8e32_v_f32m1x8 vsseg8ev_float32x8xm1

#define vlseg4e32_v_f32m2x4 vlseg4ev_float32x4xm2
#define vsseg4e32_v_f32m2x4 vsseg4ev_float32x4xm2

#define vlseg2e32_v_f32m4x2 vlseg2ev_float32x2xm4
#define vsseg2e32_v_f32m4x2 vsseg2ev_float32x2xm4

#define vloxseg2ei32_v_f32m4x2(a, i, vl) vlxseg2ev_float32x2xm4(a, reinterpret_cast<int32xm4_t>(i), vl)

#define vset_f32m1x2 vseg_element_set_float32x2xm1
#define vset_f32m1x4 vseg_element_set_float32x4xm1
#define vset_f32m1x8 vseg_element_set_float32x8xm1
#define vset_f32m2x4 vseg_element_set_float32x4xm2
#define vset_f32m4x2 vseg_element_set_float32x2xm4
#define vget_f32m1x2_f32m1 vseg_element_get_float32x2xm1
#define vget_f32m1x4_f32m1 vseg_element_get_float32x4xm1
#define vget_f32m1x8_f32m1 vseg_element_get_float32x8xm1
#define vget_f32m2x4_f32m2 vseg_element_get_float32x4xm2
#define vget_f32m4x2_f32m4 vseg_element_get_float32x2xm4

#define vset_f32m1x8 vseg_element_set_float32x8xm1
#define vget_f32m1x8_f32m1 vseg_element_get_float32x8xm1
static inline vfloat32m1x2_t vcreate_f32m1x2(vfloat32m1_t v0, vfloat32m1_t v1)
{
vfloat32m1x2_t p;
p = vset_f32m1x2(p, 0, v0);
p = vset_f32m1x2(p, 1, v1);
return p;
}

static inline vfloat32m1x4_t vcreate_f32m1x4(vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3)
{
@@ -162,6 +177,7 @@ static inline vfloat32m1x4_t vcreate_f32m1x4(vfloat32m1_t v0, vfloat32m1_t v1, v
p = vset_f32m1x4(p, 3, v3);
return p;
}

static inline vfloat32m2x4_t vcreate_f32m2x4(vfloat32m2_t v0, vfloat32m2_t v1, vfloat32m2_t v2, vfloat32m2_t v3)
{
vfloat32m2x4_t p;
@@ -171,6 +187,7 @@ static inline vfloat32m2x4_t vcreate_f32m2x4(vfloat32m2_t v0, vfloat32m2_t v1, v
p = vset_f32m2x4(p, 3, v3);
return p;
}

static inline vfloat32m1x8_t vcreate_f32m1x8(vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, vfloat32m1_t v4, vfloat32m1_t v5, vfloat32m1_t v6, vfloat32m1_t v7)
{
vfloat32m1x8_t p;
@@ -549,14 +566,56 @@ static inline vfloat32m1_t vfredsum_vs_f32m8_f32m1(vfloat32m1_t dst, vfloat32m8_
#define vse16_v_f16m4 vsev_float16xm4
#define vse16_v_f16m8 vsev_float16xm8

#define vlseg2e16_v_f16m1x2 vlseg2ev_float16x2xm1
#define vsseg2e16_v_f16m1x2 vsseg2ev_float16x2xm1

#define vlseg4e16_v_f16m1x4 vlseg4ev_float16x4xm1
#define vsseg4e16_v_f16m1x4 vsseg4ev_float16x4xm1

#define vlseg8e16_v_f16m1x8 vlseg8ev_float16x8xm1
#define vsseg8e16_v_f16m1x8 vsseg8ev_float16x8xm1

#define vlseg4e16_v_f16m2x4 vlseg4ev_float16x4xm2
#define vsseg4e16_v_f16m2x4 vsseg4ev_float16x4xm2

#define vlseg4e16_v_f16m1x4 vlseg4ev_float16x4xm1
#define vsseg4e16_v_f16m1x4 vsseg4ev_float16x4xm1
#define vset_f16m1x2 vseg_element_set_float16x2xm1
#define vset_f16m1x4 vseg_element_set_float16x4xm1
#define vset_f16m1x8 vseg_element_set_float16x8xm1
#define vget_f16m1x2_f16m1 vseg_element_get_float16x2xm1
#define vget_f16m1x4_f16m1 vseg_element_get_float16x4xm1
#define vget_f16m1x8_f16m1 vseg_element_get_float16x8xm1

static inline vfloat16m1x2_t vcreate_f16m1x2(vfloat16m1_t v0, vfloat16m1_t v1)
{
vfloat16m1x2_t p;
p = vset_f16m1x2(p, 0, v0);
p = vset_f16m1x2(p, 1, v1);
return p;
}

static inline vfloat16m1x4_t vcreate_f16m1x4(vfloat16m1_t v0, vfloat16m1_t v1, vfloat16m1_t v2, vfloat16m1_t v3)
{
vfloat16m1x4_t p;
p = vset_f16m1x4(p, 0, v0);
p = vset_f16m1x4(p, 1, v1);
p = vset_f16m1x4(p, 2, v2);
p = vset_f16m1x4(p, 3, v3);
return p;
}

static inline vfloat16m1x8_t vcreate_f16m1x8(vfloat16m1_t v0, vfloat16m1_t v1, vfloat16m1_t v2, vfloat16m1_t v3, vfloat16m1_t v4, vfloat16m1_t v5, vfloat16m1_t v6, vfloat16m1_t v7)
{
vfloat16m1x8_t p;
p = vset_f16m1x8(p, 0, v0);
p = vset_f16m1x8(p, 1, v1);
p = vset_f16m1x8(p, 2, v2);
p = vset_f16m1x8(p, 3, v3);
p = vset_f16m1x8(p, 4, v4);
p = vset_f16m1x8(p, 5, v5);
p = vset_f16m1x8(p, 6, v6);
p = vset_f16m1x8(p, 7, v7);
return p;
}

#define vfmv_s_f_f16m1(a, b, vl) vfmvsf_float16xm1(b, vl)
#define vfmv_s_f_f16m2(a, b, vl) vfmvsf_float16xm2(b, vl)


Loading…
Cancel
Save