Browse Source

discard riscv weight memory (#3874)

* discard riscv innerproduct weight
* drop riscv conv convdw weight
* drop riscv deconv deconvdw weight
tags/20220701
nihui GitHub 4 years ago
parent
commit
40a69a2dd3
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 560 additions and 266 deletions
  1. +30
    -9
      src/layer/arm/convolutiondepthwise_arm.cpp
  2. +5
    -0
      src/layer/mips/convolutiondepthwise_mips.cpp
  3. +121
    -64
      src/layer/riscv/convolution_riscv.cpp
  4. +1
    -3
      src/layer/riscv/convolution_riscv.h
  5. +50
    -21
      src/layer/riscv/convolutiondepthwise_riscv.cpp
  6. +1
    -3
      src/layer/riscv/convolutiondepthwise_riscv.h
  7. +26
    -16
      src/layer/riscv/deconvolution_riscv.cpp
  8. +1
    -3
      src/layer/riscv/deconvolution_riscv.h
  9. +31
    -11
      src/layer/riscv/deconvolutiondepthwise_riscv.cpp
  10. +1
    -3
      src/layer/riscv/deconvolutiondepthwise_riscv.h
  11. +279
    -123
      src/layer/riscv/innerproduct_riscv.cpp
  12. +2
    -1
      src/layer/riscv/innerproduct_riscv.h
  13. +12
    -9
      src/layer/x86/convolutiondepthwise_x86.cpp

+ 30
- 9
src/layer/arm/convolutiondepthwise_arm.cpp View File

@@ -130,6 +130,11 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt)

ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);

if (opt.lightmode)
{
weight_data.release();
}

return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
@@ -153,6 +158,11 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt)
ncnn::cast_float32_to_bfloat16(weight_data, weight_data_tm, opt);
}

if (opt.lightmode)
{
weight_data.release();
}

return 0;
}
#endif // NCNN_BF16
@@ -163,8 +173,6 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt)
{
Mat weight_data_r2 = weight_data.reshape(maxk, group);
convert_packing(weight_data_r2, weight_data_tm, 4, opt);

return 0;
}
#endif // __ARM_NEON

@@ -173,24 +181,32 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt)
if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
weight_data_tm = weight_data;
return 0;
}
if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
weight_data_tm = weight_data;
return 0;
}
if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
weight_data_tm = weight_data;
return 0;
}
if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
weight_data_tm = weight_data;
return 0;
}
else
{
// group convolution
create_group_ops(opt);
}
}

if (opt.lightmode)
{
weight_data.release();
}

return 0;
}

// group convolution
@@ -1591,6 +1607,11 @@ int ConvolutionDepthWise_arm::create_pipeline_int8_arm(const Option& opt)
weight_data_tm = weight_data;
}

if (opt.lightmode)
{
weight_data.release();
}

return 0;
}



+ 5
- 0
src/layer/mips/convolutiondepthwise_mips.cpp View File

@@ -83,6 +83,11 @@ int ConvolutionDepthWise_mips::create_pipeline(const Option& opt)
weight_data_tm = weight_data;
}

if (opt.lightmode)
{
weight_data.release();
}

return 0;
}



+ 121
- 64
src/layer/riscv/convolution_riscv.cpp View File

@@ -88,7 +88,7 @@ Convolution_riscv::Convolution_riscv()
activation = 0;
}

static void convolution_transform_kernel_packed_rvv(const Mat& weight_data, Mat& weight_data_packed, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
static void convolution_transform_kernel_packed_rvv(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
{
const int maxk = kernel_w * kernel_h;

@@ -97,11 +97,11 @@ static void convolution_transform_kernel_packed_rvv(const Mat& weight_data, Mat&
{
Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);

weight_data_packed.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack);
weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack);

for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
{
float* g00 = weight_data_packed.channel(q / out_elempack);
float* g00 = weight_data_tm.channel(q / out_elempack);

for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
{
@@ -131,6 +131,14 @@ int Convolution_riscv::create_pipeline(const Option& opt)

activation = create_activation_layer(activation_type, activation_params, opt);

#if NCNN_INT8
if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
{
// TODO implement int8
return 0;
}
#endif

#if __riscv_vector && __riscv_zfh
if (opt.use_fp16_storage)
{
@@ -166,14 +174,14 @@ int Convolution_riscv::create_pipeline(const Option& opt)
}
else
{
convolution_transform_kernel_packed_rvv(weight_data, weight_data_packed, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
convolution_transform_kernel_packed_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
}
}

// pack1ton
if (elempack == 1 && out_elempack == packn)
{
convolution_transform_kernel_packed_rvv(weight_data, weight_data_packed, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
convolution_transform_kernel_packed_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
}

// packnto1
@@ -181,19 +189,19 @@ int Convolution_riscv::create_pipeline(const Option& opt)
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
convolution_im2col_sgemm_transform_kernel_packnto1_rvv(weight_data, weight_data_packed, num_input, num_output, kernel_w, kernel_h);
convolution_im2col_sgemm_transform_kernel_packnto1_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
convolution_im2col_sgemm_transform_kernel_packnto1_rvv(weight_data, weight_data_packed, num_input, num_output, kernel_w, kernel_h);
convolution_im2col_sgemm_transform_kernel_packnto1_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
}
else if (opt.use_sgemm_convolution)
{
convolution_im2col_sgemm_transform_kernel_packnto1_rvv(weight_data, weight_data_packed, num_input, num_output, kernel_w, kernel_h);
convolution_im2col_sgemm_transform_kernel_packnto1_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
}
else
{
convolution_transform_kernel_packed_rvv(weight_data, weight_data_packed, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
convolution_transform_kernel_packed_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
}
}
#endif // __riscv_vector
@@ -203,7 +211,7 @@ int Convolution_riscv::create_pipeline(const Option& opt)
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
convolution_im2col_sgemm_transform_kernel_rvv(weight_data, weight_data_packed, num_input, num_output, kernel_w, kernel_h);
convolution_im2col_sgemm_transform_kernel_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
}
else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
@@ -218,10 +226,19 @@ int Convolution_riscv::create_pipeline(const Option& opt)
}
else if (opt.use_sgemm_convolution)
{
convolution_im2col_sgemm_transform_kernel_rvv(weight_data, weight_data_packed, num_input, num_output, kernel_w, kernel_h);
convolution_im2col_sgemm_transform_kernel_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
}
else
{
weight_data_tm = weight_data;
}
}

if (opt.lightmode)
{
weight_data.release();
}

return 0;
}

@@ -240,7 +257,7 @@ int Convolution_riscv::destroy_pipeline(const Option& opt)
int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
#if NCNN_INT8
if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
if (opt.use_int8_inference && int8_scale_term)
{
Mat bottom_blob_unpacked = bottom_blob;
if (bottom_blob.elempack != 1)
@@ -266,9 +283,44 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
}
#endif

if (bottom_blob.dims != 3)
// flattened blob, implement as InnerProduct
if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
{
return Convolution::forward(bottom_blob, top_blob, opt);
Mat bottom_blob_3d;
if (bottom_blob.elemsize % 16 == 0)
{
bottom_blob_3d = bottom_blob;
bottom_blob_3d.dims = 3;
bottom_blob_3d.w = 1;
bottom_blob_3d.h = 1;
bottom_blob_3d.c = bottom_blob.w;
bottom_blob_3d.cstep = 1;
}
else
{
bottom_blob_3d = bottom_blob.reshape(1, 1, bottom_blob.w, opt.workspace_allocator);
}

Mat top_blob_3d;
int ret = forward(bottom_blob_3d, top_blob_3d, opt);
if (ret != 0)
return ret;

if (top_blob_3d.elemsize % 16 == 0)
{
top_blob = top_blob_3d;
top_blob.dims = 1;
top_blob.w = top_blob_3d.c;
top_blob.h = 1;
top_blob.c = 1;
bottom_blob_3d.cstep = top_blob_3d.c;
}
else
{
top_blob = top_blob_3d.reshape(top_blob_3d.c, opt.blob_allocator);
}

return 0;
}

int elembits = bottom_blob.elembits();
@@ -328,7 +380,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv1x1s1_sgemm_packn_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
conv1x1s1_sgemm_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

if (activation)
{
@@ -337,7 +389,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
conv1x1s2_sgemm_packn_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
conv1x1s2_sgemm_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

if (activation)
{
@@ -363,7 +415,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
}
else if (opt.use_sgemm_convolution)
{
convolution_im2col_sgemm_packn_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
convolution_im2col_sgemm_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);

if (activation)
{
@@ -372,7 +424,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
}
else
{
convolution_packn_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
convolution_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
}
}

@@ -380,7 +432,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv1x1s1_sgemm_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
conv1x1s1_sgemm_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

if (activation)
{
@@ -389,7 +441,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
}
else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv3x3s1_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
conv3x3s1_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

if (activation)
{
@@ -398,7 +450,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
}
else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
conv3x3s2_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
conv3x3s2_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

if (activation)
{
@@ -407,7 +459,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
}
else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
conv7x7s2_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
conv7x7s2_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

if (activation)
{
@@ -416,7 +468,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
}
else if (opt.use_sgemm_convolution)
{
convolution_im2col_sgemm_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
convolution_im2col_sgemm_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);

if (activation)
{
@@ -425,7 +477,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
}
else
{
convolution_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
convolution_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
}
}

@@ -433,7 +485,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv1x1s1_sgemm_packnto1_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
conv1x1s1_sgemm_packnto1_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

if (activation)
{
@@ -442,7 +494,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
conv1x1s2_sgemm_packnto1_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
conv1x1s2_sgemm_packnto1_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

if (activation)
{
@@ -451,7 +503,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
}
else if (opt.use_sgemm_convolution)
{
convolution_im2col_sgemm_packnto1_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
convolution_im2col_sgemm_packnto1_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);

if (activation)
{
@@ -460,7 +512,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
}
else
{
convolution_packnto1_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
convolution_packnto1_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
}
}
#endif // __riscv_vector
@@ -469,7 +521,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv1x1s1_sgemm_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
conv1x1s1_sgemm_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

if (activation)
{
@@ -494,7 +546,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
}
else if (opt.use_sgemm_convolution)
{
convolution_im2col_sgemm_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
convolution_im2col_sgemm_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);

if (activation)
{
@@ -541,7 +593,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
sum = bias_data[p];
}

const float* kptr = (const float*)weight_data + maxk * channels * p;
const float* kptr = (const float*)weight_data_tm + maxk * channels * p;

// channels
for (int q = 0; q < channels; q++)
@@ -666,7 +718,7 @@ int Convolution_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector
}

#if __riscv_vector && __riscv_zfh
static void convolution_transform_kernel_packed_fp16s_rvv(const Mat& weight_data, Mat& weight_data_fp16, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
static void convolution_transform_kernel_packed_fp16s_rvv(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
{
const int maxk = kernel_w * kernel_h;

@@ -675,11 +727,11 @@ static void convolution_transform_kernel_packed_fp16s_rvv(const Mat& weight_data
{
Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);

weight_data_fp16.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)2u * elempack * out_elempack, elempack * out_elempack);
weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)2u * elempack * out_elempack, elempack * out_elempack);

for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
{
__fp16* g00 = weight_data_fp16.channel(q / out_elempack);
__fp16* g00 = weight_data_tm.channel(q / out_elempack);

for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
{
@@ -728,14 +780,14 @@ int Convolution_riscv::create_pipeline_fp16s(const Option& opt)
}
else
{
convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_fp16, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
}
}

// pack1ton
if (elempack == 1 && out_elempack == packn)
{
convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_fp16, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
}

// packnto1
@@ -743,19 +795,19 @@ int Convolution_riscv::create_pipeline_fp16s(const Option& opt)
{
if (opt.use_fp16_arithmetic && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(weight_data, weight_data_fp16, num_input, num_output, kernel_w, kernel_h);
convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
}
else if (opt.use_fp16_arithmetic && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(weight_data, weight_data_fp16, num_input, num_output, kernel_w, kernel_h);
convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
}
else if (opt.use_fp16_arithmetic && opt.use_sgemm_convolution)
{
convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(weight_data, weight_data_fp16, num_input, num_output, kernel_w, kernel_h);
convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
}
else
{
convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_fp16, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
}
}

@@ -764,15 +816,15 @@ int Convolution_riscv::create_pipeline_fp16s(const Option& opt)
{
if (opt.use_fp16_arithmetic && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
convolution_im2col_sgemm_transform_kernel_fp16sa_rvv(weight_data, weight_data_fp16, num_input, num_output, kernel_w, kernel_h);
convolution_im2col_sgemm_transform_kernel_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
}
else if (opt.use_fp16_arithmetic && opt.use_sgemm_convolution)
{
convolution_im2col_sgemm_transform_kernel_fp16sa_rvv(weight_data, weight_data_fp16, num_input, num_output, kernel_w, kernel_h);
convolution_im2col_sgemm_transform_kernel_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
}
else
{
convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_fp16, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
}
}

@@ -781,6 +833,11 @@ int Convolution_riscv::create_pipeline_fp16s(const Option& opt)
ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
}

if (opt.lightmode)
{
weight_data.release();
}

return 0;
}

@@ -818,28 +875,28 @@ int Convolution_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, cons
if (elempack == packn && out_elempack == packn)
{
{
convolution_packn_fp16s_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
convolution_packn_fp16s_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
}
}

if (elempack == 1 && out_elempack == packn)
{
{
convolution_pack1ton_fp16s_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
convolution_pack1ton_fp16s_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
}
}

if (elempack == packn && out_elempack == 1)
{
{
convolution_packnto1_fp16s_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
convolution_packnto1_fp16s_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
}
}

if (elempack == 1 && out_elempack == 1)
{
{
convolution_fp16s(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
convolution_fp16s(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
}
}

@@ -884,7 +941,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv1x1s1_sgemm_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt);
conv1x1s1_sgemm_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

if (activation)
{
@@ -893,7 +950,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
conv1x1s2_sgemm_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt);
conv1x1s2_sgemm_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

if (activation)
{
@@ -919,7 +976,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
}
else if (opt.use_sgemm_convolution)
{
convolution_im2col_sgemm_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
convolution_im2col_sgemm_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);

if (activation)
{
@@ -928,7 +985,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
}
else
{
convolution_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
convolution_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
}
}

@@ -936,7 +993,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv1x1s1_sgemm_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt);
conv1x1s1_sgemm_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

if (activation)
{
@@ -945,7 +1002,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
}
else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv3x3s1_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt);
conv3x3s1_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

if (activation)
{
@@ -954,7 +1011,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
}
else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
conv3x3s2_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt);
conv3x3s2_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

if (activation)
{
@@ -963,7 +1020,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
}
else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
conv7x7s2_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt);
conv7x7s2_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

if (activation)
{
@@ -972,7 +1029,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
}
else if (opt.use_sgemm_convolution)
{
convolution_im2col_sgemm_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
convolution_im2col_sgemm_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);

if (activation)
{
@@ -981,7 +1038,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
}
else
{
convolution_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
convolution_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
}
}

@@ -989,7 +1046,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv1x1s1_sgemm_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt);
conv1x1s1_sgemm_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

if (activation)
{
@@ -998,7 +1055,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
conv1x1s2_sgemm_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt);
conv1x1s2_sgemm_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

if (activation)
{
@@ -1007,7 +1064,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
}
else if (opt.use_sgemm_convolution)
{
convolution_im2col_sgemm_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
convolution_im2col_sgemm_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);

if (activation)
{
@@ -1016,7 +1073,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
}
else
{
convolution_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
convolution_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
}
}

@@ -1024,7 +1081,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv1x1s1_sgemm_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt);
conv1x1s1_sgemm_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

if (activation)
{
@@ -1033,7 +1090,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
}
else if (opt.use_sgemm_convolution)
{
convolution_im2col_sgemm_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
convolution_im2col_sgemm_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);

if (activation)
{
@@ -1042,7 +1099,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
}
else
{
convolution_fp16s(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
convolution_fp16s(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
}
}



+ 1
- 3
src/layer/riscv/convolution_riscv.h View File

@@ -41,14 +41,12 @@ protected:
public:
Layer* activation;

// packn
Mat weight_data_packed;
Mat weight_data_tm;
Mat weight_winograd23_data;
Mat weight_winograd43_data;
Mat weight_winograd63_data;

// fp16
Mat weight_data_fp16;
Mat bias_data_fp16;
};



+ 50
- 21
src/layer/riscv/convolutiondepthwise_riscv.cpp View File

@@ -61,6 +61,14 @@ int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt)

activation = create_activation_layer(activation_type, activation_params, opt);

#if NCNN_INT8
if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
{
// TODO implement int8
return 0;
}
#endif

#if __riscv_vector && __riscv_zfh
if (opt.use_fp16_storage)
{
@@ -91,12 +99,18 @@ int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt)
if (elempack == packn)
{
Mat weight_data_r2 = weight_data.reshape(maxk, group);
convert_packing(weight_data_r2, weight_data_packed, packn, opt);
convert_packing(weight_data_r2, weight_data_tm, packn, opt);
}
#endif // __riscv_vector

if (elempack == 1)
{
weight_data_tm = weight_data;
}

if (opt.lightmode)
{
weight_data.release();
}

return 0;
@@ -105,6 +119,11 @@ int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt)
// group convolution
create_group_ops(opt);

if (opt.lightmode)
{
weight_data.release();
}

return 0;
}

@@ -126,7 +145,7 @@ int ConvolutionDepthWise_riscv::create_group_ops(const Option& opt)

for (int g = 0; g < group; g++)
{
Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g);
Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g).clone();
Mat bias_data_g;
if (bias_term)
bias_data_g = bias_data.range(num_output_g * g, num_output_g);
@@ -227,7 +246,7 @@ int ConvolutionDepthWise_riscv::destroy_pipeline(const Option& opt)
int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
#if NCNN_INT8
if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
if (opt.use_int8_inference && int8_scale_term)
{
Mat bottom_blob_unpacked = bottom_blob;
if (bottom_blob.elempack != 1)
@@ -310,7 +329,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c
{
if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
convdw3x3s1_packn_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
convdw3x3s1_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

if (activation)
{
@@ -319,7 +338,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c
}
else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
convdw3x3s2_packn_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
convdw3x3s2_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

if (activation)
{
@@ -328,7 +347,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c
}
else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
convdw5x5s1_packn_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
convdw5x5s1_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

if (activation)
{
@@ -337,7 +356,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c
}
else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
convdw5x5s2_packn_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
convdw5x5s2_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

if (activation)
{
@@ -371,7 +390,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c
for (int g = 0; g < channels; g++)
{
float* outptr = top_blob.channel(g);
const float* kptr = (const float*)weight_data_packed + maxk * g * packn;
const float* kptr = (const float*)weight_data_tm + maxk * g * packn;
const Mat m = bottom_blob_bordered.channel(g);

for (int i = 0; i < outh; i++)
@@ -410,7 +429,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c
{
if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
convdw3x3s1_rvv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
convdw3x3s1_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

if (activation)
{
@@ -419,7 +438,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c
}
else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
convdw3x3s2_rvv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
convdw3x3s2_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

if (activation)
{
@@ -453,7 +472,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c
for (int g = 0; g < group; g++)
{
float* outptr = top_blob.channel(g);
const float* kptr = (const float*)weight_data + maxk * g;
const float* kptr = (const float*)weight_data_tm + maxk * g;
const Mat m = bottom_blob_bordered.channel(g);

for (int i = 0; i < outh; i++)
@@ -663,22 +682,32 @@ int ConvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt)
Mat weight_data_r2_packed;
convert_packing(weight_data_r2, weight_data_r2_packed, packn, opt);

ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_fp16, opt);
ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_tm, opt);
}

if (elempack == 1)
{
ncnn::cast_float32_to_float16(weight_data, weight_data_fp16, opt);
ncnn::cast_float32_to_float16(weight_data, weight_data_tm, opt);
}

ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);

if (opt.lightmode)
{
weight_data.release();
}

return 0;
}

// group convolution
create_group_ops(opt);

if (opt.lightmode)
{
weight_data.release();
}

return 0;
}

@@ -744,7 +773,7 @@ int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_b
for (int g = 0; g < channels; g++)
{
__fp16* outptr = top_blob.channel(g);
const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * packn;
const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * packn;
const Mat m = bottom_blob_bordered.channel(g);

for (int i = 0; i < outh; i++)
@@ -806,7 +835,7 @@ int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_b
for (int g = 0; g < group; g++)
{
__fp16* outptr = top_blob.channel(g);
const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g;
const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g;
const Mat m = bottom_blob_bordered.channel(g);

for (int i = 0; i < outh; i++)
@@ -930,7 +959,7 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_
{
if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
convdw3x3s1_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt);
convdw3x3s1_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

if (activation)
{
@@ -939,7 +968,7 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_
}
else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
convdw3x3s2_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt);
convdw3x3s2_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

if (activation)
{
@@ -948,7 +977,7 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_
}
else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
convdw5x5s1_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt);
convdw5x5s1_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

if (activation)
{
@@ -957,7 +986,7 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_
}
else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
convdw5x5s2_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt);
convdw5x5s2_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

if (activation)
{
@@ -991,7 +1020,7 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_
for (int g = 0; g < channels; g++)
{
__fp16* outptr = top_blob.channel(g);
const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * packn;
const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * packn;
const Mat m = bottom_blob_bordered.channel(g);

for (int i = 0; i < outh; i++)
@@ -1053,7 +1082,7 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_
for (int g = 0; g < group; g++)
{
__fp16* outptr = top_blob.channel(g);
const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g;
const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g;
const Mat m = bottom_blob_bordered.channel(g);

for (int i = 0; i < outh; i++)


+ 1
- 3
src/layer/riscv/convolutiondepthwise_riscv.h View File

@@ -43,11 +43,9 @@ public:
Layer* activation;
std::vector<ncnn::Layer*> group_ops;

// packing
Mat weight_data_packed;
Mat weight_data_tm;

// fp16
Mat weight_data_fp16;
Mat bias_data_fp16;
};



+ 26
- 16
src/layer/riscv/deconvolution_riscv.cpp View File

@@ -100,11 +100,11 @@ int Deconvolution_riscv::create_pipeline(const Option& opt)
{
Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output);

weight_data_packed.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack);
weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack);

for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
{
float* g00 = weight_data_packed.channel(q / out_elempack);
float* g00 = weight_data_tm.channel(q / out_elempack);

for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
{
@@ -148,6 +148,11 @@ int Deconvolution_riscv::create_pipeline(const Option& opt)
{
}

if (opt.lightmode)
{
weight_data.release();
}

return 0;
}

@@ -218,21 +223,21 @@ int Deconvolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Op
if (elempack == packn && out_elempack == packn)
{
{
deconvolution_packn_rvv(bottom_blob, top_blob_bordered, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
deconvolution_packn_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
}
}

if (elempack == 1 && out_elempack == packn)
{
{
deconvolution_pack1ton_rvv(bottom_blob, top_blob_bordered, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
deconvolution_pack1ton_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
}
}

if (elempack == packn && out_elempack == 1)
{
{
deconvolution_packnto1_rvv(bottom_blob, top_blob_bordered, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
deconvolution_packnto1_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
}
}
#endif // __riscv_vector
@@ -257,7 +262,7 @@ int Deconvolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Op
sum = bias_data[p];
}

const float* kptr = (const float*)weight_data_packed.channel(p);
const float* kptr = (const float*)weight_data_tm.channel(p);

// channels
for (int q = 0; q < channels; q++)
@@ -356,11 +361,11 @@ int Deconvolution_riscv::create_pipeline_fp16s(const Option& opt)
{
Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output);

weight_data_fp16.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)2u * elempack * out_elempack, elempack * out_elempack);
weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)2u * elempack * out_elempack, elempack * out_elempack);

for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
{
__fp16* g00 = weight_data_fp16.channel(q / out_elempack);
__fp16* g00 = weight_data_tm.channel(q / out_elempack);

for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
{
@@ -404,6 +409,11 @@ int Deconvolution_riscv::create_pipeline_fp16s(const Option& opt)

ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);

if (opt.lightmode)
{
weight_data.release();
}

return 0;
}

@@ -446,28 +456,28 @@ int Deconvolution_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, co
if (elempack == packn && out_elempack == packn)
{
{
deconvolution_packn_fp16s_rvv(bottom_blob, top_blob_bordered, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
deconvolution_packn_fp16s_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
}
}

if (elempack == 1 && out_elempack == packn)
{
{
deconvolution_pack1ton_fp16s_rvv(bottom_blob, top_blob_bordered, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
deconvolution_pack1ton_fp16s_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
}
}

if (elempack == packn && out_elempack == 1)
{
{
deconvolution_packnto1_fp16s_rvv(bottom_blob, top_blob_bordered, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
deconvolution_packnto1_fp16s_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
}
}

if (elempack == 1 && out_elempack == 1)
{
{
deconvolution_fp16s(bottom_blob, top_blob_bordered, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
deconvolution_fp16s(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
}
}

@@ -517,28 +527,28 @@ int Deconvolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
if (elempack == packn && out_elempack == packn)
{
{
deconvolution_packn_fp16sa_rvv(bottom_blob, top_blob_bordered, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
deconvolution_packn_fp16sa_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
}
}

if (elempack == 1 && out_elempack == packn)
{
{
deconvolution_pack1ton_fp16sa_rvv(bottom_blob, top_blob_bordered, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
deconvolution_pack1ton_fp16sa_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
}
}

if (elempack == packn && out_elempack == 1)
{
{
deconvolution_packnto1_fp16sa_rvv(bottom_blob, top_blob_bordered, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
deconvolution_packnto1_fp16sa_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
}
}

if (elempack == 1 && out_elempack == 1)
{
{
deconvolution_fp16s(bottom_blob, top_blob_bordered, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
deconvolution_fp16s(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
}
}



+ 1
- 3
src/layer/riscv/deconvolution_riscv.h View File

@@ -37,11 +37,9 @@ protected:
#endif

public:
// packn
Mat weight_data_packed;
Mat weight_data_tm;

// fp16
Mat weight_data_fp16;
Mat bias_data_fp16;
};



+ 31
- 11
src/layer/riscv/deconvolutiondepthwise_riscv.cpp View File

@@ -88,13 +88,18 @@ int DeconvolutionDepthWise_riscv::create_pipeline(const Option& opt)
if (elempack == packn)
{
Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group);
convert_packing(weight_data_r2, weight_data_packed, packn, opt);
convert_packing(weight_data_r2, weight_data_tm, packn, opt);
}
#endif // __riscv_vector

if (elempack == 1)
{
weight_data_packed = weight_data_transposed;
weight_data_tm = weight_data_transposed;
}

if (opt.lightmode)
{
weight_data.release();
}

return 0;
@@ -103,6 +108,11 @@ int DeconvolutionDepthWise_riscv::create_pipeline(const Option& opt)
// group convolution
create_group_ops(opt);

if (opt.lightmode)
{
weight_data.release();
}

return 0;
}

@@ -124,7 +134,7 @@ int DeconvolutionDepthWise_riscv::create_group_ops(const Option& opt)

for (int g = 0; g < group; g++)
{
Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g);
Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g).clone();
Mat bias_data_g;
if (bias_term)
bias_data_g = bias_data.range(num_output_g * g, num_output_g);
@@ -256,7 +266,7 @@ int DeconvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob,
for (int g = 0; g < channels; g++)
{
float* outptr = top_blob_bordered.channel(g);
const float* kptr = (const float*)weight_data_packed + maxk * g * packn;
const float* kptr = (const float*)weight_data_tm + maxk * g * packn;
const Mat m = bottom_blob.channel(g);

for (int i = 0; i < outh; i++)
@@ -318,7 +328,7 @@ int DeconvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob,
for (int g = 0; g < channels; g++)
{
float* outptr = top_blob_bordered.channel(g);
const float* kptr = (const float*)weight_data_packed + maxk * g;
const float* kptr = (const float*)weight_data_tm + maxk * g;
const Mat m = bottom_blob.channel(g);

for (int i = 0; i < outh; i++)
@@ -480,22 +490,32 @@ int DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt)
Mat weight_data_r2_packed;
convert_packing(weight_data_r2, weight_data_r2_packed, packn, opt);

ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_fp16, opt);
ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_tm, opt);
}

if (elempack == 1)
{
ncnn::cast_float32_to_float16(weight_data_transposed, weight_data_fp16, opt);
ncnn::cast_float32_to_float16(weight_data_transposed, weight_data_tm, opt);
}

ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);

if (opt.lightmode)
{
weight_data.release();
}

return 0;
}

// group convolution
create_group_ops(opt);

if (opt.lightmode)
{
weight_data.release();
}

return 0;
}

@@ -543,7 +563,7 @@ int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top
for (int g = 0; g < channels; g++)
{
__fp16* outptr = top_blob_bordered.channel(g);
const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * packn;
const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * packn;
const Mat m = bottom_blob.channel(g);

for (int i = 0; i < outh; i++)
@@ -605,7 +625,7 @@ int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top
for (int g = 0; g < channels; g++)
{
__fp16* outptr = top_blob_bordered.channel(g);
const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g;
const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g;
const Mat m = bottom_blob.channel(g);

for (int i = 0; i < outh; i++)
@@ -764,7 +784,7 @@ int DeconvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& to
for (int g = 0; g < channels; g++)
{
__fp16* outptr = top_blob_bordered.channel(g);
const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * packn;
const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * packn;
const Mat m = bottom_blob.channel(g);

for (int i = 0; i < outh; i++)
@@ -826,7 +846,7 @@ int DeconvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& to
for (int g = 0; g < channels; g++)
{
__fp16* outptr = top_blob_bordered.channel(g);
const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g;
const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g;
const Mat m = bottom_blob.channel(g);

for (int i = 0; i < outh; i++)


+ 1
- 3
src/layer/riscv/deconvolutiondepthwise_riscv.h View File

@@ -40,11 +40,9 @@ protected:
public:
std::vector<ncnn::Layer*> group_ops;

// packing
Mat weight_data_packed;
Mat weight_data_tm;

// fp16
Mat weight_data_fp16;
Mat bias_data_fp16;
};



+ 279
- 123
src/layer/riscv/innerproduct_riscv.cpp View File

@@ -43,8 +43,6 @@ InnerProduct_riscv::InnerProduct_riscv()

int InnerProduct_riscv::create_pipeline(const Option& opt)
{
#if __riscv_vector
if (opt.use_packing_layout || opt.use_int8_inference)
{
flatten = ncnn::create_layer(ncnn::LayerType::Flatten);

@@ -54,7 +52,14 @@ int InnerProduct_riscv::create_pipeline(const Option& opt)

flatten->create_pipeline(opt);
}
#endif // __riscv_vector

#if NCNN_INT8
if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
{
// TODO implement int8
return 0;
}
#endif

#if __riscv_vector && __riscv_zfh
if (opt.use_fp16_storage)
@@ -63,6 +68,53 @@ int InnerProduct_riscv::create_pipeline(const Option& opt)
}
#endif

int out_elempack = 1;

#if __riscv_vector
const int packn = csrr_vlenb() / 4;

const int num_input = weight_data_size / num_output;

if (opt.use_packing_layout)
{
out_elempack = num_output % packn == 0 ? packn : 1;
}

if (out_elempack == packn)
{
// src = inch-outch
// dst = packn-inch-outch/packn
{
Mat weight_data_r2 = weight_data.reshape(num_input, num_output);

weight_data_tm.create(num_input, num_output / packn, (size_t)4u * packn, packn);

for (int q = 0; q + (packn - 1) < num_output; q += packn)
{
float* g0 = weight_data_tm.row(q / packn);

for (int p = 0; p < num_input; p++)
{
for (int j = 0; j < packn; j++)
{
*g0++ = weight_data_r2.row(q + j)[p];
}
}
}
}
}
#endif // __riscv_vector

if (out_elempack == 1)
{
weight_data_tm = weight_data;
}

if (opt.lightmode)
{
weight_data.release();
}

return 0;
}

@@ -81,7 +133,7 @@ int InnerProduct_riscv::destroy_pipeline(const Option& opt)
int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
#if NCNN_INT8
if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
if (opt.use_int8_inference && int8_scale_term)
{
Mat bottom_blob_unpacked = bottom_blob;
if (bottom_blob.elempack != 1)
@@ -136,20 +188,104 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt
if (top_blob.empty())
return -100;

int num_output_elempack = 1;
#if __riscv_vector
if (opt.use_packing_layout)
{
num_output_elempack = num_output % packn == 0 ? packn : 1;
}
#endif

#pragma omp parallel for num_threads(opt.num_threads)
for (int j = 0; j < h; j++)
{
#if __riscv_vector
if (elempack == packn)
if (elempack == packn && num_output_elempack == packn)
{
const word_type vl = vsetvl_e32m1(packn);

float* outptr = top_blob.row(j);

for (int p = 0; p < num_output / num_output_elempack; p++)
{
for (int l = 0; l < packn; l++)
{
const float* kptr = weight_data_tm.row(p) + l;
const float* m = bottom_blob.row(j);

vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl);

if (bias_term)
{
_sum = vfmv_v_f_f32m1(bias_data[p * packn + l], vl);
}

int n = num_input;
while (n > 0)
{
vfloat32m1_t _val = vle32_v_f32m1(m, vl);
_sum = vfmacc_vf_f32m1(_sum, *kptr, _val, vl);

m += packn;
kptr += packn;
n -= 1;
}

_sum = activation_ps(_sum, activation_type, activation_params, vl);

vse32_v_f32m1(outptr, _sum, vl);
outptr += packn;
}
}
}

if (elempack == 1 && num_output_elempack == packn)
{
const word_type vl = vsetvl_e32m1(packn);

float* outptr = top_blob.row(j);

for (int p = 0; p < num_output / num_output_elempack; p++)
{
const float* kptr = weight_data_tm.row(p);
const float* m = bottom_blob.row(j);

vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl);

if (bias_term)
{
_sum = vle32_v_f32m1((const float*)bias_data + p * packn, vl);
}

int n = num_input;
while (n > 0)
{
vfloat32m1_t _w = vle32_v_f32m1(kptr, vl);
_sum = vfmacc_vf_f32m1(_sum, *m, _w, vl);

m += 1;
kptr += packn;
n -= 1;
}

_sum = activation_ps(_sum, activation_type, activation_params, vl);

vse32_v_f32m1(outptr, _sum, vl);
outptr += packn;
}
}

if (elempack == packn && num_output_elempack == 1)
{
const word_type vl = vsetvl_e32m1(packn);

float* outptr = top_blob.row(j);

for (int p = 0; p < num_output; p++)
{
const float* kptr = (const float*)weight_data + num_input * p;
const float* kptr = (const float*)weight_data_tm + num_input * p;
const float* m = bottom_blob.row(j);

const word_type vl = vsetvl_e32m1(packn);
vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl);

if (bias_term)
@@ -176,13 +312,13 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt
}
#endif // __riscv_vector

if (elempack == 1)
if (elempack == 1 && num_output_elempack == 1)
{
float* outptr = top_blob.row(j);

for (int p = 0; p < num_output; p++)
{
const float* kptr = (const float*)weight_data + num_input * p;
const float* kptr = (const float*)weight_data_tm + num_input * p;
const float* m = bottom_blob.row(j);

float sum = 0.f;
@@ -208,69 +344,93 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt
return 0;
}

int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;
int size = w * h;

#if __riscv_vector
if (elempack == packn)
// flatten
Mat bottom_blob_flattened = bottom_blob;
if (bottom_blob.dims != 1)
{
// flatten
Mat bottom_blob_flattened = bottom_blob;
if (bottom_blob.dims != 1)
{
Option opt_flatten = opt;
opt_flatten.blob_allocator = opt.workspace_allocator;
Option opt_flatten = opt;
opt_flatten.blob_allocator = opt.workspace_allocator;

flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten);
}
flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten);
}

// pack1
{
bottom_blob_flattened.w *= bottom_blob_flattened.elempack;
bottom_blob_flattened.cstep = bottom_blob_flattened.w;
bottom_blob_flattened.elemsize = 4u;
bottom_blob_flattened.elempack = 1;
}
size_t elemsize = bottom_blob_flattened.elemsize;
int elempack = bottom_blob_flattened.elempack;

return forward(bottom_blob_flattened, top_blob, opt);
int out_elempack = 1;
#if __riscv_vector
if (opt.use_packing_layout)
{
out_elempack = num_output % packn == 0 ? packn : 1;
}
#endif
size_t out_elemsize = elemsize / elempack * out_elempack;

top_blob.create(num_output, elemsize, opt.blob_allocator);
top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
if (top_blob.empty())
return -100;

const float* weight_data_ptr = weight_data;

#if __riscv_vector
int nn_num_output = num_output / packn;
int remain_num_output_start = nn_num_output * packn;

#pragma omp parallel for num_threads(opt.num_threads)
for (int pp = 0; pp < nn_num_output; pp++)
if (out_elempack == packn)
{
int p = pp * packn;
#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < num_output / out_elempack; p++)
{
const word_type vl = vsetvl_e32m1(packn);
vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl);

const word_type vl = vsetvl_e32m1(packn);
vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl);
if (bias_term)
{
_sum = vle32_v_f32m1((const float*)bias_data + p * packn, vl);
}

if (bias_term)
{
_sum = vle32_v_f32m1((const float*)bias_data + p, vl);
const float* kptr = weight_data_tm.row(p);

const float* sptr = bottom_blob_flattened;

int n = num_input;
while (n > 0)
{
vfloat32m1_t _w = vle32_v_f32m1(kptr, vl);
_sum = vfmacc_vf_f32m1(_sum, *sptr, _w, vl);

sptr += 1;
kptr += packn;
n -= 1;
}

_sum = activation_ps(_sum, activation_type, activation_params, vl);

float* outptr = top_blob;
vse32_v_f32m1(outptr + p * packn, _sum, vl);
}
}
#endif // __riscv_vector

const float* w = weight_data_ptr + num_input * p;
if (out_elempack == 1)
{
#if __riscv_vector
int nn_num_output = num_output / packn;
int remain_num_output_start = nn_num_output * packn;

// channels
for (int q = 0; q < channels; q++)
#pragma omp parallel for num_threads(opt.num_threads)
for (int pp = 0; pp < nn_num_output; pp++)
{
const float* m = bottom_blob.channel(q);
int p = pp * packn;

int n = size;
const word_type vl = vsetvl_e32m1(packn);
vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl);

if (bias_term)
{
_sum = vle32_v_f32m1((const float*)bias_data + p, vl);
}

const float* w = (const float*)weight_data_tm + num_input * p;

const float* m = bottom_blob_flattened;

int n = num_input;
while (n > 0)
{
vfloat32m1_t _w = vlse32_v_f32m1(w, num_input * sizeof(float), vl);
@@ -281,45 +441,41 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt
w += 1;
n -= 1;
}
}

_sum = activation_ps(_sum, activation_type, activation_params, vl);
_sum = activation_ps(_sum, activation_type, activation_params, vl);

vse32_v_f32m1((float*)top_blob + p, _sum, vl);
}
vse32_v_f32m1((float*)top_blob + p, _sum, vl);
}
#else // __riscv_vector
int nn_num_output = num_output / 4;
int remain_num_output_start = nn_num_output * 4;
int nn_num_output = num_output / 4;
int remain_num_output_start = nn_num_output * 4;

#pragma omp parallel for num_threads(opt.num_threads)
for (int pp = 0; pp < nn_num_output; pp++)
{
int p = pp * 4;
#pragma omp parallel for num_threads(opt.num_threads)
for (int pp = 0; pp < nn_num_output; pp++)
{
int p = pp * 4;

float sum0 = 0.f;
float sum1 = 0.f;
float sum2 = 0.f;
float sum3 = 0.f;
float sum0 = 0.f;
float sum1 = 0.f;
float sum2 = 0.f;
float sum3 = 0.f;

if (bias_term)
{
sum0 = bias_data[p];
sum1 = bias_data[p + 1];
sum2 = bias_data[p + 2];
sum3 = bias_data[p + 3];
}
if (bias_term)
{
sum0 = bias_data[p];
sum1 = bias_data[p + 1];
sum2 = bias_data[p + 2];
sum3 = bias_data[p + 3];
}

const float* w0 = weight_data_ptr + num_input * p;
const float* w1 = weight_data_ptr + num_input * (p + 1);
const float* w2 = weight_data_ptr + num_input * (p + 2);
const float* w3 = weight_data_ptr + num_input * (p + 3);
const float* w0 = (const float*)weight_data_tm + num_input * p;
const float* w1 = (const float*)weight_data_tm + num_input * (p + 1);
const float* w2 = (const float*)weight_data_tm + num_input * (p + 2);
const float* w3 = (const float*)weight_data_tm + num_input * (p + 3);

// channels
for (int q = 0; q < channels; q++)
{
const float* m = bottom_blob.channel(q);
const float* m = bottom_blob_flattened;

for (int i = 0; i < size; i++)
for (int i = 0; i < num_input; i++)
{
sum0 += *m * *w0;
sum1 += *m * *w1;
@@ -332,48 +488,43 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt
w2++;
w3++;
}
}

sum0 = activation_ss(sum0, activation_type, activation_params);
sum1 = activation_ss(sum1, activation_type, activation_params);
sum2 = activation_ss(sum2, activation_type, activation_params);
sum3 = activation_ss(sum3, activation_type, activation_params);
sum0 = activation_ss(sum0, activation_type, activation_params);
sum1 = activation_ss(sum1, activation_type, activation_params);
sum2 = activation_ss(sum2, activation_type, activation_params);
sum3 = activation_ss(sum3, activation_type, activation_params);

top_blob[p] = sum0;
top_blob[p + 1] = sum1;
top_blob[p + 2] = sum2;
top_blob[p + 3] = sum3;
}
top_blob[p] = sum0;
top_blob[p + 1] = sum1;
top_blob[p + 2] = sum2;
top_blob[p + 3] = sum3;
}
#endif // __riscv_vector

// num_output
#pragma omp parallel for num_threads(opt.num_threads)
for (int p = remain_num_output_start; p < num_output; p++)
{
float sum = 0.f;
#pragma omp parallel for num_threads(opt.num_threads)
for (int p = remain_num_output_start; p < num_output; p++)
{
float sum = 0.f;

if (bias_term)
sum = bias_data[p];
if (bias_term)
sum = bias_data[p];

const float* w = weight_data_ptr + num_input * p;
const float* w = (const float*)weight_data_tm + num_input * p;

// channels
for (int q = 0; q < channels; q++)
{
const float* m = bottom_blob.channel(q);
const float* m = bottom_blob_flattened;

for (int i = 0; i < size; i++)
for (int i = 0; i < num_input; i++)
{
sum += *m * *w;

m++;
w++;
}
}

sum = activation_ss(sum, activation_type, activation_params);
sum = activation_ss(sum, activation_type, activation_params);

top_blob[p] = sum;
top_blob[p] = sum;
}
}

return 0;
@@ -398,11 +549,11 @@ int InnerProduct_riscv::create_pipeline_fp16s(const Option& opt)
{
Mat weight_data_r2 = weight_data.reshape(num_input, num_output);

weight_data_fp16.create(num_input, num_output / out_elempack, (size_t)2u * out_elempack, out_elempack);
weight_data_tm.create(num_input, num_output / out_elempack, (size_t)2u * out_elempack, out_elempack);

for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
{
__fp16* g0 = weight_data_fp16.row<__fp16>(q / out_elempack);
__fp16* g0 = weight_data_tm.row<__fp16>(q / out_elempack);

for (int p = 0; p < num_input; p++)
{
@@ -416,6 +567,11 @@ int InnerProduct_riscv::create_pipeline_fp16s(const Option& opt)

ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);

if (opt.lightmode)
{
weight_data.release();
}

return 0;
}

@@ -451,7 +607,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con
{
for (int l = 0; l < packn; l++)
{
const __fp16* kptr = (const __fp16*)weight_data_fp16 + num_input * p * packn + l;
const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn + l;
const __fp16* m = bottom_blob.row<const __fp16>(j);

vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl);
@@ -489,7 +645,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con

for (int p = 0; p < num_output / num_output_elempack; p++)
{
const __fp16* kptr = (const __fp16*)weight_data_fp16 + num_input * p * packn;
const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn;
const __fp16* m = bottom_blob.row<const __fp16>(j);

vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl);
@@ -526,7 +682,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con

for (int p = 0; p < num_output; p++)
{
const __fp16* kptr = (const __fp16*)weight_data_fp16 + num_input * p;
const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p;
const __fp16* m = bottom_blob.row<const __fp16>(j);

vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl);
@@ -561,7 +717,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con

for (int p = 0; p < num_output; p++)
{
const __fp16* kptr = (const __fp16*)weight_data_fp16 + num_input * p;
const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p;
const __fp16* m = bottom_blob.row<const __fp16>(j);

float sum = 0.f;
@@ -621,7 +777,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con
_sum = vle32_v_f32m2((const float*)bias_data + p * packn, vl);
}

const __fp16* kptr = weight_data_fp16.row<const __fp16>(p);
const __fp16* kptr = weight_data_tm.row<const __fp16>(p);

const __fp16* sptr = bottom_blob_flattened;

@@ -655,7 +811,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con
if (bias_term)
sum = bias_data[p];

const __fp16* kptr = weight_data_fp16.row<__fp16>(p);
const __fp16* kptr = weight_data_tm.row<__fp16>(p);

const __fp16* sptr = bottom_blob_flattened;

@@ -713,7 +869,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co
{
for (int l = 0; l < packn; l++)
{
const __fp16* kptr = (const __fp16*)weight_data_fp16 + num_input * p * packn + l;
const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn + l;
const __fp16* m = bottom_blob.row<const __fp16>(j);

vfloat16m1_t _sum = vfmv_v_f_f16m1((__fp16)0.f, vl);
@@ -751,7 +907,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co

for (int p = 0; p < num_output / num_output_elempack; p++)
{
const __fp16* kptr = (const __fp16*)weight_data_fp16 + num_input * p * packn;
const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn;
const __fp16* m = bottom_blob.row<const __fp16>(j);

vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl);
@@ -788,7 +944,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co

for (int p = 0; p < num_output; p++)
{
const __fp16* kptr = (const __fp16*)weight_data_fp16 + num_input * p;
const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p;
const __fp16* m = bottom_blob.row<const __fp16>(j);

vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl);
@@ -823,7 +979,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co

for (int p = 0; p < num_output; p++)
{
const __fp16* kptr = (const __fp16*)weight_data_fp16 + num_input * p;
const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p;
const __fp16* m = bottom_blob.row<const __fp16>(j);

float sum = 0.f;
@@ -883,7 +1039,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co
_sum = vle16_v_f16m1((const __fp16*)bias_data_fp16 + p * packn, vl);
}

const __fp16* kptr = weight_data_fp16.row<const __fp16>(p);
const __fp16* kptr = weight_data_tm.row<const __fp16>(p);

const __fp16* sptr = bottom_blob_flattened;

@@ -917,7 +1073,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co
if (bias_term)
sum = bias_data[p];

const __fp16* kptr = weight_data_fp16.row<__fp16>(p);
const __fp16* kptr = weight_data_tm.row<__fp16>(p);

const __fp16* sptr = bottom_blob_flattened;



+ 2
- 1
src/layer/riscv/innerproduct_riscv.h View File

@@ -39,8 +39,9 @@ protected:
public:
Layer* flatten;

Mat weight_data_tm;

// fp16
Mat weight_data_fp16;
Mat bias_data_fp16;
};



+ 12
- 9
src/layer/x86/convolutiondepthwise_x86.cpp View File

@@ -96,8 +96,6 @@ int ConvolutionDepthWise_x86::create_pipeline(const Option& opt)
{
Mat weight_data_r2 = weight_data.reshape(maxk, group);
convert_packing(weight_data_r2, weight_data_tm, 16, opt);

return 0;
}
#endif // __AVX512F__

@@ -106,8 +104,6 @@ int ConvolutionDepthWise_x86::create_pipeline(const Option& opt)
{
Mat weight_data_r2 = weight_data.reshape(maxk, group);
convert_packing(weight_data_r2, weight_data_tm, 8, opt);

return 0;
}
#endif // __AVX__

@@ -116,8 +112,6 @@ int ConvolutionDepthWise_x86::create_pipeline(const Option& opt)
{
Mat weight_data_r2 = weight_data.reshape(maxk, group);
convert_packing(weight_data_r2, weight_data_tm, 4, opt);

return 0;
}
#endif // __SSE2__

@@ -127,14 +121,23 @@ int ConvolutionDepthWise_x86::create_pipeline(const Option& opt)
if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
weight_data_tm = weight_data;
return 0;
}
if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
weight_data_tm = weight_data;
return 0;
}
else
{
create_group_ops(opt);
}
}

if (opt.lightmode)
{
weight_data.release();
}

return 0;
}

// group convolution


Loading…
Cancel
Save