diff --git a/src/layer/arm/convolutiondepthwise_arm.cpp b/src/layer/arm/convolutiondepthwise_arm.cpp index fc7fbecd3..57dc20cbb 100644 --- a/src/layer/arm/convolutiondepthwise_arm.cpp +++ b/src/layer/arm/convolutiondepthwise_arm.cpp @@ -130,6 +130,11 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt) ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); + if (opt.lightmode) + { + weight_data.release(); + } + return 0; } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC @@ -153,6 +158,11 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt) ncnn::cast_float32_to_bfloat16(weight_data, weight_data_tm, opt); } + if (opt.lightmode) + { + weight_data.release(); + } + return 0; } #endif // NCNN_BF16 @@ -163,8 +173,6 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt) { Mat weight_data_r2 = weight_data.reshape(maxk, group); convert_packing(weight_data_r2, weight_data_tm, 4, opt); - - return 0; } #endif // __ARM_NEON @@ -173,24 +181,32 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt) if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { weight_data_tm = weight_data; - return 0; } - if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) { weight_data_tm = weight_data; - return 0; } - if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { weight_data_tm = weight_data; - return 0; } - if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) { weight_data_tm = weight_data; - return 0; + } + else + { + // group convolution + create_group_ops(opt); } } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; } // group convolution @@ -1591,6 +1607,11 @@ int ConvolutionDepthWise_arm::create_pipeline_int8_arm(const Option& opt) weight_data_tm = weight_data; } + if (opt.lightmode) + { + weight_data.release(); + } + return 0; } diff --git a/src/layer/mips/convolutiondepthwise_mips.cpp b/src/layer/mips/convolutiondepthwise_mips.cpp index 497e3400d..39bac692d 100644 --- a/src/layer/mips/convolutiondepthwise_mips.cpp +++ b/src/layer/mips/convolutiondepthwise_mips.cpp @@ -83,6 +83,11 @@ int ConvolutionDepthWise_mips::create_pipeline(const Option& opt) weight_data_tm = weight_data; } + if (opt.lightmode) + { + weight_data.release(); + } + return 0; } diff --git a/src/layer/riscv/convolution_riscv.cpp b/src/layer/riscv/convolution_riscv.cpp index fe56560b9..4b68d8fec 100644 --- a/src/layer/riscv/convolution_riscv.cpp +++ b/src/layer/riscv/convolution_riscv.cpp @@ -88,7 +88,7 @@ Convolution_riscv::Convolution_riscv() activation = 0; } -static void convolution_transform_kernel_packed_rvv(const Mat& weight_data, Mat& weight_data_packed, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack) +static void convolution_transform_kernel_packed_rvv(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack) { const int maxk = kernel_w * kernel_h; @@ -97,11 +97,11 @@ static void convolution_transform_kernel_packed_rvv(const Mat& weight_data, Mat& { Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); - weight_data_packed.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack); + weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack); for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) { - float* g00 = weight_data_packed.channel(q / out_elempack); + float* g00 = weight_data_tm.channel(q / out_elempack); for (int p = 0; p + (elempack - 1) < num_input; p += elempack) { @@ -131,6 +131,14 @@ int Convolution_riscv::create_pipeline(const Option& opt) activation = create_activation_layer(activation_type, activation_params, opt); +#if NCNN_INT8 + if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) + { + // TODO implement int8 + return 0; + } +#endif + #if __riscv_vector && __riscv_zfh if (opt.use_fp16_storage) { @@ -166,14 +174,14 @@ int Convolution_riscv::create_pipeline(const Option& opt) } else { - convolution_transform_kernel_packed_rvv(weight_data, weight_data_packed, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + convolution_transform_kernel_packed_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); } } // pack1ton if (elempack == 1 && out_elempack == packn) { - convolution_transform_kernel_packed_rvv(weight_data, weight_data_packed, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + convolution_transform_kernel_packed_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); } // packnto1 @@ -181,19 +189,19 @@ int Convolution_riscv::create_pipeline(const Option& opt) { if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { - convolution_im2col_sgemm_transform_kernel_packnto1_rvv(weight_data, weight_data_packed, num_input, num_output, kernel_w, kernel_h); + convolution_im2col_sgemm_transform_kernel_packnto1_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); } else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) { - convolution_im2col_sgemm_transform_kernel_packnto1_rvv(weight_data, weight_data_packed, num_input, num_output, kernel_w, kernel_h); + convolution_im2col_sgemm_transform_kernel_packnto1_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); } else if (opt.use_sgemm_convolution) { - convolution_im2col_sgemm_transform_kernel_packnto1_rvv(weight_data, weight_data_packed, num_input, num_output, kernel_w, kernel_h); + convolution_im2col_sgemm_transform_kernel_packnto1_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); } else { - convolution_transform_kernel_packed_rvv(weight_data, weight_data_packed, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + convolution_transform_kernel_packed_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); } } #endif // __riscv_vector @@ -203,7 +211,7 @@ int Convolution_riscv::create_pipeline(const Option& opt) { if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { - convolution_im2col_sgemm_transform_kernel_rvv(weight_data, weight_data_packed, num_input, num_output, kernel_w, kernel_h); + convolution_im2col_sgemm_transform_kernel_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); } else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { @@ -218,10 +226,19 @@ int Convolution_riscv::create_pipeline(const Option& opt) } else if (opt.use_sgemm_convolution) { - convolution_im2col_sgemm_transform_kernel_rvv(weight_data, weight_data_packed, num_input, num_output, kernel_w, kernel_h); + convolution_im2col_sgemm_transform_kernel_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); + } + else + { + weight_data_tm = weight_data; } } + if (opt.lightmode) + { + weight_data.release(); + } + return 0; } @@ -240,7 +257,7 @@ int Convolution_riscv::destroy_pipeline(const Option& opt) int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { #if NCNN_INT8 - if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) + if (opt.use_int8_inference && int8_scale_term) { Mat bottom_blob_unpacked = bottom_blob; if (bottom_blob.elempack != 1) @@ -266,9 +283,44 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti } #endif - if (bottom_blob.dims != 3) + // flattened blob, implement as InnerProduct + if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1) { - return Convolution::forward(bottom_blob, top_blob, opt); + Mat bottom_blob_3d; + if (bottom_blob.elemsize % 16 == 0) + { + bottom_blob_3d = bottom_blob; + bottom_blob_3d.dims = 3; + bottom_blob_3d.w = 1; + bottom_blob_3d.h = 1; + bottom_blob_3d.c = bottom_blob.w; + bottom_blob_3d.cstep = 1; + } + else + { + bottom_blob_3d = bottom_blob.reshape(1, 1, bottom_blob.w, opt.workspace_allocator); + } + + Mat top_blob_3d; + int ret = forward(bottom_blob_3d, top_blob_3d, opt); + if (ret != 0) + return ret; + + if (top_blob_3d.elemsize % 16 == 0) + { + top_blob = top_blob_3d; + top_blob.dims = 1; + top_blob.w = top_blob_3d.c; + top_blob.h = 1; + top_blob.c = 1; + bottom_blob_3d.cstep = top_blob_3d.c; + } + else + { + top_blob = top_blob_3d.reshape(top_blob_3d.c, opt.blob_allocator); + } + + return 0; } int elembits = bottom_blob.elembits(); @@ -328,7 +380,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti { if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { - conv1x1s1_sgemm_packn_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt); + conv1x1s1_sgemm_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); if (activation) { @@ -337,7 +389,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti } else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) { - conv1x1s2_sgemm_packn_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt); + conv1x1s2_sgemm_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); if (activation) { @@ -363,7 +415,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti } else if (opt.use_sgemm_convolution) { - convolution_im2col_sgemm_packn_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + convolution_im2col_sgemm_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); if (activation) { @@ -372,7 +424,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti } else { - convolution_packn_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + convolution_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); } } @@ -380,7 +432,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti { if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { - conv1x1s1_sgemm_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt); + conv1x1s1_sgemm_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); if (activation) { @@ -389,7 +441,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti } else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { - conv3x3s1_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt); + conv3x3s1_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); if (activation) { @@ -398,7 +450,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti } else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) { - conv3x3s2_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt); + conv3x3s2_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); if (activation) { @@ -407,7 +459,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti } else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) { - conv7x7s2_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt); + conv7x7s2_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); if (activation) { @@ -416,7 +468,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti } else if (opt.use_sgemm_convolution) { - convolution_im2col_sgemm_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + convolution_im2col_sgemm_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); if (activation) { @@ -425,7 +477,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti } else { - convolution_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + convolution_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); } } @@ -433,7 +485,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti { if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { - conv1x1s1_sgemm_packnto1_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt); + conv1x1s1_sgemm_packnto1_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); if (activation) { @@ -442,7 +494,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti } else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) { - conv1x1s2_sgemm_packnto1_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt); + conv1x1s2_sgemm_packnto1_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); if (activation) { @@ -451,7 +503,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti } else if (opt.use_sgemm_convolution) { - convolution_im2col_sgemm_packnto1_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + convolution_im2col_sgemm_packnto1_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); if (activation) { @@ -460,7 +512,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti } else { - convolution_packnto1_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + convolution_packnto1_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); } } #endif // __riscv_vector @@ -469,7 +521,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti { if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { - conv1x1s1_sgemm_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt); + conv1x1s1_sgemm_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); if (activation) { @@ -494,7 +546,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti } else if (opt.use_sgemm_convolution) { - convolution_im2col_sgemm_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + convolution_im2col_sgemm_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); if (activation) { @@ -541,7 +593,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti sum = bias_data[p]; } - const float* kptr = (const float*)weight_data + maxk * channels * p; + const float* kptr = (const float*)weight_data_tm + maxk * channels * p; // channels for (int q = 0; q < channels; q++) @@ -666,7 +718,7 @@ int Convolution_riscv::forward(const std::vector& bottom_blobs, std::vector } #if __riscv_vector && __riscv_zfh -static void convolution_transform_kernel_packed_fp16s_rvv(const Mat& weight_data, Mat& weight_data_fp16, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack) +static void convolution_transform_kernel_packed_fp16s_rvv(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack) { const int maxk = kernel_w * kernel_h; @@ -675,11 +727,11 @@ static void convolution_transform_kernel_packed_fp16s_rvv(const Mat& weight_data { Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); - weight_data_fp16.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)2u * elempack * out_elempack, elempack * out_elempack); + weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)2u * elempack * out_elempack, elempack * out_elempack); for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) { - __fp16* g00 = weight_data_fp16.channel(q / out_elempack); + __fp16* g00 = weight_data_tm.channel(q / out_elempack); for (int p = 0; p + (elempack - 1) < num_input; p += elempack) { @@ -728,14 +780,14 @@ int Convolution_riscv::create_pipeline_fp16s(const Option& opt) } else { - convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_fp16, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); } } // pack1ton if (elempack == 1 && out_elempack == packn) { - convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_fp16, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); } // packnto1 @@ -743,19 +795,19 @@ int Convolution_riscv::create_pipeline_fp16s(const Option& opt) { if (opt.use_fp16_arithmetic && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { - convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(weight_data, weight_data_fp16, num_input, num_output, kernel_w, kernel_h); + convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); } else if (opt.use_fp16_arithmetic && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) { - convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(weight_data, weight_data_fp16, num_input, num_output, kernel_w, kernel_h); + convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); } else if (opt.use_fp16_arithmetic && opt.use_sgemm_convolution) { - convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(weight_data, weight_data_fp16, num_input, num_output, kernel_w, kernel_h); + convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); } else { - convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_fp16, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); } } @@ -764,15 +816,15 @@ int Convolution_riscv::create_pipeline_fp16s(const Option& opt) { if (opt.use_fp16_arithmetic && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { - convolution_im2col_sgemm_transform_kernel_fp16sa_rvv(weight_data, weight_data_fp16, num_input, num_output, kernel_w, kernel_h); + convolution_im2col_sgemm_transform_kernel_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); } else if (opt.use_fp16_arithmetic && opt.use_sgemm_convolution) { - convolution_im2col_sgemm_transform_kernel_fp16sa_rvv(weight_data, weight_data_fp16, num_input, num_output, kernel_w, kernel_h); + convolution_im2col_sgemm_transform_kernel_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); } else { - convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_fp16, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); } } @@ -781,6 +833,11 @@ int Convolution_riscv::create_pipeline_fp16s(const Option& opt) ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); } + if (opt.lightmode) + { + weight_data.release(); + } + return 0; } @@ -818,28 +875,28 @@ int Convolution_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, cons if (elempack == packn && out_elempack == packn) { { - convolution_packn_fp16s_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + convolution_packn_fp16s_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); } } if (elempack == 1 && out_elempack == packn) { { - convolution_pack1ton_fp16s_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + convolution_pack1ton_fp16s_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); } } if (elempack == packn && out_elempack == 1) { { - convolution_packnto1_fp16s_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + convolution_packnto1_fp16s_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); } } if (elempack == 1 && out_elempack == 1) { { - convolution_fp16s(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + convolution_fp16s(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); } } @@ -884,7 +941,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con { if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { - conv1x1s1_sgemm_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt); + conv1x1s1_sgemm_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); if (activation) { @@ -893,7 +950,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con } else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) { - conv1x1s2_sgemm_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt); + conv1x1s2_sgemm_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); if (activation) { @@ -919,7 +976,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con } else if (opt.use_sgemm_convolution) { - convolution_im2col_sgemm_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + convolution_im2col_sgemm_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); if (activation) { @@ -928,7 +985,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con } else { - convolution_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + convolution_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); } } @@ -936,7 +993,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con { if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { - conv1x1s1_sgemm_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt); + conv1x1s1_sgemm_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); if (activation) { @@ -945,7 +1002,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con } else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { - conv3x3s1_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt); + conv3x3s1_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); if (activation) { @@ -954,7 +1011,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con } else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) { - conv3x3s2_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt); + conv3x3s2_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); if (activation) { @@ -963,7 +1020,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con } else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) { - conv7x7s2_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt); + conv7x7s2_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); if (activation) { @@ -972,7 +1029,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con } else if (opt.use_sgemm_convolution) { - convolution_im2col_sgemm_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + convolution_im2col_sgemm_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); if (activation) { @@ -981,7 +1038,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con } else { - convolution_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + convolution_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); } } @@ -989,7 +1046,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con { if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { - conv1x1s1_sgemm_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt); + conv1x1s1_sgemm_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); if (activation) { @@ -998,7 +1055,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con } else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) { - conv1x1s2_sgemm_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt); + conv1x1s2_sgemm_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); if (activation) { @@ -1007,7 +1064,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con } else if (opt.use_sgemm_convolution) { - convolution_im2col_sgemm_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + convolution_im2col_sgemm_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); if (activation) { @@ -1016,7 +1073,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con } else { - convolution_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + convolution_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); } } @@ -1024,7 +1081,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con { if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { - conv1x1s1_sgemm_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt); + conv1x1s1_sgemm_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); if (activation) { @@ -1033,7 +1090,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con } else if (opt.use_sgemm_convolution) { - convolution_im2col_sgemm_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + convolution_im2col_sgemm_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); if (activation) { @@ -1042,7 +1099,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con } else { - convolution_fp16s(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + convolution_fp16s(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); } } diff --git a/src/layer/riscv/convolution_riscv.h b/src/layer/riscv/convolution_riscv.h index 806aa51bb..17bb43ca0 100644 --- a/src/layer/riscv/convolution_riscv.h +++ b/src/layer/riscv/convolution_riscv.h @@ -41,14 +41,12 @@ protected: public: Layer* activation; - // packn - Mat weight_data_packed; + Mat weight_data_tm; Mat weight_winograd23_data; Mat weight_winograd43_data; Mat weight_winograd63_data; // fp16 - Mat weight_data_fp16; Mat bias_data_fp16; }; diff --git a/src/layer/riscv/convolutiondepthwise_riscv.cpp b/src/layer/riscv/convolutiondepthwise_riscv.cpp index 41a60a2cf..d29cf0e2b 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp @@ -61,6 +61,14 @@ int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt) activation = create_activation_layer(activation_type, activation_params, opt); +#if NCNN_INT8 + if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) + { + // TODO implement int8 + return 0; + } +#endif + #if __riscv_vector && __riscv_zfh if (opt.use_fp16_storage) { @@ -91,12 +99,18 @@ int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt) if (elempack == packn) { Mat weight_data_r2 = weight_data.reshape(maxk, group); - convert_packing(weight_data_r2, weight_data_packed, packn, opt); + convert_packing(weight_data_r2, weight_data_tm, packn, opt); } #endif // __riscv_vector if (elempack == 1) { + weight_data_tm = weight_data; + } + + if (opt.lightmode) + { + weight_data.release(); } return 0; @@ -105,6 +119,11 @@ int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt) // group convolution create_group_ops(opt); + if (opt.lightmode) + { + weight_data.release(); + } + return 0; } @@ -126,7 +145,7 @@ int ConvolutionDepthWise_riscv::create_group_ops(const Option& opt) for (int g = 0; g < group; g++) { - Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g); + Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g).clone(); Mat bias_data_g; if (bias_term) bias_data_g = bias_data.range(num_output_g * g, num_output_g); @@ -227,7 +246,7 @@ int ConvolutionDepthWise_riscv::destroy_pipeline(const Option& opt) int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { #if NCNN_INT8 - if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) + if (opt.use_int8_inference && int8_scale_term) { Mat bottom_blob_unpacked = bottom_blob; if (bottom_blob.elempack != 1) @@ -310,7 +329,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c { if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { - convdw3x3s1_packn_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt); + convdw3x3s1_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); if (activation) { @@ -319,7 +338,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c } else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) { - convdw3x3s2_packn_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt); + convdw3x3s2_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); if (activation) { @@ -328,7 +347,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c } else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { - convdw5x5s1_packn_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt); + convdw5x5s1_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); if (activation) { @@ -337,7 +356,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c } else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) { - convdw5x5s2_packn_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt); + convdw5x5s2_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); if (activation) { @@ -371,7 +390,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c for (int g = 0; g < channels; g++) { float* outptr = top_blob.channel(g); - const float* kptr = (const float*)weight_data_packed + maxk * g * packn; + const float* kptr = (const float*)weight_data_tm + maxk * g * packn; const Mat m = bottom_blob_bordered.channel(g); for (int i = 0; i < outh; i++) @@ -410,7 +429,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c { if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { - convdw3x3s1_rvv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); + convdw3x3s1_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); if (activation) { @@ -419,7 +438,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c } else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) { - convdw3x3s2_rvv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); + convdw3x3s2_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); if (activation) { @@ -453,7 +472,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c for (int g = 0; g < group; g++) { float* outptr = top_blob.channel(g); - const float* kptr = (const float*)weight_data + maxk * g; + const float* kptr = (const float*)weight_data_tm + maxk * g; const Mat m = bottom_blob_bordered.channel(g); for (int i = 0; i < outh; i++) @@ -663,22 +682,32 @@ int ConvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt) Mat weight_data_r2_packed; convert_packing(weight_data_r2, weight_data_r2_packed, packn, opt); - ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_fp16, opt); + ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_tm, opt); } if (elempack == 1) { - ncnn::cast_float32_to_float16(weight_data, weight_data_fp16, opt); + ncnn::cast_float32_to_float16(weight_data, weight_data_tm, opt); } ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); + if (opt.lightmode) + { + weight_data.release(); + } + return 0; } // group convolution create_group_ops(opt); + if (opt.lightmode) + { + weight_data.release(); + } + return 0; } @@ -744,7 +773,7 @@ int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_b for (int g = 0; g < channels; g++) { __fp16* outptr = top_blob.channel(g); - const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * packn; + const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * packn; const Mat m = bottom_blob_bordered.channel(g); for (int i = 0; i < outh; i++) @@ -806,7 +835,7 @@ int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_b for (int g = 0; g < group; g++) { __fp16* outptr = top_blob.channel(g); - const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g; + const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g; const Mat m = bottom_blob_bordered.channel(g); for (int i = 0; i < outh; i++) @@ -930,7 +959,7 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_ { if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { - convdw3x3s1_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt); + convdw3x3s1_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); if (activation) { @@ -939,7 +968,7 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_ } else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) { - convdw3x3s2_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt); + convdw3x3s2_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); if (activation) { @@ -948,7 +977,7 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_ } else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { - convdw5x5s1_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt); + convdw5x5s1_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); if (activation) { @@ -957,7 +986,7 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_ } else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) { - convdw5x5s2_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt); + convdw5x5s2_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); if (activation) { @@ -991,7 +1020,7 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_ for (int g = 0; g < channels; g++) { __fp16* outptr = top_blob.channel(g); - const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * packn; + const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * packn; const Mat m = bottom_blob_bordered.channel(g); for (int i = 0; i < outh; i++) @@ -1053,7 +1082,7 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_ for (int g = 0; g < group; g++) { __fp16* outptr = top_blob.channel(g); - const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g; + const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g; const Mat m = bottom_blob_bordered.channel(g); for (int i = 0; i < outh; i++) diff --git a/src/layer/riscv/convolutiondepthwise_riscv.h b/src/layer/riscv/convolutiondepthwise_riscv.h index d7db20aa9..b0152e0b2 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.h +++ b/src/layer/riscv/convolutiondepthwise_riscv.h @@ -43,11 +43,9 @@ public: Layer* activation; std::vector group_ops; - // packing - Mat weight_data_packed; + Mat weight_data_tm; // fp16 - Mat weight_data_fp16; Mat bias_data_fp16; }; diff --git a/src/layer/riscv/deconvolution_riscv.cpp b/src/layer/riscv/deconvolution_riscv.cpp index 321999974..6dfedcc18 100644 --- a/src/layer/riscv/deconvolution_riscv.cpp +++ b/src/layer/riscv/deconvolution_riscv.cpp @@ -100,11 +100,11 @@ int Deconvolution_riscv::create_pipeline(const Option& opt) { Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output); - weight_data_packed.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack); + weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack); for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) { - float* g00 = weight_data_packed.channel(q / out_elempack); + float* g00 = weight_data_tm.channel(q / out_elempack); for (int p = 0; p + (elempack - 1) < num_input; p += elempack) { @@ -148,6 +148,11 @@ int Deconvolution_riscv::create_pipeline(const Option& opt) { } + if (opt.lightmode) + { + weight_data.release(); + } + return 0; } @@ -218,21 +223,21 @@ int Deconvolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Op if (elempack == packn && out_elempack == packn) { { - deconvolution_packn_rvv(bottom_blob, top_blob_bordered, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + deconvolution_packn_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); } } if (elempack == 1 && out_elempack == packn) { { - deconvolution_pack1ton_rvv(bottom_blob, top_blob_bordered, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + deconvolution_pack1ton_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); } } if (elempack == packn && out_elempack == 1) { { - deconvolution_packnto1_rvv(bottom_blob, top_blob_bordered, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + deconvolution_packnto1_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); } } #endif // __riscv_vector @@ -257,7 +262,7 @@ int Deconvolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Op sum = bias_data[p]; } - const float* kptr = (const float*)weight_data_packed.channel(p); + const float* kptr = (const float*)weight_data_tm.channel(p); // channels for (int q = 0; q < channels; q++) @@ -356,11 +361,11 @@ int Deconvolution_riscv::create_pipeline_fp16s(const Option& opt) { Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output); - weight_data_fp16.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)2u * elempack * out_elempack, elempack * out_elempack); + weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)2u * elempack * out_elempack, elempack * out_elempack); for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) { - __fp16* g00 = weight_data_fp16.channel(q / out_elempack); + __fp16* g00 = weight_data_tm.channel(q / out_elempack); for (int p = 0; p + (elempack - 1) < num_input; p += elempack) { @@ -404,6 +409,11 @@ int Deconvolution_riscv::create_pipeline_fp16s(const Option& opt) ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); + if (opt.lightmode) + { + weight_data.release(); + } + return 0; } @@ -446,28 +456,28 @@ int Deconvolution_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, co if (elempack == packn && out_elempack == packn) { { - deconvolution_packn_fp16s_rvv(bottom_blob, top_blob_bordered, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + deconvolution_packn_fp16s_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); } } if (elempack == 1 && out_elempack == packn) { { - deconvolution_pack1ton_fp16s_rvv(bottom_blob, top_blob_bordered, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + deconvolution_pack1ton_fp16s_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); } } if (elempack == packn && out_elempack == 1) { { - deconvolution_packnto1_fp16s_rvv(bottom_blob, top_blob_bordered, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + deconvolution_packnto1_fp16s_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); } } if (elempack == 1 && out_elempack == 1) { { - deconvolution_fp16s(bottom_blob, top_blob_bordered, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + deconvolution_fp16s(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); } } @@ -517,28 +527,28 @@ int Deconvolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, c if (elempack == packn && out_elempack == packn) { { - deconvolution_packn_fp16sa_rvv(bottom_blob, top_blob_bordered, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + deconvolution_packn_fp16sa_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); } } if (elempack == 1 && out_elempack == packn) { { - deconvolution_pack1ton_fp16sa_rvv(bottom_blob, top_blob_bordered, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + deconvolution_pack1ton_fp16sa_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); } } if (elempack == packn && out_elempack == 1) { { - deconvolution_packnto1_fp16sa_rvv(bottom_blob, top_blob_bordered, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + deconvolution_packnto1_fp16sa_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); } } if (elempack == 1 && out_elempack == 1) { { - deconvolution_fp16s(bottom_blob, top_blob_bordered, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + deconvolution_fp16s(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); } } diff --git a/src/layer/riscv/deconvolution_riscv.h b/src/layer/riscv/deconvolution_riscv.h index 0419eab70..3574c09d1 100644 --- a/src/layer/riscv/deconvolution_riscv.h +++ b/src/layer/riscv/deconvolution_riscv.h @@ -37,11 +37,9 @@ protected: #endif public: - // packn - Mat weight_data_packed; + Mat weight_data_tm; // fp16 - Mat weight_data_fp16; Mat bias_data_fp16; }; diff --git a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp index d8566f70b..43d8f1bfc 100644 --- a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp @@ -88,13 +88,18 @@ int DeconvolutionDepthWise_riscv::create_pipeline(const Option& opt) if (elempack == packn) { Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group); - convert_packing(weight_data_r2, weight_data_packed, packn, opt); + convert_packing(weight_data_r2, weight_data_tm, packn, opt); } #endif // __riscv_vector if (elempack == 1) { - weight_data_packed = weight_data_transposed; + weight_data_tm = weight_data_transposed; + } + + if (opt.lightmode) + { + weight_data.release(); } return 0; @@ -103,6 +108,11 @@ int DeconvolutionDepthWise_riscv::create_pipeline(const Option& opt) // group convolution create_group_ops(opt); + if (opt.lightmode) + { + weight_data.release(); + } + return 0; } @@ -124,7 +134,7 @@ int DeconvolutionDepthWise_riscv::create_group_ops(const Option& opt) for (int g = 0; g < group; g++) { - Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g); + Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g).clone(); Mat bias_data_g; if (bias_term) bias_data_g = bias_data.range(num_output_g * g, num_output_g); @@ -256,7 +266,7 @@ int DeconvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, for (int g = 0; g < channels; g++) { float* outptr = top_blob_bordered.channel(g); - const float* kptr = (const float*)weight_data_packed + maxk * g * packn; + const float* kptr = (const float*)weight_data_tm + maxk * g * packn; const Mat m = bottom_blob.channel(g); for (int i = 0; i < outh; i++) @@ -318,7 +328,7 @@ int DeconvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, for (int g = 0; g < channels; g++) { float* outptr = top_blob_bordered.channel(g); - const float* kptr = (const float*)weight_data_packed + maxk * g; + const float* kptr = (const float*)weight_data_tm + maxk * g; const Mat m = bottom_blob.channel(g); for (int i = 0; i < outh; i++) @@ -480,22 +490,32 @@ int DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt) Mat weight_data_r2_packed; convert_packing(weight_data_r2, weight_data_r2_packed, packn, opt); - ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_fp16, opt); + ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_tm, opt); } if (elempack == 1) { - ncnn::cast_float32_to_float16(weight_data_transposed, weight_data_fp16, opt); + ncnn::cast_float32_to_float16(weight_data_transposed, weight_data_tm, opt); } ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); + if (opt.lightmode) + { + weight_data.release(); + } + return 0; } // group convolution create_group_ops(opt); + if (opt.lightmode) + { + weight_data.release(); + } + return 0; } @@ -543,7 +563,7 @@ int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top for (int g = 0; g < channels; g++) { __fp16* outptr = top_blob_bordered.channel(g); - const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * packn; + const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * packn; const Mat m = bottom_blob.channel(g); for (int i = 0; i < outh; i++) @@ -605,7 +625,7 @@ int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top for (int g = 0; g < channels; g++) { __fp16* outptr = top_blob_bordered.channel(g); - const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g; + const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g; const Mat m = bottom_blob.channel(g); for (int i = 0; i < outh; i++) @@ -764,7 +784,7 @@ int DeconvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& to for (int g = 0; g < channels; g++) { __fp16* outptr = top_blob_bordered.channel(g); - const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * packn; + const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * packn; const Mat m = bottom_blob.channel(g); for (int i = 0; i < outh; i++) @@ -826,7 +846,7 @@ int DeconvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& to for (int g = 0; g < channels; g++) { __fp16* outptr = top_blob_bordered.channel(g); - const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g; + const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g; const Mat m = bottom_blob.channel(g); for (int i = 0; i < outh; i++) diff --git a/src/layer/riscv/deconvolutiondepthwise_riscv.h b/src/layer/riscv/deconvolutiondepthwise_riscv.h index c33ef0a58..ccda5f248 100644 --- a/src/layer/riscv/deconvolutiondepthwise_riscv.h +++ b/src/layer/riscv/deconvolutiondepthwise_riscv.h @@ -40,11 +40,9 @@ protected: public: std::vector group_ops; - // packing - Mat weight_data_packed; + Mat weight_data_tm; // fp16 - Mat weight_data_fp16; Mat bias_data_fp16; }; diff --git a/src/layer/riscv/innerproduct_riscv.cpp b/src/layer/riscv/innerproduct_riscv.cpp index c4f6f2d69..e9cefc330 100644 --- a/src/layer/riscv/innerproduct_riscv.cpp +++ b/src/layer/riscv/innerproduct_riscv.cpp @@ -43,8 +43,6 @@ InnerProduct_riscv::InnerProduct_riscv() int InnerProduct_riscv::create_pipeline(const Option& opt) { -#if __riscv_vector - if (opt.use_packing_layout || opt.use_int8_inference) { flatten = ncnn::create_layer(ncnn::LayerType::Flatten); @@ -54,7 +52,14 @@ int InnerProduct_riscv::create_pipeline(const Option& opt) flatten->create_pipeline(opt); } -#endif // __riscv_vector + +#if NCNN_INT8 + if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) + { + // TODO implement int8 + return 0; + } +#endif #if __riscv_vector && __riscv_zfh if (opt.use_fp16_storage) @@ -63,6 +68,53 @@ int InnerProduct_riscv::create_pipeline(const Option& opt) } #endif + int out_elempack = 1; + +#if __riscv_vector + const int packn = csrr_vlenb() / 4; + + const int num_input = weight_data_size / num_output; + + if (opt.use_packing_layout) + { + out_elempack = num_output % packn == 0 ? packn : 1; + } + + if (out_elempack == packn) + { + // src = inch-outch + // dst = packn-inch-outch/packn + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + + weight_data_tm.create(num_input, num_output / packn, (size_t)4u * packn, packn); + + for (int q = 0; q + (packn - 1) < num_output; q += packn) + { + float* g0 = weight_data_tm.row(q / packn); + + for (int p = 0; p < num_input; p++) + { + for (int j = 0; j < packn; j++) + { + *g0++ = weight_data_r2.row(q + j)[p]; + } + } + } + } + } +#endif // __riscv_vector + + if (out_elempack == 1) + { + weight_data_tm = weight_data; + } + + if (opt.lightmode) + { + weight_data.release(); + } + return 0; } @@ -81,7 +133,7 @@ int InnerProduct_riscv::destroy_pipeline(const Option& opt) int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { #if NCNN_INT8 - if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) + if (opt.use_int8_inference && int8_scale_term) { Mat bottom_blob_unpacked = bottom_blob; if (bottom_blob.elempack != 1) @@ -136,20 +188,104 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt if (top_blob.empty()) return -100; + int num_output_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + num_output_elempack = num_output % packn == 0 ? packn : 1; + } +#endif + #pragma omp parallel for num_threads(opt.num_threads) for (int j = 0; j < h; j++) { #if __riscv_vector - if (elempack == packn) + if (elempack == packn && num_output_elempack == packn) { + const word_type vl = vsetvl_e32m1(packn); + + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + for (int l = 0; l < packn; l++) + { + const float* kptr = weight_data_tm.row(p) + l; + const float* m = bottom_blob.row(j); + + vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); + + if (bias_term) + { + _sum = vfmv_v_f_f32m1(bias_data[p * packn + l], vl); + } + + int n = num_input; + while (n > 0) + { + vfloat32m1_t _val = vle32_v_f32m1(m, vl); + _sum = vfmacc_vf_f32m1(_sum, *kptr, _val, vl); + + m += packn; + kptr += packn; + n -= 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + vse32_v_f32m1(outptr, _sum, vl); + outptr += packn; + } + } + } + + if (elempack == 1 && num_output_elempack == packn) + { + const word_type vl = vsetvl_e32m1(packn); + + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const float* kptr = weight_data_tm.row(p); + const float* m = bottom_blob.row(j); + + vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); + + if (bias_term) + { + _sum = vle32_v_f32m1((const float*)bias_data + p * packn, vl); + } + + int n = num_input; + while (n > 0) + { + vfloat32m1_t _w = vle32_v_f32m1(kptr, vl); + _sum = vfmacc_vf_f32m1(_sum, *m, _w, vl); + + m += 1; + kptr += packn; + n -= 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + vse32_v_f32m1(outptr, _sum, vl); + outptr += packn; + } + } + + if (elempack == packn && num_output_elempack == 1) + { + const word_type vl = vsetvl_e32m1(packn); + float* outptr = top_blob.row(j); for (int p = 0; p < num_output; p++) { - const float* kptr = (const float*)weight_data + num_input * p; + const float* kptr = (const float*)weight_data_tm + num_input * p; const float* m = bottom_blob.row(j); - const word_type vl = vsetvl_e32m1(packn); vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); if (bias_term) @@ -176,13 +312,13 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt } #endif // __riscv_vector - if (elempack == 1) + if (elempack == 1 && num_output_elempack == 1) { float* outptr = top_blob.row(j); for (int p = 0; p < num_output; p++) { - const float* kptr = (const float*)weight_data + num_input * p; + const float* kptr = (const float*)weight_data_tm + num_input * p; const float* m = bottom_blob.row(j); float sum = 0.f; @@ -208,69 +344,93 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt return 0; } - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - int size = w * h; - -#if __riscv_vector - if (elempack == packn) + // flatten + Mat bottom_blob_flattened = bottom_blob; + if (bottom_blob.dims != 1) { - // flatten - Mat bottom_blob_flattened = bottom_blob; - if (bottom_blob.dims != 1) - { - Option opt_flatten = opt; - opt_flatten.blob_allocator = opt.workspace_allocator; + Option opt_flatten = opt; + opt_flatten.blob_allocator = opt.workspace_allocator; - flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); - } + flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); + } - // pack1 - { - bottom_blob_flattened.w *= bottom_blob_flattened.elempack; - bottom_blob_flattened.cstep = bottom_blob_flattened.w; - bottom_blob_flattened.elemsize = 4u; - bottom_blob_flattened.elempack = 1; - } + size_t elemsize = bottom_blob_flattened.elemsize; + int elempack = bottom_blob_flattened.elempack; - return forward(bottom_blob_flattened, top_blob, opt); + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + out_elempack = num_output % packn == 0 ? packn : 1; } #endif + size_t out_elemsize = elemsize / elempack * out_elempack; - top_blob.create(num_output, elemsize, opt.blob_allocator); + top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - const float* weight_data_ptr = weight_data; - #if __riscv_vector - int nn_num_output = num_output / packn; - int remain_num_output_start = nn_num_output * packn; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int pp = 0; pp < nn_num_output; pp++) + if (out_elempack == packn) { - int p = pp * packn; + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + const word_type vl = vsetvl_e32m1(packn); + vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); - const word_type vl = vsetvl_e32m1(packn); - vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); + if (bias_term) + { + _sum = vle32_v_f32m1((const float*)bias_data + p * packn, vl); + } - if (bias_term) - { - _sum = vle32_v_f32m1((const float*)bias_data + p, vl); + const float* kptr = weight_data_tm.row(p); + + const float* sptr = bottom_blob_flattened; + + int n = num_input; + while (n > 0) + { + vfloat32m1_t _w = vle32_v_f32m1(kptr, vl); + _sum = vfmacc_vf_f32m1(_sum, *sptr, _w, vl); + + sptr += 1; + kptr += packn; + n -= 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + float* outptr = top_blob; + vse32_v_f32m1(outptr + p * packn, _sum, vl); } + } +#endif // __riscv_vector - const float* w = weight_data_ptr + num_input * p; + if (out_elempack == 1) + { +#if __riscv_vector + int nn_num_output = num_output / packn; + int remain_num_output_start = nn_num_output * packn; - // channels - for (int q = 0; q < channels; q++) + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_num_output; pp++) { - const float* m = bottom_blob.channel(q); + int p = pp * packn; - int n = size; + const word_type vl = vsetvl_e32m1(packn); + vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); + + if (bias_term) + { + _sum = vle32_v_f32m1((const float*)bias_data + p, vl); + } + + const float* w = (const float*)weight_data_tm + num_input * p; + + const float* m = bottom_blob_flattened; + + int n = num_input; while (n > 0) { vfloat32m1_t _w = vlse32_v_f32m1(w, num_input * sizeof(float), vl); @@ -281,45 +441,41 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt w += 1; n -= 1; } - } - _sum = activation_ps(_sum, activation_type, activation_params, vl); + _sum = activation_ps(_sum, activation_type, activation_params, vl); - vse32_v_f32m1((float*)top_blob + p, _sum, vl); - } + vse32_v_f32m1((float*)top_blob + p, _sum, vl); + } #else // __riscv_vector - int nn_num_output = num_output / 4; - int remain_num_output_start = nn_num_output * 4; + int nn_num_output = num_output / 4; + int remain_num_output_start = nn_num_output * 4; - #pragma omp parallel for num_threads(opt.num_threads) - for (int pp = 0; pp < nn_num_output; pp++) - { - int p = pp * 4; + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_num_output; pp++) + { + int p = pp * 4; - float sum0 = 0.f; - float sum1 = 0.f; - float sum2 = 0.f; - float sum3 = 0.f; + float sum0 = 0.f; + float sum1 = 0.f; + float sum2 = 0.f; + float sum3 = 0.f; - if (bias_term) - { - sum0 = bias_data[p]; - sum1 = bias_data[p + 1]; - sum2 = bias_data[p + 2]; - sum3 = bias_data[p + 3]; - } + if (bias_term) + { + sum0 = bias_data[p]; + sum1 = bias_data[p + 1]; + sum2 = bias_data[p + 2]; + sum3 = bias_data[p + 3]; + } - const float* w0 = weight_data_ptr + num_input * p; - const float* w1 = weight_data_ptr + num_input * (p + 1); - const float* w2 = weight_data_ptr + num_input * (p + 2); - const float* w3 = weight_data_ptr + num_input * (p + 3); + const float* w0 = (const float*)weight_data_tm + num_input * p; + const float* w1 = (const float*)weight_data_tm + num_input * (p + 1); + const float* w2 = (const float*)weight_data_tm + num_input * (p + 2); + const float* w3 = (const float*)weight_data_tm + num_input * (p + 3); - // channels - for (int q = 0; q < channels; q++) - { - const float* m = bottom_blob.channel(q); + const float* m = bottom_blob_flattened; - for (int i = 0; i < size; i++) + for (int i = 0; i < num_input; i++) { sum0 += *m * *w0; sum1 += *m * *w1; @@ -332,48 +488,43 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt w2++; w3++; } - } - sum0 = activation_ss(sum0, activation_type, activation_params); - sum1 = activation_ss(sum1, activation_type, activation_params); - sum2 = activation_ss(sum2, activation_type, activation_params); - sum3 = activation_ss(sum3, activation_type, activation_params); + sum0 = activation_ss(sum0, activation_type, activation_params); + sum1 = activation_ss(sum1, activation_type, activation_params); + sum2 = activation_ss(sum2, activation_type, activation_params); + sum3 = activation_ss(sum3, activation_type, activation_params); - top_blob[p] = sum0; - top_blob[p + 1] = sum1; - top_blob[p + 2] = sum2; - top_blob[p + 3] = sum3; - } + top_blob[p] = sum0; + top_blob[p + 1] = sum1; + top_blob[p + 2] = sum2; + top_blob[p + 3] = sum3; + } #endif // __riscv_vector - // num_output - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = remain_num_output_start; p < num_output; p++) - { - float sum = 0.f; + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_num_output_start; p < num_output; p++) + { + float sum = 0.f; - if (bias_term) - sum = bias_data[p]; + if (bias_term) + sum = bias_data[p]; - const float* w = weight_data_ptr + num_input * p; + const float* w = (const float*)weight_data_tm + num_input * p; - // channels - for (int q = 0; q < channels; q++) - { - const float* m = bottom_blob.channel(q); + const float* m = bottom_blob_flattened; - for (int i = 0; i < size; i++) + for (int i = 0; i < num_input; i++) { sum += *m * *w; m++; w++; } - } - sum = activation_ss(sum, activation_type, activation_params); + sum = activation_ss(sum, activation_type, activation_params); - top_blob[p] = sum; + top_blob[p] = sum; + } } return 0; @@ -398,11 +549,11 @@ int InnerProduct_riscv::create_pipeline_fp16s(const Option& opt) { Mat weight_data_r2 = weight_data.reshape(num_input, num_output); - weight_data_fp16.create(num_input, num_output / out_elempack, (size_t)2u * out_elempack, out_elempack); + weight_data_tm.create(num_input, num_output / out_elempack, (size_t)2u * out_elempack, out_elempack); for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) { - __fp16* g0 = weight_data_fp16.row<__fp16>(q / out_elempack); + __fp16* g0 = weight_data_tm.row<__fp16>(q / out_elempack); for (int p = 0; p < num_input; p++) { @@ -416,6 +567,11 @@ int InnerProduct_riscv::create_pipeline_fp16s(const Option& opt) ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); + if (opt.lightmode) + { + weight_data.release(); + } + return 0; } @@ -451,7 +607,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con { for (int l = 0; l < packn; l++) { - const __fp16* kptr = (const __fp16*)weight_data_fp16 + num_input * p * packn + l; + const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn + l; const __fp16* m = bottom_blob.row(j); vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); @@ -489,7 +645,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con for (int p = 0; p < num_output / num_output_elempack; p++) { - const __fp16* kptr = (const __fp16*)weight_data_fp16 + num_input * p * packn; + const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn; const __fp16* m = bottom_blob.row(j); vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); @@ -526,7 +682,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con for (int p = 0; p < num_output; p++) { - const __fp16* kptr = (const __fp16*)weight_data_fp16 + num_input * p; + const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; const __fp16* m = bottom_blob.row(j); vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); @@ -561,7 +717,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con for (int p = 0; p < num_output; p++) { - const __fp16* kptr = (const __fp16*)weight_data_fp16 + num_input * p; + const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; const __fp16* m = bottom_blob.row(j); float sum = 0.f; @@ -621,7 +777,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con _sum = vle32_v_f32m2((const float*)bias_data + p * packn, vl); } - const __fp16* kptr = weight_data_fp16.row(p); + const __fp16* kptr = weight_data_tm.row(p); const __fp16* sptr = bottom_blob_flattened; @@ -655,7 +811,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con if (bias_term) sum = bias_data[p]; - const __fp16* kptr = weight_data_fp16.row<__fp16>(p); + const __fp16* kptr = weight_data_tm.row<__fp16>(p); const __fp16* sptr = bottom_blob_flattened; @@ -713,7 +869,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co { for (int l = 0; l < packn; l++) { - const __fp16* kptr = (const __fp16*)weight_data_fp16 + num_input * p * packn + l; + const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn + l; const __fp16* m = bottom_blob.row(j); vfloat16m1_t _sum = vfmv_v_f_f16m1((__fp16)0.f, vl); @@ -751,7 +907,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co for (int p = 0; p < num_output / num_output_elempack; p++) { - const __fp16* kptr = (const __fp16*)weight_data_fp16 + num_input * p * packn; + const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn; const __fp16* m = bottom_blob.row(j); vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); @@ -788,7 +944,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co for (int p = 0; p < num_output; p++) { - const __fp16* kptr = (const __fp16*)weight_data_fp16 + num_input * p; + const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; const __fp16* m = bottom_blob.row(j); vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); @@ -823,7 +979,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co for (int p = 0; p < num_output; p++) { - const __fp16* kptr = (const __fp16*)weight_data_fp16 + num_input * p; + const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; const __fp16* m = bottom_blob.row(j); float sum = 0.f; @@ -883,7 +1039,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co _sum = vle16_v_f16m1((const __fp16*)bias_data_fp16 + p * packn, vl); } - const __fp16* kptr = weight_data_fp16.row(p); + const __fp16* kptr = weight_data_tm.row(p); const __fp16* sptr = bottom_blob_flattened; @@ -917,7 +1073,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co if (bias_term) sum = bias_data[p]; - const __fp16* kptr = weight_data_fp16.row<__fp16>(p); + const __fp16* kptr = weight_data_tm.row<__fp16>(p); const __fp16* sptr = bottom_blob_flattened; diff --git a/src/layer/riscv/innerproduct_riscv.h b/src/layer/riscv/innerproduct_riscv.h index 5fc200e7d..0503ea3d4 100644 --- a/src/layer/riscv/innerproduct_riscv.h +++ b/src/layer/riscv/innerproduct_riscv.h @@ -39,8 +39,9 @@ protected: public: Layer* flatten; + Mat weight_data_tm; + // fp16 - Mat weight_data_fp16; Mat bias_data_fp16; }; diff --git a/src/layer/x86/convolutiondepthwise_x86.cpp b/src/layer/x86/convolutiondepthwise_x86.cpp index b0c6eb37d..659bbd297 100644 --- a/src/layer/x86/convolutiondepthwise_x86.cpp +++ b/src/layer/x86/convolutiondepthwise_x86.cpp @@ -96,8 +96,6 @@ int ConvolutionDepthWise_x86::create_pipeline(const Option& opt) { Mat weight_data_r2 = weight_data.reshape(maxk, group); convert_packing(weight_data_r2, weight_data_tm, 16, opt); - - return 0; } #endif // __AVX512F__ @@ -106,8 +104,6 @@ int ConvolutionDepthWise_x86::create_pipeline(const Option& opt) { Mat weight_data_r2 = weight_data.reshape(maxk, group); convert_packing(weight_data_r2, weight_data_tm, 8, opt); - - return 0; } #endif // __AVX__ @@ -116,8 +112,6 @@ int ConvolutionDepthWise_x86::create_pipeline(const Option& opt) { Mat weight_data_r2 = weight_data.reshape(maxk, group); convert_packing(weight_data_r2, weight_data_tm, 4, opt); - - return 0; } #endif // __SSE2__ @@ -127,14 +121,23 @@ int ConvolutionDepthWise_x86::create_pipeline(const Option& opt) if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { weight_data_tm = weight_data; - return 0; } - if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) { weight_data_tm = weight_data; - return 0; + } + else + { + create_group_ops(opt); } } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; } // group convolution