* discard riscv innerproduct weight * drop riscv conv convdw weight * drop riscv deconv deconvdw weighttags/20220701
| @@ -130,6 +130,11 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt) | |||
| ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); | |||
| if (opt.lightmode) | |||
| { | |||
| weight_data.release(); | |||
| } | |||
| return 0; | |||
| } | |||
| #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| @@ -153,6 +158,11 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt) | |||
| ncnn::cast_float32_to_bfloat16(weight_data, weight_data_tm, opt); | |||
| } | |||
| if (opt.lightmode) | |||
| { | |||
| weight_data.release(); | |||
| } | |||
| return 0; | |||
| } | |||
| #endif // NCNN_BF16 | |||
| @@ -163,8 +173,6 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt) | |||
| { | |||
| Mat weight_data_r2 = weight_data.reshape(maxk, group); | |||
| convert_packing(weight_data_r2, weight_data_tm, 4, opt); | |||
| return 0; | |||
| } | |||
| #endif // __ARM_NEON | |||
| @@ -173,24 +181,32 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt) | |||
| if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| weight_data_tm = weight_data; | |||
| return 0; | |||
| } | |||
| if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| weight_data_tm = weight_data; | |||
| return 0; | |||
| } | |||
| if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| weight_data_tm = weight_data; | |||
| return 0; | |||
| } | |||
| if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| weight_data_tm = weight_data; | |||
| return 0; | |||
| } | |||
| else | |||
| { | |||
| // group convolution | |||
| create_group_ops(opt); | |||
| } | |||
| } | |||
| if (opt.lightmode) | |||
| { | |||
| weight_data.release(); | |||
| } | |||
| return 0; | |||
| } | |||
| // group convolution | |||
| @@ -1591,6 +1607,11 @@ int ConvolutionDepthWise_arm::create_pipeline_int8_arm(const Option& opt) | |||
| weight_data_tm = weight_data; | |||
| } | |||
| if (opt.lightmode) | |||
| { | |||
| weight_data.release(); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -83,6 +83,11 @@ int ConvolutionDepthWise_mips::create_pipeline(const Option& opt) | |||
| weight_data_tm = weight_data; | |||
| } | |||
| if (opt.lightmode) | |||
| { | |||
| weight_data.release(); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -88,7 +88,7 @@ Convolution_riscv::Convolution_riscv() | |||
| activation = 0; | |||
| } | |||
| static void convolution_transform_kernel_packed_rvv(const Mat& weight_data, Mat& weight_data_packed, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack) | |||
| static void convolution_transform_kernel_packed_rvv(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack) | |||
| { | |||
| const int maxk = kernel_w * kernel_h; | |||
| @@ -97,11 +97,11 @@ static void convolution_transform_kernel_packed_rvv(const Mat& weight_data, Mat& | |||
| { | |||
| Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); | |||
| weight_data_packed.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack); | |||
| weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack); | |||
| for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) | |||
| { | |||
| float* g00 = weight_data_packed.channel(q / out_elempack); | |||
| float* g00 = weight_data_tm.channel(q / out_elempack); | |||
| for (int p = 0; p + (elempack - 1) < num_input; p += elempack) | |||
| { | |||
| @@ -131,6 +131,14 @@ int Convolution_riscv::create_pipeline(const Option& opt) | |||
| activation = create_activation_layer(activation_type, activation_params, opt); | |||
| #if NCNN_INT8 | |||
| if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) | |||
| { | |||
| // TODO implement int8 | |||
| return 0; | |||
| } | |||
| #endif | |||
| #if __riscv_vector && __riscv_zfh | |||
| if (opt.use_fp16_storage) | |||
| { | |||
| @@ -166,14 +174,14 @@ int Convolution_riscv::create_pipeline(const Option& opt) | |||
| } | |||
| else | |||
| { | |||
| convolution_transform_kernel_packed_rvv(weight_data, weight_data_packed, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); | |||
| convolution_transform_kernel_packed_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); | |||
| } | |||
| } | |||
| // pack1ton | |||
| if (elempack == 1 && out_elempack == packn) | |||
| { | |||
| convolution_transform_kernel_packed_rvv(weight_data, weight_data_packed, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); | |||
| convolution_transform_kernel_packed_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); | |||
| } | |||
| // packnto1 | |||
| @@ -181,19 +189,19 @@ int Convolution_riscv::create_pipeline(const Option& opt) | |||
| { | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_packnto1_rvv(weight_data, weight_data_packed, num_input, num_output, kernel_w, kernel_h); | |||
| convolution_im2col_sgemm_transform_kernel_packnto1_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_packnto1_rvv(weight_data, weight_data_packed, num_input, num_output, kernel_w, kernel_h); | |||
| convolution_im2col_sgemm_transform_kernel_packnto1_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (opt.use_sgemm_convolution) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_packnto1_rvv(weight_data, weight_data_packed, num_input, num_output, kernel_w, kernel_h); | |||
| convolution_im2col_sgemm_transform_kernel_packnto1_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else | |||
| { | |||
| convolution_transform_kernel_packed_rvv(weight_data, weight_data_packed, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); | |||
| convolution_transform_kernel_packed_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); | |||
| } | |||
| } | |||
| #endif // __riscv_vector | |||
| @@ -203,7 +211,7 @@ int Convolution_riscv::create_pipeline(const Option& opt) | |||
| { | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_rvv(weight_data, weight_data_packed, num_input, num_output, kernel_w, kernel_h); | |||
| convolution_im2col_sgemm_transform_kernel_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| @@ -218,10 +226,19 @@ int Convolution_riscv::create_pipeline(const Option& opt) | |||
| } | |||
| else if (opt.use_sgemm_convolution) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_rvv(weight_data, weight_data_packed, num_input, num_output, kernel_w, kernel_h); | |||
| convolution_im2col_sgemm_transform_kernel_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else | |||
| { | |||
| weight_data_tm = weight_data; | |||
| } | |||
| } | |||
| if (opt.lightmode) | |||
| { | |||
| weight_data.release(); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -240,7 +257,7 @@ int Convolution_riscv::destroy_pipeline(const Option& opt) | |||
| int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| #if NCNN_INT8 | |||
| if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) | |||
| if (opt.use_int8_inference && int8_scale_term) | |||
| { | |||
| Mat bottom_blob_unpacked = bottom_blob; | |||
| if (bottom_blob.elempack != 1) | |||
| @@ -266,9 +283,44 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti | |||
| } | |||
| #endif | |||
| if (bottom_blob.dims != 3) | |||
| // flattened blob, implement as InnerProduct | |||
| if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1) | |||
| { | |||
| return Convolution::forward(bottom_blob, top_blob, opt); | |||
| Mat bottom_blob_3d; | |||
| if (bottom_blob.elemsize % 16 == 0) | |||
| { | |||
| bottom_blob_3d = bottom_blob; | |||
| bottom_blob_3d.dims = 3; | |||
| bottom_blob_3d.w = 1; | |||
| bottom_blob_3d.h = 1; | |||
| bottom_blob_3d.c = bottom_blob.w; | |||
| bottom_blob_3d.cstep = 1; | |||
| } | |||
| else | |||
| { | |||
| bottom_blob_3d = bottom_blob.reshape(1, 1, bottom_blob.w, opt.workspace_allocator); | |||
| } | |||
| Mat top_blob_3d; | |||
| int ret = forward(bottom_blob_3d, top_blob_3d, opt); | |||
| if (ret != 0) | |||
| return ret; | |||
| if (top_blob_3d.elemsize % 16 == 0) | |||
| { | |||
| top_blob = top_blob_3d; | |||
| top_blob.dims = 1; | |||
| top_blob.w = top_blob_3d.c; | |||
| top_blob.h = 1; | |||
| top_blob.c = 1; | |||
| bottom_blob_3d.cstep = top_blob_3d.c; | |||
| } | |||
| else | |||
| { | |||
| top_blob = top_blob_3d.reshape(top_blob_3d.c, opt.blob_allocator); | |||
| } | |||
| return 0; | |||
| } | |||
| int elembits = bottom_blob.elembits(); | |||
| @@ -328,7 +380,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti | |||
| { | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv1x1s1_sgemm_packn_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt); | |||
| conv1x1s1_sgemm_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); | |||
| if (activation) | |||
| { | |||
| @@ -337,7 +389,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti | |||
| } | |||
| else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| conv1x1s2_sgemm_packn_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt); | |||
| conv1x1s2_sgemm_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); | |||
| if (activation) | |||
| { | |||
| @@ -363,7 +415,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti | |||
| } | |||
| else if (opt.use_sgemm_convolution) | |||
| { | |||
| convolution_im2col_sgemm_packn_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| convolution_im2col_sgemm_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| if (activation) | |||
| { | |||
| @@ -372,7 +424,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti | |||
| } | |||
| else | |||
| { | |||
| convolution_packn_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| convolution_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| } | |||
| } | |||
| @@ -380,7 +432,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti | |||
| { | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv1x1s1_sgemm_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt); | |||
| conv1x1s1_sgemm_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); | |||
| if (activation) | |||
| { | |||
| @@ -389,7 +441,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti | |||
| } | |||
| else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv3x3s1_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt); | |||
| conv3x3s1_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); | |||
| if (activation) | |||
| { | |||
| @@ -398,7 +450,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti | |||
| } | |||
| else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| conv3x3s2_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt); | |||
| conv3x3s2_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); | |||
| if (activation) | |||
| { | |||
| @@ -407,7 +459,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti | |||
| } | |||
| else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| conv7x7s2_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt); | |||
| conv7x7s2_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); | |||
| if (activation) | |||
| { | |||
| @@ -416,7 +468,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti | |||
| } | |||
| else if (opt.use_sgemm_convolution) | |||
| { | |||
| convolution_im2col_sgemm_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| convolution_im2col_sgemm_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| if (activation) | |||
| { | |||
| @@ -425,7 +477,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti | |||
| } | |||
| else | |||
| { | |||
| convolution_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| convolution_pack1ton_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| } | |||
| } | |||
| @@ -433,7 +485,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti | |||
| { | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv1x1s1_sgemm_packnto1_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt); | |||
| conv1x1s1_sgemm_packnto1_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); | |||
| if (activation) | |||
| { | |||
| @@ -442,7 +494,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti | |||
| } | |||
| else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| conv1x1s2_sgemm_packnto1_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt); | |||
| conv1x1s2_sgemm_packnto1_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); | |||
| if (activation) | |||
| { | |||
| @@ -451,7 +503,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti | |||
| } | |||
| else if (opt.use_sgemm_convolution) | |||
| { | |||
| convolution_im2col_sgemm_packnto1_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| convolution_im2col_sgemm_packnto1_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| if (activation) | |||
| { | |||
| @@ -460,7 +512,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti | |||
| } | |||
| else | |||
| { | |||
| convolution_packnto1_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| convolution_packnto1_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| } | |||
| } | |||
| #endif // __riscv_vector | |||
| @@ -469,7 +521,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti | |||
| { | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv1x1s1_sgemm_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt); | |||
| conv1x1s1_sgemm_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); | |||
| if (activation) | |||
| { | |||
| @@ -494,7 +546,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti | |||
| } | |||
| else if (opt.use_sgemm_convolution) | |||
| { | |||
| convolution_im2col_sgemm_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| convolution_im2col_sgemm_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| if (activation) | |||
| { | |||
| @@ -541,7 +593,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti | |||
| sum = bias_data[p]; | |||
| } | |||
| const float* kptr = (const float*)weight_data + maxk * channels * p; | |||
| const float* kptr = (const float*)weight_data_tm + maxk * channels * p; | |||
| // channels | |||
| for (int q = 0; q < channels; q++) | |||
| @@ -666,7 +718,7 @@ int Convolution_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector | |||
| } | |||
| #if __riscv_vector && __riscv_zfh | |||
| static void convolution_transform_kernel_packed_fp16s_rvv(const Mat& weight_data, Mat& weight_data_fp16, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack) | |||
| static void convolution_transform_kernel_packed_fp16s_rvv(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack) | |||
| { | |||
| const int maxk = kernel_w * kernel_h; | |||
| @@ -675,11 +727,11 @@ static void convolution_transform_kernel_packed_fp16s_rvv(const Mat& weight_data | |||
| { | |||
| Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); | |||
| weight_data_fp16.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)2u * elempack * out_elempack, elempack * out_elempack); | |||
| weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)2u * elempack * out_elempack, elempack * out_elempack); | |||
| for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) | |||
| { | |||
| __fp16* g00 = weight_data_fp16.channel(q / out_elempack); | |||
| __fp16* g00 = weight_data_tm.channel(q / out_elempack); | |||
| for (int p = 0; p + (elempack - 1) < num_input; p += elempack) | |||
| { | |||
| @@ -728,14 +780,14 @@ int Convolution_riscv::create_pipeline_fp16s(const Option& opt) | |||
| } | |||
| else | |||
| { | |||
| convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_fp16, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); | |||
| convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); | |||
| } | |||
| } | |||
| // pack1ton | |||
| if (elempack == 1 && out_elempack == packn) | |||
| { | |||
| convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_fp16, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); | |||
| convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); | |||
| } | |||
| // packnto1 | |||
| @@ -743,19 +795,19 @@ int Convolution_riscv::create_pipeline_fp16s(const Option& opt) | |||
| { | |||
| if (opt.use_fp16_arithmetic && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(weight_data, weight_data_fp16, num_input, num_output, kernel_w, kernel_h); | |||
| convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (opt.use_fp16_arithmetic && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(weight_data, weight_data_fp16, num_input, num_output, kernel_w, kernel_h); | |||
| convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (opt.use_fp16_arithmetic && opt.use_sgemm_convolution) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(weight_data, weight_data_fp16, num_input, num_output, kernel_w, kernel_h); | |||
| convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else | |||
| { | |||
| convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_fp16, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); | |||
| convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); | |||
| } | |||
| } | |||
| @@ -764,15 +816,15 @@ int Convolution_riscv::create_pipeline_fp16s(const Option& opt) | |||
| { | |||
| if (opt.use_fp16_arithmetic && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_fp16sa_rvv(weight_data, weight_data_fp16, num_input, num_output, kernel_w, kernel_h); | |||
| convolution_im2col_sgemm_transform_kernel_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (opt.use_fp16_arithmetic && opt.use_sgemm_convolution) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_fp16sa_rvv(weight_data, weight_data_fp16, num_input, num_output, kernel_w, kernel_h); | |||
| convolution_im2col_sgemm_transform_kernel_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else | |||
| { | |||
| convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_fp16, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); | |||
| convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); | |||
| } | |||
| } | |||
| @@ -781,6 +833,11 @@ int Convolution_riscv::create_pipeline_fp16s(const Option& opt) | |||
| ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); | |||
| } | |||
| if (opt.lightmode) | |||
| { | |||
| weight_data.release(); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -818,28 +875,28 @@ int Convolution_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, cons | |||
| if (elempack == packn && out_elempack == packn) | |||
| { | |||
| { | |||
| convolution_packn_fp16s_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| convolution_packn_fp16s_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| } | |||
| } | |||
| if (elempack == 1 && out_elempack == packn) | |||
| { | |||
| { | |||
| convolution_pack1ton_fp16s_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| convolution_pack1ton_fp16s_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| } | |||
| } | |||
| if (elempack == packn && out_elempack == 1) | |||
| { | |||
| { | |||
| convolution_packnto1_fp16s_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| convolution_packnto1_fp16s_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| } | |||
| } | |||
| if (elempack == 1 && out_elempack == 1) | |||
| { | |||
| { | |||
| convolution_fp16s(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| convolution_fp16s(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| } | |||
| } | |||
| @@ -884,7 +941,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con | |||
| { | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv1x1s1_sgemm_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt); | |||
| conv1x1s1_sgemm_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); | |||
| if (activation) | |||
| { | |||
| @@ -893,7 +950,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con | |||
| } | |||
| else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| conv1x1s2_sgemm_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt); | |||
| conv1x1s2_sgemm_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); | |||
| if (activation) | |||
| { | |||
| @@ -919,7 +976,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con | |||
| } | |||
| else if (opt.use_sgemm_convolution) | |||
| { | |||
| convolution_im2col_sgemm_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| convolution_im2col_sgemm_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| if (activation) | |||
| { | |||
| @@ -928,7 +985,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con | |||
| } | |||
| else | |||
| { | |||
| convolution_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| convolution_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| } | |||
| } | |||
| @@ -936,7 +993,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con | |||
| { | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv1x1s1_sgemm_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt); | |||
| conv1x1s1_sgemm_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); | |||
| if (activation) | |||
| { | |||
| @@ -945,7 +1002,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con | |||
| } | |||
| else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv3x3s1_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt); | |||
| conv3x3s1_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); | |||
| if (activation) | |||
| { | |||
| @@ -954,7 +1011,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con | |||
| } | |||
| else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| conv3x3s2_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt); | |||
| conv3x3s2_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); | |||
| if (activation) | |||
| { | |||
| @@ -963,7 +1020,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con | |||
| } | |||
| else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| conv7x7s2_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt); | |||
| conv7x7s2_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); | |||
| if (activation) | |||
| { | |||
| @@ -972,7 +1029,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con | |||
| } | |||
| else if (opt.use_sgemm_convolution) | |||
| { | |||
| convolution_im2col_sgemm_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| convolution_im2col_sgemm_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| if (activation) | |||
| { | |||
| @@ -981,7 +1038,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con | |||
| } | |||
| else | |||
| { | |||
| convolution_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| convolution_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| } | |||
| } | |||
| @@ -989,7 +1046,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con | |||
| { | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv1x1s1_sgemm_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt); | |||
| conv1x1s1_sgemm_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); | |||
| if (activation) | |||
| { | |||
| @@ -998,7 +1055,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con | |||
| } | |||
| else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| conv1x1s2_sgemm_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt); | |||
| conv1x1s2_sgemm_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); | |||
| if (activation) | |||
| { | |||
| @@ -1007,7 +1064,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con | |||
| } | |||
| else if (opt.use_sgemm_convolution) | |||
| { | |||
| convolution_im2col_sgemm_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| convolution_im2col_sgemm_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| if (activation) | |||
| { | |||
| @@ -1016,7 +1073,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con | |||
| } | |||
| else | |||
| { | |||
| convolution_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| convolution_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| } | |||
| } | |||
| @@ -1024,7 +1081,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con | |||
| { | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv1x1s1_sgemm_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt); | |||
| conv1x1s1_sgemm_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); | |||
| if (activation) | |||
| { | |||
| @@ -1033,7 +1090,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con | |||
| } | |||
| else if (opt.use_sgemm_convolution) | |||
| { | |||
| convolution_im2col_sgemm_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| convolution_im2col_sgemm_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| if (activation) | |||
| { | |||
| @@ -1042,7 +1099,7 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con | |||
| } | |||
| else | |||
| { | |||
| convolution_fp16s(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| convolution_fp16s(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| } | |||
| } | |||
| @@ -41,14 +41,12 @@ protected: | |||
| public: | |||
| Layer* activation; | |||
| // packn | |||
| Mat weight_data_packed; | |||
| Mat weight_data_tm; | |||
| Mat weight_winograd23_data; | |||
| Mat weight_winograd43_data; | |||
| Mat weight_winograd63_data; | |||
| // fp16 | |||
| Mat weight_data_fp16; | |||
| Mat bias_data_fp16; | |||
| }; | |||
| @@ -61,6 +61,14 @@ int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt) | |||
| activation = create_activation_layer(activation_type, activation_params, opt); | |||
| #if NCNN_INT8 | |||
| if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) | |||
| { | |||
| // TODO implement int8 | |||
| return 0; | |||
| } | |||
| #endif | |||
| #if __riscv_vector && __riscv_zfh | |||
| if (opt.use_fp16_storage) | |||
| { | |||
| @@ -91,12 +99,18 @@ int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt) | |||
| if (elempack == packn) | |||
| { | |||
| Mat weight_data_r2 = weight_data.reshape(maxk, group); | |||
| convert_packing(weight_data_r2, weight_data_packed, packn, opt); | |||
| convert_packing(weight_data_r2, weight_data_tm, packn, opt); | |||
| } | |||
| #endif // __riscv_vector | |||
| if (elempack == 1) | |||
| { | |||
| weight_data_tm = weight_data; | |||
| } | |||
| if (opt.lightmode) | |||
| { | |||
| weight_data.release(); | |||
| } | |||
| return 0; | |||
| @@ -105,6 +119,11 @@ int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt) | |||
| // group convolution | |||
| create_group_ops(opt); | |||
| if (opt.lightmode) | |||
| { | |||
| weight_data.release(); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -126,7 +145,7 @@ int ConvolutionDepthWise_riscv::create_group_ops(const Option& opt) | |||
| for (int g = 0; g < group; g++) | |||
| { | |||
| Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g); | |||
| Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g).clone(); | |||
| Mat bias_data_g; | |||
| if (bias_term) | |||
| bias_data_g = bias_data.range(num_output_g * g, num_output_g); | |||
| @@ -227,7 +246,7 @@ int ConvolutionDepthWise_riscv::destroy_pipeline(const Option& opt) | |||
| int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| #if NCNN_INT8 | |||
| if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) | |||
| if (opt.use_int8_inference && int8_scale_term) | |||
| { | |||
| Mat bottom_blob_unpacked = bottom_blob; | |||
| if (bottom_blob.elempack != 1) | |||
| @@ -310,7 +329,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c | |||
| { | |||
| if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| convdw3x3s1_packn_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt); | |||
| convdw3x3s1_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); | |||
| if (activation) | |||
| { | |||
| @@ -319,7 +338,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c | |||
| } | |||
| else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| convdw3x3s2_packn_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt); | |||
| convdw3x3s2_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); | |||
| if (activation) | |||
| { | |||
| @@ -328,7 +347,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c | |||
| } | |||
| else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| convdw5x5s1_packn_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt); | |||
| convdw5x5s1_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); | |||
| if (activation) | |||
| { | |||
| @@ -337,7 +356,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c | |||
| } | |||
| else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| convdw5x5s2_packn_rvv(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt); | |||
| convdw5x5s2_packn_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); | |||
| if (activation) | |||
| { | |||
| @@ -371,7 +390,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c | |||
| for (int g = 0; g < channels; g++) | |||
| { | |||
| float* outptr = top_blob.channel(g); | |||
| const float* kptr = (const float*)weight_data_packed + maxk * g * packn; | |||
| const float* kptr = (const float*)weight_data_tm + maxk * g * packn; | |||
| const Mat m = bottom_blob_bordered.channel(g); | |||
| for (int i = 0; i < outh; i++) | |||
| @@ -410,7 +429,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c | |||
| { | |||
| if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| convdw3x3s1_rvv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); | |||
| convdw3x3s1_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); | |||
| if (activation) | |||
| { | |||
| @@ -419,7 +438,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c | |||
| } | |||
| else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| convdw3x3s2_rvv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); | |||
| convdw3x3s2_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); | |||
| if (activation) | |||
| { | |||
| @@ -453,7 +472,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c | |||
| for (int g = 0; g < group; g++) | |||
| { | |||
| float* outptr = top_blob.channel(g); | |||
| const float* kptr = (const float*)weight_data + maxk * g; | |||
| const float* kptr = (const float*)weight_data_tm + maxk * g; | |||
| const Mat m = bottom_blob_bordered.channel(g); | |||
| for (int i = 0; i < outh; i++) | |||
| @@ -663,22 +682,32 @@ int ConvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt) | |||
| Mat weight_data_r2_packed; | |||
| convert_packing(weight_data_r2, weight_data_r2_packed, packn, opt); | |||
| ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_fp16, opt); | |||
| ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_tm, opt); | |||
| } | |||
| if (elempack == 1) | |||
| { | |||
| ncnn::cast_float32_to_float16(weight_data, weight_data_fp16, opt); | |||
| ncnn::cast_float32_to_float16(weight_data, weight_data_tm, opt); | |||
| } | |||
| ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); | |||
| if (opt.lightmode) | |||
| { | |||
| weight_data.release(); | |||
| } | |||
| return 0; | |||
| } | |||
| // group convolution | |||
| create_group_ops(opt); | |||
| if (opt.lightmode) | |||
| { | |||
| weight_data.release(); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -744,7 +773,7 @@ int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_b | |||
| for (int g = 0; g < channels; g++) | |||
| { | |||
| __fp16* outptr = top_blob.channel(g); | |||
| const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * packn; | |||
| const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * packn; | |||
| const Mat m = bottom_blob_bordered.channel(g); | |||
| for (int i = 0; i < outh; i++) | |||
| @@ -806,7 +835,7 @@ int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_b | |||
| for (int g = 0; g < group; g++) | |||
| { | |||
| __fp16* outptr = top_blob.channel(g); | |||
| const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g; | |||
| const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g; | |||
| const Mat m = bottom_blob_bordered.channel(g); | |||
| for (int i = 0; i < outh; i++) | |||
| @@ -930,7 +959,7 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_ | |||
| { | |||
| if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| convdw3x3s1_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt); | |||
| convdw3x3s1_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); | |||
| if (activation) | |||
| { | |||
| @@ -939,7 +968,7 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_ | |||
| } | |||
| else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| convdw3x3s2_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt); | |||
| convdw3x3s2_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); | |||
| if (activation) | |||
| { | |||
| @@ -948,7 +977,7 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_ | |||
| } | |||
| else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| convdw5x5s1_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt); | |||
| convdw5x5s1_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); | |||
| if (activation) | |||
| { | |||
| @@ -957,7 +986,7 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_ | |||
| } | |||
| else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| convdw5x5s2_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt); | |||
| convdw5x5s2_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); | |||
| if (activation) | |||
| { | |||
| @@ -991,7 +1020,7 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_ | |||
| for (int g = 0; g < channels; g++) | |||
| { | |||
| __fp16* outptr = top_blob.channel(g); | |||
| const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * packn; | |||
| const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * packn; | |||
| const Mat m = bottom_blob_bordered.channel(g); | |||
| for (int i = 0; i < outh; i++) | |||
| @@ -1053,7 +1082,7 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_ | |||
| for (int g = 0; g < group; g++) | |||
| { | |||
| __fp16* outptr = top_blob.channel(g); | |||
| const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g; | |||
| const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g; | |||
| const Mat m = bottom_blob_bordered.channel(g); | |||
| for (int i = 0; i < outh; i++) | |||
| @@ -43,11 +43,9 @@ public: | |||
| Layer* activation; | |||
| std::vector<ncnn::Layer*> group_ops; | |||
| // packing | |||
| Mat weight_data_packed; | |||
| Mat weight_data_tm; | |||
| // fp16 | |||
| Mat weight_data_fp16; | |||
| Mat bias_data_fp16; | |||
| }; | |||
| @@ -100,11 +100,11 @@ int Deconvolution_riscv::create_pipeline(const Option& opt) | |||
| { | |||
| Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output); | |||
| weight_data_packed.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack); | |||
| weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack); | |||
| for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) | |||
| { | |||
| float* g00 = weight_data_packed.channel(q / out_elempack); | |||
| float* g00 = weight_data_tm.channel(q / out_elempack); | |||
| for (int p = 0; p + (elempack - 1) < num_input; p += elempack) | |||
| { | |||
| @@ -148,6 +148,11 @@ int Deconvolution_riscv::create_pipeline(const Option& opt) | |||
| { | |||
| } | |||
| if (opt.lightmode) | |||
| { | |||
| weight_data.release(); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -218,21 +223,21 @@ int Deconvolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Op | |||
| if (elempack == packn && out_elempack == packn) | |||
| { | |||
| { | |||
| deconvolution_packn_rvv(bottom_blob, top_blob_bordered, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| deconvolution_packn_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| } | |||
| } | |||
| if (elempack == 1 && out_elempack == packn) | |||
| { | |||
| { | |||
| deconvolution_pack1ton_rvv(bottom_blob, top_blob_bordered, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| deconvolution_pack1ton_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| } | |||
| } | |||
| if (elempack == packn && out_elempack == 1) | |||
| { | |||
| { | |||
| deconvolution_packnto1_rvv(bottom_blob, top_blob_bordered, weight_data_packed, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| deconvolution_packnto1_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| } | |||
| } | |||
| #endif // __riscv_vector | |||
| @@ -257,7 +262,7 @@ int Deconvolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Op | |||
| sum = bias_data[p]; | |||
| } | |||
| const float* kptr = (const float*)weight_data_packed.channel(p); | |||
| const float* kptr = (const float*)weight_data_tm.channel(p); | |||
| // channels | |||
| for (int q = 0; q < channels; q++) | |||
| @@ -356,11 +361,11 @@ int Deconvolution_riscv::create_pipeline_fp16s(const Option& opt) | |||
| { | |||
| Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output); | |||
| weight_data_fp16.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)2u * elempack * out_elempack, elempack * out_elempack); | |||
| weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)2u * elempack * out_elempack, elempack * out_elempack); | |||
| for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) | |||
| { | |||
| __fp16* g00 = weight_data_fp16.channel(q / out_elempack); | |||
| __fp16* g00 = weight_data_tm.channel(q / out_elempack); | |||
| for (int p = 0; p + (elempack - 1) < num_input; p += elempack) | |||
| { | |||
| @@ -404,6 +409,11 @@ int Deconvolution_riscv::create_pipeline_fp16s(const Option& opt) | |||
| ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); | |||
| if (opt.lightmode) | |||
| { | |||
| weight_data.release(); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -446,28 +456,28 @@ int Deconvolution_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, co | |||
| if (elempack == packn && out_elempack == packn) | |||
| { | |||
| { | |||
| deconvolution_packn_fp16s_rvv(bottom_blob, top_blob_bordered, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| deconvolution_packn_fp16s_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| } | |||
| } | |||
| if (elempack == 1 && out_elempack == packn) | |||
| { | |||
| { | |||
| deconvolution_pack1ton_fp16s_rvv(bottom_blob, top_blob_bordered, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| deconvolution_pack1ton_fp16s_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| } | |||
| } | |||
| if (elempack == packn && out_elempack == 1) | |||
| { | |||
| { | |||
| deconvolution_packnto1_fp16s_rvv(bottom_blob, top_blob_bordered, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| deconvolution_packnto1_fp16s_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| } | |||
| } | |||
| if (elempack == 1 && out_elempack == 1) | |||
| { | |||
| { | |||
| deconvolution_fp16s(bottom_blob, top_blob_bordered, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| deconvolution_fp16s(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| } | |||
| } | |||
| @@ -517,28 +527,28 @@ int Deconvolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, c | |||
| if (elempack == packn && out_elempack == packn) | |||
| { | |||
| { | |||
| deconvolution_packn_fp16sa_rvv(bottom_blob, top_blob_bordered, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| deconvolution_packn_fp16sa_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| } | |||
| } | |||
| if (elempack == 1 && out_elempack == packn) | |||
| { | |||
| { | |||
| deconvolution_pack1ton_fp16sa_rvv(bottom_blob, top_blob_bordered, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| deconvolution_pack1ton_fp16sa_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| } | |||
| } | |||
| if (elempack == packn && out_elempack == 1) | |||
| { | |||
| { | |||
| deconvolution_packnto1_fp16sa_rvv(bottom_blob, top_blob_bordered, weight_data_fp16, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| deconvolution_packnto1_fp16sa_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| } | |||
| } | |||
| if (elempack == 1 && out_elempack == 1) | |||
| { | |||
| { | |||
| deconvolution_fp16s(bottom_blob, top_blob_bordered, weight_data_fp16, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| deconvolution_fp16s(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); | |||
| } | |||
| } | |||
| @@ -37,11 +37,9 @@ protected: | |||
| #endif | |||
| public: | |||
| // packn | |||
| Mat weight_data_packed; | |||
| Mat weight_data_tm; | |||
| // fp16 | |||
| Mat weight_data_fp16; | |||
| Mat bias_data_fp16; | |||
| }; | |||
| @@ -88,13 +88,18 @@ int DeconvolutionDepthWise_riscv::create_pipeline(const Option& opt) | |||
| if (elempack == packn) | |||
| { | |||
| Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group); | |||
| convert_packing(weight_data_r2, weight_data_packed, packn, opt); | |||
| convert_packing(weight_data_r2, weight_data_tm, packn, opt); | |||
| } | |||
| #endif // __riscv_vector | |||
| if (elempack == 1) | |||
| { | |||
| weight_data_packed = weight_data_transposed; | |||
| weight_data_tm = weight_data_transposed; | |||
| } | |||
| if (opt.lightmode) | |||
| { | |||
| weight_data.release(); | |||
| } | |||
| return 0; | |||
| @@ -103,6 +108,11 @@ int DeconvolutionDepthWise_riscv::create_pipeline(const Option& opt) | |||
| // group convolution | |||
| create_group_ops(opt); | |||
| if (opt.lightmode) | |||
| { | |||
| weight_data.release(); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -124,7 +134,7 @@ int DeconvolutionDepthWise_riscv::create_group_ops(const Option& opt) | |||
| for (int g = 0; g < group; g++) | |||
| { | |||
| Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g); | |||
| Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g).clone(); | |||
| Mat bias_data_g; | |||
| if (bias_term) | |||
| bias_data_g = bias_data.range(num_output_g * g, num_output_g); | |||
| @@ -256,7 +266,7 @@ int DeconvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, | |||
| for (int g = 0; g < channels; g++) | |||
| { | |||
| float* outptr = top_blob_bordered.channel(g); | |||
| const float* kptr = (const float*)weight_data_packed + maxk * g * packn; | |||
| const float* kptr = (const float*)weight_data_tm + maxk * g * packn; | |||
| const Mat m = bottom_blob.channel(g); | |||
| for (int i = 0; i < outh; i++) | |||
| @@ -318,7 +328,7 @@ int DeconvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, | |||
| for (int g = 0; g < channels; g++) | |||
| { | |||
| float* outptr = top_blob_bordered.channel(g); | |||
| const float* kptr = (const float*)weight_data_packed + maxk * g; | |||
| const float* kptr = (const float*)weight_data_tm + maxk * g; | |||
| const Mat m = bottom_blob.channel(g); | |||
| for (int i = 0; i < outh; i++) | |||
| @@ -480,22 +490,32 @@ int DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt) | |||
| Mat weight_data_r2_packed; | |||
| convert_packing(weight_data_r2, weight_data_r2_packed, packn, opt); | |||
| ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_fp16, opt); | |||
| ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_tm, opt); | |||
| } | |||
| if (elempack == 1) | |||
| { | |||
| ncnn::cast_float32_to_float16(weight_data_transposed, weight_data_fp16, opt); | |||
| ncnn::cast_float32_to_float16(weight_data_transposed, weight_data_tm, opt); | |||
| } | |||
| ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); | |||
| if (opt.lightmode) | |||
| { | |||
| weight_data.release(); | |||
| } | |||
| return 0; | |||
| } | |||
| // group convolution | |||
| create_group_ops(opt); | |||
| if (opt.lightmode) | |||
| { | |||
| weight_data.release(); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -543,7 +563,7 @@ int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top | |||
| for (int g = 0; g < channels; g++) | |||
| { | |||
| __fp16* outptr = top_blob_bordered.channel(g); | |||
| const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * packn; | |||
| const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * packn; | |||
| const Mat m = bottom_blob.channel(g); | |||
| for (int i = 0; i < outh; i++) | |||
| @@ -605,7 +625,7 @@ int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top | |||
| for (int g = 0; g < channels; g++) | |||
| { | |||
| __fp16* outptr = top_blob_bordered.channel(g); | |||
| const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g; | |||
| const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g; | |||
| const Mat m = bottom_blob.channel(g); | |||
| for (int i = 0; i < outh; i++) | |||
| @@ -764,7 +784,7 @@ int DeconvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& to | |||
| for (int g = 0; g < channels; g++) | |||
| { | |||
| __fp16* outptr = top_blob_bordered.channel(g); | |||
| const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * packn; | |||
| const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * packn; | |||
| const Mat m = bottom_blob.channel(g); | |||
| for (int i = 0; i < outh; i++) | |||
| @@ -826,7 +846,7 @@ int DeconvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& to | |||
| for (int g = 0; g < channels; g++) | |||
| { | |||
| __fp16* outptr = top_blob_bordered.channel(g); | |||
| const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g; | |||
| const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g; | |||
| const Mat m = bottom_blob.channel(g); | |||
| for (int i = 0; i < outh; i++) | |||
| @@ -40,11 +40,9 @@ protected: | |||
| public: | |||
| std::vector<ncnn::Layer*> group_ops; | |||
| // packing | |||
| Mat weight_data_packed; | |||
| Mat weight_data_tm; | |||
| // fp16 | |||
| Mat weight_data_fp16; | |||
| Mat bias_data_fp16; | |||
| }; | |||
| @@ -43,8 +43,6 @@ InnerProduct_riscv::InnerProduct_riscv() | |||
| int InnerProduct_riscv::create_pipeline(const Option& opt) | |||
| { | |||
| #if __riscv_vector | |||
| if (opt.use_packing_layout || opt.use_int8_inference) | |||
| { | |||
| flatten = ncnn::create_layer(ncnn::LayerType::Flatten); | |||
| @@ -54,7 +52,14 @@ int InnerProduct_riscv::create_pipeline(const Option& opt) | |||
| flatten->create_pipeline(opt); | |||
| } | |||
| #endif // __riscv_vector | |||
| #if NCNN_INT8 | |||
| if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) | |||
| { | |||
| // TODO implement int8 | |||
| return 0; | |||
| } | |||
| #endif | |||
| #if __riscv_vector && __riscv_zfh | |||
| if (opt.use_fp16_storage) | |||
| @@ -63,6 +68,53 @@ int InnerProduct_riscv::create_pipeline(const Option& opt) | |||
| } | |||
| #endif | |||
| int out_elempack = 1; | |||
| #if __riscv_vector | |||
| const int packn = csrr_vlenb() / 4; | |||
| const int num_input = weight_data_size / num_output; | |||
| if (opt.use_packing_layout) | |||
| { | |||
| out_elempack = num_output % packn == 0 ? packn : 1; | |||
| } | |||
| if (out_elempack == packn) | |||
| { | |||
| // src = inch-outch | |||
| // dst = packn-inch-outch/packn | |||
| { | |||
| Mat weight_data_r2 = weight_data.reshape(num_input, num_output); | |||
| weight_data_tm.create(num_input, num_output / packn, (size_t)4u * packn, packn); | |||
| for (int q = 0; q + (packn - 1) < num_output; q += packn) | |||
| { | |||
| float* g0 = weight_data_tm.row(q / packn); | |||
| for (int p = 0; p < num_input; p++) | |||
| { | |||
| for (int j = 0; j < packn; j++) | |||
| { | |||
| *g0++ = weight_data_r2.row(q + j)[p]; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| #endif // __riscv_vector | |||
| if (out_elempack == 1) | |||
| { | |||
| weight_data_tm = weight_data; | |||
| } | |||
| if (opt.lightmode) | |||
| { | |||
| weight_data.release(); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -81,7 +133,7 @@ int InnerProduct_riscv::destroy_pipeline(const Option& opt) | |||
| int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| #if NCNN_INT8 | |||
| if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) | |||
| if (opt.use_int8_inference && int8_scale_term) | |||
| { | |||
| Mat bottom_blob_unpacked = bottom_blob; | |||
| if (bottom_blob.elempack != 1) | |||
| @@ -136,20 +188,104 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| int num_output_elempack = 1; | |||
| #if __riscv_vector | |||
| if (opt.use_packing_layout) | |||
| { | |||
| num_output_elempack = num_output % packn == 0 ? packn : 1; | |||
| } | |||
| #endif | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int j = 0; j < h; j++) | |||
| { | |||
| #if __riscv_vector | |||
| if (elempack == packn) | |||
| if (elempack == packn && num_output_elempack == packn) | |||
| { | |||
| const word_type vl = vsetvl_e32m1(packn); | |||
| float* outptr = top_blob.row(j); | |||
| for (int p = 0; p < num_output / num_output_elempack; p++) | |||
| { | |||
| for (int l = 0; l < packn; l++) | |||
| { | |||
| const float* kptr = weight_data_tm.row(p) + l; | |||
| const float* m = bottom_blob.row(j); | |||
| vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); | |||
| if (bias_term) | |||
| { | |||
| _sum = vfmv_v_f_f32m1(bias_data[p * packn + l], vl); | |||
| } | |||
| int n = num_input; | |||
| while (n > 0) | |||
| { | |||
| vfloat32m1_t _val = vle32_v_f32m1(m, vl); | |||
| _sum = vfmacc_vf_f32m1(_sum, *kptr, _val, vl); | |||
| m += packn; | |||
| kptr += packn; | |||
| n -= 1; | |||
| } | |||
| _sum = activation_ps(_sum, activation_type, activation_params, vl); | |||
| vse32_v_f32m1(outptr, _sum, vl); | |||
| outptr += packn; | |||
| } | |||
| } | |||
| } | |||
| if (elempack == 1 && num_output_elempack == packn) | |||
| { | |||
| const word_type vl = vsetvl_e32m1(packn); | |||
| float* outptr = top_blob.row(j); | |||
| for (int p = 0; p < num_output / num_output_elempack; p++) | |||
| { | |||
| const float* kptr = weight_data_tm.row(p); | |||
| const float* m = bottom_blob.row(j); | |||
| vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); | |||
| if (bias_term) | |||
| { | |||
| _sum = vle32_v_f32m1((const float*)bias_data + p * packn, vl); | |||
| } | |||
| int n = num_input; | |||
| while (n > 0) | |||
| { | |||
| vfloat32m1_t _w = vle32_v_f32m1(kptr, vl); | |||
| _sum = vfmacc_vf_f32m1(_sum, *m, _w, vl); | |||
| m += 1; | |||
| kptr += packn; | |||
| n -= 1; | |||
| } | |||
| _sum = activation_ps(_sum, activation_type, activation_params, vl); | |||
| vse32_v_f32m1(outptr, _sum, vl); | |||
| outptr += packn; | |||
| } | |||
| } | |||
| if (elempack == packn && num_output_elempack == 1) | |||
| { | |||
| const word_type vl = vsetvl_e32m1(packn); | |||
| float* outptr = top_blob.row(j); | |||
| for (int p = 0; p < num_output; p++) | |||
| { | |||
| const float* kptr = (const float*)weight_data + num_input * p; | |||
| const float* kptr = (const float*)weight_data_tm + num_input * p; | |||
| const float* m = bottom_blob.row(j); | |||
| const word_type vl = vsetvl_e32m1(packn); | |||
| vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); | |||
| if (bias_term) | |||
| @@ -176,13 +312,13 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt | |||
| } | |||
| #endif // __riscv_vector | |||
| if (elempack == 1) | |||
| if (elempack == 1 && num_output_elempack == 1) | |||
| { | |||
| float* outptr = top_blob.row(j); | |||
| for (int p = 0; p < num_output; p++) | |||
| { | |||
| const float* kptr = (const float*)weight_data + num_input * p; | |||
| const float* kptr = (const float*)weight_data_tm + num_input * p; | |||
| const float* m = bottom_blob.row(j); | |||
| float sum = 0.f; | |||
| @@ -208,69 +344,93 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt | |||
| return 0; | |||
| } | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int elempack = bottom_blob.elempack; | |||
| int size = w * h; | |||
| #if __riscv_vector | |||
| if (elempack == packn) | |||
| // flatten | |||
| Mat bottom_blob_flattened = bottom_blob; | |||
| if (bottom_blob.dims != 1) | |||
| { | |||
| // flatten | |||
| Mat bottom_blob_flattened = bottom_blob; | |||
| if (bottom_blob.dims != 1) | |||
| { | |||
| Option opt_flatten = opt; | |||
| opt_flatten.blob_allocator = opt.workspace_allocator; | |||
| Option opt_flatten = opt; | |||
| opt_flatten.blob_allocator = opt.workspace_allocator; | |||
| flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); | |||
| } | |||
| flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); | |||
| } | |||
| // pack1 | |||
| { | |||
| bottom_blob_flattened.w *= bottom_blob_flattened.elempack; | |||
| bottom_blob_flattened.cstep = bottom_blob_flattened.w; | |||
| bottom_blob_flattened.elemsize = 4u; | |||
| bottom_blob_flattened.elempack = 1; | |||
| } | |||
| size_t elemsize = bottom_blob_flattened.elemsize; | |||
| int elempack = bottom_blob_flattened.elempack; | |||
| return forward(bottom_blob_flattened, top_blob, opt); | |||
| int out_elempack = 1; | |||
| #if __riscv_vector | |||
| if (opt.use_packing_layout) | |||
| { | |||
| out_elempack = num_output % packn == 0 ? packn : 1; | |||
| } | |||
| #endif | |||
| size_t out_elemsize = elemsize / elempack * out_elempack; | |||
| top_blob.create(num_output, elemsize, opt.blob_allocator); | |||
| top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| const float* weight_data_ptr = weight_data; | |||
| #if __riscv_vector | |||
| int nn_num_output = num_output / packn; | |||
| int remain_num_output_start = nn_num_output * packn; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int pp = 0; pp < nn_num_output; pp++) | |||
| if (out_elempack == packn) | |||
| { | |||
| int p = pp * packn; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < num_output / out_elempack; p++) | |||
| { | |||
| const word_type vl = vsetvl_e32m1(packn); | |||
| vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); | |||
| const word_type vl = vsetvl_e32m1(packn); | |||
| vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); | |||
| if (bias_term) | |||
| { | |||
| _sum = vle32_v_f32m1((const float*)bias_data + p * packn, vl); | |||
| } | |||
| if (bias_term) | |||
| { | |||
| _sum = vle32_v_f32m1((const float*)bias_data + p, vl); | |||
| const float* kptr = weight_data_tm.row(p); | |||
| const float* sptr = bottom_blob_flattened; | |||
| int n = num_input; | |||
| while (n > 0) | |||
| { | |||
| vfloat32m1_t _w = vle32_v_f32m1(kptr, vl); | |||
| _sum = vfmacc_vf_f32m1(_sum, *sptr, _w, vl); | |||
| sptr += 1; | |||
| kptr += packn; | |||
| n -= 1; | |||
| } | |||
| _sum = activation_ps(_sum, activation_type, activation_params, vl); | |||
| float* outptr = top_blob; | |||
| vse32_v_f32m1(outptr + p * packn, _sum, vl); | |||
| } | |||
| } | |||
| #endif // __riscv_vector | |||
| const float* w = weight_data_ptr + num_input * p; | |||
| if (out_elempack == 1) | |||
| { | |||
| #if __riscv_vector | |||
| int nn_num_output = num_output / packn; | |||
| int remain_num_output_start = nn_num_output * packn; | |||
| // channels | |||
| for (int q = 0; q < channels; q++) | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int pp = 0; pp < nn_num_output; pp++) | |||
| { | |||
| const float* m = bottom_blob.channel(q); | |||
| int p = pp * packn; | |||
| int n = size; | |||
| const word_type vl = vsetvl_e32m1(packn); | |||
| vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); | |||
| if (bias_term) | |||
| { | |||
| _sum = vle32_v_f32m1((const float*)bias_data + p, vl); | |||
| } | |||
| const float* w = (const float*)weight_data_tm + num_input * p; | |||
| const float* m = bottom_blob_flattened; | |||
| int n = num_input; | |||
| while (n > 0) | |||
| { | |||
| vfloat32m1_t _w = vlse32_v_f32m1(w, num_input * sizeof(float), vl); | |||
| @@ -281,45 +441,41 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt | |||
| w += 1; | |||
| n -= 1; | |||
| } | |||
| } | |||
| _sum = activation_ps(_sum, activation_type, activation_params, vl); | |||
| _sum = activation_ps(_sum, activation_type, activation_params, vl); | |||
| vse32_v_f32m1((float*)top_blob + p, _sum, vl); | |||
| } | |||
| vse32_v_f32m1((float*)top_blob + p, _sum, vl); | |||
| } | |||
| #else // __riscv_vector | |||
| int nn_num_output = num_output / 4; | |||
| int remain_num_output_start = nn_num_output * 4; | |||
| int nn_num_output = num_output / 4; | |||
| int remain_num_output_start = nn_num_output * 4; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int pp = 0; pp < nn_num_output; pp++) | |||
| { | |||
| int p = pp * 4; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int pp = 0; pp < nn_num_output; pp++) | |||
| { | |||
| int p = pp * 4; | |||
| float sum0 = 0.f; | |||
| float sum1 = 0.f; | |||
| float sum2 = 0.f; | |||
| float sum3 = 0.f; | |||
| float sum0 = 0.f; | |||
| float sum1 = 0.f; | |||
| float sum2 = 0.f; | |||
| float sum3 = 0.f; | |||
| if (bias_term) | |||
| { | |||
| sum0 = bias_data[p]; | |||
| sum1 = bias_data[p + 1]; | |||
| sum2 = bias_data[p + 2]; | |||
| sum3 = bias_data[p + 3]; | |||
| } | |||
| if (bias_term) | |||
| { | |||
| sum0 = bias_data[p]; | |||
| sum1 = bias_data[p + 1]; | |||
| sum2 = bias_data[p + 2]; | |||
| sum3 = bias_data[p + 3]; | |||
| } | |||
| const float* w0 = weight_data_ptr + num_input * p; | |||
| const float* w1 = weight_data_ptr + num_input * (p + 1); | |||
| const float* w2 = weight_data_ptr + num_input * (p + 2); | |||
| const float* w3 = weight_data_ptr + num_input * (p + 3); | |||
| const float* w0 = (const float*)weight_data_tm + num_input * p; | |||
| const float* w1 = (const float*)weight_data_tm + num_input * (p + 1); | |||
| const float* w2 = (const float*)weight_data_tm + num_input * (p + 2); | |||
| const float* w3 = (const float*)weight_data_tm + num_input * (p + 3); | |||
| // channels | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const float* m = bottom_blob.channel(q); | |||
| const float* m = bottom_blob_flattened; | |||
| for (int i = 0; i < size; i++) | |||
| for (int i = 0; i < num_input; i++) | |||
| { | |||
| sum0 += *m * *w0; | |||
| sum1 += *m * *w1; | |||
| @@ -332,48 +488,43 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt | |||
| w2++; | |||
| w3++; | |||
| } | |||
| } | |||
| sum0 = activation_ss(sum0, activation_type, activation_params); | |||
| sum1 = activation_ss(sum1, activation_type, activation_params); | |||
| sum2 = activation_ss(sum2, activation_type, activation_params); | |||
| sum3 = activation_ss(sum3, activation_type, activation_params); | |||
| sum0 = activation_ss(sum0, activation_type, activation_params); | |||
| sum1 = activation_ss(sum1, activation_type, activation_params); | |||
| sum2 = activation_ss(sum2, activation_type, activation_params); | |||
| sum3 = activation_ss(sum3, activation_type, activation_params); | |||
| top_blob[p] = sum0; | |||
| top_blob[p + 1] = sum1; | |||
| top_blob[p + 2] = sum2; | |||
| top_blob[p + 3] = sum3; | |||
| } | |||
| top_blob[p] = sum0; | |||
| top_blob[p + 1] = sum1; | |||
| top_blob[p + 2] = sum2; | |||
| top_blob[p + 3] = sum3; | |||
| } | |||
| #endif // __riscv_vector | |||
| // num_output | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = remain_num_output_start; p < num_output; p++) | |||
| { | |||
| float sum = 0.f; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = remain_num_output_start; p < num_output; p++) | |||
| { | |||
| float sum = 0.f; | |||
| if (bias_term) | |||
| sum = bias_data[p]; | |||
| if (bias_term) | |||
| sum = bias_data[p]; | |||
| const float* w = weight_data_ptr + num_input * p; | |||
| const float* w = (const float*)weight_data_tm + num_input * p; | |||
| // channels | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const float* m = bottom_blob.channel(q); | |||
| const float* m = bottom_blob_flattened; | |||
| for (int i = 0; i < size; i++) | |||
| for (int i = 0; i < num_input; i++) | |||
| { | |||
| sum += *m * *w; | |||
| m++; | |||
| w++; | |||
| } | |||
| } | |||
| sum = activation_ss(sum, activation_type, activation_params); | |||
| sum = activation_ss(sum, activation_type, activation_params); | |||
| top_blob[p] = sum; | |||
| top_blob[p] = sum; | |||
| } | |||
| } | |||
| return 0; | |||
| @@ -398,11 +549,11 @@ int InnerProduct_riscv::create_pipeline_fp16s(const Option& opt) | |||
| { | |||
| Mat weight_data_r2 = weight_data.reshape(num_input, num_output); | |||
| weight_data_fp16.create(num_input, num_output / out_elempack, (size_t)2u * out_elempack, out_elempack); | |||
| weight_data_tm.create(num_input, num_output / out_elempack, (size_t)2u * out_elempack, out_elempack); | |||
| for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) | |||
| { | |||
| __fp16* g0 = weight_data_fp16.row<__fp16>(q / out_elempack); | |||
| __fp16* g0 = weight_data_tm.row<__fp16>(q / out_elempack); | |||
| for (int p = 0; p < num_input; p++) | |||
| { | |||
| @@ -416,6 +567,11 @@ int InnerProduct_riscv::create_pipeline_fp16s(const Option& opt) | |||
| ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); | |||
| if (opt.lightmode) | |||
| { | |||
| weight_data.release(); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -451,7 +607,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con | |||
| { | |||
| for (int l = 0; l < packn; l++) | |||
| { | |||
| const __fp16* kptr = (const __fp16*)weight_data_fp16 + num_input * p * packn + l; | |||
| const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn + l; | |||
| const __fp16* m = bottom_blob.row<const __fp16>(j); | |||
| vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); | |||
| @@ -489,7 +645,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con | |||
| for (int p = 0; p < num_output / num_output_elempack; p++) | |||
| { | |||
| const __fp16* kptr = (const __fp16*)weight_data_fp16 + num_input * p * packn; | |||
| const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn; | |||
| const __fp16* m = bottom_blob.row<const __fp16>(j); | |||
| vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); | |||
| @@ -526,7 +682,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con | |||
| for (int p = 0; p < num_output; p++) | |||
| { | |||
| const __fp16* kptr = (const __fp16*)weight_data_fp16 + num_input * p; | |||
| const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; | |||
| const __fp16* m = bottom_blob.row<const __fp16>(j); | |||
| vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); | |||
| @@ -561,7 +717,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con | |||
| for (int p = 0; p < num_output; p++) | |||
| { | |||
| const __fp16* kptr = (const __fp16*)weight_data_fp16 + num_input * p; | |||
| const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; | |||
| const __fp16* m = bottom_blob.row<const __fp16>(j); | |||
| float sum = 0.f; | |||
| @@ -621,7 +777,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con | |||
| _sum = vle32_v_f32m2((const float*)bias_data + p * packn, vl); | |||
| } | |||
| const __fp16* kptr = weight_data_fp16.row<const __fp16>(p); | |||
| const __fp16* kptr = weight_data_tm.row<const __fp16>(p); | |||
| const __fp16* sptr = bottom_blob_flattened; | |||
| @@ -655,7 +811,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con | |||
| if (bias_term) | |||
| sum = bias_data[p]; | |||
| const __fp16* kptr = weight_data_fp16.row<__fp16>(p); | |||
| const __fp16* kptr = weight_data_tm.row<__fp16>(p); | |||
| const __fp16* sptr = bottom_blob_flattened; | |||
| @@ -713,7 +869,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co | |||
| { | |||
| for (int l = 0; l < packn; l++) | |||
| { | |||
| const __fp16* kptr = (const __fp16*)weight_data_fp16 + num_input * p * packn + l; | |||
| const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn + l; | |||
| const __fp16* m = bottom_blob.row<const __fp16>(j); | |||
| vfloat16m1_t _sum = vfmv_v_f_f16m1((__fp16)0.f, vl); | |||
| @@ -751,7 +907,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co | |||
| for (int p = 0; p < num_output / num_output_elempack; p++) | |||
| { | |||
| const __fp16* kptr = (const __fp16*)weight_data_fp16 + num_input * p * packn; | |||
| const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn; | |||
| const __fp16* m = bottom_blob.row<const __fp16>(j); | |||
| vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); | |||
| @@ -788,7 +944,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co | |||
| for (int p = 0; p < num_output; p++) | |||
| { | |||
| const __fp16* kptr = (const __fp16*)weight_data_fp16 + num_input * p; | |||
| const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; | |||
| const __fp16* m = bottom_blob.row<const __fp16>(j); | |||
| vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); | |||
| @@ -823,7 +979,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co | |||
| for (int p = 0; p < num_output; p++) | |||
| { | |||
| const __fp16* kptr = (const __fp16*)weight_data_fp16 + num_input * p; | |||
| const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; | |||
| const __fp16* m = bottom_blob.row<const __fp16>(j); | |||
| float sum = 0.f; | |||
| @@ -883,7 +1039,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co | |||
| _sum = vle16_v_f16m1((const __fp16*)bias_data_fp16 + p * packn, vl); | |||
| } | |||
| const __fp16* kptr = weight_data_fp16.row<const __fp16>(p); | |||
| const __fp16* kptr = weight_data_tm.row<const __fp16>(p); | |||
| const __fp16* sptr = bottom_blob_flattened; | |||
| @@ -917,7 +1073,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co | |||
| if (bias_term) | |||
| sum = bias_data[p]; | |||
| const __fp16* kptr = weight_data_fp16.row<__fp16>(p); | |||
| const __fp16* kptr = weight_data_tm.row<__fp16>(p); | |||
| const __fp16* sptr = bottom_blob_flattened; | |||
| @@ -39,8 +39,9 @@ protected: | |||
| public: | |||
| Layer* flatten; | |||
| Mat weight_data_tm; | |||
| // fp16 | |||
| Mat weight_data_fp16; | |||
| Mat bias_data_fp16; | |||
| }; | |||
| @@ -96,8 +96,6 @@ int ConvolutionDepthWise_x86::create_pipeline(const Option& opt) | |||
| { | |||
| Mat weight_data_r2 = weight_data.reshape(maxk, group); | |||
| convert_packing(weight_data_r2, weight_data_tm, 16, opt); | |||
| return 0; | |||
| } | |||
| #endif // __AVX512F__ | |||
| @@ -106,8 +104,6 @@ int ConvolutionDepthWise_x86::create_pipeline(const Option& opt) | |||
| { | |||
| Mat weight_data_r2 = weight_data.reshape(maxk, group); | |||
| convert_packing(weight_data_r2, weight_data_tm, 8, opt); | |||
| return 0; | |||
| } | |||
| #endif // __AVX__ | |||
| @@ -116,8 +112,6 @@ int ConvolutionDepthWise_x86::create_pipeline(const Option& opt) | |||
| { | |||
| Mat weight_data_r2 = weight_data.reshape(maxk, group); | |||
| convert_packing(weight_data_r2, weight_data_tm, 4, opt); | |||
| return 0; | |||
| } | |||
| #endif // __SSE2__ | |||
| @@ -127,14 +121,23 @@ int ConvolutionDepthWise_x86::create_pipeline(const Option& opt) | |||
| if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| weight_data_tm = weight_data; | |||
| return 0; | |||
| } | |||
| if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| weight_data_tm = weight_data; | |||
| return 0; | |||
| } | |||
| else | |||
| { | |||
| create_group_ops(opt); | |||
| } | |||
| } | |||
| if (opt.lightmode) | |||
| { | |||
| weight_data.release(); | |||
| } | |||
| return 0; | |||
| } | |||
| // group convolution | |||