| @@ -81,6 +81,7 @@ option(NCNN_COVERAGE "build for coverage" OFF) | |||
| option(NCNN_BUILD_BENCHMARK "build benchmark" ON) | |||
| option(NCNN_PYTHON "build python api" OFF) | |||
| option(NCNN_INT8 "int8 inference" ON) | |||
| option(NCNN_BF16 "bf16 inference" ON) | |||
| if(ANDROID OR IOS OR NCNN_SIMPLESTL OR CMAKE_CROSSCOMPILING) | |||
| option(NCNN_DISABLE_RTTI "disable rtti" ON) | |||
| @@ -29,7 +29,9 @@ BatchNorm_arm::BatchNorm_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| int BatchNorm_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| @@ -46,8 +48,10 @@ int BatchNorm_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_inplace_bf16s(bottom_top_blob, opt); | |||
| #endif | |||
| int dims = bottom_top_blob.dims; | |||
| int elempack = bottom_top_blob.elempack; | |||
| @@ -660,6 +664,7 @@ int BatchNorm_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op | |||
| } | |||
| #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_BF16 | |||
| int BatchNorm_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| int dims = bottom_top_blob.dims; | |||
| @@ -829,5 +834,6 @@ int BatchNorm_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt | |||
| return 0; | |||
| } | |||
| #endif // NCNN_BF16 | |||
| } // namespace ncnn | |||
| @@ -31,7 +31,9 @@ protected: | |||
| int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; | |||
| int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; | |||
| #endif | |||
| #if NCNN_BF16 | |||
| int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; | |||
| #endif | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -33,7 +33,9 @@ BinaryOp_arm::BinaryOp_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| #if __ARM_NEON | |||
| @@ -812,8 +814,10 @@ int BinaryOp_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat> | |||
| return forward_fp16s(bottom_blobs, top_blobs, opt); | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_bf16s(bottom_blobs, top_blobs, opt); | |||
| #endif | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| const Mat& bottom_blob1 = bottom_blobs[1]; | |||
| @@ -866,8 +870,10 @@ int BinaryOp_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| return forward_inplace_fp16s(bottom_top_blob, opt); | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_inplace_bf16s(bottom_top_blob, opt); | |||
| #endif | |||
| #if __ARM_NEON | |||
| int elempack = bottom_top_blob.elempack; | |||
| @@ -3258,6 +3264,7 @@ int BinaryOp_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) | |||
| } | |||
| #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_BF16 | |||
| #if __ARM_NEON | |||
| template<typename Op> | |||
| static int binary_op_pack4_bf16s(const Mat& a, const Mat& b, Mat& c, const Option& opt) | |||
| @@ -4727,5 +4734,6 @@ int BinaryOp_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) | |||
| return 0; | |||
| } | |||
| #endif // NCNN_BF16 | |||
| } // namespace ncnn | |||
| @@ -33,8 +33,10 @@ protected: | |||
| int forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; | |||
| int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; | |||
| #endif | |||
| #if NCNN_BF16 | |||
| int forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; | |||
| int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; | |||
| #endif | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -29,7 +29,9 @@ Clip_arm::Clip_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| int Clip_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| @@ -41,8 +43,10 @@ int Clip_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| return forward_inplace_fp16s(bottom_top_blob, opt); | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_inplace_bf16s(bottom_top_blob, opt); | |||
| #endif | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| @@ -259,6 +263,7 @@ int Clip_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) con | |||
| } | |||
| #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_BF16 | |||
| int Clip_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_top_blob.w; | |||
| @@ -334,5 +339,6 @@ int Clip_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) con | |||
| return 0; | |||
| } | |||
| #endif // NCNN_BF16 | |||
| } // namespace ncnn | |||
| @@ -30,7 +30,9 @@ protected: | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; | |||
| #endif | |||
| #if NCNN_BF16 | |||
| int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; | |||
| #endif | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -25,7 +25,9 @@ Concat_arm::Concat_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| int Concat_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const | |||
| @@ -37,8 +39,10 @@ int Concat_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& | |||
| return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt); | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt); | |||
| #endif | |||
| int dims = bottom_blobs[0].dims; | |||
| int positive_axis = axis < 0 ? dims + axis : axis; | |||
| @@ -27,17 +27,20 @@ | |||
| namespace ncnn { | |||
| #include "convolution_bf16s.h" | |||
| #include "convolution_sgemm.h" | |||
| #include "convolution_1x1.h" | |||
| #include "convolution_1x1_bf16s.h" | |||
| #include "convolution_2x2.h" | |||
| #include "convolution_3x3.h" | |||
| #include "convolution_4x4.h" | |||
| #include "convolution_5x5.h" | |||
| #include "convolution_7x7.h" | |||
| #if NCNN_BF16 | |||
| #include "convolution_bf16s.h" | |||
| #include "convolution_1x1_bf16s.h" | |||
| #endif // NCNN_BF16 | |||
| #if NCNN_INT8 | |||
| #include "convolution_sgemm_int8.h" | |||
| #include "convolution_1x1_int8.h" | |||
| @@ -47,27 +50,30 @@ namespace ncnn { | |||
| #if __ARM_NEON | |||
| #include "convolution_pack4.h" | |||
| #include "convolution_pack4_bf16s.h" | |||
| #include "convolution_pack1to4.h" | |||
| #include "convolution_pack1to4_bf16s.h" | |||
| #include "convolution_pack4to1.h" | |||
| #include "convolution_pack4to1_bf16s.h" | |||
| #include "convolution_sgemm_pack4.h" | |||
| #include "convolution_sgemm_pack4_bf16s.h" | |||
| #include "convolution_1x1_pack4.h" | |||
| #include "convolution_1x1_pack4_bf16s.h" | |||
| #include "convolution_1x1_pack4to1.h" | |||
| #include "convolution_1x1_pack4to1_bf16s.h" | |||
| #include "convolution_3x3_pack1to4.h" | |||
| #include "convolution_3x3_pack1to4_bf16s.h" | |||
| #include "convolution_3x3_pack4.h" | |||
| #include "convolution_3x3_pack4_bf16s.h" | |||
| #include "convolution_3x3_pack4to1.h" | |||
| #include "convolution_3x3_pack4to1_bf16s.h" | |||
| #include "convolution_5x5_pack4.h" | |||
| #include "convolution_5x5_pack4_bf16s.h" | |||
| #include "convolution_7x7_pack1to4.h" | |||
| #if NCNN_BF16 | |||
| #include "convolution_pack4_bf16s.h" | |||
| #include "convolution_pack1to4_bf16s.h" | |||
| #include "convolution_pack4to1_bf16s.h" | |||
| #include "convolution_sgemm_pack4_bf16s.h" | |||
| #include "convolution_1x1_pack4_bf16s.h" | |||
| #include "convolution_1x1_pack4to1_bf16s.h" | |||
| #include "convolution_3x3_pack1to4_bf16s.h" | |||
| #include "convolution_3x3_pack4_bf16s.h" | |||
| #include "convolution_3x3_pack4to1_bf16s.h" | |||
| #include "convolution_5x5_pack4_bf16s.h" | |||
| #include "convolution_7x7_pack1to4_bf16s.h" | |||
| #endif // NCNN_BF16 | |||
| #if NCNN_INT8 | |||
| #include "convolution_pack8to4_int8.h" | |||
| @@ -122,7 +128,9 @@ Convolution_arm::Convolution_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| activation = 0; | |||
| convolution_dilation1 = 0; | |||
| @@ -188,10 +196,12 @@ int Convolution_arm::create_pipeline(const Option& opt) | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage) | |||
| { | |||
| return create_pipeline_bf16s(opt); | |||
| } | |||
| #endif | |||
| if ((!support_packing || !opt.use_packing_layout) && !opt.use_bf16_storage && kernel_w == kernel_h && dilation_w != 1 && dilation_h == dilation_w && stride_w == 1 && stride_h == 1) | |||
| { | |||
| @@ -449,8 +459,10 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_bf16s(bottom_blob, top_blob, opt); | |||
| #endif | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| @@ -1538,6 +1550,7 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const | |||
| } | |||
| #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_BF16 | |||
| int Convolution_arm::create_pipeline_bf16s(const Option& opt) | |||
| { | |||
| const int maxk = kernel_w * kernel_h; | |||
| @@ -1812,6 +1825,7 @@ int Convolution_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const | |||
| return 0; | |||
| } | |||
| #endif // NCNN_BF16 | |||
| #if NCNN_INT8 | |||
| int Convolution_arm::create_pipeline_int8_arm(const Option& opt) | |||
| @@ -35,8 +35,10 @@ protected: | |||
| int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| #endif | |||
| #if NCNN_BF16 | |||
| int create_pipeline_bf16s(const Option& opt); | |||
| int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| #endif | |||
| #if NCNN_INT8 | |||
| int create_pipeline_int8_arm(const Option& opt); | |||
| int forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| @@ -66,8 +68,10 @@ public: | |||
| Mat weight_data_fp16; | |||
| Mat bias_data_fp16; | |||
| #if NCNN_BF16 | |||
| // bf16 | |||
| Mat weight_data_bf16; | |||
| #endif | |||
| #if NCNN_INT8 | |||
| // int8 | |||
| @@ -34,9 +34,12 @@ namespace ncnn { | |||
| #if __ARM_NEON | |||
| #include "convolutiondepthwise_3x3_pack4.h" | |||
| #include "convolutiondepthwise_3x3_pack4_bf16s.h" | |||
| #include "convolutiondepthwise_5x5_pack4.h" | |||
| #if NCNN_BF16 | |||
| #include "convolutiondepthwise_3x3_pack4_bf16s.h" | |||
| #include "convolutiondepthwise_5x5_pack4_bf16s.h" | |||
| #endif // NCNN_BF16 | |||
| #if NCNN_INT8 | |||
| #include "convolutiondepthwise_3x3_pack8_int8.h" | |||
| @@ -58,7 +61,9 @@ ConvolutionDepthWise_arm::ConvolutionDepthWise_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| activation = 0; | |||
| } | |||
| @@ -167,6 +172,7 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt) | |||
| } | |||
| #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage) | |||
| { | |||
| #if __ARM_NEON | |||
| @@ -186,6 +192,7 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt) | |||
| return 0; | |||
| } | |||
| #endif // NCNN_BF16 | |||
| #if __ARM_NEON | |||
| // pack4 | |||
| @@ -362,8 +369,10 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_bf16s(bottom_blob, top_blob, opt); | |||
| #endif | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| @@ -1158,6 +1167,7 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl | |||
| } | |||
| #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_BF16 | |||
| int ConvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_blob.w; | |||
| @@ -1456,6 +1466,7 @@ int ConvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blo | |||
| return 0; | |||
| } | |||
| #endif // NCNN_BF16 | |||
| #if NCNN_INT8 | |||
| int ConvolutionDepthWise_arm::create_pipeline_int8_arm(const Option& opt) | |||
| @@ -35,7 +35,9 @@ protected: | |||
| int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| #endif | |||
| #if NCNN_BF16 | |||
| int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| #endif | |||
| #if NCNN_INT8 | |||
| int create_pipeline_int8_arm(const Option& opt); | |||
| int forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| @@ -52,9 +54,11 @@ public: | |||
| Mat weight_data_fp16; | |||
| Mat bias_data_fp16; | |||
| #if NCNN_BF16 | |||
| // bf16 | |||
| Mat weight_data_bf16; | |||
| Mat weight_data_pack4_bf16; | |||
| #endif | |||
| #if NCNN_INT8 | |||
| // int8 | |||
| @@ -29,7 +29,9 @@ Crop_arm::Crop_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| #if __ARM_NEON | |||
| @@ -40,7 +40,9 @@ Deconvolution_arm::Deconvolution_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| activation = 0; | |||
| } | |||
| @@ -91,10 +93,12 @@ int Deconvolution_arm::create_pipeline(const Option& opt) | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage) | |||
| { | |||
| return create_pipeline_bf16s(opt); | |||
| } | |||
| #endif | |||
| const int maxk = kernel_w * kernel_h; | |||
| int num_input = weight_data_size / maxk / num_output; | |||
| @@ -308,8 +312,10 @@ int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Opti | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_bf16s(bottom_blob, top_blob, opt); | |||
| #endif | |||
| // deconvolv with NxN kernel | |||
| // value = value + bias | |||
| @@ -1899,6 +1905,7 @@ int Deconvolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con | |||
| } | |||
| #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_BF16 | |||
| int Deconvolution_arm::create_pipeline_bf16s(const Option& opt) | |||
| { | |||
| const int maxk = kernel_w * kernel_h; | |||
| @@ -2337,5 +2344,6 @@ int Deconvolution_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, cons | |||
| return 0; | |||
| } | |||
| #endif // NCNN_BF16 | |||
| } // namespace ncnn | |||
| @@ -35,8 +35,10 @@ protected: | |||
| int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| #endif | |||
| #if NCNN_BF16 | |||
| int create_pipeline_bf16s(const Option& opt); | |||
| int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| #endif | |||
| public: | |||
| Layer* activation; | |||
| @@ -51,8 +53,10 @@ public: | |||
| Mat weight_data_fp16; | |||
| Mat bias_data_fp16; | |||
| #if NCNN_BF16 | |||
| // bf16 | |||
| Mat weight_data_bf16; | |||
| #endif | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -33,7 +33,9 @@ DeconvolutionDepthWise_arm::DeconvolutionDepthWise_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt) | |||
| @@ -101,6 +103,7 @@ int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt) | |||
| } | |||
| #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage) | |||
| { | |||
| #if __ARM_NEON | |||
| @@ -120,6 +123,7 @@ int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt) | |||
| return 0; | |||
| } | |||
| #endif // NCNN_BF16 | |||
| #if __ARM_NEON | |||
| // pack4 | |||
| @@ -228,8 +232,10 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, c | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_bf16s(bottom_blob, top_blob, opt); | |||
| #endif | |||
| // convolv with NxN kernel | |||
| // value = value + bias | |||
| @@ -986,6 +992,7 @@ int DeconvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_ | |||
| } | |||
| #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_BF16 | |||
| int DeconvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_blob.w; | |||
| @@ -1224,5 +1231,6 @@ int DeconvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_b | |||
| return 0; | |||
| } | |||
| #endif // NCNN_BF16 | |||
| } // namespace ncnn | |||
| @@ -34,7 +34,9 @@ protected: | |||
| int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| #endif | |||
| #if NCNN_BF16 | |||
| int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| #endif | |||
| public: | |||
| std::vector<ncnn::Layer*> group_ops; | |||
| @@ -47,8 +49,10 @@ public: | |||
| Mat weight_data_fp16; | |||
| Mat bias_data_fp16; | |||
| #if NCNN_BF16 | |||
| // bf16 | |||
| Mat weight_data_bf16; | |||
| #endif | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -30,7 +30,9 @@ Dequantize_arm::Dequantize_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| int Dequantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| @@ -47,8 +49,10 @@ int Dequantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage) | |||
| return forward_bf16s(bottom_blob, top_blob, opt); | |||
| #endif | |||
| int dims = bottom_blob.dims; | |||
| int elempack = bottom_blob.elempack; | |||
| @@ -2285,6 +2289,7 @@ int Dequantize_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const | |||
| } | |||
| #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_BF16 | |||
| int Dequantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| int dims = bottom_blob.dims; | |||
| @@ -3038,5 +3043,6 @@ int Dequantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const O | |||
| return 0; | |||
| } | |||
| #endif // NCNN_BF16 | |||
| } // namespace ncnn | |||
| @@ -31,7 +31,9 @@ protected: | |||
| int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| #endif | |||
| #if NCNN_BF16 | |||
| int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| #endif | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -29,7 +29,9 @@ Eltwise_arm::Eltwise_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const | |||
| @@ -46,8 +48,10 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_bf16s(bottom_blobs, top_blobs, opt); | |||
| #endif | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| int w = bottom_blob.w; | |||
| @@ -2213,6 +2217,7 @@ int Eltwise_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vecto | |||
| } | |||
| #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_BF16 | |||
| int Eltwise_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const | |||
| { | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| @@ -2980,5 +2985,6 @@ int Eltwise_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector | |||
| return 0; | |||
| } | |||
| #endif // NCNN_BF16 | |||
| } // namespace ncnn | |||
| @@ -31,7 +31,9 @@ protected: | |||
| int forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; | |||
| int forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; | |||
| #endif | |||
| #if NCNN_BF16 | |||
| int forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; | |||
| #endif | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -29,7 +29,9 @@ Flatten_arm::Flatten_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif // NCNN_BF16 | |||
| } | |||
| int Flatten_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| @@ -44,8 +46,10 @@ int Flatten_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op | |||
| return forward_bf16s_fp16s(bottom_blob, top_blob, opt); | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_bf16s_fp16s(bottom_blob, top_blob, opt); | |||
| #endif | |||
| int dims = bottom_blob.dims; | |||
| @@ -32,7 +32,9 @@ GRU_arm::GRU_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| int GRU_arm::create_pipeline(const Option& opt) | |||
| @@ -44,10 +46,12 @@ int GRU_arm::create_pipeline(const Option& opt) | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage) | |||
| { | |||
| return create_pipeline_bf16s(opt); | |||
| } | |||
| #endif | |||
| // pack RUN | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| @@ -627,8 +631,10 @@ int GRU_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_bf16s(bottom_blob, top_blob, opt); | |||
| #endif | |||
| int T = bottom_blob.h; | |||
| @@ -708,8 +714,10 @@ int GRU_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_bf16s(bottom_blobs, top_blobs, opt); | |||
| #endif | |||
| int T = bottom_blob.h; | |||
| Mat& top_blob = top_blobs[0]; | |||
| @@ -1727,6 +1735,7 @@ int GRU_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Ma | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| static int gru_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt) | |||
| { | |||
| int size = bottom_blob.w; | |||
| @@ -2378,5 +2387,6 @@ int GRU_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat | |||
| return 0; | |||
| } | |||
| #endif // NCNN_BF16 | |||
| } // namespace ncnn | |||
| @@ -36,9 +36,11 @@ protected: | |||
| int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| int forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; | |||
| #endif | |||
| #if NCNN_BF16 | |||
| int create_pipeline_bf16s(const Option& opt); | |||
| int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| int forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; | |||
| #endif | |||
| public: | |||
| Mat weight_xc_data_packed; | |||
| @@ -29,7 +29,9 @@ HardSigmoid_arm::HardSigmoid_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| int HardSigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| @@ -46,8 +48,10 @@ int HardSigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) co | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_inplace_bf16s(bottom_top_blob, opt); | |||
| #endif | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| @@ -326,6 +330,7 @@ int HardSigmoid_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& | |||
| } | |||
| #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_BF16 | |||
| int HardSigmoid_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_top_blob.w; | |||
| @@ -404,5 +409,6 @@ int HardSigmoid_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& o | |||
| return 0; | |||
| } | |||
| #endif // NCNN_BF16 | |||
| } // namespace ncnn | |||
| @@ -31,7 +31,9 @@ protected: | |||
| int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; | |||
| int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; | |||
| #endif | |||
| #if NCNN_BF16 | |||
| int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; | |||
| #endif | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -29,7 +29,9 @@ HardSwish_arm::HardSwish_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| int HardSwish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| @@ -46,8 +48,10 @@ int HardSwish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_inplace_bf16s(bottom_top_blob, opt); | |||
| #endif | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| @@ -333,6 +337,7 @@ int HardSwish_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op | |||
| } | |||
| #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_BF16 | |||
| int HardSwish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_top_blob.w; | |||
| @@ -413,5 +418,6 @@ int HardSwish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt | |||
| return 0; | |||
| } | |||
| #endif // NCNN_BF16 | |||
| } // namespace ncnn | |||
| @@ -31,7 +31,9 @@ protected: | |||
| int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; | |||
| int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; | |||
| #endif | |||
| #if NCNN_BF16 | |||
| int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; | |||
| #endif | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -34,7 +34,9 @@ InnerProduct_arm::InnerProduct_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| flatten = 0; | |||
| activation = 0; | |||
| @@ -69,10 +71,12 @@ int InnerProduct_arm::create_pipeline(const Option& opt) | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage) | |||
| { | |||
| return create_pipeline_bf16s(opt); | |||
| } | |||
| #endif | |||
| return 0; | |||
| } | |||
| @@ -117,8 +121,10 @@ int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Optio | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_bf16s(bottom_blob, top_blob, opt); | |||
| #endif | |||
| const int num_input = weight_data_size / num_output; | |||
| @@ -1535,6 +1541,7 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons | |||
| } | |||
| #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_BF16 | |||
| int InnerProduct_arm::create_pipeline_bf16s(const Option& opt) | |||
| { | |||
| const int num_input = weight_data_size / num_output; | |||
| @@ -1895,6 +1902,7 @@ int InnerProduct_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const | |||
| return 0; | |||
| } | |||
| #endif // NCNN_BF16 | |||
| #if NCNN_INT8 | |||
| int InnerProduct_arm::create_pipeline_int8_arm(const Option& opt) | |||
| @@ -37,8 +37,10 @@ protected: | |||
| int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| #endif | |||
| #if NCNN_BF16 | |||
| int create_pipeline_bf16s(const Option& opt); | |||
| int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| #endif | |||
| #if NCNN_INT8 | |||
| int create_pipeline_int8_arm(const Option& opt); | |||
| int forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| @@ -52,8 +54,10 @@ public: | |||
| Mat weight_data_fp16; | |||
| Mat bias_data_fp16; | |||
| #if NCNN_BF16 | |||
| // bf16 | |||
| Mat weight_data_bf16; | |||
| #endif | |||
| #if NCNN_INT8 | |||
| // int8 | |||
| @@ -29,7 +29,9 @@ InstanceNorm_arm::InstanceNorm_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| int InstanceNorm_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| @@ -41,8 +43,10 @@ int InstanceNorm_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) c | |||
| return forward_inplace_fp16s(bottom_top_blob, opt); | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_inplace_bf16s(bottom_top_blob, opt); | |||
| #endif | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| @@ -473,6 +477,7 @@ int InstanceNorm_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& | |||
| } | |||
| #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_BF16 | |||
| int InstanceNorm_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_top_blob.w; | |||
| @@ -646,5 +651,6 @@ int InstanceNorm_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& | |||
| return 0; | |||
| } | |||
| #endif // NCNN_BF16 | |||
| } // namespace ncnn | |||
| @@ -30,7 +30,9 @@ protected: | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; | |||
| #endif | |||
| #if NCNN_BF16 | |||
| int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; | |||
| #endif | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -23,15 +23,20 @@ | |||
| namespace ncnn { | |||
| #include "interp_bicubic.h" | |||
| #include "interp_bicubic_bf16s.h" | |||
| #include "interp_bilinear.h" | |||
| #if NCNN_BF16 | |||
| #include "interp_bicubic_bf16s.h" | |||
| #include "interp_bilinear_bf16s.h" | |||
| #endif | |||
| #if __ARM_NEON | |||
| #include "interp_bicubic_pack4.h" | |||
| #include "interp_bicubic_pack4_bf16s.h" | |||
| #include "interp_bilinear_pack4.h" | |||
| #if NCNN_BF16 | |||
| #include "interp_bicubic_pack4_bf16s.h" | |||
| #include "interp_bilinear_pack4_bf16s.h" | |||
| #endif | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #include "interp_bicubic_fp16s.h" | |||
| #include "interp_bicubic_pack4_fp16s.h" | |||
| @@ -51,7 +56,9 @@ Interp_arm::Interp_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| int Interp_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const | |||
| @@ -72,8 +79,10 @@ int Interp_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_bf16s(bottom_blobs, top_blobs, opt); | |||
| #endif | |||
| int h = bottom_blob.h; | |||
| int w = bottom_blob.w; | |||
| @@ -830,6 +839,7 @@ int Interp_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector | |||
| } | |||
| #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_BF16 | |||
| int Interp_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const | |||
| { | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| @@ -1052,5 +1062,6 @@ int Interp_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector< | |||
| return 0; | |||
| } | |||
| #endif // NCNN_BF16 | |||
| } // namespace ncnn | |||
| @@ -31,7 +31,9 @@ protected: | |||
| int forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; | |||
| int forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; | |||
| #endif | |||
| #if NCNN_BF16 | |||
| int forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; | |||
| #endif | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -32,7 +32,9 @@ LSTM_arm::LSTM_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| int LSTM_arm::create_pipeline(const Option& opt) | |||
| @@ -44,10 +46,12 @@ int LSTM_arm::create_pipeline(const Option& opt) | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage) | |||
| { | |||
| return create_pipeline_bf16s(opt); | |||
| } | |||
| #endif | |||
| // pack IFOG | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| @@ -349,8 +353,10 @@ int LSTM_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_bf16s(bottom_blob, top_blob, opt); | |||
| #endif | |||
| int T = bottom_blob.h; | |||
| @@ -436,8 +442,10 @@ int LSTM_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_bf16s(bottom_blobs, top_blobs, opt); | |||
| #endif | |||
| int T = bottom_blob.h; | |||
| Mat& top_blob = top_blobs[0]; | |||
| @@ -1296,6 +1304,7 @@ int LSTM_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<M | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| static int lstm_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, Mat& cell_state, const Option& opt) | |||
| { | |||
| int size = bottom_blob.w; | |||
| @@ -1680,5 +1689,6 @@ int LSTM_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Ma | |||
| return 0; | |||
| } | |||
| #endif // NCNN_BF16 | |||
| } // namespace ncnn | |||
| @@ -36,9 +36,11 @@ protected: | |||
| int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| int forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; | |||
| #endif | |||
| #if NCNN_BF16 | |||
| int create_pipeline_bf16s(const Option& opt); | |||
| int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| int forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; | |||
| #endif | |||
| public: | |||
| Mat weight_xc_data_packed; | |||
| @@ -35,7 +35,9 @@ Mish_arm::Mish_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| int Mish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| @@ -52,8 +54,10 @@ int Mish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_inplace_bf16s(bottom_top_blob, opt); | |||
| #endif | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| @@ -243,6 +247,7 @@ int Mish_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) co | |||
| } | |||
| #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_BF16 | |||
| int Mish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_top_blob.w; | |||
| @@ -304,5 +309,6 @@ int Mish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) con | |||
| return 0; | |||
| } | |||
| #endif // NCNN_BF16 | |||
| } // namespace ncnn | |||
| @@ -31,7 +31,9 @@ protected: | |||
| int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; | |||
| int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; | |||
| #endif | |||
| #if NCNN_BF16 | |||
| int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; | |||
| #endif | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -38,7 +38,9 @@ Padding_arm::Padding_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| int Padding_arm::create_pipeline(const Option& opt) | |||
| @@ -50,12 +52,14 @@ int Padding_arm::create_pipeline(const Option& opt) | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage) | |||
| { | |||
| value_bf16 = float32_to_bfloat16(value); | |||
| ncnn::cast_float32_to_bfloat16(per_channel_pad_data, per_channel_pad_data_bf16, opt); | |||
| } | |||
| #endif | |||
| return 0; | |||
| } | |||
| @@ -83,8 +87,10 @@ int Padding_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op | |||
| return forward_bf16s_fp16s(bottom_blob, top_blob, opt); | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_bf16s_fp16s(bottom_blob, top_blob, opt); | |||
| #endif | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| @@ -352,19 +358,28 @@ int Padding_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons | |||
| { | |||
| Mat borderm = top_blob.channel(q); | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| // clang-format off | |||
| // *INDENT-OFF* | |||
| uint16x4_t pad_value; | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| if (opt.use_fp16_storage) | |||
| { | |||
| pad_value = per_channel_pad_data_size ? vreinterpret_u16_f16(vld1_f16((const __fp16*)per_channel_pad_data_fp16 + q * 4)) : vreinterpret_u16_f16(vdup_n_f16((__fp16)value)); | |||
| } | |||
| else | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage) | |||
| { | |||
| pad_value = per_channel_pad_data_size ? vld1_u16((const unsigned short*)per_channel_pad_data_bf16 + q * 4) : vdup_n_u16(value_bf16); | |||
| } | |||
| #else | |||
| uint16x4_t pad_value = per_channel_pad_data_size ? vld1_u16((const unsigned short*)per_channel_pad_data_bf16 + q * 4) : vdup_n_u16(value_bf16); | |||
| else | |||
| #endif | |||
| { | |||
| } | |||
| // *INDENT-ON* | |||
| // clang-format on | |||
| //Channel padding | |||
| if ((q - front_) < 0 || (q - front_) >= channels) | |||
| { | |||
| @@ -34,9 +34,11 @@ protected: | |||
| int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| public: | |||
| #if NCNN_BF16 | |||
| // bf16 | |||
| unsigned short value_bf16; | |||
| Mat per_channel_pad_data_bf16; | |||
| #endif | |||
| // fp16 | |||
| Mat per_channel_pad_data_fp16; | |||
| @@ -31,7 +31,9 @@ PixelShuffle_arm::PixelShuffle_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| int PixelShuffle_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| @@ -43,8 +45,10 @@ int PixelShuffle_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Optio | |||
| return forward_bf16s_fp16s(bottom_blob, top_blob, opt); | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_bf16s_fp16s(bottom_blob, top_blob, opt); | |||
| #endif | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| @@ -39,7 +39,9 @@ Pooling_arm::Pooling_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| int Pooling_arm::create_pipeline(const Option& /*opt*/) | |||
| @@ -78,8 +80,10 @@ int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_bf16s(bottom_blob, top_blob, opt); | |||
| #endif | |||
| // max value in NxN window | |||
| // avg value in NxN window | |||
| @@ -1235,6 +1239,7 @@ int Pooling_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Opt | |||
| } | |||
| #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_BF16 | |||
| int Pooling_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| // max value in NxN window | |||
| @@ -1644,5 +1649,6 @@ int Pooling_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Opti | |||
| return 0; | |||
| } | |||
| #endif // NCNN_BF16 | |||
| } // namespace ncnn | |||
| @@ -32,7 +32,9 @@ protected: | |||
| int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| #endif | |||
| #if NCNN_BF16 | |||
| int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| #endif | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -29,7 +29,9 @@ PReLU_arm::PReLU_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| int PReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| @@ -46,8 +48,10 @@ int PReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_inplace_bf16s(bottom_top_blob, opt); | |||
| #endif | |||
| int dims = bottom_top_blob.dims; | |||
| int elempack = bottom_top_blob.elempack; | |||
| @@ -816,6 +820,7 @@ int PReLU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c | |||
| } | |||
| #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_BF16 | |||
| int PReLU_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| int dims = bottom_top_blob.dims; | |||
| @@ -1033,5 +1038,6 @@ int PReLU_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) co | |||
| return 0; | |||
| } | |||
| #endif // NCNN_BF16 | |||
| } // namespace ncnn | |||
| @@ -31,7 +31,9 @@ protected: | |||
| int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; | |||
| int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; | |||
| #endif | |||
| #if NCNN_BF16 | |||
| int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; | |||
| #endif | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -34,7 +34,9 @@ Quantize_arm::Quantize_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| @@ -51,8 +53,10 @@ int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_bf16s(bottom_blob, top_blob, opt); | |||
| #endif | |||
| int dims = bottom_blob.dims; | |||
| int elempack = bottom_blob.elempack; | |||
| @@ -1552,6 +1556,7 @@ int Quantize_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Op | |||
| } | |||
| #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_BF16 | |||
| int Quantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| int dims = bottom_blob.dims; | |||
| @@ -1953,5 +1958,6 @@ int Quantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Opt | |||
| return 0; | |||
| } | |||
| #endif // NCNN_BF16 | |||
| } // namespace ncnn | |||
| @@ -31,7 +31,9 @@ protected: | |||
| int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| #endif | |||
| #if NCNN_BF16 | |||
| int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| #endif | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -29,7 +29,9 @@ ReLU_arm::ReLU_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| int ReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| @@ -44,8 +46,10 @@ int ReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| return forward_inplace_fp16s(bottom_top_blob, opt); | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_inplace_bf16s(bottom_top_blob, opt); | |||
| #endif | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| @@ -578,6 +582,7 @@ int ReLU_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) con | |||
| } | |||
| #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_BF16 | |||
| int ReLU_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_top_blob.w; | |||
| @@ -881,6 +886,7 @@ int ReLU_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) con | |||
| return 0; | |||
| } | |||
| #endif // NCNN_BF16 | |||
| int ReLU_arm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| @@ -30,7 +30,9 @@ protected: | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; | |||
| #endif | |||
| #if NCNN_BF16 | |||
| int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; | |||
| #endif | |||
| int forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const; | |||
| }; | |||
| @@ -29,7 +29,9 @@ Reshape_arm::Reshape_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| int Reshape_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| @@ -41,8 +43,10 @@ int Reshape_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op | |||
| return forward_bf16s_fp16s(bottom_blob, top_blob, opt); | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_bf16s_fp16s(bottom_blob, top_blob, opt); | |||
| #endif | |||
| int elempack = bottom_blob.elempack; | |||
| @@ -32,7 +32,9 @@ RNN_arm::RNN_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| int RNN_arm::create_pipeline(const Option& opt) | |||
| @@ -44,10 +46,12 @@ int RNN_arm::create_pipeline(const Option& opt) | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage) | |||
| { | |||
| return create_pipeline_bf16s(opt); | |||
| } | |||
| #endif | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| int size = weight_data_size / num_directions / num_output; | |||
| @@ -309,8 +313,10 @@ int RNN_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_bf16s(bottom_blob, top_blob, opt); | |||
| #endif | |||
| int T = bottom_blob.h; | |||
| @@ -390,8 +396,10 @@ int RNN_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_bf16s(bottom_blobs, top_blobs, opt); | |||
| #endif | |||
| int T = bottom_blob.h; | |||
| Mat& top_blob = top_blobs[0]; | |||
| @@ -1067,6 +1075,7 @@ int RNN_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Ma | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| static int rnn_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt) | |||
| { | |||
| int size = bottom_blob.w; | |||
| @@ -1400,5 +1409,6 @@ int RNN_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat | |||
| return 0; | |||
| } | |||
| #endif // NCNN_BF16 | |||
| } // namespace ncnn | |||
| @@ -36,9 +36,11 @@ protected: | |||
| int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| int forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; | |||
| #endif | |||
| #if NCNN_BF16 | |||
| int create_pipeline_bf16s(const Option& opt); | |||
| int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| int forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; | |||
| #endif | |||
| public: | |||
| Mat weight_xc_data_packed; | |||
| @@ -31,7 +31,9 @@ ShuffleChannel_arm::ShuffleChannel_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| int ShuffleChannel_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| @@ -43,8 +45,10 @@ int ShuffleChannel_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Opt | |||
| return forward_bf16s_fp16s(bottom_blob, top_blob, opt); | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_bf16s_fp16s(bottom_blob, top_blob, opt); | |||
| #endif | |||
| int channels = bottom_blob.c; | |||
| int elempack = bottom_blob.elempack; | |||
| @@ -36,7 +36,9 @@ Sigmoid_arm::Sigmoid_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| @@ -53,8 +55,10 @@ int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_inplace_bf16s(bottom_top_blob, opt); | |||
| #endif | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| @@ -247,6 +251,7 @@ int Sigmoid_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) | |||
| } | |||
| #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_BF16 | |||
| int Sigmoid_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_top_blob.w; | |||
| @@ -311,5 +316,6 @@ int Sigmoid_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) | |||
| return 0; | |||
| } | |||
| #endif // NCNN_BF16 | |||
| } // namespace ncnn | |||
| @@ -31,7 +31,9 @@ protected: | |||
| int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; | |||
| int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; | |||
| #endif | |||
| #if NCNN_BF16 | |||
| int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; | |||
| #endif | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -25,7 +25,9 @@ Slice_arm::Slice_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| int Slice_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const | |||
| @@ -37,8 +39,10 @@ int Slice_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& t | |||
| return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt); | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt); | |||
| #endif | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| int dims = bottom_blob.dims; | |||
| @@ -35,7 +35,9 @@ Swish_arm::Swish_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| int Swish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| @@ -52,8 +54,10 @@ int Swish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_inplace_bf16s(bottom_top_blob, opt); | |||
| #endif | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| @@ -250,6 +254,7 @@ int Swish_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c | |||
| } | |||
| #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_BF16 | |||
| int Swish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_top_blob.w; | |||
| @@ -313,5 +318,6 @@ int Swish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) co | |||
| return 0; | |||
| } | |||
| #endif // NCNN_BF16 | |||
| } // namespace ncnn | |||
| @@ -31,7 +31,9 @@ protected: | |||
| int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; | |||
| int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; | |||
| #endif | |||
| #if NCNN_BF16 | |||
| int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; | |||
| #endif | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -35,7 +35,9 @@ TanH_arm::TanH_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| int TanH_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| @@ -52,8 +54,10 @@ int TanH_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_inplace_bf16s(bottom_top_blob, opt); | |||
| #endif | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| @@ -243,6 +247,7 @@ int TanH_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) co | |||
| } | |||
| #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #if NCNN_BF16 | |||
| int TanH_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_top_blob.w; | |||
| @@ -304,5 +309,6 @@ int TanH_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) con | |||
| return 0; | |||
| } | |||
| #endif // NCNN_BF16 | |||
| } // namespace ncnn | |||
| @@ -31,7 +31,9 @@ protected: | |||
| int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; | |||
| int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; | |||
| #endif | |||
| #if NCNN_BF16 | |||
| int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; | |||
| #endif | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -35,7 +35,9 @@ UnaryOp_arm::UnaryOp_arm() | |||
| #endif | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| #if __ARM_NEON | |||
| @@ -265,8 +267,10 @@ int UnaryOp_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| return forward_inplace_fp16s(bottom_top_blob, opt); | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_inplace_bf16s(bottom_top_blob, opt); | |||
| #endif | |||
| int elempack = bottom_top_blob.elempack; | |||
| @@ -1099,6 +1103,7 @@ static int unary_op_inplace_pack4_bf16s(Mat& a, const Option& opt) | |||
| } | |||
| #endif // __ARM_NEON | |||
| #if NCNN_BF16 | |||
| template<typename Op> | |||
| static int unary_op_inplace_bf16s(Mat& a, const Option& opt) | |||
| { | |||
| @@ -1375,5 +1380,6 @@ int UnaryOp_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) | |||
| return 0; | |||
| } | |||
| #endif // NCNN_BF16 | |||
| } // namespace ncnn | |||
| @@ -30,7 +30,9 @@ protected: | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; | |||
| #endif | |||
| #if NCNN_BF16 | |||
| int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; | |||
| #endif | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -35,7 +35,9 @@ Concat_riscv::Concat_riscv() | |||
| #endif | |||
| #endif // __riscv_vector | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| int Concat_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const | |||
| @@ -47,8 +49,10 @@ int Concat_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat> | |||
| return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt); | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt); | |||
| #endif | |||
| #if __riscv_vector | |||
| const int packn = csrr_vlenb() / 4; | |||
| @@ -35,7 +35,9 @@ Crop_riscv::Crop_riscv() | |||
| #endif | |||
| #endif // __riscv_vector | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| #if __riscv_vector | |||
| @@ -35,7 +35,9 @@ Flatten_riscv::Flatten_riscv() | |||
| #endif | |||
| #endif // __riscv_vector | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| int Flatten_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| @@ -50,8 +52,10 @@ int Flatten_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& | |||
| return forward_bf16s_fp16s(bottom_blob, top_blob, opt); | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage && elembits == 16) | |||
| return forward_bf16s_fp16s(bottom_blob, top_blob, opt); | |||
| #endif | |||
| int dims = bottom_blob.dims; | |||
| @@ -39,7 +39,9 @@ Padding_riscv::Padding_riscv() | |||
| #endif | |||
| #endif // __riscv_vector | |||
| #if NCNN_BF16 | |||
| support_bf16_storage = true; | |||
| #endif | |||
| } | |||
| int Padding_riscv::create_pipeline(const Option& opt) | |||
| @@ -51,12 +53,14 @@ int Padding_riscv::create_pipeline(const Option& opt) | |||
| } | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage) | |||
| { | |||
| value_bf16 = float32_to_bfloat16(value); | |||
| ncnn::cast_float32_to_bfloat16(per_channel_pad_data, per_channel_pad_data_bf16, opt); | |||
| } | |||
| #endif | |||
| return 0; | |||
| } | |||
| @@ -282,19 +286,28 @@ int Padding_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co | |||
| { | |||
| Mat borderm = top_blob.channel(q); | |||
| #if __riscv_zfh | |||
| // clang-format off | |||
| // *INDENT-OFF* | |||
| vuint16m1_t pad_value; | |||
| #if __riscv_zfh | |||
| if (opt.use_fp16_storage) | |||
| { | |||
| pad_value = per_channel_pad_data_size ? vreinterpret_v_f16m1_u16m1(vle16_v_f16m1((const __fp16*)per_channel_pad_data_fp16 + q * packn, vl)) : vreinterpret_v_f16m1_u16m1(vfmv_v_f_f16m1((__fp16)value, vl)); | |||
| } | |||
| else | |||
| #endif | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage) | |||
| { | |||
| pad_value = per_channel_pad_data_size ? vle16_v_u16m1((const unsigned short*)per_channel_pad_data_bf16 + q * packn, vl) : vmv_v_x_u16m1(value_bf16, vl); | |||
| } | |||
| #else | |||
| vuint16m1_t pad_value = per_channel_pad_data_size ? vle16_v_u16m1((const unsigned short*)per_channel_pad_data_bf16 + q * packn, vl) : vmv_v_x_u16m1(value_bf16, vl); | |||
| else | |||
| #endif | |||
| { | |||
| } | |||
| // *INDENT-ON* | |||
| // clang-format on | |||
| //Channel padding | |||
| if ((q - front_) < 0 || (q - front_) >= channels) | |||
| { | |||
| @@ -34,9 +34,11 @@ protected: | |||
| int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| public: | |||
| #if NCNN_BF16 | |||
| // bf16 | |||
| unsigned short value_bf16; | |||
| Mat per_channel_pad_data_bf16; | |||
| #endif | |||
| // fp16 | |||
| Mat per_channel_pad_data_fp16; | |||
| @@ -766,6 +766,7 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio | |||
| } | |||
| else | |||
| #endif // NCNN_RVV | |||
| #if NCNN_BF16 | |||
| if (opt.use_bf16_storage) | |||
| { | |||
| if (bottom_blob.elembits() == 32 && layer->support_bf16_storage) | |||
| @@ -781,6 +782,11 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio | |||
| bottom_blob = bottom_blob_fp32; | |||
| } | |||
| } | |||
| else | |||
| #endif // NCNN_BF16 | |||
| { | |||
| // no type conversion | |||
| } | |||
| // *INDENT-ON* | |||
| // clang-format on | |||
| @@ -2582,6 +2588,7 @@ int Extractor::extract(int blob_index, Mat& feat, int type) | |||
| } | |||
| else | |||
| #endif // NCNN_ARM82 | |||
| #if NCNN_BF16 | |||
| if (d->opt.use_bf16_storage && (type == 0)) | |||
| { | |||
| if (feat.elembits() == 16) | |||
| @@ -2591,7 +2598,9 @@ int Extractor::extract(int blob_index, Mat& feat, int type) | |||
| feat = feat_fp32; | |||
| } | |||
| } | |||
| else if (feat.elembits() == 8 && (type == 0)) | |||
| else | |||
| #endif // NCNN_BF16 | |||
| if (feat.elembits() == 8 && (type == 0)) | |||
| { | |||
| Mat feat_fp32; | |||
| cast_int8_to_float32(feat, feat_fp32, d->opt); | |||
| @@ -37,6 +37,7 @@ | |||
| #cmakedefine01 NCNN_MMI | |||
| #cmakedefine01 NCNN_RVV | |||
| #cmakedefine01 NCNN_INT8 | |||
| #cmakedefine01 NCNN_BF16 | |||
| #cmakedefine NCNN_VERSION_STRING "@NCNN_VERSION_STRING@" | |||