diff --git a/CMakeLists.txt b/CMakeLists.txt index 5309c5b40..1c4a578be 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -81,6 +81,7 @@ option(NCNN_COVERAGE "build for coverage" OFF) option(NCNN_BUILD_BENCHMARK "build benchmark" ON) option(NCNN_PYTHON "build python api" OFF) option(NCNN_INT8 "int8 inference" ON) +option(NCNN_BF16 "bf16 inference" ON) if(ANDROID OR IOS OR NCNN_SIMPLESTL OR CMAKE_CROSSCOMPILING) option(NCNN_DISABLE_RTTI "disable rtti" ON) diff --git a/src/layer/arm/batchnorm_arm.cpp b/src/layer/arm/batchnorm_arm.cpp index 602befaea..d2087f22a 100644 --- a/src/layer/arm/batchnorm_arm.cpp +++ b/src/layer/arm/batchnorm_arm.cpp @@ -29,7 +29,9 @@ BatchNorm_arm::BatchNorm_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif } int BatchNorm_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const @@ -46,8 +48,10 @@ int BatchNorm_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons } #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_inplace_bf16s(bottom_top_blob, opt); +#endif int dims = bottom_top_blob.dims; int elempack = bottom_top_blob.elempack; @@ -660,6 +664,7 @@ int BatchNorm_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if NCNN_BF16 int BatchNorm_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const { int dims = bottom_top_blob.dims; @@ -829,5 +834,6 @@ int BatchNorm_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt return 0; } +#endif // NCNN_BF16 } // namespace ncnn diff --git a/src/layer/arm/batchnorm_arm.h b/src/layer/arm/batchnorm_arm.h index c50cd47af..f6d5f9896 100644 --- a/src/layer/arm/batchnorm_arm.h +++ b/src/layer/arm/batchnorm_arm.h @@ -31,7 +31,9 @@ protected: int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; #endif +#if NCNN_BF16 int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; +#endif }; } // namespace ncnn diff --git a/src/layer/arm/binaryop_arm.cpp b/src/layer/arm/binaryop_arm.cpp index c5c098b60..d7b97ed02 100644 --- a/src/layer/arm/binaryop_arm.cpp +++ b/src/layer/arm/binaryop_arm.cpp @@ -33,7 +33,9 @@ BinaryOp_arm::BinaryOp_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif } #if __ARM_NEON @@ -812,8 +814,10 @@ int BinaryOp_arm::forward(const std::vector& bottom_blobs, std::vector return forward_fp16s(bottom_blobs, top_blobs, opt); #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_bf16s(bottom_blobs, top_blobs, opt); +#endif const Mat& bottom_blob = bottom_blobs[0]; const Mat& bottom_blob1 = bottom_blobs[1]; @@ -866,8 +870,10 @@ int BinaryOp_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const return forward_inplace_fp16s(bottom_top_blob, opt); #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_inplace_bf16s(bottom_top_blob, opt); +#endif #if __ARM_NEON int elempack = bottom_top_blob.elempack; @@ -3258,6 +3264,7 @@ int BinaryOp_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if NCNN_BF16 #if __ARM_NEON template static int binary_op_pack4_bf16s(const Mat& a, const Mat& b, Mat& c, const Option& opt) @@ -4727,5 +4734,6 @@ int BinaryOp_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) return 0; } +#endif // NCNN_BF16 } // namespace ncnn diff --git a/src/layer/arm/binaryop_arm.h b/src/layer/arm/binaryop_arm.h index 88f89638d..cc142f69b 100644 --- a/src/layer/arm/binaryop_arm.h +++ b/src/layer/arm/binaryop_arm.h @@ -33,8 +33,10 @@ protected: int forward_fp16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; #endif +#if NCNN_BF16 int forward_bf16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; +#endif }; } // namespace ncnn diff --git a/src/layer/arm/clip_arm.cpp b/src/layer/arm/clip_arm.cpp index ceeac659d..1285d73a4 100644 --- a/src/layer/arm/clip_arm.cpp +++ b/src/layer/arm/clip_arm.cpp @@ -29,7 +29,9 @@ Clip_arm::Clip_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif } int Clip_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const @@ -41,8 +43,10 @@ int Clip_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const return forward_inplace_fp16s(bottom_top_blob, opt); #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_inplace_bf16s(bottom_top_blob, opt); +#endif int w = bottom_top_blob.w; int h = bottom_top_blob.h; @@ -259,6 +263,7 @@ int Clip_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) con } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if NCNN_BF16 int Clip_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; @@ -334,5 +339,6 @@ int Clip_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) con return 0; } +#endif // NCNN_BF16 } // namespace ncnn diff --git a/src/layer/arm/clip_arm.h b/src/layer/arm/clip_arm.h index 83ab1d6c9..73ab895f1 100644 --- a/src/layer/arm/clip_arm.h +++ b/src/layer/arm/clip_arm.h @@ -30,7 +30,9 @@ protected: #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; #endif +#if NCNN_BF16 int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; +#endif }; } // namespace ncnn diff --git a/src/layer/arm/concat_arm.cpp b/src/layer/arm/concat_arm.cpp index 794ede95c..42a505b5e 100644 --- a/src/layer/arm/concat_arm.cpp +++ b/src/layer/arm/concat_arm.cpp @@ -25,7 +25,9 @@ Concat_arm::Concat_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif } int Concat_arm::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const @@ -37,8 +39,10 @@ int Concat_arm::forward(const std::vector& bottom_blobs, std::vector& return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt); #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt); +#endif int dims = bottom_blobs[0].dims; int positive_axis = axis < 0 ? dims + axis : axis; diff --git a/src/layer/arm/convolution_arm.cpp b/src/layer/arm/convolution_arm.cpp index 2aed43bad..b8a2917c2 100644 --- a/src/layer/arm/convolution_arm.cpp +++ b/src/layer/arm/convolution_arm.cpp @@ -27,17 +27,20 @@ namespace ncnn { -#include "convolution_bf16s.h" #include "convolution_sgemm.h" #include "convolution_1x1.h" -#include "convolution_1x1_bf16s.h" #include "convolution_2x2.h" #include "convolution_3x3.h" #include "convolution_4x4.h" #include "convolution_5x5.h" #include "convolution_7x7.h" +#if NCNN_BF16 +#include "convolution_bf16s.h" +#include "convolution_1x1_bf16s.h" +#endif // NCNN_BF16 + #if NCNN_INT8 #include "convolution_sgemm_int8.h" #include "convolution_1x1_int8.h" @@ -47,27 +50,30 @@ namespace ncnn { #if __ARM_NEON #include "convolution_pack4.h" -#include "convolution_pack4_bf16s.h" #include "convolution_pack1to4.h" -#include "convolution_pack1to4_bf16s.h" #include "convolution_pack4to1.h" -#include "convolution_pack4to1_bf16s.h" #include "convolution_sgemm_pack4.h" -#include "convolution_sgemm_pack4_bf16s.h" #include "convolution_1x1_pack4.h" -#include "convolution_1x1_pack4_bf16s.h" #include "convolution_1x1_pack4to1.h" -#include "convolution_1x1_pack4to1_bf16s.h" #include "convolution_3x3_pack1to4.h" -#include "convolution_3x3_pack1to4_bf16s.h" #include "convolution_3x3_pack4.h" -#include "convolution_3x3_pack4_bf16s.h" #include "convolution_3x3_pack4to1.h" -#include "convolution_3x3_pack4to1_bf16s.h" #include "convolution_5x5_pack4.h" -#include "convolution_5x5_pack4_bf16s.h" #include "convolution_7x7_pack1to4.h" + +#if NCNN_BF16 +#include "convolution_pack4_bf16s.h" +#include "convolution_pack1to4_bf16s.h" +#include "convolution_pack4to1_bf16s.h" +#include "convolution_sgemm_pack4_bf16s.h" +#include "convolution_1x1_pack4_bf16s.h" +#include "convolution_1x1_pack4to1_bf16s.h" +#include "convolution_3x3_pack1to4_bf16s.h" +#include "convolution_3x3_pack4_bf16s.h" +#include "convolution_3x3_pack4to1_bf16s.h" +#include "convolution_5x5_pack4_bf16s.h" #include "convolution_7x7_pack1to4_bf16s.h" +#endif // NCNN_BF16 #if NCNN_INT8 #include "convolution_pack8to4_int8.h" @@ -122,7 +128,9 @@ Convolution_arm::Convolution_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif activation = 0; convolution_dilation1 = 0; @@ -188,10 +196,12 @@ int Convolution_arm::create_pipeline(const Option& opt) } #endif +#if NCNN_BF16 if (opt.use_bf16_storage) { return create_pipeline_bf16s(opt); } +#endif if ((!support_packing || !opt.use_packing_layout) && !opt.use_bf16_storage && kernel_w == kernel_h && dilation_w != 1 && dilation_h == dilation_w && stride_w == 1 && stride_h == 1) { @@ -449,8 +459,10 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option } #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_bf16s(bottom_blob, top_blob, opt); +#endif int w = bottom_blob.w; int h = bottom_blob.h; @@ -1538,6 +1550,7 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if NCNN_BF16 int Convolution_arm::create_pipeline_bf16s(const Option& opt) { const int maxk = kernel_w * kernel_h; @@ -1812,6 +1825,7 @@ int Convolution_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const return 0; } +#endif // NCNN_BF16 #if NCNN_INT8 int Convolution_arm::create_pipeline_int8_arm(const Option& opt) diff --git a/src/layer/arm/convolution_arm.h b/src/layer/arm/convolution_arm.h index e0eca6a08..18d4fd57c 100644 --- a/src/layer/arm/convolution_arm.h +++ b/src/layer/arm/convolution_arm.h @@ -35,8 +35,10 @@ protected: int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif +#if NCNN_BF16 int create_pipeline_bf16s(const Option& opt); int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif #if NCNN_INT8 int create_pipeline_int8_arm(const Option& opt); int forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; @@ -66,8 +68,10 @@ public: Mat weight_data_fp16; Mat bias_data_fp16; +#if NCNN_BF16 // bf16 Mat weight_data_bf16; +#endif #if NCNN_INT8 // int8 diff --git a/src/layer/arm/convolutiondepthwise_arm.cpp b/src/layer/arm/convolutiondepthwise_arm.cpp index cd8124108..0f0d5a488 100644 --- a/src/layer/arm/convolutiondepthwise_arm.cpp +++ b/src/layer/arm/convolutiondepthwise_arm.cpp @@ -34,9 +34,12 @@ namespace ncnn { #if __ARM_NEON #include "convolutiondepthwise_3x3_pack4.h" -#include "convolutiondepthwise_3x3_pack4_bf16s.h" #include "convolutiondepthwise_5x5_pack4.h" + +#if NCNN_BF16 +#include "convolutiondepthwise_3x3_pack4_bf16s.h" #include "convolutiondepthwise_5x5_pack4_bf16s.h" +#endif // NCNN_BF16 #if NCNN_INT8 #include "convolutiondepthwise_3x3_pack8_int8.h" @@ -58,7 +61,9 @@ ConvolutionDepthWise_arm::ConvolutionDepthWise_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif activation = 0; } @@ -167,6 +172,7 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt) } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if NCNN_BF16 if (opt.use_bf16_storage) { #if __ARM_NEON @@ -186,6 +192,7 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt) return 0; } +#endif // NCNN_BF16 #if __ARM_NEON // pack4 @@ -362,8 +369,10 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con } #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_bf16s(bottom_blob, top_blob, opt); +#endif int w = bottom_blob.w; int h = bottom_blob.h; @@ -1158,6 +1167,7 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if NCNN_BF16 int ConvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { int w = bottom_blob.w; @@ -1456,6 +1466,7 @@ int ConvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blo return 0; } +#endif // NCNN_BF16 #if NCNN_INT8 int ConvolutionDepthWise_arm::create_pipeline_int8_arm(const Option& opt) diff --git a/src/layer/arm/convolutiondepthwise_arm.h b/src/layer/arm/convolutiondepthwise_arm.h index 2cff01cb9..fb9bc427f 100644 --- a/src/layer/arm/convolutiondepthwise_arm.h +++ b/src/layer/arm/convolutiondepthwise_arm.h @@ -35,7 +35,9 @@ protected: int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif +#if NCNN_BF16 int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif #if NCNN_INT8 int create_pipeline_int8_arm(const Option& opt); int forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; @@ -52,9 +54,11 @@ public: Mat weight_data_fp16; Mat bias_data_fp16; +#if NCNN_BF16 // bf16 Mat weight_data_bf16; Mat weight_data_pack4_bf16; +#endif #if NCNN_INT8 // int8 diff --git a/src/layer/arm/crop_arm.cpp b/src/layer/arm/crop_arm.cpp index 1b4b521b5..cf5178bf7 100644 --- a/src/layer/arm/crop_arm.cpp +++ b/src/layer/arm/crop_arm.cpp @@ -29,7 +29,9 @@ Crop_arm::Crop_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif } #if __ARM_NEON diff --git a/src/layer/arm/deconvolution_arm.cpp b/src/layer/arm/deconvolution_arm.cpp index 38cf04eff..498137dc4 100644 --- a/src/layer/arm/deconvolution_arm.cpp +++ b/src/layer/arm/deconvolution_arm.cpp @@ -40,7 +40,9 @@ Deconvolution_arm::Deconvolution_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif activation = 0; } @@ -91,10 +93,12 @@ int Deconvolution_arm::create_pipeline(const Option& opt) } #endif +#if NCNN_BF16 if (opt.use_bf16_storage) { return create_pipeline_bf16s(opt); } +#endif const int maxk = kernel_w * kernel_h; int num_input = weight_data_size / maxk / num_output; @@ -308,8 +312,10 @@ int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Opti } #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_bf16s(bottom_blob, top_blob, opt); +#endif // deconvolv with NxN kernel // value = value + bias @@ -1899,6 +1905,7 @@ int Deconvolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if NCNN_BF16 int Deconvolution_arm::create_pipeline_bf16s(const Option& opt) { const int maxk = kernel_w * kernel_h; @@ -2337,5 +2344,6 @@ int Deconvolution_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, cons return 0; } +#endif // NCNN_BF16 } // namespace ncnn diff --git a/src/layer/arm/deconvolution_arm.h b/src/layer/arm/deconvolution_arm.h index b8dd38cf1..08bb004eb 100644 --- a/src/layer/arm/deconvolution_arm.h +++ b/src/layer/arm/deconvolution_arm.h @@ -35,8 +35,10 @@ protected: int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif +#if NCNN_BF16 int create_pipeline_bf16s(const Option& opt); int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif public: Layer* activation; @@ -51,8 +53,10 @@ public: Mat weight_data_fp16; Mat bias_data_fp16; +#if NCNN_BF16 // bf16 Mat weight_data_bf16; +#endif }; } // namespace ncnn diff --git a/src/layer/arm/deconvolutiondepthwise_arm.cpp b/src/layer/arm/deconvolutiondepthwise_arm.cpp index 987c54c2d..99a33a1a0 100644 --- a/src/layer/arm/deconvolutiondepthwise_arm.cpp +++ b/src/layer/arm/deconvolutiondepthwise_arm.cpp @@ -33,7 +33,9 @@ DeconvolutionDepthWise_arm::DeconvolutionDepthWise_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif } int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt) @@ -101,6 +103,7 @@ int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt) } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if NCNN_BF16 if (opt.use_bf16_storage) { #if __ARM_NEON @@ -120,6 +123,7 @@ int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt) return 0; } +#endif // NCNN_BF16 #if __ARM_NEON // pack4 @@ -228,8 +232,10 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, c } #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_bf16s(bottom_blob, top_blob, opt); +#endif // convolv with NxN kernel // value = value + bias @@ -986,6 +992,7 @@ int DeconvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_ } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if NCNN_BF16 int DeconvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { int w = bottom_blob.w; @@ -1224,5 +1231,6 @@ int DeconvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_b return 0; } +#endif // NCNN_BF16 } // namespace ncnn diff --git a/src/layer/arm/deconvolutiondepthwise_arm.h b/src/layer/arm/deconvolutiondepthwise_arm.h index ba22418a1..9ef46009a 100644 --- a/src/layer/arm/deconvolutiondepthwise_arm.h +++ b/src/layer/arm/deconvolutiondepthwise_arm.h @@ -34,7 +34,9 @@ protected: int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif +#if NCNN_BF16 int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif public: std::vector group_ops; @@ -47,8 +49,10 @@ public: Mat weight_data_fp16; Mat bias_data_fp16; +#if NCNN_BF16 // bf16 Mat weight_data_bf16; +#endif }; } // namespace ncnn diff --git a/src/layer/arm/dequantize_arm.cpp b/src/layer/arm/dequantize_arm.cpp index db4a01755..ea345ded9 100644 --- a/src/layer/arm/dequantize_arm.cpp +++ b/src/layer/arm/dequantize_arm.cpp @@ -30,7 +30,9 @@ Dequantize_arm::Dequantize_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif } int Dequantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const @@ -47,8 +49,10 @@ int Dequantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& } #endif +#if NCNN_BF16 if (opt.use_bf16_storage) return forward_bf16s(bottom_blob, top_blob, opt); +#endif int dims = bottom_blob.dims; int elempack = bottom_blob.elempack; @@ -2285,6 +2289,7 @@ int Dequantize_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if NCNN_BF16 int Dequantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { int dims = bottom_blob.dims; @@ -3038,5 +3043,6 @@ int Dequantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const O return 0; } +#endif // NCNN_BF16 } // namespace ncnn diff --git a/src/layer/arm/dequantize_arm.h b/src/layer/arm/dequantize_arm.h index 0b0014fd6..8949f3956 100644 --- a/src/layer/arm/dequantize_arm.h +++ b/src/layer/arm/dequantize_arm.h @@ -31,7 +31,9 @@ protected: int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif +#if NCNN_BF16 int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif }; } // namespace ncnn diff --git a/src/layer/arm/eltwise_arm.cpp b/src/layer/arm/eltwise_arm.cpp index 7a03b7f06..dda0a7755 100644 --- a/src/layer/arm/eltwise_arm.cpp +++ b/src/layer/arm/eltwise_arm.cpp @@ -29,7 +29,9 @@ Eltwise_arm::Eltwise_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif } int Eltwise_arm::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const @@ -46,8 +48,10 @@ int Eltwise_arm::forward(const std::vector& bottom_blobs, std::vector& } #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_bf16s(bottom_blobs, top_blobs, opt); +#endif const Mat& bottom_blob = bottom_blobs[0]; int w = bottom_blob.w; @@ -2213,6 +2217,7 @@ int Eltwise_arm::forward_fp16sa(const std::vector& bottom_blobs, std::vecto } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if NCNN_BF16 int Eltwise_arm::forward_bf16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { const Mat& bottom_blob = bottom_blobs[0]; @@ -2980,5 +2985,6 @@ int Eltwise_arm::forward_bf16s(const std::vector& bottom_blobs, std::vector return 0; } +#endif // NCNN_BF16 } // namespace ncnn diff --git a/src/layer/arm/eltwise_arm.h b/src/layer/arm/eltwise_arm.h index c41197bfb..977458821 100644 --- a/src/layer/arm/eltwise_arm.h +++ b/src/layer/arm/eltwise_arm.h @@ -31,7 +31,9 @@ protected: int forward_fp16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; int forward_fp16sa(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; #endif +#if NCNN_BF16 int forward_bf16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +#endif }; } // namespace ncnn diff --git a/src/layer/arm/flatten_arm.cpp b/src/layer/arm/flatten_arm.cpp index a5664402a..015c5849a 100644 --- a/src/layer/arm/flatten_arm.cpp +++ b/src/layer/arm/flatten_arm.cpp @@ -29,7 +29,9 @@ Flatten_arm::Flatten_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif // NCNN_BF16 } int Flatten_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const @@ -44,8 +46,10 @@ int Flatten_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op return forward_bf16s_fp16s(bottom_blob, top_blob, opt); #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_bf16s_fp16s(bottom_blob, top_blob, opt); +#endif int dims = bottom_blob.dims; diff --git a/src/layer/arm/gru_arm.cpp b/src/layer/arm/gru_arm.cpp index e5f3e6a59..97a2bb61e 100644 --- a/src/layer/arm/gru_arm.cpp +++ b/src/layer/arm/gru_arm.cpp @@ -32,7 +32,9 @@ GRU_arm::GRU_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif } int GRU_arm::create_pipeline(const Option& opt) @@ -44,10 +46,12 @@ int GRU_arm::create_pipeline(const Option& opt) } #endif +#if NCNN_BF16 if (opt.use_bf16_storage) { return create_pipeline_bf16s(opt); } +#endif // pack RUN int num_directions = direction == 2 ? 2 : 1; @@ -627,8 +631,10 @@ int GRU_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c } #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_bf16s(bottom_blob, top_blob, opt); +#endif int T = bottom_blob.h; @@ -708,8 +714,10 @@ int GRU_arm::forward(const std::vector& bottom_blobs, std::vector& top } #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_bf16s(bottom_blobs, top_blobs, opt); +#endif int T = bottom_blob.h; Mat& top_blob = top_blobs[0]; @@ -1727,6 +1735,7 @@ int GRU_arm::forward_fp16sa(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; #endif +#if NCNN_BF16 int create_pipeline_bf16s(const Option& opt); int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_bf16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +#endif public: Mat weight_xc_data_packed; diff --git a/src/layer/arm/hardsigmoid_arm.cpp b/src/layer/arm/hardsigmoid_arm.cpp index 3a94d7aaf..f7105b193 100644 --- a/src/layer/arm/hardsigmoid_arm.cpp +++ b/src/layer/arm/hardsigmoid_arm.cpp @@ -29,7 +29,9 @@ HardSigmoid_arm::HardSigmoid_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif } int HardSigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const @@ -46,8 +48,10 @@ int HardSigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) co } #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_inplace_bf16s(bottom_top_blob, opt); +#endif int w = bottom_top_blob.w; int h = bottom_top_blob.h; @@ -326,6 +330,7 @@ int HardSigmoid_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if NCNN_BF16 int HardSigmoid_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; @@ -404,5 +409,6 @@ int HardSigmoid_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& o return 0; } +#endif // NCNN_BF16 } // namespace ncnn diff --git a/src/layer/arm/hardsigmoid_arm.h b/src/layer/arm/hardsigmoid_arm.h index 93e61e17a..63fdb07d3 100644 --- a/src/layer/arm/hardsigmoid_arm.h +++ b/src/layer/arm/hardsigmoid_arm.h @@ -31,7 +31,9 @@ protected: int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; #endif +#if NCNN_BF16 int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; +#endif }; } // namespace ncnn diff --git a/src/layer/arm/hardswish_arm.cpp b/src/layer/arm/hardswish_arm.cpp index d5e8c0c1c..a3fd6bd6e 100644 --- a/src/layer/arm/hardswish_arm.cpp +++ b/src/layer/arm/hardswish_arm.cpp @@ -29,7 +29,9 @@ HardSwish_arm::HardSwish_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif } int HardSwish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const @@ -46,8 +48,10 @@ int HardSwish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons } #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_inplace_bf16s(bottom_top_blob, opt); +#endif int w = bottom_top_blob.w; int h = bottom_top_blob.h; @@ -333,6 +337,7 @@ int HardSwish_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if NCNN_BF16 int HardSwish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; @@ -413,5 +418,6 @@ int HardSwish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt return 0; } +#endif // NCNN_BF16 } // namespace ncnn diff --git a/src/layer/arm/hardswish_arm.h b/src/layer/arm/hardswish_arm.h index b47b124b7..9a1d7a560 100644 --- a/src/layer/arm/hardswish_arm.h +++ b/src/layer/arm/hardswish_arm.h @@ -31,7 +31,9 @@ protected: int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; #endif +#if NCNN_BF16 int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; +#endif }; } // namespace ncnn diff --git a/src/layer/arm/innerproduct_arm.cpp b/src/layer/arm/innerproduct_arm.cpp index 34edf9273..b0c7a58e0 100644 --- a/src/layer/arm/innerproduct_arm.cpp +++ b/src/layer/arm/innerproduct_arm.cpp @@ -34,7 +34,9 @@ InnerProduct_arm::InnerProduct_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif flatten = 0; activation = 0; @@ -69,10 +71,12 @@ int InnerProduct_arm::create_pipeline(const Option& opt) } #endif +#if NCNN_BF16 if (opt.use_bf16_storage) { return create_pipeline_bf16s(opt); } +#endif return 0; } @@ -117,8 +121,10 @@ int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Optio } #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_bf16s(bottom_blob, top_blob, opt); +#endif const int num_input = weight_data_size / num_output; @@ -1535,6 +1541,7 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if NCNN_BF16 int InnerProduct_arm::create_pipeline_bf16s(const Option& opt) { const int num_input = weight_data_size / num_output; @@ -1895,6 +1902,7 @@ int InnerProduct_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const return 0; } +#endif // NCNN_BF16 #if NCNN_INT8 int InnerProduct_arm::create_pipeline_int8_arm(const Option& opt) diff --git a/src/layer/arm/innerproduct_arm.h b/src/layer/arm/innerproduct_arm.h index 4cdc600bb..ba772486b 100644 --- a/src/layer/arm/innerproduct_arm.h +++ b/src/layer/arm/innerproduct_arm.h @@ -37,8 +37,10 @@ protected: int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif +#if NCNN_BF16 int create_pipeline_bf16s(const Option& opt); int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif #if NCNN_INT8 int create_pipeline_int8_arm(const Option& opt); int forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; @@ -52,8 +54,10 @@ public: Mat weight_data_fp16; Mat bias_data_fp16; +#if NCNN_BF16 // bf16 Mat weight_data_bf16; +#endif #if NCNN_INT8 // int8 diff --git a/src/layer/arm/instancenorm_arm.cpp b/src/layer/arm/instancenorm_arm.cpp index 2f80a8b38..4562c6fc3 100644 --- a/src/layer/arm/instancenorm_arm.cpp +++ b/src/layer/arm/instancenorm_arm.cpp @@ -29,7 +29,9 @@ InstanceNorm_arm::InstanceNorm_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif } int InstanceNorm_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const @@ -41,8 +43,10 @@ int InstanceNorm_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) c return forward_inplace_fp16s(bottom_top_blob, opt); #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_inplace_bf16s(bottom_top_blob, opt); +#endif int w = bottom_top_blob.w; int h = bottom_top_blob.h; @@ -473,6 +477,7 @@ int InstanceNorm_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if NCNN_BF16 int InstanceNorm_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; @@ -646,5 +651,6 @@ int InstanceNorm_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& return 0; } +#endif // NCNN_BF16 } // namespace ncnn diff --git a/src/layer/arm/instancenorm_arm.h b/src/layer/arm/instancenorm_arm.h index 0ea031d02..1e483b6df 100644 --- a/src/layer/arm/instancenorm_arm.h +++ b/src/layer/arm/instancenorm_arm.h @@ -30,7 +30,9 @@ protected: #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; #endif +#if NCNN_BF16 int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; +#endif }; } // namespace ncnn diff --git a/src/layer/arm/interp_arm.cpp b/src/layer/arm/interp_arm.cpp index 858507b18..86c9a3d31 100644 --- a/src/layer/arm/interp_arm.cpp +++ b/src/layer/arm/interp_arm.cpp @@ -23,15 +23,20 @@ namespace ncnn { #include "interp_bicubic.h" -#include "interp_bicubic_bf16s.h" #include "interp_bilinear.h" + +#if NCNN_BF16 +#include "interp_bicubic_bf16s.h" #include "interp_bilinear_bf16s.h" +#endif #if __ARM_NEON #include "interp_bicubic_pack4.h" -#include "interp_bicubic_pack4_bf16s.h" #include "interp_bilinear_pack4.h" +#if NCNN_BF16 +#include "interp_bicubic_pack4_bf16s.h" #include "interp_bilinear_pack4_bf16s.h" +#endif #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #include "interp_bicubic_fp16s.h" #include "interp_bicubic_pack4_fp16s.h" @@ -51,7 +56,9 @@ Interp_arm::Interp_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif } int Interp_arm::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const @@ -72,8 +79,10 @@ int Interp_arm::forward(const std::vector& bottom_blobs, std::vector& } #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_bf16s(bottom_blobs, top_blobs, opt); +#endif int h = bottom_blob.h; int w = bottom_blob.w; @@ -830,6 +839,7 @@ int Interp_arm::forward_fp16sa(const std::vector& bottom_blobs, std::vector } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if NCNN_BF16 int Interp_arm::forward_bf16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { const Mat& bottom_blob = bottom_blobs[0]; @@ -1052,5 +1062,6 @@ int Interp_arm::forward_bf16s(const std::vector& bottom_blobs, std::vector< return 0; } +#endif // NCNN_BF16 } // namespace ncnn diff --git a/src/layer/arm/interp_arm.h b/src/layer/arm/interp_arm.h index d70bef1dc..7ee5022c9 100644 --- a/src/layer/arm/interp_arm.h +++ b/src/layer/arm/interp_arm.h @@ -31,7 +31,9 @@ protected: int forward_fp16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; int forward_fp16sa(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; #endif +#if NCNN_BF16 int forward_bf16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +#endif }; } // namespace ncnn diff --git a/src/layer/arm/lstm_arm.cpp b/src/layer/arm/lstm_arm.cpp index dd34ef46c..7b8511b8f 100644 --- a/src/layer/arm/lstm_arm.cpp +++ b/src/layer/arm/lstm_arm.cpp @@ -32,7 +32,9 @@ LSTM_arm::LSTM_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif } int LSTM_arm::create_pipeline(const Option& opt) @@ -44,10 +46,12 @@ int LSTM_arm::create_pipeline(const Option& opt) } #endif +#if NCNN_BF16 if (opt.use_bf16_storage) { return create_pipeline_bf16s(opt); } +#endif // pack IFOG int num_directions = direction == 2 ? 2 : 1; @@ -349,8 +353,10 @@ int LSTM_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) } #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_bf16s(bottom_blob, top_blob, opt); +#endif int T = bottom_blob.h; @@ -436,8 +442,10 @@ int LSTM_arm::forward(const std::vector& bottom_blobs, std::vector& to } #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_bf16s(bottom_blobs, top_blobs, opt); +#endif int T = bottom_blob.h; Mat& top_blob = top_blobs[0]; @@ -1296,6 +1304,7 @@ int LSTM_arm::forward_fp16sa(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; #endif +#if NCNN_BF16 int create_pipeline_bf16s(const Option& opt); int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_bf16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +#endif public: Mat weight_xc_data_packed; diff --git a/src/layer/arm/mish_arm.cpp b/src/layer/arm/mish_arm.cpp index 0b3352428..a51a2ee21 100644 --- a/src/layer/arm/mish_arm.cpp +++ b/src/layer/arm/mish_arm.cpp @@ -35,7 +35,9 @@ Mish_arm::Mish_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif } int Mish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const @@ -52,8 +54,10 @@ int Mish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const } #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_inplace_bf16s(bottom_top_blob, opt); +#endif int w = bottom_top_blob.w; int h = bottom_top_blob.h; @@ -243,6 +247,7 @@ int Mish_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) co } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if NCNN_BF16 int Mish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; @@ -304,5 +309,6 @@ int Mish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) con return 0; } +#endif // NCNN_BF16 } // namespace ncnn diff --git a/src/layer/arm/mish_arm.h b/src/layer/arm/mish_arm.h index 3c968d9dc..81b566758 100644 --- a/src/layer/arm/mish_arm.h +++ b/src/layer/arm/mish_arm.h @@ -31,7 +31,9 @@ protected: int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; #endif +#if NCNN_BF16 int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; +#endif }; } // namespace ncnn diff --git a/src/layer/arm/padding_arm.cpp b/src/layer/arm/padding_arm.cpp index 6567fe061..8f61d07a9 100644 --- a/src/layer/arm/padding_arm.cpp +++ b/src/layer/arm/padding_arm.cpp @@ -38,7 +38,9 @@ Padding_arm::Padding_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif } int Padding_arm::create_pipeline(const Option& opt) @@ -50,12 +52,14 @@ int Padding_arm::create_pipeline(const Option& opt) } #endif +#if NCNN_BF16 if (opt.use_bf16_storage) { value_bf16 = float32_to_bfloat16(value); ncnn::cast_float32_to_bfloat16(per_channel_pad_data, per_channel_pad_data_bf16, opt); } +#endif return 0; } @@ -83,8 +87,10 @@ int Padding_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op return forward_bf16s_fp16s(bottom_blob, top_blob, opt); #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_bf16s_fp16s(bottom_blob, top_blob, opt); +#endif int w = bottom_blob.w; int h = bottom_blob.h; @@ -352,19 +358,28 @@ int Padding_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons { Mat borderm = top_blob.channel(q); -#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + // clang-format off + // *INDENT-OFF* uint16x4_t pad_value; +#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC if (opt.use_fp16_storage) { pad_value = per_channel_pad_data_size ? vreinterpret_u16_f16(vld1_f16((const __fp16*)per_channel_pad_data_fp16 + q * 4)) : vreinterpret_u16_f16(vdup_n_f16((__fp16)value)); } else +#endif +#if NCNN_BF16 + if (opt.use_bf16_storage) { pad_value = per_channel_pad_data_size ? vld1_u16((const unsigned short*)per_channel_pad_data_bf16 + q * 4) : vdup_n_u16(value_bf16); } -#else - uint16x4_t pad_value = per_channel_pad_data_size ? vld1_u16((const unsigned short*)per_channel_pad_data_bf16 + q * 4) : vdup_n_u16(value_bf16); + else #endif + { + } + // *INDENT-ON* + // clang-format on + //Channel padding if ((q - front_) < 0 || (q - front_) >= channels) { diff --git a/src/layer/arm/padding_arm.h b/src/layer/arm/padding_arm.h index 25c50e786..dfc088de0 100644 --- a/src/layer/arm/padding_arm.h +++ b/src/layer/arm/padding_arm.h @@ -34,9 +34,11 @@ protected: int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; public: +#if NCNN_BF16 // bf16 unsigned short value_bf16; Mat per_channel_pad_data_bf16; +#endif // fp16 Mat per_channel_pad_data_fp16; diff --git a/src/layer/arm/pixelshuffle_arm.cpp b/src/layer/arm/pixelshuffle_arm.cpp index 1b2548ce5..60581e921 100644 --- a/src/layer/arm/pixelshuffle_arm.cpp +++ b/src/layer/arm/pixelshuffle_arm.cpp @@ -31,7 +31,9 @@ PixelShuffle_arm::PixelShuffle_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif } int PixelShuffle_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const @@ -43,8 +45,10 @@ int PixelShuffle_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Optio return forward_bf16s_fp16s(bottom_blob, top_blob, opt); #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_bf16s_fp16s(bottom_blob, top_blob, opt); +#endif int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/arm/pooling_arm.cpp b/src/layer/arm/pooling_arm.cpp index d010b63f2..ad0a36332 100644 --- a/src/layer/arm/pooling_arm.cpp +++ b/src/layer/arm/pooling_arm.cpp @@ -39,7 +39,9 @@ Pooling_arm::Pooling_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif } int Pooling_arm::create_pipeline(const Option& /*opt*/) @@ -78,8 +80,10 @@ int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op } #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_bf16s(bottom_blob, top_blob, opt); +#endif // max value in NxN window // avg value in NxN window @@ -1235,6 +1239,7 @@ int Pooling_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Opt } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if NCNN_BF16 int Pooling_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { // max value in NxN window @@ -1644,5 +1649,6 @@ int Pooling_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Opti return 0; } +#endif // NCNN_BF16 } // namespace ncnn diff --git a/src/layer/arm/pooling_arm.h b/src/layer/arm/pooling_arm.h index 5dab6f5d3..de457bf57 100644 --- a/src/layer/arm/pooling_arm.h +++ b/src/layer/arm/pooling_arm.h @@ -32,7 +32,9 @@ protected: int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif +#if NCNN_BF16 int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif }; } // namespace ncnn diff --git a/src/layer/arm/prelu_arm.cpp b/src/layer/arm/prelu_arm.cpp index eb847e203..362272553 100644 --- a/src/layer/arm/prelu_arm.cpp +++ b/src/layer/arm/prelu_arm.cpp @@ -29,7 +29,9 @@ PReLU_arm::PReLU_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif } int PReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const @@ -46,8 +48,10 @@ int PReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const } #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_inplace_bf16s(bottom_top_blob, opt); +#endif int dims = bottom_top_blob.dims; int elempack = bottom_top_blob.elempack; @@ -816,6 +820,7 @@ int PReLU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if NCNN_BF16 int PReLU_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const { int dims = bottom_top_blob.dims; @@ -1033,5 +1038,6 @@ int PReLU_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) co return 0; } +#endif // NCNN_BF16 } // namespace ncnn diff --git a/src/layer/arm/prelu_arm.h b/src/layer/arm/prelu_arm.h index be610bce1..90befce2b 100644 --- a/src/layer/arm/prelu_arm.h +++ b/src/layer/arm/prelu_arm.h @@ -31,7 +31,9 @@ protected: int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; #endif +#if NCNN_BF16 int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; +#endif }; } // namespace ncnn diff --git a/src/layer/arm/quantize_arm.cpp b/src/layer/arm/quantize_arm.cpp index a486cd9bb..a9b05872e 100644 --- a/src/layer/arm/quantize_arm.cpp +++ b/src/layer/arm/quantize_arm.cpp @@ -34,7 +34,9 @@ Quantize_arm::Quantize_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif } int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const @@ -51,8 +53,10 @@ int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o } #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_bf16s(bottom_blob, top_blob, opt); +#endif int dims = bottom_blob.dims; int elempack = bottom_blob.elempack; @@ -1552,6 +1556,7 @@ int Quantize_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Op } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if NCNN_BF16 int Quantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { int dims = bottom_blob.dims; @@ -1953,5 +1958,6 @@ int Quantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Opt return 0; } +#endif // NCNN_BF16 } // namespace ncnn diff --git a/src/layer/arm/quantize_arm.h b/src/layer/arm/quantize_arm.h index f4edf9d6c..a5b9e5d02 100644 --- a/src/layer/arm/quantize_arm.h +++ b/src/layer/arm/quantize_arm.h @@ -31,7 +31,9 @@ protected: int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif +#if NCNN_BF16 int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif }; } // namespace ncnn diff --git a/src/layer/arm/relu_arm.cpp b/src/layer/arm/relu_arm.cpp index e319e68e3..f8d2b9925 100644 --- a/src/layer/arm/relu_arm.cpp +++ b/src/layer/arm/relu_arm.cpp @@ -29,7 +29,9 @@ ReLU_arm::ReLU_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif } int ReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const @@ -44,8 +46,10 @@ int ReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const return forward_inplace_fp16s(bottom_top_blob, opt); #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_inplace_bf16s(bottom_top_blob, opt); +#endif int w = bottom_top_blob.w; int h = bottom_top_blob.h; @@ -578,6 +582,7 @@ int ReLU_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) con } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if NCNN_BF16 int ReLU_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; @@ -881,6 +886,7 @@ int ReLU_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) con return 0; } +#endif // NCNN_BF16 int ReLU_arm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const { diff --git a/src/layer/arm/relu_arm.h b/src/layer/arm/relu_arm.h index 4a7bcbaba..cb1be789b 100644 --- a/src/layer/arm/relu_arm.h +++ b/src/layer/arm/relu_arm.h @@ -30,7 +30,9 @@ protected: #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; #endif +#if NCNN_BF16 int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; +#endif int forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const; }; diff --git a/src/layer/arm/reshape_arm.cpp b/src/layer/arm/reshape_arm.cpp index 81493d2d2..3b9c5cfec 100644 --- a/src/layer/arm/reshape_arm.cpp +++ b/src/layer/arm/reshape_arm.cpp @@ -29,7 +29,9 @@ Reshape_arm::Reshape_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif } int Reshape_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const @@ -41,8 +43,10 @@ int Reshape_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op return forward_bf16s_fp16s(bottom_blob, top_blob, opt); #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_bf16s_fp16s(bottom_blob, top_blob, opt); +#endif int elempack = bottom_blob.elempack; diff --git a/src/layer/arm/rnn_arm.cpp b/src/layer/arm/rnn_arm.cpp index 0de8b994b..9bd5bcc29 100644 --- a/src/layer/arm/rnn_arm.cpp +++ b/src/layer/arm/rnn_arm.cpp @@ -32,7 +32,9 @@ RNN_arm::RNN_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif } int RNN_arm::create_pipeline(const Option& opt) @@ -44,10 +46,12 @@ int RNN_arm::create_pipeline(const Option& opt) } #endif +#if NCNN_BF16 if (opt.use_bf16_storage) { return create_pipeline_bf16s(opt); } +#endif int num_directions = direction == 2 ? 2 : 1; int size = weight_data_size / num_directions / num_output; @@ -309,8 +313,10 @@ int RNN_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c } #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_bf16s(bottom_blob, top_blob, opt); +#endif int T = bottom_blob.h; @@ -390,8 +396,10 @@ int RNN_arm::forward(const std::vector& bottom_blobs, std::vector& top } #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_bf16s(bottom_blobs, top_blobs, opt); +#endif int T = bottom_blob.h; Mat& top_blob = top_blobs[0]; @@ -1067,6 +1075,7 @@ int RNN_arm::forward_fp16sa(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; #endif +#if NCNN_BF16 int create_pipeline_bf16s(const Option& opt); int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_bf16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +#endif public: Mat weight_xc_data_packed; diff --git a/src/layer/arm/shufflechannel_arm.cpp b/src/layer/arm/shufflechannel_arm.cpp index e657a5a99..54abfddeb 100644 --- a/src/layer/arm/shufflechannel_arm.cpp +++ b/src/layer/arm/shufflechannel_arm.cpp @@ -31,7 +31,9 @@ ShuffleChannel_arm::ShuffleChannel_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif } int ShuffleChannel_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const @@ -43,8 +45,10 @@ int ShuffleChannel_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Opt return forward_bf16s_fp16s(bottom_blob, top_blob, opt); #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_bf16s_fp16s(bottom_blob, top_blob, opt); +#endif int channels = bottom_blob.c; int elempack = bottom_blob.elempack; diff --git a/src/layer/arm/sigmoid_arm.cpp b/src/layer/arm/sigmoid_arm.cpp index c1d6b0497..f46057594 100644 --- a/src/layer/arm/sigmoid_arm.cpp +++ b/src/layer/arm/sigmoid_arm.cpp @@ -36,7 +36,9 @@ Sigmoid_arm::Sigmoid_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif } int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const @@ -53,8 +55,10 @@ int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const } #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_inplace_bf16s(bottom_top_blob, opt); +#endif int w = bottom_top_blob.w; int h = bottom_top_blob.h; @@ -247,6 +251,7 @@ int Sigmoid_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if NCNN_BF16 int Sigmoid_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; @@ -311,5 +316,6 @@ int Sigmoid_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) return 0; } +#endif // NCNN_BF16 } // namespace ncnn diff --git a/src/layer/arm/sigmoid_arm.h b/src/layer/arm/sigmoid_arm.h index a338c4919..5b52d5885 100644 --- a/src/layer/arm/sigmoid_arm.h +++ b/src/layer/arm/sigmoid_arm.h @@ -31,7 +31,9 @@ protected: int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; #endif +#if NCNN_BF16 int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; +#endif }; } // namespace ncnn diff --git a/src/layer/arm/slice_arm.cpp b/src/layer/arm/slice_arm.cpp index be3db7fe2..c9c19be85 100644 --- a/src/layer/arm/slice_arm.cpp +++ b/src/layer/arm/slice_arm.cpp @@ -25,7 +25,9 @@ Slice_arm::Slice_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif } int Slice_arm::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const @@ -37,8 +39,10 @@ int Slice_arm::forward(const std::vector& bottom_blobs, std::vector& t return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt); #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt); +#endif const Mat& bottom_blob = bottom_blobs[0]; int dims = bottom_blob.dims; diff --git a/src/layer/arm/swish_arm.cpp b/src/layer/arm/swish_arm.cpp index 80c003f8c..79f9c2f52 100644 --- a/src/layer/arm/swish_arm.cpp +++ b/src/layer/arm/swish_arm.cpp @@ -35,7 +35,9 @@ Swish_arm::Swish_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif } int Swish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const @@ -52,8 +54,10 @@ int Swish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const } #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_inplace_bf16s(bottom_top_blob, opt); +#endif int w = bottom_top_blob.w; int h = bottom_top_blob.h; @@ -250,6 +254,7 @@ int Swish_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if NCNN_BF16 int Swish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; @@ -313,5 +318,6 @@ int Swish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) co return 0; } +#endif // NCNN_BF16 } // namespace ncnn diff --git a/src/layer/arm/swish_arm.h b/src/layer/arm/swish_arm.h index e990983ac..7e38ac922 100644 --- a/src/layer/arm/swish_arm.h +++ b/src/layer/arm/swish_arm.h @@ -31,7 +31,9 @@ protected: int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; #endif +#if NCNN_BF16 int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; +#endif }; } // namespace ncnn diff --git a/src/layer/arm/tanh_arm.cpp b/src/layer/arm/tanh_arm.cpp index f4837e5c4..f71f62204 100644 --- a/src/layer/arm/tanh_arm.cpp +++ b/src/layer/arm/tanh_arm.cpp @@ -35,7 +35,9 @@ TanH_arm::TanH_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif } int TanH_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const @@ -52,8 +54,10 @@ int TanH_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const } #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_inplace_bf16s(bottom_top_blob, opt); +#endif int w = bottom_top_blob.w; int h = bottom_top_blob.h; @@ -243,6 +247,7 @@ int TanH_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) co } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if NCNN_BF16 int TanH_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; @@ -304,5 +309,6 @@ int TanH_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) con return 0; } +#endif // NCNN_BF16 } // namespace ncnn diff --git a/src/layer/arm/tanh_arm.h b/src/layer/arm/tanh_arm.h index 254a4427c..58697e7f7 100644 --- a/src/layer/arm/tanh_arm.h +++ b/src/layer/arm/tanh_arm.h @@ -31,7 +31,9 @@ protected: int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; #endif +#if NCNN_BF16 int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; +#endif }; } // namespace ncnn diff --git a/src/layer/arm/unaryop_arm.cpp b/src/layer/arm/unaryop_arm.cpp index 7ac1755c5..d809bb80a 100644 --- a/src/layer/arm/unaryop_arm.cpp +++ b/src/layer/arm/unaryop_arm.cpp @@ -35,7 +35,9 @@ UnaryOp_arm::UnaryOp_arm() #endif #endif // __ARM_NEON +#if NCNN_BF16 support_bf16_storage = true; +#endif } #if __ARM_NEON @@ -265,8 +267,10 @@ int UnaryOp_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const return forward_inplace_fp16s(bottom_top_blob, opt); #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_inplace_bf16s(bottom_top_blob, opt); +#endif int elempack = bottom_top_blob.elempack; @@ -1099,6 +1103,7 @@ static int unary_op_inplace_pack4_bf16s(Mat& a, const Option& opt) } #endif // __ARM_NEON +#if NCNN_BF16 template static int unary_op_inplace_bf16s(Mat& a, const Option& opt) { @@ -1375,5 +1380,6 @@ int UnaryOp_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) return 0; } +#endif // NCNN_BF16 } // namespace ncnn diff --git a/src/layer/arm/unaryop_arm.h b/src/layer/arm/unaryop_arm.h index 4a760e4cb..58e39e85a 100644 --- a/src/layer/arm/unaryop_arm.h +++ b/src/layer/arm/unaryop_arm.h @@ -30,7 +30,9 @@ protected: #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; #endif +#if NCNN_BF16 int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; +#endif }; } // namespace ncnn diff --git a/src/layer/riscv/concat_riscv.cpp b/src/layer/riscv/concat_riscv.cpp index d55e5a45f..55bee3a2e 100644 --- a/src/layer/riscv/concat_riscv.cpp +++ b/src/layer/riscv/concat_riscv.cpp @@ -35,7 +35,9 @@ Concat_riscv::Concat_riscv() #endif #endif // __riscv_vector +#if NCNN_BF16 support_bf16_storage = true; +#endif } int Concat_riscv::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const @@ -47,8 +49,10 @@ int Concat_riscv::forward(const std::vector& bottom_blobs, std::vector return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt); #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt); +#endif #if __riscv_vector const int packn = csrr_vlenb() / 4; diff --git a/src/layer/riscv/crop_riscv.cpp b/src/layer/riscv/crop_riscv.cpp index 68d3d000d..2800d0108 100644 --- a/src/layer/riscv/crop_riscv.cpp +++ b/src/layer/riscv/crop_riscv.cpp @@ -35,7 +35,9 @@ Crop_riscv::Crop_riscv() #endif #endif // __riscv_vector +#if NCNN_BF16 support_bf16_storage = true; +#endif } #if __riscv_vector diff --git a/src/layer/riscv/flatten_riscv.cpp b/src/layer/riscv/flatten_riscv.cpp index a9f6f7e8e..c8763d2b7 100644 --- a/src/layer/riscv/flatten_riscv.cpp +++ b/src/layer/riscv/flatten_riscv.cpp @@ -35,7 +35,9 @@ Flatten_riscv::Flatten_riscv() #endif #endif // __riscv_vector +#if NCNN_BF16 support_bf16_storage = true; +#endif } int Flatten_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const @@ -50,8 +52,10 @@ int Flatten_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& return forward_bf16s_fp16s(bottom_blob, top_blob, opt); #endif +#if NCNN_BF16 if (opt.use_bf16_storage && elembits == 16) return forward_bf16s_fp16s(bottom_blob, top_blob, opt); +#endif int dims = bottom_blob.dims; diff --git a/src/layer/riscv/padding_riscv.cpp b/src/layer/riscv/padding_riscv.cpp index c90212c1e..6d567cd77 100644 --- a/src/layer/riscv/padding_riscv.cpp +++ b/src/layer/riscv/padding_riscv.cpp @@ -39,7 +39,9 @@ Padding_riscv::Padding_riscv() #endif #endif // __riscv_vector +#if NCNN_BF16 support_bf16_storage = true; +#endif } int Padding_riscv::create_pipeline(const Option& opt) @@ -51,12 +53,14 @@ int Padding_riscv::create_pipeline(const Option& opt) } #endif +#if NCNN_BF16 if (opt.use_bf16_storage) { value_bf16 = float32_to_bfloat16(value); ncnn::cast_float32_to_bfloat16(per_channel_pad_data, per_channel_pad_data_bf16, opt); } +#endif return 0; } @@ -282,19 +286,28 @@ int Padding_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co { Mat borderm = top_blob.channel(q); -#if __riscv_zfh + // clang-format off + // *INDENT-OFF* vuint16m1_t pad_value; +#if __riscv_zfh if (opt.use_fp16_storage) { pad_value = per_channel_pad_data_size ? vreinterpret_v_f16m1_u16m1(vle16_v_f16m1((const __fp16*)per_channel_pad_data_fp16 + q * packn, vl)) : vreinterpret_v_f16m1_u16m1(vfmv_v_f_f16m1((__fp16)value, vl)); } else +#endif +#if NCNN_BF16 + if (opt.use_bf16_storage) { pad_value = per_channel_pad_data_size ? vle16_v_u16m1((const unsigned short*)per_channel_pad_data_bf16 + q * packn, vl) : vmv_v_x_u16m1(value_bf16, vl); } -#else - vuint16m1_t pad_value = per_channel_pad_data_size ? vle16_v_u16m1((const unsigned short*)per_channel_pad_data_bf16 + q * packn, vl) : vmv_v_x_u16m1(value_bf16, vl); + else #endif + { + } + // *INDENT-ON* + // clang-format on + //Channel padding if ((q - front_) < 0 || (q - front_) >= channels) { diff --git a/src/layer/riscv/padding_riscv.h b/src/layer/riscv/padding_riscv.h index d4ffcc327..c591806fa 100644 --- a/src/layer/riscv/padding_riscv.h +++ b/src/layer/riscv/padding_riscv.h @@ -34,9 +34,11 @@ protected: int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; public: +#if NCNN_BF16 // bf16 unsigned short value_bf16; Mat per_channel_pad_data_bf16; +#endif // fp16 Mat per_channel_pad_data_fp16; diff --git a/src/net.cpp b/src/net.cpp index 927411f0b..6c3ea57e5 100644 --- a/src/net.cpp +++ b/src/net.cpp @@ -766,6 +766,7 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio } else #endif // NCNN_RVV +#if NCNN_BF16 if (opt.use_bf16_storage) { if (bottom_blob.elembits() == 32 && layer->support_bf16_storage) @@ -781,6 +782,11 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio bottom_blob = bottom_blob_fp32; } } + else +#endif // NCNN_BF16 + { + // no type conversion + } // *INDENT-ON* // clang-format on @@ -2582,6 +2588,7 @@ int Extractor::extract(int blob_index, Mat& feat, int type) } else #endif // NCNN_ARM82 +#if NCNN_BF16 if (d->opt.use_bf16_storage && (type == 0)) { if (feat.elembits() == 16) @@ -2591,7 +2598,9 @@ int Extractor::extract(int blob_index, Mat& feat, int type) feat = feat_fp32; } } - else if (feat.elembits() == 8 && (type == 0)) + else +#endif // NCNN_BF16 + if (feat.elembits() == 8 && (type == 0)) { Mat feat_fp32; cast_int8_to_float32(feat, feat_fp32, d->opt); diff --git a/src/platform.h.in b/src/platform.h.in index 6f8723344..c686db45b 100644 --- a/src/platform.h.in +++ b/src/platform.h.in @@ -37,6 +37,7 @@ #cmakedefine01 NCNN_MMI #cmakedefine01 NCNN_RVV #cmakedefine01 NCNN_INT8 +#cmakedefine01 NCNN_BF16 #cmakedefine NCNN_VERSION_STRING "@NCNN_VERSION_STRING@"