cmake option NCNN_BF16 (#3068)

4 years ago · cdf45a6512
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -81,6 +81,7 @@ option(NCNN_COVERAGE "build for coverage" OFF)
 option(NCNN_BUILD_BENCHMARK "build benchmark" ON)
 option(NCNN_PYTHON "build python api" OFF)
 option(NCNN_INT8 "int8 inference" ON)
 option(NCNN_BF16 "bf16 inference" ON)

 if(ANDROID OR IOS OR NCNN_SIMPLESTL OR CMAKE_CROSSCOMPILING)
    option(NCNN_DISABLE_RTTI "disable rtti" ON)
--- a/src/layer/arm/batchnorm_arm.cpp
+++ b/src/layer/arm/batchnorm_arm.cpp
@@ -29,7 +29,9 @@ BatchNorm_arm::BatchNorm_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 int BatchNorm_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
@@ -46,8 +48,10 @@ int BatchNorm_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_inplace_bf16s(bottom_top_blob, opt);
 #endif

    int dims = bottom_top_blob.dims;
    int elempack = bottom_top_blob.elempack;
@@ -660,6 +664,7 @@ int BatchNorm_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op
 }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

 #if NCNN_BF16
 int BatchNorm_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
 {
    int dims = bottom_top_blob.dims;
@@ -829,5 +834,6 @@ int BatchNorm_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt

    return 0;
 }
 #endif // NCNN_BF16

 } // namespace ncnn
--- a/src/layer/arm/batchnorm_arm.h
+++ b/src/layer/arm/batchnorm_arm.h
@@ -31,7 +31,9 @@ protected:
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
    int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 #if NCNN_BF16
    // Declared only when the NCNN_BF16 macro is non-zero.
    // BatchNorm_arm::forward_inplace returns this function's result when
    // opt.use_bf16_storage is set and elembits == 16.
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 };

 } // namespace ncnn
--- a/src/layer/arm/binaryop_arm.cpp
+++ b/src/layer/arm/binaryop_arm.cpp
@@ -33,7 +33,9 @@ BinaryOp_arm::BinaryOp_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 #if __ARM_NEON
@@ -812,8 +814,10 @@ int BinaryOp_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>
        return forward_fp16s(bottom_blobs, top_blobs, opt);
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blobs, top_blobs, opt);
 #endif

    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& bottom_blob1 = bottom_blobs[1];
@@ -866,8 +870,10 @@ int BinaryOp_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
        return forward_inplace_fp16s(bottom_top_blob, opt);
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_inplace_bf16s(bottom_top_blob, opt);
 #endif

 #if __ARM_NEON
    int elempack = bottom_top_blob.elempack;
@@ -3258,6 +3264,7 @@ int BinaryOp_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt)
 }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

 #if NCNN_BF16
 #if __ARM_NEON
 template<typename Op>
 static int binary_op_pack4_bf16s(const Mat& a, const Mat& b, Mat& c, const Option& opt)
@@ -4727,5 +4734,6 @@ int BinaryOp_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt)

    return 0;
 }
 #endif // NCNN_BF16

 } // namespace ncnn
--- a/src/layer/arm/binaryop_arm.h
+++ b/src/layer/arm/binaryop_arm.h
@@ -33,8 +33,10 @@ protected:
    int forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 #if NCNN_BF16
    // Declared only when the NCNN_BF16 macro is non-zero.
    // BinaryOp_arm::forward hands off to forward_bf16s, and
    // BinaryOp_arm::forward_inplace to forward_inplace_bf16s, when
    // opt.use_bf16_storage is set and elembits == 16.
    int forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 };

 } // namespace ncnn
--- a/src/layer/arm/clip_arm.cpp
+++ b/src/layer/arm/clip_arm.cpp
@@ -29,7 +29,9 @@ Clip_arm::Clip_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 int Clip_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
@@ -41,8 +43,10 @@ int Clip_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
        return forward_inplace_fp16s(bottom_top_blob, opt);
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_inplace_bf16s(bottom_top_blob, opt);
 #endif

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
@@ -259,6 +263,7 @@ int Clip_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) con
 }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

 #if NCNN_BF16
 int Clip_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
 {
    int w = bottom_top_blob.w;
@@ -334,5 +339,6 @@ int Clip_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) con

    return 0;
 }
 #endif // NCNN_BF16

 } // namespace ncnn
--- a/src/layer/arm/clip_arm.h
+++ b/src/layer/arm/clip_arm.h
@@ -30,7 +30,9 @@ protected:
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 #if NCNN_BF16
    // Declared only when the NCNN_BF16 macro is non-zero.
    // Clip_arm::forward_inplace returns this function's result when
    // opt.use_bf16_storage is set and elembits == 16.
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 };

 } // namespace ncnn
--- a/src/layer/arm/concat_arm.cpp
+++ b/src/layer/arm/concat_arm.cpp
@@ -25,7 +25,9 @@ Concat_arm::Concat_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 int Concat_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
@@ -37,8 +39,10 @@ int Concat_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
        return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
 #endif

    int dims = bottom_blobs[0].dims;
    int positive_axis = axis < 0 ? dims + axis : axis;
--- a/src/layer/arm/convolution_arm.cpp
+++ b/src/layer/arm/convolution_arm.cpp
@@ -27,17 +27,20 @@

 namespace ncnn {

 #include "convolution_bf16s.h"
 #include "convolution_sgemm.h"

 #include "convolution_1x1.h"
 #include "convolution_1x1_bf16s.h"
 #include "convolution_2x2.h"
 #include "convolution_3x3.h"
 #include "convolution_4x4.h"
 #include "convolution_5x5.h"
 #include "convolution_7x7.h"

 #if NCNN_BF16
 // bf16 convolution headers; pulled into namespace ncnn only when the
 // NCNN_BF16 macro is non-zero.
 #include "convolution_bf16s.h"
 #include "convolution_1x1_bf16s.h"
 #endif // NCNN_BF16

 #if NCNN_INT8
 #include "convolution_sgemm_int8.h"
 #include "convolution_1x1_int8.h"
@@ -47,27 +50,30 @@ namespace ncnn {

 #if __ARM_NEON
 #include "convolution_pack4.h"
 #include "convolution_pack4_bf16s.h"
 #include "convolution_pack1to4.h"
 #include "convolution_pack1to4_bf16s.h"
 #include "convolution_pack4to1.h"
 #include "convolution_pack4to1_bf16s.h"
 #include "convolution_sgemm_pack4.h"
 #include "convolution_sgemm_pack4_bf16s.h"
 #include "convolution_1x1_pack4.h"
 #include "convolution_1x1_pack4_bf16s.h"
 #include "convolution_1x1_pack4to1.h"
 #include "convolution_1x1_pack4to1_bf16s.h"
 #include "convolution_3x3_pack1to4.h"
 #include "convolution_3x3_pack1to4_bf16s.h"
 #include "convolution_3x3_pack4.h"
 #include "convolution_3x3_pack4_bf16s.h"
 #include "convolution_3x3_pack4to1.h"
 #include "convolution_3x3_pack4to1_bf16s.h"
 #include "convolution_5x5_pack4.h"
 #include "convolution_5x5_pack4_bf16s.h"
 #include "convolution_7x7_pack1to4.h"

 #if NCNN_BF16
 // bf16 variants of the packed convolution headers. This group is nested in
 // the enclosing #if __ARM_NEON section, so it requires both __ARM_NEON and
 // NCNN_BF16 to be non-zero.
 #include "convolution_pack4_bf16s.h"
 #include "convolution_pack1to4_bf16s.h"
 #include "convolution_pack4to1_bf16s.h"
 #include "convolution_sgemm_pack4_bf16s.h"
 #include "convolution_1x1_pack4_bf16s.h"
 #include "convolution_1x1_pack4to1_bf16s.h"
 #include "convolution_3x3_pack1to4_bf16s.h"
 #include "convolution_3x3_pack4_bf16s.h"
 #include "convolution_3x3_pack4to1_bf16s.h"
 #include "convolution_5x5_pack4_bf16s.h"
 #include "convolution_7x7_pack1to4_bf16s.h"
 #endif // NCNN_BF16

 #if NCNN_INT8
 #include "convolution_pack8to4_int8.h"
@@ -122,7 +128,9 @@ Convolution_arm::Convolution_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif

    activation = 0;
    convolution_dilation1 = 0;
@@ -188,10 +196,12 @@ int Convolution_arm::create_pipeline(const Option& opt)
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage)
    {
        return create_pipeline_bf16s(opt);
    }
 #endif

    if ((!support_packing || !opt.use_packing_layout) && !opt.use_bf16_storage && kernel_w == kernel_h && dilation_w != 1 && dilation_h == dilation_w && stride_w == 1 && stride_h == 1)
    {
@@ -449,8 +459,10 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blob, top_blob, opt);
 #endif

    int w = bottom_blob.w;
    int h = bottom_blob.h;
@@ -1538,6 +1550,7 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const
 }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

 #if NCNN_BF16
 int Convolution_arm::create_pipeline_bf16s(const Option& opt)
 {
    const int maxk = kernel_w * kernel_h;
@@ -1812,6 +1825,7 @@ int Convolution_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const

    return 0;
 }
 #endif // NCNN_BF16

 #if NCNN_INT8
 int Convolution_arm::create_pipeline_int8_arm(const Option& opt)
--- a/src/layer/arm/convolution_arm.h
+++ b/src/layer/arm/convolution_arm.h
@@ -35,8 +35,10 @@ protected:
    int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif
 #if NCNN_BF16
    // Declared only when the NCNN_BF16 macro is non-zero.
    // Convolution_arm::create_pipeline returns create_pipeline_bf16s(opt) when
    // opt.use_bf16_storage is set; Convolution_arm::forward returns
    // forward_bf16s(...) when opt.use_bf16_storage is set and elembits == 16.
    int create_pipeline_bf16s(const Option& opt);
    int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif
 #if NCNN_INT8
    int create_pipeline_int8_arm(const Option& opt);
    int forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
@@ -66,8 +68,10 @@ public:
    Mat weight_data_fp16;
    Mat bias_data_fp16;

 #if NCNN_BF16
    // bf16
    // Member exists only when the NCNN_BF16 macro is non-zero, so any code
    // touching it must sit behind the same guard.
    // NOTE(review): presumably filled by create_pipeline_bf16s - confirm in the .cpp.
    Mat weight_data_bf16;
 #endif

 #if NCNN_INT8
    // int8
--- a/src/layer/arm/convolutiondepthwise_arm.cpp
+++ b/src/layer/arm/convolutiondepthwise_arm.cpp
@@ -34,9 +34,12 @@ namespace ncnn {

 #if __ARM_NEON
 #include "convolutiondepthwise_3x3_pack4.h"
 #include "convolutiondepthwise_3x3_pack4_bf16s.h"
 #include "convolutiondepthwise_5x5_pack4.h"

 #if NCNN_BF16
 // bf16 variants of the pack4 depthwise headers. This group is nested in the
 // enclosing #if __ARM_NEON section, so it requires both __ARM_NEON and
 // NCNN_BF16 to be non-zero.
 #include "convolutiondepthwise_3x3_pack4_bf16s.h"
 #include "convolutiondepthwise_5x5_pack4_bf16s.h"
 #endif // NCNN_BF16

 #if NCNN_INT8
 #include "convolutiondepthwise_3x3_pack8_int8.h"
@@ -58,7 +61,9 @@ ConvolutionDepthWise_arm::ConvolutionDepthWise_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif

    activation = 0;
 }
@@ -167,6 +172,7 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt)
        }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

 #if NCNN_BF16
        if (opt.use_bf16_storage)
        {
 #if __ARM_NEON
@@ -186,6 +192,7 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt)

            return 0;
        }
 #endif // NCNN_BF16

 #if __ARM_NEON
        // pack4
@@ -362,8 +369,10 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blob, top_blob, opt);
 #endif

    int w = bottom_blob.w;
    int h = bottom_blob.h;
@@ -1158,6 +1167,7 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl
 }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

 #if NCNN_BF16
 int ConvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    int w = bottom_blob.w;
@@ -1456,6 +1466,7 @@ int ConvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blo

    return 0;
 }
 #endif // NCNN_BF16

 #if NCNN_INT8
 int ConvolutionDepthWise_arm::create_pipeline_int8_arm(const Option& opt)
--- a/src/layer/arm/convolutiondepthwise_arm.h
+++ b/src/layer/arm/convolutiondepthwise_arm.h
@@ -35,7 +35,9 @@ protected:
    int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif
 #if NCNN_BF16
    // Declared only when the NCNN_BF16 macro is non-zero.
    // ConvolutionDepthWise_arm::forward returns this function's result when
    // opt.use_bf16_storage is set and elembits == 16.
    int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif
 #if NCNN_INT8
    int create_pipeline_int8_arm(const Option& opt);
    int forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
@@ -52,9 +54,11 @@ public:
    Mat weight_data_fp16;
    Mat bias_data_fp16;

 #if NCNN_BF16
    // bf16
    // Both members exist only when the NCNN_BF16 macro is non-zero, so any
    // code touching them must sit behind the same guard.
    // NOTE(review): the names suggest an unpacked and a pack4 weight copy -
    // confirm against create_pipeline in the .cpp.
    Mat weight_data_bf16;
    Mat weight_data_pack4_bf16;
 #endif

 #if NCNN_INT8
    // int8
--- a/src/layer/arm/crop_arm.cpp
+++ b/src/layer/arm/crop_arm.cpp
@@ -29,7 +29,9 @@ Crop_arm::Crop_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 #if __ARM_NEON
--- a/src/layer/arm/deconvolution_arm.cpp
+++ b/src/layer/arm/deconvolution_arm.cpp
@@ -40,7 +40,9 @@ Deconvolution_arm::Deconvolution_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif

    activation = 0;
 }
@@ -91,10 +93,12 @@ int Deconvolution_arm::create_pipeline(const Option& opt)
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage)
    {
        return create_pipeline_bf16s(opt);
    }
 #endif

    const int maxk = kernel_w * kernel_h;
    int num_input = weight_data_size / maxk / num_output;
@@ -308,8 +312,10 @@ int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blob, top_blob, opt);
 #endif

    // deconvolv with NxN kernel
    // value = value + bias
@@ -1899,6 +1905,7 @@ int Deconvolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
 }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

 #if NCNN_BF16
 int Deconvolution_arm::create_pipeline_bf16s(const Option& opt)
 {
    const int maxk = kernel_w * kernel_h;
@@ -2337,5 +2344,6 @@ int Deconvolution_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, cons

    return 0;
 }
 #endif // NCNN_BF16

 } // namespace ncnn
--- a/src/layer/arm/deconvolution_arm.h
+++ b/src/layer/arm/deconvolution_arm.h
@@ -35,8 +35,10 @@ protected:
    int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif
 #if NCNN_BF16
    // Declared only when the NCNN_BF16 macro is non-zero.
    // Deconvolution_arm::create_pipeline returns create_pipeline_bf16s(opt)
    // when opt.use_bf16_storage is set; Deconvolution_arm::forward returns
    // forward_bf16s(...) when opt.use_bf16_storage is set and elembits == 16.
    int create_pipeline_bf16s(const Option& opt);
    int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif

 public:
    Layer* activation;
@@ -51,8 +53,10 @@ public:
    Mat weight_data_fp16;
    Mat bias_data_fp16;

 #if NCNN_BF16
    // bf16
    // Member exists only when the NCNN_BF16 macro is non-zero, so any code
    // touching it must sit behind the same guard.
    // NOTE(review): presumably filled by create_pipeline_bf16s - confirm in the .cpp.
    Mat weight_data_bf16;
 #endif
 };

 } // namespace ncnn
--- a/src/layer/arm/deconvolutiondepthwise_arm.cpp
+++ b/src/layer/arm/deconvolutiondepthwise_arm.cpp
@@ -33,7 +33,9 @@ DeconvolutionDepthWise_arm::DeconvolutionDepthWise_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt)
@@ -101,6 +103,7 @@ int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt)
        }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

 #if NCNN_BF16
        if (opt.use_bf16_storage)
        {
 #if __ARM_NEON
@@ -120,6 +123,7 @@ int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt)

            return 0;
        }
 #endif // NCNN_BF16

 #if __ARM_NEON
        // pack4
@@ -228,8 +232,10 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, c
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blob, top_blob, opt);
 #endif

    // convolv with NxN kernel
    // value = value + bias
@@ -986,6 +992,7 @@ int DeconvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_
 }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

 #if NCNN_BF16
 int DeconvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    int w = bottom_blob.w;
@@ -1224,5 +1231,6 @@ int DeconvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_b

    return 0;
 }
 #endif // NCNN_BF16

 } // namespace ncnn
--- a/src/layer/arm/deconvolutiondepthwise_arm.h
+++ b/src/layer/arm/deconvolutiondepthwise_arm.h
@@ -34,7 +34,9 @@ protected:
    int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif
 #if NCNN_BF16
    // Declared only when the NCNN_BF16 macro is non-zero.
    // DeconvolutionDepthWise_arm::forward returns this function's result when
    // opt.use_bf16_storage is set and elembits == 16.
    int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif

 public:
    std::vector<ncnn::Layer*> group_ops;
@@ -47,8 +49,10 @@ public:
    Mat weight_data_fp16;
    Mat bias_data_fp16;

 #if NCNN_BF16
    // bf16
    // Member exists only when the NCNN_BF16 macro is non-zero, so any code
    // touching it must sit behind the same guard.
    // NOTE(review): presumably written in the opt.use_bf16_storage branch of
    // create_pipeline - confirm in the .cpp.
    Mat weight_data_bf16;
 #endif
 };

 } // namespace ncnn
--- a/src/layer/arm/dequantize_arm.cpp
+++ b/src/layer/arm/dequantize_arm.cpp
@@ -30,7 +30,9 @@ Dequantize_arm::Dequantize_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 int Dequantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
@@ -47,8 +49,10 @@ int Dequantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option&
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage)
        return forward_bf16s(bottom_blob, top_blob, opt);
 #endif

    int dims = bottom_blob.dims;
    int elempack = bottom_blob.elempack;
@@ -2285,6 +2289,7 @@ int Dequantize_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const
 }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

 #if NCNN_BF16
 int Dequantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    int dims = bottom_blob.dims;
@@ -3038,5 +3043,6 @@ int Dequantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const O

    return 0;
 }
 #endif // NCNN_BF16

 } // namespace ncnn
--- a/src/layer/arm/dequantize_arm.h
+++ b/src/layer/arm/dequantize_arm.h
@@ -31,7 +31,9 @@ protected:
    int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif
 #if NCNN_BF16
    // Declared only when the NCNN_BF16 macro is non-zero.
    // Dequantize_arm::forward returns this function's result whenever
    // opt.use_bf16_storage is set; unlike the other arm layers in this change,
    // that dispatch does not also test elembits.
    int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif
 };

 } // namespace ncnn
--- a/src/layer/arm/eltwise_arm.cpp
+++ b/src/layer/arm/eltwise_arm.cpp
@@ -29,7 +29,9 @@ Eltwise_arm::Eltwise_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
@@ -46,8 +48,10 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blobs, top_blobs, opt);
 #endif

    const Mat& bottom_blob = bottom_blobs[0];
    int w = bottom_blob.w;
@@ -2213,6 +2217,7 @@ int Eltwise_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vecto
 }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

 #if NCNN_BF16
 int Eltwise_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
 {
    const Mat& bottom_blob = bottom_blobs[0];
@@ -2980,5 +2985,6 @@ int Eltwise_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector

    return 0;
 }
 #endif // NCNN_BF16

 } // namespace ncnn
--- a/src/layer/arm/eltwise_arm.h
+++ b/src/layer/arm/eltwise_arm.h
@@ -31,7 +31,9 @@ protected:
    int forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
    int forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 #endif
 #if NCNN_BF16
    // Declared only when the NCNN_BF16 macro is non-zero.
    // Eltwise_arm::forward returns this function's result when
    // opt.use_bf16_storage is set and elembits == 16.
    int forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 #endif
 };

 } // namespace ncnn
--- a/src/layer/arm/flatten_arm.cpp
+++ b/src/layer/arm/flatten_arm.cpp
@@ -29,7 +29,9 @@ Flatten_arm::Flatten_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif // NCNN_BF16
 }

 int Flatten_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
@@ -44,8 +46,10 @@ int Flatten_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op
        return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
 #endif

    int dims = bottom_blob.dims;

--- a/src/layer/arm/gru_arm.cpp
+++ b/src/layer/arm/gru_arm.cpp
@@ -32,7 +32,9 @@ GRU_arm::GRU_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 int GRU_arm::create_pipeline(const Option& opt)
@@ -44,10 +46,12 @@ int GRU_arm::create_pipeline(const Option& opt)
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage)
    {
        return create_pipeline_bf16s(opt);
    }
 #endif

    // pack RUN
    int num_directions = direction == 2 ? 2 : 1;
@@ -627,8 +631,10 @@ int GRU_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blob, top_blob, opt);
 #endif

    int T = bottom_blob.h;

@@ -708,8 +714,10 @@ int GRU_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blobs, top_blobs, opt);
 #endif

    int T = bottom_blob.h;
    Mat& top_blob = top_blobs[0];
@@ -1727,6 +1735,7 @@ int GRU_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Ma
 }
 #endif

 #if NCNN_BF16
 static int gru_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt)
 {
    int size = bottom_blob.w;
@@ -2378,5 +2387,6 @@ int GRU_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat

    return 0;
 }
 #endif // NCNN_BF16

 } // namespace ncnn
--- a/src/layer/arm/gru_arm.h
+++ b/src/layer/arm/gru_arm.h
@@ -36,9 +36,11 @@ protected:
    int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 #endif
 #if NCNN_BF16
    // Declared only when the NCNN_BF16 macro is non-zero.
    // GRU_arm::create_pipeline returns create_pipeline_bf16s(opt) when
    // opt.use_bf16_storage is set. Each GRU_arm::forward overload returns the
    // forward_bf16s overload with the matching signature when
    // opt.use_bf16_storage is set and elembits == 16.
    int create_pipeline_bf16s(const Option& opt);
    int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 #endif

 public:
    Mat weight_xc_data_packed;
--- a/src/layer/arm/hardsigmoid_arm.cpp
+++ b/src/layer/arm/hardsigmoid_arm.cpp
@@ -29,7 +29,9 @@ HardSigmoid_arm::HardSigmoid_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 int HardSigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
@@ -46,8 +48,10 @@ int HardSigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_inplace_bf16s(bottom_top_blob, opt);
 #endif

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
@@ -326,6 +330,7 @@ int HardSigmoid_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option&
 }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

 #if NCNN_BF16
 int HardSigmoid_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
 {
    int w = bottom_top_blob.w;
@@ -404,5 +409,6 @@ int HardSigmoid_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& o

    return 0;
 }
 #endif // NCNN_BF16

 } // namespace ncnn
--- a/src/layer/arm/hardsigmoid_arm.h
+++ b/src/layer/arm/hardsigmoid_arm.h
@@ -31,7 +31,9 @@ protected:
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
    int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 #if NCNN_BF16
    // Declared only when the NCNN_BF16 macro is non-zero.
    // HardSigmoid_arm::forward_inplace returns this function's result when
    // opt.use_bf16_storage is set and elembits == 16.
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 };

 } // namespace ncnn
--- a/src/layer/arm/hardswish_arm.cpp
+++ b/src/layer/arm/hardswish_arm.cpp
@@ -29,7 +29,9 @@ HardSwish_arm::HardSwish_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 int HardSwish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
@@ -46,8 +48,10 @@ int HardSwish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_inplace_bf16s(bottom_top_blob, opt);
 #endif

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
@@ -333,6 +337,7 @@ int HardSwish_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op
 }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

 #if NCNN_BF16
 int HardSwish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
 {
    int w = bottom_top_blob.w;
@@ -413,5 +418,6 @@ int HardSwish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt

    return 0;
 }
 #endif // NCNN_BF16

 } // namespace ncnn
--- a/src/layer/arm/hardswish_arm.h
+++ b/src/layer/arm/hardswish_arm.h
@@ -31,7 +31,9 @@ protected:
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
    int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 #if NCNN_BF16
    // Declared only when the NCNN_BF16 macro is non-zero.
    // HardSwish_arm::forward_inplace returns this function's result when
    // opt.use_bf16_storage is set and elembits == 16.
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 };

 } // namespace ncnn
--- a/src/layer/arm/innerproduct_arm.cpp
+++ b/src/layer/arm/innerproduct_arm.cpp
@@ -34,7 +34,9 @@ InnerProduct_arm::InnerProduct_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif

    flatten = 0;
    activation = 0;
@@ -69,10 +71,12 @@ int InnerProduct_arm::create_pipeline(const Option& opt)
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage)
    {
        return create_pipeline_bf16s(opt);
    }
 #endif

    return 0;
 }
@@ -117,8 +121,10 @@ int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blob, top_blob, opt);
 #endif

    const int num_input = weight_data_size / num_output;

@@ -1535,6 +1541,7 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons
 }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

 #if NCNN_BF16
 int InnerProduct_arm::create_pipeline_bf16s(const Option& opt)
 {
    const int num_input = weight_data_size / num_output;
@@ -1895,6 +1902,7 @@ int InnerProduct_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const

    return 0;
 }
 #endif // NCNN_BF16

 #if NCNN_INT8
 int InnerProduct_arm::create_pipeline_int8_arm(const Option& opt)
--- a/src/layer/arm/innerproduct_arm.h
+++ b/src/layer/arm/innerproduct_arm.h
@@ -37,8 +37,10 @@ protected:
    int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif
 #if NCNN_BF16
    // Declared only when the NCNN_BF16 macro is non-zero.
    // InnerProduct_arm::create_pipeline returns create_pipeline_bf16s(opt)
    // when opt.use_bf16_storage is set; InnerProduct_arm::forward returns
    // forward_bf16s(...) when opt.use_bf16_storage is set and elembits == 16.
    int create_pipeline_bf16s(const Option& opt);
    int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif
 #if NCNN_INT8
    int create_pipeline_int8_arm(const Option& opt);
    int forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
@@ -52,8 +54,10 @@ public:
    Mat weight_data_fp16;
    Mat bias_data_fp16;

 #if NCNN_BF16
    // bf16
    // Member exists only when the NCNN_BF16 macro is non-zero, so any code
    // touching it must sit behind the same guard.
    // NOTE(review): presumably filled by create_pipeline_bf16s - confirm in the .cpp.
    Mat weight_data_bf16;
 #endif

 #if NCNN_INT8
    // int8
--- a/src/layer/arm/instancenorm_arm.cpp
+++ b/src/layer/arm/instancenorm_arm.cpp
@@ -29,7 +29,9 @@ InstanceNorm_arm::InstanceNorm_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 int InstanceNorm_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
@@ -41,8 +43,10 @@ int InstanceNorm_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) c
        return forward_inplace_fp16s(bottom_top_blob, opt);
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_inplace_bf16s(bottom_top_blob, opt);
 #endif

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
@@ -473,6 +477,7 @@ int InstanceNorm_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option&
 }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

 #if NCNN_BF16
 int InstanceNorm_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
 {
    int w = bottom_top_blob.w;
@@ -646,5 +651,6 @@ int InstanceNorm_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option&

    return 0;
 }
 #endif // NCNN_BF16

 } // namespace ncnn
--- a/src/layer/arm/instancenorm_arm.h
+++ b/src/layer/arm/instancenorm_arm.h
@@ -30,7 +30,9 @@ protected:
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 #if NCNN_BF16
    // Declared only when the NCNN_BF16 macro is non-zero.
    // InstanceNorm_arm::forward_inplace returns this function's result when
    // opt.use_bf16_storage is set and elembits == 16.
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 };

 } // namespace ncnn
--- a/src/layer/arm/interp_arm.cpp
+++ b/src/layer/arm/interp_arm.cpp
@@ -23,15 +23,20 @@
 namespace ncnn {

 #include "interp_bicubic.h"
 #include "interp_bicubic_bf16s.h"
 #include "interp_bilinear.h"

 #if NCNN_BF16
 // bf16 interpolation headers; pulled into namespace ncnn only when the
 // NCNN_BF16 macro is non-zero. This group sits before the #if __ARM_NEON
 // section, so it does not depend on __ARM_NEON.
 #include "interp_bicubic_bf16s.h"
 #include "interp_bilinear_bf16s.h"
 #endif

 #if __ARM_NEON
 #include "interp_bicubic_pack4.h"
 #include "interp_bicubic_pack4_bf16s.h"
 #include "interp_bilinear_pack4.h"
 #if NCNN_BF16
 // bf16 variants of the pack4 interpolation headers. This group is nested in
 // the enclosing #if __ARM_NEON section, so it requires both __ARM_NEON and
 // NCNN_BF16 to be non-zero.
 #include "interp_bicubic_pack4_bf16s.h"
 #include "interp_bilinear_pack4_bf16s.h"
 #endif
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 #include "interp_bicubic_fp16s.h"
 #include "interp_bicubic_pack4_fp16s.h"
@@ -51,7 +56,9 @@ Interp_arm::Interp_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 int Interp_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
@@ -72,8 +79,10 @@ int Interp_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blobs, top_blobs, opt);
 #endif

    int h = bottom_blob.h;
    int w = bottom_blob.w;
@@ -830,6 +839,7 @@ int Interp_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector
 }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

 #if NCNN_BF16
 int Interp_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
 {
    const Mat& bottom_blob = bottom_blobs[0];
@@ -1052,5 +1062,6 @@ int Interp_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<

    return 0;
 }
 #endif // NCNN_BF16

 } // namespace ncnn
--- a/src/layer/arm/interp_arm.h
+++ b/src/layer/arm/interp_arm.h
@@ -31,7 +31,9 @@ protected:
    int forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
    int forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 #endif
 #if NCNN_BF16
    // Declared only when the NCNN_BF16 macro is non-zero.
    // Interp_arm::forward returns this function's result when
    // opt.use_bf16_storage is set and elembits == 16.
    int forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 #endif
 };

 } // namespace ncnn
--- a/src/layer/arm/lstm_arm.cpp
+++ b/src/layer/arm/lstm_arm.cpp
@@ -32,7 +32,9 @@ LSTM_arm::LSTM_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 int LSTM_arm::create_pipeline(const Option& opt)
@@ -44,10 +46,12 @@ int LSTM_arm::create_pipeline(const Option& opt)
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage)
    {
        return create_pipeline_bf16s(opt);
    }
 #endif

    // pack IFOG
    int num_directions = direction == 2 ? 2 : 1;
@@ -349,8 +353,10 @@ int LSTM_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blob, top_blob, opt);
 #endif

    int T = bottom_blob.h;

@@ -436,8 +442,10 @@ int LSTM_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blobs, top_blobs, opt);
 #endif

    int T = bottom_blob.h;
    Mat& top_blob = top_blobs[0];
@@ -1296,6 +1304,7 @@ int LSTM_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<M
 }
 #endif

 #if NCNN_BF16
 static int lstm_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, Mat& cell_state, const Option& opt)
 {
    int size = bottom_blob.w;
@@ -1680,5 +1689,6 @@ int LSTM_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Ma

    return 0;
 }
 #endif // NCNN_BF16

 } // namespace ncnn
--- a/src/layer/arm/lstm_arm.h
+++ b/src/layer/arm/lstm_arm.h
@@ -36,9 +36,11 @@ protected:
    int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 #endif
 #if NCNN_BF16
    int create_pipeline_bf16s(const Option& opt);
    int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 #endif

 public:
    Mat weight_xc_data_packed;
--- a/src/layer/arm/mish_arm.cpp
+++ b/src/layer/arm/mish_arm.cpp
@@ -35,7 +35,9 @@ Mish_arm::Mish_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 int Mish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
@@ -52,8 +54,10 @@ int Mish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_inplace_bf16s(bottom_top_blob, opt);
 #endif

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
@@ -243,6 +247,7 @@ int Mish_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) co
 }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

 #if NCNN_BF16
 int Mish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
 {
    int w = bottom_top_blob.w;
@@ -304,5 +309,6 @@ int Mish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) con

    return 0;
 }
 #endif // NCNN_BF16

 } // namespace ncnn
--- a/src/layer/arm/mish_arm.h
+++ b/src/layer/arm/mish_arm.h
@@ -31,7 +31,9 @@ protected:
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
    int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 #if NCNN_BF16
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 };

 } // namespace ncnn
--- a/src/layer/arm/padding_arm.cpp
+++ b/src/layer/arm/padding_arm.cpp
@@ -38,7 +38,9 @@ Padding_arm::Padding_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 int Padding_arm::create_pipeline(const Option& opt)
@@ -50,12 +52,14 @@ int Padding_arm::create_pipeline(const Option& opt)
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage)
    {
        value_bf16 = float32_to_bfloat16(value);

        ncnn::cast_float32_to_bfloat16(per_channel_pad_data, per_channel_pad_data_bf16, opt);
    }
 #endif

    return 0;
 }
@@ -83,8 +87,10 @@ int Padding_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op
        return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
 #endif

    int w = bottom_blob.w;
    int h = bottom_blob.h;
@@ -352,19 +358,28 @@ int Padding_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons
                {
                    Mat borderm = top_blob.channel(q);

 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                    // clang-format off
                    // *INDENT-OFF*
                    uint16x4_t pad_value;
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                    if (opt.use_fp16_storage)
                    {
                        pad_value = per_channel_pad_data_size ? vreinterpret_u16_f16(vld1_f16((const __fp16*)per_channel_pad_data_fp16 + q * 4)) : vreinterpret_u16_f16(vdup_n_f16((__fp16)value));
                    }
                    else
 #endif
 #if NCNN_BF16
                    if (opt.use_bf16_storage)
                    {
                        pad_value = per_channel_pad_data_size ? vld1_u16((const unsigned short*)per_channel_pad_data_bf16 + q * 4) : vdup_n_u16(value_bf16);
                    }
 #else
                    uint16x4_t pad_value = per_channel_pad_data_size ? vld1_u16((const unsigned short*)per_channel_pad_data_bf16 + q * 4) : vdup_n_u16(value_bf16);
                    else
 #endif
                    {
                    }
                    // *INDENT-ON*
                    // clang-format on

                    //Channel padding
                    if ((q - front_) < 0 || (q - front_) >= channels)
                    {
--- a/src/layer/arm/padding_arm.h
+++ b/src/layer/arm/padding_arm.h
@@ -34,9 +34,11 @@ protected:
    int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

 public:
 #if NCNN_BF16
    // bf16
    unsigned short value_bf16;
    Mat per_channel_pad_data_bf16;
 #endif

    // fp16
    Mat per_channel_pad_data_fp16;
--- a/src/layer/arm/pixelshuffle_arm.cpp
+++ b/src/layer/arm/pixelshuffle_arm.cpp
@@ -31,7 +31,9 @@ PixelShuffle_arm::PixelShuffle_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 int PixelShuffle_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
@@ -43,8 +45,10 @@ int PixelShuffle_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
        return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
 #endif

    int w = bottom_blob.w;
    int h = bottom_blob.h;
--- a/src/layer/arm/pooling_arm.cpp
+++ b/src/layer/arm/pooling_arm.cpp
@@ -39,7 +39,9 @@ Pooling_arm::Pooling_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 int Pooling_arm::create_pipeline(const Option& /*opt*/)
@@ -78,8 +80,10 @@ int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blob, top_blob, opt);
 #endif

    // max value in NxN window
    // avg value in NxN window
@@ -1235,6 +1239,7 @@ int Pooling_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Opt
 }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

 #if NCNN_BF16
 int Pooling_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    // max value in NxN window
@@ -1644,5 +1649,6 @@ int Pooling_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Opti

    return 0;
 }
 #endif // NCNN_BF16

 } // namespace ncnn
--- a/src/layer/arm/pooling_arm.h
+++ b/src/layer/arm/pooling_arm.h
@@ -32,7 +32,9 @@ protected:
    int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif
 #if NCNN_BF16
    int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif
 };

 } // namespace ncnn
--- a/src/layer/arm/prelu_arm.cpp
+++ b/src/layer/arm/prelu_arm.cpp
@@ -29,7 +29,9 @@ PReLU_arm::PReLU_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 int PReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
@@ -46,8 +48,10 @@ int PReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_inplace_bf16s(bottom_top_blob, opt);
 #endif

    int dims = bottom_top_blob.dims;
    int elempack = bottom_top_blob.elempack;
@@ -816,6 +820,7 @@ int PReLU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c
 }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

 #if NCNN_BF16
 int PReLU_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
 {
    int dims = bottom_top_blob.dims;
@@ -1033,5 +1038,6 @@ int PReLU_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) co

    return 0;
 }
 #endif // NCNN_BF16

 } // namespace ncnn
--- a/src/layer/arm/prelu_arm.h
+++ b/src/layer/arm/prelu_arm.h
@@ -31,7 +31,9 @@ protected:
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
    int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 #if NCNN_BF16
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 };

 } // namespace ncnn
--- a/src/layer/arm/quantize_arm.cpp
+++ b/src/layer/arm/quantize_arm.cpp
@@ -34,7 +34,9 @@ Quantize_arm::Quantize_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
@@ -51,8 +53,10 @@ int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blob, top_blob, opt);
 #endif

    int dims = bottom_blob.dims;
    int elempack = bottom_blob.elempack;
@@ -1552,6 +1556,7 @@ int Quantize_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Op
 }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

 #if NCNN_BF16
 int Quantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    int dims = bottom_blob.dims;
@@ -1953,5 +1958,6 @@ int Quantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Opt

    return 0;
 }
 #endif // NCNN_BF16

 } // namespace ncnn
--- a/src/layer/arm/quantize_arm.h
+++ b/src/layer/arm/quantize_arm.h
@@ -31,7 +31,9 @@ protected:
    int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif
 #if NCNN_BF16
    int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif
 };

 } // namespace ncnn
--- a/src/layer/arm/relu_arm.cpp
+++ b/src/layer/arm/relu_arm.cpp
@@ -29,7 +29,9 @@ ReLU_arm::ReLU_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 int ReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
@@ -44,8 +46,10 @@ int ReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
        return forward_inplace_fp16s(bottom_top_blob, opt);
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_inplace_bf16s(bottom_top_blob, opt);
 #endif

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
@@ -578,6 +582,7 @@ int ReLU_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) con
 }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

 #if NCNN_BF16
 int ReLU_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
 {
    int w = bottom_top_blob.w;
@@ -881,6 +886,7 @@ int ReLU_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) con

    return 0;
 }
 #endif // NCNN_BF16

 int ReLU_arm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const
 {
--- a/src/layer/arm/relu_arm.h
+++ b/src/layer/arm/relu_arm.h
@@ -30,7 +30,9 @@ protected:
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 #if NCNN_BF16
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
 #endif
    int forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const;
 };

--- a/src/layer/arm/reshape_arm.cpp
+++ b/src/layer/arm/reshape_arm.cpp
@@ -29,7 +29,9 @@ Reshape_arm::Reshape_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 int Reshape_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
@@ -41,8 +43,10 @@ int Reshape_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op
        return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
 #endif

    int elempack = bottom_blob.elempack;

--- a/src/layer/arm/rnn_arm.cpp
+++ b/src/layer/arm/rnn_arm.cpp
@@ -32,7 +32,9 @@ RNN_arm::RNN_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 int RNN_arm::create_pipeline(const Option& opt)
@@ -44,10 +46,12 @@ int RNN_arm::create_pipeline(const Option& opt)
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage)
    {
        return create_pipeline_bf16s(opt);
    }
 #endif

    int num_directions = direction == 2 ? 2 : 1;
    int size = weight_data_size / num_directions / num_output;
@@ -309,8 +313,10 @@ int RNN_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blob, top_blob, opt);
 #endif

    int T = bottom_blob.h;

@@ -390,8 +396,10 @@ int RNN_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blobs, top_blobs, opt);
 #endif

    int T = bottom_blob.h;
    Mat& top_blob = top_blobs[0];
@@ -1067,6 +1075,7 @@ int RNN_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Ma
 }
 #endif

 #if NCNN_BF16
 static int rnn_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt)
 {
    int size = bottom_blob.w;
@@ -1400,5 +1409,6 @@ int RNN_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat

    return 0;
 }
 #endif // NCNN_BF16

 } // namespace ncnn
--- a/src/layer/arm/rnn_arm.h
+++ b/src/layer/arm/rnn_arm.h
@@ -36,9 +36,11 @@ protected:
    int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 #endif
 #if NCNN_BF16
    int create_pipeline_bf16s(const Option& opt);
    int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 #endif

 public:
    Mat weight_xc_data_packed;
--- a/src/layer/arm/shufflechannel_arm.cpp
+++ b/src/layer/arm/shufflechannel_arm.cpp
@@ -31,7 +31,9 @@ ShuffleChannel_arm::ShuffleChannel_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 int ShuffleChannel_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
@@ -43,8 +45,10 @@ int ShuffleChannel_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Opt
        return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
 #endif

    int channels = bottom_blob.c;
    int elempack = bottom_blob.elempack;
--- a/src/layer/arm/sigmoid_arm.cpp
+++ b/src/layer/arm/sigmoid_arm.cpp
@@ -36,7 +36,9 @@ Sigmoid_arm::Sigmoid_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
@@ -53,8 +55,10 @@ int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_inplace_bf16s(bottom_top_blob, opt);
 #endif

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
@@ -247,6 +251,7 @@ int Sigmoid_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt)
 }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

 #if NCNN_BF16
 int Sigmoid_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
 {
    int w = bottom_top_blob.w;
@@ -311,5 +316,6 @@ int Sigmoid_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt)

    return 0;
 }
 #endif // NCNN_BF16

 } // namespace ncnn
--- a/src/layer/arm/sigmoid_arm.h
+++ b/src/layer/arm/sigmoid_arm.h
@@ -31,7 +31,9 @@ protected:
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
    int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 #if NCNN_BF16
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 };

 } // namespace ncnn
--- a/src/layer/arm/slice_arm.cpp
+++ b/src/layer/arm/slice_arm.cpp
@@ -25,7 +25,9 @@ Slice_arm::Slice_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 int Slice_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
@@ -37,8 +39,10 @@ int Slice_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& t
        return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
 #endif

    const Mat& bottom_blob = bottom_blobs[0];
    int dims = bottom_blob.dims;
--- a/src/layer/arm/swish_arm.cpp
+++ b/src/layer/arm/swish_arm.cpp
@@ -35,7 +35,9 @@ Swish_arm::Swish_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 int Swish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
@@ -52,8 +54,10 @@ int Swish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_inplace_bf16s(bottom_top_blob, opt);
 #endif

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
@@ -250,6 +254,7 @@ int Swish_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c
 }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

 #if NCNN_BF16
 int Swish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
 {
    int w = bottom_top_blob.w;
@@ -313,5 +318,6 @@ int Swish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) co

    return 0;
 }
 #endif // NCNN_BF16

 } // namespace ncnn
--- a/src/layer/arm/swish_arm.h
+++ b/src/layer/arm/swish_arm.h
@@ -31,7 +31,9 @@ protected:
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
    int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 #if NCNN_BF16
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 };

 } // namespace ncnn
--- a/src/layer/arm/tanh_arm.cpp
+++ b/src/layer/arm/tanh_arm.cpp
@@ -35,7 +35,9 @@ TanH_arm::TanH_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 int TanH_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
@@ -52,8 +54,10 @@ int TanH_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_inplace_bf16s(bottom_top_blob, opt);
 #endif

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
@@ -243,6 +247,7 @@ int TanH_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) co
 }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

 #if NCNN_BF16
 int TanH_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
 {
    int w = bottom_top_blob.w;
@@ -304,5 +309,6 @@ int TanH_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) con

    return 0;
 }
 #endif // NCNN_BF16

 } // namespace ncnn
--- a/src/layer/arm/tanh_arm.h
+++ b/src/layer/arm/tanh_arm.h
@@ -31,7 +31,9 @@ protected:
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
    int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 #if NCNN_BF16
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 };

 } // namespace ncnn
--- a/src/layer/arm/unaryop_arm.cpp
+++ b/src/layer/arm/unaryop_arm.cpp
@@ -35,7 +35,9 @@ UnaryOp_arm::UnaryOp_arm()
 #endif
 #endif // __ARM_NEON

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 #if __ARM_NEON
@@ -265,8 +267,10 @@ int UnaryOp_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
        return forward_inplace_fp16s(bottom_top_blob, opt);
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_inplace_bf16s(bottom_top_blob, opt);
 #endif

    int elempack = bottom_top_blob.elempack;

@@ -1099,6 +1103,7 @@ static int unary_op_inplace_pack4_bf16s(Mat& a, const Option& opt)
 }
 #endif // __ARM_NEON

 #if NCNN_BF16
 template<typename Op>
 static int unary_op_inplace_bf16s(Mat& a, const Option& opt)
 {
@@ -1375,5 +1380,6 @@ int UnaryOp_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt)

    return 0;
 }
 #endif // NCNN_BF16

 } // namespace ncnn
--- a/src/layer/arm/unaryop_arm.h
+++ b/src/layer/arm/unaryop_arm.h
@@ -30,7 +30,9 @@ protected:
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 #if NCNN_BF16
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 };

 } // namespace ncnn
--- a/src/layer/riscv/concat_riscv.cpp
+++ b/src/layer/riscv/concat_riscv.cpp
@@ -35,7 +35,9 @@ Concat_riscv::Concat_riscv()
 #endif
 #endif // __riscv_vector

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 int Concat_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
@@ -47,8 +49,10 @@ int Concat_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>
        return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
 #endif

 #if __riscv_vector
    const int packn = csrr_vlenb() / 4;
--- a/src/layer/riscv/crop_riscv.cpp
+++ b/src/layer/riscv/crop_riscv.cpp
@@ -35,7 +35,9 @@ Crop_riscv::Crop_riscv()
 #endif
 #endif // __riscv_vector

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 #if __riscv_vector
--- a/src/layer/riscv/flatten_riscv.cpp
+++ b/src/layer/riscv/flatten_riscv.cpp
@@ -35,7 +35,9 @@ Flatten_riscv::Flatten_riscv()
 #endif
 #endif // __riscv_vector

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 int Flatten_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
@@ -50,8 +52,10 @@ int Flatten_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option&
        return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
 #endif

    int dims = bottom_blob.dims;

--- a/src/layer/riscv/padding_riscv.cpp
+++ b/src/layer/riscv/padding_riscv.cpp
@@ -39,7 +39,9 @@ Padding_riscv::Padding_riscv()
 #endif
 #endif // __riscv_vector

 #if NCNN_BF16
    support_bf16_storage = true;
 #endif
 }

 int Padding_riscv::create_pipeline(const Option& opt)
@@ -51,12 +53,14 @@ int Padding_riscv::create_pipeline(const Option& opt)
    }
 #endif

 #if NCNN_BF16
    if (opt.use_bf16_storage)
    {
        value_bf16 = float32_to_bfloat16(value);

        ncnn::cast_float32_to_bfloat16(per_channel_pad_data, per_channel_pad_data_bf16, opt);
    }
 #endif

    return 0;
 }
@@ -282,19 +286,28 @@ int Padding_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co
                {
                    Mat borderm = top_blob.channel(q);

 #if __riscv_zfh
                    // clang-format off
                    // *INDENT-OFF*
                    vuint16m1_t pad_value;
 #if __riscv_zfh
                    if (opt.use_fp16_storage)
                    {
                        pad_value = per_channel_pad_data_size ? vreinterpret_v_f16m1_u16m1(vle16_v_f16m1((const __fp16*)per_channel_pad_data_fp16 + q * packn, vl)) : vreinterpret_v_f16m1_u16m1(vfmv_v_f_f16m1((__fp16)value, vl));
                    }
                    else
 #endif
 #if NCNN_BF16
                    if (opt.use_bf16_storage)
                    {
                        pad_value = per_channel_pad_data_size ? vle16_v_u16m1((const unsigned short*)per_channel_pad_data_bf16 + q * packn, vl) : vmv_v_x_u16m1(value_bf16, vl);
                    }
 #else
                    vuint16m1_t pad_value = per_channel_pad_data_size ? vle16_v_u16m1((const unsigned short*)per_channel_pad_data_bf16 + q * packn, vl) : vmv_v_x_u16m1(value_bf16, vl);
                    else
 #endif
                    {
                    }
                    // *INDENT-ON*
                    // clang-format on

                    //Channel padding
                    if ((q - front_) < 0 || (q - front_) >= channels)
                    {
--- a/src/layer/riscv/padding_riscv.h
+++ b/src/layer/riscv/padding_riscv.h
@@ -34,9 +34,11 @@ protected:
    int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

 public:
 #if NCNN_BF16
    // bf16
    unsigned short value_bf16;
    Mat per_channel_pad_data_bf16;
 #endif

    // fp16
    Mat per_channel_pad_data_fp16;
--- a/src/net.cpp
+++ b/src/net.cpp
@@ -766,6 +766,7 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio
    }
    else
 #endif // NCNN_RVV
 #if NCNN_BF16
    if (opt.use_bf16_storage)
    {
        if (bottom_blob.elembits() == 32 && layer->support_bf16_storage)
@@ -781,6 +782,11 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio
            bottom_blob = bottom_blob_fp32;
        }
    }
    else
 #endif // NCNN_BF16
    {
        // no type conversion
    }
    // *INDENT-ON*
    // clang-format on

@@ -2582,6 +2588,7 @@ int Extractor::extract(int blob_index, Mat& feat, int type)
    }
    else
 #endif // NCNN_ARM82
 #if NCNN_BF16
    if (d->opt.use_bf16_storage && (type == 0))
    {
        if (feat.elembits() == 16)
@@ -2591,7 +2598,9 @@ int Extractor::extract(int blob_index, Mat& feat, int type)
            feat = feat_fp32;
        }
    }
    else if (feat.elembits() == 8 && (type == 0))
    else
 #endif // NCNN_BF16
    if (feat.elembits() == 8 && (type == 0))
    {
        Mat feat_fp32;
        cast_int8_to_float32(feat, feat_fp32, d->opt);
--- a/src/platform.h.in
+++ b/src/platform.h.in
@@ -37,6 +37,7 @@
 #cmakedefine01 NCNN_MMI
 #cmakedefine01 NCNN_RVV
 #cmakedefine01 NCNN_INT8
 #cmakedefine01 NCNN_BF16

 #cmakedefine NCNN_VERSION_STRING "@NCNN_VERSION_STRING@"