Browse Source

cmake option NCNN_BF16 (#3068)

tags/20210720
nihui GitHub 4 years ago
parent
commit
cdf45a6512
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
69 changed files with 357 additions and 22 deletions
  1. +1
    -0
      CMakeLists.txt
  2. +6
    -0
      src/layer/arm/batchnorm_arm.cpp
  3. +2
    -0
      src/layer/arm/batchnorm_arm.h
  4. +8
    -0
      src/layer/arm/binaryop_arm.cpp
  5. +2
    -0
      src/layer/arm/binaryop_arm.h
  6. +6
    -0
      src/layer/arm/clip_arm.cpp
  7. +2
    -0
      src/layer/arm/clip_arm.h
  8. +4
    -0
      src/layer/arm/concat_arm.cpp
  9. +26
    -12
      src/layer/arm/convolution_arm.cpp
  10. +4
    -0
      src/layer/arm/convolution_arm.h
  11. +12
    -1
      src/layer/arm/convolutiondepthwise_arm.cpp
  12. +4
    -0
      src/layer/arm/convolutiondepthwise_arm.h
  13. +2
    -0
      src/layer/arm/crop_arm.cpp
  14. +8
    -0
      src/layer/arm/deconvolution_arm.cpp
  15. +4
    -0
      src/layer/arm/deconvolution_arm.h
  16. +8
    -0
      src/layer/arm/deconvolutiondepthwise_arm.cpp
  17. +4
    -0
      src/layer/arm/deconvolutiondepthwise_arm.h
  18. +6
    -0
      src/layer/arm/dequantize_arm.cpp
  19. +2
    -0
      src/layer/arm/dequantize_arm.h
  20. +6
    -0
      src/layer/arm/eltwise_arm.cpp
  21. +2
    -0
      src/layer/arm/eltwise_arm.h
  22. +4
    -0
      src/layer/arm/flatten_arm.cpp
  23. +10
    -0
      src/layer/arm/gru_arm.cpp
  24. +2
    -0
      src/layer/arm/gru_arm.h
  25. +6
    -0
      src/layer/arm/hardsigmoid_arm.cpp
  26. +2
    -0
      src/layer/arm/hardsigmoid_arm.h
  27. +6
    -0
      src/layer/arm/hardswish_arm.cpp
  28. +2
    -0
      src/layer/arm/hardswish_arm.h
  29. +8
    -0
      src/layer/arm/innerproduct_arm.cpp
  30. +4
    -0
      src/layer/arm/innerproduct_arm.h
  31. +6
    -0
      src/layer/arm/instancenorm_arm.cpp
  32. +2
    -0
      src/layer/arm/instancenorm_arm.h
  33. +13
    -2
      src/layer/arm/interp_arm.cpp
  34. +2
    -0
      src/layer/arm/interp_arm.h
  35. +10
    -0
      src/layer/arm/lstm_arm.cpp
  36. +2
    -0
      src/layer/arm/lstm_arm.h
  37. +6
    -0
      src/layer/arm/mish_arm.cpp
  38. +2
    -0
      src/layer/arm/mish_arm.h
  39. +18
    -3
      src/layer/arm/padding_arm.cpp
  40. +2
    -0
      src/layer/arm/padding_arm.h
  41. +4
    -0
      src/layer/arm/pixelshuffle_arm.cpp
  42. +6
    -0
      src/layer/arm/pooling_arm.cpp
  43. +2
    -0
      src/layer/arm/pooling_arm.h
  44. +6
    -0
      src/layer/arm/prelu_arm.cpp
  45. +2
    -0
      src/layer/arm/prelu_arm.h
  46. +6
    -0
      src/layer/arm/quantize_arm.cpp
  47. +2
    -0
      src/layer/arm/quantize_arm.h
  48. +6
    -0
      src/layer/arm/relu_arm.cpp
  49. +2
    -0
      src/layer/arm/relu_arm.h
  50. +4
    -0
      src/layer/arm/reshape_arm.cpp
  51. +10
    -0
      src/layer/arm/rnn_arm.cpp
  52. +2
    -0
      src/layer/arm/rnn_arm.h
  53. +4
    -0
      src/layer/arm/shufflechannel_arm.cpp
  54. +6
    -0
      src/layer/arm/sigmoid_arm.cpp
  55. +2
    -0
      src/layer/arm/sigmoid_arm.h
  56. +4
    -0
      src/layer/arm/slice_arm.cpp
  57. +6
    -0
      src/layer/arm/swish_arm.cpp
  58. +2
    -0
      src/layer/arm/swish_arm.h
  59. +6
    -0
      src/layer/arm/tanh_arm.cpp
  60. +2
    -0
      src/layer/arm/tanh_arm.h
  61. +6
    -0
      src/layer/arm/unaryop_arm.cpp
  62. +2
    -0
      src/layer/arm/unaryop_arm.h
  63. +4
    -0
      src/layer/riscv/concat_riscv.cpp
  64. +2
    -0
      src/layer/riscv/crop_riscv.cpp
  65. +4
    -0
      src/layer/riscv/flatten_riscv.cpp
  66. +16
    -3
      src/layer/riscv/padding_riscv.cpp
  67. +2
    -0
      src/layer/riscv/padding_riscv.h
  68. +10
    -1
      src/net.cpp
  69. +1
    -0
      src/platform.h.in

+ 1
- 0
CMakeLists.txt View File

@@ -81,6 +81,7 @@ option(NCNN_COVERAGE "build for coverage" OFF)
option(NCNN_BUILD_BENCHMARK "build benchmark" ON)
option(NCNN_PYTHON "build python api" OFF)
option(NCNN_INT8 "int8 inference" ON)
option(NCNN_BF16 "bf16 inference" ON)

if(ANDROID OR IOS OR NCNN_SIMPLESTL OR CMAKE_CROSSCOMPILING)
option(NCNN_DISABLE_RTTI "disable rtti" ON)


+ 6
- 0
src/layer/arm/batchnorm_arm.cpp View File

@@ -29,7 +29,9 @@ BatchNorm_arm::BatchNorm_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

int BatchNorm_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
@@ -46,8 +48,10 @@ int BatchNorm_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_inplace_bf16s(bottom_top_blob, opt);
#endif

int dims = bottom_top_blob.dims;
int elempack = bottom_top_blob.elempack;
@@ -660,6 +664,7 @@ int BatchNorm_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if NCNN_BF16
int BatchNorm_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
int dims = bottom_top_blob.dims;
@@ -829,5 +834,6 @@ int BatchNorm_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt

return 0;
}
#endif // NCNN_BF16

} // namespace ncnn

+ 2
- 0
src/layer/arm/batchnorm_arm.h View File

@@ -31,7 +31,9 @@ protected:
int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn


+ 8
- 0
src/layer/arm/binaryop_arm.cpp View File

@@ -33,7 +33,9 @@ BinaryOp_arm::BinaryOp_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

#if __ARM_NEON
@@ -812,8 +814,10 @@ int BinaryOp_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>
return forward_fp16s(bottom_blobs, top_blobs, opt);
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_bf16s(bottom_blobs, top_blobs, opt);
#endif

const Mat& bottom_blob = bottom_blobs[0];
const Mat& bottom_blob1 = bottom_blobs[1];
@@ -866,8 +870,10 @@ int BinaryOp_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
return forward_inplace_fp16s(bottom_top_blob, opt);
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_inplace_bf16s(bottom_top_blob, opt);
#endif

#if __ARM_NEON
int elempack = bottom_top_blob.elempack;
@@ -3258,6 +3264,7 @@ int BinaryOp_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt)
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if NCNN_BF16
#if __ARM_NEON
template<typename Op>
static int binary_op_pack4_bf16s(const Mat& a, const Mat& b, Mat& c, const Option& opt)
@@ -4727,5 +4734,6 @@ int BinaryOp_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt)

return 0;
}
#endif // NCNN_BF16

} // namespace ncnn

+ 2
- 0
src/layer/arm/binaryop_arm.h View File

@@ -33,8 +33,10 @@ protected:
int forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
int forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn


+ 6
- 0
src/layer/arm/clip_arm.cpp View File

@@ -29,7 +29,9 @@ Clip_arm::Clip_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

int Clip_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
@@ -41,8 +43,10 @@ int Clip_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
return forward_inplace_fp16s(bottom_top_blob, opt);
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_inplace_bf16s(bottom_top_blob, opt);
#endif

int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
@@ -259,6 +263,7 @@ int Clip_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) con
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if NCNN_BF16
int Clip_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
@@ -334,5 +339,6 @@ int Clip_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) con

return 0;
}
#endif // NCNN_BF16

} // namespace ncnn

+ 2
- 0
src/layer/arm/clip_arm.h View File

@@ -30,7 +30,9 @@ protected:
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn


+ 4
- 0
src/layer/arm/concat_arm.cpp View File

@@ -25,7 +25,9 @@ Concat_arm::Concat_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

int Concat_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
@@ -37,8 +39,10 @@ int Concat_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
#endif

int dims = bottom_blobs[0].dims;
int positive_axis = axis < 0 ? dims + axis : axis;


+ 26
- 12
src/layer/arm/convolution_arm.cpp View File

@@ -27,17 +27,20 @@

namespace ncnn {

-#include "convolution_bf16s.h"
#include "convolution_sgemm.h"

#include "convolution_1x1.h"
-#include "convolution_1x1_bf16s.h"
#include "convolution_2x2.h"
#include "convolution_3x3.h"
#include "convolution_4x4.h"
#include "convolution_5x5.h"
#include "convolution_7x7.h"

+#if NCNN_BF16
+#include "convolution_bf16s.h"
+#include "convolution_1x1_bf16s.h"
+#endif // NCNN_BF16

#if NCNN_INT8
#include "convolution_sgemm_int8.h"
#include "convolution_1x1_int8.h"
@@ -47,27 +50,30 @@ namespace ncnn {

#if __ARM_NEON
#include "convolution_pack4.h"
-#include "convolution_pack4_bf16s.h"
#include "convolution_pack1to4.h"
-#include "convolution_pack1to4_bf16s.h"
#include "convolution_pack4to1.h"
-#include "convolution_pack4to1_bf16s.h"
#include "convolution_sgemm_pack4.h"
-#include "convolution_sgemm_pack4_bf16s.h"
#include "convolution_1x1_pack4.h"
-#include "convolution_1x1_pack4_bf16s.h"
#include "convolution_1x1_pack4to1.h"
-#include "convolution_1x1_pack4to1_bf16s.h"
#include "convolution_3x3_pack1to4.h"
-#include "convolution_3x3_pack1to4_bf16s.h"
#include "convolution_3x3_pack4.h"
-#include "convolution_3x3_pack4_bf16s.h"
#include "convolution_3x3_pack4to1.h"
-#include "convolution_3x3_pack4to1_bf16s.h"
#include "convolution_5x5_pack4.h"
-#include "convolution_5x5_pack4_bf16s.h"
#include "convolution_7x7_pack1to4.h"

+#if NCNN_BF16
+#include "convolution_pack4_bf16s.h"
+#include "convolution_pack1to4_bf16s.h"
+#include "convolution_pack4to1_bf16s.h"
+#include "convolution_sgemm_pack4_bf16s.h"
+#include "convolution_1x1_pack4_bf16s.h"
+#include "convolution_1x1_pack4to1_bf16s.h"
+#include "convolution_3x3_pack1to4_bf16s.h"
+#include "convolution_3x3_pack4_bf16s.h"
+#include "convolution_3x3_pack4to1_bf16s.h"
+#include "convolution_5x5_pack4_bf16s.h"
#include "convolution_7x7_pack1to4_bf16s.h"
+#endif // NCNN_BF16

#if NCNN_INT8
#include "convolution_pack8to4_int8.h"
@@ -122,7 +128,9 @@ Convolution_arm::Convolution_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif

activation = 0;
convolution_dilation1 = 0;
@@ -188,10 +196,12 @@ int Convolution_arm::create_pipeline(const Option& opt)
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage)
{
return create_pipeline_bf16s(opt);
}
#endif

if ((!support_packing || !opt.use_packing_layout) && !opt.use_bf16_storage && kernel_w == kernel_h && dilation_w != 1 && dilation_h == dilation_w && stride_w == 1 && stride_h == 1)
{
@@ -449,8 +459,10 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_bf16s(bottom_blob, top_blob, opt);
#endif

int w = bottom_blob.w;
int h = bottom_blob.h;
@@ -1538,6 +1550,7 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if NCNN_BF16
int Convolution_arm::create_pipeline_bf16s(const Option& opt)
{
const int maxk = kernel_w * kernel_h;
@@ -1812,6 +1825,7 @@ int Convolution_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const

return 0;
}
#endif // NCNN_BF16

#if NCNN_INT8
int Convolution_arm::create_pipeline_int8_arm(const Option& opt)


+ 4
- 0
src/layer/arm/convolution_arm.h View File

@@ -35,8 +35,10 @@ protected:
int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
int create_pipeline_bf16s(const Option& opt);
int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
#if NCNN_INT8
int create_pipeline_int8_arm(const Option& opt);
int forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
@@ -66,8 +68,10 @@ public:
Mat weight_data_fp16;
Mat bias_data_fp16;

#if NCNN_BF16
// bf16
Mat weight_data_bf16;
#endif

#if NCNN_INT8
// int8


+ 12
- 1
src/layer/arm/convolutiondepthwise_arm.cpp View File

@@ -34,9 +34,12 @@ namespace ncnn {

#if __ARM_NEON
#include "convolutiondepthwise_3x3_pack4.h"
-#include "convolutiondepthwise_3x3_pack4_bf16s.h"
#include "convolutiondepthwise_5x5_pack4.h"

+#if NCNN_BF16
+#include "convolutiondepthwise_3x3_pack4_bf16s.h"
#include "convolutiondepthwise_5x5_pack4_bf16s.h"
+#endif // NCNN_BF16

#if NCNN_INT8
#include "convolutiondepthwise_3x3_pack8_int8.h"
@@ -58,7 +61,9 @@ ConvolutionDepthWise_arm::ConvolutionDepthWise_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif

activation = 0;
}
@@ -167,6 +172,7 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt)
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if NCNN_BF16
if (opt.use_bf16_storage)
{
#if __ARM_NEON
@@ -186,6 +192,7 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt)

return 0;
}
#endif // NCNN_BF16

#if __ARM_NEON
// pack4
@@ -362,8 +369,10 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_bf16s(bottom_blob, top_blob, opt);
#endif

int w = bottom_blob.w;
int h = bottom_blob.h;
@@ -1158,6 +1167,7 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if NCNN_BF16
int ConvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
int w = bottom_blob.w;
@@ -1456,6 +1466,7 @@ int ConvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blo

return 0;
}
#endif // NCNN_BF16

#if NCNN_INT8
int ConvolutionDepthWise_arm::create_pipeline_int8_arm(const Option& opt)


+ 4
- 0
src/layer/arm/convolutiondepthwise_arm.h View File

@@ -35,7 +35,9 @@ protected:
int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
#if NCNN_INT8
int create_pipeline_int8_arm(const Option& opt);
int forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
@@ -52,9 +54,11 @@ public:
Mat weight_data_fp16;
Mat bias_data_fp16;

#if NCNN_BF16
// bf16
Mat weight_data_bf16;
Mat weight_data_pack4_bf16;
#endif

#if NCNN_INT8
// int8


+ 2
- 0
src/layer/arm/crop_arm.cpp View File

@@ -29,7 +29,9 @@ Crop_arm::Crop_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

#if __ARM_NEON


+ 8
- 0
src/layer/arm/deconvolution_arm.cpp View File

@@ -40,7 +40,9 @@ Deconvolution_arm::Deconvolution_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif

activation = 0;
}
@@ -91,10 +93,12 @@ int Deconvolution_arm::create_pipeline(const Option& opt)
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage)
{
return create_pipeline_bf16s(opt);
}
#endif

const int maxk = kernel_w * kernel_h;
int num_input = weight_data_size / maxk / num_output;
@@ -308,8 +312,10 @@ int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_bf16s(bottom_blob, top_blob, opt);
#endif

// deconvolv with NxN kernel
// value = value + bias
@@ -1899,6 +1905,7 @@ int Deconvolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if NCNN_BF16
int Deconvolution_arm::create_pipeline_bf16s(const Option& opt)
{
const int maxk = kernel_w * kernel_h;
@@ -2337,5 +2344,6 @@ int Deconvolution_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, cons

return 0;
}
#endif // NCNN_BF16

} // namespace ncnn

+ 4
- 0
src/layer/arm/deconvolution_arm.h View File

@@ -35,8 +35,10 @@ protected:
int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
int create_pipeline_bf16s(const Option& opt);
int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif

public:
Layer* activation;
@@ -51,8 +53,10 @@ public:
Mat weight_data_fp16;
Mat bias_data_fp16;

#if NCNN_BF16
// bf16
Mat weight_data_bf16;
#endif
};

} // namespace ncnn


+ 8
- 0
src/layer/arm/deconvolutiondepthwise_arm.cpp View File

@@ -33,7 +33,9 @@ DeconvolutionDepthWise_arm::DeconvolutionDepthWise_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt)
@@ -101,6 +103,7 @@ int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt)
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if NCNN_BF16
if (opt.use_bf16_storage)
{
#if __ARM_NEON
@@ -120,6 +123,7 @@ int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt)

return 0;
}
#endif // NCNN_BF16

#if __ARM_NEON
// pack4
@@ -228,8 +232,10 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, c
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_bf16s(bottom_blob, top_blob, opt);
#endif

// convolv with NxN kernel
// value = value + bias
@@ -986,6 +992,7 @@ int DeconvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if NCNN_BF16
int DeconvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
int w = bottom_blob.w;
@@ -1224,5 +1231,6 @@ int DeconvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_b

return 0;
}
#endif // NCNN_BF16

} // namespace ncnn

+ 4
- 0
src/layer/arm/deconvolutiondepthwise_arm.h View File

@@ -34,7 +34,9 @@ protected:
int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif

public:
std::vector<ncnn::Layer*> group_ops;
@@ -47,8 +49,10 @@ public:
Mat weight_data_fp16;
Mat bias_data_fp16;

#if NCNN_BF16
// bf16
Mat weight_data_bf16;
#endif
};

} // namespace ncnn


+ 6
- 0
src/layer/arm/dequantize_arm.cpp View File

@@ -30,7 +30,9 @@ Dequantize_arm::Dequantize_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

int Dequantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
@@ -47,8 +49,10 @@ int Dequantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option&
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage)
return forward_bf16s(bottom_blob, top_blob, opt);
#endif

int dims = bottom_blob.dims;
int elempack = bottom_blob.elempack;
@@ -2285,6 +2289,7 @@ int Dequantize_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if NCNN_BF16
int Dequantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
int dims = bottom_blob.dims;
@@ -3038,5 +3043,6 @@ int Dequantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const O

return 0;
}
#endif // NCNN_BF16

} // namespace ncnn

+ 2
- 0
src/layer/arm/dequantize_arm.h View File

@@ -31,7 +31,9 @@ protected:
int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn


+ 6
- 0
src/layer/arm/eltwise_arm.cpp View File

@@ -29,7 +29,9 @@ Eltwise_arm::Eltwise_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
@@ -46,8 +48,10 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_bf16s(bottom_blobs, top_blobs, opt);
#endif

const Mat& bottom_blob = bottom_blobs[0];
int w = bottom_blob.w;
@@ -2213,6 +2217,7 @@ int Eltwise_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vecto
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if NCNN_BF16
int Eltwise_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
const Mat& bottom_blob = bottom_blobs[0];
@@ -2980,5 +2985,6 @@ int Eltwise_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector

return 0;
}
#endif // NCNN_BF16

} // namespace ncnn

+ 2
- 0
src/layer/arm/eltwise_arm.h View File

@@ -31,7 +31,9 @@ protected:
int forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
int forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
#endif
#if NCNN_BF16
int forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
#endif
};

} // namespace ncnn


+ 4
- 0
src/layer/arm/flatten_arm.cpp View File

@@ -29,7 +29,9 @@ Flatten_arm::Flatten_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif // NCNN_BF16
}

int Flatten_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
@@ -44,8 +46,10 @@ int Flatten_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op
return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
#endif

int dims = bottom_blob.dims;



+ 10
- 0
src/layer/arm/gru_arm.cpp View File

@@ -32,7 +32,9 @@ GRU_arm::GRU_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

int GRU_arm::create_pipeline(const Option& opt)
@@ -44,10 +46,12 @@ int GRU_arm::create_pipeline(const Option& opt)
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage)
{
return create_pipeline_bf16s(opt);
}
#endif

// pack RUN
int num_directions = direction == 2 ? 2 : 1;
@@ -627,8 +631,10 @@ int GRU_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_bf16s(bottom_blob, top_blob, opt);
#endif

int T = bottom_blob.h;

@@ -708,8 +714,10 @@ int GRU_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_bf16s(bottom_blobs, top_blobs, opt);
#endif

int T = bottom_blob.h;
Mat& top_blob = top_blobs[0];
@@ -1727,6 +1735,7 @@ int GRU_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Ma
}
#endif

#if NCNN_BF16
static int gru_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt)
{
int size = bottom_blob.w;
@@ -2378,5 +2387,6 @@ int GRU_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat

return 0;
}
#endif // NCNN_BF16

} // namespace ncnn

+ 2
- 0
src/layer/arm/gru_arm.h View File

@@ -36,9 +36,11 @@ protected:
int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
int forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
#endif
#if NCNN_BF16
int create_pipeline_bf16s(const Option& opt);
int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
int forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
#endif

public:
Mat weight_xc_data_packed;


+ 6
- 0
src/layer/arm/hardsigmoid_arm.cpp View File

@@ -29,7 +29,9 @@ HardSigmoid_arm::HardSigmoid_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

int HardSigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
@@ -46,8 +48,10 @@ int HardSigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_inplace_bf16s(bottom_top_blob, opt);
#endif

int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
@@ -326,6 +330,7 @@ int HardSigmoid_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option&
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if NCNN_BF16
int HardSigmoid_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
@@ -404,5 +409,6 @@ int HardSigmoid_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& o

return 0;
}
#endif // NCNN_BF16

} // namespace ncnn

+ 2
- 0
src/layer/arm/hardsigmoid_arm.h View File

@@ -31,7 +31,9 @@ protected:
int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn


+ 6
- 0
src/layer/arm/hardswish_arm.cpp View File

@@ -29,7 +29,9 @@ HardSwish_arm::HardSwish_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

int HardSwish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
@@ -46,8 +48,10 @@ int HardSwish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_inplace_bf16s(bottom_top_blob, opt);
#endif

int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
@@ -333,6 +337,7 @@ int HardSwish_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if NCNN_BF16
int HardSwish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
@@ -413,5 +418,6 @@ int HardSwish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt

return 0;
}
#endif // NCNN_BF16

} // namespace ncnn

+ 2
- 0
src/layer/arm/hardswish_arm.h View File

@@ -31,7 +31,9 @@ protected:
int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn


+ 8
- 0
src/layer/arm/innerproduct_arm.cpp View File

@@ -34,7 +34,9 @@ InnerProduct_arm::InnerProduct_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif

flatten = 0;
activation = 0;
@@ -69,10 +71,12 @@ int InnerProduct_arm::create_pipeline(const Option& opt)
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage)
{
return create_pipeline_bf16s(opt);
}
#endif

return 0;
}
@@ -117,8 +121,10 @@ int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_bf16s(bottom_blob, top_blob, opt);
#endif

const int num_input = weight_data_size / num_output;

@@ -1535,6 +1541,7 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if NCNN_BF16
int InnerProduct_arm::create_pipeline_bf16s(const Option& opt)
{
const int num_input = weight_data_size / num_output;
@@ -1895,6 +1902,7 @@ int InnerProduct_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const

return 0;
}
#endif // NCNN_BF16

#if NCNN_INT8
int InnerProduct_arm::create_pipeline_int8_arm(const Option& opt)


+ 4
- 0
src/layer/arm/innerproduct_arm.h View File

@@ -37,8 +37,10 @@ protected:
int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
int create_pipeline_bf16s(const Option& opt);
int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
#if NCNN_INT8
int create_pipeline_int8_arm(const Option& opt);
int forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
@@ -52,8 +54,10 @@ public:
Mat weight_data_fp16;
Mat bias_data_fp16;

#if NCNN_BF16
// bf16
Mat weight_data_bf16;
#endif

#if NCNN_INT8
// int8


+ 6
- 0
src/layer/arm/instancenorm_arm.cpp View File

@@ -29,7 +29,9 @@ InstanceNorm_arm::InstanceNorm_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

int InstanceNorm_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
@@ -41,8 +43,10 @@ int InstanceNorm_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) c
return forward_inplace_fp16s(bottom_top_blob, opt);
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_inplace_bf16s(bottom_top_blob, opt);
#endif

int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
@@ -473,6 +477,7 @@ int InstanceNorm_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option&
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if NCNN_BF16
int InstanceNorm_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
@@ -646,5 +651,6 @@ int InstanceNorm_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option&

return 0;
}
#endif // NCNN_BF16

} // namespace ncnn

+ 2
- 0
src/layer/arm/instancenorm_arm.h View File

@@ -30,7 +30,9 @@ protected:
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn


+ 13
- 2
src/layer/arm/interp_arm.cpp View File

@@ -23,15 +23,20 @@
namespace ncnn {

#include "interp_bicubic.h"
-#include "interp_bicubic_bf16s.h"
#include "interp_bilinear.h"

+#if NCNN_BF16
+#include "interp_bicubic_bf16s.h"
#include "interp_bilinear_bf16s.h"
+#endif

#if __ARM_NEON
#include "interp_bicubic_pack4.h"
-#include "interp_bicubic_pack4_bf16s.h"
#include "interp_bilinear_pack4.h"
+#if NCNN_BF16
+#include "interp_bicubic_pack4_bf16s.h"
#include "interp_bilinear_pack4_bf16s.h"
+#endif
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include "interp_bicubic_fp16s.h"
#include "interp_bicubic_pack4_fp16s.h"
@@ -51,7 +56,9 @@ Interp_arm::Interp_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

int Interp_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
@@ -72,8 +79,10 @@ int Interp_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_bf16s(bottom_blobs, top_blobs, opt);
#endif

int h = bottom_blob.h;
int w = bottom_blob.w;
@@ -830,6 +839,7 @@ int Interp_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if NCNN_BF16
int Interp_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
const Mat& bottom_blob = bottom_blobs[0];
@@ -1052,5 +1062,6 @@ int Interp_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<

return 0;
}
#endif // NCNN_BF16

} // namespace ncnn

+ 2
- 0
src/layer/arm/interp_arm.h View File

@@ -31,7 +31,9 @@ protected:
int forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
int forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
#endif
#if NCNN_BF16
int forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
#endif
};

} // namespace ncnn


+ 10
- 0
src/layer/arm/lstm_arm.cpp View File

@@ -32,7 +32,9 @@ LSTM_arm::LSTM_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

int LSTM_arm::create_pipeline(const Option& opt)
@@ -44,10 +46,12 @@ int LSTM_arm::create_pipeline(const Option& opt)
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage)
{
return create_pipeline_bf16s(opt);
}
#endif

// pack IFOG
int num_directions = direction == 2 ? 2 : 1;
@@ -349,8 +353,10 @@ int LSTM_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_bf16s(bottom_blob, top_blob, opt);
#endif

int T = bottom_blob.h;

@@ -436,8 +442,10 @@ int LSTM_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_bf16s(bottom_blobs, top_blobs, opt);
#endif

int T = bottom_blob.h;
Mat& top_blob = top_blobs[0];
@@ -1296,6 +1304,7 @@ int LSTM_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<M
}
#endif

#if NCNN_BF16
static int lstm_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, Mat& cell_state, const Option& opt)
{
int size = bottom_blob.w;
@@ -1680,5 +1689,6 @@ int LSTM_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Ma

return 0;
}
#endif // NCNN_BF16

} // namespace ncnn

+ 2
- 0
src/layer/arm/lstm_arm.h View File

@@ -36,9 +36,11 @@ protected:
int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
int forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
#endif
#if NCNN_BF16
int create_pipeline_bf16s(const Option& opt);
int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
int forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
#endif

public:
Mat weight_xc_data_packed;


+ 6
- 0
src/layer/arm/mish_arm.cpp View File

@@ -35,7 +35,9 @@ Mish_arm::Mish_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

int Mish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
@@ -52,8 +54,10 @@ int Mish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_inplace_bf16s(bottom_top_blob, opt);
#endif

int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
@@ -243,6 +247,7 @@ int Mish_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) co
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if NCNN_BF16
int Mish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
@@ -304,5 +309,6 @@ int Mish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) con

return 0;
}
#endif // NCNN_BF16

} // namespace ncnn

+ 2
- 0
src/layer/arm/mish_arm.h View File

@@ -31,7 +31,9 @@ protected:
int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn


+ 18
- 3
src/layer/arm/padding_arm.cpp View File

@@ -38,7 +38,9 @@ Padding_arm::Padding_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

int Padding_arm::create_pipeline(const Option& opt)
@@ -50,12 +52,14 @@ int Padding_arm::create_pipeline(const Option& opt)
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage)
{
value_bf16 = float32_to_bfloat16(value);

ncnn::cast_float32_to_bfloat16(per_channel_pad_data, per_channel_pad_data_bf16, opt);
}
#endif

return 0;
}
@@ -83,8 +87,10 @@ int Padding_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op
return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
#endif

int w = bottom_blob.w;
int h = bottom_blob.h;
@@ -352,19 +358,28 @@ int Padding_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, cons
{
Mat borderm = top_blob.channel(q);

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// clang-format off
// *INDENT-OFF*
uint16x4_t pad_value;
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
if (opt.use_fp16_storage)
{
pad_value = per_channel_pad_data_size ? vreinterpret_u16_f16(vld1_f16((const __fp16*)per_channel_pad_data_fp16 + q * 4)) : vreinterpret_u16_f16(vdup_n_f16((__fp16)value));
}
else
#endif
#if NCNN_BF16
if (opt.use_bf16_storage)
{
pad_value = per_channel_pad_data_size ? vld1_u16((const unsigned short*)per_channel_pad_data_bf16 + q * 4) : vdup_n_u16(value_bf16);
}
#else
uint16x4_t pad_value = per_channel_pad_data_size ? vld1_u16((const unsigned short*)per_channel_pad_data_bf16 + q * 4) : vdup_n_u16(value_bf16);
else
#endif
{
}
// *INDENT-ON*
// clang-format on

//Channel padding
if ((q - front_) < 0 || (q - front_) >= channels)
{


+ 2
- 0
src/layer/arm/padding_arm.h View File

@@ -34,9 +34,11 @@ protected:
int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

public:
#if NCNN_BF16
// bf16
unsigned short value_bf16;
Mat per_channel_pad_data_bf16;
#endif

// fp16
Mat per_channel_pad_data_fp16;


+ 4
- 0
src/layer/arm/pixelshuffle_arm.cpp View File

@@ -31,7 +31,9 @@ PixelShuffle_arm::PixelShuffle_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

int PixelShuffle_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
@@ -43,8 +45,10 @@ int PixelShuffle_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
#endif

int w = bottom_blob.w;
int h = bottom_blob.h;


+ 6
- 0
src/layer/arm/pooling_arm.cpp View File

@@ -39,7 +39,9 @@ Pooling_arm::Pooling_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

int Pooling_arm::create_pipeline(const Option& /*opt*/)
@@ -78,8 +80,10 @@ int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_bf16s(bottom_blob, top_blob, opt);
#endif

// max value in NxN window
// avg value in NxN window
@@ -1235,6 +1239,7 @@ int Pooling_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Opt
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if NCNN_BF16
int Pooling_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
// max value in NxN window
@@ -1644,5 +1649,6 @@ int Pooling_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Opti

return 0;
}
#endif // NCNN_BF16

} // namespace ncnn

+ 2
- 0
src/layer/arm/pooling_arm.h View File

@@ -32,7 +32,9 @@ protected:
int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn


+ 6
- 0
src/layer/arm/prelu_arm.cpp View File

@@ -29,7 +29,9 @@ PReLU_arm::PReLU_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

int PReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
@@ -46,8 +48,10 @@ int PReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_inplace_bf16s(bottom_top_blob, opt);
#endif

int dims = bottom_top_blob.dims;
int elempack = bottom_top_blob.elempack;
@@ -816,6 +820,7 @@ int PReLU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if NCNN_BF16
int PReLU_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
int dims = bottom_top_blob.dims;
@@ -1033,5 +1038,6 @@ int PReLU_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) co

return 0;
}
#endif // NCNN_BF16

} // namespace ncnn

+ 2
- 0
src/layer/arm/prelu_arm.h View File

@@ -31,7 +31,9 @@ protected:
int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn


+ 6
- 0
src/layer/arm/quantize_arm.cpp View File

@@ -34,7 +34,9 @@ Quantize_arm::Quantize_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
@@ -51,8 +53,10 @@ int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_bf16s(bottom_blob, top_blob, opt);
#endif

int dims = bottom_blob.dims;
int elempack = bottom_blob.elempack;
@@ -1552,6 +1556,7 @@ int Quantize_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Op
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if NCNN_BF16
int Quantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
int dims = bottom_blob.dims;
@@ -1953,5 +1958,6 @@ int Quantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Opt

return 0;
}
#endif // NCNN_BF16

} // namespace ncnn

+ 2
- 0
src/layer/arm/quantize_arm.h View File

@@ -31,7 +31,9 @@ protected:
int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn


+ 6
- 0
src/layer/arm/relu_arm.cpp View File

@@ -29,7 +29,9 @@ ReLU_arm::ReLU_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

int ReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
@@ -44,8 +46,10 @@ int ReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
return forward_inplace_fp16s(bottom_top_blob, opt);
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_inplace_bf16s(bottom_top_blob, opt);
#endif

int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
@@ -578,6 +582,7 @@ int ReLU_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) con
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if NCNN_BF16
int ReLU_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
@@ -881,6 +886,7 @@ int ReLU_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) con

return 0;
}
#endif // NCNN_BF16

int ReLU_arm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const
{


+ 2
- 0
src/layer/arm/relu_arm.h View File

@@ -30,7 +30,9 @@ protected:
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
int forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const;
};



+ 4
- 0
src/layer/arm/reshape_arm.cpp View File

@@ -29,7 +29,9 @@ Reshape_arm::Reshape_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

int Reshape_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
@@ -41,8 +43,10 @@ int Reshape_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op
return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
#endif

int elempack = bottom_blob.elempack;



+ 10
- 0
src/layer/arm/rnn_arm.cpp View File

@@ -32,7 +32,9 @@ RNN_arm::RNN_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

int RNN_arm::create_pipeline(const Option& opt)
@@ -44,10 +46,12 @@ int RNN_arm::create_pipeline(const Option& opt)
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage)
{
return create_pipeline_bf16s(opt);
}
#endif

int num_directions = direction == 2 ? 2 : 1;
int size = weight_data_size / num_directions / num_output;
@@ -309,8 +313,10 @@ int RNN_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_bf16s(bottom_blob, top_blob, opt);
#endif

int T = bottom_blob.h;

@@ -390,8 +396,10 @@ int RNN_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_bf16s(bottom_blobs, top_blobs, opt);
#endif

int T = bottom_blob.h;
Mat& top_blob = top_blobs[0];
@@ -1067,6 +1075,7 @@ int RNN_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Ma
}
#endif

#if NCNN_BF16
static int rnn_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt)
{
int size = bottom_blob.w;
@@ -1400,5 +1409,6 @@ int RNN_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat

return 0;
}
#endif // NCNN_BF16

} // namespace ncnn

+ 2
- 0
src/layer/arm/rnn_arm.h View File

@@ -36,9 +36,11 @@ protected:
int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
int forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
#endif
#if NCNN_BF16
int create_pipeline_bf16s(const Option& opt);
int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
int forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
#endif

public:
Mat weight_xc_data_packed;


+ 4
- 0
src/layer/arm/shufflechannel_arm.cpp View File

@@ -31,7 +31,9 @@ ShuffleChannel_arm::ShuffleChannel_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

int ShuffleChannel_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
@@ -43,8 +45,10 @@ int ShuffleChannel_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Opt
return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
#endif

int channels = bottom_blob.c;
int elempack = bottom_blob.elempack;


+ 6
- 0
src/layer/arm/sigmoid_arm.cpp View File

@@ -36,7 +36,9 @@ Sigmoid_arm::Sigmoid_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
@@ -53,8 +55,10 @@ int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_inplace_bf16s(bottom_top_blob, opt);
#endif

int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
@@ -247,6 +251,7 @@ int Sigmoid_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt)
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if NCNN_BF16
int Sigmoid_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
@@ -311,5 +316,6 @@ int Sigmoid_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt)

return 0;
}
#endif // NCNN_BF16

} // namespace ncnn

+ 2
- 0
src/layer/arm/sigmoid_arm.h View File

@@ -31,7 +31,9 @@ protected:
int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn


+ 4
- 0
src/layer/arm/slice_arm.cpp View File

@@ -25,7 +25,9 @@ Slice_arm::Slice_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

int Slice_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
@@ -37,8 +39,10 @@ int Slice_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& t
return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
#endif

const Mat& bottom_blob = bottom_blobs[0];
int dims = bottom_blob.dims;


+ 6
- 0
src/layer/arm/swish_arm.cpp View File

@@ -35,7 +35,9 @@ Swish_arm::Swish_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

int Swish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
@@ -52,8 +54,10 @@ int Swish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_inplace_bf16s(bottom_top_blob, opt);
#endif

int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
@@ -250,6 +254,7 @@ int Swish_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if NCNN_BF16
int Swish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
@@ -313,5 +318,6 @@ int Swish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) co

return 0;
}
#endif // NCNN_BF16

} // namespace ncnn

+ 2
- 0
src/layer/arm/swish_arm.h View File

@@ -31,7 +31,9 @@ protected:
int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn


+ 6
- 0
src/layer/arm/tanh_arm.cpp View File

@@ -35,7 +35,9 @@ TanH_arm::TanH_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

int TanH_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
@@ -52,8 +54,10 @@ int TanH_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_inplace_bf16s(bottom_top_blob, opt);
#endif

int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
@@ -243,6 +247,7 @@ int TanH_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) co
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if NCNN_BF16
int TanH_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
@@ -304,5 +309,6 @@ int TanH_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) con

return 0;
}
#endif // NCNN_BF16

} // namespace ncnn

+ 2
- 0
src/layer/arm/tanh_arm.h View File

@@ -31,7 +31,9 @@ protected:
int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn


+ 6
- 0
src/layer/arm/unaryop_arm.cpp View File

@@ -35,7 +35,9 @@ UnaryOp_arm::UnaryOp_arm()
#endif
#endif // __ARM_NEON

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

#if __ARM_NEON
@@ -265,8 +267,10 @@ int UnaryOp_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
return forward_inplace_fp16s(bottom_top_blob, opt);
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_inplace_bf16s(bottom_top_blob, opt);
#endif

int elempack = bottom_top_blob.elempack;

@@ -1099,6 +1103,7 @@ static int unary_op_inplace_pack4_bf16s(Mat& a, const Option& opt)
}
#endif // __ARM_NEON

#if NCNN_BF16
template<typename Op>
static int unary_op_inplace_bf16s(Mat& a, const Option& opt)
{
@@ -1375,5 +1380,6 @@ int UnaryOp_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt)

return 0;
}
#endif // NCNN_BF16

} // namespace ncnn

+ 2
- 0
src/layer/arm/unaryop_arm.h View File

@@ -30,7 +30,9 @@ protected:
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn


+ 4
- 0
src/layer/riscv/concat_riscv.cpp View File

@@ -35,7 +35,9 @@ Concat_riscv::Concat_riscv()
#endif
#endif // __riscv_vector

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

int Concat_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
@@ -47,8 +49,10 @@ int Concat_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>
return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
#endif

#if __riscv_vector
const int packn = csrr_vlenb() / 4;


+ 2
- 0
src/layer/riscv/crop_riscv.cpp View File

@@ -35,7 +35,9 @@ Crop_riscv::Crop_riscv()
#endif
#endif // __riscv_vector

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

#if __riscv_vector


+ 4
- 0
src/layer/riscv/flatten_riscv.cpp View File

@@ -35,7 +35,9 @@ Flatten_riscv::Flatten_riscv()
#endif
#endif // __riscv_vector

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

int Flatten_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
@@ -50,8 +52,10 @@ int Flatten_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option&
return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
#endif

#if NCNN_BF16
if (opt.use_bf16_storage && elembits == 16)
return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
#endif

int dims = bottom_blob.dims;



+ 16
- 3
src/layer/riscv/padding_riscv.cpp View File

@@ -39,7 +39,9 @@ Padding_riscv::Padding_riscv()
#endif
#endif // __riscv_vector

#if NCNN_BF16
support_bf16_storage = true;
#endif
}

int Padding_riscv::create_pipeline(const Option& opt)
@@ -51,12 +53,14 @@ int Padding_riscv::create_pipeline(const Option& opt)
}
#endif

#if NCNN_BF16
if (opt.use_bf16_storage)
{
value_bf16 = float32_to_bfloat16(value);

ncnn::cast_float32_to_bfloat16(per_channel_pad_data, per_channel_pad_data_bf16, opt);
}
#endif

return 0;
}
@@ -282,19 +286,28 @@ int Padding_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co
{
Mat borderm = top_blob.channel(q);

#if __riscv_zfh
// clang-format off
// *INDENT-OFF*
vuint16m1_t pad_value;
#if __riscv_zfh
if (opt.use_fp16_storage)
{
pad_value = per_channel_pad_data_size ? vreinterpret_v_f16m1_u16m1(vle16_v_f16m1((const __fp16*)per_channel_pad_data_fp16 + q * packn, vl)) : vreinterpret_v_f16m1_u16m1(vfmv_v_f_f16m1((__fp16)value, vl));
}
else
#endif
#if NCNN_BF16
if (opt.use_bf16_storage)
{
pad_value = per_channel_pad_data_size ? vle16_v_u16m1((const unsigned short*)per_channel_pad_data_bf16 + q * packn, vl) : vmv_v_x_u16m1(value_bf16, vl);
}
#else
vuint16m1_t pad_value = per_channel_pad_data_size ? vle16_v_u16m1((const unsigned short*)per_channel_pad_data_bf16 + q * packn, vl) : vmv_v_x_u16m1(value_bf16, vl);
else
#endif
{
}
// *INDENT-ON*
// clang-format on

//Channel padding
if ((q - front_) < 0 || (q - front_) >= channels)
{


+ 2
- 0
src/layer/riscv/padding_riscv.h View File

@@ -34,9 +34,11 @@ protected:
int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

public:
#if NCNN_BF16
// bf16
unsigned short value_bf16;
Mat per_channel_pad_data_bf16;
#endif

// fp16
Mat per_channel_pad_data_fp16;


+ 10
- 1
src/net.cpp View File

@@ -766,6 +766,7 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio
}
else
#endif // NCNN_RVV
#if NCNN_BF16
if (opt.use_bf16_storage)
{
if (bottom_blob.elembits() == 32 && layer->support_bf16_storage)
@@ -781,6 +782,11 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio
bottom_blob = bottom_blob_fp32;
}
}
else
#endif // NCNN_BF16
{
// no type conversion
}
// *INDENT-ON*
// clang-format on

@@ -2582,6 +2588,7 @@ int Extractor::extract(int blob_index, Mat& feat, int type)
}
else
#endif // NCNN_ARM82
#if NCNN_BF16
if (d->opt.use_bf16_storage && (type == 0))
{
if (feat.elembits() == 16)
@@ -2591,7 +2598,9 @@ int Extractor::extract(int blob_index, Mat& feat, int type)
feat = feat_fp32;
}
}
else if (feat.elembits() == 8 && (type == 0))
else
#endif // NCNN_BF16
if (feat.elembits() == 8 && (type == 0))
{
Mat feat_fp32;
cast_int8_to_float32(feat, feat_fp32, d->opt);


+ 1
- 0
src/platform.h.in View File

@@ -37,6 +37,7 @@
#cmakedefine01 NCNN_MMI
#cmakedefine01 NCNN_RVV
#cmakedefine01 NCNN_INT8
#cmakedefine01 NCNN_BF16

#cmakedefine NCNN_VERSION_STRING "@NCNN_VERSION_STRING@"



Loading…
Cancel
Save