cmake option NCNN_INT8 (#2839)

5 years ago · 7e1aaa5828
--- a/.github/workflows/linux-aarch64-cpu-gcc.yml
+++ b/.github/workflows/linux-aarch64-cpu-gcc.yml
@@ -43,17 +43,28 @@ jobs:
        sudo apt-get update
        sudo apt-get install g++-aarch64-linux-gnu

    - name: configure
      run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DNCNN_ARM82=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
    - name: build
      run: cmake --build build -j 2

      run: |
        mkdir build && cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DNCNN_ARM82=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 2
    - name: test
      run: |
        export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
        cd build
        TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2

    # Configure and build a second tree with int8 inference compiled out
    # (-DNCNN_INT8=OFF) so the non-int8 code paths are built and tested too.
    - name: build-noint8
      run: |
        mkdir build-noint8 && cd build-noint8
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DNCNN_ARM82=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_INT8=OFF ..
        cmake --build . -j 2
    - name: test-noint8
      run: |
        export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
        cd build-noint8
        TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2

  linux-gcc-arm82:
    runs-on: ubuntu-20.04
    steps:
@@ -90,13 +101,24 @@ jobs:
        sudo apt-get update
        sudo apt-get install g++-aarch64-linux-gnu

    - name: configure
      run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DNCNN_ARM82=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
    - name: build
      run: cmake --build build -j 2

      run: |
        mkdir build && cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DNCNN_ARM82=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 2
    - name: test
      run: |
        export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
        cd build
        TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2

    - name: build-noint8
      run: |
        mkdir build-noint8 && cd build-noint8
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DNCNN_ARM82=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_INT8=OFF ..
        cmake --build . -j 2
    - name: test-noint8
      run: |
        export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
        cd build-noint8
        TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2
--- a/.github/workflows/linux-arm-cpu-gcc.yml
+++ b/.github/workflows/linux-arm-cpu-gcc.yml
@@ -43,17 +43,28 @@ jobs:
        sudo apt-get update
        sudo apt-get install g++-arm-linux-gnueabi

    - name: configure
      run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabi.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
    - name: build
      run: cmake --build build -j 2

      run: |
        mkdir build && cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabi.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 2
    - name: test
      run: |
        export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
        cd build
        TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabi" ctest --output-on-failure -j 2

    # Configure and build a second tree with int8 inference compiled out
    # (-DNCNN_INT8=OFF) so the non-int8 code paths are built and tested too.
    - name: build-noint8
      run: |
        mkdir build-noint8 && cd build-noint8
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabi.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_INT8=OFF ..
        cmake --build . -j 2
    - name: test-noint8
      run: |
        export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
        cd build-noint8
        TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabi" ctest --output-on-failure -j 2

  linux-gcc-armhf:
    runs-on: ubuntu-20.04
    steps:
@@ -90,13 +101,24 @@ jobs:
        sudo apt-get update
        sudo apt-get install g++-arm-linux-gnueabihf

    - name: configure
      run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
    - name: build
      run: cmake --build build -j 2

      run: |
        mkdir build && cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 2
    - name: test
      run: |
        export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
        cd build
        TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j 2

    - name: build-noint8
      run: |
        mkdir build-noint8 && cd build-noint8
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_INT8=OFF ..
        cmake --build . -j 2
    - name: test-noint8
      run: |
        export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
        cd build-noint8
        TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j 2
--- a/.github/workflows/linux-x64-cpu-clang.yml
+++ b/.github/workflows/linux-x64-cpu-clang.yml
@@ -47,6 +47,16 @@ jobs:
        cmake --build . -j 2
    - name: test-avx2
      run: cd build-avx2 && ctest --output-on-failure -j 2
    - name: build-noint8
      env:
        CC: clang
        CXX: clang++
      run: |
        mkdir build-noint8 && cd build-noint8
        cmake -DNCNN_INT8=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 2
    - name: test-noint8
      run: cd build-noint8 && ctest --output-on-failure -j 2

  linux-clang-simplestl:
    runs-on: ubuntu-latest
--- a/.github/workflows/linux-x64-cpu-gcc.yml
+++ b/.github/workflows/linux-x64-cpu-gcc.yml
@@ -38,6 +38,13 @@ jobs:
        cmake --build . -j 2
    - name: test-avx2
      run: cd build-avx2 && ctest --output-on-failure -j 2
    - name: build-noint8
      run: |
        mkdir build-noint8 && cd build-noint8
        cmake -DNCNN_INT8=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 2
    - name: test-noint8
      run: cd build-noint8 && ctest --output-on-failure -j 2

  linux-gcc-cpp03-nostdio-nostring-simplestl:
    runs-on: ubuntu-16.04
--- a/.github/workflows/linux-x86-cpu-clang.yml
+++ b/.github/workflows/linux-x86-cpu-clang.yml
@@ -37,3 +37,13 @@ jobs:
        mkdir build-shared && cd build-shared
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.clang-m32.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON -DNCNN_ENABLE_LTO=ON ..
        cmake --build . -j 2
    - name: build-noint8
      env:
        CC: clang
        CXX: clang++
      run: |
        mkdir build-noint8 && cd build-noint8
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.clang-m32.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_INT8=OFF ..
        cmake --build . -j 2
    - name: test-noint8
      run: cd build-noint8 && ctest --output-on-failure -j 2
--- a/.github/workflows/linux-x86-cpu-gcc.yml
+++ b/.github/workflows/linux-x86-cpu-gcc.yml
@@ -31,3 +31,10 @@ jobs:
        mkdir build-shared && cd build-shared
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON -DNCNN_ENABLE_LTO=ON ..
        cmake --build . -j 2
    - name: build-noint8
      run: |
        mkdir build-noint8 && cd build-noint8
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_INT8=OFF ..
        cmake --build . -j 2
    - name: test-noint8
      run: cd build-noint8 && ctest --output-on-failure -j 2
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -79,6 +79,7 @@ option(NCNN_BUILD_TESTS "build tests" OFF)
 option(NCNN_COVERAGE "build for coverage" OFF)
 option(NCNN_BUILD_BENCHMARK "build benchmark" ON)
 option(NCNN_PYTHON "build python api" OFF)
 # NCNN_INT8: compile the int8 inference code paths (the sources guard them
 # with `#if NCNN_INT8`). Defaults to ON; pass -DNCNN_INT8=OFF to leave them out.
 option(NCNN_INT8 "int8 inference" ON)

 if(ANDROID OR IOS OR NCNN_SIMPLESTL OR CMAKE_CROSSCOMPILING)
    option(NCNN_DISABLE_RTTI "disable rtti" ON)
--- a/docs/how-to-use-and-FAQ/build-minimal-library.md
+++ b/docs/how-to-use-and-FAQ/build-minimal-library.md
@@ -39,6 +39,15 @@ cmake -DNCNN_STRING=OFF ..

    Read more [here](https://github.com/Tencent/ncnn/blob/master/docs/how-to-use-and-FAQ/use-ncnn-with-alexnet.md#input-and-output).

 ### disable NCNN_INT8

 ```
 cmake -DNCNN_INT8=OFF ..
 ```

 * Cannot use quantized int8 inference.


 ### drop pixel rotate and affine functions

 ```
--- a/src/layer/arm/convolution_arm.cpp
+++ b/src/layer/arm/convolution_arm.cpp
@@ -29,19 +29,21 @@ namespace ncnn {

 #include "convolution_bf16s.h"
 #include "convolution_sgemm.h"
 #include "convolution_sgemm_int8.h"

 #include "convolution_1x1.h"
 #include "convolution_1x1_bf16s.h"
 #include "convolution_1x1_int8.h"
 #include "convolution_2x2.h"
 #include "convolution_3x3.h"
 #include "convolution_3x3_int8.h"
 #include "convolution_4x4.h"
 #include "convolution_5x5.h"
 #include "convolution_7x7.h"

 #if NCNN_INT8
 #include "convolution_sgemm_int8.h"
 #include "convolution_1x1_int8.h"
 #include "convolution_3x3_int8.h"
 #include "convolution_int8.h"
 #endif // NCNN_INT8

 #if __ARM_NEON
 #include "convolution_pack4.h"
@@ -67,6 +69,7 @@ namespace ncnn {
 #include "convolution_7x7_pack1to4.h"
 #include "convolution_7x7_pack1to4_bf16s.h"

 #if NCNN_INT8
 #include "convolution_pack8_int8.h"
 #include "convolution_pack1to8_int8.h"
 #include "convolution_pack8to1_int8.h"
@@ -80,6 +83,7 @@ namespace ncnn {
 #include "convolution_3x3_pack1to8_int8.h"
 #include "convolution_7x7_pack1to8_int8.h"
 #include "convolution_3x3_pack8to1_int8.h"
 #endif // NCNN_INT8

 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 #include "convolution_fp16s.h"
@@ -169,10 +173,12 @@ int Convolution_arm::create_pipeline(const Option& opt)
        activation->create_pipeline(opt);
    }

 #if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return create_pipeline_int8_arm(opt);
    }
 #endif

 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    if (opt.use_fp16_storage)
@@ -418,10 +424,12 @@ int Convolution_arm::destroy_pipeline(const Option& opt)

 int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
 #if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return forward_int8_arm(bottom_blob, top_blob, opt);
    }
 #endif

    if (bottom_blob.dims != 3)
    {
@@ -1767,6 +1775,7 @@ int Convolution_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const
    return 0;
 }

 #if NCNN_INT8
 int Convolution_arm::create_pipeline_int8_arm(const Option& opt)
 {
    const int maxk = kernel_w * kernel_h;
@@ -2263,6 +2272,7 @@ int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, con

    return 0;
 }
 #endif // NCNN_INT8

 int Convolution_arm::forwardDilation_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
--- a/src/layer/arm/convolution_arm.h
+++ b/src/layer/arm/convolution_arm.h
@@ -37,8 +37,10 @@ protected:
 #endif
    int create_pipeline_bf16s(const Option& opt);
    int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #if NCNN_INT8
    int create_pipeline_int8_arm(const Option& opt);
    int forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif
    int forwardDilation_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

 public:
@@ -67,11 +69,13 @@ public:
    // bf16
    Mat weight_data_bf16;

 #if NCNN_INT8
    // int8
    Mat weight_data_int8;

    //     Mat weight_3x3s2_data_int8;
    std::vector<Mat> weight_3x3_winograd23_data_int8;
 #endif
 };

 } // namespace ncnn
--- a/src/layer/arm/convolutiondepthwise_arm.cpp
+++ b/src/layer/arm/convolutiondepthwise_arm.cpp
@@ -26,16 +26,21 @@
 namespace ncnn {

 #include "convolutiondepthwise_3x3.h"
 #include "convolutiondepthwise_3x3_int8.h"
 #include "convolutiondepthwise_5x5.h"

 #if NCNN_INT8
 #include "convolutiondepthwise_3x3_int8.h"
 #endif // NCNN_INT8

 #if __ARM_NEON
 #include "convolutiondepthwise_3x3_pack4.h"
 #include "convolutiondepthwise_3x3_pack4_bf16s.h"
 #include "convolutiondepthwise_5x5_pack4.h"
 #include "convolutiondepthwise_5x5_pack4_bf16s.h"

 #if NCNN_INT8
 #include "convolutiondepthwise_3x3_pack8_int8.h"
 #endif // NCNN_INT8

 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 #include "convolutiondepthwise_3x3_fp16s.h"
@@ -104,10 +109,12 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt)
        activation->create_pipeline(opt);
    }

 #if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return create_pipeline_int8_arm(opt);
    }
 #endif

    const int maxk = kernel_w * kernel_h;
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
@@ -269,6 +276,7 @@ int ConvolutionDepthWise_arm::create_group_ops(const Option& opt)
            weights[0] = weight_data_g;
            weights[1] = bias_data_g;

 #if NCNN_INT8
            if (int8_scale_term)
            {
                Mat weight_data_int8_scales_g(num_output_g);
@@ -280,6 +288,7 @@ int ConvolutionDepthWise_arm::create_group_ops(const Option& opt)
            {
                weights[4] = top_blob_int8_scales.range(g, 1);
            }
 #endif

            op->load_model(ModelBinFromMatArray(weights));
        }
@@ -288,6 +297,7 @@ int ConvolutionDepthWise_arm::create_group_ops(const Option& opt)
            ncnn::Mat weights[4];
            weights[0] = weight_data_g;

 #if NCNN_INT8
            if (int8_scale_term)
            {
                Mat weight_data_int8_scales_g(num_output_g);
@@ -299,6 +309,7 @@ int ConvolutionDepthWise_arm::create_group_ops(const Option& opt)
            {
                weights[3] = top_blob_int8_scales.range(g, 1);
            }
 #endif

            op->load_model(ModelBinFromMatArray(weights));
        }
@@ -332,13 +343,12 @@ int ConvolutionDepthWise_arm::destroy_pipeline(const Option& opt)

 int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    // convolv with NxN kernel
    // value = value + bias

 #if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return forward_int8_arm(bottom_blob, top_blob, opt);
    }
 #endif

    int elembits = bottom_blob.elembits();

@@ -1447,6 +1457,7 @@ int ConvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blo
    return 0;
 }

 #if NCNN_INT8
 int ConvolutionDepthWise_arm::create_pipeline_int8_arm(const Option& opt)
 {
    const int maxk = kernel_w * kernel_h;
@@ -1981,5 +1992,6 @@ int ConvolutionDepthWise_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_

    return 0;
 }
 #endif // NCNN_INT8

 } // namespace ncnn
--- a/src/layer/arm/convolutiondepthwise_arm.h
+++ b/src/layer/arm/convolutiondepthwise_arm.h
@@ -36,8 +36,10 @@ protected:
    int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif
    int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #if NCNN_INT8
    int create_pipeline_int8_arm(const Option& opt);
    int forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif

 public:
    Layer* activation;
@@ -54,8 +56,10 @@ public:
    Mat weight_data_bf16;
    Mat weight_data_pack4_bf16;

 #if NCNN_INT8
    // int8
    Mat weight_data_int8;
 #endif
 };

 } // namespace ncnn
--- a/src/layer/arm/innerproduct_arm.cpp
+++ b/src/layer/arm/innerproduct_arm.cpp
@@ -55,10 +55,12 @@ int InnerProduct_arm::create_pipeline(const Option& opt)
    }
 #endif // __ARM_NEON

 #if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return create_pipeline_int8_arm(opt);
    }
 #endif

 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    if (opt.use_fp16_storage)
@@ -94,260 +96,14 @@ int InnerProduct_arm::destroy_pipeline(const Option& opt)
    return 0;
 }

 // Prepare the int8 pipeline for this inner-product layer.
 //
 // - When activation_type == 1, creates a ReLU layer and loads an empty
 //   ParamDict into it.
 // - Repacks weight_data (read as num_output rows of num_input signed chars)
 //   into weight_data_int8, interleaving out_elempack consecutive output rows
 //   per input index.
 //
 // opt: only opt.use_packing_layout is read here; it selects out_elempack = 8
 //      when num_output is a multiple of 8, otherwise 1.
 // Returns 0 unconditionally.
 int InnerProduct_arm::create_pipeline_int8_arm(const Option& opt)
 {
    if (activation_type == 1)
    {
        activation = ncnn::create_layer(ncnn::LayerType::ReLU);

        ncnn::ParamDict pd;
        activation->load_param(pd);
    }

    // number of input elements per output, derived from the flat weight count
    const int num_input = weight_data_size / num_output;

    int out_elempack = 1;

    if (opt.use_packing_layout)
    {
        // pack 8 outputs together only when num_output divides evenly
        out_elempack = num_output % 8 == 0 ? 8 : 1;
    }

    // src = inch-outch
    // dst = pb-inch-outch/pb
    {
        Mat weight_data_r2 = weight_data.reshape(num_input, num_output);

        // NOTE(review): the result of create() is not checked for emptiness here
        weight_data_int8.create(num_input, num_output / out_elempack, (size_t)out_elempack, out_elempack);

        // each destination row q / out_elempack holds out_elempack source rows,
        // written as [p][j]: for every input index p, the weights of outputs q..q+out_elempack-1
        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
        {
            signed char* g0 = weight_data_int8.row<signed char>(q / out_elempack);

            for (int p = 0; p < num_input; p++)
            {
                for (int j = 0; j < out_elempack; j++)
                {
                    *g0++ = weight_data_r2.row<signed char>(q + j)[p];
                }
            }
        }
    }

    // The block below is a disabled aarch64 reorder/pre-scaled path; it is
    // commented out and does not run.
    //     // convert fp32 to int8
    //     if (weight_data_int8_scales.empty())
    //     {
    //         return 0;
    //     }
    // #if __aarch64__
    //     // first reorder Matrix A before MatMul
    //     const int n = num_output;
    //     const int k = weight_data.total() / n;
    //     weight_data_int8.create(n * k, (size_t)1u, opt.blob_allocator);
    //
    //     int8_t* b = weight_data;
    //     int8_t* sb = weight_data_int8;
    //     reorder_a(b, sb, n, k, k);
    //
    //     // pre-built scales
    //     scales_in.create(num_output, 4u, opt.blob_allocator);
    //     for (int i = 0; i < num_output; ++i)
    //     {
    //         if (std::fabs(static_cast<float>(weight_data_int8_scales[i])) <= 1e-6)
    //         {
    //             scales_in[i] = 0.f;
    //         }
    //         else
    //         {
    //             scales_in[i] = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[i]);
    //         }
    //     }
    // #endif
    return 0;
 }

 // int8 forward pass of the inner-product layer.
 //
 // Flow:
 //  1. A 2-D input whose width equals num_input and that has more than one
 //     row (h * elempack > 1) is unpacked to elempack 1 and handed to
 //     forward_int8 (presumably the generic implementation - defined outside
 //     this view).
 //  2. Otherwise the input is quantized to int8 when its elembits != 8 and
 //     flattened when it is not already 1-D.
 //  3. For every output (or group of 8 outputs when packed) an int32 dot
 //     product of the input and weight_data_int8 is accumulated.
 //  4. The int32 sums are dequantized into top_blob with per-output scale
 //     1 / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p])
 //     (0 when the weight scale is 0); bias_data is passed to the same call.
 //  5. The activation layer, if any, is applied in place.
 //
 // Returns -100 when top_blob or the int32 scratch blob is empty after
 // create(), the result of forward_int8 on the gemm path, and 0 otherwise.
 int InnerProduct_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    const int num_input = weight_data_size / num_output;

    if (bottom_blob.dims == 2 && bottom_blob.w == num_input && bottom_blob.h * bottom_blob.elempack > 1)
    {
        // gemm
        Mat bottom_blob_unpacked;
        Option opt_unpack = opt;
        // the unpacked copy is temporary, so it is taken from the workspace allocator
        opt_unpack.blob_allocator = opt.workspace_allocator;
        convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_unpack);

        return forward_int8(bottom_blob_unpacked, top_blob, opt);
    }

    int elembits = bottom_blob.elembits();

    Mat bottom_blob_int8 = bottom_blob;
    if (elembits != 8)
    {
        // input is not int8 yet: quantize it with the stored input scales
        // NOTE(review): the result of quantize_to_int8 is not checked
        Option opt_q = opt;
        opt_q.blob_allocator = opt.workspace_allocator;
        quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q);
    }

    Mat bottom_blob_int8_flattened = bottom_blob_int8;
    if (bottom_blob_int8.dims != 1)
    {
        // NOTE(review): the return value of flatten->forward is not checked
        Option opt_flatten = opt;
        opt_flatten.blob_allocator = opt.workspace_allocator;
        flatten->forward(bottom_blob_int8, bottom_blob_int8_flattened, opt_flatten);
    }

    //     int elempack = bottom_blob_int8_flattened.elempack;

    // same packing choice as in create_pipeline_int8_arm
    int out_elempack = 1;
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 8 == 0 ? 8 : 1;
    }

    top_blob.create(num_output / out_elempack, (size_t)(4u * out_elempack), out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // scratch blob holding the raw int32 accumulators before dequantization
    Mat top_blob_int32;
    top_blob_int32.create(num_output / out_elempack, (size_t)(4u * out_elempack), out_elempack, opt.workspace_allocator);
    if (top_blob_int32.empty())
        return -100;

    // NOTE(review): the out_elempack == 8 loop only exists under __ARM_NEON.
    // If packing were selected without NEON, neither loop below would fill
    // top_blob_int32 - confirm that combination cannot occur.
 #if __ARM_NEON
    if (out_elempack == 8)
    {
        // num_output
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < num_output / out_elempack; p++)
        {
            // two 4-lane int32 accumulators cover the 8 packed outputs
            int32x4_t _sum0 = vdupq_n_s32(0);
            int32x4_t _sum1 = vdupq_n_s32(0);

            const signed char* kptr = weight_data_int8.row<const signed char>(p);
            const signed char* sptr = bottom_blob_int8_flattened;

            int i = 0;
            for (; i < num_input; i++)
            {
                // broadcast one input value and multiply it with 8 weights
                int8x8_t _val = vdup_n_s8(sptr[0]);

                int8x8_t _w = vld1_s8(kptr);

                // widen to int16 products, then accumulate into int32 lanes
                int16x8_t _s0 = vmull_s8(_val, _w);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0));

                sptr += 1;
                kptr += 8;
            }

            int* outptr = (int*)top_blob_int32;
            vst1q_s32(outptr + p * 8, _sum0);
            vst1q_s32(outptr + p * 8 + 4, _sum1);
        }
    }
 #endif // __ARM_NEON

    if (out_elempack == 1)
    {
        // num_output
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < num_output / out_elempack; p++)
        {
            int sum = 0;

            const signed char* kptr = weight_data_int8.row<const signed char>(p);
            const signed char* sptr = bottom_blob_int8_flattened;

            // scalar dot product of the input with weight row p
            int i = 0;
            for (; i < num_input; i++)
            {
                signed char val = sptr[0];

                signed char w = kptr[0];

                sum += val * w;

                sptr += 1;
                kptr += 1;
            }

            int* outptr = (int*)top_blob_int32;
            outptr[p] = sum;
        }
    }

    // per-output dequantization scale; a zero weight scale maps to 0 to avoid dividing by zero
    Mat scale_data(num_output);
    for (int p = 0; p < num_output; p++)
    {
        // dequantize
        float scale_in;
        if (weight_data_int8_scales[p] == 0)
            scale_in = 0;
        else
            scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);

        scale_data[p] = scale_in;
    }

    dequantize_from_int32(top_blob_int32, top_blob, scale_data, bias_data, opt);

    if (activation)
    {
        activation->forward_inplace(top_blob, opt);
    }

    return 0;

    // The block below is a disabled aarch64 int8kernel path; it is commented
    // out and sits after the return above.
    // #if __aarch64__
    //     const int w = bottom_blob_tm.w;
    //     const int h = bottom_blob_tm.h;
    //
    //     const int m = 1;
    //     const int k = bottom_blob_tm.c * w * h;
    //     Mat bottom_blob_reorder(m * k, (size_t)1u, opt.workspace_allocator);
    //     {
    //         reorder_a(bottom_blob_tm_flattened, bottom_blob_reorder, m, k, k);
    //     }
    //
    //     Mat top_blob_tm(m * num_output, (size_t)4u, opt.workspace_allocator);
    //     int32_t* pc = top_blob_tm;
    //     const int8_t* pa = bottom_blob_reorder;
    //     const int8_t* pb = weight_data_int8;
    //     int8kernel((void*)pc, pa, pb, m, k, num_output, num_output, 0, 0, opt);
    //
    //     float* outptr = top_blob;
    //
    //     // dequant.fused.relu int32_t to float
    //     for (int p = 0; p < num_output; ++p)
    //     {
    //         float sumfp32 = pc[p] * scales_in[p];
    //         if (bias_term)
    //         {
    //             sumfp32 += bias_data[p];
    //         }
    //         if (1 == activation_type)
    //         {
    //             sumfp32 = std::max(0.f, sumfp32);
    //         }
    //
    //         outptr[p] = sumfp32;
    //     }
    //     return 0;
    // #else
    //     return InnerProduct::forward_int8(bottom_blob, top_blob, opt);
    // #endif
 }

 int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
 #if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return forward_int8_arm(bottom_blob, top_blob, opt);
    }
 #endif

    int elembits = bottom_blob.elembits();

@@ -2140,4 +1896,254 @@ int InnerProduct_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const
    return 0;
 }

 #if NCNN_INT8
 // Prepare the int8 pipeline for this inner-product layer.
 //
 // - When activation_type == 1, creates a ReLU layer and loads an empty
 //   ParamDict into it.
 // - Repacks weight_data (read as num_output rows of num_input signed chars)
 //   into weight_data_int8, interleaving out_elempack consecutive output rows
 //   per input index.
 //
 // opt: only opt.use_packing_layout is read here; it selects out_elempack = 8
 //      when num_output is a multiple of 8, otherwise 1.
 // Returns 0 unconditionally.
 int InnerProduct_arm::create_pipeline_int8_arm(const Option& opt)
 {
    if (activation_type == 1)
    {
        activation = ncnn::create_layer(ncnn::LayerType::ReLU);

        ncnn::ParamDict pd;
        activation->load_param(pd);
    }

    // number of input elements per output, derived from the flat weight count
    const int num_input = weight_data_size / num_output;

    int out_elempack = 1;

    if (opt.use_packing_layout)
    {
        // pack 8 outputs together only when num_output divides evenly
        out_elempack = num_output % 8 == 0 ? 8 : 1;
    }

    // src = inch-outch
    // dst = pb-inch-outch/pb
    {
        Mat weight_data_r2 = weight_data.reshape(num_input, num_output);

        // NOTE(review): the result of create() is not checked for emptiness here
        weight_data_int8.create(num_input, num_output / out_elempack, (size_t)out_elempack, out_elempack);

        // each destination row q / out_elempack holds out_elempack source rows,
        // written as [p][j]: for every input index p, the weights of outputs q..q+out_elempack-1
        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
        {
            signed char* g0 = weight_data_int8.row<signed char>(q / out_elempack);

            for (int p = 0; p < num_input; p++)
            {
                for (int j = 0; j < out_elempack; j++)
                {
                    *g0++ = weight_data_r2.row<signed char>(q + j)[p];
                }
            }
        }
    }

    // The block below is a disabled aarch64 reorder/pre-scaled path; it is
    // commented out and does not run.
    //     // convert fp32 to int8
    //     if (weight_data_int8_scales.empty())
    //     {
    //         return 0;
    //     }
    // #if __aarch64__
    //     // first reorder Matrix A before MatMul
    //     const int n = num_output;
    //     const int k = weight_data.total() / n;
    //     weight_data_int8.create(n * k, (size_t)1u, opt.blob_allocator);
    //
    //     int8_t* b = weight_data;
    //     int8_t* sb = weight_data_int8;
    //     reorder_a(b, sb, n, k, k);
    //
    //     // pre-built scales
    //     scales_in.create(num_output, 4u, opt.blob_allocator);
    //     for (int i = 0; i < num_output; ++i)
    //     {
    //         if (std::fabs(static_cast<float>(weight_data_int8_scales[i])) <= 1e-6)
    //         {
    //             scales_in[i] = 0.f;
    //         }
    //         else
    //         {
    //             scales_in[i] = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[i]);
    //         }
    //     }
    // #endif
    return 0;
 }

 // int8 forward pass of the inner-product layer.
 //
 // Flow:
 //  1. A 2-D input whose width equals num_input and that has more than one
 //     row (h * elempack > 1) is unpacked to elempack 1 and handed to
 //     forward_int8 (presumably the generic implementation - defined outside
 //     this view).
 //  2. Otherwise the input is quantized to int8 when its elembits != 8 and
 //     flattened when it is not already 1-D.
 //  3. For every output (or group of 8 outputs when packed) an int32 dot
 //     product of the input and weight_data_int8 is accumulated.
 //  4. The int32 sums are dequantized into top_blob with per-output scale
 //     1 / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p])
 //     (0 when the weight scale is 0); bias_data is passed to the same call.
 //  5. The activation layer, if any, is applied in place.
 //
 // Returns -100 when top_blob or the int32 scratch blob is empty after
 // create(), the result of forward_int8 on the gemm path, and 0 otherwise.
 int InnerProduct_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    const int num_input = weight_data_size / num_output;

    if (bottom_blob.dims == 2 && bottom_blob.w == num_input && bottom_blob.h * bottom_blob.elempack > 1)
    {
        // gemm
        Mat bottom_blob_unpacked;
        Option opt_unpack = opt;
        // the unpacked copy is temporary, so it is taken from the workspace allocator
        opt_unpack.blob_allocator = opt.workspace_allocator;
        convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_unpack);

        return forward_int8(bottom_blob_unpacked, top_blob, opt);
    }

    int elembits = bottom_blob.elembits();

    Mat bottom_blob_int8 = bottom_blob;
    if (elembits != 8)
    {
        // input is not int8 yet: quantize it with the stored input scales
        // NOTE(review): the result of quantize_to_int8 is not checked
        Option opt_q = opt;
        opt_q.blob_allocator = opt.workspace_allocator;
        quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q);
    }

    Mat bottom_blob_int8_flattened = bottom_blob_int8;
    if (bottom_blob_int8.dims != 1)
    {
        // NOTE(review): the return value of flatten->forward is not checked
        Option opt_flatten = opt;
        opt_flatten.blob_allocator = opt.workspace_allocator;
        flatten->forward(bottom_blob_int8, bottom_blob_int8_flattened, opt_flatten);
    }

    //     int elempack = bottom_blob_int8_flattened.elempack;

    // same packing choice as in create_pipeline_int8_arm
    int out_elempack = 1;
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 8 == 0 ? 8 : 1;
    }

    top_blob.create(num_output / out_elempack, (size_t)(4u * out_elempack), out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // scratch blob holding the raw int32 accumulators before dequantization
    Mat top_blob_int32;
    top_blob_int32.create(num_output / out_elempack, (size_t)(4u * out_elempack), out_elempack, opt.workspace_allocator);
    if (top_blob_int32.empty())
        return -100;

    // NOTE(review): the out_elempack == 8 loop only exists under __ARM_NEON.
    // If packing were selected without NEON, neither loop below would fill
    // top_blob_int32 - confirm that combination cannot occur.
 #if __ARM_NEON
    if (out_elempack == 8)
    {
        // num_output
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < num_output / out_elempack; p++)
        {
            // two 4-lane int32 accumulators cover the 8 packed outputs
            int32x4_t _sum0 = vdupq_n_s32(0);
            int32x4_t _sum1 = vdupq_n_s32(0);

            const signed char* kptr = weight_data_int8.row<const signed char>(p);
            const signed char* sptr = bottom_blob_int8_flattened;

            int i = 0;
            for (; i < num_input; i++)
            {
                // broadcast one input value and multiply it with 8 weights
                int8x8_t _val = vdup_n_s8(sptr[0]);

                int8x8_t _w = vld1_s8(kptr);

                // widen to int16 products, then accumulate into int32 lanes
                int16x8_t _s0 = vmull_s8(_val, _w);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0));

                sptr += 1;
                kptr += 8;
            }

            int* outptr = (int*)top_blob_int32;
            vst1q_s32(outptr + p * 8, _sum0);
            vst1q_s32(outptr + p * 8 + 4, _sum1);
        }
    }
 #endif // __ARM_NEON

    if (out_elempack == 1)
    {
        // num_output
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < num_output / out_elempack; p++)
        {
            int sum = 0;

            const signed char* kptr = weight_data_int8.row<const signed char>(p);
            const signed char* sptr = bottom_blob_int8_flattened;

            // scalar dot product of the input with weight row p
            int i = 0;
            for (; i < num_input; i++)
            {
                signed char val = sptr[0];

                signed char w = kptr[0];

                sum += val * w;

                sptr += 1;
                kptr += 1;
            }

            int* outptr = (int*)top_blob_int32;
            outptr[p] = sum;
        }
    }

    // per-output dequantization scale; a zero weight scale maps to 0 to avoid dividing by zero
    Mat scale_data(num_output);
    for (int p = 0; p < num_output; p++)
    {
        // dequantize
        float scale_in;
        if (weight_data_int8_scales[p] == 0)
            scale_in = 0;
        else
            scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);

        scale_data[p] = scale_in;
    }

    dequantize_from_int32(top_blob_int32, top_blob, scale_data, bias_data, opt);

    if (activation)
    {
        activation->forward_inplace(top_blob, opt);
    }

    return 0;

    // The block below is a disabled aarch64 int8kernel path; it is commented
    // out and sits after the return above.
    // #if __aarch64__
    //     const int w = bottom_blob_tm.w;
    //     const int h = bottom_blob_tm.h;
    //
    //     const int m = 1;
    //     const int k = bottom_blob_tm.c * w * h;
    //     Mat bottom_blob_reorder(m * k, (size_t)1u, opt.workspace_allocator);
    //     {
    //         reorder_a(bottom_blob_tm_flattened, bottom_blob_reorder, m, k, k);
    //     }
    //
    //     Mat top_blob_tm(m * num_output, (size_t)4u, opt.workspace_allocator);
    //     int32_t* pc = top_blob_tm;
    //     const int8_t* pa = bottom_blob_reorder;
    //     const int8_t* pb = weight_data_int8;
    //     int8kernel((void*)pc, pa, pb, m, k, num_output, num_output, 0, 0, opt);
    //
    //     float* outptr = top_blob;
    //
    //     // dequant.fused.relu int32_t to float
    //     for (int p = 0; p < num_output; ++p)
    //     {
    //         float sumfp32 = pc[p] * scales_in[p];
    //         if (bias_term)
    //         {
    //             sumfp32 += bias_data[p];
    //         }
    //         if (1 == activation_type)
    //         {
    //             sumfp32 = std::max(0.f, sumfp32);
    //         }
    //
    //         outptr[p] = sumfp32;
    //     }
    //     return 0;
    // #else
    //     return InnerProduct::forward_int8(bottom_blob, top_blob, opt);
    // #endif
 }
 #endif // NCNN_INT8

 } // namespace ncnn
--- a/src/layer/arm/innerproduct_arm.h
+++ b/src/layer/arm/innerproduct_arm.h
@@ -39,9 +39,10 @@ protected:
 #endif
    int create_pipeline_bf16s(const Option& opt);
    int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

 #if NCNN_INT8
    int create_pipeline_int8_arm(const Option& opt);
    int forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif

 public:
    Layer* flatten;
@@ -54,9 +55,11 @@ public:
    // bf16
    Mat weight_data_bf16;

 #if NCNN_INT8
    // int8
    Mat weight_data_int8;
    Mat scales_in;
 #endif
 };

 } // namespace ncnn
--- a/src/layer/convolution.cpp
+++ b/src/layer/convolution.cpp
@@ -47,7 +47,12 @@ int Convolution::load_param(const ParamDict& pd)

    if (int8_scale_term)
    {
 #if NCNN_INT8
        support_int8_storage = true;
 #else
        NCNN_LOGE("please build ncnn with NCNN_INT8 enabled for int8 inference");
        return -1;
 #endif
    }

    return 0;
@@ -66,6 +71,7 @@ int Convolution::load_model(const ModelBin& mb)
            return -100;
    }

 #if NCNN_INT8
    if (int8_scale_term)
    {
        weight_data_int8_scales = mb.load(num_output, 1);
@@ -76,12 +82,14 @@ int Convolution::load_model(const ModelBin& mb)
    {
        top_blob_int8_scales = mb.load(1, 1);
    }
 #endif // NCNN_INT8

    return 0;
 }

 int Convolution::create_pipeline(const Option& opt)
 {
 #if NCNN_INT8
    // runtime quantize the weight data
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)4u && int8_scale_term)
    {
@@ -101,6 +109,7 @@ int Convolution::create_pipeline(const Option& opt)

        weight_data = weight_data_int8.reshape(weight_data_size);
    }
 #endif // NCNN_INT8

    return 0;
 }
@@ -110,10 +119,12 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op
    // convolv with NxN kernel
    // value = value + bias

 #if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return forward_int8(bottom_blob, top_blob, opt);
    }
 #endif

    // flattened blob, implement as InnerProduct
    if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
@@ -140,11 +151,13 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op
            weights[0] = weight_data;
            weights[1] = bias_data;

 #if NCNN_INT8
            if (int8_scale_term)
            {
                weights[2] = weight_data_int8_scales;
                weights[3] = bottom_blob_int8_scales;
            }
 #endif

            op->load_model(ModelBinFromMatArray(weights));

@@ -327,6 +340,7 @@ void Convolution::make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered
    }
 }

 #if NCNN_INT8
 static inline signed char float2int8(float v)
 {
    int int32 = static_cast<int>(round(v));
@@ -492,5 +506,6 @@ int Convolution::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Optio

    return 0;
 }
 #endif // NCNN_INT8

 } // namespace ncnn
--- a/src/layer/convolution.h
+++ b/src/layer/convolution.h
@@ -35,7 +35,9 @@ public:
 protected:
    void make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, const Option& opt) const;

 #if NCNN_INT8
    int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif

 public:
    // param
@@ -65,9 +67,11 @@ public:
    Mat weight_data;
    Mat bias_data;

 #if NCNN_INT8
    Mat weight_data_int8_scales;
    Mat bottom_blob_int8_scales;
    Mat top_blob_int8_scales;
 #endif

    // implementation type, 0 means do not use auto pack model
    int impl_type;
--- a/src/layer/convolutiondepthwise.cpp
+++ b/src/layer/convolutiondepthwise.cpp
@@ -53,7 +53,12 @@ int ConvolutionDepthWise::load_param(const ParamDict& pd)

    if (int8_scale_term)
    {
 #if NCNN_INT8
        support_int8_storage = true;
 #else
        NCNN_LOGE("please build ncnn with NCNN_INT8 enabled for int8 inference");
        return -1;
 #endif
    }

    return 0;
@@ -72,6 +77,7 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb)
            return -100;
    }

 #if NCNN_INT8
    if (int8_scale_term == 1 || int8_scale_term == 101)
    {
        weight_data_int8_scales = mb.load(group, 1);
@@ -104,12 +110,14 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb)
        top_blob_int8_scales = Mat(group);
        top_blob_int8_scales.fill(top_blob_int8_scale);
    }
 #endif // NCNN_INT8

    return 0;
 }

 int ConvolutionDepthWise::create_pipeline(const Option& opt)
 {
 #if NCNN_INT8
    // runtime quantize the weight data
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)4u && int8_scale_term)
    {
@@ -133,6 +141,7 @@ int ConvolutionDepthWise::create_pipeline(const Option& opt)

        weight_data = int8_weight_data;
    }
 #endif // NCNN_INT8

    return 0;
 }
@@ -142,10 +151,12 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const O
    // convolv with NxN kernel
    // value = value + bias

 #if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return forward_int8(bottom_blob, top_blob, opt);
    }
 #endif

    int w = bottom_blob.w;
    int h = bottom_blob.h;
@@ -403,6 +414,7 @@ void ConvolutionDepthWise::make_padding(const Mat& bottom_blob, Mat& bottom_blob
    }
 }

 #if NCNN_INT8
 static inline signed char float2int8(float v)
 {
    int int32 = static_cast<int>(round(v));
@@ -694,5 +706,6 @@ int ConvolutionDepthWise::forward_int8(const Mat& bottom_blob, Mat& top_blob, co

    return 0;
 }
 #endif // NCNN_INT8

 } // namespace ncnn
--- a/src/layer/convolutiondepthwise.h
+++ b/src/layer/convolutiondepthwise.h
@@ -35,7 +35,9 @@ public:
 protected:
    void make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, const Option& opt) const;

 #if NCNN_INT8
    int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif

 public:
    // param
@@ -66,9 +68,11 @@ public:
    Mat weight_data;
    Mat bias_data;

 #if NCNN_INT8
    Mat weight_data_int8_scales;
    Mat bottom_blob_int8_scales;
    Mat top_blob_int8_scales;
 #endif
 };

 } // namespace ncnn
--- a/src/layer/innerproduct.cpp
+++ b/src/layer/innerproduct.cpp
@@ -35,7 +35,12 @@ int InnerProduct::load_param(const ParamDict& pd)

    if (int8_scale_term)
    {
 #if NCNN_INT8
        support_int8_storage = true;
 #else
        NCNN_LOGE("please build ncnn with NCNN_INT8 enabled for int8 inference");
        return -1;
 #endif
    }

    return 0;
@@ -54,17 +59,20 @@ int InnerProduct::load_model(const ModelBin& mb)
            return -100;
    }

 #if NCNN_INT8
    if (int8_scale_term)
    {
        weight_data_int8_scales = mb.load(num_output, 1);
        bottom_blob_int8_scales = mb.load(1, 1);
    }
 #endif // NCNN_INT8

    return 0;
 }

 int InnerProduct::create_pipeline(const Option& opt)
 {
 #if NCNN_INT8
    // runtime quantize the weight data
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)4u && int8_scale_term)
    {
@@ -81,16 +89,19 @@ int InnerProduct::create_pipeline(const Option& opt)

        weight_data = weight_data_int8.reshape(weight_data_size);
    }
 #endif // NCNN_INT8

    return 0;
 }

 int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
 #if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return forward_int8(bottom_blob, top_blob, opt);
    }
 #endif

    const int num_input = weight_data_size / num_output;

@@ -218,6 +229,7 @@ int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o
    return 0;
 }

 #if NCNN_INT8
 int InnerProduct::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    const int num_input = weight_data_size / num_output;
@@ -332,5 +344,6 @@ int InnerProduct::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opti

    return 0;
 }
 #endif // NCNN_INT8

 } // namespace ncnn
--- a/src/layer/innerproduct.h
+++ b/src/layer/innerproduct.h
@@ -33,7 +33,9 @@ public:
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

 protected:
 #if NCNN_INT8
    int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif

 public:
    // param
@@ -52,8 +54,10 @@ public:
    Mat weight_data;
    Mat bias_data;

 #if NCNN_INT8
    Mat weight_data_int8_scales;
    Mat bottom_blob_int8_scales;
 #endif
 };

 } // namespace ncnn
--- a/src/layer/x86/convolution_x86.cpp
+++ b/src/layer/x86/convolution_x86.cpp
@@ -35,18 +35,21 @@ namespace ncnn {
 #include "convolution_5x5.h"
 #include "convolution_7x7.h"

 #if NCNN_INT8
 #include "convolution_sgemm_int8.h"
 #include "convolution_1x1_int8.h"
 #include "convolution_3x3_int8.h"

 #include "convolution_int8.h"
 #endif // NCNN_INT8

 #if __SSE2__
 #include "convolution_1x1_pack4.h"

 #if NCNN_INT8
 #include "convolution_pack8_int8.h"
 #include "convolution_pack1to8_int8.h"
 #include "convolution_pack8to1_int8.h"
 #endif // NCNN_INT8
 #if __AVX__
 #include "convolution_3x3_pack1to8.h"
 #include "convolution_3x3_pack8to1.h"
@@ -118,10 +121,12 @@ int Convolution_x86::create_pipeline(const Option& opt)
        activation->create_pipeline(opt);
    }

 #if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return create_pipeline_int8_x86(opt);
    }
 #endif

    int kernel_size = kernel_w * kernel_h;
    int num_input = weight_data_size / kernel_size / num_output;
@@ -311,10 +316,12 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
    // convolv with NxN kernel
    // value = value + bias

 #if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return forward_int8_x86(bottom_blob, top_blob, opt);
    }
 #endif

    if (bottom_blob.dims != 3)
    {
@@ -1058,6 +1065,7 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
    return 0;
 }

 #if NCNN_INT8
 int Convolution_x86::create_pipeline_int8_x86(const Option& opt)
 {
    const int maxk = kernel_w * kernel_h;
@@ -1410,6 +1418,7 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con

    return 0;
 }
 #endif // NCNN_INT8

 int Convolution_x86::forwardDilation_x86(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
--- a/src/layer/x86/convolution_x86.h
+++ b/src/layer/x86/convolution_x86.h
@@ -30,8 +30,10 @@ public:
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

 protected:
 #if NCNN_INT8
    int create_pipeline_int8_x86(const Option& opt);
    int forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif
    int forwardDilation_x86(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

 public:
@@ -47,9 +49,11 @@ public:

    Mat weight_3x3_winograd64_data_pack8;

 #if NCNN_INT8
    // int8
    Mat weight_data_int8;
    Mat weight_3x3_winograd23_data_int8;
 #endif
 };

 } // namespace ncnn
--- a/src/layer/x86/convolutiondepthwise_x86.cpp
+++ b/src/layer/x86/convolutiondepthwise_x86.cpp
@@ -36,7 +36,9 @@ namespace ncnn {
 #endif
 #endif // __SSE2__
 #include "convolutiondepthwise_3x3.h"
 #if NCNN_INT8
 #include "convolutiondepthwise_3x3_int8.h"
 #endif // NCNN_INT8

 ConvolutionDepthWise_x86::ConvolutionDepthWise_x86()
 {
@@ -102,10 +104,12 @@ int ConvolutionDepthWise_x86::create_pipeline(const Option& opt)
        activation->create_pipeline(opt);
    }

 #if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return create_pipeline_int8_x86(opt);
    }
 #endif

    const int maxk = kernel_w * kernel_h;
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
@@ -235,6 +239,7 @@ int ConvolutionDepthWise_x86::create_group_ops(const Option& opt)
            weights[0] = weight_data_g;
            weights[1] = bias_data_g;

 #if NCNN_INT8
            if (int8_scale_term)
            {
                Mat weight_data_int8_scales_g(num_output_g);
@@ -246,6 +251,7 @@ int ConvolutionDepthWise_x86::create_group_ops(const Option& opt)
            {
                weights[4] = top_blob_int8_scales.range(g, 1);
            }
 #endif

            op->load_model(ModelBinFromMatArray(weights));
        }
@@ -254,6 +260,7 @@ int ConvolutionDepthWise_x86::create_group_ops(const Option& opt)
            ncnn::Mat weights[4];
            weights[0] = weight_data_g;

 #if NCNN_INT8
            if (int8_scale_term)
            {
                Mat weight_data_int8_scales_g(num_output_g);
@@ -265,6 +272,7 @@ int ConvolutionDepthWise_x86::create_group_ops(const Option& opt)
            {
                weights[3] = top_blob_int8_scales.range(g, 1);
            }
 #endif

            op->load_model(ModelBinFromMatArray(weights));
        }
@@ -298,13 +306,12 @@ int ConvolutionDepthWise_x86::destroy_pipeline(const Option& opt)

 int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    // convolv with NxN kernel
    // value = value + bias

 #if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return forward_int8_x86(bottom_blob, top_blob, opt);
    }
 #endif

    int w = bottom_blob.w;
    int h = bottom_blob.h;
@@ -628,6 +635,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
    return 0;
 }

 #if NCNN_INT8
 int ConvolutionDepthWise_x86::create_pipeline_int8_x86(const Option& opt)
 {
    const int maxk = kernel_w * kernel_h;
@@ -1061,5 +1069,6 @@ int ConvolutionDepthWise_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_

    return 0;
 }
 #endif // NCNN_INT8

 } // namespace ncnn
--- a/src/layer/x86/convolutiondepthwise_x86.h
+++ b/src/layer/x86/convolutiondepthwise_x86.h
@@ -31,8 +31,10 @@ public:

 protected:
    int create_group_ops(const Option& opt);
 #if NCNN_INT8
    int create_pipeline_int8_x86(const Option& opt);
    int forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif

 public:
    Layer* activation;
@@ -41,8 +43,10 @@ public:
    // packing
    Mat weight_data_packed;

 #if NCNN_INT8
    // int8
    Mat weight_data_int8;
 #endif
 };

 } // namespace ncnn
--- a/src/layer/x86/innerproduct_x86.cpp
+++ b/src/layer/x86/innerproduct_x86.cpp
@@ -54,10 +54,12 @@ int InnerProduct_x86::create_pipeline(const Option& opt)
        flatten->create_pipeline(opt);
    }

 #if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return create_pipeline_int8_x86(opt);
    }
 #endif

    const int num_input = weight_data_size / num_output;

@@ -124,10 +126,12 @@ int InnerProduct_x86::destroy_pipeline(const Option& opt)

 int InnerProduct_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
 #if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return forward_int8_x86(bottom_blob, top_blob, opt);
    }
 #endif

    const int num_input = weight_data_size / num_output;

@@ -1694,6 +1698,7 @@ int InnerProduct_x86::forward_fp16(const Mat& bottom_blob, Mat& top_blob, const
 }
 #endif // __AVX__

 #if NCNN_INT8
 int InnerProduct_x86::create_pipeline_int8_x86(const Option& opt)
 {
    if (activation_type == 1)
@@ -1883,5 +1888,6 @@ int InnerProduct_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, co

    return 0;
 }
 #endif // NCNN_INT8

 } // namespace ncnn
--- a/src/layer/x86/innerproduct_x86.h
+++ b/src/layer/x86/innerproduct_x86.h
@@ -34,9 +34,10 @@ public:

 protected:
    int forward_fp16(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

 #if NCNN_INT8
    int create_pipeline_int8_x86(const Option& opt);
    int forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif

 public:
    Layer* flatten;
@@ -47,9 +48,11 @@ public:
    // fp16 weight data
    Mat weight_data_fp16;

 #if NCNN_INT8
    // int8
    Mat weight_data_int8;
    Mat scales_in;
 #endif
 };

 } // namespace ncnn
--- a/src/platform.h.in
+++ b/src/platform.h.in
@@ -30,6 +30,7 @@
 #cmakedefine01 NCNN_RUNTIME_CPU
 #cmakedefine01 NCNN_AVX2
 #cmakedefine01 NCNN_ARM82
 #cmakedefine01 NCNN_INT8

 #cmakedefine NCNN_VERSION_STRING "@NCNN_VERSION_STRING@"

--- a/tests/test_convolution.cpp
+++ b/tests/test_convolution.cpp
@@ -171,6 +171,7 @@ static int test_convolution_2()
           || test_convolution_vec(64, 128, 1, 1, 1, 0, 0);
 }

 #if NCNN_INT8
 static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, bool requant = false)
 {
    ncnn::Mat a = RandomMat(w, h, c);
@@ -298,12 +299,20 @@ static int test_convolution_1()
           || test_convolution_int8(6, 7, 64, 64, 3, 1, 2, 0, 1)
           || test_convolution_int8(25, 33, 16, 15, 3, 1, 1, 1, 0);
 }
 #endif // NCNN_INT8

 // Runs the convolution test groups one after another and stops at the first
 // group that returns non-zero. Exit status is 0 when every group returned 0,
 // and 1 otherwise.
 int main()
 {
    SRAND(7767517);

    if (test_convolution_0())
        return 1;

 #if NCNN_INT8
    // test_convolution_1 is only compiled when NCNN_INT8 is enabled
    if (test_convolution_1())
        return 1;
 #endif

    return test_convolution_2() ? 1 : 0;
 }
--- a/tests/test_convolutiondepthwise.cpp
+++ b/tests/test_convolutiondepthwise.cpp
@@ -125,6 +125,7 @@ static int test_convolutiondepthwise_0()
    return 0;
 }

 #if NCNN_INT8
 static int test_convolutiondepthwise_int8(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, int group, bool requant = false)
 {
    ncnn::Mat a = RandomMat(w, h, c);
@@ -251,10 +252,15 @@ static int test_convolutiondepthwise_1()

    return 0;
 }
 #endif // NCNN_INT8

 // Runs the depthwise-convolution test groups. With NCNN_INT8 enabled the exit
 // status is 0/1 depending on whether any group returned non-zero; without it
 // the status is whatever test_convolutiondepthwise_0() returned.
 int main()
 {
    SRAND(7767517);

    const int ret0 = test_convolutiondepthwise_0();

 #if NCNN_INT8
    if (ret0)
        return 1;

    // test_convolutiondepthwise_1 is only compiled when NCNN_INT8 is enabled
    return test_convolutiondepthwise_1() ? 1 : 0;
 #else
    return ret0;
 #endif
 }
--- a/tests/test_innerproduct.cpp
+++ b/tests/test_innerproduct.cpp
@@ -87,6 +87,7 @@ static int test_innerproduct_2()
           || test_innerproduct(RandomMat(24), 32, 1);
 }

 #if NCNN_INT8
 static int test_innerproduct_int8(const ncnn::Mat& a, int outch, int bias)
 {
    ncnn::ParamDict pd;
@@ -145,6 +146,7 @@ static int test_innerproduct_3()
           || test_innerproduct_int8(RandomMat(7, 2, 16), 4, 1)
           || test_innerproduct_int8(RandomMat(6, 3, 16), 16, 1);
 }
 #endif // NCNN_INT8

 static int test_innerproduct_gemm(const ncnn::Mat& a, int outch, int bias)
 {
@@ -193,6 +195,7 @@ static int test_innerproduct_4()
           || test_innerproduct_gemm(RandomMat(12, 16), 7, 1);
 }

 #if NCNN_INT8
 static int test_innerproduct_gemm_int8(const ncnn::Mat& a, int outch, int bias)
 {
    ncnn::ParamDict pd;
@@ -242,11 +245,13 @@ static int test_innerproduct_5()
           || test_innerproduct_gemm_int8(RandomMat(6, 16), 16, 0)
           || test_innerproduct_gemm_int8(RandomMat(12, 16), 7, 1);
 }
 #endif // NCNN_INT8

 int main()
 {
    SRAND(7767517);

 #if NCNN_INT8
    return 0
           || test_innerproduct_0()
           || test_innerproduct_1()
@@ -254,4 +259,11 @@ int main()
           || test_innerproduct_3()
           || test_innerproduct_4()
           || test_innerproduct_5();
 #else
    return 0
           || test_innerproduct_0()
           || test_innerproduct_1()
           || test_innerproduct_2()
           || test_innerproduct_4();
 #endif
 }
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -12,7 +12,11 @@ add_subdirectory(caffe)
 add_subdirectory(mxnet)
 add_subdirectory(onnx)
 add_subdirectory(darknet)
 add_subdirectory(quantize)
 # Gate the quantize tools on the NCNN_INT8 option: add the subdirectory when it
 # is ON, otherwise print a configure-time warning that the tools are skipped.
 if(NCNN_INT8)
    add_subdirectory(quantize)
 else()
    message(WARNING "NCNN_INT8 disabled, quantize tools won't be built")
 endif()

 add_executable(ncnn2mem ncnn2mem.cpp)
 target_link_libraries(ncnn2mem PRIVATE ncnn)
--- a/tools/ncnnoptimize.cpp
+++ b/tools/ncnnoptimize.cpp
@@ -2673,8 +2673,10 @@ int NetOptimize::replace_convolution_with_innerproduct_after_global_pooling()

        innerproduct->weight_data = convolution->weight_data;
        innerproduct->bias_data = convolution->bias_data;
 #if NCNN_INT8
        innerproduct->weight_data_int8_scales = convolution->weight_data_int8_scales;
        innerproduct->bottom_blob_int8_scales = convolution->bottom_blob_int8_scales;
 #endif

        innerproduct->activation_type = convolution->activation_type;
        innerproduct->activation_params = convolution->activation_params;
@@ -2739,8 +2741,10 @@ int NetOptimize::replace_convolution_with_innerproduct_after_innerproduct()

            innerproduct2->weight_data = convolution->weight_data;
            innerproduct2->bias_data = convolution->bias_data;
 #if NCNN_INT8
            // copy the int8 scales onto innerproduct2, the same layer that
            // receives weight_data and bias_data from the convolution above
            innerproduct2->weight_data_int8_scales = convolution->weight_data_int8_scales;
            innerproduct2->bottom_blob_int8_scales = convolution->bottom_blob_int8_scales;
 #endif

            innerproduct2->activation_type = convolution->activation_type;
            innerproduct2->activation_params = convolution->activation_params;