Browse Source

new int8 implement,better accuracy (#749)

* add the armv7a conv3x3s1 implement without overflow,remove old codes

* fix the bug of conv3x3s2 packed int8

* new int8 implementation, weights quantized per channel, better accuracy

* fix the bug of conv3x3s1 packed int8 neon

* add the naive c fp32 and int8 winograd F(2,3)

* add the neon intrinsic int8 winograd F(2,3)

* optimize the armv7a int8 winograd F(2,3) with neon assembly

* optimize the armv7a int8 winograd F(2,3) input transform with assembly.

* add the requantize layer and int8 relu implement.

* add graph optimize conv1x1s2 -> conv1x1s1,begin optimize int8 aarch64.

* fix int8 bugs

* add the c naive im2col with sgemm

* add aarch64 int8 winograd f23, conv3x3s2 naive implement

* add the int8 sgemm conv7x7s2 on x86/armv7a platform

* optimize the int8 sgemm by neon intrinsic and packed kernel

* optimize the int8 sgemm with packed data

* optimize the int8 sgemm with armv7a neon assembly

* add the int8 sgemm on arm64-v8a platform

* prepare to merge latest codes from master

* add the int8 param files

* In the Class Net,add the fuse_network method
tags/20190320
BUG1989 nihui 7 years ago
parent
commit
df3d224484
49 changed files with 9347 additions and 6796 deletions
  1. +2
    -0
      CMakeLists.txt
  2. +73
    -1
      benchmark/benchncnn.cpp
  3. +154
    -0
      benchmark/googlenet_int8.param
  4. +114
    -0
      benchmark/mobilenet_int8.param
  5. +129
    -0
      benchmark/mobilenet_ssd_int8.param
  6. +103
    -0
      benchmark/resnet18_int8.param
  7. +247
    -0
      benchmark/resnet50.param
  8. +247
    -0
      benchmark/resnet50_int8.param
  9. +77
    -0
      benchmark/squeezenet_int8.param
  10. +181
    -0
      benchmark/squeezenet_ssd_int8.param
  11. +42
    -0
      benchmark/vgg16_int8.param
  12. +1
    -0
      src/CMakeLists.txt
  13. +2
    -2
      src/benchmark.cpp
  14. +821
    -4101
      src/layer/arm/convolution_1x1_int8.h
  15. +2549
    -2119
      src/layer/arm/convolution_3x3_int8.h
  16. +35
    -0
      src/layer/arm/convolution_5x5_int8.h
  17. +35
    -0
      src/layer/arm/convolution_7x7_int8.h
  18. +109
    -38
      src/layer/arm/convolution_arm.cpp
  19. +2
    -0
      src/layer/arm/convolution_arm.h
  20. +1598
    -0
      src/layer/arm/convolution_sgemm_int8.h
  21. +0
    -343
      src/layer/arm/convolutiondepthwise_3x3_int8.h
  22. +63
    -19
      src/layer/arm/convolutiondepthwise_arm.cpp
  23. +0
    -22
      src/layer/arm/quantize_arm.cpp
  24. +84
    -0
      src/layer/arm/relu_arm.cpp
  25. +1
    -0
      src/layer/arm/relu_arm.h
  26. +325
    -0
      src/layer/arm/requantize_arm.cpp
  27. +30
    -0
      src/layer/arm/requantize_arm.h
  28. +194
    -47
      src/layer/convolution.cpp
  29. +7
    -2
      src/layer/convolution.h
  30. +65
    -2
      src/layer/convolutiondepthwise.cpp
  31. +5
    -0
      src/layer/convolutiondepthwise.h
  32. +78
    -33
      src/layer/innerproduct.cpp
  33. +2
    -2
      src/layer/innerproduct.h
  34. +43
    -0
      src/layer/relu.cpp
  35. +1
    -0
      src/layer/relu.h
  36. +195
    -0
      src/layer/requantize.cpp
  37. +46
    -0
      src/layer/requantize.h
  38. +494
    -0
      src/layer/x86/convolution_3x3.h
  39. +431
    -23
      src/layer/x86/convolution_3x3_int8.h
  40. +35
    -0
      src/layer/x86/convolution_5x5_int8.h
  41. +35
    -0
      src/layer/x86/convolution_7x7_int8.h
  42. +381
    -0
      src/layer/x86/convolution_sgemm_int8.h
  43. +128
    -22
      src/layer/x86/convolution_x86.cpp
  44. +8
    -0
      src/layer/x86/convolution_x86.h
  45. +59
    -19
      src/layer/x86/convolutiondepthwise_x86.cpp
  46. +109
    -0
      src/net.cpp
  47. +4
    -0
      src/net.h
  48. +2
    -0
      src/platform.h.in
  49. +1
    -1
      tools/caffe/caffe2ncnn.cpp

+ 2
- 0
CMakeLists.txt View File

@@ -32,6 +32,8 @@ option(NCNN_PIXEL "convert and resize from/to image pixel" ON)
option(NCNN_PIXEL_ROTATE "rotate image pixel orientation" OFF)
option(NCNN_CMAKE_VERBOSE "print verbose cmake messages" OFF)
option(NCNN_VULKAN "vulkan compute support" OFF)
option(NCNN_REQUANT "auto merge int8 quant and dequant" OFF)
option(NCNN_IM2COL_SGEMM "im2col sgemm support" OFF)

if(NCNN_OPENMP)
find_package(OpenMP)


+ 73
- 1
benchmark/benchncnn.cpp View File

@@ -202,7 +202,7 @@ void benchmark(const char* comment, void (*init)(ncnn::Net&), void (*run)(const

time_avg /= g_loop_count;

fprintf(stderr, "%16s min = %7.2f max = %7.2f avg = %7.2f\n", comment, time_min, time_max, time_avg);
fprintf(stderr, "%-20s min = %7.2f max = %7.2f avg = %7.2f\n", comment, time_min, time_max, time_avg);
}

void squeezenet_init(ncnn::Net& net)
@@ -210,6 +210,11 @@ void squeezenet_init(ncnn::Net& net)
net.load_param("squeezenet.param");
}

// Benchmark init hook: loads the parameter file "squeezenet_int8.param" into net.
// Any result of load_param() is discarded.
void squeezenet_int8_init(ncnn::Net& net)
{
    const char* const param_path = "squeezenet_int8.param";
    net.load_param(param_path);
}

void squeezenet_run(const ncnn::Net& net)
{
ncnn::Extractor ex = net.create_extractor();
@@ -226,6 +231,11 @@ void mobilenet_init(ncnn::Net& net)
net.load_param("mobilenet.param");
}

// Benchmark init hook: loads the parameter file "mobilenet_int8.param" into net.
// Any result of load_param() is discarded.
void mobilenet_int8_init(ncnn::Net& net)
{
    const char* const param_path = "mobilenet_int8.param";
    net.load_param(param_path);
}

void mobilenet_run(const ncnn::Net& net)
{
ncnn::Extractor ex = net.create_extractor();
@@ -306,6 +316,11 @@ void googlenet_init(ncnn::Net& net)
net.load_param("googlenet.param");
}

// Benchmark init hook: loads the parameter file "googlenet_int8.param" into net.
// Any result of load_param() is discarded.
void googlenet_int8_init(ncnn::Net& net)
{
    const char* const param_path = "googlenet_int8.param";
    net.load_param(param_path);
}

void googlenet_run(const ncnn::Net& net)
{
ncnn::Extractor ex = net.create_extractor();
@@ -322,6 +337,11 @@ void resnet18_init(ncnn::Net& net)
net.load_param("resnet18.param");
}

// Benchmark init hook: loads the parameter file "resnet18_int8.param" into net.
// Any result of load_param() is discarded.
void resnet18_int8_init(ncnn::Net& net)
{
    const char* const param_path = "resnet18_int8.param";
    net.load_param(param_path);
}

void resnet18_run(const ncnn::Net& net)
{
ncnn::Extractor ex = net.create_extractor();
@@ -354,6 +374,11 @@ void vgg16_init(ncnn::Net& net)
net.load_param("vgg16.param");
}

// Benchmark init hook: loads the parameter file "vgg16_int8.param" into net.
// Any result of load_param() is discarded.
void vgg16_int8_init(ncnn::Net& net)
{
    const char* const param_path = "vgg16_int8.param";
    net.load_param(param_path);
}

void vgg16_run(const ncnn::Net& net)
{
ncnn::Extractor ex = net.create_extractor();
@@ -365,11 +390,37 @@ void vgg16_run(const ncnn::Net& net)
ex.extract("prob", out);
}

// Benchmark init hook: loads the parameter file "resnet50.param" into net.
// Any result of load_param() is discarded.
void resnet50_init(ncnn::Net& net)
{
    const char* const param_path = "resnet50.param";
    net.load_param(param_path);
}

// Benchmark init hook: loads the parameter file "resnet50_int8.param" into net.
// Any result of load_param() is discarded.
void resnet50_int8_init(ncnn::Net& net)
{
    const char* const param_path = "resnet50_int8.param";
    net.load_param(param_path);
}

// Benchmark run hook: performs one extraction on net.
// Feeds a Mat constructed with (224, 224, 3) to the blob named "data" and
// extracts the blob named "prob"; the extracted Mat is not used afterwards.
void resnet50_run(const ncnn::Net& net)
{
    // The extractor is created first, matching the order used by the other *_run hooks.
    ncnn::Extractor extractor = net.create_extractor();

    ncnn::Mat input_blob(224, 224, 3);
    extractor.input("data", input_blob);

    ncnn::Mat prob_blob;
    extractor.extract("prob", prob_blob);
}

// Benchmark init hook: loads the parameter file "squeezenet_ssd.param" into net.
// Any result of load_param() is discarded.
void squeezenet_ssd_init(ncnn::Net& net)
{
    const char* const param_path = "squeezenet_ssd.param";
    net.load_param(param_path);
}

// Benchmark init hook: loads the parameter file "squeezenet_ssd_int8.param" into net.
// Any result of load_param() is discarded.
void squeezenet_ssd_int8_init(ncnn::Net& net)
{
    const char* const param_path = "squeezenet_ssd_int8.param";
    net.load_param(param_path);
}

void squeezenet_ssd_run(const ncnn::Net& net)
{
ncnn::Extractor ex = net.create_extractor();
@@ -386,6 +437,11 @@ void mobilenet_ssd_init(ncnn::Net& net)
net.load_param("mobilenet_ssd.param");
}

// Benchmark init hook: loads the parameter file "mobilenet_ssd_int8.param" into net.
// Any result of load_param() is discarded.
void mobilenet_ssd_int8_init(ncnn::Net& net)
{
    const char* const param_path = "mobilenet_ssd_int8.param";
    net.load_param(param_path);
}

void mobilenet_ssd_run(const ncnn::Net& net)
{
ncnn::Extractor ex = net.create_extractor();
@@ -497,8 +553,12 @@ int main(int argc, char** argv)
// run
benchmark("squeezenet", squeezenet_init, squeezenet_run);

benchmark("squeezenet-int8", squeezenet_int8_init, squeezenet_run);

benchmark("mobilenet", mobilenet_init, mobilenet_run);

benchmark("mobilenet-int8", mobilenet_int8_init, mobilenet_run);

benchmark("mobilenet_v2", mobilenet_v2_init, mobilenet_v2_run);

benchmark("shufflenet", shufflenet_init, shufflenet_run);
@@ -509,16 +569,28 @@ int main(int argc, char** argv)

benchmark("googlenet", googlenet_init, googlenet_run);

benchmark("googlenet-int8", googlenet_int8_init, googlenet_run);

benchmark("resnet18", resnet18_init, resnet18_run);

benchmark("resnet18-int8", resnet18_int8_init, resnet18_run);

benchmark("alexnet", alexnet_init, alexnet_run);

benchmark("vgg16", vgg16_init, vgg16_run);

benchmark("resnet50", resnet50_init, resnet50_run);

benchmark("resnet50-int8", resnet50_int8_init, resnet50_run);

benchmark("squeezenet-ssd", squeezenet_ssd_init, squeezenet_ssd_run);

benchmark("squeezenet-ssd-int8", squeezenet_ssd_int8_init, squeezenet_ssd_run);

benchmark("mobilenet-ssd", mobilenet_ssd_init, mobilenet_ssd_run);

benchmark("mobilenet-ssd-int8", mobilenet_ssd_int8_init, mobilenet_ssd_run);

benchmark("mobilenet-yolo", mobilenet_yolo_init, mobilenet_yolo_run);

benchmark("mobilenet-yolov3", mobilenet_yolov3_init, mobilenet_yolov3_run);


+ 154
- 0
benchmark/googlenet_int8.param View File

@@ -0,0 +1,154 @@
7767517
152 179
Input data 0 1 data 0=224 1=224 2=3
Convolution conv1/7x7_s2 1 1 data conv1/7x7_s2 0=64 1=7 2=1 3=2 4=3 5=1 6=9408 8=2
ReLU conv1/relu_7x7 1 1 conv1/7x7_s2 conv1/7x7_s2_conv1/relu_7x7
Pooling pool1/3x3_s2 1 1 conv1/7x7_s2_conv1/relu_7x7 pool1/3x3_s2 0=0 1=3 2=2 3=0 4=0
LRN pool1/norm1 1 1 pool1/3x3_s2 pool1/norm1 0=0 1=5 2=0.000100 3=0.750000
Convolution conv2/3x3_reduce 1 1 pool1/norm1 conv2/3x3_reduce 0=64 1=1 2=1 3=1 4=0 5=1 6=4096 8=2
ReLU conv2/relu_3x3_reduce 1 1 conv2/3x3_reduce conv2/3x3_reduce_conv2/relu_3x3_reduce
Convolution conv2/3x3 1 1 conv2/3x3_reduce_conv2/relu_3x3_reduce conv2/3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=110592 8=2
ReLU conv2/relu_3x3 1 1 conv2/3x3 conv2/3x3_conv2/relu_3x3
LRN conv2/norm2 1 1 conv2/3x3_conv2/relu_3x3 conv2/norm2 0=0 1=5 2=0.000100 3=0.750000
Pooling pool2/3x3_s2 1 1 conv2/norm2 pool2/3x3_s2 0=0 1=3 2=2 3=0 4=0
Split splitncnn_0 1 4 pool2/3x3_s2 pool2/3x3_s2_splitncnn_0 pool2/3x3_s2_splitncnn_1 pool2/3x3_s2_splitncnn_2 pool2/3x3_s2_splitncnn_3
Convolution inception_3a/1x1 1 1 pool2/3x3_s2_splitncnn_3 inception_3a/1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=12288 8=2
ReLU inception_3a/relu_1x1 1 1 inception_3a/1x1 inception_3a/1x1_inception_3a/relu_1x1
Convolution inception_3a/3x3_reduce 1 1 pool2/3x3_s2_splitncnn_2 inception_3a/3x3_reduce 0=96 1=1 2=1 3=1 4=0 5=1 6=18432 8=2
ReLU inception_3a/relu_3x3_reduce 1 1 inception_3a/3x3_reduce inception_3a/3x3_reduce_inception_3a/relu_3x3_reduce
Convolution inception_3a/3x3 1 1 inception_3a/3x3_reduce_inception_3a/relu_3x3_reduce inception_3a/3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=110592 8=2
ReLU inception_3a/relu_3x3 1 1 inception_3a/3x3 inception_3a/3x3_inception_3a/relu_3x3
Convolution inception_3a/5x5_reduce 1 1 pool2/3x3_s2_splitncnn_1 inception_3a/5x5_reduce 0=16 1=1 2=1 3=1 4=0 5=1 6=3072 8=2
ReLU inception_3a/relu_5x5_reduce 1 1 inception_3a/5x5_reduce inception_3a/5x5_reduce_inception_3a/relu_5x5_reduce
Convolution inception_3a/5x5 1 1 inception_3a/5x5_reduce_inception_3a/relu_5x5_reduce inception_3a/5x5 0=32 1=5 2=1 3=1 4=2 5=1 6=12800 8=2
ReLU inception_3a/relu_5x5 1 1 inception_3a/5x5 inception_3a/5x5_inception_3a/relu_5x5
Pooling inception_3a/pool 1 1 pool2/3x3_s2_splitncnn_0 inception_3a/pool 0=0 1=3 2=1 3=1 4=0
Convolution inception_3a/pool_proj 1 1 inception_3a/pool inception_3a/pool_proj 0=32 1=1 2=1 3=1 4=0 5=1 6=6144 8=2
ReLU inception_3a/relu_pool_proj 1 1 inception_3a/pool_proj inception_3a/pool_proj_inception_3a/relu_pool_proj
Concat inception_3a/output 4 1 inception_3a/1x1_inception_3a/relu_1x1 inception_3a/3x3_inception_3a/relu_3x3 inception_3a/5x5_inception_3a/relu_5x5 inception_3a/pool_proj_inception_3a/relu_pool_proj inception_3a/output 0=0
Split splitncnn_1 1 4 inception_3a/output inception_3a/output_splitncnn_0 inception_3a/output_splitncnn_1 inception_3a/output_splitncnn_2 inception_3a/output_splitncnn_3
Convolution inception_3b/1x1 1 1 inception_3a/output_splitncnn_3 inception_3b/1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=32768 8=2
ReLU inception_3b/relu_1x1 1 1 inception_3b/1x1 inception_3b/1x1_inception_3b/relu_1x1
Convolution inception_3b/3x3_reduce 1 1 inception_3a/output_splitncnn_2 inception_3b/3x3_reduce 0=128 1=1 2=1 3=1 4=0 5=1 6=32768 8=2
ReLU inception_3b/relu_3x3_reduce 1 1 inception_3b/3x3_reduce inception_3b/3x3_reduce_inception_3b/relu_3x3_reduce
Convolution inception_3b/3x3 1 1 inception_3b/3x3_reduce_inception_3b/relu_3x3_reduce inception_3b/3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=221184 8=2
ReLU inception_3b/relu_3x3 1 1 inception_3b/3x3 inception_3b/3x3_inception_3b/relu_3x3
Convolution inception_3b/5x5_reduce 1 1 inception_3a/output_splitncnn_1 inception_3b/5x5_reduce 0=32 1=1 2=1 3=1 4=0 5=1 6=8192 8=2
ReLU inception_3b/relu_5x5_reduce 1 1 inception_3b/5x5_reduce inception_3b/5x5_reduce_inception_3b/relu_5x5_reduce
Convolution inception_3b/5x5 1 1 inception_3b/5x5_reduce_inception_3b/relu_5x5_reduce inception_3b/5x5 0=96 1=5 2=1 3=1 4=2 5=1 6=76800 8=2
ReLU inception_3b/relu_5x5 1 1 inception_3b/5x5 inception_3b/5x5_inception_3b/relu_5x5
Pooling inception_3b/pool 1 1 inception_3a/output_splitncnn_0 inception_3b/pool 0=0 1=3 2=1 3=1 4=0
Convolution inception_3b/pool_proj 1 1 inception_3b/pool inception_3b/pool_proj 0=64 1=1 2=1 3=1 4=0 5=1 6=16384 8=2
ReLU inception_3b/relu_pool_proj 1 1 inception_3b/pool_proj inception_3b/pool_proj_inception_3b/relu_pool_proj
Concat inception_3b/output 4 1 inception_3b/1x1_inception_3b/relu_1x1 inception_3b/3x3_inception_3b/relu_3x3 inception_3b/5x5_inception_3b/relu_5x5 inception_3b/pool_proj_inception_3b/relu_pool_proj inception_3b/output 0=0
Pooling pool3/3x3_s2 1 1 inception_3b/output pool3/3x3_s2 0=0 1=3 2=2 3=0 4=0
Split splitncnn_2 1 4 pool3/3x3_s2 pool3/3x3_s2_splitncnn_0 pool3/3x3_s2_splitncnn_1 pool3/3x3_s2_splitncnn_2 pool3/3x3_s2_splitncnn_3
Convolution inception_4a/1x1 1 1 pool3/3x3_s2_splitncnn_3 inception_4a/1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=92160 8=2
ReLU inception_4a/relu_1x1 1 1 inception_4a/1x1 inception_4a/1x1_inception_4a/relu_1x1
Convolution inception_4a/3x3_reduce 1 1 pool3/3x3_s2_splitncnn_2 inception_4a/3x3_reduce 0=96 1=1 2=1 3=1 4=0 5=1 6=46080 8=2
ReLU inception_4a/relu_3x3_reduce 1 1 inception_4a/3x3_reduce inception_4a/3x3_reduce_inception_4a/relu_3x3_reduce
Convolution inception_4a/3x3 1 1 inception_4a/3x3_reduce_inception_4a/relu_3x3_reduce inception_4a/3x3 0=208 1=3 2=1 3=1 4=1 5=1 6=179712 8=2
ReLU inception_4a/relu_3x3 1 1 inception_4a/3x3 inception_4a/3x3_inception_4a/relu_3x3
Convolution inception_4a/5x5_reduce 1 1 pool3/3x3_s2_splitncnn_1 inception_4a/5x5_reduce 0=16 1=1 2=1 3=1 4=0 5=1 6=7680 8=2
ReLU inception_4a/relu_5x5_reduce 1 1 inception_4a/5x5_reduce inception_4a/5x5_reduce_inception_4a/relu_5x5_reduce
Convolution inception_4a/5x5 1 1 inception_4a/5x5_reduce_inception_4a/relu_5x5_reduce inception_4a/5x5 0=48 1=5 2=1 3=1 4=2 5=1 6=19200 8=2
ReLU inception_4a/relu_5x5 1 1 inception_4a/5x5 inception_4a/5x5_inception_4a/relu_5x5
Pooling inception_4a/pool 1 1 pool3/3x3_s2_splitncnn_0 inception_4a/pool 0=0 1=3 2=1 3=1 4=0
Convolution inception_4a/pool_proj 1 1 inception_4a/pool inception_4a/pool_proj 0=64 1=1 2=1 3=1 4=0 5=1 6=30720 8=2
ReLU inception_4a/relu_pool_proj 1 1 inception_4a/pool_proj inception_4a/pool_proj_inception_4a/relu_pool_proj
Concat inception_4a/output 4 1 inception_4a/1x1_inception_4a/relu_1x1 inception_4a/3x3_inception_4a/relu_3x3 inception_4a/5x5_inception_4a/relu_5x5 inception_4a/pool_proj_inception_4a/relu_pool_proj inception_4a/output 0=0
Split splitncnn_3 1 4 inception_4a/output inception_4a/output_splitncnn_0 inception_4a/output_splitncnn_1 inception_4a/output_splitncnn_2 inception_4a/output_splitncnn_3
Convolution inception_4b/1x1 1 1 inception_4a/output_splitncnn_3 inception_4b/1x1 0=160 1=1 2=1 3=1 4=0 5=1 6=81920 8=2
ReLU inception_4b/relu_1x1 1 1 inception_4b/1x1 inception_4b/1x1_inception_4b/relu_1x1
Convolution inception_4b/3x3_reduce 1 1 inception_4a/output_splitncnn_2 inception_4b/3x3_reduce 0=112 1=1 2=1 3=1 4=0 5=1 6=57344 8=2
ReLU inception_4b/relu_3x3_reduce 1 1 inception_4b/3x3_reduce inception_4b/3x3_reduce_inception_4b/relu_3x3_reduce
Convolution inception_4b/3x3 1 1 inception_4b/3x3_reduce_inception_4b/relu_3x3_reduce inception_4b/3x3 0=224 1=3 2=1 3=1 4=1 5=1 6=225792 8=2
ReLU inception_4b/relu_3x3 1 1 inception_4b/3x3 inception_4b/3x3_inception_4b/relu_3x3
Convolution inception_4b/5x5_reduce 1 1 inception_4a/output_splitncnn_1 inception_4b/5x5_reduce 0=24 1=1 2=1 3=1 4=0 5=1 6=12288 8=2
ReLU inception_4b/relu_5x5_reduce 1 1 inception_4b/5x5_reduce inception_4b/5x5_reduce_inception_4b/relu_5x5_reduce
Convolution inception_4b/5x5 1 1 inception_4b/5x5_reduce_inception_4b/relu_5x5_reduce inception_4b/5x5 0=64 1=5 2=1 3=1 4=2 5=1 6=38400 8=2
ReLU inception_4b/relu_5x5 1 1 inception_4b/5x5 inception_4b/5x5_inception_4b/relu_5x5
Pooling inception_4b/pool 1 1 inception_4a/output_splitncnn_0 inception_4b/pool 0=0 1=3 2=1 3=1 4=0
Convolution inception_4b/pool_proj 1 1 inception_4b/pool inception_4b/pool_proj 0=64 1=1 2=1 3=1 4=0 5=1 6=32768 8=2
ReLU inception_4b/relu_pool_proj 1 1 inception_4b/pool_proj inception_4b/pool_proj_inception_4b/relu_pool_proj
Concat inception_4b/output 4 1 inception_4b/1x1_inception_4b/relu_1x1 inception_4b/3x3_inception_4b/relu_3x3 inception_4b/5x5_inception_4b/relu_5x5 inception_4b/pool_proj_inception_4b/relu_pool_proj inception_4b/output 0=0
Split splitncnn_4 1 4 inception_4b/output inception_4b/output_splitncnn_0 inception_4b/output_splitncnn_1 inception_4b/output_splitncnn_2 inception_4b/output_splitncnn_3
Convolution inception_4c/1x1 1 1 inception_4b/output_splitncnn_3 inception_4c/1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=65536 8=2
ReLU inception_4c/relu_1x1 1 1 inception_4c/1x1 inception_4c/1x1_inception_4c/relu_1x1
Convolution inception_4c/3x3_reduce 1 1 inception_4b/output_splitncnn_2 inception_4c/3x3_reduce 0=128 1=1 2=1 3=1 4=0 5=1 6=65536 8=2
ReLU inception_4c/relu_3x3_reduce 1 1 inception_4c/3x3_reduce inception_4c/3x3_reduce_inception_4c/relu_3x3_reduce
Convolution inception_4c/3x3 1 1 inception_4c/3x3_reduce_inception_4c/relu_3x3_reduce inception_4c/3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=294912 8=2
ReLU inception_4c/relu_3x3 1 1 inception_4c/3x3 inception_4c/3x3_inception_4c/relu_3x3
Convolution inception_4c/5x5_reduce 1 1 inception_4b/output_splitncnn_1 inception_4c/5x5_reduce 0=24 1=1 2=1 3=1 4=0 5=1 6=12288 8=2
ReLU inception_4c/relu_5x5_reduce 1 1 inception_4c/5x5_reduce inception_4c/5x5_reduce_inception_4c/relu_5x5_reduce
Convolution inception_4c/5x5 1 1 inception_4c/5x5_reduce_inception_4c/relu_5x5_reduce inception_4c/5x5 0=64 1=5 2=1 3=1 4=2 5=1 6=38400 8=2
ReLU inception_4c/relu_5x5 1 1 inception_4c/5x5 inception_4c/5x5_inception_4c/relu_5x5
Pooling inception_4c/pool 1 1 inception_4b/output_splitncnn_0 inception_4c/pool 0=0 1=3 2=1 3=1 4=0
Convolution inception_4c/pool_proj 1 1 inception_4c/pool inception_4c/pool_proj 0=64 1=1 2=1 3=1 4=0 5=1 6=32768 8=2
ReLU inception_4c/relu_pool_proj 1 1 inception_4c/pool_proj inception_4c/pool_proj_inception_4c/relu_pool_proj
Concat inception_4c/output 4 1 inception_4c/1x1_inception_4c/relu_1x1 inception_4c/3x3_inception_4c/relu_3x3 inception_4c/5x5_inception_4c/relu_5x5 inception_4c/pool_proj_inception_4c/relu_pool_proj inception_4c/output 0=0
Split splitncnn_5 1 4 inception_4c/output inception_4c/output_splitncnn_0 inception_4c/output_splitncnn_1 inception_4c/output_splitncnn_2 inception_4c/output_splitncnn_3
Convolution inception_4d/1x1 1 1 inception_4c/output_splitncnn_3 inception_4d/1x1 0=112 1=1 2=1 3=1 4=0 5=1 6=57344 8=2
ReLU inception_4d/relu_1x1 1 1 inception_4d/1x1 inception_4d/1x1_inception_4d/relu_1x1
Convolution inception_4d/3x3_reduce 1 1 inception_4c/output_splitncnn_2 inception_4d/3x3_reduce 0=144 1=1 2=1 3=1 4=0 5=1 6=73728 8=2
ReLU inception_4d/relu_3x3_reduce 1 1 inception_4d/3x3_reduce inception_4d/3x3_reduce_inception_4d/relu_3x3_reduce
Convolution inception_4d/3x3 1 1 inception_4d/3x3_reduce_inception_4d/relu_3x3_reduce inception_4d/3x3 0=288 1=3 2=1 3=1 4=1 5=1 6=373248 8=2
ReLU inception_4d/relu_3x3 1 1 inception_4d/3x3 inception_4d/3x3_inception_4d/relu_3x3
Convolution inception_4d/5x5_reduce 1 1 inception_4c/output_splitncnn_1 inception_4d/5x5_reduce 0=32 1=1 2=1 3=1 4=0 5=1 6=16384 8=2
ReLU inception_4d/relu_5x5_reduce 1 1 inception_4d/5x5_reduce inception_4d/5x5_reduce_inception_4d/relu_5x5_reduce
Convolution inception_4d/5x5 1 1 inception_4d/5x5_reduce_inception_4d/relu_5x5_reduce inception_4d/5x5 0=64 1=5 2=1 3=1 4=2 5=1 6=51200 8=2
ReLU inception_4d/relu_5x5 1 1 inception_4d/5x5 inception_4d/5x5_inception_4d/relu_5x5
Pooling inception_4d/pool 1 1 inception_4c/output_splitncnn_0 inception_4d/pool 0=0 1=3 2=1 3=1 4=0
Convolution inception_4d/pool_proj 1 1 inception_4d/pool inception_4d/pool_proj 0=64 1=1 2=1 3=1 4=0 5=1 6=32768 8=2
ReLU inception_4d/relu_pool_proj 1 1 inception_4d/pool_proj inception_4d/pool_proj_inception_4d/relu_pool_proj
Concat inception_4d/output 4 1 inception_4d/1x1_inception_4d/relu_1x1 inception_4d/3x3_inception_4d/relu_3x3 inception_4d/5x5_inception_4d/relu_5x5 inception_4d/pool_proj_inception_4d/relu_pool_proj inception_4d/output 0=0
Split splitncnn_6 1 4 inception_4d/output inception_4d/output_splitncnn_0 inception_4d/output_splitncnn_1 inception_4d/output_splitncnn_2 inception_4d/output_splitncnn_3
Convolution inception_4e/1x1 1 1 inception_4d/output_splitncnn_3 inception_4e/1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=135168 8=2
ReLU inception_4e/relu_1x1 1 1 inception_4e/1x1 inception_4e/1x1_inception_4e/relu_1x1
Convolution inception_4e/3x3_reduce 1 1 inception_4d/output_splitncnn_2 inception_4e/3x3_reduce 0=160 1=1 2=1 3=1 4=0 5=1 6=84480 8=2
ReLU inception_4e/relu_3x3_reduce 1 1 inception_4e/3x3_reduce inception_4e/3x3_reduce_inception_4e/relu_3x3_reduce
Convolution inception_4e/3x3 1 1 inception_4e/3x3_reduce_inception_4e/relu_3x3_reduce inception_4e/3x3 0=320 1=3 2=1 3=1 4=1 5=1 6=460800 8=2
ReLU inception_4e/relu_3x3 1 1 inception_4e/3x3 inception_4e/3x3_inception_4e/relu_3x3
Convolution inception_4e/5x5_reduce 1 1 inception_4d/output_splitncnn_1 inception_4e/5x5_reduce 0=32 1=1 2=1 3=1 4=0 5=1 6=16896 8=2
ReLU inception_4e/relu_5x5_reduce 1 1 inception_4e/5x5_reduce inception_4e/5x5_reduce_inception_4e/relu_5x5_reduce
Convolution inception_4e/5x5 1 1 inception_4e/5x5_reduce_inception_4e/relu_5x5_reduce inception_4e/5x5 0=128 1=5 2=1 3=1 4=2 5=1 6=102400 8=2
ReLU inception_4e/relu_5x5 1 1 inception_4e/5x5 inception_4e/5x5_inception_4e/relu_5x5
Pooling inception_4e/pool 1 1 inception_4d/output_splitncnn_0 inception_4e/pool 0=0 1=3 2=1 3=1 4=0
Convolution inception_4e/pool_proj 1 1 inception_4e/pool inception_4e/pool_proj 0=128 1=1 2=1 3=1 4=0 5=1 6=67584 8=2
ReLU inception_4e/relu_pool_proj 1 1 inception_4e/pool_proj inception_4e/pool_proj_inception_4e/relu_pool_proj
Concat inception_4e/output 4 1 inception_4e/1x1_inception_4e/relu_1x1 inception_4e/3x3_inception_4e/relu_3x3 inception_4e/5x5_inception_4e/relu_5x5 inception_4e/pool_proj_inception_4e/relu_pool_proj inception_4e/output 0=0
Pooling pool4/3x3_s2 1 1 inception_4e/output pool4/3x3_s2 0=0 1=3 2=2 3=0 4=0
Split splitncnn_7 1 4 pool4/3x3_s2 pool4/3x3_s2_splitncnn_0 pool4/3x3_s2_splitncnn_1 pool4/3x3_s2_splitncnn_2 pool4/3x3_s2_splitncnn_3
Convolution inception_5a/1x1 1 1 pool4/3x3_s2_splitncnn_3 inception_5a/1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=212992 8=2
ReLU inception_5a/relu_1x1 1 1 inception_5a/1x1 inception_5a/1x1_inception_5a/relu_1x1
Convolution inception_5a/3x3_reduce 1 1 pool4/3x3_s2_splitncnn_2 inception_5a/3x3_reduce 0=160 1=1 2=1 3=1 4=0 5=1 6=133120 8=2
ReLU inception_5a/relu_3x3_reduce 1 1 inception_5a/3x3_reduce inception_5a/3x3_reduce_inception_5a/relu_3x3_reduce
Convolution inception_5a/3x3 1 1 inception_5a/3x3_reduce_inception_5a/relu_3x3_reduce inception_5a/3x3 0=320 1=3 2=1 3=1 4=1 5=1 6=460800 8=2
ReLU inception_5a/relu_3x3 1 1 inception_5a/3x3 inception_5a/3x3_inception_5a/relu_3x3
Convolution inception_5a/5x5_reduce 1 1 pool4/3x3_s2_splitncnn_1 inception_5a/5x5_reduce 0=32 1=1 2=1 3=1 4=0 5=1 6=26624 8=2
ReLU inception_5a/relu_5x5_reduce 1 1 inception_5a/5x5_reduce inception_5a/5x5_reduce_inception_5a/relu_5x5_reduce
Convolution inception_5a/5x5 1 1 inception_5a/5x5_reduce_inception_5a/relu_5x5_reduce inception_5a/5x5 0=128 1=5 2=1 3=1 4=2 5=1 6=102400 8=2
ReLU inception_5a/relu_5x5 1 1 inception_5a/5x5 inception_5a/5x5_inception_5a/relu_5x5
Pooling inception_5a/pool 1 1 pool4/3x3_s2_splitncnn_0 inception_5a/pool 0=0 1=3 2=1 3=1 4=0
Convolution inception_5a/pool_proj 1 1 inception_5a/pool inception_5a/pool_proj 0=128 1=1 2=1 3=1 4=0 5=1 6=106496 8=2
ReLU inception_5a/relu_pool_proj 1 1 inception_5a/pool_proj inception_5a/pool_proj_inception_5a/relu_pool_proj
Concat inception_5a/output 4 1 inception_5a/1x1_inception_5a/relu_1x1 inception_5a/3x3_inception_5a/relu_3x3 inception_5a/5x5_inception_5a/relu_5x5 inception_5a/pool_proj_inception_5a/relu_pool_proj inception_5a/output 0=0
Split splitncnn_8 1 4 inception_5a/output inception_5a/output_splitncnn_0 inception_5a/output_splitncnn_1 inception_5a/output_splitncnn_2 inception_5a/output_splitncnn_3
Convolution inception_5b/1x1 1 1 inception_5a/output_splitncnn_3 inception_5b/1x1 0=384 1=1 2=1 3=1 4=0 5=1 6=319488 8=2
ReLU inception_5b/relu_1x1 1 1 inception_5b/1x1 inception_5b/1x1_inception_5b/relu_1x1
Convolution inception_5b/3x3_reduce 1 1 inception_5a/output_splitncnn_2 inception_5b/3x3_reduce 0=192 1=1 2=1 3=1 4=0 5=1 6=159744 8=2
ReLU inception_5b/relu_3x3_reduce 1 1 inception_5b/3x3_reduce inception_5b/3x3_reduce_inception_5b/relu_3x3_reduce
Convolution inception_5b/3x3 1 1 inception_5b/3x3_reduce_inception_5b/relu_3x3_reduce inception_5b/3x3 0=384 1=3 2=1 3=1 4=1 5=1 6=663552 8=2
ReLU inception_5b/relu_3x3 1 1 inception_5b/3x3 inception_5b/3x3_inception_5b/relu_3x3
Convolution inception_5b/5x5_reduce 1 1 inception_5a/output_splitncnn_1 inception_5b/5x5_reduce 0=48 1=1 2=1 3=1 4=0 5=1 6=39936 8=2
ReLU inception_5b/relu_5x5_reduce 1 1 inception_5b/5x5_reduce inception_5b/5x5_reduce_inception_5b/relu_5x5_reduce
Convolution inception_5b/5x5 1 1 inception_5b/5x5_reduce_inception_5b/relu_5x5_reduce inception_5b/5x5 0=128 1=5 2=1 3=1 4=2 5=1 6=153600 8=2
ReLU inception_5b/relu_5x5 1 1 inception_5b/5x5 inception_5b/5x5_inception_5b/relu_5x5
Pooling inception_5b/pool 1 1 inception_5a/output_splitncnn_0 inception_5b/pool 0=0 1=3 2=1 3=1 4=0
Convolution inception_5b/pool_proj 1 1 inception_5b/pool inception_5b/pool_proj 0=128 1=1 2=1 3=1 4=0 5=1 6=106496 8=2
ReLU inception_5b/relu_pool_proj 1 1 inception_5b/pool_proj inception_5b/pool_proj_inception_5b/relu_pool_proj
Concat inception_5b/output 4 1 inception_5b/1x1_inception_5b/relu_1x1 inception_5b/3x3_inception_5b/relu_3x3 inception_5b/5x5_inception_5b/relu_5x5 inception_5b/pool_proj_inception_5b/relu_pool_proj inception_5b/output 0=0
Pooling pool5/7x7_s1 1 1 inception_5b/output pool5/7x7_s1 0=1 1=7 2=1 3=0 4=0
Dropout pool5/drop_7x7_s1 1 1 pool5/7x7_s1 pool5/7x7_s1_pool5/drop_7x7_s1
InnerProduct loss3/classifier 1 1 pool5/7x7_s1_pool5/drop_7x7_s1 loss3/classifier 0=1000 1=1 2=1024000
Softmax prob 1 1 loss3/classifier prob 0=0

+ 114
- 0
benchmark/mobilenet_int8.param View File

@@ -0,0 +1,114 @@
7767517
112 112
Input data 0 1 data 0=224 1=224 2=3
Convolution conv1 1 1 data conv1 0=32 1=3 2=1 3=2 4=1 5=0 6=864 8=2
BatchNorm conv1/bn 1 1 conv1 conv1_conv1/bn 0=32
Scale conv1/scale 1 1 conv1_conv1/bn conv1_conv1/scale 0=32 1=1
ReLU relu1 1 1 conv1_conv1/scale conv1_relu1
ConvolutionDepthWise conv2_1/dw 1 1 conv1_relu1 conv2_1/dw 0=32 1=3 2=1 3=1 4=1 5=0 6=288 7=32 8=1
BatchNorm conv2_1/dw/bn 1 1 conv2_1/dw conv2_1/dw_conv2_1/dw/bn 0=32
Scale conv2_1/dw/scale 1 1 conv2_1/dw_conv2_1/dw/bn conv2_1/dw_conv2_1/dw/scale 0=32 1=1
ReLU relu2_1/dw 1 1 conv2_1/dw_conv2_1/dw/scale conv2_1/dw_relu2_1/dw
Convolution conv2_1/sep 1 1 conv2_1/dw_relu2_1/dw conv2_1/sep 0=64 1=1 2=1 3=1 4=0 5=0 6=2048 8=2
BatchNorm conv2_1/sep/bn 1 1 conv2_1/sep conv2_1/sep_conv2_1/sep/bn 0=64
Scale conv2_1/sep/scale 1 1 conv2_1/sep_conv2_1/sep/bn conv2_1/sep_conv2_1/sep/scale 0=64 1=1
ReLU relu2_1/sep 1 1 conv2_1/sep_conv2_1/sep/scale conv2_1/sep_relu2_1/sep
ConvolutionDepthWise conv2_2/dw 1 1 conv2_1/sep_relu2_1/sep conv2_2/dw 0=64 1=3 2=1 3=2 4=1 5=0 6=576 7=64 8=1
BatchNorm conv2_2/dw/bn 1 1 conv2_2/dw conv2_2/dw_conv2_2/dw/bn 0=64
Scale conv2_2/dw/scale 1 1 conv2_2/dw_conv2_2/dw/bn conv2_2/dw_conv2_2/dw/scale 0=64 1=1
ReLU relu2_2/dw 1 1 conv2_2/dw_conv2_2/dw/scale conv2_2/dw_relu2_2/dw
Convolution conv2_2/sep 1 1 conv2_2/dw_relu2_2/dw conv2_2/sep 0=128 1=1 2=1 3=1 4=0 5=0 6=8192 8=2
BatchNorm conv2_2/sep/bn 1 1 conv2_2/sep conv2_2/sep_conv2_2/sep/bn 0=128
Scale conv2_2/sep/scale 1 1 conv2_2/sep_conv2_2/sep/bn conv2_2/sep_conv2_2/sep/scale 0=128 1=1
ReLU relu2_2/sep 1 1 conv2_2/sep_conv2_2/sep/scale conv2_2/sep_relu2_2/sep
ConvolutionDepthWise conv3_1/dw 1 1 conv2_2/sep_relu2_2/sep conv3_1/dw 0=128 1=3 2=1 3=1 4=1 5=0 6=1152 7=128 8=1
BatchNorm conv3_1/dw/bn 1 1 conv3_1/dw conv3_1/dw_conv3_1/dw/bn 0=128
Scale conv3_1/dw/scale 1 1 conv3_1/dw_conv3_1/dw/bn conv3_1/dw_conv3_1/dw/scale 0=128 1=1
ReLU relu3_1/dw 1 1 conv3_1/dw_conv3_1/dw/scale conv3_1/dw_relu3_1/dw
Convolution conv3_1/sep 1 1 conv3_1/dw_relu3_1/dw conv3_1/sep 0=128 1=1 2=1 3=1 4=0 5=0 6=16384 8=2
BatchNorm conv3_1/sep/bn 1 1 conv3_1/sep conv3_1/sep_conv3_1/sep/bn 0=128
Scale conv3_1/sep/scale 1 1 conv3_1/sep_conv3_1/sep/bn conv3_1/sep_conv3_1/sep/scale 0=128 1=1
ReLU relu3_1/sep 1 1 conv3_1/sep_conv3_1/sep/scale conv3_1/sep_relu3_1/sep
ConvolutionDepthWise conv3_2/dw 1 1 conv3_1/sep_relu3_1/sep conv3_2/dw 0=128 1=3 2=1 3=2 4=1 5=0 6=1152 7=128 8=1
BatchNorm conv3_2/dw/bn 1 1 conv3_2/dw conv3_2/dw_conv3_2/dw/bn 0=128
Scale conv3_2/dw/scale 1 1 conv3_2/dw_conv3_2/dw/bn conv3_2/dw_conv3_2/dw/scale 0=128 1=1
ReLU relu3_2/dw 1 1 conv3_2/dw_conv3_2/dw/scale conv3_2/dw_relu3_2/dw
Convolution conv3_2/sep 1 1 conv3_2/dw_relu3_2/dw conv3_2/sep 0=256 1=1 2=1 3=1 4=0 5=0 6=32768 8=2
BatchNorm conv3_2/sep/bn 1 1 conv3_2/sep conv3_2/sep_conv3_2/sep/bn 0=256
Scale conv3_2/sep/scale 1 1 conv3_2/sep_conv3_2/sep/bn conv3_2/sep_conv3_2/sep/scale 0=256 1=1
ReLU relu3_2/sep 1 1 conv3_2/sep_conv3_2/sep/scale conv3_2/sep_relu3_2/sep
ConvolutionDepthWise conv4_1/dw 1 1 conv3_2/sep_relu3_2/sep conv4_1/dw 0=256 1=3 2=1 3=1 4=1 5=0 6=2304 7=256 8=1
BatchNorm conv4_1/dw/bn 1 1 conv4_1/dw conv4_1/dw_conv4_1/dw/bn 0=256
Scale conv4_1/dw/scale 1 1 conv4_1/dw_conv4_1/dw/bn conv4_1/dw_conv4_1/dw/scale 0=256 1=1
ReLU relu4_1/dw 1 1 conv4_1/dw_conv4_1/dw/scale conv4_1/dw_relu4_1/dw
Convolution conv4_1/sep 1 1 conv4_1/dw_relu4_1/dw conv4_1/sep 0=256 1=1 2=1 3=1 4=0 5=0 6=65536 8=2
BatchNorm conv4_1/sep/bn 1 1 conv4_1/sep conv4_1/sep_conv4_1/sep/bn 0=256
Scale conv4_1/sep/scale 1 1 conv4_1/sep_conv4_1/sep/bn conv4_1/sep_conv4_1/sep/scale 0=256 1=1
ReLU relu4_1/sep 1 1 conv4_1/sep_conv4_1/sep/scale conv4_1/sep_relu4_1/sep
ConvolutionDepthWise conv4_2/dw 1 1 conv4_1/sep_relu4_1/sep conv4_2/dw 0=256 1=3 2=1 3=2 4=1 5=0 6=2304 7=256 8=1
BatchNorm conv4_2/dw/bn 1 1 conv4_2/dw conv4_2/dw_conv4_2/dw/bn 0=256
Scale conv4_2/dw/scale 1 1 conv4_2/dw_conv4_2/dw/bn conv4_2/dw_conv4_2/dw/scale 0=256 1=1
ReLU relu4_2/dw 1 1 conv4_2/dw_conv4_2/dw/scale conv4_2/dw_relu4_2/dw
Convolution conv4_2/sep 1 1 conv4_2/dw_relu4_2/dw conv4_2/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=131072 8=2
BatchNorm conv4_2/sep/bn 1 1 conv4_2/sep conv4_2/sep_conv4_2/sep/bn 0=512
Scale conv4_2/sep/scale 1 1 conv4_2/sep_conv4_2/sep/bn conv4_2/sep_conv4_2/sep/scale 0=512 1=1
ReLU relu4_2/sep 1 1 conv4_2/sep_conv4_2/sep/scale conv4_2/sep_relu4_2/sep
ConvolutionDepthWise conv5_1/dw 1 1 conv4_2/sep_relu4_2/sep conv5_1/dw 0=512 1=3 2=1 3=1 4=1 5=0 6=4608 7=512 8=1
BatchNorm conv5_1/dw/bn 1 1 conv5_1/dw conv5_1/dw_conv5_1/dw/bn 0=512
Scale conv5_1/dw/scale 1 1 conv5_1/dw_conv5_1/dw/bn conv5_1/dw_conv5_1/dw/scale 0=512 1=1
ReLU relu5_1/dw 1 1 conv5_1/dw_conv5_1/dw/scale conv5_1/dw_relu5_1/dw
Convolution conv5_1/sep 1 1 conv5_1/dw_relu5_1/dw conv5_1/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
BatchNorm conv5_1/sep/bn 1 1 conv5_1/sep conv5_1/sep_conv5_1/sep/bn 0=512
Scale conv5_1/sep/scale 1 1 conv5_1/sep_conv5_1/sep/bn conv5_1/sep_conv5_1/sep/scale 0=512 1=1
ReLU relu5_1/sep 1 1 conv5_1/sep_conv5_1/sep/scale conv5_1/sep_relu5_1/sep
ConvolutionDepthWise conv5_2/dw 1 1 conv5_1/sep_relu5_1/sep conv5_2/dw 0=512 1=3 2=1 3=1 4=1 5=0 6=4608 7=512 8=1
BatchNorm conv5_2/dw/bn 1 1 conv5_2/dw conv5_2/dw_conv5_2/dw/bn 0=512
Scale conv5_2/dw/scale 1 1 conv5_2/dw_conv5_2/dw/bn conv5_2/dw_conv5_2/dw/scale 0=512 1=1
ReLU relu5_2/dw 1 1 conv5_2/dw_conv5_2/dw/scale conv5_2/dw_relu5_2/dw
Convolution conv5_2/sep 1 1 conv5_2/dw_relu5_2/dw conv5_2/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
BatchNorm conv5_2/sep/bn 1 1 conv5_2/sep conv5_2/sep_conv5_2/sep/bn 0=512
Scale conv5_2/sep/scale 1 1 conv5_2/sep_conv5_2/sep/bn conv5_2/sep_conv5_2/sep/scale 0=512 1=1
ReLU relu5_2/sep 1 1 conv5_2/sep_conv5_2/sep/scale conv5_2/sep_relu5_2/sep
ConvolutionDepthWise conv5_3/dw 1 1 conv5_2/sep_relu5_2/sep conv5_3/dw 0=512 1=3 2=1 3=1 4=1 5=0 6=4608 7=512 8=1
BatchNorm conv5_3/dw/bn 1 1 conv5_3/dw conv5_3/dw_conv5_3/dw/bn 0=512
Scale conv5_3/dw/scale 1 1 conv5_3/dw_conv5_3/dw/bn conv5_3/dw_conv5_3/dw/scale 0=512 1=1
ReLU relu5_3/dw 1 1 conv5_3/dw_conv5_3/dw/scale conv5_3/dw_relu5_3/dw
Convolution conv5_3/sep 1 1 conv5_3/dw_relu5_3/dw conv5_3/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
BatchNorm conv5_3/sep/bn 1 1 conv5_3/sep conv5_3/sep_conv5_3/sep/bn 0=512
Scale conv5_3/sep/scale 1 1 conv5_3/sep_conv5_3/sep/bn conv5_3/sep_conv5_3/sep/scale 0=512 1=1
ReLU relu5_3/sep 1 1 conv5_3/sep_conv5_3/sep/scale conv5_3/sep_relu5_3/sep
ConvolutionDepthWise conv5_4/dw 1 1 conv5_3/sep_relu5_3/sep conv5_4/dw 0=512 1=3 2=1 3=1 4=1 5=0 6=4608 7=512 8=1
BatchNorm conv5_4/dw/bn 1 1 conv5_4/dw conv5_4/dw_conv5_4/dw/bn 0=512
Scale conv5_4/dw/scale 1 1 conv5_4/dw_conv5_4/dw/bn conv5_4/dw_conv5_4/dw/scale 0=512 1=1
ReLU relu5_4/dw 1 1 conv5_4/dw_conv5_4/dw/scale conv5_4/dw_relu5_4/dw
Convolution conv5_4/sep 1 1 conv5_4/dw_relu5_4/dw conv5_4/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
BatchNorm conv5_4/sep/bn 1 1 conv5_4/sep conv5_4/sep_conv5_4/sep/bn 0=512
Scale conv5_4/sep/scale 1 1 conv5_4/sep_conv5_4/sep/bn conv5_4/sep_conv5_4/sep/scale 0=512 1=1
ReLU relu5_4/sep 1 1 conv5_4/sep_conv5_4/sep/scale conv5_4/sep_relu5_4/sep
ConvolutionDepthWise conv5_5/dw 1 1 conv5_4/sep_relu5_4/sep conv5_5/dw 0=512 1=3 2=1 3=1 4=1 5=0 6=4608 7=512 8=1
BatchNorm conv5_5/dw/bn 1 1 conv5_5/dw conv5_5/dw_conv5_5/dw/bn 0=512
Scale conv5_5/dw/scale 1 1 conv5_5/dw_conv5_5/dw/bn conv5_5/dw_conv5_5/dw/scale 0=512 1=1
ReLU relu5_5/dw 1 1 conv5_5/dw_conv5_5/dw/scale conv5_5/dw_relu5_5/dw
Convolution conv5_5/sep 1 1 conv5_5/dw_relu5_5/dw conv5_5/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
BatchNorm conv5_5/sep/bn 1 1 conv5_5/sep conv5_5/sep_conv5_5/sep/bn 0=512
Scale conv5_5/sep/scale 1 1 conv5_5/sep_conv5_5/sep/bn conv5_5/sep_conv5_5/sep/scale 0=512 1=1
ReLU relu5_5/sep 1 1 conv5_5/sep_conv5_5/sep/scale conv5_5/sep_relu5_5/sep
ConvolutionDepthWise conv5_6/dw 1 1 conv5_5/sep_relu5_5/sep conv5_6/dw 0=512 1=3 2=1 3=2 4=1 5=0 6=4608 7=512 8=1
BatchNorm conv5_6/dw/bn 1 1 conv5_6/dw conv5_6/dw_conv5_6/dw/bn 0=512
Scale conv5_6/dw/scale 1 1 conv5_6/dw_conv5_6/dw/bn conv5_6/dw_conv5_6/dw/scale 0=512 1=1
ReLU relu5_6/dw 1 1 conv5_6/dw_conv5_6/dw/scale conv5_6/dw_relu5_6/dw
Convolution conv5_6/sep 1 1 conv5_6/dw_relu5_6/dw conv5_6/sep 0=1024 1=1 2=1 3=1 4=0 5=0 6=524288 8=2
BatchNorm conv5_6/sep/bn 1 1 conv5_6/sep conv5_6/sep_conv5_6/sep/bn 0=1024
Scale conv5_6/sep/scale 1 1 conv5_6/sep_conv5_6/sep/bn conv5_6/sep_conv5_6/sep/scale 0=1024 1=1
ReLU relu5_6/sep 1 1 conv5_6/sep_conv5_6/sep/scale conv5_6/sep_relu5_6/sep
ConvolutionDepthWise conv6/dw 1 1 conv5_6/sep_relu5_6/sep conv6/dw 0=1024 1=3 2=1 3=1 4=1 5=0 6=9216 7=1024 8=1
BatchNorm conv6/dw/bn 1 1 conv6/dw conv6/dw_conv6/dw/bn 0=1024
Scale conv6/dw/scale 1 1 conv6/dw_conv6/dw/bn conv6/dw_conv6/dw/scale 0=1024 1=1
ReLU relu6/dw 1 1 conv6/dw_conv6/dw/scale conv6/dw_relu6/dw
Convolution conv6/sep 1 1 conv6/dw_relu6/dw conv6/sep 0=1024 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2
BatchNorm conv6/sep/bn 1 1 conv6/sep conv6/sep_conv6/sep/bn 0=1024
Scale conv6/sep/scale 1 1 conv6/sep_conv6/sep/bn conv6/sep_conv6/sep/scale 0=1024 1=1
ReLU relu6/sep 1 1 conv6/sep_conv6/sep/scale conv6/sep_relu6/sep
Pooling pool6 1 1 conv6/sep_relu6/sep pool6 0=1 1=0 2=1 3=0 4=1
Convolution fc7 1 1 pool6 fc7 0=1000 1=1 2=1 3=1 4=0 5=1 6=1024000 8=2
Softmax prob 1 1 fc7 prob 0=0

+ 129
- 0
benchmark/mobilenet_ssd_int8.param View File

@@ -0,0 +1,129 @@
7767517
127 150
Input data 0 1 data 0=300 1=300 2=3
Split splitncnn_0 1 7 data data_splitncnn_0 data_splitncnn_1 data_splitncnn_2 data_splitncnn_3 data_splitncnn_4 data_splitncnn_5 data_splitncnn_6
Convolution conv0 1 1 data_splitncnn_6 conv0 0=32 1=3 2=1 3=2 4=1 5=1 6=864 8=2
ReLU conv0/relu 1 1 conv0 conv0_conv0/relu
ConvolutionDepthWise conv1/dw 1 1 conv0_conv0/relu conv1/dw 0=32 1=3 2=1 3=1 4=1 5=1 6=288 7=32 8=1
ReLU conv1/dw/relu 1 1 conv1/dw conv1/dw_conv1/dw/relu
Convolution conv1 1 1 conv1/dw_conv1/dw/relu conv1 0=64 1=1 2=1 3=1 4=0 5=1 6=2048 8=2
ReLU conv1/relu 1 1 conv1 conv1_conv1/relu
ConvolutionDepthWise conv2/dw 1 1 conv1_conv1/relu conv2/dw 0=64 1=3 2=1 3=2 4=1 5=1 6=576 7=64 8=1
ReLU conv2/dw/relu 1 1 conv2/dw conv2/dw_conv2/dw/relu
Convolution conv2 1 1 conv2/dw_conv2/dw/relu conv2 0=128 1=1 2=1 3=1 4=0 5=1 6=8192 8=2
ReLU conv2/relu 1 1 conv2 conv2_conv2/relu
ConvolutionDepthWise conv3/dw 1 1 conv2_conv2/relu conv3/dw 0=128 1=3 2=1 3=1 4=1 5=1 6=1152 7=128 8=1
ReLU conv3/dw/relu 1 1 conv3/dw conv3/dw_conv3/dw/relu
Convolution conv3 1 1 conv3/dw_conv3/dw/relu conv3 0=128 1=1 2=1 3=1 4=0 5=1 6=16384 8=2
ReLU conv3/relu 1 1 conv3 conv3_conv3/relu
ConvolutionDepthWise conv4/dw 1 1 conv3_conv3/relu conv4/dw 0=128 1=3 2=1 3=2 4=1 5=1 6=1152 7=128 8=1
ReLU conv4/dw/relu 1 1 conv4/dw conv4/dw_conv4/dw/relu
Convolution conv4 1 1 conv4/dw_conv4/dw/relu conv4 0=256 1=1 2=1 3=1 4=0 5=1 6=32768 8=2
ReLU conv4/relu 1 1 conv4 conv4_conv4/relu
ConvolutionDepthWise conv5/dw 1 1 conv4_conv4/relu conv5/dw 0=256 1=3 2=1 3=1 4=1 5=1 6=2304 7=256 8=1
ReLU conv5/dw/relu 1 1 conv5/dw conv5/dw_conv5/dw/relu
Convolution conv5 1 1 conv5/dw_conv5/dw/relu conv5 0=256 1=1 2=1 3=1 4=0 5=1 6=65536 8=2
ReLU conv5/relu 1 1 conv5 conv5_conv5/relu
ConvolutionDepthWise conv6/dw 1 1 conv5_conv5/relu conv6/dw 0=256 1=3 2=1 3=2 4=1 5=1 6=2304 7=256 8=1
ReLU conv6/dw/relu 1 1 conv6/dw conv6/dw_conv6/dw/relu
Convolution conv6 1 1 conv6/dw_conv6/dw/relu conv6 0=512 1=1 2=1 3=1 4=0 5=1 6=131072 8=2
ReLU conv6/relu 1 1 conv6 conv6_conv6/relu
ConvolutionDepthWise conv7/dw 1 1 conv6_conv6/relu conv7/dw 0=512 1=3 2=1 3=1 4=1 5=1 6=4608 7=512 8=1
ReLU conv7/dw/relu 1 1 conv7/dw conv7/dw_conv7/dw/relu
Convolution conv7 1 1 conv7/dw_conv7/dw/relu conv7 0=512 1=1 2=1 3=1 4=0 5=1 6=262144 8=2
ReLU conv7/relu 1 1 conv7 conv7_conv7/relu
ConvolutionDepthWise conv8/dw 1 1 conv7_conv7/relu conv8/dw 0=512 1=3 2=1 3=1 4=1 5=1 6=4608 7=512 8=1
ReLU conv8/dw/relu 1 1 conv8/dw conv8/dw_conv8/dw/relu
Convolution conv8 1 1 conv8/dw_conv8/dw/relu conv8 0=512 1=1 2=1 3=1 4=0 5=1 6=262144 8=2
ReLU conv8/relu 1 1 conv8 conv8_conv8/relu
ConvolutionDepthWise conv9/dw 1 1 conv8_conv8/relu conv9/dw 0=512 1=3 2=1 3=1 4=1 5=1 6=4608 7=512 8=1
ReLU conv9/dw/relu 1 1 conv9/dw conv9/dw_conv9/dw/relu
Convolution conv9 1 1 conv9/dw_conv9/dw/relu conv9 0=512 1=1 2=1 3=1 4=0 5=1 6=262144 8=2
ReLU conv9/relu 1 1 conv9 conv9_conv9/relu
ConvolutionDepthWise conv10/dw 1 1 conv9_conv9/relu conv10/dw 0=512 1=3 2=1 3=1 4=1 5=1 6=4608 7=512 8=1
ReLU conv10/dw/relu 1 1 conv10/dw conv10/dw_conv10/dw/relu
Convolution conv10 1 1 conv10/dw_conv10/dw/relu conv10 0=512 1=1 2=1 3=1 4=0 5=1 6=262144 8=2
ReLU conv10/relu 1 1 conv10 conv10_conv10/relu
ConvolutionDepthWise conv11/dw 1 1 conv10_conv10/relu conv11/dw 0=512 1=3 2=1 3=1 4=1 5=1 6=4608 7=512 8=1
ReLU conv11/dw/relu 1 1 conv11/dw conv11/dw_conv11/dw/relu
Convolution conv11 1 1 conv11/dw_conv11/dw/relu conv11 0=512 1=1 2=1 3=1 4=0 5=1 6=262144 8=2
ReLU conv11/relu 1 1 conv11 conv11_conv11/relu
Split splitncnn_1 1 4 conv11_conv11/relu conv11_conv11/relu_splitncnn_0 conv11_conv11/relu_splitncnn_1 conv11_conv11/relu_splitncnn_2 conv11_conv11/relu_splitncnn_3
ConvolutionDepthWise conv12/dw 1 1 conv11_conv11/relu_splitncnn_3 conv12/dw 0=512 1=3 2=1 3=2 4=1 5=1 6=4608 7=512 8=1
ReLU conv12/dw/relu 1 1 conv12/dw conv12/dw_conv12/dw/relu
Convolution conv12 1 1 conv12/dw_conv12/dw/relu conv12 0=1024 1=1 2=1 3=1 4=0 5=1 6=524288 8=2
ReLU conv12/relu 1 1 conv12 conv12_conv12/relu
ConvolutionDepthWise conv13/dw 1 1 conv12_conv12/relu conv13/dw 0=1024 1=3 2=1 3=1 4=1 5=1 6=9216 7=1024 8=1
ReLU conv13/dw/relu 1 1 conv13/dw conv13/dw_conv13/dw/relu
Convolution conv13 1 1 conv13/dw_conv13/dw/relu conv13 0=1024 1=1 2=1 3=1 4=0 5=1 6=1048576 8=2
ReLU conv13/relu 1 1 conv13 conv13_conv13/relu
Split splitncnn_2 1 4 conv13_conv13/relu conv13_conv13/relu_splitncnn_0 conv13_conv13/relu_splitncnn_1 conv13_conv13/relu_splitncnn_2 conv13_conv13/relu_splitncnn_3
Convolution conv14_1 1 1 conv13_conv13/relu_splitncnn_3 conv14_1 0=256 1=1 2=1 3=1 4=0 5=1 6=262144 8=2
ReLU conv14_1/relu 1 1 conv14_1 conv14_1_conv14_1/relu
Convolution conv14_2 1 1 conv14_1_conv14_1/relu conv14_2 0=512 1=3 2=1 3=2 4=1 5=1 6=1179648 8=2
ReLU conv14_2/relu 1 1 conv14_2 conv14_2_conv14_2/relu
Split splitncnn_3 1 4 conv14_2_conv14_2/relu conv14_2_conv14_2/relu_splitncnn_0 conv14_2_conv14_2/relu_splitncnn_1 conv14_2_conv14_2/relu_splitncnn_2 conv14_2_conv14_2/relu_splitncnn_3
Convolution conv15_1 1 1 conv14_2_conv14_2/relu_splitncnn_3 conv15_1 0=128 1=1 2=1 3=1 4=0 5=1 6=65536 8=2
ReLU conv15_1/relu 1 1 conv15_1 conv15_1_conv15_1/relu
Convolution conv15_2 1 1 conv15_1_conv15_1/relu conv15_2 0=256 1=3 2=1 3=2 4=1 5=1 6=294912 8=2
ReLU conv15_2/relu 1 1 conv15_2 conv15_2_conv15_2/relu
Split splitncnn_4 1 4 conv15_2_conv15_2/relu conv15_2_conv15_2/relu_splitncnn_0 conv15_2_conv15_2/relu_splitncnn_1 conv15_2_conv15_2/relu_splitncnn_2 conv15_2_conv15_2/relu_splitncnn_3
Convolution conv16_1 1 1 conv15_2_conv15_2/relu_splitncnn_3 conv16_1 0=128 1=1 2=1 3=1 4=0 5=1 6=32768 8=2
ReLU conv16_1/relu 1 1 conv16_1 conv16_1_conv16_1/relu
Convolution conv16_2 1 1 conv16_1_conv16_1/relu conv16_2 0=256 1=3 2=1 3=2 4=1 5=1 6=294912 8=2
ReLU conv16_2/relu 1 1 conv16_2 conv16_2_conv16_2/relu
Split splitncnn_5 1 4 conv16_2_conv16_2/relu conv16_2_conv16_2/relu_splitncnn_0 conv16_2_conv16_2/relu_splitncnn_1 conv16_2_conv16_2/relu_splitncnn_2 conv16_2_conv16_2/relu_splitncnn_3
Convolution conv17_1 1 1 conv16_2_conv16_2/relu_splitncnn_3 conv17_1 0=64 1=1 2=1 3=1 4=0 5=1 6=16384 8=2
ReLU conv17_1/relu 1 1 conv17_1 conv17_1_conv17_1/relu
Convolution conv17_2 1 1 conv17_1_conv17_1/relu conv17_2 0=128 1=3 2=1 3=2 4=1 5=1 6=73728 8=2
ReLU conv17_2/relu 1 1 conv17_2 conv17_2_conv17_2/relu
Split splitncnn_6 1 3 conv17_2_conv17_2/relu conv17_2_conv17_2/relu_splitncnn_0 conv17_2_conv17_2/relu_splitncnn_1 conv17_2_conv17_2/relu_splitncnn_2
Convolution conv11_mbox_loc 1 1 conv11_conv11/relu_splitncnn_2 conv11_mbox_loc 0=12 1=1 2=1 3=1 4=0 5=1 6=6144 8=2
Permute conv11_mbox_loc_perm 1 1 conv11_mbox_loc conv11_mbox_loc_perm 0=3
Flatten conv11_mbox_loc_flat 1 1 conv11_mbox_loc_perm conv11_mbox_loc_flat
Convolution conv11_mbox_conf 1 1 conv11_conv11/relu_splitncnn_1 conv11_mbox_conf 0=63 1=1 2=1 3=1 4=0 5=1 6=32256 8=2
Permute conv11_mbox_conf_perm 1 1 conv11_mbox_conf conv11_mbox_conf_perm 0=3
Flatten conv11_mbox_conf_flat 1 1 conv11_mbox_conf_perm conv11_mbox_conf_flat
PriorBox conv11_mbox_priorbox 2 1 conv11_conv11/relu_splitncnn_0 data_splitncnn_5 conv11_mbox_priorbox -23300=1,60.000000 -23301=0 -23302=1,2.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000
Convolution conv13_mbox_loc 1 1 conv13_conv13/relu_splitncnn_2 conv13_mbox_loc 0=24 1=1 2=1 3=1 4=0 5=1 6=24576 8=2
Permute conv13_mbox_loc_perm 1 1 conv13_mbox_loc conv13_mbox_loc_perm 0=3
Flatten conv13_mbox_loc_flat 1 1 conv13_mbox_loc_perm conv13_mbox_loc_flat
Convolution conv13_mbox_conf 1 1 conv13_conv13/relu_splitncnn_1 conv13_mbox_conf 0=126 1=1 2=1 3=1 4=0 5=1 6=129024 8=2
Permute conv13_mbox_conf_perm 1 1 conv13_mbox_conf conv13_mbox_conf_perm 0=3
Flatten conv13_mbox_conf_flat 1 1 conv13_mbox_conf_perm conv13_mbox_conf_flat
PriorBox conv13_mbox_priorbox 2 1 conv13_conv13/relu_splitncnn_0 data_splitncnn_4 conv13_mbox_priorbox -23300=1,105.000000 -23301=1,150.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000
Convolution conv14_2_mbox_loc 1 1 conv14_2_conv14_2/relu_splitncnn_2 conv14_2_mbox_loc 0=24 1=1 2=1 3=1 4=0 5=1 6=12288 8=2
Permute conv14_2_mbox_loc_perm 1 1 conv14_2_mbox_loc conv14_2_mbox_loc_perm 0=3
Flatten conv14_2_mbox_loc_flat 1 1 conv14_2_mbox_loc_perm conv14_2_mbox_loc_flat
Convolution conv14_2_mbox_conf 1 1 conv14_2_conv14_2/relu_splitncnn_1 conv14_2_mbox_conf 0=126 1=1 2=1 3=1 4=0 5=1 6=64512 8=2
Permute conv14_2_mbox_conf_perm 1 1 conv14_2_mbox_conf conv14_2_mbox_conf_perm 0=3
Flatten conv14_2_mbox_conf_flat 1 1 conv14_2_mbox_conf_perm conv14_2_mbox_conf_flat
PriorBox conv14_2_mbox_priorbox 2 1 conv14_2_conv14_2/relu_splitncnn_0 data_splitncnn_3 conv14_2_mbox_priorbox -23300=1,150.000000 -23301=1,195.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000
Convolution conv15_2_mbox_loc 1 1 conv15_2_conv15_2/relu_splitncnn_2 conv15_2_mbox_loc 0=24 1=1 2=1 3=1 4=0 5=1 6=6144 8=2
Permute conv15_2_mbox_loc_perm 1 1 conv15_2_mbox_loc conv15_2_mbox_loc_perm 0=3
Flatten conv15_2_mbox_loc_flat 1 1 conv15_2_mbox_loc_perm conv15_2_mbox_loc_flat
Convolution conv15_2_mbox_conf 1 1 conv15_2_conv15_2/relu_splitncnn_1 conv15_2_mbox_conf 0=126 1=1 2=1 3=1 4=0 5=1 6=32256 8=2
Permute conv15_2_mbox_conf_perm 1 1 conv15_2_mbox_conf conv15_2_mbox_conf_perm 0=3
Flatten conv15_2_mbox_conf_flat 1 1 conv15_2_mbox_conf_perm conv15_2_mbox_conf_flat
PriorBox conv15_2_mbox_priorbox 2 1 conv15_2_conv15_2/relu_splitncnn_0 data_splitncnn_2 conv15_2_mbox_priorbox -23300=1,195.000000 -23301=1,240.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000
Convolution conv16_2_mbox_loc 1 1 conv16_2_conv16_2/relu_splitncnn_2 conv16_2_mbox_loc 0=24 1=1 2=1 3=1 4=0 5=1 6=6144 8=2
Permute conv16_2_mbox_loc_perm 1 1 conv16_2_mbox_loc conv16_2_mbox_loc_perm 0=3
Flatten conv16_2_mbox_loc_flat 1 1 conv16_2_mbox_loc_perm conv16_2_mbox_loc_flat
Convolution conv16_2_mbox_conf 1 1 conv16_2_conv16_2/relu_splitncnn_1 conv16_2_mbox_conf 0=126 1=1 2=1 3=1 4=0 5=1 6=32256 8=2
Permute conv16_2_mbox_conf_perm 1 1 conv16_2_mbox_conf conv16_2_mbox_conf_perm 0=3
Flatten conv16_2_mbox_conf_flat 1 1 conv16_2_mbox_conf_perm conv16_2_mbox_conf_flat
PriorBox conv16_2_mbox_priorbox 2 1 conv16_2_conv16_2/relu_splitncnn_0 data_splitncnn_1 conv16_2_mbox_priorbox -23300=1,240.000000 -23301=1,285.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000
Convolution conv17_2_mbox_loc 1 1 conv17_2_conv17_2/relu_splitncnn_2 conv17_2_mbox_loc 0=24 1=1 2=1 3=1 4=0 5=1 6=3072 8=2
Permute conv17_2_mbox_loc_perm 1 1 conv17_2_mbox_loc conv17_2_mbox_loc_perm 0=3
Flatten conv17_2_mbox_loc_flat 1 1 conv17_2_mbox_loc_perm conv17_2_mbox_loc_flat
Convolution conv17_2_mbox_conf 1 1 conv17_2_conv17_2/relu_splitncnn_1 conv17_2_mbox_conf 0=126 1=1 2=1 3=1 4=0 5=1 6=16128 8=2
Permute conv17_2_mbox_conf_perm 1 1 conv17_2_mbox_conf conv17_2_mbox_conf_perm 0=3
Flatten conv17_2_mbox_conf_flat 1 1 conv17_2_mbox_conf_perm conv17_2_mbox_conf_flat
PriorBox conv17_2_mbox_priorbox 2 1 conv17_2_conv17_2/relu_splitncnn_0 data_splitncnn_0 conv17_2_mbox_priorbox -23300=1,285.000000 -23301=1,300.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000
Concat mbox_loc 6 1 conv11_mbox_loc_flat conv13_mbox_loc_flat conv14_2_mbox_loc_flat conv15_2_mbox_loc_flat conv16_2_mbox_loc_flat conv17_2_mbox_loc_flat mbox_loc 0=0
Concat mbox_conf 6 1 conv11_mbox_conf_flat conv13_mbox_conf_flat conv14_2_mbox_conf_flat conv15_2_mbox_conf_flat conv16_2_mbox_conf_flat conv17_2_mbox_conf_flat mbox_conf 0=0
Concat mbox_priorbox 6 1 conv11_mbox_priorbox conv13_mbox_priorbox conv14_2_mbox_priorbox conv15_2_mbox_priorbox conv16_2_mbox_priorbox conv17_2_mbox_priorbox mbox_priorbox 0=1
Reshape mbox_conf_reshape 1 1 mbox_conf mbox_conf_reshape 0=21 1=-1 2=0 3=0
Softmax mbox_conf_softmax 1 1 mbox_conf_reshape mbox_conf_softmax 0=1
Flatten mbox_conf_flatten 1 1 mbox_conf_softmax mbox_conf_flatten
DetectionOutput detection_out 3 1 mbox_loc mbox_conf_flatten mbox_priorbox detection_out 0=21 1=0.450000 2=100 3=100 4=0.250000

+ 103
- 0
benchmark/resnet18_int8.param View File

@@ -0,0 +1,103 @@
7767517
101 109
Input data 0 1 data 0=224 1=224 2=3
Convolution conv1 1 1 data conv1 0=64 1=7 2=1 3=2 4=3 5=0 6=9408 8=2
BatchNorm bn_conv1 1 1 conv1 conv1_bn_conv1 0=64
Scale scale_conv1 1 1 conv1_bn_conv1 conv1_scale_conv1 0=64 1=1
ReLU conv1_relu 1 1 conv1_scale_conv1 conv1_conv1_relu
Pooling pool1 1 1 conv1_conv1_relu pool1 0=0 1=3 2=2 3=0 4=0
Split splitncnn_0 1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1
Convolution res2a_branch1 1 1 pool1_splitncnn_1 res2a_branch1 0=64 1=1 2=1 3=1 4=0 5=0 6=4096 8=2
BatchNorm bn2a_branch1 1 1 res2a_branch1 res2a_branch1_bn2a_branch1 0=64
Scale scale2a_branch1 1 1 res2a_branch1_bn2a_branch1 res2a_branch1_scale2a_branch1 0=64 1=1
Convolution res2a_branch2a 1 1 pool1_splitncnn_0 res2a_branch2a 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2
BatchNorm bn2a_branch2a 1 1 res2a_branch2a res2a_branch2a_bn2a_branch2a 0=64
Scale scale2a_branch2a 1 1 res2a_branch2a_bn2a_branch2a res2a_branch2a_scale2a_branch2a 0=64 1=1
ReLU res2a_branch2a_relu 1 1 res2a_branch2a_scale2a_branch2a res2a_branch2a_res2a_branch2a_relu
Convolution res2a_branch2b 1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2
BatchNorm bn2a_branch2b 1 1 res2a_branch2b res2a_branch2b_bn2a_branch2b 0=64
Scale scale2a_branch2b 1 1 res2a_branch2b_bn2a_branch2b res2a_branch2b_scale2a_branch2b 0=64 1=1
Eltwise res2a 2 1 res2a_branch1_scale2a_branch1 res2a_branch2b_scale2a_branch2b res2a 0=1 -23301=0
ReLU res2a_relu 1 1 res2a res2a_res2a_relu
Split splitncnn_1 1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1
Convolution res2b_branch2a 1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2
BatchNorm bn2b_branch2a 1 1 res2b_branch2a res2b_branch2a_bn2b_branch2a 0=64
Scale scale2b_branch2a 1 1 res2b_branch2a_bn2b_branch2a res2b_branch2a_scale2b_branch2a 0=64 1=1
ReLU res2b_branch2a_relu 1 1 res2b_branch2a_scale2b_branch2a res2b_branch2a_res2b_branch2a_relu
Convolution res2b_branch2b 1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2
BatchNorm bn2b_branch2b 1 1 res2b_branch2b res2b_branch2b_bn2b_branch2b 0=64
Scale scale2b_branch2b 1 1 res2b_branch2b_bn2b_branch2b res2b_branch2b_scale2b_branch2b 0=64 1=1
Eltwise res2b 2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2b_scale2b_branch2b res2b 0=1 -23301=0
ReLU res2b_relu 1 1 res2b res2b_res2b_relu
Split splitncnn_2 1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1
Convolution res3a_branch1 1 1 res2b_res2b_relu_splitncnn_1 res3a_branch1 0=128 1=1 2=1 3=2 4=0 5=0 6=8192 8=2
BatchNorm bn3a_branch1 1 1 res3a_branch1 res3a_branch1_bn3a_branch1 0=128
Scale scale3a_branch1 1 1 res3a_branch1_bn3a_branch1 res3a_branch1_scale3a_branch1 0=128 1=1
Convolution res3a_branch2a 1 1 res2b_res2b_relu_splitncnn_0 res3a_branch2a 0=128 1=3 2=1 3=2 4=1 5=0 6=73728 8=2
BatchNorm bn3a_branch2a 1 1 res3a_branch2a res3a_branch2a_bn3a_branch2a 0=128
Scale scale3a_branch2a 1 1 res3a_branch2a_bn3a_branch2a res3a_branch2a_scale3a_branch2a 0=128 1=1
ReLU res3a_branch2a_relu 1 1 res3a_branch2a_scale3a_branch2a res3a_branch2a_res3a_branch2a_relu
Convolution res3a_branch2b 1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2
BatchNorm bn3a_branch2b 1 1 res3a_branch2b res3a_branch2b_bn3a_branch2b 0=128
Scale scale3a_branch2b 1 1 res3a_branch2b_bn3a_branch2b res3a_branch2b_scale3a_branch2b 0=128 1=1
Eltwise res3a 2 1 res3a_branch1_scale3a_branch1 res3a_branch2b_scale3a_branch2b res3a 0=1 -23301=0
ReLU res3a_relu 1 1 res3a res3a_res3a_relu
Split splitncnn_3 1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1
Convolution res3b_branch2a 1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2
BatchNorm bn3b_branch2a 1 1 res3b_branch2a res3b_branch2a_bn3b_branch2a 0=128
Scale scale3b_branch2a 1 1 res3b_branch2a_bn3b_branch2a res3b_branch2a_scale3b_branch2a 0=128 1=1
ReLU res3b_branch2a_relu 1 1 res3b_branch2a_scale3b_branch2a res3b_branch2a_res3b_branch2a_relu
Convolution res3b_branch2b 1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2
BatchNorm bn3b_branch2b 1 1 res3b_branch2b res3b_branch2b_bn3b_branch2b 0=128
Scale scale3b_branch2b 1 1 res3b_branch2b_bn3b_branch2b res3b_branch2b_scale3b_branch2b 0=128 1=1
Eltwise res3b 2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2b_scale3b_branch2b res3b 0=1 -23301=0
ReLU res3b_relu 1 1 res3b res3b_res3b_relu
Split splitncnn_4 1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1
Convolution res4a_branch1 1 1 res3b_res3b_relu_splitncnn_1 res4a_branch1 0=256 1=1 2=1 3=2 4=0 5=0 6=32768 8=2
BatchNorm bn4a_branch1 1 1 res4a_branch1 res4a_branch1_bn4a_branch1 0=256
Scale scale4a_branch1 1 1 res4a_branch1_bn4a_branch1 res4a_branch1_scale4a_branch1 0=256 1=1
Convolution res4a_branch2a 1 1 res3b_res3b_relu_splitncnn_0 res4a_branch2a 0=256 1=3 2=1 3=2 4=1 5=0 6=294912 8=2
BatchNorm bn4a_branch2a 1 1 res4a_branch2a res4a_branch2a_bn4a_branch2a 0=256
Scale scale4a_branch2a 1 1 res4a_branch2a_bn4a_branch2a res4a_branch2a_scale4a_branch2a 0=256 1=1
ReLU res4a_branch2a_relu 1 1 res4a_branch2a_scale4a_branch2a res4a_branch2a_res4a_branch2a_relu
Convolution res4a_branch2b 1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2
BatchNorm bn4a_branch2b 1 1 res4a_branch2b res4a_branch2b_bn4a_branch2b 0=256
Scale scale4a_branch2b 1 1 res4a_branch2b_bn4a_branch2b res4a_branch2b_scale4a_branch2b 0=256 1=1
Eltwise res4a 2 1 res4a_branch1_scale4a_branch1 res4a_branch2b_scale4a_branch2b res4a 0=1 -23301=0
ReLU res4a_relu 1 1 res4a res4a_res4a_relu
Split splitncnn_5 1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1
Convolution res4b_branch2a 1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2
BatchNorm bn4b_branch2a 1 1 res4b_branch2a res4b_branch2a_bn4b_branch2a 0=256
Scale scale4b_branch2a 1 1 res4b_branch2a_bn4b_branch2a res4b_branch2a_scale4b_branch2a 0=256 1=1
ReLU res4b_branch2a_relu 1 1 res4b_branch2a_scale4b_branch2a res4b_branch2a_res4b_branch2a_relu
Convolution res4b_branch2b 1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2
BatchNorm bn4b_branch2b 1 1 res4b_branch2b res4b_branch2b_bn4b_branch2b 0=256
Scale scale4b_branch2b 1 1 res4b_branch2b_bn4b_branch2b res4b_branch2b_scale4b_branch2b 0=256 1=1
Eltwise res4b 2 1 res4a_res4a_relu_splitncnn_0 res4b_branch2b_scale4b_branch2b res4b 0=1 -23301=0
ReLU res4b_relu 1 1 res4b res4b_res4b_relu
Split splitncnn_6 1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1
Convolution res5a_branch1 1 1 res4b_res4b_relu_splitncnn_1 res5a_branch1 0=512 1=1 2=1 3=2 4=0 5=0 6=131072 8=2
BatchNorm bn5a_branch1 1 1 res5a_branch1 res5a_branch1_bn5a_branch1 0=512
Scale scale5a_branch1 1 1 res5a_branch1_bn5a_branch1 res5a_branch1_scale5a_branch1 0=512 1=1
Convolution res5a_branch2a 1 1 res4b_res4b_relu_splitncnn_0 res5a_branch2a 0=512 1=3 2=1 3=2 4=1 5=0 6=1179648 8=2
BatchNorm bn5a_branch2a 1 1 res5a_branch2a res5a_branch2a_bn5a_branch2a 0=512
Scale scale5a_branch2a 1 1 res5a_branch2a_bn5a_branch2a res5a_branch2a_scale5a_branch2a 0=512 1=1
ReLU res5a_branch2a_relu 1 1 res5a_branch2a_scale5a_branch2a res5a_branch2a_res5a_branch2a_relu
Convolution res5a_branch2b 1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2
BatchNorm bn5a_branch2b 1 1 res5a_branch2b res5a_branch2b_bn5a_branch2b 0=512
Scale scale5a_branch2b 1 1 res5a_branch2b_bn5a_branch2b res5a_branch2b_scale5a_branch2b 0=512 1=1
Eltwise res5a 2 1 res5a_branch1_scale5a_branch1 res5a_branch2b_scale5a_branch2b res5a 0=1 -23301=0
ReLU res5a_relu 1 1 res5a res5a_res5a_relu
Split splitncnn_7 1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1
Convolution res5b_branch2a 1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2
BatchNorm bn5b_branch2a 1 1 res5b_branch2a res5b_branch2a_bn5b_branch2a 0=512
Scale scale5b_branch2a 1 1 res5b_branch2a_bn5b_branch2a res5b_branch2a_scale5b_branch2a 0=512 1=1
ReLU res5b_branch2a_relu 1 1 res5b_branch2a_scale5b_branch2a res5b_branch2a_res5b_branch2a_relu
Convolution res5b_branch2b 1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2
BatchNorm bn5b_branch2b 1 1 res5b_branch2b res5b_branch2b_bn5b_branch2b 0=512
Scale scale5b_branch2b 1 1 res5b_branch2b_bn5b_branch2b res5b_branch2b_scale5b_branch2b 0=512 1=1
Eltwise res5b 2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2b_scale5b_branch2b res5b 0=1 -23301=0
ReLU res5b_relu 1 1 res5b res5b_res5b_relu
Pooling pool5 1 1 res5b_res5b_relu pool5 0=1 1=7 2=1 3=0 4=0
InnerProduct fc1000 1 1 pool5 fc1000 0=1000 1=1 2=512000
Softmax prob 1 1 fc1000 prob 0=0

+ 247
- 0
benchmark/resnet50.param View File

@@ -0,0 +1,247 @@
7767517
245 261
Input data 0 1 data 0=224 1=224 2=3
Convolution conv1 1 1 data conv1 0=64 1=7 2=1 3=2 4=3 5=1 6=9408
BatchNorm bn_conv1 1 1 conv1 conv1_bn_conv1 0=64
Scale scale_conv1 1 1 conv1_bn_conv1 conv1_scale_conv1 0=64 1=1
ReLU conv1_relu 1 1 conv1_scale_conv1 conv1_conv1_relu
Pooling pool1 1 1 conv1_conv1_relu pool1 0=0 1=3 2=2 3=0 4=0
Split splitncnn_0 1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1
Convolution res2a_branch1 1 1 pool1_splitncnn_1 res2a_branch1 0=256 1=1 2=1 3=1 4=0 5=0 6=16384
BatchNorm bn2a_branch1 1 1 res2a_branch1 res2a_branch1_bn2a_branch1 0=256
Scale scale2a_branch1 1 1 res2a_branch1_bn2a_branch1 res2a_branch1_scale2a_branch1 0=256 1=1
Convolution res2a_branch2a 1 1 pool1_splitncnn_0 res2a_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=4096
BatchNorm bn2a_branch2a 1 1 res2a_branch2a res2a_branch2a_bn2a_branch2a 0=64
Scale scale2a_branch2a 1 1 res2a_branch2a_bn2a_branch2a res2a_branch2a_scale2a_branch2a 0=64 1=1
ReLU res2a_branch2a_relu 1 1 res2a_branch2a_scale2a_branch2a res2a_branch2a_res2a_branch2a_relu
Convolution res2a_branch2b 1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864
BatchNorm bn2a_branch2b 1 1 res2a_branch2b res2a_branch2b_bn2a_branch2b 0=64
Scale scale2a_branch2b 1 1 res2a_branch2b_bn2a_branch2b res2a_branch2b_scale2a_branch2b 0=64 1=1
ReLU res2a_branch2b_relu 1 1 res2a_branch2b_scale2a_branch2b res2a_branch2b_res2a_branch2b_relu
Convolution res2a_branch2c 1 1 res2a_branch2b_res2a_branch2b_relu res2a_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384
BatchNorm bn2a_branch2c 1 1 res2a_branch2c res2a_branch2c_bn2a_branch2c 0=256
Scale scale2a_branch2c 1 1 res2a_branch2c_bn2a_branch2c res2a_branch2c_scale2a_branch2c 0=256 1=1
Eltwise res2a 2 1 res2a_branch1_scale2a_branch1 res2a_branch2c_scale2a_branch2c res2a 0=1 -23301=0
ReLU res2a_relu 1 1 res2a res2a_res2a_relu
Split splitncnn_1 1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1
Convolution res2b_branch2a 1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=16384
BatchNorm bn2b_branch2a 1 1 res2b_branch2a res2b_branch2a_bn2b_branch2a 0=64
Scale scale2b_branch2a 1 1 res2b_branch2a_bn2b_branch2a res2b_branch2a_scale2b_branch2a 0=64 1=1
ReLU res2b_branch2a_relu 1 1 res2b_branch2a_scale2b_branch2a res2b_branch2a_res2b_branch2a_relu
Convolution res2b_branch2b 1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864
BatchNorm bn2b_branch2b 1 1 res2b_branch2b res2b_branch2b_bn2b_branch2b 0=64
Scale scale2b_branch2b 1 1 res2b_branch2b_bn2b_branch2b res2b_branch2b_scale2b_branch2b 0=64 1=1
ReLU res2b_branch2b_relu 1 1 res2b_branch2b_scale2b_branch2b res2b_branch2b_res2b_branch2b_relu
Convolution res2b_branch2c 1 1 res2b_branch2b_res2b_branch2b_relu res2b_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384
BatchNorm bn2b_branch2c 1 1 res2b_branch2c res2b_branch2c_bn2b_branch2c 0=256
Scale scale2b_branch2c 1 1 res2b_branch2c_bn2b_branch2c res2b_branch2c_scale2b_branch2c 0=256 1=1
Eltwise res2b 2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2c_scale2b_branch2c res2b 0=1 -23301=0
ReLU res2b_relu 1 1 res2b res2b_res2b_relu
Split splitncnn_2 1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1
Convolution res2c_branch2a 1 1 res2b_res2b_relu_splitncnn_1 res2c_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=16384
BatchNorm bn2c_branch2a 1 1 res2c_branch2a res2c_branch2a_bn2c_branch2a 0=64
Scale scale2c_branch2a 1 1 res2c_branch2a_bn2c_branch2a res2c_branch2a_scale2c_branch2a 0=64 1=1
ReLU res2c_branch2a_relu 1 1 res2c_branch2a_scale2c_branch2a res2c_branch2a_res2c_branch2a_relu
Convolution res2c_branch2b 1 1 res2c_branch2a_res2c_branch2a_relu res2c_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864
BatchNorm bn2c_branch2b 1 1 res2c_branch2b res2c_branch2b_bn2c_branch2b 0=64
Scale scale2c_branch2b 1 1 res2c_branch2b_bn2c_branch2b res2c_branch2b_scale2c_branch2b 0=64 1=1
ReLU res2c_branch2b_relu 1 1 res2c_branch2b_scale2c_branch2b res2c_branch2b_res2c_branch2b_relu
Convolution res2c_branch2c 1 1 res2c_branch2b_res2c_branch2b_relu res2c_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384
BatchNorm bn2c_branch2c 1 1 res2c_branch2c res2c_branch2c_bn2c_branch2c 0=256
Scale scale2c_branch2c 1 1 res2c_branch2c_bn2c_branch2c res2c_branch2c_scale2c_branch2c 0=256 1=1
Eltwise res2c 2 1 res2b_res2b_relu_splitncnn_0 res2c_branch2c_scale2c_branch2c res2c 0=1 -23301=0
ReLU res2c_relu 1 1 res2c res2c_res2c_relu
Split splitncnn_3 1 2 res2c_res2c_relu res2c_res2c_relu_splitncnn_0 res2c_res2c_relu_splitncnn_1
Convolution res3a_branch1 1 1 res2c_res2c_relu_splitncnn_1 res3a_branch1 0=512 1=1 2=1 3=2 4=0 5=0 6=131072
BatchNorm bn3a_branch1 1 1 res3a_branch1 res3a_branch1_bn3a_branch1 0=512
Scale scale3a_branch1 1 1 res3a_branch1_bn3a_branch1 res3a_branch1_scale3a_branch1 0=512 1=1
Convolution res3a_branch2a 1 1 res2c_res2c_relu_splitncnn_0 res3a_branch2a 0=128 1=1 2=1 3=2 4=0 5=0 6=32768
BatchNorm bn3a_branch2a 1 1 res3a_branch2a res3a_branch2a_bn3a_branch2a 0=128
Scale scale3a_branch2a 1 1 res3a_branch2a_bn3a_branch2a res3a_branch2a_scale3a_branch2a 0=128 1=1
ReLU res3a_branch2a_relu 1 1 res3a_branch2a_scale3a_branch2a res3a_branch2a_res3a_branch2a_relu
Convolution res3a_branch2b 1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456
BatchNorm bn3a_branch2b 1 1 res3a_branch2b res3a_branch2b_bn3a_branch2b 0=128
Scale scale3a_branch2b 1 1 res3a_branch2b_bn3a_branch2b res3a_branch2b_scale3a_branch2b 0=128 1=1
ReLU res3a_branch2b_relu 1 1 res3a_branch2b_scale3a_branch2b res3a_branch2b_res3a_branch2b_relu
Convolution res3a_branch2c 1 1 res3a_branch2b_res3a_branch2b_relu res3a_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536
BatchNorm bn3a_branch2c 1 1 res3a_branch2c res3a_branch2c_bn3a_branch2c 0=512
Scale scale3a_branch2c 1 1 res3a_branch2c_bn3a_branch2c res3a_branch2c_scale3a_branch2c 0=512 1=1
Eltwise res3a 2 1 res3a_branch1_scale3a_branch1 res3a_branch2c_scale3a_branch2c res3a 0=1 -23301=0
ReLU res3a_relu 1 1 res3a res3a_res3a_relu
Split splitncnn_4 1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1
Convolution res3b_branch2a 1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536
BatchNorm bn3b_branch2a 1 1 res3b_branch2a res3b_branch2a_bn3b_branch2a 0=128
Scale scale3b_branch2a 1 1 res3b_branch2a_bn3b_branch2a res3b_branch2a_scale3b_branch2a 0=128 1=1
ReLU res3b_branch2a_relu 1 1 res3b_branch2a_scale3b_branch2a res3b_branch2a_res3b_branch2a_relu
Convolution res3b_branch2b 1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456
BatchNorm bn3b_branch2b 1 1 res3b_branch2b res3b_branch2b_bn3b_branch2b 0=128
Scale scale3b_branch2b 1 1 res3b_branch2b_bn3b_branch2b res3b_branch2b_scale3b_branch2b 0=128 1=1
ReLU res3b_branch2b_relu 1 1 res3b_branch2b_scale3b_branch2b res3b_branch2b_res3b_branch2b_relu
Convolution res3b_branch2c 1 1 res3b_branch2b_res3b_branch2b_relu res3b_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536
BatchNorm bn3b_branch2c 1 1 res3b_branch2c res3b_branch2c_bn3b_branch2c 0=512
Scale scale3b_branch2c 1 1 res3b_branch2c_bn3b_branch2c res3b_branch2c_scale3b_branch2c 0=512 1=1
Eltwise res3b 2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2c_scale3b_branch2c res3b 0=1 -23301=0
ReLU res3b_relu 1 1 res3b res3b_res3b_relu
Split splitncnn_5 1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1
Convolution res3c_branch2a 1 1 res3b_res3b_relu_splitncnn_1 res3c_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536
BatchNorm bn3c_branch2a 1 1 res3c_branch2a res3c_branch2a_bn3c_branch2a 0=128
Scale scale3c_branch2a 1 1 res3c_branch2a_bn3c_branch2a res3c_branch2a_scale3c_branch2a 0=128 1=1
ReLU res3c_branch2a_relu 1 1 res3c_branch2a_scale3c_branch2a res3c_branch2a_res3c_branch2a_relu
Convolution res3c_branch2b 1 1 res3c_branch2a_res3c_branch2a_relu res3c_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456
BatchNorm bn3c_branch2b 1 1 res3c_branch2b res3c_branch2b_bn3c_branch2b 0=128
Scale scale3c_branch2b 1 1 res3c_branch2b_bn3c_branch2b res3c_branch2b_scale3c_branch2b 0=128 1=1
ReLU res3c_branch2b_relu 1 1 res3c_branch2b_scale3c_branch2b res3c_branch2b_res3c_branch2b_relu
Convolution res3c_branch2c 1 1 res3c_branch2b_res3c_branch2b_relu res3c_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536
BatchNorm bn3c_branch2c 1 1 res3c_branch2c res3c_branch2c_bn3c_branch2c 0=512
Scale scale3c_branch2c 1 1 res3c_branch2c_bn3c_branch2c res3c_branch2c_scale3c_branch2c 0=512 1=1
Eltwise res3c 2 1 res3b_res3b_relu_splitncnn_0 res3c_branch2c_scale3c_branch2c res3c 0=1 -23301=0
ReLU res3c_relu 1 1 res3c res3c_res3c_relu
Split splitncnn_6 1 2 res3c_res3c_relu res3c_res3c_relu_splitncnn_0 res3c_res3c_relu_splitncnn_1
Convolution res3d_branch2a 1 1 res3c_res3c_relu_splitncnn_1 res3d_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536
BatchNorm bn3d_branch2a 1 1 res3d_branch2a res3d_branch2a_bn3d_branch2a 0=128
Scale scale3d_branch2a 1 1 res3d_branch2a_bn3d_branch2a res3d_branch2a_scale3d_branch2a 0=128 1=1
ReLU res3d_branch2a_relu 1 1 res3d_branch2a_scale3d_branch2a res3d_branch2a_res3d_branch2a_relu
Convolution res3d_branch2b 1 1 res3d_branch2a_res3d_branch2a_relu res3d_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456
BatchNorm bn3d_branch2b 1 1 res3d_branch2b res3d_branch2b_bn3d_branch2b 0=128
Scale scale3d_branch2b 1 1 res3d_branch2b_bn3d_branch2b res3d_branch2b_scale3d_branch2b 0=128 1=1
ReLU res3d_branch2b_relu 1 1 res3d_branch2b_scale3d_branch2b res3d_branch2b_res3d_branch2b_relu
Convolution res3d_branch2c 1 1 res3d_branch2b_res3d_branch2b_relu res3d_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536
BatchNorm bn3d_branch2c 1 1 res3d_branch2c res3d_branch2c_bn3d_branch2c 0=512
Scale scale3d_branch2c 1 1 res3d_branch2c_bn3d_branch2c res3d_branch2c_scale3d_branch2c 0=512 1=1
Eltwise res3d 2 1 res3c_res3c_relu_splitncnn_0 res3d_branch2c_scale3d_branch2c res3d 0=1 -23301=0
ReLU res3d_relu 1 1 res3d res3d_res3d_relu
Split splitncnn_7 1 2 res3d_res3d_relu res3d_res3d_relu_splitncnn_0 res3d_res3d_relu_splitncnn_1
Convolution res4a_branch1 1 1 res3d_res3d_relu_splitncnn_1 res4a_branch1 0=1024 1=1 2=1 3=2 4=0 5=0 6=524288
BatchNorm bn4a_branch1 1 1 res4a_branch1 res4a_branch1_bn4a_branch1 0=1024
Scale scale4a_branch1 1 1 res4a_branch1_bn4a_branch1 res4a_branch1_scale4a_branch1 0=1024 1=1
Convolution res4a_branch2a 1 1 res3d_res3d_relu_splitncnn_0 res4a_branch2a 0=256 1=1 2=1 3=2 4=0 5=0 6=131072
BatchNorm bn4a_branch2a 1 1 res4a_branch2a res4a_branch2a_bn4a_branch2a 0=256
Scale scale4a_branch2a 1 1 res4a_branch2a_bn4a_branch2a res4a_branch2a_scale4a_branch2a 0=256 1=1
ReLU res4a_branch2a_relu 1 1 res4a_branch2a_scale4a_branch2a res4a_branch2a_res4a_branch2a_relu
Convolution res4a_branch2b 1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824
BatchNorm bn4a_branch2b 1 1 res4a_branch2b res4a_branch2b_bn4a_branch2b 0=256
Scale scale4a_branch2b 1 1 res4a_branch2b_bn4a_branch2b res4a_branch2b_scale4a_branch2b 0=256 1=1
ReLU res4a_branch2b_relu 1 1 res4a_branch2b_scale4a_branch2b res4a_branch2b_res4a_branch2b_relu
Convolution res4a_branch2c 1 1 res4a_branch2b_res4a_branch2b_relu res4a_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144
BatchNorm bn4a_branch2c 1 1 res4a_branch2c res4a_branch2c_bn4a_branch2c 0=1024
Scale scale4a_branch2c 1 1 res4a_branch2c_bn4a_branch2c res4a_branch2c_scale4a_branch2c 0=1024 1=1
Eltwise res4a 2 1 res4a_branch1_scale4a_branch1 res4a_branch2c_scale4a_branch2c res4a 0=1 -23301=0
ReLU res4a_relu 1 1 res4a res4a_res4a_relu
Split splitncnn_8 1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1
Convolution res4b_branch2a 1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144
BatchNorm bn4b_branch2a 1 1 res4b_branch2a res4b_branch2a_bn4b_branch2a 0=256
Scale scale4b_branch2a 1 1 res4b_branch2a_bn4b_branch2a res4b_branch2a_scale4b_branch2a 0=256 1=1
ReLU res4b_branch2a_relu 1 1 res4b_branch2a_scale4b_branch2a res4b_branch2a_res4b_branch2a_relu
Convolution res4b_branch2b 1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824
BatchNorm bn4b_branch2b 1 1 res4b_branch2b res4b_branch2b_bn4b_branch2b 0=256
Scale scale4b_branch2b 1 1 res4b_branch2b_bn4b_branch2b res4b_branch2b_scale4b_branch2b 0=256 1=1
ReLU res4b_branch2b_relu 1 1 res4b_branch2b_scale4b_branch2b res4b_branch2b_res4b_branch2b_relu
Convolution res4b_branch2c 1 1 res4b_branch2b_res4b_branch2b_relu res4b_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144
BatchNorm bn4b_branch2c 1 1 res4b_branch2c res4b_branch2c_bn4b_branch2c 0=1024
Scale scale4b_branch2c 1 1 res4b_branch2c_bn4b_branch2c res4b_branch2c_scale4b_branch2c 0=1024 1=1
Eltwise res4b 2 1 res4a_res4a_relu_splitncnn_0 res4b_branch2c_scale4b_branch2c res4b 0=1 -23301=0
ReLU res4b_relu 1 1 res4b res4b_res4b_relu
Split splitncnn_9 1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1
Convolution res4c_branch2a 1 1 res4b_res4b_relu_splitncnn_1 res4c_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144
BatchNorm bn4c_branch2a 1 1 res4c_branch2a res4c_branch2a_bn4c_branch2a 0=256
Scale scale4c_branch2a 1 1 res4c_branch2a_bn4c_branch2a res4c_branch2a_scale4c_branch2a 0=256 1=1
ReLU res4c_branch2a_relu 1 1 res4c_branch2a_scale4c_branch2a res4c_branch2a_res4c_branch2a_relu
Convolution res4c_branch2b 1 1 res4c_branch2a_res4c_branch2a_relu res4c_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824
BatchNorm bn4c_branch2b 1 1 res4c_branch2b res4c_branch2b_bn4c_branch2b 0=256
Scale scale4c_branch2b 1 1 res4c_branch2b_bn4c_branch2b res4c_branch2b_scale4c_branch2b 0=256 1=1
ReLU res4c_branch2b_relu 1 1 res4c_branch2b_scale4c_branch2b res4c_branch2b_res4c_branch2b_relu
Convolution res4c_branch2c 1 1 res4c_branch2b_res4c_branch2b_relu res4c_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144
BatchNorm bn4c_branch2c 1 1 res4c_branch2c res4c_branch2c_bn4c_branch2c 0=1024
Scale scale4c_branch2c 1 1 res4c_branch2c_bn4c_branch2c res4c_branch2c_scale4c_branch2c 0=1024 1=1
Eltwise res4c 2 1 res4b_res4b_relu_splitncnn_0 res4c_branch2c_scale4c_branch2c res4c 0=1 -23301=0
ReLU res4c_relu 1 1 res4c res4c_res4c_relu
Split splitncnn_10 1 2 res4c_res4c_relu res4c_res4c_relu_splitncnn_0 res4c_res4c_relu_splitncnn_1
Convolution res4d_branch2a 1 1 res4c_res4c_relu_splitncnn_1 res4d_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144
BatchNorm bn4d_branch2a 1 1 res4d_branch2a res4d_branch2a_bn4d_branch2a 0=256
Scale scale4d_branch2a 1 1 res4d_branch2a_bn4d_branch2a res4d_branch2a_scale4d_branch2a 0=256 1=1
ReLU res4d_branch2a_relu 1 1 res4d_branch2a_scale4d_branch2a res4d_branch2a_res4d_branch2a_relu
Convolution res4d_branch2b 1 1 res4d_branch2a_res4d_branch2a_relu res4d_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824
BatchNorm bn4d_branch2b 1 1 res4d_branch2b res4d_branch2b_bn4d_branch2b 0=256
Scale scale4d_branch2b 1 1 res4d_branch2b_bn4d_branch2b res4d_branch2b_scale4d_branch2b 0=256 1=1
ReLU res4d_branch2b_relu 1 1 res4d_branch2b_scale4d_branch2b res4d_branch2b_res4d_branch2b_relu
Convolution res4d_branch2c 1 1 res4d_branch2b_res4d_branch2b_relu res4d_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144
BatchNorm bn4d_branch2c 1 1 res4d_branch2c res4d_branch2c_bn4d_branch2c 0=1024
Scale scale4d_branch2c 1 1 res4d_branch2c_bn4d_branch2c res4d_branch2c_scale4d_branch2c 0=1024 1=1
Eltwise res4d 2 1 res4c_res4c_relu_splitncnn_0 res4d_branch2c_scale4d_branch2c res4d 0=1 -23301=0
ReLU res4d_relu 1 1 res4d res4d_res4d_relu
Split splitncnn_11 1 2 res4d_res4d_relu res4d_res4d_relu_splitncnn_0 res4d_res4d_relu_splitncnn_1
Convolution res4e_branch2a 1 1 res4d_res4d_relu_splitncnn_1 res4e_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144
BatchNorm bn4e_branch2a 1 1 res4e_branch2a res4e_branch2a_bn4e_branch2a 0=256
Scale scale4e_branch2a 1 1 res4e_branch2a_bn4e_branch2a res4e_branch2a_scale4e_branch2a 0=256 1=1
ReLU res4e_branch2a_relu 1 1 res4e_branch2a_scale4e_branch2a res4e_branch2a_res4e_branch2a_relu
Convolution res4e_branch2b 1 1 res4e_branch2a_res4e_branch2a_relu res4e_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824
BatchNorm bn4e_branch2b 1 1 res4e_branch2b res4e_branch2b_bn4e_branch2b 0=256
Scale scale4e_branch2b 1 1 res4e_branch2b_bn4e_branch2b res4e_branch2b_scale4e_branch2b 0=256 1=1
ReLU res4e_branch2b_relu 1 1 res4e_branch2b_scale4e_branch2b res4e_branch2b_res4e_branch2b_relu
Convolution res4e_branch2c 1 1 res4e_branch2b_res4e_branch2b_relu res4e_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144
BatchNorm bn4e_branch2c 1 1 res4e_branch2c res4e_branch2c_bn4e_branch2c 0=1024
Scale scale4e_branch2c 1 1 res4e_branch2c_bn4e_branch2c res4e_branch2c_scale4e_branch2c 0=1024 1=1
Eltwise res4e 2 1 res4d_res4d_relu_splitncnn_0 res4e_branch2c_scale4e_branch2c res4e 0=1 -23301=0
ReLU res4e_relu 1 1 res4e res4e_res4e_relu
Split splitncnn_12 1 2 res4e_res4e_relu res4e_res4e_relu_splitncnn_0 res4e_res4e_relu_splitncnn_1
Convolution res4f_branch2a 1 1 res4e_res4e_relu_splitncnn_1 res4f_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144
BatchNorm bn4f_branch2a 1 1 res4f_branch2a res4f_branch2a_bn4f_branch2a 0=256
Scale scale4f_branch2a 1 1 res4f_branch2a_bn4f_branch2a res4f_branch2a_scale4f_branch2a 0=256 1=1
ReLU res4f_branch2a_relu 1 1 res4f_branch2a_scale4f_branch2a res4f_branch2a_res4f_branch2a_relu
Convolution res4f_branch2b 1 1 res4f_branch2a_res4f_branch2a_relu res4f_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824
BatchNorm bn4f_branch2b 1 1 res4f_branch2b res4f_branch2b_bn4f_branch2b 0=256
Scale scale4f_branch2b 1 1 res4f_branch2b_bn4f_branch2b res4f_branch2b_scale4f_branch2b 0=256 1=1
ReLU res4f_branch2b_relu 1 1 res4f_branch2b_scale4f_branch2b res4f_branch2b_res4f_branch2b_relu
Convolution res4f_branch2c 1 1 res4f_branch2b_res4f_branch2b_relu res4f_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144
BatchNorm bn4f_branch2c 1 1 res4f_branch2c res4f_branch2c_bn4f_branch2c 0=1024
Scale scale4f_branch2c 1 1 res4f_branch2c_bn4f_branch2c res4f_branch2c_scale4f_branch2c 0=1024 1=1
Eltwise res4f 2 1 res4e_res4e_relu_splitncnn_0 res4f_branch2c_scale4f_branch2c res4f 0=1 -23301=0
ReLU res4f_relu 1 1 res4f res4f_res4f_relu
Split splitncnn_13 1 2 res4f_res4f_relu res4f_res4f_relu_splitncnn_0 res4f_res4f_relu_splitncnn_1
Convolution res5a_branch1 1 1 res4f_res4f_relu_splitncnn_1 res5a_branch1 0=2048 1=1 2=1 3=2 4=0 5=0 6=2097152
BatchNorm bn5a_branch1 1 1 res5a_branch1 res5a_branch1_bn5a_branch1 0=2048
Scale scale5a_branch1 1 1 res5a_branch1_bn5a_branch1 res5a_branch1_scale5a_branch1 0=2048 1=1
Convolution res5a_branch2a 1 1 res4f_res4f_relu_splitncnn_0 res5a_branch2a 0=512 1=1 2=1 3=2 4=0 5=0 6=524288
BatchNorm bn5a_branch2a 1 1 res5a_branch2a res5a_branch2a_bn5a_branch2a 0=512
Scale scale5a_branch2a 1 1 res5a_branch2a_bn5a_branch2a res5a_branch2a_scale5a_branch2a 0=512 1=1
ReLU res5a_branch2a_relu 1 1 res5a_branch2a_scale5a_branch2a res5a_branch2a_res5a_branch2a_relu
Convolution res5a_branch2b 1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296
BatchNorm bn5a_branch2b 1 1 res5a_branch2b res5a_branch2b_bn5a_branch2b 0=512
Scale scale5a_branch2b 1 1 res5a_branch2b_bn5a_branch2b res5a_branch2b_scale5a_branch2b 0=512 1=1
ReLU res5a_branch2b_relu 1 1 res5a_branch2b_scale5a_branch2b res5a_branch2b_res5a_branch2b_relu
Convolution res5a_branch2c 1 1 res5a_branch2b_res5a_branch2b_relu res5a_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576
BatchNorm bn5a_branch2c 1 1 res5a_branch2c res5a_branch2c_bn5a_branch2c 0=2048
Scale scale5a_branch2c 1 1 res5a_branch2c_bn5a_branch2c res5a_branch2c_scale5a_branch2c 0=2048 1=1
Eltwise res5a 2 1 res5a_branch1_scale5a_branch1 res5a_branch2c_scale5a_branch2c res5a 0=1 -23301=0
ReLU res5a_relu 1 1 res5a res5a_res5a_relu
Split splitncnn_14 1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1
Convolution res5b_branch2a 1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a 0=512 1=1 2=1 3=1 4=0 5=0 6=1048576
BatchNorm bn5b_branch2a 1 1 res5b_branch2a res5b_branch2a_bn5b_branch2a 0=512
Scale scale5b_branch2a 1 1 res5b_branch2a_bn5b_branch2a res5b_branch2a_scale5b_branch2a 0=512 1=1
ReLU res5b_branch2a_relu 1 1 res5b_branch2a_scale5b_branch2a res5b_branch2a_res5b_branch2a_relu
Convolution res5b_branch2b 1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296
BatchNorm bn5b_branch2b 1 1 res5b_branch2b res5b_branch2b_bn5b_branch2b 0=512
Scale scale5b_branch2b 1 1 res5b_branch2b_bn5b_branch2b res5b_branch2b_scale5b_branch2b 0=512 1=1
ReLU res5b_branch2b_relu 1 1 res5b_branch2b_scale5b_branch2b res5b_branch2b_res5b_branch2b_relu
Convolution res5b_branch2c 1 1 res5b_branch2b_res5b_branch2b_relu res5b_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576
BatchNorm bn5b_branch2c 1 1 res5b_branch2c res5b_branch2c_bn5b_branch2c 0=2048
Scale scale5b_branch2c 1 1 res5b_branch2c_bn5b_branch2c res5b_branch2c_scale5b_branch2c 0=2048 1=1
Eltwise res5b 2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2c_scale5b_branch2c res5b 0=1 -23301=0
ReLU res5b_relu 1 1 res5b res5b_res5b_relu
Split splitncnn_15 1 2 res5b_res5b_relu res5b_res5b_relu_splitncnn_0 res5b_res5b_relu_splitncnn_1
Convolution res5c_branch2a 1 1 res5b_res5b_relu_splitncnn_1 res5c_branch2a 0=512 1=1 2=1 3=1 4=0 5=0 6=1048576
BatchNorm bn5c_branch2a 1 1 res5c_branch2a res5c_branch2a_bn5c_branch2a 0=512
Scale scale5c_branch2a 1 1 res5c_branch2a_bn5c_branch2a res5c_branch2a_scale5c_branch2a 0=512 1=1
ReLU res5c_branch2a_relu 1 1 res5c_branch2a_scale5c_branch2a res5c_branch2a_res5c_branch2a_relu
Convolution res5c_branch2b 1 1 res5c_branch2a_res5c_branch2a_relu res5c_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296
BatchNorm bn5c_branch2b 1 1 res5c_branch2b res5c_branch2b_bn5c_branch2b 0=512
Scale scale5c_branch2b 1 1 res5c_branch2b_bn5c_branch2b res5c_branch2b_scale5c_branch2b 0=512 1=1
ReLU res5c_branch2b_relu 1 1 res5c_branch2b_scale5c_branch2b res5c_branch2b_res5c_branch2b_relu
Convolution res5c_branch2c 1 1 res5c_branch2b_res5c_branch2b_relu res5c_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576
BatchNorm bn5c_branch2c 1 1 res5c_branch2c res5c_branch2c_bn5c_branch2c 0=2048
Scale scale5c_branch2c 1 1 res5c_branch2c_bn5c_branch2c res5c_branch2c_scale5c_branch2c 0=2048 1=1
Eltwise res5c 2 1 res5b_res5b_relu_splitncnn_0 res5c_branch2c_scale5c_branch2c res5c 0=1 -23301=0
ReLU res5c_relu 1 1 res5c res5c_res5c_relu
Pooling pool5 1 1 res5c_res5c_relu pool5 0=1 1=7 2=1 3=0 4=0
InnerProduct fc1000 1 1 pool5 fc1000 0=1000 1=1 2=2048000
Softmax prob 1 1 fc1000 prob 0=0

+ 247
- 0
benchmark/resnet50_int8.param View File

@@ -0,0 +1,247 @@
7767517
245 261
Input data 0 1 data 0=224 1=224 2=3
Convolution conv1 1 1 data conv1 0=64 1=7 2=1 3=2 4=3 5=1 6=9408 8=2
BatchNorm bn_conv1 1 1 conv1 conv1_bn_conv1 0=64
Scale scale_conv1 1 1 conv1_bn_conv1 conv1_scale_conv1 0=64 1=1
ReLU conv1_relu 1 1 conv1_scale_conv1 conv1_conv1_relu
Pooling pool1 1 1 conv1_conv1_relu pool1 0=0 1=3 2=2 3=0 4=0
Split splitncnn_0 1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1
Convolution res2a_branch1 1 1 pool1_splitncnn_1 res2a_branch1 0=256 1=1 2=1 3=1 4=0 5=0 6=16384 8=2
BatchNorm bn2a_branch1 1 1 res2a_branch1 res2a_branch1_bn2a_branch1 0=256
Scale scale2a_branch1 1 1 res2a_branch1_bn2a_branch1 res2a_branch1_scale2a_branch1 0=256 1=1
Convolution res2a_branch2a 1 1 pool1_splitncnn_0 res2a_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=4096 8=2
BatchNorm bn2a_branch2a 1 1 res2a_branch2a res2a_branch2a_bn2a_branch2a 0=64
Scale scale2a_branch2a 1 1 res2a_branch2a_bn2a_branch2a res2a_branch2a_scale2a_branch2a 0=64 1=1
ReLU res2a_branch2a_relu 1 1 res2a_branch2a_scale2a_branch2a res2a_branch2a_res2a_branch2a_relu
Convolution res2a_branch2b 1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2
BatchNorm bn2a_branch2b 1 1 res2a_branch2b res2a_branch2b_bn2a_branch2b 0=64
Scale scale2a_branch2b 1 1 res2a_branch2b_bn2a_branch2b res2a_branch2b_scale2a_branch2b 0=64 1=1
ReLU res2a_branch2b_relu 1 1 res2a_branch2b_scale2a_branch2b res2a_branch2b_res2a_branch2b_relu
Convolution res2a_branch2c 1 1 res2a_branch2b_res2a_branch2b_relu res2a_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384 8=2
BatchNorm bn2a_branch2c 1 1 res2a_branch2c res2a_branch2c_bn2a_branch2c 0=256
Scale scale2a_branch2c 1 1 res2a_branch2c_bn2a_branch2c res2a_branch2c_scale2a_branch2c 0=256 1=1
Eltwise res2a 2 1 res2a_branch1_scale2a_branch1 res2a_branch2c_scale2a_branch2c res2a 0=1 -23301=0
ReLU res2a_relu 1 1 res2a res2a_res2a_relu
Split splitncnn_1 1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1
Convolution res2b_branch2a 1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=16384 8=2
BatchNorm bn2b_branch2a 1 1 res2b_branch2a res2b_branch2a_bn2b_branch2a 0=64
Scale scale2b_branch2a 1 1 res2b_branch2a_bn2b_branch2a res2b_branch2a_scale2b_branch2a 0=64 1=1
ReLU res2b_branch2a_relu 1 1 res2b_branch2a_scale2b_branch2a res2b_branch2a_res2b_branch2a_relu
Convolution res2b_branch2b 1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2
BatchNorm bn2b_branch2b 1 1 res2b_branch2b res2b_branch2b_bn2b_branch2b 0=64
Scale scale2b_branch2b 1 1 res2b_branch2b_bn2b_branch2b res2b_branch2b_scale2b_branch2b 0=64 1=1
ReLU res2b_branch2b_relu 1 1 res2b_branch2b_scale2b_branch2b res2b_branch2b_res2b_branch2b_relu
Convolution res2b_branch2c 1 1 res2b_branch2b_res2b_branch2b_relu res2b_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384 8=2
BatchNorm bn2b_branch2c 1 1 res2b_branch2c res2b_branch2c_bn2b_branch2c 0=256
Scale scale2b_branch2c 1 1 res2b_branch2c_bn2b_branch2c res2b_branch2c_scale2b_branch2c 0=256 1=1
Eltwise res2b 2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2c_scale2b_branch2c res2b 0=1 -23301=0
ReLU res2b_relu 1 1 res2b res2b_res2b_relu
Split splitncnn_2 1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1
Convolution res2c_branch2a 1 1 res2b_res2b_relu_splitncnn_1 res2c_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=16384 8=2
BatchNorm bn2c_branch2a 1 1 res2c_branch2a res2c_branch2a_bn2c_branch2a 0=64
Scale scale2c_branch2a 1 1 res2c_branch2a_bn2c_branch2a res2c_branch2a_scale2c_branch2a 0=64 1=1
ReLU res2c_branch2a_relu 1 1 res2c_branch2a_scale2c_branch2a res2c_branch2a_res2c_branch2a_relu
Convolution res2c_branch2b 1 1 res2c_branch2a_res2c_branch2a_relu res2c_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2
BatchNorm bn2c_branch2b 1 1 res2c_branch2b res2c_branch2b_bn2c_branch2b 0=64
Scale scale2c_branch2b 1 1 res2c_branch2b_bn2c_branch2b res2c_branch2b_scale2c_branch2b 0=64 1=1
ReLU res2c_branch2b_relu 1 1 res2c_branch2b_scale2c_branch2b res2c_branch2b_res2c_branch2b_relu
Convolution res2c_branch2c 1 1 res2c_branch2b_res2c_branch2b_relu res2c_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384 8=2
BatchNorm bn2c_branch2c 1 1 res2c_branch2c res2c_branch2c_bn2c_branch2c 0=256
Scale scale2c_branch2c 1 1 res2c_branch2c_bn2c_branch2c res2c_branch2c_scale2c_branch2c 0=256 1=1
Eltwise res2c 2 1 res2b_res2b_relu_splitncnn_0 res2c_branch2c_scale2c_branch2c res2c 0=1 -23301=0
ReLU res2c_relu 1 1 res2c res2c_res2c_relu
Split splitncnn_3 1 2 res2c_res2c_relu res2c_res2c_relu_splitncnn_0 res2c_res2c_relu_splitncnn_1
Convolution res3a_branch1 1 1 res2c_res2c_relu_splitncnn_1 res3a_branch1 0=512 1=1 2=1 3=2 4=0 5=0 6=131072 8=2
BatchNorm bn3a_branch1 1 1 res3a_branch1 res3a_branch1_bn3a_branch1 0=512
Scale scale3a_branch1 1 1 res3a_branch1_bn3a_branch1 res3a_branch1_scale3a_branch1 0=512 1=1
Convolution res3a_branch2a 1 1 res2c_res2c_relu_splitncnn_0 res3a_branch2a 0=128 1=1 2=1 3=2 4=0 5=0 6=32768 8=2
BatchNorm bn3a_branch2a 1 1 res3a_branch2a res3a_branch2a_bn3a_branch2a 0=128
Scale scale3a_branch2a 1 1 res3a_branch2a_bn3a_branch2a res3a_branch2a_scale3a_branch2a 0=128 1=1
ReLU res3a_branch2a_relu 1 1 res3a_branch2a_scale3a_branch2a res3a_branch2a_res3a_branch2a_relu
Convolution res3a_branch2b 1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2
BatchNorm bn3a_branch2b 1 1 res3a_branch2b res3a_branch2b_bn3a_branch2b 0=128
Scale scale3a_branch2b 1 1 res3a_branch2b_bn3a_branch2b res3a_branch2b_scale3a_branch2b 0=128 1=1
ReLU res3a_branch2b_relu 1 1 res3a_branch2b_scale3a_branch2b res3a_branch2b_res3a_branch2b_relu
Convolution res3a_branch2c 1 1 res3a_branch2b_res3a_branch2b_relu res3a_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536 8=2
BatchNorm bn3a_branch2c 1 1 res3a_branch2c res3a_branch2c_bn3a_branch2c 0=512
Scale scale3a_branch2c 1 1 res3a_branch2c_bn3a_branch2c res3a_branch2c_scale3a_branch2c 0=512 1=1
Eltwise res3a 2 1 res3a_branch1_scale3a_branch1 res3a_branch2c_scale3a_branch2c res3a 0=1 -23301=0
ReLU res3a_relu 1 1 res3a res3a_res3a_relu
Split splitncnn_4 1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1
Convolution res3b_branch2a 1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536 8=2
BatchNorm bn3b_branch2a 1 1 res3b_branch2a res3b_branch2a_bn3b_branch2a 0=128
Scale scale3b_branch2a 1 1 res3b_branch2a_bn3b_branch2a res3b_branch2a_scale3b_branch2a 0=128 1=1
ReLU res3b_branch2a_relu 1 1 res3b_branch2a_scale3b_branch2a res3b_branch2a_res3b_branch2a_relu
Convolution res3b_branch2b 1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2
BatchNorm bn3b_branch2b 1 1 res3b_branch2b res3b_branch2b_bn3b_branch2b 0=128
Scale scale3b_branch2b 1 1 res3b_branch2b_bn3b_branch2b res3b_branch2b_scale3b_branch2b 0=128 1=1
ReLU res3b_branch2b_relu 1 1 res3b_branch2b_scale3b_branch2b res3b_branch2b_res3b_branch2b_relu
Convolution res3b_branch2c 1 1 res3b_branch2b_res3b_branch2b_relu res3b_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536 8=2
BatchNorm bn3b_branch2c 1 1 res3b_branch2c res3b_branch2c_bn3b_branch2c 0=512
Scale scale3b_branch2c 1 1 res3b_branch2c_bn3b_branch2c res3b_branch2c_scale3b_branch2c 0=512 1=1
Eltwise res3b 2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2c_scale3b_branch2c res3b 0=1 -23301=0
ReLU res3b_relu 1 1 res3b res3b_res3b_relu
Split splitncnn_5 1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1
Convolution res3c_branch2a 1 1 res3b_res3b_relu_splitncnn_1 res3c_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536 8=2
BatchNorm bn3c_branch2a 1 1 res3c_branch2a res3c_branch2a_bn3c_branch2a 0=128
Scale scale3c_branch2a 1 1 res3c_branch2a_bn3c_branch2a res3c_branch2a_scale3c_branch2a 0=128 1=1
ReLU res3c_branch2a_relu 1 1 res3c_branch2a_scale3c_branch2a res3c_branch2a_res3c_branch2a_relu
Convolution res3c_branch2b 1 1 res3c_branch2a_res3c_branch2a_relu res3c_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2
BatchNorm bn3c_branch2b 1 1 res3c_branch2b res3c_branch2b_bn3c_branch2b 0=128
Scale scale3c_branch2b 1 1 res3c_branch2b_bn3c_branch2b res3c_branch2b_scale3c_branch2b 0=128 1=1
ReLU res3c_branch2b_relu 1 1 res3c_branch2b_scale3c_branch2b res3c_branch2b_res3c_branch2b_relu
Convolution res3c_branch2c 1 1 res3c_branch2b_res3c_branch2b_relu res3c_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536 8=2
BatchNorm bn3c_branch2c 1 1 res3c_branch2c res3c_branch2c_bn3c_branch2c 0=512
Scale scale3c_branch2c 1 1 res3c_branch2c_bn3c_branch2c res3c_branch2c_scale3c_branch2c 0=512 1=1
Eltwise res3c 2 1 res3b_res3b_relu_splitncnn_0 res3c_branch2c_scale3c_branch2c res3c 0=1 -23301=0
ReLU res3c_relu 1 1 res3c res3c_res3c_relu
Split splitncnn_6 1 2 res3c_res3c_relu res3c_res3c_relu_splitncnn_0 res3c_res3c_relu_splitncnn_1
Convolution res3d_branch2a 1 1 res3c_res3c_relu_splitncnn_1 res3d_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536 8=2
BatchNorm bn3d_branch2a 1 1 res3d_branch2a res3d_branch2a_bn3d_branch2a 0=128
Scale scale3d_branch2a 1 1 res3d_branch2a_bn3d_branch2a res3d_branch2a_scale3d_branch2a 0=128 1=1
ReLU res3d_branch2a_relu 1 1 res3d_branch2a_scale3d_branch2a res3d_branch2a_res3d_branch2a_relu
Convolution res3d_branch2b 1 1 res3d_branch2a_res3d_branch2a_relu res3d_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2
BatchNorm bn3d_branch2b 1 1 res3d_branch2b res3d_branch2b_bn3d_branch2b 0=128
Scale scale3d_branch2b 1 1 res3d_branch2b_bn3d_branch2b res3d_branch2b_scale3d_branch2b 0=128 1=1
ReLU res3d_branch2b_relu 1 1 res3d_branch2b_scale3d_branch2b res3d_branch2b_res3d_branch2b_relu
Convolution res3d_branch2c 1 1 res3d_branch2b_res3d_branch2b_relu res3d_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536 8=2
BatchNorm bn3d_branch2c 1 1 res3d_branch2c res3d_branch2c_bn3d_branch2c 0=512
Scale scale3d_branch2c 1 1 res3d_branch2c_bn3d_branch2c res3d_branch2c_scale3d_branch2c 0=512 1=1
Eltwise res3d 2 1 res3c_res3c_relu_splitncnn_0 res3d_branch2c_scale3d_branch2c res3d 0=1 -23301=0
ReLU res3d_relu 1 1 res3d res3d_res3d_relu
Split splitncnn_7 1 2 res3d_res3d_relu res3d_res3d_relu_splitncnn_0 res3d_res3d_relu_splitncnn_1
Convolution res4a_branch1 1 1 res3d_res3d_relu_splitncnn_1 res4a_branch1 0=1024 1=1 2=1 3=2 4=0 5=0 6=524288 8=2
BatchNorm bn4a_branch1 1 1 res4a_branch1 res4a_branch1_bn4a_branch1 0=1024
Scale scale4a_branch1 1 1 res4a_branch1_bn4a_branch1 res4a_branch1_scale4a_branch1 0=1024 1=1
Convolution res4a_branch2a 1 1 res3d_res3d_relu_splitncnn_0 res4a_branch2a 0=256 1=1 2=1 3=2 4=0 5=0 6=131072 8=2
BatchNorm bn4a_branch2a 1 1 res4a_branch2a res4a_branch2a_bn4a_branch2a 0=256
Scale scale4a_branch2a 1 1 res4a_branch2a_bn4a_branch2a res4a_branch2a_scale4a_branch2a 0=256 1=1
ReLU res4a_branch2a_relu 1 1 res4a_branch2a_scale4a_branch2a res4a_branch2a_res4a_branch2a_relu
Convolution res4a_branch2b 1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2
BatchNorm bn4a_branch2b 1 1 res4a_branch2b res4a_branch2b_bn4a_branch2b 0=256
Scale scale4a_branch2b 1 1 res4a_branch2b_bn4a_branch2b res4a_branch2b_scale4a_branch2b 0=256 1=1
ReLU res4a_branch2b_relu 1 1 res4a_branch2b_scale4a_branch2b res4a_branch2b_res4a_branch2b_relu
Convolution res4a_branch2c 1 1 res4a_branch2b_res4a_branch2b_relu res4a_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
BatchNorm bn4a_branch2c 1 1 res4a_branch2c res4a_branch2c_bn4a_branch2c 0=1024
Scale scale4a_branch2c 1 1 res4a_branch2c_bn4a_branch2c res4a_branch2c_scale4a_branch2c 0=1024 1=1
Eltwise res4a 2 1 res4a_branch1_scale4a_branch1 res4a_branch2c_scale4a_branch2c res4a 0=1 -23301=0
ReLU res4a_relu 1 1 res4a res4a_res4a_relu
Split splitncnn_8 1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1
Convolution res4b_branch2a 1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
BatchNorm bn4b_branch2a 1 1 res4b_branch2a res4b_branch2a_bn4b_branch2a 0=256
Scale scale4b_branch2a 1 1 res4b_branch2a_bn4b_branch2a res4b_branch2a_scale4b_branch2a 0=256 1=1
ReLU res4b_branch2a_relu 1 1 res4b_branch2a_scale4b_branch2a res4b_branch2a_res4b_branch2a_relu
Convolution res4b_branch2b 1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2
BatchNorm bn4b_branch2b 1 1 res4b_branch2b res4b_branch2b_bn4b_branch2b 0=256
Scale scale4b_branch2b 1 1 res4b_branch2b_bn4b_branch2b res4b_branch2b_scale4b_branch2b 0=256 1=1
ReLU res4b_branch2b_relu 1 1 res4b_branch2b_scale4b_branch2b res4b_branch2b_res4b_branch2b_relu
Convolution res4b_branch2c 1 1 res4b_branch2b_res4b_branch2b_relu res4b_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
BatchNorm bn4b_branch2c 1 1 res4b_branch2c res4b_branch2c_bn4b_branch2c 0=1024
Scale scale4b_branch2c 1 1 res4b_branch2c_bn4b_branch2c res4b_branch2c_scale4b_branch2c 0=1024 1=1
Eltwise res4b 2 1 res4a_res4a_relu_splitncnn_0 res4b_branch2c_scale4b_branch2c res4b 0=1 -23301=0
ReLU res4b_relu 1 1 res4b res4b_res4b_relu
Split splitncnn_9 1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1
Convolution res4c_branch2a 1 1 res4b_res4b_relu_splitncnn_1 res4c_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
BatchNorm bn4c_branch2a 1 1 res4c_branch2a res4c_branch2a_bn4c_branch2a 0=256
Scale scale4c_branch2a 1 1 res4c_branch2a_bn4c_branch2a res4c_branch2a_scale4c_branch2a 0=256 1=1
ReLU res4c_branch2a_relu 1 1 res4c_branch2a_scale4c_branch2a res4c_branch2a_res4c_branch2a_relu
Convolution res4c_branch2b 1 1 res4c_branch2a_res4c_branch2a_relu res4c_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2
BatchNorm bn4c_branch2b 1 1 res4c_branch2b res4c_branch2b_bn4c_branch2b 0=256
Scale scale4c_branch2b 1 1 res4c_branch2b_bn4c_branch2b res4c_branch2b_scale4c_branch2b 0=256 1=1
ReLU res4c_branch2b_relu 1 1 res4c_branch2b_scale4c_branch2b res4c_branch2b_res4c_branch2b_relu
Convolution res4c_branch2c 1 1 res4c_branch2b_res4c_branch2b_relu res4c_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
BatchNorm bn4c_branch2c 1 1 res4c_branch2c res4c_branch2c_bn4c_branch2c 0=1024
Scale scale4c_branch2c 1 1 res4c_branch2c_bn4c_branch2c res4c_branch2c_scale4c_branch2c 0=1024 1=1
Eltwise res4c 2 1 res4b_res4b_relu_splitncnn_0 res4c_branch2c_scale4c_branch2c res4c 0=1 -23301=0
ReLU res4c_relu 1 1 res4c res4c_res4c_relu
Split splitncnn_10 1 2 res4c_res4c_relu res4c_res4c_relu_splitncnn_0 res4c_res4c_relu_splitncnn_1
Convolution res4d_branch2a 1 1 res4c_res4c_relu_splitncnn_1 res4d_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
BatchNorm bn4d_branch2a 1 1 res4d_branch2a res4d_branch2a_bn4d_branch2a 0=256
Scale scale4d_branch2a 1 1 res4d_branch2a_bn4d_branch2a res4d_branch2a_scale4d_branch2a 0=256 1=1
ReLU res4d_branch2a_relu 1 1 res4d_branch2a_scale4d_branch2a res4d_branch2a_res4d_branch2a_relu
Convolution res4d_branch2b 1 1 res4d_branch2a_res4d_branch2a_relu res4d_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2
BatchNorm bn4d_branch2b 1 1 res4d_branch2b res4d_branch2b_bn4d_branch2b 0=256
Scale scale4d_branch2b 1 1 res4d_branch2b_bn4d_branch2b res4d_branch2b_scale4d_branch2b 0=256 1=1
ReLU res4d_branch2b_relu 1 1 res4d_branch2b_scale4d_branch2b res4d_branch2b_res4d_branch2b_relu
Convolution res4d_branch2c 1 1 res4d_branch2b_res4d_branch2b_relu res4d_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
BatchNorm bn4d_branch2c 1 1 res4d_branch2c res4d_branch2c_bn4d_branch2c 0=1024
Scale scale4d_branch2c 1 1 res4d_branch2c_bn4d_branch2c res4d_branch2c_scale4d_branch2c 0=1024 1=1
Eltwise res4d 2 1 res4c_res4c_relu_splitncnn_0 res4d_branch2c_scale4d_branch2c res4d 0=1 -23301=0
ReLU res4d_relu 1 1 res4d res4d_res4d_relu
Split splitncnn_11 1 2 res4d_res4d_relu res4d_res4d_relu_splitncnn_0 res4d_res4d_relu_splitncnn_1
Convolution res4e_branch2a 1 1 res4d_res4d_relu_splitncnn_1 res4e_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
BatchNorm bn4e_branch2a 1 1 res4e_branch2a res4e_branch2a_bn4e_branch2a 0=256
Scale scale4e_branch2a 1 1 res4e_branch2a_bn4e_branch2a res4e_branch2a_scale4e_branch2a 0=256 1=1
ReLU res4e_branch2a_relu 1 1 res4e_branch2a_scale4e_branch2a res4e_branch2a_res4e_branch2a_relu
Convolution res4e_branch2b 1 1 res4e_branch2a_res4e_branch2a_relu res4e_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2
BatchNorm bn4e_branch2b 1 1 res4e_branch2b res4e_branch2b_bn4e_branch2b 0=256
Scale scale4e_branch2b 1 1 res4e_branch2b_bn4e_branch2b res4e_branch2b_scale4e_branch2b 0=256 1=1
ReLU res4e_branch2b_relu 1 1 res4e_branch2b_scale4e_branch2b res4e_branch2b_res4e_branch2b_relu
Convolution res4e_branch2c 1 1 res4e_branch2b_res4e_branch2b_relu res4e_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
BatchNorm bn4e_branch2c 1 1 res4e_branch2c res4e_branch2c_bn4e_branch2c 0=1024
Scale scale4e_branch2c 1 1 res4e_branch2c_bn4e_branch2c res4e_branch2c_scale4e_branch2c 0=1024 1=1
Eltwise res4e 2 1 res4d_res4d_relu_splitncnn_0 res4e_branch2c_scale4e_branch2c res4e 0=1 -23301=0
ReLU res4e_relu 1 1 res4e res4e_res4e_relu
Split splitncnn_12 1 2 res4e_res4e_relu res4e_res4e_relu_splitncnn_0 res4e_res4e_relu_splitncnn_1
Convolution res4f_branch2a 1 1 res4e_res4e_relu_splitncnn_1 res4f_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
BatchNorm bn4f_branch2a 1 1 res4f_branch2a res4f_branch2a_bn4f_branch2a 0=256
Scale scale4f_branch2a 1 1 res4f_branch2a_bn4f_branch2a res4f_branch2a_scale4f_branch2a 0=256 1=1
ReLU res4f_branch2a_relu 1 1 res4f_branch2a_scale4f_branch2a res4f_branch2a_res4f_branch2a_relu
Convolution res4f_branch2b 1 1 res4f_branch2a_res4f_branch2a_relu res4f_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2
BatchNorm bn4f_branch2b 1 1 res4f_branch2b res4f_branch2b_bn4f_branch2b 0=256
Scale scale4f_branch2b 1 1 res4f_branch2b_bn4f_branch2b res4f_branch2b_scale4f_branch2b 0=256 1=1
ReLU res4f_branch2b_relu 1 1 res4f_branch2b_scale4f_branch2b res4f_branch2b_res4f_branch2b_relu
Convolution res4f_branch2c 1 1 res4f_branch2b_res4f_branch2b_relu res4f_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
BatchNorm bn4f_branch2c 1 1 res4f_branch2c res4f_branch2c_bn4f_branch2c 0=1024
Scale scale4f_branch2c 1 1 res4f_branch2c_bn4f_branch2c res4f_branch2c_scale4f_branch2c 0=1024 1=1
Eltwise res4f 2 1 res4e_res4e_relu_splitncnn_0 res4f_branch2c_scale4f_branch2c res4f 0=1 -23301=0
ReLU res4f_relu 1 1 res4f res4f_res4f_relu
Split splitncnn_13 1 2 res4f_res4f_relu res4f_res4f_relu_splitncnn_0 res4f_res4f_relu_splitncnn_1
Convolution res5a_branch1 1 1 res4f_res4f_relu_splitncnn_1 res5a_branch1 0=2048 1=1 2=1 3=2 4=0 5=0 6=2097152 8=2
BatchNorm bn5a_branch1 1 1 res5a_branch1 res5a_branch1_bn5a_branch1 0=2048
Scale scale5a_branch1 1 1 res5a_branch1_bn5a_branch1 res5a_branch1_scale5a_branch1 0=2048 1=1
Convolution res5a_branch2a 1 1 res4f_res4f_relu_splitncnn_0 res5a_branch2a 0=512 1=1 2=1 3=2 4=0 5=0 6=524288 8=2
BatchNorm bn5a_branch2a 1 1 res5a_branch2a res5a_branch2a_bn5a_branch2a 0=512
Scale scale5a_branch2a 1 1 res5a_branch2a_bn5a_branch2a res5a_branch2a_scale5a_branch2a 0=512 1=1
ReLU res5a_branch2a_relu 1 1 res5a_branch2a_scale5a_branch2a res5a_branch2a_res5a_branch2a_relu
Convolution res5a_branch2b 1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2
BatchNorm bn5a_branch2b 1 1 res5a_branch2b res5a_branch2b_bn5a_branch2b 0=512
Scale scale5a_branch2b 1 1 res5a_branch2b_bn5a_branch2b res5a_branch2b_scale5a_branch2b 0=512 1=1
ReLU res5a_branch2b_relu 1 1 res5a_branch2b_scale5a_branch2b res5a_branch2b_res5a_branch2b_relu
Convolution res5a_branch2c 1 1 res5a_branch2b_res5a_branch2b_relu res5a_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2
BatchNorm bn5a_branch2c 1 1 res5a_branch2c res5a_branch2c_bn5a_branch2c 0=2048
Scale scale5a_branch2c 1 1 res5a_branch2c_bn5a_branch2c res5a_branch2c_scale5a_branch2c 0=2048 1=1
Eltwise res5a 2 1 res5a_branch1_scale5a_branch1 res5a_branch2c_scale5a_branch2c res5a 0=1 -23301=0
ReLU res5a_relu 1 1 res5a res5a_res5a_relu
Split splitncnn_14 1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1
Convolution res5b_branch2a 1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a 0=512 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2
BatchNorm bn5b_branch2a 1 1 res5b_branch2a res5b_branch2a_bn5b_branch2a 0=512
Scale scale5b_branch2a 1 1 res5b_branch2a_bn5b_branch2a res5b_branch2a_scale5b_branch2a 0=512 1=1
ReLU res5b_branch2a_relu 1 1 res5b_branch2a_scale5b_branch2a res5b_branch2a_res5b_branch2a_relu
Convolution res5b_branch2b 1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2
BatchNorm bn5b_branch2b 1 1 res5b_branch2b res5b_branch2b_bn5b_branch2b 0=512
Scale scale5b_branch2b 1 1 res5b_branch2b_bn5b_branch2b res5b_branch2b_scale5b_branch2b 0=512 1=1
ReLU res5b_branch2b_relu 1 1 res5b_branch2b_scale5b_branch2b res5b_branch2b_res5b_branch2b_relu
Convolution res5b_branch2c 1 1 res5b_branch2b_res5b_branch2b_relu res5b_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2
BatchNorm bn5b_branch2c 1 1 res5b_branch2c res5b_branch2c_bn5b_branch2c 0=2048
Scale scale5b_branch2c 1 1 res5b_branch2c_bn5b_branch2c res5b_branch2c_scale5b_branch2c 0=2048 1=1
Eltwise res5b 2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2c_scale5b_branch2c res5b 0=1 -23301=0
ReLU res5b_relu 1 1 res5b res5b_res5b_relu
Split splitncnn_15 1 2 res5b_res5b_relu res5b_res5b_relu_splitncnn_0 res5b_res5b_relu_splitncnn_1
Convolution res5c_branch2a 1 1 res5b_res5b_relu_splitncnn_1 res5c_branch2a 0=512 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2
BatchNorm bn5c_branch2a 1 1 res5c_branch2a res5c_branch2a_bn5c_branch2a 0=512
Scale scale5c_branch2a 1 1 res5c_branch2a_bn5c_branch2a res5c_branch2a_scale5c_branch2a 0=512 1=1
ReLU res5c_branch2a_relu 1 1 res5c_branch2a_scale5c_branch2a res5c_branch2a_res5c_branch2a_relu
Convolution res5c_branch2b 1 1 res5c_branch2a_res5c_branch2a_relu res5c_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2
BatchNorm bn5c_branch2b 1 1 res5c_branch2b res5c_branch2b_bn5c_branch2b 0=512
Scale scale5c_branch2b 1 1 res5c_branch2b_bn5c_branch2b res5c_branch2b_scale5c_branch2b 0=512 1=1
ReLU res5c_branch2b_relu 1 1 res5c_branch2b_scale5c_branch2b res5c_branch2b_res5c_branch2b_relu
Convolution res5c_branch2c 1 1 res5c_branch2b_res5c_branch2b_relu res5c_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2
BatchNorm bn5c_branch2c 1 1 res5c_branch2c res5c_branch2c_bn5c_branch2c 0=2048
Scale scale5c_branch2c 1 1 res5c_branch2c_bn5c_branch2c res5c_branch2c_scale5c_branch2c 0=2048 1=1
Eltwise res5c 2 1 res5b_res5b_relu_splitncnn_0 res5c_branch2c_scale5c_branch2c res5c 0=1 -23301=0
ReLU res5c_relu 1 1 res5c res5c_res5c_relu
Pooling pool5 1 1 res5c_res5c_relu pool5 0=1 1=7 2=1 3=0 4=0
InnerProduct fc1000 1 1 pool5 fc1000 0=1000 1=1 2=2048000
Softmax prob 1 1 fc1000 prob 0=0

+ 77
- 0
benchmark/squeezenet_int8.param View File

@@ -0,0 +1,77 @@
7767517
75 83
Input data 0 1 data 0=227 1=227 2=3
Convolution conv1 1 1 data conv1 0=64 1=3 2=1 3=2 4=0 5=1 6=1728 8=2
ReLU relu_conv1 1 1 conv1 conv1_relu_conv1
Pooling pool1 1 1 conv1_relu_conv1 pool1 0=0 1=3 2=2 3=0 4=0
Convolution fire2/squeeze1x1 1 1 pool1 fire2/squeeze1x1 0=16 1=1 2=1 3=1 4=0 5=1 6=1024 8=2
ReLU fire2/relu_squeeze1x1 1 1 fire2/squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1
Split splitncnn_0 1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1
Convolution fire2/expand1x1 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=1024 8=2
ReLU fire2/relu_expand1x1 1 1 fire2/expand1x1 fire2/expand1x1_fire2/relu_expand1x1
Convolution fire2/expand3x3 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3 0=64 1=3 2=1 3=1 4=1 5=1 6=9216 8=2
ReLU fire2/relu_expand3x3 1 1 fire2/expand3x3 fire2/expand3x3_fire2/relu_expand3x3
Concat fire2/concat 2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat 0=0
Convolution fire3/squeeze1x1 1 1 fire2/concat fire3/squeeze1x1 0=16 1=1 2=1 3=1 4=0 5=1 6=2048 8=2
ReLU fire3/relu_squeeze1x1 1 1 fire3/squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1
Split splitncnn_1 1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1
Convolution fire3/expand1x1 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=1024 8=2
ReLU fire3/relu_expand1x1 1 1 fire3/expand1x1 fire3/expand1x1_fire3/relu_expand1x1
Convolution fire3/expand3x3 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3 0=64 1=3 2=1 3=1 4=1 5=1 6=9216 8=2
ReLU fire3/relu_expand3x3 1 1 fire3/expand3x3 fire3/expand3x3_fire3/relu_expand3x3
Concat fire3/concat 2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat 0=0
Pooling pool3 1 1 fire3/concat pool3 0=0 1=3 2=2 3=0 4=0
Convolution fire4/squeeze1x1 1 1 pool3 fire4/squeeze1x1 0=32 1=1 2=1 3=1 4=0 5=1 6=4096 8=2
ReLU fire4/relu_squeeze1x1 1 1 fire4/squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1
Split splitncnn_2 1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1
Convolution fire4/expand1x1 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=4096 8=2
ReLU fire4/relu_expand1x1 1 1 fire4/expand1x1 fire4/expand1x1_fire4/relu_expand1x1
Convolution fire4/expand3x3 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=36864 8=2
ReLU fire4/relu_expand3x3 1 1 fire4/expand3x3 fire4/expand3x3_fire4/relu_expand3x3
Concat fire4/concat 2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat 0=0
Convolution fire5/squeeze1x1 1 1 fire4/concat fire5/squeeze1x1 0=32 1=1 2=1 3=1 4=0 5=1 6=8192 8=2
ReLU fire5/relu_squeeze1x1 1 1 fire5/squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1
Split splitncnn_3 1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1
Convolution fire5/expand1x1 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=4096 8=2
ReLU fire5/relu_expand1x1 1 1 fire5/expand1x1 fire5/expand1x1_fire5/relu_expand1x1
Convolution fire5/expand3x3 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=36864 8=2
ReLU fire5/relu_expand3x3 1 1 fire5/expand3x3 fire5/expand3x3_fire5/relu_expand3x3
Concat fire5/concat 2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat 0=0
Pooling pool5 1 1 fire5/concat pool5 0=0 1=3 2=2 3=0 4=0
Convolution fire6/squeeze1x1 1 1 pool5 fire6/squeeze1x1 0=48 1=1 2=1 3=1 4=0 5=1 6=12288 8=2
ReLU fire6/relu_squeeze1x1 1 1 fire6/squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1
Split splitncnn_4 1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1
Convolution fire6/expand1x1 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=9216 8=2
ReLU fire6/relu_expand1x1 1 1 fire6/expand1x1 fire6/expand1x1_fire6/relu_expand1x1
Convolution fire6/expand3x3 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=82944 8=2
ReLU fire6/relu_expand3x3 1 1 fire6/expand3x3 fire6/expand3x3_fire6/relu_expand3x3
Concat fire6/concat 2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat 0=0
Convolution fire7/squeeze1x1 1 1 fire6/concat fire7/squeeze1x1 0=48 1=1 2=1 3=1 4=0 5=1 6=18432 8=2
ReLU fire7/relu_squeeze1x1 1 1 fire7/squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1
Split splitncnn_5 1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1
Convolution fire7/expand1x1 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=9216 8=2
ReLU fire7/relu_expand1x1 1 1 fire7/expand1x1 fire7/expand1x1_fire7/relu_expand1x1
Convolution fire7/expand3x3 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=82944 8=2
ReLU fire7/relu_expand3x3 1 1 fire7/expand3x3 fire7/expand3x3_fire7/relu_expand3x3
Concat fire7/concat 2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat 0=0
Convolution fire8/squeeze1x1 1 1 fire7/concat fire8/squeeze1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=24576 8=2
ReLU fire8/relu_squeeze1x1 1 1 fire8/squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1
Split splitncnn_6 1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1
Convolution fire8/expand1x1 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=16384 8=2
ReLU fire8/relu_expand1x1 1 1 fire8/expand1x1 fire8/expand1x1_fire8/relu_expand1x1
Convolution fire8/expand3x3 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=147456 8=2
ReLU fire8/relu_expand3x3 1 1 fire8/expand3x3 fire8/expand3x3_fire8/relu_expand3x3
Concat fire8/concat 2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat 0=0
Convolution fire9/squeeze1x1 1 1 fire8/concat fire9/squeeze1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=32768 8=2
ReLU fire9/relu_squeeze1x1 1 1 fire9/squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1
Split splitncnn_7 1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1
Convolution fire9/expand1x1 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=16384 8=2
ReLU fire9/relu_expand1x1 1 1 fire9/expand1x1 fire9/expand1x1_fire9/relu_expand1x1
Convolution fire9/expand3x3 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=147456 8=2
ReLU fire9/relu_expand3x3 1 1 fire9/expand3x3 fire9/expand3x3_fire9/relu_expand3x3
Concat fire9/concat 2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat 0=0
Dropout drop9 1 1 fire9/concat fire9/concat_drop9
Convolution conv10 1 1 fire9/concat_drop9 conv10 0=1000 1=1 2=1 3=1 4=1 5=1 6=512000 8=2
ReLU relu_conv10 1 1 conv10 conv10_relu_conv10
Pooling pool10 1 1 conv10_relu_conv10 pool10 0=1 1=0 2=1 3=0 4=1
Softmax prob 1 1 pool10 prob 0=0

+ 181
- 0
benchmark/squeezenet_ssd_int8.param View File

@@ -0,0 +1,181 @@
7767517
179 212
Input data 0 1 data 0=300 1=300 2=3
Split splitncnn_0 1 7 data data_splitncnn_0 data_splitncnn_1 data_splitncnn_2 data_splitncnn_3 data_splitncnn_4 data_splitncnn_5 data_splitncnn_6
Convolution conv1 1 1 data_splitncnn_6 conv1 0=64 1=3 2=1 3=2 4=0 5=1 6=1728 8=2
ReLU relu_conv1 1 1 conv1 conv1_relu_conv1
Pooling pool1 1 1 conv1_relu_conv1 pool1 0=0 1=3 2=2 3=0 4=0
Convolution fire2/squeeze1x1 1 1 pool1 fire2/squeeze1x1 0=16 1=1 2=1 3=1 4=0 5=1 6=1024 8=2
ReLU fire2/relu_squeeze1x1 1 1 fire2/squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1
Split splitncnn_1 1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1
Convolution fire2/expand1x1 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=1024 8=2
ReLU fire2/relu_expand1x1 1 1 fire2/expand1x1 fire2/expand1x1_fire2/relu_expand1x1
Convolution fire2/expand3x3 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3 0=64 1=3 2=1 3=1 4=1 5=1 6=9216 8=2
ReLU fire2/relu_expand3x3 1 1 fire2/expand3x3 fire2/expand3x3_fire2/relu_expand3x3
Concat fire2/concat 2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat 0=0
Convolution fire3/squeeze1x1 1 1 fire2/concat fire3/squeeze1x1 0=16 1=1 2=1 3=1 4=0 5=1 6=2048 8=2
ReLU fire3/relu_squeeze1x1 1 1 fire3/squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1
Split splitncnn_2 1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1
Convolution fire3/expand1x1 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=1024 8=2
ReLU fire3/relu_expand1x1 1 1 fire3/expand1x1 fire3/expand1x1_fire3/relu_expand1x1
Convolution fire3/expand3x3 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3 0=64 1=3 2=1 3=1 4=1 5=1 6=9216 8=2
ReLU fire3/relu_expand3x3 1 1 fire3/expand3x3 fire3/expand3x3_fire3/relu_expand3x3
Concat fire3/concat 2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat 0=0
Pooling pool3 1 1 fire3/concat pool3 0=0 1=3 2=2 3=0 4=0
Convolution fire4/squeeze1x1 1 1 pool3 fire4/squeeze1x1 0=32 1=1 2=1 3=1 4=0 5=1 6=4096 8=2
ReLU fire4/relu_squeeze1x1 1 1 fire4/squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1
Split splitncnn_3 1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1
Convolution fire4/expand1x1 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=4096 8=2
ReLU fire4/relu_expand1x1 1 1 fire4/expand1x1 fire4/expand1x1_fire4/relu_expand1x1
Convolution fire4/expand3x3 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=36864 8=2
ReLU fire4/relu_expand3x3 1 1 fire4/expand3x3 fire4/expand3x3_fire4/relu_expand3x3
Concat fire4/concat 2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat 0=0
Convolution fire5/squeeze1x1 1 1 fire4/concat fire5/squeeze1x1 0=32 1=1 2=1 3=1 4=0 5=1 6=8192 8=2
ReLU fire5/relu_squeeze1x1 1 1 fire5/squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1
Split splitncnn_4 1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1
Convolution fire5/expand1x1 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=4096 8=2
ReLU fire5/relu_expand1x1 1 1 fire5/expand1x1 fire5/expand1x1_fire5/relu_expand1x1
Convolution fire5/expand3x3 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=36864 8=2
ReLU fire5/relu_expand3x3 1 1 fire5/expand3x3 fire5/expand3x3_fire5/relu_expand3x3
Concat fire5/concat 2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat 0=0
Split splitncnn_5 1 2 fire5/concat fire5/concat_splitncnn_0 fire5/concat_splitncnn_1
Pooling pool5 1 1 fire5/concat_splitncnn_1 pool5 0=0 1=3 2=2 3=0 4=0
Convolution fire6/squeeze1x1 1 1 pool5 fire6/squeeze1x1 0=48 1=1 2=1 3=1 4=0 5=1 6=12288 8=2
ReLU fire6/relu_squeeze1x1 1 1 fire6/squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1
Split splitncnn_6 1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1
Convolution fire6/expand1x1 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=9216 8=2
ReLU fire6/relu_expand1x1 1 1 fire6/expand1x1 fire6/expand1x1_fire6/relu_expand1x1
Convolution fire6/expand3x3 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=82944 8=2
ReLU fire6/relu_expand3x3 1 1 fire6/expand3x3 fire6/expand3x3_fire6/relu_expand3x3
Concat fire6/concat 2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat 0=0
Convolution fire7/squeeze1x1 1 1 fire6/concat fire7/squeeze1x1 0=48 1=1 2=1 3=1 4=0 5=1 6=18432 8=2
ReLU fire7/relu_squeeze1x1 1 1 fire7/squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1
Split splitncnn_7 1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1
Convolution fire7/expand1x1 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=9216 8=2
ReLU fire7/relu_expand1x1 1 1 fire7/expand1x1 fire7/expand1x1_fire7/relu_expand1x1
Convolution fire7/expand3x3 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=82944 8=2
ReLU fire7/relu_expand3x3 1 1 fire7/expand3x3 fire7/expand3x3_fire7/relu_expand3x3
Concat fire7/concat 2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat 0=0
Convolution fire8/squeeze1x1 1 1 fire7/concat fire8/squeeze1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=24576 8=2
ReLU fire8/relu_squeeze1x1 1 1 fire8/squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1
Split splitncnn_8 1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1
Convolution fire8/expand1x1 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=16384 8=2
ReLU fire8/relu_expand1x1 1 1 fire8/expand1x1 fire8/expand1x1_fire8/relu_expand1x1
Convolution fire8/expand3x3 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=147456 8=2
ReLU fire8/relu_expand3x3 1 1 fire8/expand3x3 fire8/expand3x3_fire8/relu_expand3x3
Concat fire8/concat 2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat 0=0
Convolution fire9/squeeze1x1 1 1 fire8/concat fire9/squeeze1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=32768 8=2
ReLU fire9/relu_squeeze1x1 1 1 fire9/squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1
Split splitncnn_9 1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1
Convolution fire9/expand1x1 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=16384 8=2
BatchNorm fire9/expand1x1/bn 1 1 fire9/expand1x1 fire9/expand1x1_fire9/expand1x1/bn 0=256
Scale fire9/expand1x1/scale 1 1 fire9/expand1x1_fire9/expand1x1/bn fire9/expand1x1_fire9/expand1x1/scale 0=256 1=1
ReLU fire9/relu_expand1x1 1 1 fire9/expand1x1_fire9/expand1x1/scale fire9/expand1x1_fire9/relu_expand1x1
Convolution fire9/expand3x3 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=147456 8=2
BatchNorm fire9/expand3x3/bn 1 1 fire9/expand3x3 fire9/expand3x3_fire9/expand3x3/bn 0=256
Scale fire9/expand3x3/scale 1 1 fire9/expand3x3_fire9/expand3x3/bn fire9/expand3x3_fire9/expand3x3/scale 0=256 1=1
ReLU fire9/relu_expand3x3 1 1 fire9/expand3x3_fire9/expand3x3/scale fire9/expand3x3_fire9/relu_expand3x3
Concat fire9/concat 2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat 0=0
Split splitncnn_10 1 4 fire9/concat fire9/concat_splitncnn_0 fire9/concat_splitncnn_1 fire9/concat_splitncnn_2 fire9/concat_splitncnn_3
Pooling pool9 1 1 fire9/concat_splitncnn_3 pool9 0=0 1=3 2=2 3=0 4=0
Convolution fire10/squeeze1x1 1 1 pool9 fire10/squeeze1x1 0=96 1=1 2=1 3=1 4=0 5=1 6=49152 8=2
BatchNorm fire10/squeeze1x1/bn 1 1 fire10/squeeze1x1 fire10/squeeze1x1_fire10/squeeze1x1/bn 0=96
Scale fire10/squeeze1x1/scale 1 1 fire10/squeeze1x1_fire10/squeeze1x1/bn fire10/squeeze1x1_fire10/squeeze1x1/scale 0=96 1=1
ReLU fire10/relu_squeeze1x1 1 1 fire10/squeeze1x1_fire10/squeeze1x1/scale fire10/squeeze1x1_fire10/relu_squeeze1x1
Split splitncnn_11 1 2 fire10/squeeze1x1_fire10/relu_squeeze1x1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_0 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_1
Convolution fire10/expand1x1 1 1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_1 fire10/expand1x1 0=384 1=1 2=1 3=1 4=0 5=1 6=36864 8=2
BatchNorm fire10/expand1x1/bn 1 1 fire10/expand1x1 fire10/expand1x1_fire10/expand1x1/bn 0=384
Scale fire10/expand1x1/scale 1 1 fire10/expand1x1_fire10/expand1x1/bn fire10/expand1x1_fire10/expand1x1/scale 0=384 1=1
ReLU fire10/relu_expand1x1 1 1 fire10/expand1x1_fire10/expand1x1/scale fire10/expand1x1_fire10/relu_expand1x1
Convolution fire10/expand3x3 1 1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_0 fire10/expand3x3 0=384 1=3 2=1 3=1 4=1 5=1 6=331776 8=2
BatchNorm fire10/expand3x3/bn 1 1 fire10/expand3x3 fire10/expand3x3_fire10/expand3x3/bn 0=384
Scale fire10/expand3x3/scale 1 1 fire10/expand3x3_fire10/expand3x3/bn fire10/expand3x3_fire10/expand3x3/scale 0=384 1=1
ReLU fire10/relu_expand3x3 1 1 fire10/expand3x3_fire10/expand3x3/scale fire10/expand3x3_fire10/relu_expand3x3
Concat fire10/concat 2 1 fire10/expand1x1_fire10/relu_expand1x1 fire10/expand3x3_fire10/relu_expand3x3 fire10/concat 0=0
Split splitncnn_12 1 4 fire10/concat fire10/concat_splitncnn_0 fire10/concat_splitncnn_1 fire10/concat_splitncnn_2 fire10/concat_splitncnn_3
Pooling pool10 1 1 fire10/concat_splitncnn_3 pool10 0=0 1=3 2=2 3=0 4=0
Convolution fire11/squeeze1x1 1 1 pool10 fire11/squeeze1x1 0=96 1=1 2=1 3=1 4=0 5=1 6=73728 8=2
BatchNorm fire11/squeeze1x1/bn 1 1 fire11/squeeze1x1 fire11/squeeze1x1_fire11/squeeze1x1/bn 0=96
Scale fire11/squeeze1x1/scale 1 1 fire11/squeeze1x1_fire11/squeeze1x1/bn fire11/squeeze1x1_fire11/squeeze1x1/scale 0=96 1=1
ReLU fire11/relu_squeeze1x1 1 1 fire11/squeeze1x1_fire11/squeeze1x1/scale fire11/squeeze1x1_fire11/relu_squeeze1x1
Split splitncnn_13 1 2 fire11/squeeze1x1_fire11/relu_squeeze1x1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_0 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_1
Convolution fire11/expand1x1 1 1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_1 fire11/expand1x1 0=384 1=1 2=1 3=1 4=0 5=1 6=36864 8=2
BatchNorm fire11/expand1x1/bn 1 1 fire11/expand1x1 fire11/expand1x1_fire11/expand1x1/bn 0=384
Scale fire11/expand1x1/scale 1 1 fire11/expand1x1_fire11/expand1x1/bn fire11/expand1x1_fire11/expand1x1/scale 0=384 1=1
ReLU fire11/relu_expand1x1 1 1 fire11/expand1x1_fire11/expand1x1/scale fire11/expand1x1_fire11/relu_expand1x1
Convolution fire11/expand3x3 1 1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_0 fire11/expand3x3 0=384 1=3 2=1 3=1 4=1 5=1 6=331776 8=2
BatchNorm fire11/expand3x3/bn 1 1 fire11/expand3x3 fire11/expand3x3_fire11/expand3x3/bn 0=384
Scale fire11/expand3x3/scale 1 1 fire11/expand3x3_fire11/expand3x3/bn fire11/expand3x3_fire11/expand3x3/scale 0=384 1=1
ReLU fire11/relu_expand3x3 1 1 fire11/expand3x3_fire11/expand3x3/scale fire11/expand3x3_fire11/relu_expand3x3
Concat fire11/concat 2 1 fire11/expand1x1_fire11/relu_expand1x1 fire11/expand3x3_fire11/relu_expand3x3 fire11/concat 0=0
Split splitncnn_14 1 4 fire11/concat fire11/concat_splitncnn_0 fire11/concat_splitncnn_1 fire11/concat_splitncnn_2 fire11/concat_splitncnn_3
Convolution conv12_1 1 1 fire11/concat_splitncnn_3 conv12_1 0=128 1=1 2=1 3=1 4=0 5=0 6=98304 8=2
BatchNorm conv12_1/bn 1 1 conv12_1 conv12_1_conv12_1/bn 0=128
Scale conv12_1/scale 1 1 conv12_1_conv12_1/bn conv12_1_conv12_1/scale 0=128 1=1
ReLU conv12_1/relu 1 1 conv12_1_conv12_1/scale conv12_1_conv12_1/relu
Convolution conv12_2 1 1 conv12_1_conv12_1/relu conv12_2 0=256 1=3 2=1 3=2 4=1 5=0 6=294912 8=2
BatchNorm conv12_2/bn 1 1 conv12_2 conv12_2_conv12_2/bn 0=256
Scale conv12_2/scale 1 1 conv12_2_conv12_2/bn conv12_2_conv12_2/scale 0=256 1=1
ReLU conv12_2/relu 1 1 conv12_2_conv12_2/scale conv12_2_conv12_2/relu
Split splitncnn_15 1 4 conv12_2_conv12_2/relu conv12_2_conv12_2/relu_splitncnn_0 conv12_2_conv12_2/relu_splitncnn_1 conv12_2_conv12_2/relu_splitncnn_2 conv12_2_conv12_2/relu_splitncnn_3
Convolution conv13_1 1 1 conv12_2_conv12_2/relu_splitncnn_3 conv13_1 0=64 1=1 2=1 3=1 4=0 5=0 6=16384 8=2
BatchNorm conv13_1/bn 1 1 conv13_1 conv13_1_conv13_1/bn 0=64
Scale conv13_1/scale 1 1 conv13_1_conv13_1/bn conv13_1_conv13_1/scale 0=64 1=1
ReLU conv13_1/relu 1 1 conv13_1_conv13_1/scale conv13_1_conv13_1/relu
Convolution conv13_2 1 1 conv13_1_conv13_1/relu conv13_2 0=128 1=3 2=1 3=2 4=1 5=0 6=73728 8=2
BatchNorm conv13_2/bn 1 1 conv13_2 conv13_2_conv13_2/bn 0=128
Scale conv13_2/scale 1 1 conv13_2_conv13_2/bn conv13_2_conv13_2/scale 0=128 1=1
ReLU conv13_2/relu 1 1 conv13_2_conv13_2/scale conv13_2_conv13_2/relu
Split splitncnn_16 1 3 conv13_2_conv13_2/relu conv13_2_conv13_2/relu_splitncnn_0 conv13_2_conv13_2/relu_splitncnn_1 conv13_2_conv13_2/relu_splitncnn_2
BatchNorm fire5/bn 1 1 fire5/concat_splitncnn_0 fire5/normal 0=256
Scale fire5/scale 1 1 fire5/normal fire5/normal_fire5/scale 0=256 1=1
Split splitncnn_17 1 3 fire5/normal_fire5/scale fire5/normal_fire5/scale_splitncnn_0 fire5/normal_fire5/scale_splitncnn_1 fire5/normal_fire5/scale_splitncnn_2
Convolution fire5_mbox_loc 1 1 fire5/normal_fire5/scale_splitncnn_2 fire5_mbox_loc 0=16 1=3 2=1 3=1 4=1 5=1 6=36864 8=2
Permute fire5_mbox_loc_perm 1 1 fire5_mbox_loc fire5_mbox_loc_perm 0=3
Flatten fire5_mbox_loc_flat 1 1 fire5_mbox_loc_perm fire5_mbox_loc_flat
Convolution fire5_mbox_conf 1 1 fire5/normal_fire5/scale_splitncnn_1 fire5_mbox_conf 0=84 1=3 2=1 3=1 4=1 5=1 6=193536 8=2
Permute fire5_mbox_conf_perm 1 1 fire5_mbox_conf fire5_mbox_conf_perm 0=3
Flatten fire5_mbox_conf_flat 1 1 fire5_mbox_conf_perm fire5_mbox_conf_flat
PriorBox fire5_mbox_priorbox 2 1 fire5/normal_fire5/scale_splitncnn_0 data_splitncnn_5 fire5_mbox_priorbox -23300=1,21.000000 -23301=1,45.000000 -23302=1,2.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=8.000000 12=8.000000 13=0.500000
Convolution fire9_mbox_loc 1 1 fire9/concat_splitncnn_2 fire9_mbox_loc 0=24 1=3 2=1 3=1 4=1 5=1 6=110592 8=2
Permute fire9_mbox_loc_perm 1 1 fire9_mbox_loc fire9_mbox_loc_perm 0=3
Flatten fire9_mbox_loc_flat 1 1 fire9_mbox_loc_perm fire9_mbox_loc_flat
Convolution fire9_mbox_conf 1 1 fire9/concat_splitncnn_1 fire9_mbox_conf 0=126 1=3 2=1 3=1 4=1 5=1 6=580608 8=2
Permute fire9_mbox_conf_perm 1 1 fire9_mbox_conf fire9_mbox_conf_perm 0=3
Flatten fire9_mbox_conf_flat 1 1 fire9_mbox_conf_perm fire9_mbox_conf_flat
PriorBox fire9_mbox_priorbox 2 1 fire9/concat_splitncnn_0 data_splitncnn_4 fire9_mbox_priorbox -23300=1,45.000000 -23301=1,99.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=16.000000 12=16.000000 13=0.500000
Convolution fire10_mbox_loc 1 1 fire10/concat_splitncnn_2 fire10_mbox_loc 0=24 1=3 2=1 3=1 4=1 5=1 6=165888 8=2
Permute fire10_mbox_loc_perm 1 1 fire10_mbox_loc fire10_mbox_loc_perm 0=3
Flatten fire10_mbox_loc_flat 1 1 fire10_mbox_loc_perm fire10_mbox_loc_flat
Convolution fire10_mbox_conf 1 1 fire10/concat_splitncnn_1 fire10_mbox_conf 0=126 1=3 2=1 3=1 4=1 5=1 6=870912 8=2
Permute fire10_mbox_conf_perm 1 1 fire10_mbox_conf fire10_mbox_conf_perm 0=3
Flatten fire10_mbox_conf_flat 1 1 fire10_mbox_conf_perm fire10_mbox_conf_flat
PriorBox fire10_mbox_priorbox 2 1 fire10/concat_splitncnn_0 data_splitncnn_3 fire10_mbox_priorbox -23300=1,99.000000 -23301=1,153.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=32.000000 12=32.000000 13=0.500000
Convolution fire11_mbox_loc 1 1 fire11/concat_splitncnn_2 fire11_mbox_loc 0=24 1=3 2=1 3=1 4=1 5=1 6=165888 8=2
Permute fire11_mbox_loc_perm 1 1 fire11_mbox_loc fire11_mbox_loc_perm 0=3
Flatten fire11_mbox_loc_flat 1 1 fire11_mbox_loc_perm fire11_mbox_loc_flat
Convolution fire11_mbox_conf 1 1 fire11/concat_splitncnn_1 fire11_mbox_conf 0=126 1=3 2=1 3=1 4=1 5=1 6=870912 8=2
Permute fire11_mbox_conf_perm 1 1 fire11_mbox_conf fire11_mbox_conf_perm 0=3
Flatten fire11_mbox_conf_flat 1 1 fire11_mbox_conf_perm fire11_mbox_conf_flat
PriorBox fire11_mbox_priorbox 2 1 fire11/concat_splitncnn_0 data_splitncnn_2 fire11_mbox_priorbox -23300=1,153.000000 -23301=1,207.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=64.000000 12=64.000000 13=0.500000
Convolution conv12_2_mbox_loc 1 1 conv12_2_conv12_2/relu_splitncnn_2 conv12_2_mbox_loc 0=24 1=3 2=1 3=1 4=1 5=1 6=55296 8=2
Permute conv12_2_mbox_loc_perm 1 1 conv12_2_mbox_loc conv12_2_mbox_loc_perm 0=3
Flatten conv12_2_mbox_loc_flat 1 1 conv12_2_mbox_loc_perm conv12_2_mbox_loc_flat
Convolution conv12_2_mbox_conf 1 1 conv12_2_conv12_2/relu_splitncnn_1 conv12_2_mbox_conf 0=126 1=3 2=1 3=1 4=1 5=1 6=290304 8=2
Permute conv12_2_mbox_conf_perm 1 1 conv12_2_mbox_conf conv12_2_mbox_conf_perm 0=3
Flatten conv12_2_mbox_conf_flat 1 1 conv12_2_mbox_conf_perm conv12_2_mbox_conf_flat
PriorBox conv12_2_mbox_priorbox 2 1 conv12_2_conv12_2/relu_splitncnn_0 data_splitncnn_1 conv12_2_mbox_priorbox -23300=1,207.000000 -23301=1,261.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=100.000000 12=100.000000 13=0.500000
Convolution conv13_2_mbox_loc 1 1 conv13_2_conv13_2/relu_splitncnn_2 conv13_2_mbox_loc 0=16 1=3 2=1 3=1 4=1 5=1 6=18432 8=2
Permute conv13_2_mbox_loc_perm 1 1 conv13_2_mbox_loc conv13_2_mbox_loc_perm 0=3
Flatten conv13_2_mbox_loc_flat 1 1 conv13_2_mbox_loc_perm conv13_2_mbox_loc_flat
Convolution conv13_2_mbox_conf 1 1 conv13_2_conv13_2/relu_splitncnn_1 conv13_2_mbox_conf 0=84 1=3 2=1 3=1 4=1 5=1 6=96768 8=2
Permute conv13_2_mbox_conf_perm 1 1 conv13_2_mbox_conf conv13_2_mbox_conf_perm 0=3
Flatten conv13_2_mbox_conf_flat 1 1 conv13_2_mbox_conf_perm conv13_2_mbox_conf_flat
PriorBox conv13_2_mbox_priorbox 2 1 conv13_2_conv13_2/relu_splitncnn_0 data_splitncnn_0 conv13_2_mbox_priorbox -23300=1,261.000000 -23301=1,315.000000 -23302=1,2.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=300.000000 12=300.000000 13=0.500000
Concat mbox_loc 6 1 fire5_mbox_loc_flat fire9_mbox_loc_flat fire10_mbox_loc_flat fire11_mbox_loc_flat conv12_2_mbox_loc_flat conv13_2_mbox_loc_flat mbox_loc 0=0
Concat mbox_conf 6 1 fire5_mbox_conf_flat fire9_mbox_conf_flat fire10_mbox_conf_flat fire11_mbox_conf_flat conv12_2_mbox_conf_flat conv13_2_mbox_conf_flat mbox_conf 0=0
Concat mbox_priorbox 6 1 fire5_mbox_priorbox fire9_mbox_priorbox fire10_mbox_priorbox fire11_mbox_priorbox conv12_2_mbox_priorbox conv13_2_mbox_priorbox mbox_priorbox 0=1
Reshape mbox_conf_reshape 1 1 mbox_conf mbox_conf_reshape 0=21 1=-1 2=0 3=0
Softmax mbox_conf_softmax 1 1 mbox_conf_reshape mbox_conf_softmax 0=1
Flatten mbox_conf_flatten 1 1 mbox_conf_softmax mbox_conf_flatten
DetectionOutput detection_out 3 1 mbox_loc mbox_conf_flatten mbox_priorbox detection_out 0=21 1=0.450000 2=100 3=100 4=0.050000

+ 42
- 0
benchmark/vgg16_int8.param View File

@@ -0,0 +1,42 @@
7767517
40 40
Input data 0 1 data 0=224 1=224 2=3
Convolution conv1_1 1 1 data conv1_1 0=64 1=3 2=1 3=1 4=1 5=1 6=1728 8=2
ReLU relu1_1 1 1 conv1_1 conv1_1_relu1_1
Convolution conv1_2 1 1 conv1_1_relu1_1 conv1_2 0=64 1=3 2=1 3=1 4=1 5=1 6=36864 8=2
ReLU relu1_2 1 1 conv1_2 conv1_2_relu1_2
Pooling pool1 1 1 conv1_2_relu1_2 pool1 0=0 1=2 2=2 3=0 4=0
Convolution conv2_1 1 1 pool1 conv2_1 0=128 1=3 2=1 3=1 4=1 5=1 6=73728 8=2
ReLU relu2_1 1 1 conv2_1 conv2_1_relu2_1
Convolution conv2_2 1 1 conv2_1_relu2_1 conv2_2 0=128 1=3 2=1 3=1 4=1 5=1 6=147456 8=2
ReLU relu2_2 1 1 conv2_2 conv2_2_relu2_2
Pooling pool2 1 1 conv2_2_relu2_2 pool2 0=0 1=2 2=2 3=0 4=0
Convolution conv3_1 1 1 pool2 conv3_1 0=256 1=3 2=1 3=1 4=1 5=1 6=294912 8=2
ReLU relu3_1 1 1 conv3_1 conv3_1_relu3_1
Convolution conv3_2 1 1 conv3_1_relu3_1 conv3_2 0=256 1=3 2=1 3=1 4=1 5=1 6=589824 8=2
ReLU relu3_2 1 1 conv3_2 conv3_2_relu3_2
Convolution conv3_3 1 1 conv3_2_relu3_2 conv3_3 0=256 1=3 2=1 3=1 4=1 5=1 6=589824 8=2
ReLU relu3_3 1 1 conv3_3 conv3_3_relu3_3
Pooling pool3 1 1 conv3_3_relu3_3 pool3 0=0 1=2 2=2 3=0 4=0
Convolution conv4_1 1 1 pool3 conv4_1 0=512 1=3 2=1 3=1 4=1 5=1 6=1179648 8=2
ReLU relu4_1 1 1 conv4_1 conv4_1_relu4_1
Convolution conv4_2 1 1 conv4_1_relu4_1 conv4_2 0=512 1=3 2=1 3=1 4=1 5=1 6=2359296 8=2
ReLU relu4_2 1 1 conv4_2 conv4_2_relu4_2
Convolution conv4_3 1 1 conv4_2_relu4_2 conv4_3 0=512 1=3 2=1 3=1 4=1 5=1 6=2359296 8=2
ReLU relu4_3 1 1 conv4_3 conv4_3_relu4_3
Pooling pool4 1 1 conv4_3_relu4_3 pool4 0=0 1=2 2=2 3=0 4=0
Convolution conv5_1 1 1 pool4 conv5_1 0=512 1=3 2=1 3=1 4=1 5=1 6=2359296 8=2
ReLU relu5_1 1 1 conv5_1 conv5_1_relu5_1
Convolution conv5_2 1 1 conv5_1_relu5_1 conv5_2 0=512 1=3 2=1 3=1 4=1 5=1 6=2359296 8=2
ReLU relu5_2 1 1 conv5_2 conv5_2_relu5_2
Convolution conv5_3 1 1 conv5_2_relu5_2 conv5_3 0=512 1=3 2=1 3=1 4=1 5=1 6=2359296 8=2
ReLU relu5_3 1 1 conv5_3 conv5_3_relu5_3
Pooling pool5 1 1 conv5_3_relu5_3 pool5 0=0 1=2 2=2 3=0 4=0
InnerProduct fc6 1 1 pool5 fc6 0=4096 1=1 2=102760448
ReLU relu6 1 1 fc6 fc6_relu6
Dropout drop6 1 1 fc6_relu6 fc6_drop6
InnerProduct fc7 1 1 fc6_drop6 fc7 0=4096 1=1 2=16777216
ReLU relu7 1 1 fc7 fc7_relu7
Dropout drop7 1 1 fc7_relu7 fc7_drop7
InnerProduct fc8 1 1 fc7_drop7 fc8 0=1000 1=1 2=4096000
Softmax prob 1 1 fc8 prob 0=0

+ 1
- 0
src/CMakeLists.txt View File

@@ -183,6 +183,7 @@ ncnn_add_layer(Yolov3DetectionOutput)
ncnn_add_layer(PSROIPooling)
ncnn_add_layer(ROIAlign OFF)
ncnn_add_layer(Packing)
ncnn_add_layer(Requantize)

# message("SHADER_SPV_HEX_FILES = ${SHADER_SPV_HEX_FILES}")
add_custom_target(generate-spirv DEPENDS ${SHADER_SPV_HEX_FILES})


+ 2
- 2
src/benchmark.cpp View File

@@ -55,14 +55,14 @@ double get_current_time()

// Print one timing row for a layer to stderr: the layer type, the layer name,
// and the elapsed interval (end - start), followed by a " |" separator and a newline.
//
// layer  layer whose type and name strings are printed
// start  interval start, same unit as end
// end    interval end; the difference is printed with an "ms" suffix
void benchmark(const Layer* layer, double start, double end)
{
// NOTE(review): the next two calls differ only in the width of the name column
// (24 vs 30). This looks like the removed and added lines of a diff shown
// together -- confirm that only the 30-wide variant is meant to remain.
fprintf(stderr, "%-24s %-24s %8.2lfms", layer->type.c_str(), layer->name.c_str(), end - start);
fprintf(stderr, "%-24s %-30s %8.2lfms", layer->type.c_str(), layer->name.c_str(), end - start);
// Empty detail column; no feature-map information is printed by this overload.
fprintf(stderr, " |");
fprintf(stderr, "\n");
}

void benchmark(const Layer* layer, const Mat& bottom_blob, Mat& top_blob, double start, double end)
{
fprintf(stderr, "%-24s %-24s %8.2lfms", layer->type.c_str(), layer->name.c_str(), end - start);
fprintf(stderr, "%-24s %-30s %8.2lfms", layer->type.c_str(), layer->name.c_str(), end - start);
fprintf(stderr, " | feature_map: %4d x %-4d inch: %4d outch: %4d", bottom_blob.w, bottom_blob.h, bottom_blob.c, top_blob.c);
if (layer->type == "Convolution")
{


+ 821
- 4101
src/layer/arm/convolution_1x1_int8.h
File diff suppressed because it is too large
View File


+ 2549
- 2119
src/layer/arm/convolution_3x3_int8.h
File diff suppressed because it is too large
View File


+ 35
- 0
src/layer/arm/convolution_5x5_int8.h View File

@@ -0,0 +1,35 @@
// SenseNets is pleased to support the open source community by supporting ncnn available.
//
// Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

// 5x5 int8 convolution with stride 1.
// Forwards the blobs, the kernel and the fixed 5x5 / stride-1 geometry to
// conv_im2col_sgemm_int8_neon, which performs the actual computation.
static void conv5x5s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
{
    // Kernel extent (width, height) and step (horizontal, vertical).
    int kw = 5, kh = 5;
    int sw = 1, sh = 1;

    conv_im2col_sgemm_int8_neon(bottom_blob, top_blob, _kernel, kw, kh, sw, sh, opt);
}

// 5x5 int8 convolution with stride 2.
// Forwards the blobs, the kernel and the fixed 5x5 / stride-2 geometry to
// conv_im2col_sgemm_int8_neon, which performs the actual computation.
static void conv5x5s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
{
    // Kernel extent (width, height) and step (horizontal, vertical).
    int kw = 5, kh = 5;
    int sw = 2, sh = 2;

    conv_im2col_sgemm_int8_neon(bottom_blob, top_blob, _kernel, kw, kh, sw, sh, opt);
}

+ 35
- 0
src/layer/arm/convolution_7x7_int8.h View File

@@ -0,0 +1,35 @@
// SenseNets is pleased to support the open source community by supporting ncnn available.
//
// Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

// 7x7 int8 convolution with stride 1.
// Forwards the blobs, the kernel and the fixed 7x7 / stride-1 geometry to
// conv_im2col_sgemm_int8_neon, which performs the actual computation.
static void conv7x7s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
{
    // Kernel extent (width, height) and step (horizontal, vertical).
    int kw = 7, kh = 7;
    int sw = 1, sh = 1;

    conv_im2col_sgemm_int8_neon(bottom_blob, top_blob, _kernel, kw, kh, sw, sh, opt);
}

// 7x7 int8 convolution with stride 2.
// Forwards the blobs, the kernel and the fixed 7x7 / stride-2 geometry to
// conv_im2col_sgemm_int8_neon, which performs the actual computation.
static void conv7x7s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
{
    // Kernel extent (width, height) and step (horizontal, vertical).
    int kw = 7, kh = 7;
    int sw = 2, sh = 2;

    conv_im2col_sgemm_int8_neon(bottom_blob, top_blob, _kernel, kw, kh, sw, sh, opt);
}

+ 109
- 38
src/layer/arm/convolution_arm.cpp View File

@@ -14,6 +14,8 @@

#include "convolution_arm.h"

#include "benchmark.h"

namespace ncnn {

#include "convolution_1x1.h"
@@ -24,8 +26,11 @@ namespace ncnn {
#include "convolution_7x7.h"

#if __ARM_NEON
#include "convolution_sgemm_int8.h"
#include "convolution_1x1_int8.h"
#include "convolution_3x3_int8.h"
#include "convolution_5x5_int8.h"
#include "convolution_7x7_int8.h"
#endif // __ARM_NEON

DEFINE_LAYER_CREATOR(Convolution_arm)
@@ -66,9 +71,12 @@ int Convolution_arm::load_model(const ModelBin& mb)

if (use_int8_inference)
{
#if __ARM_NEON
#if !__aarch64__
if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
if (use_winograd3x3)
{
int num_input = weight_data_size / 9 / num_output;
conv3x3s1_winograd23_transform_kernel_int8_neon(weight_data, weight_3x3_winograd23_int8_data, num_input, num_output);
}
else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
int num_input = weight_data_size / 9 / num_output;
conv3x3s1_transform_kernel_int8_neon(weight_data, weight_3x3s1_int8_data, num_input, num_output);
@@ -78,16 +86,15 @@ int Convolution_arm::load_model(const ModelBin& mb)
{
int num_input = weight_data_size / 9 / num_output;
conv3x3s2_transform_kernel_int8_neon(weight_data, weight_3x3s2_int8_data, num_input, num_output);
}
}

if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
int num_input = weight_data_size / num_output;
conv1x1s1_sgemm_transform_kernel_int8_neon(weight_data, weight_1x1s1_sgemm_int8_data, num_input, num_output);
use_sgemm1x1 = true;
}
#endif // !__aarch64__
#endif // __ARM_NEON
}
return 0;
}

@@ -233,7 +240,8 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
}

const int kernel_size = kernel_w;
const int stride = stride_w;
//const int stride = stride_w;
int stride = stride_w;

if (kernel_size > 7 || stride > 4 || dilation_w != dilation_h)
{
@@ -293,43 +301,50 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option

#if __ARM_NEON
// kernel_size x stride
conv_int8_func conv_int8_func_table[5][5] =
conv_int8_func conv_int8_func_table[7][4] =
{
{
conv1x1s1_int8_neon,
conv1x1s2_int8_neon,
0,
0,
0
}, // kernel_size = 1
{
0,
0,
0,
0,
0
}, // kernel_size = 2
{
conv3x3s1_int8_neon,
conv3x3s2_int8_neon,
0,
0,
0
}, // kernel_size = 3
{
0,
0,
0,
0,
0
}, // kernel_size = 4
{
conv5x5s1_int8_neon,
conv5x5s2_int8_neon,
0,
0
}, // kernel_size = 5
{
0,
0,
0,
0
} // kernel_size = 5
}, // kernel_size = 6
{
conv7x7s1_int8_neon,
conv7x7s2_int8_neon,
0,
0
} // kernel_size = 7
};
#endif // __ARM_NEON

@@ -384,9 +399,9 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
opt_g.blob_allocator = bottom_blob_int8.allocator;

quantize->forward(bottom_blob, bottom_blob_int8, opt_g);
}
}

bottom_blob_unbordered = bottom_blob_int8;
bottom_blob_unbordered = bottom_blob_int8;
}

Mat bottom_blob_bordered = bottom_blob_unbordered;
@@ -423,34 +438,90 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option

if (use_int8_inference)
{
#if __ARM_NEON
#if !__aarch64__
if (use_sgemm1x1)
{
conv1x1s1_sgemm_int8_neon(bottom_blob_bordered, top_blob, weight_1x1s1_sgemm_int8_data, opt);
}
else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
if (use_int8_requantize == true)
{
conv3x3s1_packed_int8_neon(bottom_blob_bordered, top_blob, weight_3x3s1_int8_data, opt);
Mat top_blob_tm;
top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator);
if (top_blob_tm.empty())
return -100;
top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator);
if (top_blob.empty())
return -100;

if (use_sgemm1x1)
{
conv1x1s1_sgemm_int8_neon(bottom_blob_bordered, top_blob_tm, weight_1x1s1_sgemm_int8_data, opt);
}
else if (use_winograd3x3)
{
conv3x3s1_winograd23_int8_neon(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_int8_data, opt);
}
else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv3x3s1_packed_int8_neon(bottom_blob_bordered, top_blob_tm, weight_3x3s1_int8_data, opt);
}
else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
conv3x3s2_packed_int8_neon(bottom_blob_bordered, top_blob_tm, weight_3x3s2_int8_data, opt);
}
else
{
conv_int8(bottom_blob_bordered, top_blob_tm, weight_data, opt);
}

// requantize, reverse scale inplace
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<num_output; p++)
{
ncnn::Option opt_g = opt;
opt_g.num_threads = 1;
opt_g.blob_allocator = top_blob.allocator;

Mat top_blob_tm_g = top_blob_tm.channel_range(p, 1);
Mat top_blob_g = top_blob.channel_range(p, 1);
requantize_ops[p]->forward(top_blob_tm_g, top_blob_g, opt_g);
}
}
else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
conv3x3s2_packed_int8_neon(bottom_blob_bordered, top_blob, weight_3x3s2_int8_data, opt);
}
else
#endif // !__aarch64__
#endif // __ARM_NEON
{
conv_int8(bottom_blob_bordered, top_blob, weight_data, opt);
}
top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator);
if (top_blob.empty())
return -100;

// dequantize, reverse scale inplace
{
ncnn::Option opt_g = opt;
opt_g.blob_allocator = top_blob.allocator;
if (use_sgemm1x1)
{
conv1x1s1_sgemm_int8_neon(bottom_blob_bordered, top_blob, weight_1x1s1_sgemm_int8_data, opt);
}
else if (use_winograd3x3)
{
conv3x3s1_winograd23_int8_neon(bottom_blob_bordered, top_blob, weight_3x3_winograd23_int8_data, opt);
}
else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv3x3s1_packed_int8_neon(bottom_blob_bordered, top_blob, weight_3x3s1_int8_data, opt);
}
else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
conv3x3s2_packed_int8_neon(bottom_blob_bordered, top_blob, weight_3x3s2_int8_data, opt);
}
else
{
conv_int8(bottom_blob_bordered, top_blob, weight_data, opt);
}

dequantize->forward_inplace(top_blob, opt_g);
}
// dequantize, reverse scale inplace
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<num_output; p++)
{
ncnn::Option opt_g = opt;
opt_g.num_threads = 1;
opt_g.blob_allocator = top_blob.allocator;

Mat top_blob_g = top_blob.channel_range(p, 1);
dequantize_ops[p]->forward_inplace(top_blob_g, opt_g);
}
}

return 0;
}


+ 2
- 0
src/layer/arm/convolution_arm.h View File

@@ -40,6 +40,8 @@ public:
Mat weight_3x3s1_int8_data;
Mat weight_3x3s2_int8_data;
Mat weight_1x1s1_sgemm_int8_data;
Mat weight_3x3_winograd23_data;
std::vector<Mat> weight_3x3_winograd23_int8_data;
};

} // namespace ncnn


+ 1598
- 0
src/layer/arm/convolution_sgemm_int8.h
File diff suppressed because it is too large
View File


+ 0
- 343
src/layer/arm/convolutiondepthwise_3x3_int8.h View File

@@ -16,347 +16,6 @@
#include <arm_neon.h>
#endif // __ARM_NEON

#if __aarch64__
// Multiply eight int8 inputs by a broadcast int8 weight and add the widened
// products to a pair of int32x4 accumulators (lanes 0-3 and lanes 4-7).
// A single int8*int8 product fits in int16, but the sum of the nine taps of a
// 3x3 kernel does not (9 * 128 * 128 > 32767), so accumulation is done in int32.
static inline void convdw3x3s1_int8_mac_neon(int32x4_t& _acc_lo, int32x4_t& _acc_hi, int8x8_t _v, int8x8_t _k)
{
    int16x8_t _prod = vmull_s8(_v, _k);
    _acc_lo = vaddw_s16(_acc_lo, vget_low_s16(_prod));
    _acc_hi = vaddw_s16(_acc_hi, vget_high_s16(_prod));
}

// Depthwise 3x3 convolution, stride 1, int8 input and int8 weights, int32 output.
//
// bottom_blob : int8 input, one channel per output channel (read via channel(p)).
// top_blob    : int32 output; its w/h/c are read here, so it must already be
//               created by the caller.
// _kernel     : int8 weights, 9 consecutive values per channel.
// opt         : only opt.num_threads is used (OpenMP over channels).
//
// Two output rows are produced per iteration while at least two remain, then a
// single-row loop handles the last row. Within a row, 8 outputs are computed
// per NEON step and the remaining (outw & 7) outputs with scalar code.
//
// The vector path previously accumulated all nine products in int16 lanes,
// which wraps for large activations/weights and disagrees with the scalar
// tail; it now accumulates in int32 like the scalar tail.
static void convdw3x3s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
{
    int w = bottom_blob.w;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out = top_blob.channel(p);

        const signed char* kernel = (const signed char *)_kernel + p*9;
        int* outptr0 = out;
        int* outptr0n = outptr0 + outw;
        const signed char* img0 = bottom_blob.channel(p);
        const signed char* r0 = img0;
        const signed char* r1 = img0 + w;
        const signed char* r2 = img0 + w*2;
        const signed char* r3 = img0 + w*3;

        int i = 0;

        // broadcast each of the nine weights across a vector
        int8x8_t _k0 = vdup_n_s8(kernel[0]);
        int8x8_t _k1 = vdup_n_s8(kernel[1]);
        int8x8_t _k2 = vdup_n_s8(kernel[2]);

        int8x8_t _k3 = vdup_n_s8(kernel[3]);
        int8x8_t _k4 = vdup_n_s8(kernel[4]);
        int8x8_t _k5 = vdup_n_s8(kernel[5]);

        int8x8_t _k6 = vdup_n_s8(kernel[6]);
        int8x8_t _k7 = vdup_n_s8(kernel[7]);
        int8x8_t _k8 = vdup_n_s8(kernel[8]);

        // two output rows at a time: rows r0..r2 feed row i, rows r1..r3 feed row i+1
        for (; i+1 < outh; i+=2)
        {
            int nn = outw >> 3;
            int remain = outw & 7;

            for (; nn > 0; nn--)
            {
                // NOTE(review): 16 bytes are loaded per input row although only 10
                // are needed for 8 outputs; confirm the input buffer tolerates this.
                int8x8_t _r0 = vld1_s8(r0);
                int8x8_t _r0n = vld1_s8(r0+8);
                int8x8_t _r01 = vext_s8(_r0, _r0n, 1);
                int8x8_t _r02 = vext_s8(_r0, _r0n, 2);

                int8x8_t _r1 = vld1_s8(r1);
                int8x8_t _r1n = vld1_s8(r1+8);
                int8x8_t _r11 = vext_s8(_r1, _r1n, 1);
                int8x8_t _r12 = vext_s8(_r1, _r1n, 2);

                int8x8_t _r2 = vld1_s8(r2);
                int8x8_t _r2n = vld1_s8(r2+8);
                int8x8_t _r21 = vext_s8(_r2, _r2n, 1);
                int8x8_t _r22 = vext_s8(_r2, _r2n, 2);

                int8x8_t _r3 = vld1_s8(r3);
                int8x8_t _r3n = vld1_s8(r3+8);
                int8x8_t _r31 = vext_s8(_r3, _r3n, 1);
                int8x8_t _r32 = vext_s8(_r3, _r3n, 2);

                int32x4_t _sum0_lo = vdupq_n_s32(0);
                int32x4_t _sum0_hi = vdupq_n_s32(0);
                int32x4_t _sum1_lo = vdupq_n_s32(0);
                int32x4_t _sum1_hi = vdupq_n_s32(0);

                // output row i
                convdw3x3s1_int8_mac_neon(_sum0_lo, _sum0_hi, _r0, _k0);
                convdw3x3s1_int8_mac_neon(_sum0_lo, _sum0_hi, _r01, _k1);
                convdw3x3s1_int8_mac_neon(_sum0_lo, _sum0_hi, _r02, _k2);
                convdw3x3s1_int8_mac_neon(_sum0_lo, _sum0_hi, _r1, _k3);
                convdw3x3s1_int8_mac_neon(_sum0_lo, _sum0_hi, _r11, _k4);
                convdw3x3s1_int8_mac_neon(_sum0_lo, _sum0_hi, _r12, _k5);
                convdw3x3s1_int8_mac_neon(_sum0_lo, _sum0_hi, _r2, _k6);
                convdw3x3s1_int8_mac_neon(_sum0_lo, _sum0_hi, _r21, _k7);
                convdw3x3s1_int8_mac_neon(_sum0_lo, _sum0_hi, _r22, _k8);

                // output row i+1
                convdw3x3s1_int8_mac_neon(_sum1_lo, _sum1_hi, _r1, _k0);
                convdw3x3s1_int8_mac_neon(_sum1_lo, _sum1_hi, _r11, _k1);
                convdw3x3s1_int8_mac_neon(_sum1_lo, _sum1_hi, _r12, _k2);
                convdw3x3s1_int8_mac_neon(_sum1_lo, _sum1_hi, _r2, _k3);
                convdw3x3s1_int8_mac_neon(_sum1_lo, _sum1_hi, _r21, _k4);
                convdw3x3s1_int8_mac_neon(_sum1_lo, _sum1_hi, _r22, _k5);
                convdw3x3s1_int8_mac_neon(_sum1_lo, _sum1_hi, _r3, _k6);
                convdw3x3s1_int8_mac_neon(_sum1_lo, _sum1_hi, _r31, _k7);
                convdw3x3s1_int8_mac_neon(_sum1_lo, _sum1_hi, _r32, _k8);

                vst1q_s32(outptr0, _sum0_lo);
                vst1q_s32(outptr0+4, _sum0_hi);

                vst1q_s32(outptr0n, _sum1_lo);
                vst1q_s32(outptr0n+4, _sum1_hi);

                r0 += 8;
                r1 += 8;
                r2 += 8;
                r3 += 8;
                outptr0 += 8;
                outptr0n += 8;
            }

            // scalar tail for the last (outw & 7) columns of both rows
            for (; remain>0; remain--)
            {
                int sum0 = 0;
                int sum0n = 0;

                sum0 += (int)r0[0] * kernel[0];
                sum0 += (int)r0[1] * kernel[1];
                sum0 += (int)r0[2] * kernel[2];
                sum0 += (int)r1[0] * kernel[3];
                sum0 += (int)r1[1] * kernel[4];
                sum0 += (int)r1[2] * kernel[5];
                sum0 += (int)r2[0] * kernel[6];
                sum0 += (int)r2[1] * kernel[7];
                sum0 += (int)r2[2] * kernel[8];

                sum0n += (int)r1[0] * kernel[0];
                sum0n += (int)r1[1] * kernel[1];
                sum0n += (int)r1[2] * kernel[2];
                sum0n += (int)r2[0] * kernel[3];
                sum0n += (int)r2[1] * kernel[4];
                sum0n += (int)r2[2] * kernel[5];
                sum0n += (int)r3[0] * kernel[6];
                sum0n += (int)r3[1] * kernel[7];
                sum0n += (int)r3[2] * kernel[8];

                *outptr0 = sum0;
                *outptr0n = sum0n;

                r0++;
                r1++;
                r2++;
                r3++;
                outptr0++;
                outptr0n++;
            }

            // skip the 2 trailing columns of this row plus one whole input row,
            // because two output rows were consumed
            r0 += 2 + w;
            r1 += 2 + w;
            r2 += 2 + w;
            r3 += 2 + w;

            outptr0 += outw;
            outptr0n += outw;
        }

        // last output row when outh is odd
        for (; i < outh; i++)
        {
            int nn = outw >> 3;
            int remain = outw & 7;

            for (; nn > 0; nn--)
            {
                int8x8_t _r0 = vld1_s8(r0);
                int8x8_t _r0n = vld1_s8(r0+8);
                int8x8_t _r01 = vext_s8(_r0, _r0n, 1);
                int8x8_t _r02 = vext_s8(_r0, _r0n, 2);

                int8x8_t _r1 = vld1_s8(r1);
                int8x8_t _r1n = vld1_s8(r1+8);
                int8x8_t _r11 = vext_s8(_r1, _r1n, 1);
                int8x8_t _r12 = vext_s8(_r1, _r1n, 2);

                int8x8_t _r2 = vld1_s8(r2);
                int8x8_t _r2n = vld1_s8(r2+8);
                int8x8_t _r21 = vext_s8(_r2, _r2n, 1);
                int8x8_t _r22 = vext_s8(_r2, _r2n, 2);

                int32x4_t _sum0_lo = vdupq_n_s32(0);
                int32x4_t _sum0_hi = vdupq_n_s32(0);

                convdw3x3s1_int8_mac_neon(_sum0_lo, _sum0_hi, _r0, _k0);
                convdw3x3s1_int8_mac_neon(_sum0_lo, _sum0_hi, _r01, _k1);
                convdw3x3s1_int8_mac_neon(_sum0_lo, _sum0_hi, _r02, _k2);
                convdw3x3s1_int8_mac_neon(_sum0_lo, _sum0_hi, _r1, _k3);
                convdw3x3s1_int8_mac_neon(_sum0_lo, _sum0_hi, _r11, _k4);
                convdw3x3s1_int8_mac_neon(_sum0_lo, _sum0_hi, _r12, _k5);
                convdw3x3s1_int8_mac_neon(_sum0_lo, _sum0_hi, _r2, _k6);
                convdw3x3s1_int8_mac_neon(_sum0_lo, _sum0_hi, _r21, _k7);
                convdw3x3s1_int8_mac_neon(_sum0_lo, _sum0_hi, _r22, _k8);

                vst1q_s32(outptr0, _sum0_lo);
                vst1q_s32(outptr0+4, _sum0_hi);

                r0 += 8;
                r1 += 8;
                r2 += 8;
                outptr0 += 8;
            }

            for (; remain>0; remain--)
            {
                int sum = 0;

                sum += (int)r0[0] * kernel[0];
                sum += (int)r0[1] * kernel[1];
                sum += (int)r0[2] * kernel[2];
                sum += (int)r1[0] * kernel[3];
                sum += (int)r1[1] * kernel[4];
                sum += (int)r1[2] * kernel[5];
                sum += (int)r2[0] * kernel[6];
                sum += (int)r2[1] * kernel[7];
                sum += (int)r2[2] * kernel[8];

                *outptr0 = sum;

                r0++;
                r1++;
                r2++;
                outptr0++;
            }

            r0 += 2;
            r1 += 2;
            r2 += 2;
        }
    }
}

// Multiply eight int8 inputs by a broadcast int8 weight and add the widened
// products to a pair of int32x4 accumulators (lanes 0-3 and lanes 4-7).
// A single int8*int8 product fits in int16, but the sum of the nine taps of a
// 3x3 kernel does not (9 * 128 * 128 > 32767), so accumulation is done in int32.
static inline void convdw3x3s2_int8_mac_neon(int32x4_t& _acc_lo, int32x4_t& _acc_hi, int8x8_t _v, int8x8_t _k)
{
    int16x8_t _prod = vmull_s8(_v, _k);
    _acc_lo = vaddw_s16(_acc_lo, vget_low_s16(_prod));
    _acc_hi = vaddw_s16(_acc_hi, vget_high_s16(_prod));
}

// Depthwise 3x3 convolution, stride 2, int8 input and int8 weights, int32 output.
//
// bottom_blob : int8 input, one channel per output channel (read via channel(p)).
// top_blob    : int32 output; its w/h/c are read here, so it must already be
//               created by the caller.
// _kernel     : int8 weights, 9 consecutive values per channel.
// opt         : only opt.num_threads is used (OpenMP over channels).
//
// Each output row is produced 8 columns at a time with NEON (de-interleaving
// loads split even/odd input columns), then the remaining (outw & 7) columns
// with scalar code.
//
// The vector path previously accumulated all nine products in int16 lanes,
// which wraps for large activations/weights and disagrees with the scalar
// tail; it now accumulates in int32 like the scalar tail.
static void convdw3x3s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
{
    int w = bottom_blob.w;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    // after a row, skip the unread rest of this input row plus the next row
    const int tailstep = w - 2*outw + w;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p=0; p<outch; p++)
    {
        Mat out = top_blob.channel(p);

        const signed char* kernel = (const signed char*)_kernel + p*9;

        int* outptr = out;

        const signed char* img = bottom_blob.channel(p);

        const signed char* r0 = img;
        const signed char* r1 = img + w;
        const signed char* r2 = img + w*2;

        int i = 0;

        // broadcast each of the nine weights across a vector
        int8x8_t _k0 = vdup_n_s8(kernel[0]);
        int8x8_t _k1 = vdup_n_s8(kernel[1]);
        int8x8_t _k2 = vdup_n_s8(kernel[2]);
        int8x8_t _k3 = vdup_n_s8(kernel[3]);
        int8x8_t _k4 = vdup_n_s8(kernel[4]);
        int8x8_t _k5 = vdup_n_s8(kernel[5]);
        int8x8_t _k6 = vdup_n_s8(kernel[6]);
        int8x8_t _k7 = vdup_n_s8(kernel[7]);
        int8x8_t _k8 = vdup_n_s8(kernel[8]);

        for (; i < outh; i++)
        {
            int nn = outw >> 3;
            int remain = outw & 7;

            for (; nn > 0; nn--)
            {
                // vld2 de-interleaves: val[0] = columns 0,2,4,..., val[1] = columns 1,3,5,...
                // NOTE(review): 32 bytes are loaded per input row although only 17
                // are needed for 8 outputs; confirm the input buffer tolerates this.
                int8x8x2_t _r0 = vld2_s8(r0);
                int8x8x2_t _r0n = vld2_s8(r0+16);
                int8x8_t _r00 = _r0.val[0];
                int8x8_t _r01 = _r0.val[1];
                int8x8_t _r02 = vext_s8(_r00, _r0n.val[0], 1);

                int8x8x2_t _r1 = vld2_s8(r1);
                int8x8x2_t _r1n = vld2_s8(r1+16);
                int8x8_t _r10 = _r1.val[0];
                int8x8_t _r11 = _r1.val[1];
                int8x8_t _r12 = vext_s8(_r10, _r1n.val[0], 1);

                int8x8x2_t _r2 = vld2_s8(r2);
                int8x8x2_t _r2n = vld2_s8(r2+16);
                int8x8_t _r20 = _r2.val[0];
                int8x8_t _r21 = _r2.val[1];
                int8x8_t _r22 = vext_s8(_r20, _r2n.val[0], 1);

                int32x4_t _sum_lo = vdupq_n_s32(0);
                int32x4_t _sum_hi = vdupq_n_s32(0);

                convdw3x3s2_int8_mac_neon(_sum_lo, _sum_hi, _r00, _k0);
                convdw3x3s2_int8_mac_neon(_sum_lo, _sum_hi, _r01, _k1);
                convdw3x3s2_int8_mac_neon(_sum_lo, _sum_hi, _r02, _k2);
                convdw3x3s2_int8_mac_neon(_sum_lo, _sum_hi, _r10, _k3);
                convdw3x3s2_int8_mac_neon(_sum_lo, _sum_hi, _r11, _k4);
                convdw3x3s2_int8_mac_neon(_sum_lo, _sum_hi, _r12, _k5);
                convdw3x3s2_int8_mac_neon(_sum_lo, _sum_hi, _r20, _k6);
                convdw3x3s2_int8_mac_neon(_sum_lo, _sum_hi, _r21, _k7);
                convdw3x3s2_int8_mac_neon(_sum_lo, _sum_hi, _r22, _k8);

                vst1q_s32(outptr, _sum_lo);
                vst1q_s32(outptr+4, _sum_hi);

                r0 += 16;
                r1 += 16;
                r2 += 16;
                outptr += 8;
            }

            // scalar tail for the last (outw & 7) columns
            for (; remain>0; remain--)
            {
                int sum = 0;
                sum += (int)r0[0] * kernel[0];
                sum += (int)r0[1] * kernel[1];
                sum += (int)r0[2] * kernel[2];
                sum += (int)r1[0] * kernel[3];
                sum += (int)r1[1] * kernel[4];
                sum += (int)r1[2] * kernel[5];
                sum += (int)r2[0] * kernel[6];
                sum += (int)r2[1] * kernel[7];
                sum += (int)r2[2] * kernel[8];

                *outptr = sum;

                r0 += 2;
                r1 += 2;
                r2 += 2;
                outptr++;
            }

            r0 += tailstep;
            r1 += tailstep;
            r2 += tailstep;
        }
    }
}
#else // __aarch64__
static void convdw3x3s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
{
int w = bottom_blob.w;
@@ -824,5 +483,3 @@ static void convdw3x3s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const M
}
}
}

#endif

+ 63
- 19
src/layer/arm/convolutiondepthwise_arm.cpp View File

@@ -13,7 +13,7 @@
// specific language governing permissions and limitations under the License.

#include "convolutiondepthwise_arm.h"
#include "benchmark.h"
#ifdef _OPENMP
#include <omp.h>
#endif
@@ -147,6 +147,8 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con
Mat bottom_blob_unbordered = bottom_blob;
if (use_int8_inference && elemsize != 1)
{
// start = ncnn::get_current_time();

Mat bottom_blob_int8;
bottom_blob_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator);
if (bottom_blob_int8.empty())
@@ -167,8 +169,8 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con
quantize_ops[g]->forward(bottom_blob_g, bottom_blob_int8_g, opt_g);
}

bottom_blob_unbordered = bottom_blob_int8;
}
bottom_blob_unbordered = bottom_blob_int8;
}

Mat bottom_blob_bordered = bottom_blob_unbordered;
if (pad_w > 0 || pad_h > 0)
@@ -211,25 +213,67 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con
{
if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2))
{
if (stride_w == 1 && stride_h == 1)
if (use_int8_requantize)
{
convdw3x3s1_int8_neon(bottom_blob_bordered, top_blob, weight_data, opt);
Mat top_blob_tm;
top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator);
if (top_blob_tm.empty())
return -100;
top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator);
if (top_blob.empty())
return -100;

if (stride_w == 1 && stride_h == 1)
{
convdw3x3s1_int8_neon(bottom_blob_bordered, top_blob_tm, weight_data, opt);
}
else if (stride_w == 2 && stride_h == 2)
{
convdw3x3s2_int8_neon(bottom_blob_bordered, top_blob_tm, weight_data, opt);
}

// requantize, reverse scale inplace
#pragma omp parallel for num_threads(opt.num_threads)
for (int g=0; g<group; g++)
{
ncnn::Option opt_g = opt;
opt_g.num_threads = 1;
opt_g.blob_allocator = top_blob.allocator;

Mat top_blob_tm_g = top_blob_tm.channel_range(g, 1);
Mat top_blob_g = top_blob.channel_range(g, 1);
requantize_ops[g]->forward(top_blob_tm_g, top_blob_g, opt_g);
}
}
else if (stride_w == 2 && stride_h == 2)
else
{
convdw3x3s2_int8_neon(bottom_blob_bordered, top_blob, weight_data, opt);
}

// dequantize, reverse scale inplace
#pragma omp parallel for num_threads(opt.num_threads)
for (int g=0; g<group; g++)
{
ncnn::Option opt_g = opt;
opt_g.num_threads = 1;
opt_g.blob_allocator = top_blob.allocator;

Mat top_blob_g = top_blob.channel_range(g, 1);
dequantize_ops[g]->forward_inplace(top_blob_g, opt_g);
// start = ncnn::get_current_time();

top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator);
if (top_blob.empty())
return -100;

if (stride_w == 1 && stride_h == 1)
{
convdw3x3s1_int8_neon(bottom_blob_bordered, top_blob, weight_data, opt);
}
else if (stride_w == 2 && stride_h == 2)
{
convdw3x3s2_int8_neon(bottom_blob_bordered, top_blob, weight_data, opt);
}

// dequantize, reverse scale inplace
#pragma omp parallel for num_threads(opt.num_threads)
for (int g=0; g<group; g++)
{
ncnn::Option opt_g = opt;
opt_g.num_threads = 1;
opt_g.blob_allocator = top_blob.allocator;

Mat top_blob_g = top_blob.channel_range(g, 1);
dequantize_ops[g]->forward_inplace(top_blob_g, opt_g);
}
}

return 0;


+ 0
- 22
src/layer/arm/quantize_arm.cpp View File

@@ -31,19 +31,6 @@ static inline signed char float2int8(float v)

int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
#if !__aarch64__ && __ARM_NEON
int FPSCR_value = 0;

asm volatile(
"vmrs %0, FPSCR \n"
"bic r10, %0, #0x00c00000 \n"
"vmsr FPSCR, r10 \n"
: "=r"(FPSCR_value)
:
: "memory", "r10"
);
#endif

int dims = bottom_blob.dims;

if (dims == 1)
@@ -200,15 +187,6 @@ int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o
}
}

#if !__aarch64__ && __ARM_NEON
asm volatile(
"vmsr FPSCR, %0 \n"
:
: "r"(FPSCR_value)
: "memory"
);
#endif

return 0;
}



+ 84
- 0
src/layer/arm/relu_arm.cpp View File

@@ -22,8 +22,92 @@ namespace ncnn {

DEFINE_LAYER_CREATOR(ReLU_arm)

// In-place ReLU for int8 data: every negative element of bottom_top_blob is
// replaced with 0. Channels are processed in parallel with OpenMP using
// opt.num_threads threads.
//
// Only slope == 0 is implemented. For any other slope the blob is left
// untouched and 0 is still returned (see the TODO in the else branch).
//
// Always returns 0.
int ReLU_arm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
// number of elements handled per channel
int size = w * h;

if (slope == 0.f)
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
signed char* ptr = bottom_top_blob.channel(q);

#if __ARM_NEON
// nn = number of full 16-element vectors, remain = leftover elements
int nn = size >> 4;
int remain = size - (nn << 4);
#else
int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
// max(x, 0) on 16 int8 lanes per iteration
int8x16_t _zero = vdupq_n_s8(0);
for (; nn>0; nn--)
{
int8x16_t _p = vld1q_s8(ptr);
_p = vmaxq_s8(_p, _zero);
vst1q_s8(ptr, _p);

ptr += 16;
}
#else
if (nn > 0)
{
// ARMv7 version of the loop above: q1 is zeroed once, then each iteration
// loads 16 bytes, takes the signed max with q1, and stores back with
// post-increment of the pointer. nn and ptr are updated by the asm.
// NOTE(review): the ":128" qualifier asserts a 16-byte aligned address;
// this assumes every channel pointer is 16-byte aligned - confirm.
asm volatile(
"veor q1, q0, q0 \n"
"0: \n"
"pld [%1, #128] \n"
"vld1.s8 {d0-d1}, [%1 :128] \n"
"vmax.s8 q0, q0, q1 \n"
"subs %0, #1 \n"
"vst1.s8 {d0-d1}, [%1 :128]! \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(ptr) // %1
: "0"(nn),
"1"(ptr)
: "cc", "memory", "q0", "q1"
);
}
#endif // __aarch64__
#endif // __ARM_NEON
// scalar tail (or the whole channel when NEON is not available)
for (; remain>0; remain--)
{
if (*ptr < 0)
*ptr = 0;

ptr++;
}
}
}
else
{
// Leaky ReLU (slope != 0) is not implemented for int8: nothing is done here.
// TODO
// #pragma omp parallel for num_threads(opt.num_threads)
// for (int q=0; q<channels; q++)
// {
// float* ptr = bottom_top_blob.channel(q);

// for (int i=0; i<size; i++)
// {
// if (ptr[i] < 0)
// ptr[i] *= slope;
// }
// }
}

return 0;
}

int ReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
if (bottom_top_blob.elemsize == 1u)
return ReLU_arm::forward_inplace_int8(bottom_top_blob, opt);

int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;


+ 1
- 0
src/layer/arm/relu_arm.h View File

@@ -23,6 +23,7 @@ class ReLU_arm : public ReLU
{
public:
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
virtual int forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn


+ 325
- 0
src/layer/arm/requantize_arm.cpp View File

@@ -0,0 +1,325 @@
// SenseNets is pleased to support the open source community by supporting ncnn available.
//
// Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "requantize_arm.h"

#include <math.h>

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

namespace ncnn {

DEFINE_LAYER_CREATOR(Requantize_arm)

// Convert a float to int8: round to nearest (halves away from zero, as done by
// round()) and saturate to [-128, 127].
//
// The saturation is applied before the float -> int conversion. Converting a
// floating-point value that does not fit in int (or a NaN) is undefined
// behaviour, so very large magnitudes must never reach the cast.
// NaN is mapped to 0.
static inline signed char float2int8(float v)
{
    if (v != v) return 0; // NaN
    if (v >= 127.f) return 127;
    if (v <= -128.f) return -128;

    // -128 < v < 127 here, so the rounded value always fits in signed char
    return (signed char)(int)round(v);
}

// Requantize an int32 blob to int8.
//
// Each output element is
//     float2int8((in * scale_in + bias) * scale_out)      when bias_term is set
//     float2int8(in * scale_in * scale_out)               otherwise
// and, when fusion_relu is set, negative results are replaced with 0.
//
// bottom_blob : int32 input (1, 2 or 3 dims).
// top_blob    : int8 output; it is written through directly and is not
//               created here, so the caller must have allocated it with the
//               same shape as bottom_blob.
// opt         : only opt.num_threads is used (OpenMP).
//
// The bias is per element (dims 1), per row (dims 2) or per channel (dims 3)
// when bias_data_size > 1, otherwise bias_data[0] is used everywhere.
//
// The dims == 3 path previously ignored fusion_relu, unlike the dims 1 and 2
// paths; it now applies the same clamp after each channel has been converted.
//
// Always returns 0.
int Requantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int dims = bottom_blob.dims;

    if (dims == 1)
    {
        int w = bottom_blob.w;

        const int* intptr = bottom_blob;
        signed char * ptr = top_blob;

        if (bias_term)
        {
            if (bias_data_size > 1)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i=0; i<w; i++)
                {
                    ptr[i] = float2int8(((intptr[i] * scale_in) + bias_data[i]) * scale_out);
                    if (fusion_relu && ptr[i] < 0)
                        ptr[i] = 0;
                }
            }
            else
            {
                float bias = bias_data[0];
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i=0; i<w; i++)
                {
                    ptr[i] = float2int8(((intptr[i] * scale_in) + bias) * scale_out);
                    if (fusion_relu && ptr[i] < 0)
                        ptr[i] = 0;
                }
            }
        }
        else
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i=0; i<w; i++)
            {
                ptr[i] = float2int8(intptr[i] * scale_in * scale_out);
                if (fusion_relu && ptr[i] < 0)
                    ptr[i] = 0;
            }
        }
    }

    if (dims == 2)
    {
        int w = bottom_blob.w;
        int h = bottom_blob.h;

        if (bias_term)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i=0; i<h; i++)
            {
                const int* intptr = bottom_blob.row<const int>(i);
                signed char* ptr = top_blob.row<signed char>(i);

                float bias = bias_data_size > 1 ? bias_data[i] : bias_data[0];

                for (int j=0; j<w; j++)
                {
                    ptr[j] = float2int8(((intptr[j] * scale_in) + bias) * scale_out);
                    if (fusion_relu && ptr[j] < 0)
                        ptr[j] = 0;
                }
            }
        }
        else
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i=0; i<h; i++)
            {
                const int* intptr = bottom_blob.row<const int>(i);
                signed char* ptr = top_blob.row<signed char>(i);

                for (int j=0; j<w; j++)
                {
                    ptr[j] = float2int8(intptr[j] * scale_in * scale_out);
                    if (fusion_relu && ptr[j] < 0)
                        ptr[j] = 0;
                }
            }
        }
    }

    if (dims == 3)
    {
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int channels = bottom_blob.c;
        int size = w * h;

        if (bias_term)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q=0; q<channels; q++)
            {
                const int* intptr = bottom_blob.channel(q);
                signed char* ptr = top_blob.channel(q);

                float bias = bias_data_size > 1 ? bias_data[q] : bias_data[0];

#if __ARM_NEON
                // nn = number of groups of 8 elements, remain = leftover elements
                int nn = size >> 3;
                int remain = size & 7;

#if __aarch64__
                for (; nn>0; nn--)
                {
                    for (int k=0; k<8; k++)
                        ptr[k] = float2int8(((intptr[k] * scale_in) + bias) * scale_out);

                    ptr += 8;
                    intptr += 8;
                }
#else
                if (nn > 0)
                {
                    // ARMv7: 8 values per iteration. The next 8 inputs are loaded before
                    // the branch, so after the last iteration the input pointer has been
                    // advanced one group too far; the trailing "sub %1, #32" rewinds it.
                    // NOTE(review): that final load reads 32 bytes beyond the last full
                    // group - confirm the input allocation always covers it.
                    // NOTE(review): vcvtr rounds with the current FPSCR rounding mode while
                    // the scalar tail uses round(); results may differ on exact halves.
                    asm volatile(
                        "pld [%1, #256] \n"
                        "vld1.s32 {d0-d3}, [%1:128]! \n" //q0-q1 data
                        "vdup.f32 q10, %6 \n" //q10 scale_in
                        "vdup.f32 q11, %7 \n" //q11 scale_out
                        "vdup.f32 q12, %8 \n" //q12 bias
                        "0: \n"
                        // top_s32 -> top_f32
                        "vcvt.f32.s32 q0, q0 \n"
                        "vcvt.f32.s32 q1, q1 \n"
                        // top_f32 = top_f32 * scale_in
                        "vmul.f32 q0, q0, q10 \n"
                        "vmul.f32 q1, q1, q10 \n"
                        // top_f32 = top_f32 + bias
                        "vadd.f32 q0, q0, q12 \n"
                        "vadd.f32 q1, q1, q12 \n"
                        // top_f32 = top_f32 * scale_out
                        "vmul.f32 q0, q0, q11 \n"
                        "vmul.f32 q1, q1, q11 \n"
                        // top_f32 -> top_s32
                        "vcvtr.s32.f32 s0, s0 \n"
                        "vcvtr.s32.f32 s1, s1 \n"
                        "vcvtr.s32.f32 s2, s2 \n"
                        "vcvtr.s32.f32 s3, s3 \n"
                        "vcvtr.s32.f32 s4, s4 \n"
                        "vcvtr.s32.f32 s5, s5 \n"
                        "vcvtr.s32.f32 s6, s6 \n"
                        "vcvtr.s32.f32 s7, s7 \n"
                        // top_s32 -> top_s16
                        "vqmovn.s32 d4, q0 \n"
                        "vqmovn.s32 d5, q1 \n"
                        "pld [%1, #256] \n"
                        "vld1.s32 {d0-d3}, [%1:128]! \n" //q0-q1 data
                        // top_s16 -> top_s8
                        "vqmovn.s16 d4, q2 \n"
                        // save top_s8
                        "vst1.8 {d4}, [%2:64]! \n"
                        "subs %0, #1 \n"
                        "bne 0b \n"
                        "sub %1, #32 \n"
                        : "=r"(nn), // %0
                        "=r"(intptr), // %1
                        "=r"(ptr) // %2
                        : "0"(nn),
                        "1"(intptr),
                        "2"(ptr),
                        "r"(scale_in), // %6
                        "r"(scale_out), // %7
                        "r"(bias) // %8
                        : "cc", "memory", "q0", "q1", "q2", "q10", "q11", "q12"
                    );
                }
#endif // __aarch64__
#else
                int remain = size;
#endif // __ARM_NEON

                for (; remain > 0; remain--)
                {
                    *ptr = float2int8(((*intptr * scale_in) + bias) * scale_out);

                    intptr++;
                    ptr ++;
                }

                // fused ReLU, applied once the whole channel has been converted so that
                // the vector, assembly and scalar paths all behave like dims 1 and 2
                if (fusion_relu)
                {
                    signed char* outptr = top_blob.channel(q);
                    for (int i=0; i<size; i++)
                    {
                        if (outptr[i] < 0)
                            outptr[i] = 0;
                    }
                }
            }
        }
        else
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q=0; q<channels; q++)
            {
                const int* intptr = bottom_blob.channel(q);
                signed char* ptr = top_blob.channel(q);

#if __ARM_NEON
                // nn = number of groups of 8 elements, remain = leftover elements
                int nn = size >> 3;
                int remain = size & 7;

#if __aarch64__
                //TODO
                for (; nn>0; nn--)
                {
                    for (int k=0; k<8; k++)
                        ptr[k] = float2int8(intptr[k] * scale_in * scale_out);

                    ptr += 8;
                    intptr += 8;
                }
#else
                if (nn > 0)
                {
                    // Same software-pipelined loop as the bias branch, without the add.
                    // NOTE(review): the final preload reads 32 bytes beyond the last full
                    // group - confirm the input allocation always covers it.
                    asm volatile(
                        "pld [%1, #256] \n"
                        "vld1.s32 {d0-d3}, [%1:128]! \n" //q0-q1 data
                        "vdup.f32 q10, %6 \n" //q10 scale_in
                        "vdup.f32 q11, %7 \n" //q11 scale_out
                        "0: \n"
                        // top_s32 -> top_f32
                        "vcvt.f32.s32 q0, q0 \n"
                        "vcvt.f32.s32 q1, q1 \n"
                        // top_f32 = top_f32 * scale_in
                        "vmul.f32 q0, q0, q10 \n"
                        "vmul.f32 q1, q1, q10 \n"
                        // top_f32 = top_f32 * scale_out
                        "vmul.f32 q0, q0, q11 \n"
                        "vmul.f32 q1, q1, q11 \n"
                        // top_f32 -> top_s32
                        "vcvtr.s32.f32 s0, s0 \n"
                        "vcvtr.s32.f32 s1, s1 \n"
                        "vcvtr.s32.f32 s2, s2 \n"
                        "vcvtr.s32.f32 s3, s3 \n"
                        "vcvtr.s32.f32 s4, s4 \n"
                        "vcvtr.s32.f32 s5, s5 \n"
                        "vcvtr.s32.f32 s6, s6 \n"
                        "vcvtr.s32.f32 s7, s7 \n"
                        // top_s32 -> top_s16
                        "vqmovn.s32 d4, q0 \n"
                        "vqmovn.s32 d5, q1 \n"
                        "pld [%1, #256] \n"
                        "vld1.s32 {d0-d3}, [%1:128]! \n" //q0-q1 data
                        // top_s16 -> top_s8
                        "vqmovn.s16 d4, q2 \n"
                        // save top_s8
                        "vst1.8 {d4}, [%2:64]! \n"
                        "subs %0, #1 \n"
                        "bne 0b \n"
                        "sub %1, #32 \n"
                        : "=r"(nn), // %0
                        "=r"(intptr), // %1
                        "=r"(ptr) // %2
                        : "0"(nn),
                        "1"(intptr),
                        "2"(ptr),
                        "r"(scale_in), // %6
                        "r"(scale_out) // %7
                        : "cc", "memory", "q0", "q1", "q2", "q10", "q11"
                    );
                }
#endif // __aarch64__
#else
                int remain = size;
#endif // __ARM_NEON

                for (; remain > 0; remain--)
                {
                    *ptr = float2int8(*intptr * scale_in * scale_out);

                    intptr++;
                    ptr ++;
                }

                // fused ReLU, applied once the whole channel has been converted so that
                // the vector, assembly and scalar paths all behave like dims 1 and 2
                if (fusion_relu)
                {
                    signed char* outptr = top_blob.channel(q);
                    for (int i=0; i<size; i++)
                    {
                        if (outptr[i] < 0)
                            outptr[i] = 0;
                    }
                }
            }
        }
    }

    return 0;
}

} // namespace ncnn

+ 30
- 0
src/layer/arm/requantize_arm.h View File

@@ -0,0 +1,30 @@
// SenseNets is pleased to support the open source community by supporting ncnn available.
//
// Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_REQUANTIZE_ARM_H
#define LAYER_REQUANTIZE_ARM_H

#include "requantize.h"

namespace ncnn {

// ARM build of the Requantize layer. It derives from Requantize and declares
// its own forward(); no other members are added here.
class Requantize_arm : public Requantize
{
public:
// Reads bottom_blob, writes the result into top_blob and returns an int status.
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_REQUANTIZE_ARM_H

+ 194
- 47
src/layer/convolution.cpp View File

@@ -25,6 +25,7 @@ Convolution::Convolution()
one_blob_only = true;
support_inplace = false;
support_vulkan = true;
use_int8_requantize = false;

#if NCNN_VULKAN
padding = 0;
@@ -42,7 +43,6 @@ Convolution::Convolution()
#endif // NCNN_VULKAN

quantize = 0;
dequantize = 0;
}

Convolution::~Convolution()
@@ -52,7 +52,14 @@ Convolution::~Convolution()
#endif // NCNN_VULKAN

delete quantize;
delete dequantize;

for (int i=0; i<(int)dequantize_ops.size(); i++)
delete dequantize_ops[i];
dequantize_ops.clear();

for (int i=0; i<(int)requantize_ops.size(); i++)
delete requantize_ops[i];
requantize_ops.clear();
}

int Convolution::load_param(const ParamDict& pd)
@@ -113,10 +120,18 @@ int Convolution::load_model(const ModelBin& mb)

if (int8_scale_term)
{
weight_data_int8_scale = mb.load(1, 1)[0];
weight_data_int8_scales = mb.load(num_output, 1);
bottom_blob_int8_scale = mb.load(1, 1)[0];
}

for (int i=0; i<(int)dequantize_ops.size(); i++)
delete dequantize_ops[i];
dequantize_ops.clear();

for (int i=0; i<(int)requantize_ops.size(); i++)
delete requantize_ops[i];
requantize_ops.clear();

bool weight_data_is_int8 = (weight_data.elemsize == (size_t)1u);
bool weight_data_is_float32 = (weight_data.elemsize == (size_t)4u);

@@ -126,27 +141,39 @@ int Convolution::load_model(const ModelBin& mb)
return -1;
}

// runtime quantize the weight data
if (weight_data_is_float32 && use_int8_inference)
{
// quantize weight to int8
Layer* op = ncnn::create_layer(ncnn::LayerType::Quantize);
Mat int8_weight_data(weight_data_size, (size_t)1u);
if (int8_weight_data.empty())
return -100;

ncnn::ParamDict pd;
pd.set(0, weight_data_int8_scale);// scale
const int weight_data_size_output = weight_data_size / num_output;

for (int n=0; n<num_output; n++)
{
Layer* op = ncnn::create_layer(ncnn::LayerType::Quantize);

op->load_param(pd);
ncnn::ParamDict pd;
pd.set(0, weight_data_int8_scales[n]);// scale

Mat int8_weight_data;
op->forward(weight_data, int8_weight_data);
op->load_param(pd);

delete op;
ncnn::Option opt = ncnn::get_default_option();
opt.blob_allocator = int8_weight_data.allocator;

if (int8_weight_data.empty())
return -100;
const Mat weight_data_n = weight_data.range(weight_data_size_output * n, weight_data_size_output);
Mat int8_weight_data_n = int8_weight_data.range(weight_data_size_output * n, weight_data_size_output);
op->forward(weight_data_n, int8_weight_data_n, opt);

delete op;
}

weight_data = int8_weight_data;
}

// initial the quantize,dequantize op layer
if (use_int8_inference)
{
quantize = ncnn::create_layer(ncnn::LayerType::Quantize);
@@ -157,22 +184,74 @@ int Convolution::load_model(const ModelBin& mb)
quantize->load_param(pd);
}

dequantize = ncnn::create_layer(ncnn::LayerType::Dequantize);
dequantize_ops.resize(num_output);
for (int n=0; n<num_output; n++)
{
float top_rescale = 1.f / (bottom_blob_int8_scale * weight_data_int8_scale);
dequantize_ops[n] = ncnn::create_layer(ncnn::LayerType::Dequantize);

float top_rescale = 1.f;

if (weight_data_int8_scales[n] == 0)
top_rescale = 0;
else
top_rescale = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[n]);

ncnn::ParamDict pd;
pd.set(0, top_rescale);// scale
pd.set(1, bias_term);// bias_term
pd.set(2, num_output);// bias_data_size
pd.set(1, bias_term); // bias_term
pd.set(2, 1); // bias_data_size

dequantize->load_param(pd);
dequantize_ops[n]->load_param(pd);

ncnn::Mat weights[1];
weights[0] = bias_data;
weights[0] = bias_data.range(n, 1);

dequantize->load_model(ModelBinFromMatArray(weights));
dequantize_ops[n]->load_model(ModelBinFromMatArray(weights));
}
}

return 0;
}

int Convolution::create_requantize_op(void)
{
if (!use_int8_requantize)
{
fprintf(stderr, "requantized op set but use_int8_requantize disabled\n");
return -1;
}

requantize_ops.resize(num_output);
for (int n=0; n<num_output; n++)
{
requantize_ops[n] = ncnn::create_layer(ncnn::LayerType::Requantize);

float scale_in = 1.f;
float scale_out = 1.f;

if (weight_data_int8_scales[n] == 0)
{
scale_in = 0;
}
else
{
scale_in = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[n]);
}

scale_out = top_blob_int8_scale;

ncnn::ParamDict pd;
pd.set(0, scale_in); // scale in
pd.set(1, scale_out); // scale_out
pd.set(2, bias_term); // bias_term
pd.set(3, 1); // bias_data_size

requantize_ops[n]->load_param(pd);

ncnn::Mat weights[1];
weights[0] = bias_data.range(n, 1);

requantize_ops[n]->load_model(ModelBinFromMatArray(weights));
}

return 0;
@@ -210,7 +289,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op

if (int8_scale_term)
{
weights[2] = Mat(1, (size_t)4u, (void*)&weight_data_int8_scale);
weights[2] = weight_data_int8_scales;
weights[3] = Mat(1, (size_t)4u, (void*)&bottom_blob_int8_scale);
}

@@ -309,50 +388,118 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op

if (use_int8_inference)
{
// num_output
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<num_output; p++)
if (use_int8_requantize == true)
{
int* outptr = top_blob.channel(p);
Mat top_blob_tm;
top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator);
if (top_blob_tm.empty())
return -100;
top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator);
if (top_blob.empty())
return -100;

for (int i = 0; i < outh; i++)
// num_output
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<num_output; p++)
{
for (int j = 0; j < outw; j++)
{
int sum = 0;
int* outptr = top_blob_tm.channel(p);

const signed char* kptr = (const signed char*)weight_data + maxk * channels * p;

// channels
for (int q=0; q<channels; q++)
for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)
{
const Mat m = bottom_blob_bordered.channel(q);
const signed char* sptr = m.row<signed char>(i*stride_h) + j*stride_w;
int sum = 0;

const signed char* kptr = (const signed char*)weight_data + maxk * channels * p;

for (int k = 0; k < maxk; k++)
// channels
for (int q=0; q<channels; q++)
{
int val = sptr[ space_ofs[k] ];
int w = kptr[k];
sum += val * w;
const Mat m = bottom_blob_bordered.channel(q);
const signed char* sptr = m.row<signed char>(i*stride_h) + j*stride_w;

for (int k = 0; k < maxk; k++)
{
int val = sptr[ space_ofs[k] ];
int w = kptr[k];
sum += val * w;
}

kptr += maxk;
}

kptr += maxk;
outptr[j] = sum;
}

outptr[j] = sum;
outptr += outw;
}

outptr += outw;
// requantize, reverse scale inplace
{
ncnn::Option opt_g = opt;
opt_g.num_threads = 1;
opt_g.blob_allocator = top_blob.allocator;

Mat top_blob_tm_g = top_blob_tm.channel_range(p, 1);
Mat top_blob_g = top_blob.channel_range(p, 1);
requantize_ops[p]->forward(top_blob_tm_g, top_blob_g, opt_g);
}
}
}

// dequantize, reverse scale inplace
else
{
ncnn::Option opt_g = opt;
opt_g.blob_allocator = top_blob.allocator;
top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator);
if (top_blob.empty())
return -100;
// num_output
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<num_output; p++)
{
int* outptr = top_blob.channel(p);

dequantize->forward_inplace(top_blob, opt_g);
}
for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)
{
int sum = 0;

const signed char* kptr = (const signed char*)weight_data + maxk * channels * p;

// channels
for (int q=0; q<channels; q++)
{
const Mat m = bottom_blob_bordered.channel(q);
const signed char* sptr = m.row<signed char>(i*stride_h) + j*stride_w;

for (int k = 0; k < maxk; k++)
{
int val = sptr[ space_ofs[k] ];
int w = kptr[k];
sum += val * w;
}

kptr += maxk;
}

outptr[j] = sum;
}

outptr += outw;
}

// dequantize, reverse scale inplace
{
ncnn::Option opt_g = opt;
opt_g.num_threads = 1;
opt_g.blob_allocator = top_blob.allocator;

Mat top_blob_g = top_blob.channel_range(p, 1);
dequantize_ops[p]->forward_inplace(top_blob_g, opt_g);
}
}
}

return 0;
}


+ 7
- 2
src/layer/convolution.h View File

@@ -29,6 +29,8 @@ public:

virtual int load_model(const ModelBin& mb);

virtual int create_requantize_op(void);

virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

#if NCNN_VULKAN
@@ -91,13 +93,16 @@ public:
Pipeline* pipeline_innerproduct_pack4to1;
#endif // NCNN_VULKAN

float weight_data_int8_scale;
Mat weight_data_int8_scales;
float bottom_blob_int8_scale;
float top_blob_int8_scale;

bool use_int8_inference;
bool use_int8_requantize;

ncnn::Layer* quantize;
ncnn::Layer* dequantize;
std::vector<ncnn::Layer*> dequantize_ops;
std::vector<ncnn::Layer*> requantize_ops;
};

} // namespace ncnn


+ 65
- 2
src/layer/convolutiondepthwise.cpp View File

@@ -25,6 +25,7 @@ ConvolutionDepthWise::ConvolutionDepthWise()
one_blob_only = true;
support_inplace = false;
support_vulkan = true;
use_int8_requantize = false;

#if NCNN_VULKAN
padding = 0;
@@ -58,6 +59,11 @@ ConvolutionDepthWise::~ConvolutionDepthWise()
delete dequantize_ops[i];

dequantize_ops.clear();

for (int i=0; i<(int)requantize_ops.size(); i++)
delete requantize_ops[i];

requantize_ops.clear();
}

int ConvolutionDepthWise::load_param(const ParamDict& pd)
@@ -150,7 +156,11 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb)
if (int8_scale_term == 1)
{
weight_data_int8_scales = mb.load(group, 1);
bottom_blob_int8_scales = mb.load(group, 1);
bottom_blob_int8_scales = mb.load(1, 1);

float bottom_blob_int8_scale = bottom_blob_int8_scales[0];
bottom_blob_int8_scales = Mat(group);
bottom_blob_int8_scales.fill(bottom_blob_int8_scale);
}
else if (int8_scale_term == 2)
{
@@ -177,6 +187,11 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb)

dequantize_ops.clear();

for (int i=0; i<(int)requantize_ops.size(); i++)
delete requantize_ops[i];

requantize_ops.clear();

bool weight_data_is_int8 = (weight_data.elemsize == (size_t)1u);
bool weight_data_is_float32 = (weight_data.elemsize == (size_t)4u);

@@ -236,7 +251,11 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb)
{
dequantize_ops[g] = ncnn::create_layer(ncnn::LayerType::Dequantize);

float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);
float top_rescale = 1.f;
if (weight_data_int8_scales[g] == 0)
top_rescale = 0;
else
top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

ncnn::ParamDict pd;
pd.set(0, top_rescale);// scale
@@ -255,6 +274,50 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb)
return 0;
}

// Build one Requantize layer per group. Each op is configured with
//   scale_in  = 1 / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g])
//               (0 when the weight scale of the group is 0)
//   scale_out = top_blob_int8_scale
// and with the single bias value belonging to group g.
//
// Returns 0 on success, -1 when use_int8_requantize is not enabled.
int ConvolutionDepthWise::create_requantize_op(void)
{
    if (!use_int8_requantize)
    {
        fprintf(stderr, "requantized op set but use_int8_requantize disabled\n");
        return -1;
    }

    // Release ops left over from an earlier call: resize() keeps the existing
    // pointers and the loop below would overwrite them, leaking the old layers.
    for (int i=0; i<(int)requantize_ops.size(); i++)
        delete requantize_ops[i];

    requantize_ops.clear();

    requantize_ops.resize(group);
    for (int g=0; g<group; g++)
    {
        requantize_ops[g] = ncnn::create_layer(ncnn::LayerType::Requantize);

        float scale_in = 1.f;
        float scale_out = 1.f;

        // a zero weight scale would divide by zero; such a group rescales to 0
        if (weight_data_int8_scales[g] == 0)
        {
            scale_in = 0;
        }
        else
        {
            scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);
        }

        scale_out = top_blob_int8_scale;

        ncnn::ParamDict pd;
        pd.set(0, scale_in);   // scale in
        pd.set(1, scale_out);  // scale_out
        pd.set(2, bias_term);  // bias_term
        pd.set(3, 1);          // bias_data_size

        requantize_ops[g]->load_param(pd);

        // hand the op only the bias element of its own group
        ncnn::Mat weights[1];
        weights[0] = bias_data.range(g, 1);

        requantize_ops[g]->load_model(ModelBinFromMatArray(weights));
    }

    return 0;
}

int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
// convolv with NxN kernel


+ 5
- 0
src/layer/convolutiondepthwise.h View File

@@ -29,6 +29,8 @@ public:

virtual int load_model(const ModelBin& mb);

virtual int create_requantize_op(void);

virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

#if NCNN_VULKAN
@@ -92,11 +94,14 @@ public:

Mat weight_data_int8_scales;
Mat bottom_blob_int8_scales;
float top_blob_int8_scale;

bool use_int8_inference;
bool use_int8_requantize;

std::vector<ncnn::Layer*> quantize_ops;
std::vector<ncnn::Layer*> dequantize_ops;
std::vector<ncnn::Layer*> requantize_ops;
};

} // namespace ncnn


+ 78
- 33
src/layer/innerproduct.cpp View File

@@ -36,7 +36,6 @@ InnerProduct::InnerProduct()
#endif // NCNN_VULKAN

quantize = 0;
dequantize = 0;
}

InnerProduct::~InnerProduct()
@@ -46,7 +45,11 @@ InnerProduct::~InnerProduct()
#endif // NCNN_VULKAN

delete quantize;
delete dequantize;

for (int i=0; i<(int)dequantize_ops.size(); i++)
delete dequantize_ops[i];

dequantize_ops.clear();
}

int InnerProduct::load_param(const ParamDict& pd)
@@ -92,7 +95,7 @@ int InnerProduct::load_model(const ModelBin& mb)

if (int8_scale_term)
{
weight_data_int8_scale = mb.load(1, 1)[0];
weight_data_int8_scales = mb.load(num_output, 1);
bottom_blob_int8_scale = mb.load(1, 1)[0];
}

@@ -105,25 +108,71 @@ int InnerProduct::load_model(const ModelBin& mb)
return -1;
}

// initial the quantize,dequantize op layer
if (use_int8_inference)
{
quantize = ncnn::create_layer(ncnn::LayerType::Quantize);
dequantize = ncnn::create_layer(ncnn::LayerType::Dequantize);
{
ncnn::ParamDict pd;
pd.set(0, bottom_blob_int8_scale);// scale

quantize->load_param(pd);
}

dequantize_ops.resize(num_output);
for (int n=0; n<num_output; n++)
{
dequantize_ops[n] = ncnn::create_layer(ncnn::LayerType::Dequantize);

float top_rescale = 1.f;

if (weight_data_int8_scales[n] == 0)
top_rescale = 0;
else
top_rescale = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[n]);

ncnn::ParamDict pd;
pd.set(0, top_rescale);// scale
pd.set(1, bias_term); // bias_term
pd.set(2, 1); // bias_data_size

dequantize_ops[n]->load_param(pd);

ncnn::Mat weights[1];
weights[0] = bias_data.range(n, 1);

dequantize_ops[n]->load_model(ModelBinFromMatArray(weights));
}
}

// runtime quantize the weight data
if (weight_data_is_float32 && use_int8_inference)
{
// quantize weight to int8
ncnn::ParamDict pd;
pd.set(0, weight_data_int8_scale);// scale
Mat int8_weight_data(weight_data_size, (size_t)1u);
if (int8_weight_data.empty())
return -100;

quantize->load_param(pd);
const int weight_data_size_output = weight_data_size / num_output;

Mat int8_weight_data;
quantize->forward(weight_data, int8_weight_data);
for (int n=0; n<num_output; n++)
{
Layer* op = ncnn::create_layer(ncnn::LayerType::Quantize);

if (int8_weight_data.empty())
return -100;
ncnn::ParamDict pd;
pd.set(0, weight_data_int8_scales[n]);// scale

op->load_param(pd);

ncnn::Option opt = ncnn::get_default_option();
opt.blob_allocator = int8_weight_data.allocator;

const Mat weight_data_n = weight_data.range(weight_data_size_output * n, weight_data_size_output);
Mat int8_weight_data_n = int8_weight_data.range(weight_data_size_output * n, weight_data_size_output);
op->forward(weight_data_n, int8_weight_data_n, opt);

delete op;
}

weight_data = int8_weight_data;
}
@@ -152,12 +201,10 @@ int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o

// quantize, scale and round to nearest
{
ncnn::ParamDict pd;
pd.set(0, bottom_blob_int8_scale);// scale
ncnn::Option opt_g = opt;
opt_g.blob_allocator = bottom_blob_int8.allocator;

quantize->load_param(pd);

quantize->forward(bottom_blob, bottom_blob_int8, opt);
quantize->forward(bottom_blob, bottom_blob_int8, opt_g);
}

// num_output
@@ -179,26 +226,24 @@ int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o
}
}

out[p] = sum;
out[p] = sum;
}

// dequantize, reverse scale inplace
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<num_output; p++)
{
float top_rescale = 1.f / (bottom_blob_int8_scale * weight_data_int8_scale);

ncnn::ParamDict pd;
pd.set(0, top_rescale);// scale
pd.set(1, bias_term);// bias_term
pd.set(2, num_output);// bias_data_size

dequantize->load_param(pd);

ncnn::Mat weights[1];
weights[0] = bias_data;

dequantize->load_model(ModelBinFromMatArray(weights));

dequantize->forward_inplace(top_blob, opt);
int* out_s32 = top_blob;
float* out_f32 = top_blob;
float top_rescale = 1.f;
if (weight_data_int8_scales[p] == 0)
top_rescale = 0;
else
top_rescale = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[p]);

if (bias_term)
out_f32[p] = out_s32[p] * top_rescale + bias_data[p];
else
out_f32[p] = out_s32[p] * top_rescale;
}

return 0;


+ 2
- 2
src/layer/innerproduct.h View File

@@ -76,13 +76,13 @@ public:
Pipeline* pipeline_innerproduct_pack4to1;
#endif // NCNN_VULKAN

float weight_data_int8_scale;
Mat weight_data_int8_scales;
float bottom_blob_int8_scale;

bool use_int8_inference;

ncnn::Layer* quantize;
ncnn::Layer* dequantize;
std::vector<ncnn::Layer*> dequantize_ops;
};

} // namespace ncnn


+ 43
- 0
src/layer/relu.cpp View File

@@ -38,8 +38,51 @@ int ReLU::load_param(const ParamDict& pd)
return 0;
}

// In-place ReLU on a blob holding one signed char per element.
//
// slope == 0 : negative values are replaced by 0.
// slope != 0 : negative values are multiplied by slope, rounded to the nearest
//              integer (ties away from zero) and saturated to [-128, 127].
//              The blob keeps its quantization scale, so scaling the quantized
//              value is the int8 counterpart of the float leaky ReLU.
//
// Always returns 0.
int ReLU::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    if (slope == 0.f)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            signed char* ptr = bottom_top_blob.channel(q);

            for (int i=0; i<size; i++)
            {
                if (ptr[i] < 0)
                    ptr[i] = 0;
            }
        }
    }
    else
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            signed char* ptr = bottom_top_blob.channel(q);

            for (int i=0; i<size; i++)
            {
                if (ptr[i] >= 0)
                    continue;

                // Round half away from zero, then saturate while still in
                // float so the integer conversion never sees a value it
                // cannot represent.
                float v = ptr[i] * slope;
                v = v < 0.f ? v - 0.5f : v + 0.5f;
                if (v > 127.f) v = 127.f;
                if (v < -128.f) v = -128.f;

                ptr[i] = (signed char)(int)v;
            }
        }
    }

    return 0;
}

int ReLU::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
if (bottom_top_blob.elemsize == 1u)
return ReLU::forward_inplace_int8(bottom_top_blob, opt);

int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;


+ 1
- 0
src/layer/relu.h View File

@@ -27,6 +27,7 @@ public:
virtual int load_param(const ParamDict& pd);

virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
virtual int forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const;

#if NCNN_VULKAN
virtual int create_pipeline();


+ 195
- 0
src/layer/requantize.cpp View File

@@ -0,0 +1,195 @@
// SenseNets is pleased to support the open source community by supporting ncnn available.
//
// Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "requantize.h"

#include <math.h>

namespace ncnn {

DEFINE_LAYER_CREATOR(Requantize)

// A Requantize layer has one input and one output blob and cannot run in place.
Requantize::Requantize()
{
    one_blob_only = true;
    support_inplace = false;

    // Start from the same values load_param() falls back to, so that no
    // member is indeterminate if the layer is used before load_param().
    scale_in = 1.f;
    scale_out = 1.f;
    bias_term = 0;
    bias_data_size = 0;
    fusion_relu = false;
}

// Convert a float to int8: round to nearest (ties away from zero) and
// saturate to [-128, 127].
//
// Saturation is decided on the floating-point value, before any conversion to
// an integer type. Converting a float that does not fit in int is undefined
// behaviour (on x86 a huge positive input used to come back as -128).
// NaN is mapped to 0.
static inline signed char float2int8(float v)
{
    if (v != v) return 0;          // NaN compares unequal to itself
    if (v >= 127.f) return 127;
    if (v <= -128.f) return -128;

    // here -128 < v < 127, so the rounded value always fits
    return (signed char)(int)round(v);
}

// Read the layer parameters; every id has a default, so this always returns 0.
//   0: scale_in        (default 1.f)
//   1: scale_out       (default 1.f)
//   2: bias_term       (default 0)
//   3: bias_data_size  (default 0)
//   4: fusion_relu     (default 0, any non-zero value enables it)
int Requantize::load_param(const ParamDict& pd)
{
    const int relu_flag = pd.get(4, 0);

    scale_in = pd.get(0, 1.f);
    scale_out = pd.get(1, 1.f);
    bias_term = pd.get(2, 0);
    bias_data_size = pd.get(3, 0);
    fusion_relu = (relu_flag != 0);

    return 0;
}

// Load bias_data_size bias values when the layer has a bias term.
// Returns 0 on success and -100 when the loaded bias blob is empty.
int Requantize::load_model(const ModelBin& mb)
{
    // a bias-free layer has nothing to read
    if (!bias_term)
        return 0;

    bias_data = mb.load(bias_data_size, 1);

    return bias_data.empty() ? -100 : 0;
}

int Requantize::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
int dims = bottom_blob.dims;

if (dims == 1)
{
int w = bottom_blob.w;

const int* intptr = bottom_blob;
signed char * ptr = top_blob;

if (bias_term)
{
if (bias_data_size > 1)
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int i=0; i<w; i++)
{
ptr[i] = float2int8(((intptr[i] * scale_in) + bias_data[i]) * scale_out);
if (fusion_relu && ptr[i] < 0)
ptr[i] = 0;
}
}
else
{
float bias = bias_data[0];
#pragma omp parallel for num_threads(opt.num_threads)
for (int i=0; i<w; i++)
{
ptr[i] = float2int8(((intptr[i] * scale_in) + bias) * scale_out);
if (fusion_relu && ptr[i] < 0)
ptr[i] = 0;
}
}
}
else
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int i=0; i<w; i++)
{
ptr[i] = float2int8(intptr[i] * scale_in * scale_out);
if (fusion_relu && ptr[i] < 0)
ptr[i] = 0;
}
}
}

if (dims == 2)
{
int w = bottom_blob.w;
int h = bottom_blob.h;

if (bias_term)
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int i=0; i<h; i++)
{
const int* intptr = bottom_blob.row<const int>(i);
signed char* ptr = top_blob.row<signed char>(i);

float bias = bias_data_size > 1 ? bias_data[i] : bias_data[0];

for (int j=0; j<w; j++)
{
ptr[j] = float2int8(((intptr[j] * scale_in) + bias) * scale_out);
if (fusion_relu && ptr[j] < 0)
ptr[j] = 0;
}
}
}
else
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int i=0; i<h; i++)
{
const int* intptr = bottom_blob.row<const int>(i);
signed char* ptr = top_blob.row<signed char>(i);

for (int j=0; j<w; j++)
{
ptr[j] = float2int8(intptr[j] * scale_in * scale_out);
if (fusion_relu && ptr[j] < 0)
ptr[j] = 0;
}
}
}
}

if (dims == 3)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
int size = w * h;

if (bias_term)
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const int* intptr = bottom_blob.channel(q);
signed char* ptr = top_blob.channel(q);

float bias = bias_data_size > 1 ? bias_data[q] : bias_data[0];

for (int i=0; i<size; i++)
{
ptr[i] = float2int8(((intptr[i] * scale_in) + bias) * scale_out);
if (fusion_relu && ptr[i] < 0)
ptr[i] = 0;
}
}
}
else
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const int* intptr = bottom_blob.channel(q);
signed char* ptr = top_blob.channel(q);

for (int i=0; i<size; i++)
{
ptr[i] = float2int8(intptr[i] * scale_in * scale_out);
if (fusion_relu && ptr[i] < 0)
ptr[i] = 0;
}
}
}
}

return 0;
}

} // namespace ncnn

+ 46
- 0
src/layer/requantize.h View File

@@ -0,0 +1,46 @@
// SenseNets is pleased to support the open source community by supporting ncnn available.
//
// Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_REQUANTIZE_H
#define LAYER_REQUANTIZE_H

#include "layer.h"

namespace ncnn {

// Requantize converts an int32 blob to an int8 blob.
// forward() computes, per element,
//     int8 = saturate(round((int32 * scale_in + bias) * scale_out))
// (the bias is omitted when bias_term is 0) and optionally clamps negative
// results to 0 when fusion_relu is set.
class Requantize : public Layer
{
public:
Requantize();

// Reads scale_in (id 0), scale_out (id 1), bias_term (id 2),
// bias_data_size (id 3) and fusion_relu (id 4).
virtual int load_param(const ParamDict& pd);

// Loads bias_data (bias_data_size values) when bias_term is set.
virtual int load_model(const ModelBin& mb);

// bottom_blob holds int32 values, top_blob receives signed char values.
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

public:
float scale_in; // multiplier applied to the int32 input before the bias is added; the convolution layers pass 1 / (bottom_blob_scale * weight_scale)
float scale_out;// multiplier applied after the bias is added; the convolution layers pass the int8 scale of the top blob
int bias_term; // non-zero when a bias is added
int bias_data_size; // number of bias values; 1 means a single bias shared by all elements

bool fusion_relu; // when true, negative outputs are replaced by 0

Mat bias_data; // bias values, only loaded when bias_term is set
};

} // namespace ncnn

#endif // LAYER_REQUANTIZE_H

+ 494
- 0
src/layer/x86/convolution_3x3.h View File

@@ -1,6 +1,7 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
// Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
@@ -138,3 +139,496 @@ static void conv3x3s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _ker
}

}

// Winograd F(2,3) kernel transform for float 3x3 kernels.
//
// `kernel` is read as outch * inch consecutive 3x3 kernels (9 floats each).
// For every (output channel p, input channel q) pair the 4x4 matrix
// U = G * g * G^T is written to kernel_tm.channel(p).row(q) as 16 floats,
// where g is the 3x3 kernel and G the 4x3 matrix below.
static void conv3x3s1_winograd23_transform_kernel_sse(const Mat& kernel, Mat& kernel_tm, int inch, int outch)
{
    kernel_tm.create(4*4, inch, outch);

    // G
    const float G[4][3] = {
        { 1.0f,  0.0f, 0.0f},
        { 0.5f,  0.5f, 0.5f},
        { 0.5f, -0.5f, 0.5f},
        { 0.0f,  0.0f, 1.0f}
    };

    const float* kernel_base = (const float*)kernel;

    #pragma omp parallel for
    for (int p = 0; p<outch; p++)
    {
        for (int q = 0; q<inch; q++)
        {
            const float* g = kernel_base + p*inch * 9 + q * 9;
            float* U = kernel_tm.channel(p).row(q);

            // Gg[i][r] = row i of G dotted with kernel row r
            float Gg[4][3];
            for (int i=0; i<4; i++)
            {
                for (int r=0; r<3; r++)
                {
                    const float* grow = g + r * 3;
                    Gg[i][r] = grow[0] * G[i][0] + grow[1] * G[i][1] + grow[2] * G[i][2];
                }
            }

            // U[j][i] = row j of Gg dotted with row i of G
            for (int j=0; j<4; j++)
            {
                const float* a = Gg[j];

                for (int i=0; i<4; i++)
                {
                    U[j*4 + i] = a[0] * G[i][0] + a[1] * G[i][1] + a[2] * G[i][2];
                }
            }
        }
    }
}

// 3x3 stride-1 float convolution using Winograd F(2,3).
//
// Steps, each in its own scope below:
//   1. pad the input on the bottom/right so the output size is even and every
//      2x2 output block has a full 4x4 input tile,
//   2. transform every 4x4 input tile (B^T * d * B) into 16 coefficients,
//   3. for every output channel and tile, accumulate the element-wise product
//      of input coefficients and transformed kernel over all input channels,
//   4. transform every 16-coefficient tile back to a 2x2 output block
//      (A^T * m * A) and add the bias,
//   5. cut the padding off into top_blob.
//
// kernel_tm holds 16 floats per (output channel, input channel), read through
// kernel_tm.channel(p).row(q). top_blob supplies the output size
// (w, h, c are read from it before it is written). _bias may be empty, in
// which case a bias of 0 is used.
static void conv3x3s1_winograd23_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

// pad to 2n+2, winograd F(2,3)
Mat bottom_blob_bordered = bottom_blob;

// round the output size up to a multiple of 2
outw = (outw + 1) / 2 * 2;
outh = (outh + 1) / 2 * 2;

w = outw + 2;
h = outh + 2;
copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt.workspace_allocator, opt.num_threads);

const float* bias = _bias;

// BEGIN transform input
Mat bottom_blob_tm;
{
int w_tm = outw / 2 * 4;
int h_tm = outh / 2 * 4;

int nColBlocks = h_tm/4; // may be the block num in Feathercnn
int nRowBlocks = w_tm/4;

const int tiles = nColBlocks * nRowBlocks;

// one row of 16 coefficients per tile, one channel per input channel
bottom_blob_tm.create(4*4, tiles, inch, 4u, opt.workspace_allocator);

// BT
// const float itm[4][4] = {
// {1.0f, 0.0f, -1.0f, 0.0f},
// {0.0f, 1.0f, 1.00f, 0.0f},
// {0.0f, -1.0f, 1.00f, 0.0f},
// {0.0f, -1.0f, 0.00f, 1.0f}
// };

for (int q=0; q<inch; q++)
{
const float* img = bottom_blob_bordered.channel(q);
float* out_tm0 = bottom_blob_tm.channel(q);

for (int j = 0; j < nColBlocks; j++)
{
// four consecutive input rows; tiles start every 2 rows
const float* r0 = img + w * j * 2;
const float* r1 = r0 + w;
const float* r2 = r1 + w;
const float* r3 = r2 + w;

for (int i = 0; i < nRowBlocks; i++)
{
float d0[4],d1[4],d2[4],d3[4];
float w0[4],w1[4],w2[4],w3[4];
float t0[4],t1[4],t2[4],t3[4];
// load
for (int n = 0; n < 4; n++)
{
d0[n] = r0[n];
d1[n] = r1[n];
d2[n] = r2[n];
d3[n] = r3[n];
}
// w = B_t * d
for (int n = 0; n < 4; n++)
{
w0[n] = d0[n] - d2[n];
w1[n] = d1[n] + d2[n];
w2[n] = d2[n] - d1[n];
w3[n] = d3[n] - d1[n];
}
// transpose w to t
{
t0[0]=w0[0]; t1[0]=w0[1]; t2[0]=w0[2]; t3[0]=w0[3];
t0[1]=w1[0]; t1[1]=w1[1]; t2[1]=w1[2]; t3[1]=w1[3];
t0[2]=w2[0]; t1[2]=w2[1]; t2[2]=w2[2]; t3[2]=w2[3];
t0[3]=w3[0]; t1[3]=w3[1]; t2[3]=w3[2]; t3[3]=w3[3];
}
// d = B_t * d_t
for (int n = 0; n < 4; n++)
{
d0[n] = t0[n] - t2[n];
d1[n] = t1[n] + t2[n];
d2[n] = t2[n] - t1[n];
d3[n] = t3[n] - t1[n];
}
// save to out_tm
for (int n = 0; n < 4; n++)
{
out_tm0[n ] = d0[n];
out_tm0[n+ 4] = d1[n];
out_tm0[n+ 8] = d2[n];
out_tm0[n+12] = d3[n];
}

// neighbouring tiles overlap by 2 columns
r0 += 2;
r1 += 2;
r2 += 2;
r3 += 2;

out_tm0 += 16;
}
}
}
}
// the padded input is no longer needed
bottom_blob_bordered = Mat();

// BEGIN dot
Mat top_blob_tm;
{
int w_tm = outw / 2 * 4;
int h_tm = outh / 2 * 4;

int nColBlocks = h_tm/4; // may be the block num in Feathercnn
int nRowBlocks = w_tm/4;

const int tiles = nColBlocks * nRowBlocks;

top_blob_tm.create(16, tiles, outch, 4u, opt.workspace_allocator);

// output channels are processed four at a time, the rest one by one
int nn_outch = outch >> 2;
int remain_outch_start = nn_outch << 2;

#pragma omp parallel for num_threads(opt.num_threads)
for (int pp=0; pp<nn_outch; pp++)
{
int p = pp * 4;

Mat out0_tm = top_blob_tm.channel(p);
Mat out1_tm = top_blob_tm.channel(p+1);
Mat out2_tm = top_blob_tm.channel(p+2);
Mat out3_tm = top_blob_tm.channel(p+3);

const Mat kernel0_tm = kernel_tm.channel(p);
const Mat kernel1_tm = kernel_tm.channel(p+1);
const Mat kernel2_tm = kernel_tm.channel(p+2);
const Mat kernel3_tm = kernel_tm.channel(p+3);

for (int i=0; i<tiles; i++)
{
float* output0_tm = out0_tm.row(i);
float* output1_tm = out1_tm.row(i);
float* output2_tm = out2_tm.row(i);
float* output3_tm = out3_tm.row(i);

float sum0[16] = {0.0f};
float sum1[16] = {0.0f};
float sum2[16] = {0.0f};
float sum3[16] = {0.0f};

int q = 0;
// four input channels per iteration
for (; q+3<inch; q+=4)
{
const float* r0 = bottom_blob_tm.channel(q).row(i);
const float* r1 = bottom_blob_tm.channel(q+1).row(i);
const float* r2 = bottom_blob_tm.channel(q+2).row(i);
const float* r3 = bottom_blob_tm.channel(q+3).row(i);

const float* k0 = kernel0_tm.row(q);
const float* k1 = kernel1_tm.row(q);
const float* k2 = kernel2_tm.row(q);
const float* k3 = kernel3_tm.row(q);

for (int n=0; n<16; n++)
{
// each kN pointer steps 16 floats at a time through the kernel rows of
// input channels q+1..q+3 and is rewound afterwards
// NOTE(review): this relies on consecutive rows of kernel_tm being 16 floats apart - confirm
sum0[n] += r0[n] * k0[n];
k0 += 16;
sum0[n] += r1[n] * k0[n];
k0 += 16;
sum0[n] += r2[n] * k0[n];
k0 += 16;
sum0[n] += r3[n] * k0[n];
k0 -= 16 * 3;

sum1[n] += r0[n] * k1[n];
k1 += 16;
sum1[n] += r1[n] * k1[n];
k1 += 16;
sum1[n] += r2[n] * k1[n];
k1 += 16;
sum1[n] += r3[n] * k1[n];
k1 -= 16 * 3;

sum2[n] += r0[n] * k2[n];
k2 += 16;
sum2[n] += r1[n] * k2[n];
k2 += 16;
sum2[n] += r2[n] * k2[n];
k2 += 16;
sum2[n] += r3[n] * k2[n];
k2 -= 16 * 3;

sum3[n] += r0[n] * k3[n];
k3 += 16;
sum3[n] += r1[n] * k3[n];
k3 += 16;
sum3[n] += r2[n] * k3[n];
k3 += 16;
sum3[n] += r3[n] * k3[n];
k3 -= 16 * 3;
}
}

// remaining input channels
for (; q<inch; q++)
{
const float* r0 = bottom_blob_tm.channel(q).row(i);

const float* k0 = kernel0_tm.row(q);
const float* k1 = kernel1_tm.row(q);
const float* k2 = kernel2_tm.row(q);
const float* k3 = kernel3_tm.row(q);

for (int n=0; n<16; n++)
{
sum0[n] += r0[n] * k0[n];
sum1[n] += r0[n] * k1[n];
sum2[n] += r0[n] * k2[n];
sum3[n] += r0[n] * k3[n];
}
}

for (int n=0; n<16; n++)
{
output0_tm[n] = sum0[n];
output1_tm[n] = sum1[n];
output2_tm[n] = sum2[n];
output3_tm[n] = sum3[n];
}
}
}

// output channels left over after the groups of four
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=remain_outch_start; p<outch; p++)
{
Mat out0_tm = top_blob_tm.channel(p);
const Mat kernel0_tm = kernel_tm.channel(p);

for (int i=0; i<tiles; i++)
{
float* output0_tm = out0_tm.row(i);

float sum0[16] = {0.0f};

int q = 0;
for (; q+3<inch; q+=4)
{
const float* r0 = bottom_blob_tm.channel(q).row(i);
const float* r1 = bottom_blob_tm.channel(q+1).row(i);
const float* r2 = bottom_blob_tm.channel(q+2).row(i);
const float* r3 = bottom_blob_tm.channel(q+3).row(i);

// here k0..k3 are the kernel rows of four input channels for the same output channel
const float* k0 = kernel0_tm.row(q);
const float* k1 = kernel0_tm.row(q+1);
const float* k2 = kernel0_tm.row(q+2);
const float* k3 = kernel0_tm.row(q+3);

for (int n=0; n<16; n++)
{
sum0[n] += r0[n] * k0[n];
sum0[n] += r1[n] * k1[n];
sum0[n] += r2[n] * k2[n];
sum0[n] += r3[n] * k3[n];
}
}

for (; q<inch; q++)
{
const float* r0 = bottom_blob_tm.channel(q).row(i);
const float* k0 = kernel0_tm.row(q);

for (int n=0; n<16; n++)
{
sum0[n] += r0[n] * k0[n];
}
}

for (int n=0; n<16; n++)
{
output0_tm[n] = sum0[n];
}
}
}
}
// the transformed input is no longer needed
bottom_blob_tm = Mat();
// END dot

// BEGIN transform output
Mat top_blob_bordered;
top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator);
{
// AT
// const float itm[2][4] = {
// {1.0f, 1.0f, 1.0f, 0.0f},
// {0.0f, 1.0f, -1.0f, 1.0f}
// };

int w_tm = outw / 2 * 4;
int h_tm = outh / 2 * 4;

int nColBlocks = h_tm/4; // may be the block num in Feathercnn
int nRowBlocks = w_tm/4;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<outch; p++)
{
Mat out_tm = top_blob_tm.channel(p);
Mat out = top_blob_bordered.channel(p);

const float bias0 = bias ? bias[p] : 0.f;

for (int j=0; j<nColBlocks; j++)
{
// every tile produces a 2x2 block spread over two output rows
float* outRow0 = out.row(j*2);
float* outRow1 = out.row(j*2+1);

for(int i=0; i<nRowBlocks; i++)
{
float* out_tile = out_tm.row(j*nRowBlocks + i);

float s0[4],s1[4],s2[4],s3[4];
float w0[4],w1[4];
float d0[2],d1[2],d2[2],d3[2];
float o0[2],o1[2];
// load
for (int n = 0; n < 4; n++)
{
s0[n] = out_tile[n];
s1[n] = out_tile[n+ 4];
s2[n] = out_tile[n+ 8];
s3[n] = out_tile[n+12];
}
// w = A_T * W
for (int n = 0; n < 4; n++)
{
w0[n] = s0[n] + s1[n] + s2[n];
w1[n] = s1[n] - s2[n] + s3[n];
}
// transpose w to w_t
{
d0[0] = w0[0]; d0[1] = w1[0];
d1[0] = w0[1]; d1[1] = w1[1];
d2[0] = w0[2]; d2[1] = w1[2];
d3[0] = w0[3]; d3[1] = w1[3];
}
// Y = A_T * w_t, plus the bias of this output channel
for (int n = 0; n < 2; n++)
{
o0[n] = d0[n] + d1[n] + d2[n] + bias0;
o1[n] = d1[n] - d2[n] + d3[n] + bias0;
}
// save the 2x2 block to the padded output
outRow0[0] = o0[0];
outRow0[1] = o0[1];
outRow1[0] = o1[0];
outRow1[1] = o1[1];

outRow0 += 2;
outRow1 += 2;
}
}
}
}
// END transform output

// cut result pad
copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt.blob_allocator, opt.num_threads);
}

static void conv3x3s2_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Mat& _bias, const Option& opt)
{
int w = bottom_blob.w;
int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const int tailstep = w - 2 * outw + w;

const float* kernel = _kernel;
const float* bias = _bias;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
Mat out = top_blob.channel(p);

const float bias0 = bias ? bias[p] : 0.f;

out.fill(bias0);

for (int q = 0; q < inch; q++)
{
float *outptr = out;

const float *img = bottom_blob.channel(q);
const float* kernel0 = kernel + p*inch*9 + q*9;

const float *r0 = img;
const float *r1 = img + w;
const float *r2 = img + w * 2;

const float* k0 = kernel0;
const float* k1 = kernel0 + 3;
const float* k2 = kernel0 + 6;

for (int i = 0; i < outh; i++)
{
int remain = outw;

for (; remain > 0; remain--)
{
float sum = 0;

sum += r0[0] * k0[0];
sum += r0[1] * k0[1];
sum += r0[2] * k0[2];
sum += r1[0] * k1[0];
sum += r1[1] * k1[1];
sum += r1[2] * k1[2];
sum += r2[0] * k2[0];
sum += r2[1] * k2[1];
sum += r2[2] * k2[2];

*outptr += sum;

r0 += 2;
r1 += 2;
r2 += 2;
outptr++;
}

r0 += tailstep;
r1 += tailstep;
r2 += tailstep;
}
}
}
}

+ 431
- 23
src/layer/x86/convolution_3x3_int8.h View File

@@ -11,12 +11,6 @@
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
// Clamp an int to the range of a 16-bit signed integer, [-32768, 32767].
static inline short saturate2int16(int v)
{
    const int lo = -32768;
    const int hi = 32767;

    const int clamped = v > hi ? hi : (v < lo ? lo : v);
    return (short)clamped;
}

static void conv3x3s1_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
{
@@ -84,6 +78,424 @@ static void conv3x3s1_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat
}
}

// Winograd F(2,3) kernel transform for int8 3x3 kernels.
//
// `kernel` is read as outch * inch consecutive 3x3 kernels (9 signed chars
// each). For every (output channel p, input channel q) pair the 4x4 matrix
// G * g * G^T is written to kernel_tm.channel(p).row<short>(q) as 16 shorts.
// G is the integer matrix below, so all arithmetic stays in integers.
static void conv3x3s1_winograd23_transform_kernel_int8_sse(const Mat& kernel, Mat& kernel_tm, int inch, int outch)
{
    kernel_tm.create(4*4, inch, outch, 2ul);

    // G
    const short G[4][3] = {
        { 2,  0, 0},
        { 1,  1, 1},
        { 1, -1, 1},
        { 0,  0, 2}
    };

    const signed char* kernel_base = (const signed char*)kernel;

    #pragma omp parallel for
    for (int p = 0; p<outch; p++)
    {
        for (int q = 0; q<inch; q++)
        {
            const signed char* g = kernel_base + p*inch * 9 + q * 9;
            short* U = kernel_tm.channel(p).row<short>(q);

            // Gg[i][r] = row i of G dotted with kernel row r
            short Gg[4][3];
            for (int i=0; i<4; i++)
            {
                for (int r=0; r<3; r++)
                {
                    const signed char* grow = g + r * 3;
                    Gg[i][r] = (short)grow[0] * G[i][0] + grow[1] * G[i][1] + grow[2] * G[i][2];
                }
            }

            // U[j][i] = row j of Gg dotted with row i of G
            for (int j=0; j<4; j++)
            {
                const short* a = Gg[j];

                for (int i=0; i<4; i++)
                {
                    U[j*4 + i] = a[0] * G[i][0] + a[1] * G[i][1] + a[2] * G[i][2];
                }
            }
        }
    }
}

static void conv3x3s1_winograd23_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

// pad to 2n+2, winograd F(2,3)
Mat bottom_blob_bordered = bottom_blob;

outw = (outw + 1) / 2 * 2;
outh = (outh + 1) / 2 * 2;

w = outw + 2;
h = outh + 2;
copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt.workspace_allocator, opt.num_threads);

// BEGIN transform input
Mat bottom_blob_tm;
{
int w_tm = outw / 2 * 4;
int h_tm = outh / 2 * 4;

int nColBlocks = h_tm/4; // may be the block num in Feathercnn
int nRowBlocks = w_tm/4;

const int tiles = nColBlocks * nRowBlocks;

bottom_blob_tm.create(4*4, tiles, inch, 2u, opt.workspace_allocator);

// BT
// const float itm[4][4] = {
// {1.0f, 0.0f, -1.0f, 0.0f},
// {0.0f, 1.0f, 1.00f, 0.0f},
// {0.0f, -1.0f, 1.00f, 0.0f},
// {0.0f, -1.0f, 0.00f, 1.0f}
// };
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<inch; q++)
{
const signed char* img = bottom_blob_bordered.channel(q);
short* out_tm0 = bottom_blob_tm.channel(q);

for (int j = 0; j < nColBlocks; j++)
{
const signed char* r0 = img + w * j * 2;
const signed char* r1 = r0 + w;
const signed char* r2 = r1 + w;
const signed char* r3 = r2 + w;

for (int i = 0; i < nRowBlocks; i++)
{
short d0[4],d1[4],d2[4],d3[4];
short w0[4],w1[4],w2[4],w3[4];
short t0[4],t1[4],t2[4],t3[4];
// load
for (int n = 0; n < 4; n++)
{
d0[n] = r0[n];
d1[n] = r1[n];
d2[n] = r2[n];
d3[n] = r3[n];
}
// w = B_t * d
for (int n = 0; n < 4; n++)
{
w0[n] = d0[n] - d2[n];
w1[n] = d1[n] + d2[n];
w2[n] = d2[n] - d1[n];
w3[n] = d3[n] - d1[n];
}
// transpose d to d_t
{
t0[0]=w0[0]; t1[0]=w0[1]; t2[0]=w0[2]; t3[0]=w0[3];
t0[1]=w1[0]; t1[1]=w1[1]; t2[1]=w1[2]; t3[1]=w1[3];
t0[2]=w2[0]; t1[2]=w2[1]; t2[2]=w2[2]; t3[2]=w2[3];
t0[3]=w3[0]; t1[3]=w3[1]; t2[3]=w3[2]; t3[3]=w3[3];
}
// U = B_t * d_t
for (int n = 0; n < 4; n++)
{
d0[n] = t0[n] - t2[n];
d1[n] = t1[n] + t2[n];
d2[n] = t2[n] - t1[n];
d3[n] = t3[n] - t1[n];
}
// save to out_tm
for (int n = 0; n < 4; n++)
{
out_tm0[n ] = d0[n];
out_tm0[n+ 4] = d1[n];
out_tm0[n+ 8] = d2[n];
out_tm0[n+12] = d3[n];
}

r0 += 2;
r1 += 2;
r2 += 2;
r3 += 2;

out_tm0 += 16;
}
}
}
}
bottom_blob_bordered = Mat();
// BEGIN dot
Mat top_blob_tm;
{
int w_tm = outw / 2 * 4;
int h_tm = outh / 2 * 4;

int nColBlocks = h_tm/4; // may be the block num in Feathercnn
int nRowBlocks = w_tm/4;

const int tiles = nColBlocks * nRowBlocks;

top_blob_tm.create(16, tiles, outch, 4u, opt.workspace_allocator);

int nn_outch = outch >> 2;
int remain_outch_start = nn_outch << 2;

#pragma omp parallel for num_threads(opt.num_threads)
for (int pp=0; pp<nn_outch; pp++)
{
int p = pp * 4;

Mat out0_tm = top_blob_tm.channel(p);
Mat out1_tm = top_blob_tm.channel(p+1);
Mat out2_tm = top_blob_tm.channel(p+2);
Mat out3_tm = top_blob_tm.channel(p+3);

const Mat kernel0_tm = kernel_tm.channel(p);
const Mat kernel1_tm = kernel_tm.channel(p+1);
const Mat kernel2_tm = kernel_tm.channel(p+2);
const Mat kernel3_tm = kernel_tm.channel(p+3);

for (int i=0; i<tiles; i++)
{
int* output0_tm = out0_tm.row<int>(i);
int* output1_tm = out1_tm.row<int>(i);
int* output2_tm = out2_tm.row<int>(i);
int* output3_tm = out3_tm.row<int>(i);

int sum0[16] = {0};
int sum1[16] = {0};
int sum2[16] = {0};
int sum3[16] = {0};

int q = 0;
for (; q+3<inch; q+=4)
{
const short* r0 = bottom_blob_tm.channel(q).row<short>(i);
const short* r1 = bottom_blob_tm.channel(q+1).row<short>(i);
const short* r2 = bottom_blob_tm.channel(q+2).row<short>(i);
const short* r3 = bottom_blob_tm.channel(q+3).row<short>(i);

const short* k0 = kernel0_tm.row<short>(q);
const short* k1 = kernel1_tm.row<short>(q);
const short* k2 = kernel2_tm.row<short>(q);
const short* k3 = kernel3_tm.row<short>(q);

for (int n=0; n<16; n++)
{
sum0[n] += (int)r0[n] * k0[n];
k0 += 16;
sum0[n] += (int)r1[n] * k0[n];
k0 += 16;
sum0[n] += (int)r2[n] * k0[n];
k0 += 16;
sum0[n] += (int)r3[n] * k0[n];
k0 -= 16 * 3;

sum1[n] += (int)r0[n] * k1[n];
k1 += 16;
sum1[n] += (int)r1[n] * k1[n];
k1 += 16;
sum1[n] += (int)r2[n] * k1[n];
k1 += 16;
sum1[n] += (int)r3[n] * k1[n];
k1 -= 16 * 3;

sum2[n] += (int)r0[n] * k2[n];
k2 += 16;
sum2[n] += (int)r1[n] * k2[n];
k2 += 16;
sum2[n] += (int)r2[n] * k2[n];
k2 += 16;
sum2[n] += (int)r3[n] * k2[n];
k2 -= 16 * 3;

sum3[n] += (int)r0[n] * k3[n];
k3 += 16;
sum3[n] += (int)r1[n] * k3[n];
k3 += 16;
sum3[n] += (int)r2[n] * k3[n];
k3 += 16;
sum3[n] += (int)r3[n] * k3[n];
k3 -= 16 * 3;
}
}

for (; q<inch; q++)
{
const short* r0 = bottom_blob_tm.channel(q).row<short>(i);

const short* k0 = kernel0_tm.row<short>(q);
const short* k1 = kernel1_tm.row<short>(q);
const short* k2 = kernel2_tm.row<short>(q);
const short* k3 = kernel3_tm.row<short>(q);

for (int n=0; n<16; n++)
{
sum0[n] += (int)r0[n] * k0[n];
sum1[n] += (int)r0[n] * k1[n];
sum2[n] += (int)r0[n] * k2[n];
sum3[n] += (int)r0[n] * k3[n];
}
}

for (int n=0; n<16; n++)
{
output0_tm[n] = sum0[n];
output1_tm[n] = sum1[n];
output2_tm[n] = sum2[n];
output3_tm[n] = sum3[n];
}
}
}

#pragma omp parallel for num_threads(opt.num_threads)
for (int p=remain_outch_start; p<outch; p++)
{
Mat out0_tm = top_blob_tm.channel(p);
const Mat kernel0_tm = kernel_tm.channel(p);

for (int i=0; i<tiles; i++)
{
int* output0_tm = out0_tm.row<int>(i);

int sum0[16] = {0};

int q = 0;
for (; q+3<inch; q+=4)
{
const short* r0 = bottom_blob_tm.channel(q).row<short>(i);
const short* r1 = bottom_blob_tm.channel(q+1).row<short>(i);
const short* r2 = bottom_blob_tm.channel(q+2).row<short>(i);
const short* r3 = bottom_blob_tm.channel(q+3).row<short>(i);

const short* k0 = kernel0_tm.row<short>(q);
const short* k1 = kernel0_tm.row<short>(q+1);
const short* k2 = kernel0_tm.row<short>(q+2);
const short* k3 = kernel0_tm.row<short>(q+3);

for (int n=0; n<16; n++)
{
sum0[n] += (int)r0[n] * k0[n];
sum0[n] += (int)r1[n] * k1[n];
sum0[n] += (int)r2[n] * k2[n];
sum0[n] += (int)r3[n] * k3[n];
}
}

for (; q<inch; q++)
{
const short* r0 = bottom_blob_tm.channel(q).row<short>(i);
const short* k0 = kernel0_tm.row<short>(q);

for (int n=0; n<16; n++)
{
sum0[n] += (int)r0[n] * k0[n];
}
}

for (int n=0; n<16; n++)
{
output0_tm[n] = sum0[n];
}
}
}
}
bottom_blob_tm = Mat();
// END dot

// BEGIN transform output
Mat top_blob_bordered;
top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator);
{
// AT
// const float itm[2][4] = {
// {1.0f, 1.0f, 1.0f, 0.0f},
// {0.0f, 1.0f, -1.0f, 1.0f}
// };

int w_tm = outw / 2 * 4;
int h_tm = outh / 2 * 4;

int nColBlocks = h_tm/4; // may be the block num in Feathercnn
int nRowBlocks = w_tm/4;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<outch; p++)
{
Mat out_tm = top_blob_tm.channel(p);
Mat out = top_blob_bordered.channel(p);

for (int j=0; j<nColBlocks; j++)
{
int* outRow0 = out.row<int>(j*2);
int* outRow1 = out.row<int>(j*2+1);

for(int i=0; i<nRowBlocks; i++)
{
int* out_tile = out_tm.row<int>(j*nRowBlocks + i);

int s0[4],s1[4],s2[4],s3[4];
int w0[4],w1[4];
int d0[2],d1[2],d2[2],d3[2];
int o0[2],o1[2];
// load
for (int n = 0; n < 4; n++)
{
s0[n] = out_tile[n];
s1[n] = out_tile[n+ 4];
s2[n] = out_tile[n+ 8];
s3[n] = out_tile[n+12];
}
// w = A_T * W
for (int n = 0; n < 4; n++)
{
w0[n] = s0[n] + s1[n] + s2[n];
w1[n] = s1[n] - s2[n] + s3[n];
}
// transpose w to w_t
{
d0[0] = w0[0]; d0[1] = w1[0];
d1[0] = w0[1]; d1[1] = w1[1];
d2[0] = w0[2]; d2[1] = w1[2];
d3[0] = w0[3]; d3[1] = w1[3];
}
// Y = A_T * w_t
for (int n = 0; n < 2; n++)
{
o0[n] = d0[n] + d1[n] + d2[n];
o1[n] = d1[n] - d2[n] + d3[n];
}
// save to top blob tm,why right 2,because the G' = G*2
outRow0[0] = o0[0] >> 2;
outRow0[1] = o0[1] >> 2;
outRow1[0] = o1[0] >> 2;
outRow1[1] = o1[1] >> 2;

outRow0 += 2;
outRow1 += 2;
}
}
}
}
// END transform output

// cut result pad
copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt.blob_allocator, opt.num_threads);
}

static void conv3x3s2_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
{
int w = bottom_blob.w;
@@ -122,23 +534,19 @@ static void conv3x3s2_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat

for (; remain > 0; remain--)
{
short sum0 = 0;
short sum1 = 0;
short sum2 = 0;

sum0 += (short)r0[0] * kernel0[0];
sum0 += (short)r0[1] * kernel0[1];
sum0 += (short)r0[2] * kernel0[2];
sum1 += (short)r1[0] * kernel0[3];
sum1 += (short)r1[1] * kernel0[4];
sum1 += (short)r1[2] * kernel0[5];
sum2 += (short)r2[0] * kernel0[6];
sum2 += (short)r2[1] * kernel0[7];
sum2 += (short)r2[2] * kernel0[8];

*outptr0 = saturate2int16(*outptr0 + sum0);
*outptr0 = saturate2int16(*outptr0 + sum1);
*outptr0 = saturate2int16(*outptr0 + sum2);
int sum0 = 0;

sum0 += (int)r0[0] * kernel0[0];
sum0 += (int)r0[1] * kernel0[1];
sum0 += (int)r0[2] * kernel0[2];
sum0 += (int)r1[0] * kernel0[3];
sum0 += (int)r1[1] * kernel0[4];
sum0 += (int)r1[2] * kernel0[5];
sum0 += (int)r2[0] * kernel0[6];
sum0 += (int)r2[1] * kernel0[7];
sum0 += (int)r2[2] * kernel0[8];

*outptr0 += sum0;

r0 += 2;
r1 += 2;


+ 35
- 0
src/layer/x86/convolution_5x5_int8.h View File

@@ -0,0 +1,35 @@
// SenseNets is pleased to support the open source community by supporting ncnn available.
//
// Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

// int8 5x5 convolution with stride 1.
// Thin forwarder: the actual work is done by the generic im2col + sgemm int8 routine,
// which is handed a square 5x5 window and a stride of 1 in both directions.
static void conv5x5s1_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
{
    const int kernel_size = 5;
    const int stride = 1;

    conv_im2col_sgemm_int8_sse(bottom_blob, top_blob, _kernel, kernel_size, kernel_size, stride, stride, opt);
}

// int8 5x5 convolution with stride 2.
// Thin forwarder: the actual work is done by the generic im2col + sgemm int8 routine,
// which is handed a square 5x5 window and a stride of 2 in both directions.
static void conv5x5s2_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
{
    const int kernel_size = 5;
    const int stride = 2;

    conv_im2col_sgemm_int8_sse(bottom_blob, top_blob, _kernel, kernel_size, kernel_size, stride, stride, opt);
}

+ 35
- 0
src/layer/x86/convolution_7x7_int8.h View File

@@ -0,0 +1,35 @@
// SenseNets is pleased to support the open source community by supporting ncnn available.
//
// Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

// int8 7x7 convolution with stride 1.
// Thin forwarder: the actual work is done by the generic im2col + sgemm int8 routine,
// which is handed a square 7x7 window and a stride of 1 in both directions.
static void conv7x7s1_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
{
    const int kernel_size = 7;
    const int stride = 1;

    conv_im2col_sgemm_int8_sse(bottom_blob, top_blob, _kernel, kernel_size, kernel_size, stride, stride, opt);
}

// int8 7x7 convolution with stride 2.
// Thin forwarder: the actual work is done by the generic im2col + sgemm int8 routine,
// which is handed a square 7x7 window and a stride of 2 in both directions.
static void conv7x7s2_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
{
    const int kernel_size = 7;
    const int stride = 2;

    conv_im2col_sgemm_int8_sse(bottom_blob, top_blob, _kernel, kernel_size, kernel_size, stride, stride, opt);
}

+ 381
- 0
src/layer/x86/convolution_sgemm_int8.h View File

@@ -0,0 +1,381 @@
// SenseNets is pleased to support the open source community by supporting ncnn available.
//
// Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

// Generic int8 convolution implemented as im2col followed by a plain C matrix multiply.
//
// Stages:
//   1. im2col: unfold every kernel window of bottom_blob into a
//      (kernel_h*kernel_w*inch) x (outw*outh) signed char matrix.
//   2. repack that matrix so that 8 consecutive output positions are interleaved.
//   3. repack the kernel so that 4 consecutive output channels are interleaved.
//   4. multiply, accumulating in int, and store the raw int sums into top_blob.
//
// bottom_blob : signed char input; window addressing uses bottom_blob.w as the row pitch.
//               NOTE(review): presumably already padded by the caller - confirm.
// top_blob    : output; its w/h/c are read here and int values are written to its channels,
//               so it is assumed to be allocated by the caller with 4-byte elements - confirm.
// _kernel     : signed char weights addressed as [outch][inch*kernel_h*kernel_w].
// kernel_w/h, stride_w/h : window size and step.
// opt         : supplies num_threads and the workspace allocator for the temporaries.
//
// NOTE(review): none of the temporary Mat allocations below is checked for failure.
static void conv_im2col_sgemm_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, \
const int kernel_w, const int kernel_h, const int stride_w, const int stride_h, const Option& opt)
{
int w = bottom_blob.w;
int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const signed char *kernel = _kernel;

// im2col: one row per (input channel, kernel row u, kernel column v), one column per output position
Mat bottom_im2col(outw*outh, kernel_h*kernel_w*inch, 1UL, opt.workspace_allocator);
{
// number of im2col elements produced by a single input channel
const int stride = kernel_h*kernel_w*outw*outh;
signed char* ret = (signed char*)bottom_im2col;
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<inch; p++)
{
const signed char* input = bottom_blob.channel(p);
// each input channel writes its own disjoint slice, starting here
int retID = stride * p;
for (int u=0; u<kernel_h; u++)
{
for (int v=0; v<kernel_w; v++)
{
for (int i=0; i<outh; i++)
{
for (int j=0; j<outw; j++)
{
// input pixel seen by kernel tap (u,v) at output position (i,j)
int row = u + i * stride_h;
int col = v + j * stride_w;
int index = row * w + col;
ret[retID] = input[index];
retID++;
}
}
}
}
}
}

int kernel_size = kernel_w * kernel_h;
int out_size = outw * outh;

// repack bottom_im2col: every full group of 8 output positions shares one channel of bottom_tm,
// each leftover position (out_size % 8 of them) gets a channel of its own
Mat bottom_tm(8*kernel_size, inch, out_size/8 + out_size%8, (size_t)1u, opt.workspace_allocator);
{
int nn_size = out_size >> 3;
int remain_size_start = nn_size << 3;

#pragma omp parallel for num_threads(opt.num_threads)
for (int ii=0; ii<nn_size; ii++)
{
int i = ii * 8;

const signed char* img0 = bottom_im2col.channel(0);
img0 += i;

signed char* tmpptr = bottom_tm.channel(i/8);

// for every im2col row copy the 8 neighbouring columns i..i+7 next to each other
for (int q=0; q<inch*kernel_size; q++)
{
tmpptr[0] = img0[0];
tmpptr[1] = img0[1];
tmpptr[2] = img0[2];
tmpptr[3] = img0[3];
tmpptr[4] = img0[4];
tmpptr[5] = img0[5];
tmpptr[6] = img0[6];
tmpptr[7] = img0[7];

tmpptr += 8;
// step to the same columns in the next im2col row
img0 += out_size;
}
}

#pragma omp parallel for num_threads(opt.num_threads)
for (int i=remain_size_start; i<out_size; i++)
{
const signed char* img0 = bottom_im2col.channel(0);
img0 += i;

// i/8 equals the number of full groups here, i%8 selects the leftover column
signed char* tmpptr = bottom_tm.channel(i/8 + i%8);

for (int q=0; q<inch*kernel_size; q++)
{
tmpptr[0] = img0[0];

tmpptr += 1;
img0 += out_size;
}
}
}

// repack the kernel: every full group of 4 output channels shares one channel of kernel_tm,
// each leftover output channel (outch % 4 of them) gets a channel of its own
Mat kernel_tm(4*kernel_size, inch, outch/4 + outch%4, (size_t)1u, opt.workspace_allocator);
{
int nn_outch = 0;
int remain_outch_start = 0;

nn_outch = outch >> 2;
remain_outch_start = nn_outch << 2;
#pragma omp parallel for num_threads(opt.num_threads)
for (int pp=0; pp<nn_outch; pp++)
{
int p = pp * 4;

const signed char* k0 = kernel + (p+0)*inch*kernel_size;
const signed char* k1 = kernel + (p+1)*inch*kernel_size;
const signed char* k2 = kernel + (p+2)*inch*kernel_size;
const signed char* k3 = kernel + (p+3)*inch*kernel_size;

signed char* ktmp = kernel_tm.channel(p/4);

// interleave: the q-th weight of the 4 output channels is stored as 4 adjacent bytes
for (int q=0; q<inch*kernel_size; q++)
{
ktmp[0] = k0[0];
ktmp[1] = k1[0];
ktmp[2] = k2[0];
ktmp[3] = k3[0];
ktmp += 4;

k0 += 1;
k1 += 1;
k2 += 1;
k3 += 1;
}
}

#pragma omp parallel for num_threads(opt.num_threads)
for (int p=remain_outch_start; p<outch; p++)
{
const signed char* k0 = kernel + (p+0)*inch*kernel_size;

// p/4 equals the number of full groups here, p%4 selects the leftover output channel
signed char* ktmp = kernel_tm.channel(p/4 + p%4);

for (int q=0; q<inch*kernel_size; q++)
{
ktmp[0] = k0[0];
ktmp++;
k0++;
}
}
}

// matrix multiply on the packed operands: top_blob[outch][N] = kernel_tm (A) x bottom_tm (B),
// signed char inputs, int accumulators
{
// int M = outch; // outch
int N = outw * outh; // outsize or out stride
int L = kernel_w * kernel_h * inch; // ksize * inch

int nn_outch = 0;
int remain_outch_start = 0;

nn_outch = outch >> 2;
remain_outch_start = nn_outch << 2;
// 4 output channels at a time
#pragma omp parallel for num_threads(opt.num_threads)
for (int pp=0; pp<nn_outch; pp++)
{
int i = pp * 4;

int* output0 = top_blob.channel(i);
int* output1 = top_blob.channel(i+1);
int* output2 = top_blob.channel(i+2);
int* output3 = top_blob.channel(i+3);

int j=0;
// 4 output channels x 8 output positions per iteration
for (; j+7<N; j=j+8)
{
signed char* vb = bottom_tm.channel(j/8);
signed char* va = kernel_tm.channel(i/4);
int sum0[8] = {0};
int sum1[8] = {0};
int sum2[8] = {0};
int sum3[8] = {0};
int k=0;
// reduction dimension unrolled by 8: each step consumes 8*4 weights and 8*8 inputs
for (; k+7<L; k=k+8)
{
for (int n=0; n<8; n++)
{
sum0[n] += (int)va[0] * vb[n];
sum1[n] += (int)va[1] * vb[n];
sum2[n] += (int)va[2] * vb[n];
sum3[n] += (int)va[3] * vb[n];
va += 4;

sum0[n] += (int)va[0] * vb[n+8];
sum1[n] += (int)va[1] * vb[n+8];
sum2[n] += (int)va[2] * vb[n+8];
sum3[n] += (int)va[3] * vb[n+8];
va += 4;

sum0[n] += (int)va[0] * vb[n+16];
sum1[n] += (int)va[1] * vb[n+16];
sum2[n] += (int)va[2] * vb[n+16];
sum3[n] += (int)va[3] * vb[n+16];
va += 4;

sum0[n] += (int)va[0] * vb[n+24];
sum1[n] += (int)va[1] * vb[n+24];
sum2[n] += (int)va[2] * vb[n+24];
sum3[n] += (int)va[3] * vb[n+24];
va += 4;

sum0[n] += (int)va[0] * vb[n+32];
sum1[n] += (int)va[1] * vb[n+32];
sum2[n] += (int)va[2] * vb[n+32];
sum3[n] += (int)va[3] * vb[n+32];
va += 4;

sum0[n] += (int)va[0] * vb[n+40];
sum1[n] += (int)va[1] * vb[n+40];
sum2[n] += (int)va[2] * vb[n+40];
sum3[n] += (int)va[3] * vb[n+40];
va += 4;

sum0[n] += (int)va[0] * vb[n+48];
sum1[n] += (int)va[1] * vb[n+48];
sum2[n] += (int)va[2] * vb[n+48];
sum3[n] += (int)va[3] * vb[n+48];
va += 4;

sum0[n] += (int)va[0] * vb[n+56];
sum1[n] += (int)va[1] * vb[n+56];
sum2[n] += (int)va[2] * vb[n+56];
sum3[n] += (int)va[3] * vb[n+56];
// undo the seven "va += 4" above so the next n re-reads the same 32 weights
va -= 28;
}

// advance past the 8 reduction steps just consumed
va += 32;
vb += 64;
}

// leftover reduction steps, one at a time
for (; k<L; k++)
{
for (int n=0; n<8; n++)
{
sum0[n] += (int)va[0] * vb[n];
sum1[n] += (int)va[1] * vb[n];
sum2[n] += (int)va[2] * vb[n];
sum3[n] += (int)va[3] * vb[n];
}
va += 4;
vb += 8;
}

for (int n=0; n<8; n++)
{
output0[n] = sum0[n];
output1[n] = sum1[n];
output2[n] = sum2[n];
output3[n] = sum3[n];
}
output0 += 8;
output1 += 8;
output2 += 8;
output3 += 8;
}

// leftover output positions: 4 output channels x 1 position
for (; j<N; j++)
{
int sum0 = 0;
int sum1 = 0;
int sum2 = 0;
int sum3 = 0;

// same channel index formula as used when the leftover columns were packed
signed char* vb = bottom_tm.channel(j/8 + j%8);
signed char* va = kernel_tm.channel(i/4);

for (int k=0; k<L; k++)
{
sum0 += (int)va[0] * vb[0];
sum1 += (int)va[1] * vb[0];
sum2 += (int)va[2] * vb[0];
sum3 += (int)va[3] * vb[0];

va += 4;
vb += 1;
}
output0[0] = sum0;
output1[0] = sum1;
output2[0] = sum2;
output3[0] = sum3;

output0++;
output1++;
output2++;
output3++;
}
}

// leftover output channels, one at a time
#pragma omp parallel for num_threads(opt.num_threads)
for (int i=remain_outch_start; i<outch; i++)
{
int* output = top_blob.channel(i);

int j=0;
// 1 output channel x 8 output positions per iteration
for (; j+7<N; j=j+8)
{
signed char* vb = bottom_tm.channel(j/8);
// same channel index formula as used when the leftover kernels were packed
signed char* va = kernel_tm.channel(i/4 + i%4);
int sum[8] = {0};

int k=0;
for (; k+7<L; k=k+8)
{
for (int n=0; n<8; n++)
{
sum[n] += (int)va[0] * vb[n];
sum[n] += (int)va[1] * vb[n+8];
sum[n] += (int)va[2] * vb[n+16];
sum[n] += (int)va[3] * vb[n+24];
sum[n] += (int)va[4] * vb[n+32];
sum[n] += (int)va[5] * vb[n+40];
sum[n] += (int)va[6] * vb[n+48];
sum[n] += (int)va[7] * vb[n+56];
}
va += 8;
vb += 64;
}

for (; k<L; k++)
{
for (int n=0; n<8; n++)
{
sum[n] += (int)va[0] * vb[n];
}
va += 1;
vb += 8;
}

for (int n=0; n<8; n++)
{
output[n] = sum[n];
}
output += 8;
}

// 1 output channel x 1 output position: plain dot product
for (; j<N; j++)
{
int sum = 0;

signed char* vb = bottom_tm.channel(j/8 + j%8);
signed char* va = kernel_tm.channel(i/4 + i%4);

for (int k=0; k<L; k++)
{
sum += (int)va[0] * vb[0];

va += 1;
vb += 1;
}
output[0] = sum;

output++;
}
}
}
}

+ 128
- 22
src/layer/x86/convolution_x86.cpp View File

@@ -14,17 +14,61 @@

#include "convolution_x86.h"

#include "layer_type.h"
#include "benchmark.h"

namespace ncnn {

#include "convolution_1x1.h"
#include "convolution_3x3.h"
#include "convolution_5x5.h"

#include "convolution_sgemm_int8.h"
#include "convolution_1x1_int8.h"
#include "convolution_3x3_int8.h"
#include "convolution_5x5_int8.h"
#include "convolution_7x7_int8.h"

DEFINE_LAYER_CREATOR(Convolution_x86)

// Reads the layer parameters through the base class, then decides whether the
// Winograd 3x3 path will be used by this layer.
// Returns the base-class error code unchanged on failure, 0 otherwise.
int Convolution_x86::load_param(const ParamDict& pd)
{
    const int status = Convolution::load_param(pd);
    if (status != 0)
        return status;

    // the Winograd path is only considered for 3x3 kernels with unit stride and unit dilation
    const bool plain_3x3s1 = kernel_w == 3 && kernel_h == 3
                          && dilation_w == 1 && dilation_h == 1
                          && stride_w == 1 && stride_h == 1;

    use_winograd3x3 = false;

    if (pd.use_winograd_convolution && plain_3x3s1)
    {
        const int num_input = weight_data_size / 9 / num_output;

        // winograd is slow on small channel count
        use_winograd3x3 = (num_input >= 16 && num_output >= 16);
    }

    return 0;
}

// Loads the weights through the base class and, when the Winograd 3x3 path was
// selected in load_param, pre-transforms them into weight_3x3_winograd23_data.
// Returns the base-class error code unchanged on failure, 0 otherwise.
int Convolution_x86::load_model(const ModelBin& mb)
{
    const int status = Convolution::load_model(mb);
    if (status != 0)
        return status;

    // nothing more to prepare for the non-Winograd paths
    if (!use_winograd3x3)
        return 0;

    const int num_input = weight_data_size / 9 / num_output;

    // the int8 and fp32 paths each have their own kernel transform
    if (use_int8_inference)
    {
        conv3x3s1_winograd23_transform_kernel_int8_sse(weight_data, weight_3x3_winograd23_data, num_input, num_output);
    }
    else
    {
        conv3x3s1_winograd23_transform_kernel_sse(weight_data, weight_3x3_winograd23_data, num_input, num_output);
    }

    return 0;
}

int Convolution_x86::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv_func conv, const Option& opt) const
{
int w = bottom_blob.w;
@@ -147,7 +191,7 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
const int kernel_size = kernel_w;
const int stride = stride_w;

if (kernel_size > 5 || stride > 5 || dilation_w != dilation_h)
if (kernel_size > 7 || stride > 7 || dilation_w != dilation_h)
{
return Convolution::forward(bottom_blob, top_blob, opt);
}
@@ -155,26 +199,23 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&);

// kernel_size x stride
conv_func conv_func_table[5][5] =
conv_func conv_func_table[7][4] =
{
{
conv1x1s1_sse,
conv1x1s2_sse,
0,
0,
0
}, // kernel_size = 1
{
0,
0,
0,
0,
0
}, // kernel_size = 2
{
conv3x3s1_sse,
0,
0,
conv3x3s2_sse,
0,
0
}, // kernel_size = 3
@@ -182,35 +223,43 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
0,
0,
0,
0,
0
}, // kernel_size = 4
{
conv5x5s1_sse,
0,
0,
0
}, // kernel_size = 5
{
0,
0,
0,
0
}, // kernel_size = 6
{
0,
0,
0,
0
} // kernel_size = 5
} // kernel_size = 7
};

typedef void (*conv_int8_func)(const Mat&, Mat&, const Mat&, const Option&);

// kernel_size x stride
conv_int8_func conv_int8_func_table[5][5] =
conv_int8_func conv_int8_func_table[7][4] =
{
{
conv1x1s1_int8_sse,
conv1x1s2_int8_sse,
0,
0,
0
}, // kernel_size = 1
{
0,
0,
0,
0,
0
}, // kernel_size = 2
{
@@ -218,22 +267,31 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
conv3x3s2_int8_sse,
0,
0,
0
}, // kernel_size = 3
{
0,
0,
0,
0,
0
}, // kernel_size = 4
{
conv5x5s1_int8_sse,
conv5x5s2_int8_sse,
0,
0
}, // kernel_size = 5
{
0,
0,
0,
0
}, // kernel_size = 6
{
conv7x7s1_int8_sse,
conv7x7s2_int8_sse,
0,
0
} // kernel_size = 5
} // kernel_size = 7
};

conv_func conv = 0;
@@ -322,21 +380,69 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option

if (use_int8_inference)
{
conv_int8(bottom_blob_bordered, top_blob, weight_data, opt);

// dequantize, reverse scale inplace
if (use_int8_requantize == true)
{
ncnn::Option opt_g = opt;
opt_g.blob_allocator = top_blob.allocator;
Mat top_blob_tm;
top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator);
if (top_blob_tm.empty())
return -100;
top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator);
if (top_blob.empty())
return -100;

if (use_winograd3x3)
conv3x3s1_winograd23_int8_sse(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_data, opt);
else
conv_int8(bottom_blob_bordered, top_blob_tm, weight_data, opt);

dequantize->forward_inplace(top_blob, opt_g);
// requantize, reverse scale inplace
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<num_output; p++)
{
ncnn::Option opt_g = opt;
opt_g.num_threads = 1;
opt_g.blob_allocator = top_blob.allocator;

Mat top_blob_tm_g = top_blob_tm.channel_range(p, 1);
Mat top_blob_g = top_blob.channel_range(p, 1);
requantize_ops[p]->forward(top_blob_tm_g, top_blob_g, opt_g);
}
}
else
{
top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator);
if (top_blob.empty())
return -100;

if (use_winograd3x3)
conv3x3s1_winograd23_int8_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data, opt);
else
conv_int8(bottom_blob_bordered, top_blob, weight_data, opt);

// dequantize, reverse scale inplace
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<num_output; p++)
{
ncnn::Option opt_g = opt;
opt_g.num_threads = 1;
opt_g.blob_allocator = top_blob.allocator;

Mat top_blob_g = top_blob.channel_range(p, 1);
dequantize_ops[p]->forward_inplace(top_blob_g, opt_g);
}
}
return 0;
}

conv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);

if (use_winograd3x3)
{
conv3x3s1_winograd23_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data, bias_data, opt);
}
else
conv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
return 0;
}



+ 8
- 0
src/layer/x86/convolution_x86.h View File

@@ -24,8 +24,16 @@ typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option
// x86 variant of the Convolution layer.
// On top of the base class it carries the state needed for a Winograd 3x3 path.
class Convolution_x86 : public Convolution
{
public:
// Loads the base-class parameters and decides whether use_winograd3x3 is enabled.
virtual int load_param(const ParamDict& pd);

// Loads the base-class weights and, if use_winograd3x3 is set, fills weight_3x3_winograd23_data.
virtual int load_model(const ModelBin& mb);

virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
virtual int forwardDilation(const Mat& bottom_blob, Mat &top_blob, conv_func conv, const Option& opt) const;

public:
// true when the Winograd 3x3 stride-1 path was selected in load_param
bool use_winograd3x3;
// kernel pre-transformed for the Winograd path; written by load_model
Mat weight_3x3_winograd23_data;
};

} // namespace ncnn


+ 59
- 19
src/layer/x86/convolutiondepthwise_x86.cpp View File

@@ -134,7 +134,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
}

const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

Mat bottom_blob_unbordered = bottom_blob;
if (use_int8_inference && elemsize != 1)
@@ -159,8 +159,8 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
quantize_ops[g]->forward(bottom_blob_g, bottom_blob_int8_g, opt_g);
}

bottom_blob_unbordered = bottom_blob_int8;
}
bottom_blob_unbordered = bottom_blob_int8;
}

Mat bottom_blob_bordered = bottom_blob_unbordered;
if (pad_w > 0 || pad_h > 0)
@@ -203,25 +203,65 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
{
if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2))
{
if (stride_w == 1 && stride_h == 1)
if (use_int8_requantize)
{
convdw3x3s1_int8_sse(bottom_blob_bordered, top_blob, weight_data, opt);
Mat top_blob_tm;
top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator);
if (top_blob_tm.empty())
return -100;
top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator);
if (top_blob.empty())
return -100;

if (stride_w == 1 && stride_h == 1)
{
convdw3x3s1_int8_sse(bottom_blob_bordered, top_blob_tm, weight_data, opt);
}
else if (stride_w == 2 && stride_h == 2)
{
convdw3x3s2_int8_sse(bottom_blob_bordered, top_blob_tm, weight_data, opt);
}

// requantize, reverse scale inplace
#pragma omp parallel for num_threads(opt.num_threads)
for (int g=0; g<group; g++)
{
ncnn::Option opt_g = opt;
opt_g.num_threads = 1;
opt_g.blob_allocator = top_blob.allocator;

Mat top_blob_tm_g = top_blob_tm.channel_range(g, 1);
Mat top_blob_g = top_blob.channel_range(g, 1);
requantize_ops[g]->forward(top_blob_tm_g, top_blob_g, opt_g);
}
}
else if (stride_w == 2 && stride_h == 2)
else
{
convdw3x3s2_int8_sse(bottom_blob_bordered, top_blob, weight_data, opt);
}

// dequantize, reverse scale inplace
#pragma omp parallel for num_threads(opt.num_threads)
for (int g=0; g<group; g++)
{
ncnn::Option opt_g = opt;
opt_g.num_threads = 1;
opt_g.blob_allocator = top_blob.allocator;

Mat top_blob_g = top_blob.channel(g);
dequantize_ops[g]->forward_inplace(top_blob_g, opt_g);
top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator);
if (top_blob.empty())
return -100;

if (stride_w == 1 && stride_h == 1)
{
convdw3x3s1_int8_sse(bottom_blob_bordered, top_blob, weight_data, opt);
}
else if (stride_w == 2 && stride_h == 2)
{
convdw3x3s2_int8_sse(bottom_blob_bordered, top_blob, weight_data, opt);
}

// dequantize, reverse scale inplace
#pragma omp parallel for num_threads(opt.num_threads)
for (int g=0; g<group; g++)
{
ncnn::Option opt_g = opt;
opt_g.num_threads = 1;
opt_g.blob_allocator = top_blob.allocator;

Mat top_blob_g = top_blob.channel(g);
dequantize_ops[g]->forward_inplace(top_blob_g, opt_g);
}
}

return 0;


+ 109
- 0
src/net.cpp View File

@@ -16,6 +16,9 @@
#include "layer_type.h"
#include "modelbin.h"
#include "paramdict.h"
#include "convolution.h"
#include "convolutiondepthwise.h"
#include "relu.h"

#include <stdarg.h>
#include <stdio.h>
@@ -679,6 +682,8 @@ int Net::load_model(FILE* fp)
}
#endif // NCNN_VULKAN

fuse_network();

return ret;
}

@@ -898,6 +903,110 @@ int Net::load_model(const unsigned char* _mem)
return mem - _mem;
}

// Walks the loaded graph and enables requantized output on int8 convolution layers.
//
// A Convolution / ConvolutionDepthWise layer running int8 inference is switched to
// requantize mode when one of the consumers of its first output is a ReLU whose first
// consumer is either
//   - another Convolution / ConvolutionDepthWise, or
//   - a Split with at least two outputs, each of which is first consumed by a
//     Convolution, ConvolutionDepthWise or PriorBox layer.
// The producer's top_blob_int8_scale is taken from the consumer's bottom blob scale.
//
// Compiles to an empty function unless both NCNN_STRING and NCNN_REQUANT are enabled.
void Net::fuse_network()
{
    // set the int8 op fusion:requantize
#if NCNN_STRING && NCNN_REQUANT
    for (size_t i=0; i<layers.size(); i++)
    {
        Layer* layer = layers[i];

        const bool is_conv = layer->type == "Convolution";
        const bool is_convdw = layer->type == "ConvolutionDepthWise";
        if (!is_conv && !is_convdw)
            continue;

        // read the flag through the layer's own class, not through Convolution for both kinds
        const bool use_int8 = is_conv ? ((Convolution*)layer)->use_int8_inference
                                      : ((ConvolutionDepthWise*)layer)->use_int8_inference;
        if (!use_int8)
            continue;

        for (size_t n=0; n<blobs[layer->tops[0]].consumers.size(); n++)
        {
            int layer_next_index = blobs[layer->tops[0]].consumers[n];
            Layer* layer_next = layers[layer_next_index];

            // only the conv -> ReLU -> ... pattern is fused; Pooling is still a ToDo
            if (layer_next->type != "ReLU")
                continue;

            // a ReLU whose output is a network output has no consumer to take a scale from
            if (blobs[layer_next->tops[0]].consumers.size() == 0)
                continue;

            int layer_next_2_index = blobs[layer_next->tops[0]].consumers[0];
            Layer* layer_next_2 = layers[layer_next_2_index];

            if (layer_next_2->type == "Convolution" || layer_next_2->type == "ConvolutionDepthWise")
            {
                const bool next_is_conv = layer_next_2->type == "Convolution";

                if (is_conv)
                {
                    Convolution* conv = (Convolution*)layer;

                    conv->use_int8_requantize = true;
                    if (next_is_conv)
                        conv->top_blob_int8_scale = ((Convolution*)layer_next_2)->bottom_blob_int8_scale;
                    else
                        conv->top_blob_int8_scale = ((ConvolutionDepthWise*)layer_next_2)->bottom_blob_int8_scales[0];
                    conv->create_requantize_op();
                }
                else
                {
                    ConvolutionDepthWise* convdw = (ConvolutionDepthWise*)layer;

                    convdw->use_int8_requantize = true;
                    if (next_is_conv)
                        convdw->top_blob_int8_scale = ((Convolution*)layer_next_2)->bottom_blob_int8_scale;
                    else
                        convdw->top_blob_int8_scale = ((ConvolutionDepthWise*)layer_next_2)->bottom_blob_int8_scales[0];
                    convdw->create_requantize_op();
                }
            }
            else if (layer_next_2->type == "Split")
            {
                const size_t split_top_count = layer_next_2->tops.size();

                // every split output must be first consumed by a layer that accepts the fused output
                bool all_conv = true;
                for (size_t t=0; t<split_top_count; t++)
                {
                    // an unconsumed split output cannot be shown to accept int8 data
                    if (blobs[layer_next_2->tops[t]].consumers.size() == 0)
                    {
                        all_conv = false;
                        break;
                    }

                    int layer_next_3_index = blobs[layer_next_2->tops[t]].consumers[0];
                    Layer* layer_next_3 = layers[layer_next_3_index];

                    if (layer_next_3->type != "Convolution" && layer_next_3->type != "ConvolutionDepthWise" && layer_next_3->type != "PriorBox")
                    {
                        all_conv = false;
                        break;
                    }
                }

                if (all_conv && split_top_count >= size_t(2))
                {
                    // NOTE(review): only Convolution consumers provide a scale here, and the
                    // last one wins; ConvolutionDepthWise consumers are not consulted - confirm intended.
                    for (size_t t=0; t<split_top_count; t++)
                    {
                        int layer_next_3_index = blobs[layer_next_2->tops[t]].consumers[0];
                        Layer* layer_next_3 = layers[layer_next_3_index];

                        if (layer_next_3->type != "Convolution")
                            continue;

                        if (is_conv)
                            ((Convolution*)layer)->top_blob_int8_scale = ((Convolution*)layer_next_3)->bottom_blob_int8_scale;
                        else
                            ((ConvolutionDepthWise*)layer)->top_blob_int8_scale = ((Convolution*)layer_next_3)->bottom_blob_int8_scale;
                    }

                    // enable requantize through the producer's own class
                    if (is_conv)
                    {
                        ((Convolution*)layer)->use_int8_requantize = true;
                        ((Convolution*)layer)->create_requantize_op();
                    }
                    else
                    {
                        ((ConvolutionDepthWise*)layer)->use_int8_requantize = true;
                        ((ConvolutionDepthWise*)layer)->create_requantize_op();
                    }
                }
            }
        }
    }
#endif
}

void Net::clear()
{
blobs.clear();


+ 4
- 0
src/net.h View File

@@ -76,6 +76,10 @@ public:
// return bytes consumed
int load_model(const unsigned char* mem);

// parse the structure of network
// fuse int8 op dequantize and quantize by requantize
void fuse_network();

// unload network structure and weight data
void clear();



+ 2
- 0
src/platform.h.in View File

@@ -22,5 +22,7 @@
#cmakedefine01 NCNN_PIXEL
#cmakedefine01 NCNN_PIXEL_ROTATE
#cmakedefine01 NCNN_VULKAN
#cmakedefine01 NCNN_REQUANT
#cmakedefine01 NCNN_IM2COL_SGEMM

#endif // NCNN_PLATFORM_H

+ 1
- 1
tools/caffe/caffe2ncnn.cpp View File

@@ -685,7 +685,7 @@ int main(int argc, char** argv)

if (int8_scale_term)
{
if ((int)weight_int8scale.size() == num_group && (int)blob_int8scale.size() == num_group)
if ((int)weight_int8scale.size() == num_group)
{
fprintf(pp, " 8=1");
}


Loading…
Cancel
Save