diff --git a/docs/Home.md b/docs/Home.md
new file mode 100644
index 000000000..78d2d42d3
--- /dev/null
+++ b/docs/Home.md
@@ -0,0 +1,100 @@
+### input data and extract output
+```
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include "net.h"
+
+int main()
+{
+    cv::Mat img = cv::imread("image.ppm", CV_LOAD_IMAGE_GRAYSCALE);
+    int w = img.cols;
+    int h = img.rows;
+
+    // subtract 128, norm to -1 ~ 1
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(img.data, ncnn::Mat::PIXEL_GRAY, w, h, 60, 60);
+    float mean[1] = { 128.f };
+    float norm[1] = { 1/128.f };
+    in.substract_mean_normalize(mean, norm);
+
+    ncnn::Net net;
+    net.load_param("model.param");
+    net.load_model("model.bin");
+
+    ncnn::Extractor ex = net.create_extractor();
+    ex.set_light_mode(true);
+    ex.set_num_threads(4);
+
+    ex.input("data", in);
+
+    ncnn::Mat feat;
+    ex.extract("output", feat);
+
+    return 0;
+}
+
+```
+
+### print Mat content
+```
+void pretty_print(const Mat& m)
+{
+    for (int q=0; q<m.c; q++)
+    {
+        const float* ptr = m.channel(q);
+        for (int y=0; y<m.h; y++)
+        {
+            for (int x=0; x<m.w; x++)
+            {
+                printf("%f ", ptr[x]);
+            }
+            ptr += m.w;
+            printf("\n");
+        }
+        printf("------------------------\n");
+    }
+}
+```
+
+### caffe-android-lib+openblas vs ncnn
+use squeezenet v1.1, nexus6p, android 7.1.2
+
+memory usage is the RSS item in top utility output
+
+|compare item|caffe-android-lib+openblas|ncnn|
+|---|---|---|
+|inference time(1 thread)|228ms|88ms|
+|inference time(8 thread)|152ms|38ms|
+|memory usage|138.16M|21.56M|
+|library binary size|6.9M|<500K|
+|compatibility|armeabi-v7a-hard with neon or arm64-v8a|armeabi-v7a with neon or arm64-v8a|
+|thirdparty dependency|boost gflags glog lmdb openblas opencv protobuf|none|
+
+### FAQ
+Q ncnn的起源
+
+A 深度学习算法要在手机上落地，caffe依赖太多，手机上也没有cuda，需要个又快又小的前向网络实现
+
+
+Q ncnn名字的来历
+
+A cnn就是卷积神经网络的缩写，开头的n算是一语n关。比如new/next(全新的实现)，naive(ncnn是naive实现)，neon(ncnn最初为手机优化)，up主名字(←_←)
+
+
+Q 支持哪些平台
+
+A 跨平台，主要支持 android，次要支持 ios / linux / windows
+
+
+Q 计算精度如何
+
+A armv7 neon float 不遵照 ieee754 标准，有些采用快速实现(如exp sin等)，速度快但确保精度足够高
+
+
+Q pc 上的速度很慢
+
+A pc都是x86架构的，基本没做什么优化，主要用来核对结果，毕竟up主精力是有限的（
+
+
+Q 为何没有 logo
+
+A up主是mc玩家，所以开始是找了萌萌的苦力怕当看板娘的，但是这样子会侵权对吧，只好空出来了...
diff --git a/docs/application-with-ncnn-inside.md b/docs/application-with-ncnn-inside.md
new file mode 100644
index 000000000..5ab1bed82
--- /dev/null
+++ b/docs/application-with-ncnn-inside.md
@@ -0,0 +1,48 @@
+![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.azarlive.android.png) Azar-视频交友与聊天 June 20, 2018
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.cyberlink.youcammakeup.png) 玩美彩妆 - 自拍美颜 & 智能美妆相机 June 21, 2018
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.fotoable.makeup.png) You Makeup Photo Camera 2.1.5
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.fotoable.cartoon.cam.png) 滤镜相机 Cartoon Camera- Paintlab January 24, 2018
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.pipcamera.activity.png) 画中画相机 January 30, 2018
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.hefe.pro.editor.png) Photo Editor Pro 1.1.4.1029
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.apus.camera.id.png) Air Camera 1.7.3.1002
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.fotoable.fotobeauty.png) 美丽拍－懂你的自拍美颜相机 February 1, 2018
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.perfectcorp.ycf.png) 玩美Fun-特效动图自拍滤镜&分享相片！ May 15, 2018
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.ufotosoft.justshot.png) Sweet Snap - 生活贴纸&图像编辑器,实时滤镜,录制视频和有趣表情包,美容效果 June 22, 2018
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.wantu.activity.png) 玩图 - 美图相机 March 29, 2018
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.meitu.meiyancamera.png) 美颜相机 7.6.95
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.lyrebirdstudio.colorizer.lite.png) 自拍相机 - 照片编辑器和过滤器和贴纸 April 27, 2018
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.apusapps.fulakora.png) APUS Camera 1.7.2.1001
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180626/video.like.png) LIKE短视频 — 魔法视频自拍神器 2.2.4
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.qiyi.video.png) 爱奇艺 9.6.0
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.eg.android.AlipayGphone.png) 支付宝 10.1.25.752
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.perfectcorp.beautycircle.png) YouCam Shop - World's First AR Makeup Shopping App 3.4.0
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.lyrebirdstudio.beauty.png) 美容化妆自拍相机和自拍照片编辑器 1.4.8
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.jingdong.app.mall.png) 京东-挑好物，上京东 7.0.8
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.versa.png) Versa 2.9.2
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.tencent.weishi.png) 微视 4.3.1.88
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.smile.gifmaker.png) 快手短视频—国民短视频平台 5.4.2.5360
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.sdu.didi.psnger.png) 滴滴出行 5.3.0
+
diff --git a/docs/benchmark/the-benchmark-of-caffe-android-lib,-mini-caffe,-and-ncnn.md b/docs/benchmark/the-benchmark-of-caffe-android-lib,-mini-caffe,-and-ncnn.md
new file mode 100644
index 000000000..a3ea325f3
--- /dev/null
+++ b/docs/benchmark/the-benchmark-of-caffe-android-lib,-mini-caffe,-and-ncnn.md
@@ -0,0 +1,118 @@
+caffe-android-lib https://github.com/sh1r0/caffe-android-lib
+
+mini-caffe https://github.com/luoyetx/mini-caffe
+
+openblas-0.2.20 https://github.com/xianyi/OpenBLAS
+
+ncnn https://github.com/Tencent/ncnn
+
+***
+
+squeezenet_v1.1 https://github.com/DeepScale/SqueezeNet/tree/master/SqueezeNet_v1.1
+
+mobilenet_v1 https://github.com/shicai/MobileNet-Caffe
+
+vgg16 https://gist.github.com/ksimonyan/211839e770f7b538e2d8
+
+***
+
+Host platform and compiler configuration: 
+
+fedora 27, android-ndk-r15c, target arch = arm64-v8a
+
+we manually update openblas package to version 0.2.20 in caffe-android-lib for better performance
+
+
+***
+
+Device: Nexus 6p
+
+OS: LineageOS 15.1(Android 8.1.0), ROM newly flashed without any third-party APP installed
+
+CPU: Snapdragon 810 (Cortex-A57 2.0GHz x 4 + Cortex-A53 1.55GHz x 4)
+
+RAM: 3G
+
+
+***
+
+Benchmark method: 
+
+Run squeezenet, mobilenet inference 23 times in a loop, discard the first three warmup records, and then calculate the average inference time
+
+Run vgg16 9 times in a loop, discard the first warmup record, and then calculate the average inference time
+
+Since the system may force the SoC to lower its frequency when the temperature gets high, sleep for over 1 minute before each benchmark to prevent this issue.
+
+fps performance: fps = 1000 / avgtime(ms)
+
+cpu usage: take the CPU value in top utility output
+
+memory usage: take the RES value in top utility output
+
+the overall power consumption and performance per watt: 
+
+Disable usb charging: adb shell echo 0 > /sys/class/power_supply/battery/charging_enabled
+
+current(μA) = adb shell cat /sys/class/power_supply/battery/current_now (multiply -1 for 810 chip)
+
+voltage(μV) = adb shell cat /sys/class/power_supply/battery/voltage_now
+
+power consumption(mW) = current / 1000 * voltage / 1000 / 1000
+
+performance per watt(1000fps/W) = fps / power consumption * 1000
+
+
+***
+
+The binary size after debug stripping
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180413/1.jpg)
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180413/2.jpg)
+
+***
+
+squeezenet
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180413/3.jpg)
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180413/4.jpg)
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180413/5.jpg)
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180413/6.jpg)
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180413/7.jpg)
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180413/8.jpg)
+***
+
+mobilenet
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180413/9.jpg)
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180413/10.jpg)
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180413/11.jpg)
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180413/12.jpg)
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180413/13.jpg)
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180413/14.jpg)
+***
+
+vgg16
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180413/15.jpg)
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180413/16.jpg)
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180413/17.jpg)
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180413/18.jpg)
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180413/19.jpg)
+
+![](https://github.com/nihui/ncnn-assets/raw/master/20180413/20.jpg)
diff --git a/docs/benchmark/vulkan-conformance-test.md b/docs/benchmark/vulkan-conformance-test.md
new file mode 100644
index 000000000..02cff194f
--- /dev/null
+++ b/docs/benchmark/vulkan-conformance-test.md
@@ -0,0 +1,46 @@
+
+|device|gpu|api version|driver version|squeezenet|mobilenetssd|yolov3|
+|---|---|---|---|---|---|---|
+|intel-i7-7700|Intel(R) HD Graphics 630 (Kaby Lake GT2)|1.1.90|18.3.4|y|y|y|
+|GTX-1060|GeForce GTX 1060 3GB|1.1.95|418.172.0|y|y|y|
+|AMD-Radeon R9 M290X|AMD RADV PITCAIRN (LLVM 7.0.1)|1.1.70|18.3.4|y|y|y|
+|iphone-5s|Apple A7 GPU|1.0.82|0.2.1825|y|y|y|
+|huawei-nexus6p|Adreno (TM) 430|1.0.49|35.601.2388|y|y|y|
+|vivo-y1731ca|Adreno (TM) 505|1.0.61|37.845.1429|y|n|n|
+|vivo-y85a|Adreno (TM) 506|1.0.61|2.944.3349|y|n|n|
+|vivo-x9s|Adreno (TM) 510|1.0.61|42.917.1172|y|y|y|
+|meizu-15|Adreno (TM) 512|1.0.38|29.189.223|n|n|n|
+|chuizi-jianguo-pro2|Adreno (TM) 512|1.0.38|21.219.2615|n|n|n|
+|xiaomi-note3|Adreno (TM) 512|1.0.38|39.369.2305|n|n|n|
+|oppo-r11|Adreno (TM) 512|1.0.38|42.977.756|n|n|n|
+|xiaomi-6x|Adreno (TM) 512|1.0.61|14.322.3739|y|y|y|
+|oppo-r11s+|Adreno (TM) 512|1.0.61|35.1004.3936|y|y|y|
+|vivo-x20a|Adreno (TM) 512|1.0.61|43.10.3141|y|y|y|
+|vivo-v1816a|Adreno (TM) 512|1.0.61|43.10.3141|y|y|y|
+|vivo-z1|Adreno (TM) 512|1.0.61|43.10.3141|y|y|y|
+|xiaomi-redmi-note5|Adreno (TM) 512|1.0.61|63.219.2354|y|y|y|
+|google-pixel|Adreno (TM) 530|1.1.87|512.354.0|y|y|y|
+|nubia-z17|Adreno (TM) 540|1.0.38|1.28.32|n|n|n|
+|samsung-galaxys8+|Adreno (TM) 540|1.0.61|29.896.3583|y|y|y|
+|oneplus-5t|Adreno (TM) 540|1.0.61|18.1023.2233|y|y|y|
+|google-pixel2|Adreno (TM) 540|1.1.66|512.313.0|y|y|y|
+|essential-ph-1|Adreno (TM) 540|1.1.66|512.319.0|y|y|y|
+|vivo-x23|Adreno (TM) 615|1.0.66|33.870.3328|y|y|y|
+|vivo-v1813ba|Adreno (TM) 615|1.0.66|33.870.3328|y|y|y|
+|xiaomi-8se|Adreno (TM) 616|1.0.66|30.913.18|y|y|y|
+|vivo-nex-a|Adreno (TM) 616|1.0.66|33.870.3328|y|y|y|
+|xiaomi-mix2s|Adreno (TM) 630|1.0.61|4.91.2976|y|y|y|
+|heisha-SKR-A0|Adreno (TM) 630|1.0.61|36.173.3586|y|y|y|
+|heisha-SKR-A0|Adreno (TM) 630|1.0.66|47.448.1532|y|y|y|
+|oneplus-6|Adreno (TM) 630|1.1.66|512.324.0|y|y|y|
+|vivo-iQOO|Adreno (TM) 640|1.1.87|512.361.0|y|y|y|
+|meitu-m8s|Mali-T880|1.0.14|500.910.1017|n|n|n|
+|huawei-p10|Mali-G71|1.0.53|151.949.2145|n|n|n|
+|huawei-mate9|Mali-G71|1.0.53|151.949.2145|n|n|n|
+|oppo-a73|Mali-G71|1.0.47|575.795.1934|n|n|n|
+|vivo-y97|Mali-G72|1.0.58|240.537.3580|n|n|n|
+|huawei-mate10|Mali-G72|1.0.66|14.0.0|y|y|y|
+|huawei-v10|Mali-G72|1.0.66|14.0.0|y|y|y|
+|huawei-vce-al00|Mali-G72|1.0.66|14.0.0|y|y|y|
+|huawei-mate20|Mali-G76|1.0.66|14.0.0|y|y|y|
+|huawei-pct-al10|Mali-G76|1.0.66|14.0.0|y|y|y|
\ No newline at end of file
diff --git a/docs/developer-guide/aarch64-mix-assembly-and-intrinsic.md b/docs/developer-guide/aarch64-mix-assembly-and-intrinsic.md
new file mode 100644
index 000000000..358c66538
--- /dev/null
+++ b/docs/developer-guide/aarch64-mix-assembly-and-intrinsic.md
@@ -0,0 +1,57 @@
+```
+// v寄存器全部使用 %.4s
+// 128-bit vreg matches %.4s
+// a += b * c
+float32x4_t _a = vld1q_f32(a);
+float32x4_t _b = vld1q_f32(b);
+float32x4_t _c = vld1q_f32(c);
+asm volatile(
+    "fmla  %0.4s, %2.4s, %3.4s"
+    : "=w"(_a) // %0
+    : "0"(_a),
+      "w"(_b), // %2
+      "w"(_c)  // %3
+    :
+);
+```
+```
+// v寄存器使用低64位  %.2s
+// low 64-bit vreg matches %.2s
+// a += b * c
+float32x2_t _a = vld1_f32(a);
+float32x2_t _b = vld1_f32(b);
+float32x2_t _c = vld1_f32(c);
+asm volatile(
+    "fmla  %0.2s, %2.2s, %3.2s"
+    : "=w"(_a) // %0
+    : "0"(_a),
+      "w"(_b), // %2
+      "w"(_c)  // %3
+    :
+);
+```
+```
+// v寄存器单路使用 %.s[0] %.s[1] %.s[2] %.s[3]
+// 32-bit register matches %.s[0]
+// a += b * c[0]
+// a += b * c[1]
+// a += b * c[2]
+// a += b * c[3]
+float32x4_t _a = vld1q_f32(a);
+float32x4_t _b = vld1q_f32(b);
+float32x4_t _c = vld1q_f32(c);
+asm volatile(
+    "fmla  %0.4s, %2.4s, %3.s[0]\n"
+    "fmla  %0.4s, %2.4s, %3.s[1]\n"
+    "fmla  %0.4s, %2.4s, %3.s[2]\n"
+    "fmla  %0.4s, %2.4s, %3.s[3]\n"
+    : "=w"(_a) // %0
+    : "0"(_a),
+      "w"(_b), // %2
+      "w"(_c)  // %3
+    :
+);
+```
+
+
+qwq
diff --git a/docs/developer-guide/add-custom-layer.zh.md b/docs/developer-guide/add-custom-layer.zh.md
new file mode 100644
index 000000000..1eb8bb7fb
--- /dev/null
+++ b/docs/developer-guide/add-custom-layer.zh.md
@@ -0,0 +1,108 @@
+这里举个例子添加 Relu6，即 std::min(6.f, std::max(0.f, val))
+
+```
+Input            input   0 1 input
+Convolution      conv2d  1 1 input conv2d 0=32 1=1 2=1 3=1 4=0 5=0 6=768
+Relu6            relu6   1 1 conv2d relu6
+Pooling          maxpool 1 1 relu6 maxpool 0=0 1=3 2=2 3=-233 4=0
+```
+
+## method 1 -- 注册自定义层
+```
+#include "layer.h"
+
+class Relu6 : public ncnn::Layer
+{
+public:
+    Relu6()
+    {
+        one_blob_only = true;
+        support_inplace = true;
+    }
+
+    virtual int forward_inplace(ncnn::Mat& bottom_top_blob, const ncnn::Option& opt) const
+    {
+        int w = bottom_top_blob.w;
+        int h = bottom_top_blob.h;
+        int channels = bottom_top_blob.c;
+        int size = w * h;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q=0; q<channels; q++)
+        {
+            float* outptr = bottom_top_blob.channel(q);
+
+            for (int i=0; i<size; i++)
+            {
+                outptr[i] = std::min(6.f, std::max(0.f, outptr[i]));
+            }
+        }
+
+        return 0;
+    }
+};
+
+DEFINE_LAYER_CREATOR(Relu6)
+```
+
+```
+ncnn::Net net;
+net.register_custom_layer("Relu6", Relu6_layer_creator);
+
+net.load_param("model.param");
+net.load_model("model.bin");
+
+ncnn::Extractor ex = net.create_extractor();
+
+ex.input("input", inputmat);
+ex.extract("maxpool", maxpoolmat);
+```
+
+
+## method 2 -- 处理中间 blob
+```
+ncnn::Net net;
+net.load_param("model.param");
+net.load_model("model.bin");
+
+ncnn::Extractor ex = net.create_extractor();
+
+ex.input("input", inputmat);
+ex.extract("conv2d", conv2dmat);
+
+// relu6
+ncnn::Mat relu6mat = conv2dmat.clone();
+{
+    int w = relu6mat.w;
+    int h = relu6mat.h;
+    int channels = relu6mat.c;
+    int size = w * h;
+
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        float* outptr = relu6mat.channel(q);
+        for (int i=0; i<size; i++)
+        {
+            outptr[i] = std::min(6.f, std::max(0.f, outptr[i]));
+        }
+    }
+}
+
+ex.input("relu6", relu6mat);
+ex.extract("maxpool", maxpoolmat);
+
+```
+
+## method 3 -- 直接修改 ncnn
+实现 src/layer/relu6.h
+
+实现 src/layer/relu6.cpp
+
+修改 src/CMakeLists.txt
+```
+ncnn_add_layer(UnaryOp)
+ncnn_add_layer(ConvolutionDepthWise)
+ncnn_add_layer(Padding)
+ncnn_add_layer(Relu6)
+```
diff --git a/docs/developer-guide/armv7-mix-assembly-and-intrinsic.md b/docs/developer-guide/armv7-mix-assembly-and-intrinsic.md
new file mode 100644
index 000000000..ea395c82b
--- /dev/null
+++ b/docs/developer-guide/armv7-mix-assembly-and-intrinsic.md
@@ -0,0 +1,130 @@
+```
+// d寄存器全部使用 %P
+// d reg matches %P
+// a += b * c
+float32x2_t _a = vld1_f32(a);
+float32x2_t _b = vld1_f32(b);
+float32x2_t _c = vld1_f32(c);
+asm volatile(
+    "vmla.f32  %P0, %P2, %P3"
+    : "=w"(_a) // %0
+    : "0"(_a),
+      "w"(_b), // %2
+      "w"(_c)  // %3
+    :
+);
+```
+```
+// q寄存器全部使用 %q
+// q reg matches %q
+// a += b * c
+float32x4_t _a = vld1q_f32(a);
+float32x4_t _b = vld1q_f32(b);
+float32x4_t _c = vld1q_f32(c);
+asm volatile(
+    "vmla.f32  %q0, %q2, %q3"
+    : "=w"(_a) // %0
+    : "0"(_a),
+      "w"(_b), // %2
+      "w"(_c)  // %3
+    :
+);
+```
+```
+// d寄存器单路使用 %P[0] %P[1]
+// 32bit d reg matches %P[0]
+// a += b * c[0]
+// a += b * c[1]
+float32x2_t _a = vld1_f32(a);
+float32x2_t _b = vld1_f32(b);
+float32x2_t _c = vld1_f32(c);
+asm volatile(
+    "vmla.f32  %P0, %P2, %P3[0]\n"
+    "vmla.f32  %P0, %P2, %P3[1]\n"
+    : "=w"(_a) // %0
+    : "0"(_a),
+      "w"(_b), // %2
+      "w"(_c)  // %3
+    :
+);
+```
+```
+// q寄存器单路使用 %e[0] %e[1] %f[0] %f[1]
+// 32-bit q reg matches %e[0]
+// a += b * c[0]
+// a += b * c[1]
+// a += b * c[2]
+// a += b * c[3]
+float32x4_t _a = vld1q_f32(a);
+float32x4_t _b = vld1q_f32(b);
+float32x4_t _c = vld1q_f32(c);
+asm volatile(
+    "vmla.f32  %q0, %q2, %e3[0]\n"
+    "vmla.f32  %q0, %q2, %e3[1]\n"
+    "vmla.f32  %q0, %q2, %f3[0]\n"
+    "vmla.f32  %q0, %q2, %f3[1]\n"
+    : "=w"(_a) // %0
+    : "0"(_a),
+      "w"(_b), // %2
+      "w"(_c)  // %3
+    :
+);
+```
+```
+// q寄存器拆分d寄存器使用 %e %f
+// use %e %f to split q reg into two d regs
+// a += b * c[0]c[1]
+// a += b * c[2]c[3]
+float32x2_t _a = vld1_f32(a);
+float32x2_t _b = vld1_f32(b);
+float32x4_t _c = vld1q_f32(c);
+asm volatile(
+    "vmla.f32  %P0, %P2, %e3\n"
+    "vmla.f32  %P0, %P2, %f3\n"
+    : "=w"(_a) // %0
+    : "0"(_a),
+      "w"(_b), // %2
+      "w"(_c)  // %3
+    :
+);
+```
+```
+// d寄存器声明绑定
+// bind each variable to a specific d reg
+// vmla.f32  d0, d2, d4
+register float32x2_t _a asm("d0") = vld1_f32(a);
+register float32x2_t _b asm("d2") = vld1_f32(b);
+register float32x2_t _c asm("d4") = vld1_f32(c);
+
+asm volatile(
+    "vmla.f32  %P0, %P2, %P3"
+    : "=w"(_a) // %0
+    : "0"(_a),
+      "w"(_b), // %2
+      "w"(_c)  // %3
+    :
+);
+```
+```
+// q寄存器声明绑定
+// bind q reg with data
+// vmla.f32  q0, q1, q2
+register float32x4_t _a asm("q0") = vld1q_f32(a);
+register float32x4_t _b asm("q1") = vld1q_f32(b);
+register float32x4_t _c asm("q2") = vld1q_f32(c);
+
+asm volatile(
+    "vmla.f32  %q0, %q2, %q3"
+    : "=w"(_a) // %0
+    : "0"(_a),
+      "w"(_b), // %2
+      "w"(_c)  // %3
+    :
+);
+```
+
+如果不是因为编译器的bug，寄存器绑定是用不着的，然而。。。
+
+https://gcc.gnu.org/bugzilla/show_bug.cgi?id=41538
+
+qwq
diff --git a/docs/developer-guide/binaryop-broadcasting.md b/docs/developer-guide/binaryop-broadcasting.md
new file mode 100644
index 000000000..b830e8723
--- /dev/null
+++ b/docs/developer-guide/binaryop-broadcasting.md
@@ -0,0 +1,35 @@
+### broadcasting rule
+
+ncnn BinaryOp accepts blobs with different shape
+
+C = BinaryOp(A, B)
+
+shape notation convention is [w], [w,h], [w,h,c]
+
+|type|A|B|C|
+|---|---|---|---|
+|1|[1]|scalar|[1]|
+|2|[1]|[1]|[1]|
+|3|[1]|[2,3]|[2,3]|
+|4|[1]|[2,3,4]|[2,3,4]|
+|5|[2]|scalar|[2]|
+|6|[2]|[1]|[2]|
+|7|[2]|[2]|[2]|
+|8|[3]|[2,3]|[2,3]|
+|9|[4]|[2,3,4]|[2,3,4]|
+|10|[2,3]|scalar|[2,3]|
+|11|[2,3]|[1]|[2,3]|
+|12|[2,3]|[3]|[2,3]|
+|13|[2,3]|[2,3]|[2,3]|
+|14|[3,4]|[2,3,4]|[2,3,4]|
+|15|[2,3,4]|scalar|[2,3,4]|
+|16|[2,3,4]|[1]|[2,3,4]|
+|17|[2,3,4]|[4]|[2,3,4]|
+|18|[2,3,4]|[3,4]|[2,3,4]|
+|19|[2,3,4]|[2,3,4]|[2,3,4]|
+
+some special broadcasting rule exists for model compatibility
+
+|special type|A|B|C|
+|---|---|---|---|
+|1|[2,3,4]|[1,1,4]|[2,3,4]|
diff --git a/docs/developer-guide/custom-allocator.md b/docs/developer-guide/custom-allocator.md
new file mode 100644
index 000000000..2c2a2d17f
--- /dev/null
+++ b/docs/developer-guide/custom-allocator.md
@@ -0,0 +1,63 @@
+Mat structure is now allocator-aware via an extra allocator parameter with default zero value.
+
+The good-old ncnn::fastMalloc()/ncnn::fastFree() will be used for a null allocator.
+
+You could pass a custom allocator to delegate all memory allocation and deallocation.
+
+```
+class Allocator
+{
+public:
+    virtual void* fastMalloc(size_t size) = 0;
+    virtual void fastFree(void* ptr) = 0;
+};
+```
+
+ncnn has already implemented two simple pooled Allocator classes, one with a mutex lock and one without.
+
+```
+ncnn::PoolAllocator locked_mempool;
+ncnn::UnlockedPoolAllocator unlocked_mempool;
+```
+
+the two allocator types in ncnn
+
+* blob allocator
+
+    used to allocate memory for all named blobs, which you could retrieve by Extractor::extract()
+* workspace allocator
+
+    used to allocate memory for internal temporary use in layer implementation, such as the temp blob after padding in convolution
+
+by default, all Extractor instances use the two allocators in the default option
+You can alter them by ncnn::set_default_option()
+or you can set them per Extractor by Extractor::set_blob_allocator()/Extractor::set_workspace_allocator()
+
+blob allocator is guaranteed to be called in-order in layer implementation during each Extractor lifecycle
+while workspace allocator may be called concurrently
+
+the practical usage
+
+* one network, one-by-one inference
+
+    shared unlocked blob allocator for all Extractor
+
+    shared locked workspace allocator for all Extractor
+
+* one network, concurrent inference
+
+    shared unlocked blob allocator for all Extractor in each thread
+
+    shared locked workspace allocator for all Extractor among all threads
+
+* concurrent multiple networks, one-by-one inference for each network
+
+    shared unlocked blob allocator for all Extractor of each network
+
+    shared locked workspace allocator for all Extractor among all networks (for saving memory)
+
+* concurrent multiple networks, concurrent inference for each network
+
+    shared unlocked blob allocator for all Extractor of each network in each thread
+
+    shared locked workspace allocator for all Extractor among all networks (for saving memory)
diff --git a/docs/developer-guide/element-packing.md b/docs/developer-guide/element-packing.md
new file mode 100644
index 000000000..abf02600b
--- /dev/null
+++ b/docs/developer-guide/element-packing.md
@@ -0,0 +1,119 @@
+### what is packing and why
+
+packing is the form of storing multiple short-sized values as one long-sized value.
+
+element packing maps well onto the underlying simd registers, which usually use one very wide register to store multiple values of a given type.
+
+|C|elemsize|elempack|
+|---|---|---|
+|double|8|1|
+|float|4|1|
+|int|4|1|
+|short|2|1|
+|signed char|1|1|
+
+|arm neon|elemsize|elempack|
+|---|---|---|
+|float64x2_t|16|2|
+|float32x4_t|16|4|
+|int32x4_t|16|4|
+|float16x4_t|8|4|
+|int8x8_t|8|8|
+
+Though the real count of values doubles when elempack is two, the wide-sized value is still treated as one value in the view of Mat structure. For example, we want to store 40 float values in Mat object, if elempack 1 is used, Mat width is then 40, while 10 if elempack 4 is used.
+
+|dims|w|h|c|cstep|elemsize|elempack|
+|---|---|---|---|---|---|---|
+|1|40|1|1|40|4|1|
+|1|10|1|1|10|16|4|
+
+### packing style convention
+
+In practice, elempack 1, 4, 8 are the most common cases. It is possible to use any other packing style in theory.
+
+The following table shows the packing axis used in ncnn for different dimensions.
+
+|dims|packing axis|shape before packing|shape after packing|
+|---|---|---|---|
+|1|w|w|w/elempack|
+|2|h|w, h|w, h/elempack|
+|3|c|w, h, c|w, h, c/elempack|
+
+If the packing axis dim is not evenly divisible by elempack, zero padding may be used.
+
+```
+outw = (w + elempack - 1) / elempack;
+```
+
+The following snippet shows the memory layout after elempack=4 on 3-dim Mat
+
+```
+// w=2 h=3 c=4 elempack=1
+0 1
+2 3
+4 5
+
+6 7
+8 9
+10 11
+
+12 13
+14 15
+16 17
+
+18 19
+20 21
+22 23
+
+// w=2 h=3 c=1 elempack=4
+(0,6,12,18) (1,7,13,19)
+(2,8,14,20) (3,9,15,21)
+(4,10,16,22) (5,11,17,23)
+```
+
+### how to convert elempack
+
+There is a convenient wrapper function provided
+```
+// convert to elempack 4 if packing axis dim is evenly divisible by elempack
+// return the identity Mat otherwise
+ncnn::Mat a;
+ncnn::Mat a_packed;
+ncnn::convert_packing(a, a_packed, 4);
+if (a_packed.elempack == 4)
+{
+    // check if packing is successful
+}
+
+// convert to packing 1, aka unpacking, shall be always successful
+ncnn::Mat b;
+ncnn::Mat b_unpacked;
+ncnn::convert_packing(b, b_unpacked, 1);
+```
+
+### handle general interleaved data
+
+Here is an example of using convert packing to convert RGB interleaved data to planar
+
+**NOTE:** The following code is just presented to explain what packing is and the conversion process. Do not use it in production due to its poor performance. Do use ncnn::Mat::from_pixels()
+
+```
+// rgb_interleaved_u8 is RGB RGB RGB ...
+// rgb_interleaved_u8.w = w;
+// rgb_interleaved_u8.h = h;
+// rgb_interleaved_u8.c = 1;
+// rgb_interleaved_u8.elemsize = 3;
+// rgb_interleaved_u8.elempack = 3;
+
+ncnn::Mat rgb_interleaved_u8(w, h, 1, 3, 3);
+ncnn::Mat rgb_planar_u8;
+
+ncnn::convert_packing(rgb_interleaved_u8, rgb_planar_u8, 1);
+
+// rgb_planar_u8 is now RRR ... GGG ... BBB ...
+// rgb_planar_u8.w = w;
+// rgb_planar_u8.h = h;
+// rgb_planar_u8.c = 3;
+// rgb_planar_u8.elemsize = 1;
+// rgb_planar_u8.elempack = 1;
+```
diff --git a/docs/developer-guide/how-to-implement-custom-layer-step-by-step.md b/docs/developer-guide/how-to-implement-custom-layer-step-by-step.md
new file mode 100644
index 000000000..454b1a18f
--- /dev/null
+++ b/docs/developer-guide/how-to-implement-custom-layer-step-by-step.md
@@ -0,0 +1,323 @@
+# step1 create a new empty class
+```
+// mylayer.h
+#include "layer.h"
+using namespace ncnn;
+
+// a new layer type called MyLayer
+class MyLayer : public Layer
+{
+};
+
+// mylayer.cpp
+#include "mylayer.h"
+DEFINE_LAYER_CREATOR(MyLayer)
+```
+
+# step2 declare layer parameters and weights
+```
+// mylayer.h
+#include "layer.h"
+using namespace ncnn;
+
+class MyLayer : public Layer
+{
+private:
+    int channels;// new code
+    float eps;// new code
+    Mat gamma_data;// new code
+};
+
+// mylayer.cpp
+#include "mylayer.h"
+DEFINE_LAYER_CREATOR(MyLayer)
+```
+
+# step3 implement load functions for parameters and weights
+```
+// mylayer.h
+#include "layer.h"
+using namespace ncnn;
+
+class MyLayer : public Layer
+{
+public:
+    virtual int load_param(const ParamDict& pd);// new code
+    virtual int load_model(const ModelBin& mb);// new code
+
+private:
+    int channels;
+    float eps;
+    Mat gamma_data;
+};
+
+// mylayer.cpp
+#include "mylayer.h"
+DEFINE_LAYER_CREATOR(MyLayer)
+
+// new routine for loading parameters
+int MyLayer::load_param(const ParamDict& pd)
+{
+    // details about the relations with param file
+    // https://github.com/Tencent/ncnn/wiki/param-and-model-file-structure
+    //
+    channels = pd.get(0, 0);// parse 0=<int value> entry, default value 0
+    eps = pd.get(1, 0.001f);// parse 1=<float value> entry, default value 0.001f
+
+    return 0;// return zero if success
+}
+
+// new routine for loading weights
+int MyLayer::load_model(const ModelBin& mb)
+{
+    // details about the relations with model file
+    // https://github.com/Tencent/ncnn/wiki/param-and-model-file-structure
+    //
+    // read weights with length of channels * sizeof(float)
+    // the second argument explains as follows
+    // 0 judge the value type automatically, you may get float or float16 or uint8 etc
+    //   depends on the model storage and the supporting target hardware
+    // 1 read float values anyway
+    // 2 read float16 values anyway
+    // 3 read uint8 values anyway
+    gamma_data = mb.load(channels, 1);
+    if (gamma_data.empty())
+        return -100;// return non-zero on error, -100 indicates out-of-memory
+
+    return 0;// return zero if success
+}
+```
+
+# step4 determine forward behavior
+```
+// mylayer.h
+#include "layer.h"
+using namespace ncnn;
+
+class MyLayer : public Layer
+{
+public:
+    MyLayer();// new code
+    virtual int load_param(const ParamDict& pd);
+    virtual int load_model(const ModelBin& mb);
+
+private:
+    int channels;
+    float eps;
+    Mat gamma_data;
+};
+
+// mylayer.cpp
+#include "mylayer.h"
+DEFINE_LAYER_CREATOR(MyLayer)
+
+// new routine for setting forward behavior
+MyLayer::MyLayer()
+{
+    // one input and one output
+    // typical one_blob_only type: Convolution, Pooling, ReLU, Softmax ...
+    // typical non-one_blob_only type: Eltwise, Split, Concat, Slice ...
+    one_blob_only = true;
+
+    // do not change the blob size, modify data in-place
+    // typical support_inplace type: ReLU, Sigmoid ...
+    // typical non-support_inplace type: Convolution, Pooling ...
+    support_inplace = true;
+}
+
+int MyLayer::load_param(const ParamDict& pd)
+{
+    channels = pd.get(0, 0);
+    eps = pd.get(1, 0.001f);
+
+    // you could alter the behavior based on loaded parameter
+    // if (eps == 0.001f)
+    // {
+    //     one_blob_only = false;
+    //     support_inplace = false;
+    // }
+
+    return 0;
+}
+
+int MyLayer::load_model(const ModelBin& mb)
+{
+    gamma_data = mb.load(channels, 1);
+    if (gamma_data.empty())
+        return -100;
+
+    // you could alter the behavior based on loaded weight
+    // if (gamma_data[0] == 0.f)
+    // {
+    //     one_blob_only = false;
+    //     support_inplace = false;
+    // }
+
+    return 0;
+}
+```
+
+# step5 choose proper interface based on forward behavior
+```
+// The base class Layer defines four interfaces for each forward behavior combination
+
+// 1
+virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+
+// 2
+virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
+
+// 3
+virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const;
+
+// 4
+virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+```
+**must** = layer must implement this function
+
+**optional** = layer may implement this function for optimal performance
+
+Sometimes the graph inference path cannot call forward_inplace directly due to data sharing. In this situation the non-inplace forward routine is used; if the optional routine is not implemented, it deep-copies the input blob and calls the inplace forward on the copy. Thus, you can avoid this deep copy by implementing the optional routine to process input to output on the fly.
+
+|one_blob_only|support_inplace|1|2|3|4|
+|---|---|---|---|---|---|
+|false|false|must| | | |
+|false|true|optional| |must| |
+|true|false| |must| | |
+|true|true| |optional| |must|
+
+# step6 implement forward function
+```
+// mylayer.h
+#include "layer.h"
+using namespace ncnn;
+
+class MyLayer : public Layer
+{
+public:
+    MyLayer();
+    virtual int load_param(const ParamDict& pd);
+    virtual int load_model(const ModelBin& mb);
+
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;// new code, optional
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;// new code
+
+private:
+    int channels;
+    float eps;
+    Mat gamma_data;
+};
+
+// mylayer.cpp
+#include "mylayer.h"
+DEFINE_LAYER_CREATOR(MyLayer)
+
+MyLayer::MyLayer()
+{
+    one_blob_only = true;
+    support_inplace = true;
+}
+
+int MyLayer::load_param(const ParamDict& pd)
+{
+    channels = pd.get(0, 0);
+    eps = pd.get(1, 0.001f);
+
+    return 0;
+}
+
+int MyLayer::load_model(const ModelBin& mb)
+{
+    gamma_data = mb.load(channels, 1);
+    if (gamma_data.empty())
+        return -100;
+
+    return 0;
+}
+
+// optional new routine for layer forward function, non-inplace version
+int MyLayer::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    // check input dims, return non-zero on error
+    if (bottom_blob.c != channels)
+        return -1;
+
+    // x = (x + eps) * gamma_per_channel
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    size_t elemsize = bottom_blob.elemsize;
+    int size = w * h;
+
+    top_blob.create(w, h, channels, elemsize, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;// return non-zero on error, -100 indicates out-of-memory
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q=0; q<channels; q++)
+    {
+        const float* ptr = bottom_blob.channel(q);
+        float* outptr = top_blob.channel(q);
+        const float gamma = gamma_data[q];
+
+        for (int i=0; i<size; i++)
+        {
+            outptr[i] = (ptr[i] + eps) * gamma ;
+        }
+    }
+
+    return 0;
+}
+
+// new routine for layer forward function
+int MyLayer::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
+{
+    // check input dims, return non-zero on error
+    if (bottom_top_blob.c != channels)
+        return -1;
+
+    // x = (x + eps) * gamma_per_channel
+
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int size = w * h;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q=0; q<channels; q++)
+    {
+        float* ptr = bottom_top_blob.channel(q);
+        const float gamma = gamma_data[q];
+
+        for (int i=0; i<size; i++)
+        {
+            ptr[i] = (ptr[i] + eps) * gamma ;
+        }
+    }
+
+    return 0;
+}
+```
+
+# step7 integrate with ncnn library
+you may probably need to modify caffe2ncnn or mxnet2ncnn etc. to write your layer specific parameters and weights into ncnn param and model file
+
+the param and model file structure [param-and-model-file-structure](param-and-model-file-structure.md)
+
+```
+// example param file content
+Input            input   0 1 input
+Convolution      conv2d  1 1 input conv2d 0=32 1=1 2=1 3=1 4=0 5=0 6=768
+MyLayer          mylayer 1 1 conv2d mylayer0
+Pooling          maxpool 1 1 mylayer0 maxpool 0=0 1=3 2=2 3=-233 4=0
+```
+
+```
+ncnn::Net net;
+
+// register custom layer before load param and model
+// the layer creator function is always named XYZ_layer_creator, which is defined by the DEFINE_LAYER_CREATOR macro
+net.register_custom_layer("MyLayer", MyLayer_layer_creator);
+
+net.load_param("model.param");
+net.load_model("model.bin");
+```
diff --git a/docs/developer-guide/how-to-write-a-neon-optimized-op-kernel.md b/docs/developer-guide/how-to-write-a-neon-optimized-op-kernel.md
new file mode 100644
index 000000000..754be9cc6
--- /dev/null
+++ b/docs/developer-guide/how-to-write-a-neon-optimized-op-kernel.md
@@ -0,0 +1,38 @@
+# benchmark
+op
+
+# naive C with openmp
+for for for
+
+# unroll, first try
+h
+
+# register allocation
+kernels
+
+# unroll, second try
+simd
+
+# neon intrinsics
+optional
+
+# naive neon assembly with pld
+asm
+
+# pipeline optimize, first try
+more register load mla
+
+# pipeline optimize, second try
+interleave load mla
+
+# pipeline optimize, third try
+loop tail
+
+# usual practice, load/save
+233
+
+# usual practice, unroll
+233
+
+# usual practice, save register
+233
diff --git a/docs/developer-guide/low-level-operation-api.md b/docs/developer-guide/low-level-operation-api.md
new file mode 100644
index 000000000..62b700f61
--- /dev/null
+++ b/docs/developer-guide/low-level-operation-api.md
@@ -0,0 +1,228 @@
+implement elementwise addition with/without broadcast using BinaryOp operation
+```
+void binary_add(const ncnn::Mat& a, const ncnn::Mat& b, ncnn::Mat& c)
+{
+    ncnn::Layer* binaryop = ncnn::create_layer("BinaryOp");
+
+    // op_type 0 is the only parameter configured for this addition example
+    ncnn::ParamDict params;
+    params.set(0, 0);// op_type
+    binaryop->load_param(params);
+
+    // BinaryOp consumes two input blobs and produces a single output blob
+    std::vector<ncnn::Mat> inputs;
+    inputs.push_back(a);
+    inputs.push_back(b);
+
+    std::vector<ncnn::Mat> outputs(1);
+    binaryop->forward(inputs, outputs);
+
+    // hand the result back through the output argument
+    c = outputs[0];
+
+    delete binaryop;
+}
+```
+
+implement 3x3 box blur on three channel image using ConvolutionDepthWise operation
+```
+void convolution_3x3_boxblur_RGB(const ncnn::Mat& rgb, ncnn::Mat& out)
+{
+    // ConvolutionDepthWise with group == num_output == 3 and a 3x3 kernel
+    ncnn::Layer* blur = ncnn::create_layer("ConvolutionDepthWise");
+
+    const int channels = 3;
+    const int weight_count = 3*3*channels;
+
+    ncnn::ParamDict params;
+    params.set(0, channels);// num_output
+    params.set(1, 3);// kernel_w
+    params.set(5, 0);// bias_term
+    params.set(6, weight_count);// weight_data_size
+    params.set(7, channels);// group
+    blur->load_param(params);
+
+    // box blur kernel : every weight has the same value 1/9
+    ncnn::Mat kernels[1];
+    kernels[0].create(weight_count);// weight_data
+    kernels[0].fill(1.f / 9);
+    blur->load_model(ncnn::ModelBinFromMatArray(kernels));
+
+    blur->forward(rgb, out);
+
+    delete blur;
+}
+```
+transpose Mat, chw to cwh
+```
+void transpose(const ncnn::Mat& in, ncnn::Mat& out)
+{
+    ncnn::Layer* permute = ncnn::create_layer("Permute");
+
+    // order_type 1 is the only parameter that is set
+    ncnn::ParamDict params;
+    params.set(0, 1);// order_type
+    permute->load_param(params);
+
+    // Permute has no weights, so the layer can run right after load_param
+    permute->forward(in, out);
+
+    delete permute;
+}
+```
+apply instance normalization
+// x = (x - mean) / sqrt(var)
+```
+void normalize(const ncnn::Mat& in, ncnn::Mat& out)
+{
+    ncnn::Layer* instancenorm = ncnn::create_layer("InstanceNorm");
+
+    // one gamma entry and one beta entry per input channel
+    const int channels = in.c;
+
+    ncnn::ParamDict params;
+    params.set(0, channels);// channels
+    params.set(1, 0.f);// eps
+    instancenorm->load_param(params);
+
+    // gamma is filled with 1 and beta with 0
+    ncnn::Mat gamma_beta[2];
+    ncnn::Mat& gamma = gamma_beta[0];// gamma_data
+    ncnn::Mat& beta = gamma_beta[1];// beta_data
+    gamma.create(channels);
+    beta.create(channels);
+    gamma.fill(1.f);
+    beta.fill(0.f);
+    instancenorm->load_model(ncnn::ModelBinFromMatArray(gamma_beta));
+
+    instancenorm->forward(in, out);
+
+    delete instancenorm;
+}
+```
+
+# cpu -> gpu -> forward -> gpu -> cpu
+```
+// single Convolution layer run on the gpu : cpu Mat -> VkMat -> forward -> VkMat -> cpu Mat
+// NOTE(review): outch, inch, ksize, w, h and random_mat() are not defined in this snippet - presumably provided by the surrounding program
+ncnn::create_gpu_instance();
+
+// this scope closes before destroy_gpu_instance(), so vkdev, the allocators and every mat declared inside are destroyed first
+{
+ncnn::VulkanDevice vkdev;
+
+ncnn::VkWeightBufferAllocator g_weight_vkallocator(&vkdev);
+ncnn::VkBlobBufferAllocator g_blob_vkallocator(&vkdev);
+ncnn::VkStagingBufferAllocator g_staging_vkallocator(&vkdev);
+ncnn::VkWeightStagingBufferAllocator g_weight_staging_vkallocator(&vkdev);
+
+// create layer
+ncnn::Layer* convolution = ncnn::create_layer("Convolution");
+convolution->vkdev = &vkdev;
+
+// load param
+{
+ncnn::ParamDict pd;
+pd.set(0, outch);
+pd.set(1, ksize);
+pd.set(6, outch*inch*ksize*ksize);
+pd.use_vulkan_compute = 1;
+
+convolution->load_param(pd);
+}
+
+// load model
+{
+ncnn::Mat weights[2];
+weights[0] = random_mat(outch*inch*ksize*ksize);
+weights[1] = random_mat(outch);
+
+ncnn::ModelBinFromMatArray mb(weights);
+convolution->load_model(mb);
+}
+
+// upload model
+{
+ncnn::VkTransfer cmd(&vkdev);
+cmd.weight_vkallocator = &g_weight_vkallocator;
+cmd.staging_vkallocator = &g_weight_staging_vkallocator;
+
+convolution->upload_model(cmd);
+
+cmd.submit();
+cmd.wait();
+
+// the upload command has been waited on, the weight staging allocator is cleared
+g_weight_staging_vkallocator.clear();
+}
+
+// create pipeline
+convolution->create_pipeline();
+
+// set default option
+{
+ncnn::Option opt = ncnn::get_default_option();
+
+opt.lightmode = true;
+opt.num_threads = 4;
+opt.blob_allocator = 0;
+opt.workspace_allocator = 0;
+
+opt.vulkan_compute = true;
+opt.blob_vkallocator = &g_blob_vkallocator;
+// workspace reuses the blob allocator
+opt.workspace_vkallocator = &g_blob_vkallocator;
+opt.staging_vkallocator = &g_staging_vkallocator;
+
+ncnn::set_default_option(opt);
+}
+
+ncnn::Mat bottom = random_mat(w, h, inch);
+
+ncnn::VkMat bottom_gpu;
+
+// copy bottom to bottom_gpu
+{
+bottom_gpu.create_like(bottom, &g_blob_vkallocator, &g_staging_vkallocator);
+bottom_gpu.prepare_staging_buffer();
+bottom_gpu.upload(bottom);
+}
+
+ncnn::VkMat top_gpu;
+
+// forward
+{
+ncnn::VkCompute cmd(&vkdev);
+
+cmd.record_upload(bottom_gpu);
+
+convolution->forward(bottom_gpu, top_gpu, cmd);
+
+top_gpu.prepare_staging_buffer();
+
+cmd.record_download(top_gpu);
+
+cmd.submit();
+cmd.wait();
+}
+
+ncnn::Mat top;
+
+// copy top_gpu to top
+{
+top.create_like(top_gpu);
+top_gpu.download(top);
+}
+
+delete convolution;
+
+g_weight_vkallocator.clear();
+g_blob_vkallocator.clear();
+g_staging_vkallocator.clear();
+g_weight_staging_vkallocator.clear();
+}
+
+ncnn::destroy_gpu_instance();
+
+```
+
diff --git a/docs/developer-guide/ncnn-tips-and-tricks.zh.md b/docs/developer-guide/ncnn-tips-and-tricks.zh.md
new file mode 100644
index 000000000..0416b43bf
--- /dev/null
+++ b/docs/developer-guide/ncnn-tips-and-tricks.zh.md
@@ -0,0 +1,46 @@
+### blob内存是隐含共享的
+
+ncnn的blob最初直接使用opencv的cv::Mat，后发现blob最多只支持三维，因此实现了类似的Mat
+Mat的data每个通道内存16字节对齐，并且有原子的引用计数，a=b不复制数据，超级快
+Mat支持直接引用外部的内存块，不复制数据，加快模型加载和输入输出
+
+举个例子：split layer 将一个blob复制成n个，ncnn中实现为单纯的增加引用计数，没有任何数据复制
+
+### 只运算一部分并保留中间结果
+
+ncnn的net在解决分支依赖时是自上而下深度优先的，因此当网络有多个分支时，运算只会在需要结果的那个分支中进行，节约时间
+当多个分支有重合部分时，运算其中一个分支后会自动保留其余分支所需的中间结果，隐含共享，以便运算其余分支时利用
+
+举个例子：某网络结构为 A -> B -> C1 + C2，向ncnn索要C1结果时，运算过程是 A -> B -> C1，同时B结果引用计数加1自动保留，后面还需要C2结果时，只运算C2就足够了
+
+### 开启轻模式省内存
+
+每个layer都会产生blob，除了最后的结果和多分支中间结果，大部分blob都不值得保留，开启轻模式可以在运算后自动回收，省下内存
+
+举个例子：某网络结构为 A -> B -> C，在轻模式下，向ncnn索要C结果时，A结果会在运算B时自动回收，而B结果会在运算C时自动回收，最后只保留C结果，后面再需要C结果会直接获得，满足绝大部分深度网络的使用方式
+
+### 网络和运算是分开的
+
+ncnn的net是网络模型，实际使用的是extractor，也就是同个net可以有很多个运算实例，而且运算实例互不影响，中间结果保留在extractor内部，在多线程使用时共用网络的结构和参数数据，初始化网络模型和参数只需要一遍
+
+举个例子：全局静态的net实例，初始化一次后，就能不停地生成extractor使用
+
+### openmp虽快但未必合适
+
+ncnn中几乎所有运算都能用上openmp多线程加速，而且性能很赞
+不过系统有时候会突然慢一下，比如手机太热自动降频，界面操作等等，ncnn耗时也会偶尔抖动变长，在计算耗时稳定性比较重要的时候建议关闭openmp，或者设置下extractor线程数
+
+举个例子：手机自拍时，用ncnn进行人脸实时定位，如果耗时突然涨一下就会感觉到掉帧，而稳定的帧率体验更好
+
+### NCNN_STDIO/NCNN_STRING禁用模型文件
+
+ncnn支持加载自有的模型文件和模型内存，NCNN_STDIO控制是否需要支持加载模型文件，设成0能禁用这部分代码，从而减小库的体积，NCNN_STRING设成0能清除大部分可见的字符串和解析过程
+模型内存加载时的参数数据是直接引用的，速度更快，通常在手机上使用这种方式
+
+### 削减 ncnn 内置的层实现
+
+cmake的时候，加参数 -DWITH_LAYER_xxx=OFF 就可以完全不编译对应的内置层，这样可以进一步减小库的体积
+
+### 关于 ARM big.LITTLE 调度
+
+调用set_cpu_powersave可以把ncnn运算线程控制在特定的cpu核心上，大核心速度快耗电多，小核心速度慢点但省电，大小一起用手机热得快
diff --git a/docs/developer-guide/new-model-load-api.md b/docs/developer-guide/new-model-load-api.md
new file mode 100644
index 000000000..1a6cdd192
--- /dev/null
+++ b/docs/developer-guide/new-model-load-api.md
@@ -0,0 +1,194 @@
+## current model load api
+### Cons
+#### long and awful code
+#### two functions
+#### deal float32 float16 quantized-u8
+#### deal alignment size
+```
+#if NCNN_STDIO
+// load weight_data (and bias_data when bias_term is set) from an opened model file
+// returns 0 on success, -1 when an fread fails, -100 when a Mat is empty after creation
+int Convolution::load_model(FILE* binfp)
+{
+    int nread;
+
+    // 4-byte tag stored in front of the weight data, readable as one unsigned int or as four single bytes
+    union
+    {
+        struct
+        {
+            unsigned char f0;
+            unsigned char f1;
+            unsigned char f2;
+            unsigned char f3;
+        };
+        unsigned int tag;
+    } flag_struct;
+
+    nread = fread(&flag_struct, sizeof(flag_struct), 1, binfp);
+    if (nread != 1)
+    {
+        fprintf(stderr, "Convolution read flag_struct failed %d\n", nread);
+        return -1;
+    }
+
+    // sum of the four tag bytes, zero only when every byte is zero
+    unsigned int flag = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3;
+
+    weight_data.create(weight_data_size);
+    if (weight_data.empty())
+        return -100;
+
+    if (flag_struct.tag == 0x01306B47)
+    {
+        // half-precision weight data
+        int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned short), 4);
+        std::vector<unsigned short> float16_weights;
+        // NOTE(review): align_weight_data_size looks like a byte count but is used as the element count here,
+        // so the vector holds more elements than the fread below fills - confirm whether this is intended
+        float16_weights.resize(align_weight_data_size);
+        nread = fread(float16_weights.data(), align_weight_data_size, 1, binfp);
+        if (nread != 1)
+        {
+            fprintf(stderr, "Convolution read float16_weights failed %d\n", nread);
+            return -1;
+        }
+
+        weight_data = Mat::from_float16(float16_weights.data(), weight_data_size);
+        if (weight_data.empty())
+            return -100;
+    }
+    else if (flag != 0)
+    {
+        // quantized weight data
+        // a table of 256 floats comes first, then one unsigned char table index per weight
+        float quantization_value[256];
+        nread = fread(quantization_value, 256 * sizeof(float), 1, binfp);
+        if (nread != 1)
+        {
+            fprintf(stderr, "Convolution read quantization_value failed %d\n", nread);
+            return -1;
+        }
+
+        int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned char), 4);
+        std::vector<unsigned char> index_array;
+        index_array.resize(align_weight_data_size);
+        nread = fread(index_array.data(), align_weight_data_size, 1, binfp);
+        if (nread != 1)
+        {
+            fprintf(stderr, "Convolution read index_array failed %d\n", nread);
+            return -1;
+        }
+
+        // expand every index into its float value from the table
+        float* weight_data_ptr = weight_data;
+        for (int i = 0; i < weight_data_size; i++)
+        {
+            weight_data_ptr[i] = quantization_value[ index_array[i] ];
+        }
+    }
+    else if (flag_struct.f0 == 0)
+    {
+        // raw weight data
+        nread = fread(weight_data, weight_data_size * sizeof(float), 1, binfp);
+        if (nread != 1)
+        {
+            fprintf(stderr, "Convolution read weight_data failed %d\n", nread);
+            return -1;
+        }
+    }
+
+    // bias values are read directly as num_output floats, no tag is read in front of them
+    if (bias_term)
+    {
+        bias_data.create(num_output);
+        if (bias_data.empty())
+            return -100;
+        nread = fread(bias_data, num_output * sizeof(float), 1, binfp);
+        if (nread != 1)
+        {
+            fprintf(stderr, "Convolution read bias_data failed %d\n", nread);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+#endif // NCNN_STDIO
+
+// load weight_data (and bias_data when bias_term is set) from a memory buffer
+// mem is passed by reference and is advanced past every region that has been consumed
+// returns 0 on success, -100 when a Mat is empty after creation
+int Convolution::load_model(const unsigned char*& mem)
+{
+    // 4-byte tag stored in front of the weight data, readable as one unsigned int or as four single bytes
+    union
+    {
+        struct
+        {
+            unsigned char f0;
+            unsigned char f1;
+            unsigned char f2;
+            unsigned char f3;
+        };
+        unsigned int tag;
+    } flag_struct;
+
+    memcpy(&flag_struct, mem, sizeof(flag_struct));
+    mem += sizeof(flag_struct);
+
+    // sum of the four tag bytes, zero only when every byte is zero
+    unsigned int flag = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3;
+
+    if (flag_struct.tag == 0x01306B47)
+    {
+        // half-precision weight data
+        weight_data = Mat::from_float16((unsigned short*)mem, weight_data_size);
+        mem += alignSize(weight_data_size * sizeof(unsigned short), 4);
+        if (weight_data.empty())
+            return -100;
+    }
+    else if (flag != 0)
+    {
+        // quantized weight data
+        // a table of 256 floats comes first, then one unsigned char table index per weight
+        const float* quantization_value = (const float*)mem;
+        mem += 256 * sizeof(float);
+
+        const unsigned char* index_array = (const unsigned char*)mem;
+        mem += alignSize(weight_data_size * sizeof(unsigned char), 4);
+
+        weight_data.create(weight_data_size);
+        if (weight_data.empty())
+            return -100;
+        float* weight_data_ptr = weight_data;
+        for (int i = 0; i < weight_data_size; i++)
+        {
+            weight_data_ptr[i] = quantization_value[ index_array[i] ];
+        }
+    }
+    else if (flag_struct.f0 == 0)
+    {
+        // raw weight data
+        // NOTE(review): the Mat is constructed from the mem pointer itself - presumably it references the buffer
+        // without copying, in which case the buffer must outlive weight_data; confirm against the Mat constructor
+        weight_data = Mat(weight_data_size, (float*)mem);
+        mem += weight_data_size * sizeof(float);
+    }
+
+    // bias values follow directly as num_output floats, no tag is read in front of them
+    if (bias_term)
+    {
+        bias_data = Mat(num_output, (float*)mem);
+        mem += num_output * sizeof(float);
+    }
+
+    return 0;
+}
+```
+
+## new model load api proposed
+### Pros
+#### clean and simple api
+#### element type detection
+```
+int Convolution::load_model(const ModelBin& mb)
+{
+    // weights : type argument 0 means auto detect element type
+    weight_data = mb.load(weight_data_size, 0);
+    if (weight_data.empty())
+    {
+        return -100;
+    }
+
+    // nothing more to load when the layer has no bias
+    if (!bias_term)
+    {
+        return 0;
+    }
+
+    // bias : type argument 1 means a certain type is specified
+    bias_data = mb.load(num_output, 1);
+    return bias_data.empty() ? -100 : 0;
+}
+```
diff --git a/docs/developer-guide/new-param-load-api.md b/docs/developer-guide/new-param-load-api.md
new file mode 100644
index 000000000..6dde92463
--- /dev/null
+++ b/docs/developer-guide/new-param-load-api.md
@@ -0,0 +1,92 @@
+## current param load api
+### Cons
+#### long and awful code
+#### three functions
+#### not extensible
+#### no default value
+#### no variable length array
+```
+MyLayer  mylayer 1 1 in out 100 1.250000
+```
+```
+binary 100
+binary 1.250000
+```
+```
+#if NCNN_STDIO
+#if NCNN_STRING
+int MyLayer::load_param(FILE* paramfp)
+{
+    // text param file : a is parsed with %d and b with %f, both fields must be present
+    const int fields_read = fscanf(paramfp, "%d %f", &a, &b);
+    if (fields_read == 2)
+    {
+        return 0;
+    }
+
+    fprintf(stderr, "MyLayer load_param failed %d\n", fields_read);
+    return -1;
+}
+#endif // NCNN_STRING
+// read a and b as raw binary values from the param file
+// returns 0 on success, -1 when either value cannot be read completely
+int MyLayer::load_param_bin(FILE* paramfp)
+{
+    // fread returns the number of items read, anything other than 1 means the value is missing or truncated
+    if (fread(&a, sizeof(int), 1, paramfp) != 1)
+    {
+        fprintf(stderr, "MyLayer load_param_bin read a failed\n");
+        return -1;
+    }
+
+    if (fread(&b, sizeof(float), 1, paramfp) != 1)
+    {
+        fprintf(stderr, "MyLayer load_param_bin read b failed\n");
+        return -1;
+    }
+
+    return 0;
+}
+#endif // NCNN_STDIO
+
+int MyLayer::load_param(const unsigned char*& mem)
+{
+    // a and b are stored back to back, 4 bytes each
+    a = ((int*)mem)[0];
+    b = ((float*)(mem + 4))[0];
+
+    // advance the caller's pointer past both values
+    mem += 8;
+
+    return 0;
+}
+```
+
+## new param load api proposed
+### Pros
+#### clean and simple api
+#### default value
+#### extensible
+#### variable length array
+```
+7767517
+MyLayer  mylayer 1 1 in out 0=100 1=1.250000 -23303=5,0.1,0.2,0.4,0.8,1.0
+```
+```
+binary 0xDD857600(magic)
+
+binary 0
+binary 100
+binary 1
+binary 1.250000
+binary -23303
+binary 5
+binary 0.1
+binary 0.2
+binary 0.4
+binary 0.8
+binary 1.0
+binary -233(EOP)
+```
+```
+int MyLayer::load_param(const ParamDict& pd)
+{
+    // pd.get( param id (seq), default value );
+    // the type of the default value selects the int / float / array overload
+    a = pd.get(0, 100);
+    b = pd.get(1, 1.25f);
+
+    // get default value for c if not specified in param file
+    // the default must be a float literal, a double such as 0.001 is ambiguous between the int and float overloads
+    c = pd.get(2, 0.001f);
+
+    // get array
+    // NOTE(review): len and array are not defined in this snippet - presumably the default array size and data
+    d = pd.get(3, Mat(len, array));
+    return 0;
+}
+```
diff --git a/docs/developer-guide/operation-param-weight-table.md b/docs/developer-guide/operation-param-weight-table.md
new file mode 100644
index 000000000..4cb88fe06
--- /dev/null
+++ b/docs/developer-guide/operation-param-weight-table.md
@@ -0,0 +1,241 @@
+
+|operation|param id|param phase|default value|weight order|
+|:---:|:---:|:---:|:---:|:---:|
+|AbsVal|||
+|ArgMax TODO|||
+|BatchNorm|0|channels|0|slope mean variance bias|
+||1|eps|0.f|
+|Bias|0|bias_data_size|0|
+|BinaryOp|0|op_type|0|
+||1|with_scalar|0|
+||2|b|0.f|
+|BNLL|||
+|Cast|0|type_from|0|
+||1|type_to|0|
+|Clip|0|min|-FLT_MAX|
+||1|max|FLT_MAX|
+|Concat|0|axis|0|
+|Convolution|0|num_output|0|weight bias|
+||1|kernel_w|0|
+||2|dilation_w|1|
+||3|stride_w|1|
+||4|pad_left|0|
+||5|bias_term|0|
+||6|weight_data_size|0|
+||8|int8_scale_term|0|
+||9|activation_type|0|
+||10|activation_params|[ ]|
+||11|kernel_h|kernel_w|
+||12|dilation_h|dilation_w|
+||13|stride_h|stride_w|
+||15|pad_right|pad_left|
+||14|pad_top|pad_left|
+||16|pad_bottom|pad_top|
+||17|impl_type|0|
+|ConvolutionDepthWise|0|num_output|0|weight bias|
+||1|kernel_w|0|
+||2|dilation_w|1|
+||3|stride_w|1|
+||4|pad_left|0|
+||5|bias_term|0|
+||6|weight_data_size|0|
+||7|group|1|
+||8|int8_scale_term|0|
+||9|activation_type|0|
+||10|activation_params|[ ]|
+||11|kernel_h|kernel_w|
+||12|dilation_h|dilation_w|
+||13|stride_h|stride_w|
+||15|pad_right|pad_left|
+||14|pad_top|pad_left|
+||16|pad_bottom|pad_top|
+|Crop|0|woffset|0|
+||1|hoffset|0|
+||2|coffset|0|
+||3|outw|0|
+||4|outh|0|
+||5|outc|0|
+|Deconvolution|0|num_output|0|weight bias|
+||1|kernel_w|0|
+||2|dilation_w|1|
+||3|stride_w|1|
+||4|pad_left|0|
+||5|bias_term|0|
+||6|weight_data_size|0|
+||9|activation_type|0|
+||10|activation_params|[ ]|
+||11|kernel_h|kernel_w|
+||12|dilation_h|dilation_w|
+||13|stride_h|stride_w|
+||15|pad_right|pad_left|
+||14|pad_top|pad_left|
+||16|pad_bottom|pad_top|
+|DeconvolutionDepthWise|0|num_output|0|weight bias|
+||1|kernel_w|0|
+||2|dilation_w|1|
+||3|stride_w|1|
+||4|pad_left|0|
+||5|bias_term|0|
+||6|weight_data_size|0|
+||7|group|1|
+||9|activation_type|0|
+||10|activation_params|[ ]|
+||11|kernel_h|kernel_w|
+||12|dilation_h|dilation_w|
+||13|stride_h|stride_w|
+||15|pad_right|pad_left|
+||14|pad_top|pad_left|
+||16|pad_bottom|pad_top|
+|Dequantize|0|scale|1.f|bias|
+||1|bias_term|0|
+||2|bias_data_size|0|
+|DetectionOutput|0|num_class|0|
+||1|nms_threshold|0.05f|
+||2|nms_top_k|300|
+||3|keep_top_k|100|
+||4|confidence_threshold|0.5f|
+||5|variances[0]|0.1f|
+||6|variances[1]|0.1f|
+||7|variances[2]|0.2f|
+||8|variances[3]|0.2f|
+|Dropout|0|scale|1.f|
+|Eltwise|0|op_type|0|
+||1|coeffs|[ ]|
+|ELU|0|alpha|0.1f|
+|Embed|0|num_output|0|weight bias|
+||1|input_dim|0|
+||2|bias_term|0|
+||3|weight_data_size|0|
+|Exp|0|base|-1.f|
+||1|scale|1.f|
+||2|shift|0.f|
+|ExpandDims|0|expand_w|0|
+||1|expand_h|0|
+||2|expand_c|0|
+|Flatten|||
+|HardSigmoid|0|alpha|0.2f||
+||1|beta|0.5f|
+|HardSwish|0|alpha|0.2f||
+||1|beta|0.5f|
+|InnerProduct|0|num_output|0|weight bias|
+||1|bias_term|0|
+||2|weight_data_size|0|
+||8|int8_scale_term|0|
+||9|activation_type|0|
+||10|activation_params|[ ]|
+|Input|0|w|0|
+||1|h|0|
+||2|c|0|
+|InstanceNorm|0|channels|0|gamma bias|
+||1|eps|0.001f|
+|Interp|0|resize_type|0|
+||1|height_scale|1.f|
+||2|width_scale|1.f|
+||3|output_height|0|
+||4|output_width|0|
+|Log|0|base|-1.f|
+||1|scale|1.f|
+||2|shift|0.f|
+|LRN|0|region_type|0|
+||1|local_size|5|
+||2|alpha|1.f|
+||3|beta|0.75f|
+||4|bias|1.f|
+|MemoryData|0|w|0|
+||1|h|0|
+||2|c|0|
+|MVN|0|normalize_variance|0|
+||1|across_channels|0|
+||2|eps|0.0001f|
+|Normalize|0|across_spatial|0|scale|
+||4|across_channel|0|
+||1|channel_shared|0|
+||2|eps|0.0001f|
+||3|scale_data_size|0|
+|Packing|0|out_packing|1|
+||1|use_padding|0|
+|Padding|0|top|0|
+||1|bottom|0|
+||2|left|0|
+||3|right|0|
+||4|type|0|
+||5|value|0.f|
+|Permute|0|order_type|0|
+|Pooling|0|pooling_type|0|
+||1|kernel_w|0|
+||11|kernel_h|kernel_w|
+||2|stride_w|1|
+||12|stride_h|stride_w|
+||3|pad_left|0|
+||14|pad_right|pad_left|
+||13|pad_top|pad_left|
+||15|pad_bottom|pad_top|
+||4|global_pooling|0|
+||5|pad_mode|0|
+|Power|0|power|1.f|
+||1|scale|1.f|
+||2|shift|0.f|
+|PReLU|0|num_slope|0|slope|
+|PriorBox|0|min_sizes|[ ]|
+||1|max_sizes|[ ]|
+||2|aspect_ratios|[ ]|
+||3|variances[0]|0.f|
+||4|variances[1]|0.f|
+||5|variances[2]|0.f|
+||6|variances[3]|0.f|
+||7|flip|1|
+||8|clip|0|
+||9|image_width|0|
+||10|image_height|0|
+||11|step_width|-233.f|
+||12|step_height|-233.f|
+||13|offset|0.f|
+|Proposal|0|feat_stride|16|
+||1|base_size|16|
+||2|pre_nms_topN|6000|
+||3|after_nms_topN|300|
+||4|nms_thresh|0.7f|
+||5|min_size|16|
+|PSROIPooling|0|pooled_width|7|
+||1|pooled_height|7|
+||2|spatial_scale|0.0625f|
+||3|output_dim|0|
+|Quantize|0|scale|1.f|
+|Reduction|0|operation|0|
+||1|dim|0|
+||2|coeff|1.f|
+|ReLU|0|slope|0.f|
+|Reorg|0|stride|0|
+|Requantize|0|scale_in|1.f|bias|
+||1|scale_out|1.f|
+||2|bias_term|0|
+||3|bias_data_size|0|
+||4|fusion_relu|0|
+|Reshape|0|w|-233|
+||1|h|-233|
+||2|c|-233|
+||3|permute|0|
+|ROIAlign|0|pooled_width|0|
+||1|pooled_height|0|
+||2|spatial_scale|1.f|
+|ROIPooling|0|pooled_width|0|
+||1|pooled_height|0|
+||2|spatial_scale|1.f|
+|Scale|0|scale_data_size|0|scale bias|
+||1|bias_term|0|
+|ShuffleChannel|0|group|1|
+|Sigmoid|||
+|Slice|0|slices|[ ]|
+||1|axis|0|
+|Softmax|0|axis|0|
+|Split|||
+|SPP TODO|||
+|Squeeze|0|squeeze_w|0|
+||1|squeeze_h|0|
+||2|squeeze_c|0|
+|TanH|||
+|Threshold|0|threshold|0.f|
+|Tile TODO|||
+|UnaryOp|0|op_type|0|
+|RNN TODO|||
+|LSTM TODO|||
diff --git a/docs/developer-guide/param-and-model-file-structure.md b/docs/developer-guide/param-and-model-file-structure.md
new file mode 100644
index 000000000..120ade34a
--- /dev/null
+++ b/docs/developer-guide/param-and-model-file-structure.md
@@ -0,0 +1,64 @@
+## net.param
+### example
+```
+7767517
+3 3
+Input         input    0 1 data 0=4 1=4 2=1
+InnerProduct  ip       1 1 data fc 0=10 1=1 2=80
+Softmax       softmax  1 1 fc prob 0=0
+```
+### overview
+```
+[magic]
+```
+* magic number : 7767517
+```
+[layer count] [blob count]
+```
+* layer count : number of layer lines that follow, must be exactly the number of layer names
+* blob count : number of all blobs, usually greater than or equal to the layer count
+### layer line
+```
+[layer type] [layer name] [input count] [output count] [input blobs] [output blobs] [layer specific params]
+```
+* layer type : type name, such as Convolution Softmax etc
+* layer name : name of this layer, must be unique among all layer names
+* input count : count of the blobs this layer needs as input
+* output count : count of the blobs this layer produces as output
+* input blobs : list of all the input blob names, separated by spaces, must be unique among input blob names of all layers
+* output blobs : list of all the output blob names, separated by spaces, must be unique among output blob names of all layers
+* layer specific params : key=value pair list, separated by spaces
+### layer param
+```
+0=1 1=2.5 -23303=2,2.0,3.0
+```
+each key index must be unique within a layer line; a pair can be omitted if the default value is used
+
+the meaning of existing param key index can be looked up at [operation-param-weight-table](https://github.com/Tencent/ncnn/wiki/operation-param-weight-table)
+
+* integer or float key : index 0 ~ 19
+* integer value : int
+* float value : float
+* integer array or float array key : -23300 minus index 0 ~ 19
+* integer array value : [array size],int,int,...,int
+* float array value : [array size],float,float,...,float
+
+## net.bin
+```
+  +---------+---------+---------+---------+---------+---------+
+  | weight1 | weight2 | weight3 | weight4 | ....... | weightN |
+  +---------+---------+---------+---------+---------+---------+
+  ^         ^         ^         ^
+  0x0      0x80      0x140     0x1C0
+```
+the model binary is the concatenation of all weight data, each weight buffer is aligned by 32bit
+
+### weight buffer
+```
+[flag] (optional)
+[raw data]
+[padding] (optional)
+```
+* flag : unsigned int,  little-endian, indicating the weight storage type, 0 => float32, 0x01306B47 => float16, otherwise => quantized int8, may be omitted if the layer implementation forced the storage type explicitly
+* raw data : raw weight data, little-endian, float32 data or float16 data or quantized table and indexes depending on the storage type flag
+* padding : padding space for 32bit alignment, may be omitted if already aligned
diff --git a/docs/developer-guide/preload-practice.zh.md b/docs/developer-guide/preload-practice.zh.md
new file mode 100644
index 000000000..1ee3d392e
--- /dev/null
+++ b/docs/developer-guide/preload-practice.zh.md
@@ -0,0 +1,29 @@
+## 只是实践经验，没有理论，不一定正确
+
+```
+prfm pldl1keep, [x0, #256]
+```
+* 放在 ld1 [x0] 前面 0~8 条指令
+* #256 表示把 x0+256 的内容放进 L1 cache
+* ldp 也适用
+* (经验)不写 offset 不如写个 #128
+* (经验)pldl1strm 似乎没啥意思，也没 pldl1keep 快
+* (经验)x0 ~ x0+256 的内容也会进来
+* (经验)load 128bit 用 #128，256bit或更多用 #256
+* (经验)避免 pld a，pld b，load a，load b 顺序，可能相互干扰
+* (经验)提前太多会失效
+* (经验)适合连续读
+
+```
+prfm pldl2strm, [x0, #256]
+```
+* 放在 ld1 [x0] 前面 N 条指令，N 尽量大些
+* #256 表示把 x0+256 的内容放进 L2 cache
+* ldp 也适用
+* (经验)不写 offset 不如写个 #128
+* (经验)pldl2strm 效果稍好于 pldl2keep
+* (经验)x0 ~ x0+256 的内容也会进来
+* (经验)load 128bit 用 #128，256bit 用 #256
+* (经验)读很多数据，用不同 offset 连续两次 pldl2strm
+* (经验)后面不要对同位置再 pldl1keep，会变慢
+* (经验)适合提前准备要跳到很远的地方读，比如换 channel
diff --git a/docs/developer-guide/tensorflow-op-combination.md b/docs/developer-guide/tensorflow-op-combination.md
new file mode 100644
index 000000000..88a86d50d
--- /dev/null
+++ b/docs/developer-guide/tensorflow-op-combination.md
@@ -0,0 +1,57 @@
+## batchnorm
+```
+Input       A            0 1 A 0 0 0
+MemoryData  sub/y        0 1 sub/y 16 0 0
+BinaryOp    sub          2 1 A sub/y sub 1
+MemoryData  div/y        0 1 div/y 16 0 0
+BinaryOp    div          2 1 sub div/y div 3
+MemoryData  mul/y        0 1 mul/y 16 0 0
+BinaryOp    mul          2 1 div mul/y mul 2
+MemoryData  BiasAdd/bias 0 1 BiasAdd/bias 16 0 0
+BinaryOp    BiasAdd      2 1 mul BiasAdd/bias BiasAdd 0
+```
+## convolution
+```
+Input       A            0 1 A 0 0 0
+Convolution Conv2D       1 1 A Conv2D 10 3 1 1 0 0 270
+MemoryData  biases/read  0 1 biases/read 10 0 0
+BinaryOp    BiasAdd      2 1 Conv2D biases/read BiasAdd 0
+```
+## innerproduct
+```
+Input        A           0 1 A 0 0 0
+MemoryData   biases/read 0 1 biases/read 10 0 0
+InnerProduct MatMul      1 1 A MatMul 10 0 2560
+BinaryOp     conv6       2 1 MatMul biases/read conv6 0
+```
+## leakyrelu
+```
+Input       A            0 1 A 0 0 0
+Split       splitncnn_0  1 2 A A_splitncnn_0 A_splitncnn_1
+MemoryData  mul_1/x      0 1 mul_1/x 0 0 0
+BinaryOp    mul_1        2 1 mul_1/x A_splitncnn_1 mul_1 2
+BinaryOp    leaky        2 1 mul_1 A_splitncnn_0 leaky 4
+```
+## prelu
+```
+Input       A            0 1 A 0 0 0
+Split       splitncnn_0  1 2 A A_splitncnn_0 A_splitncnn_1
+MemoryData  prelu/alpha  0 1 prelu/alpha 10 0 0
+ReLU        prelu/Relu   1 1 A_splitncnn_1 prelu/Relu 0.000000
+UnaryOp     prelu/Neg    1 1 A_splitncnn_0 prelu/Neg 1
+ReLU        prelu/Relu_1 1 1 prelu/Neg prelu/Relu_1 0.000000
+UnaryOp     prelu/Neg_1  1 1 prelu/Relu_1 prelu/Neg_1 1
+BinaryOp    prelu/Mul    2 1 prelu/alpha prelu/Neg_1 prelu/Mul 2
+BinaryOp    prelu/add    2 1 prelu/Relu prelu/Mul prelu/add 0
+```
+## softmax
+```
+Input       A            0 1 A 0 0 0
+Split       splitncnn_4  1 2 A A_splitncnn_0 A_splitncnn_1
+Reduction   Max          1 1 A_splitncnn_1 Max 4 -2 1.000000
+BinaryOp    sub          2 1 A_splitncnn_0 Max sub 1
+UnaryOp     Exp          1 1 sub Exp 7
+Split       splitncnn_5  1 2 Exp Exp_splitncnn_0 Exp_splitncnn_1
+Reduction   Sum          1 1 Exp_splitncnn_1 Sum 0 -2 1.000000
+BinaryOp    prob         2 1 Exp_splitncnn_0 Sum prob 3
+```
\ No newline at end of file
diff --git a/docs/how-to-build/build-for-VS2017.zh.md b/docs/how-to-build/build-for-VS2017.zh.md
new file mode 100644
index 000000000..4a2b52786
--- /dev/null
+++ b/docs/how-to-build/build-for-VS2017.zh.md
@@ -0,0 +1,45 @@
+## 预先准备
+
+Visual Studio 2017 Community Edition，使用动态的 CRT 运行库
+
+以下命令行均使用  **适用于 VS 2017 的 x64 本机工具命令提示**
+
+## 编译安装 protobuf
+
+https://github.com/google/protobuf/archive/v3.4.0.zip
+
+我下载到 C:/Users/shuiz/source 解压缩
+
+```
+mkdir build-vs2017
+cd build-vs2017
+cmake -G"NMake Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=%cd%/install \
+    -Dprotobuf_BUILD_TESTS=OFF \
+    -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
+nmake
+nmake install
+```
+
+protobuf 会安装在 build-vs2017/install 里头
+
+## 编译安装 ncnn
+
+https://github.com/Tencent/ncnn.git
+
+cmake 命令中的 protobuf 路径要相应修改成自己的
+
+```
+mkdir build-vs2017
+cd build-vs2017
+cmake -G"NMake Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=%cd%/install \
+    -DProtobuf_INCLUDE_DIR=C:/Users/shuiz/source/protobuf-3.4.0/build-vs2017/install/include \
+    -DProtobuf_LIBRARIES=C:/Users/shuiz/source/protobuf-3.4.0/build-vs2017/install/lib/libprotobuf.lib \
+    -DProtobuf_PROTOC_EXECUTABLE=C:/Users/shuiz/source/protobuf-3.4.0/build-vs2017/install/bin/protoc.exe ..
+nmake
+nmake install
+```
+
+ncnn 会安装在 build-vs2017/install 里头
+
+ncnn 转换工具在 build-vs2017/tools 里头
+
diff --git a/docs/how-to-build/build-for-android.zh.md b/docs/how-to-build/build-for-android.zh.md
new file mode 100644
index 000000000..1ad49e4b5
--- /dev/null
+++ b/docs/how-to-build/build-for-android.zh.md
@@ -0,0 +1,90 @@
+### 安装 android-ndk
+
+传送门 http://developer.android.com/ndk/downloads/index.html
+
+比如我把 android-ndk 解压到 /home/nihui/android-ndk-r15c
+```
+export ANDROID_NDK=/home/nihui/android-ndk-r15c
+```
+
+### 准备 android toolchain 文件
+
+android.toolchain.cmake 这个文件可以从 $ANDROID_NDK/build/cmake 找到
+
+(可选) 删除debug编译参数，缩小二进制体积 [android-ndk issue](https://github.com/android-ndk/ndk/issues/243)
+```
+# 用编辑器打开 $ANDROID_NDK/build/cmake/android.toolchain.cmake
+# 删除 "-g" 这行
+list(APPEND ANDROID_COMPILER_FLAGS
+  -g
+  -DANDROID
+```
+
+### 编译方法
+```
+mkdir build-android
+cd build-android
+cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
+    -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON \
+    -DANDROID_PLATFORM=android-14 ..
+make
+make install
+make package
+```
+没有遇到错误的话，sdk 包已经静静地在 build-android/dist 目录里等你了
+
+这里简要介绍几个参数
+
+ANDROID_ABI 是架构名字，"armeabi-v7a" 支持绝大部分手机硬件
+
+ANDROID_ARM_NEON 是否使用 NEON 指令集，设为 ON 支持绝大部分手机硬件
+
+ANDROID_PLATFORM 指定最低系统版本，"android-14" 就是 android-4.0
+
+armv5的参数
+```
+ANDROID_ABI="armeabi"
+```
+armv8的参数
+```
+ANDROID_ABI="arm64-v8a"
+```
+x86的参数
+```
+ANDROID_ABI="x86"
+```
+x86_64的参数
+```
+ANDROID_ABI="x86_64"
+```
+
+### CMakeLists.txt 要注意的地方
+
+开头 project(XXX) 之前要加
+```
+if(CMAKE_TOOLCHAIN_FILE)
+set(CMAKE_INSTALL_PREFIX ${CMAKE_BINARY_DIR}/install CACHE PATH "Installation Directory")
+endif()
+```
+交叉编译通常不需要把文件装在编译主机上的，所以 CMAKE_INSTALL_PREFIX 设置为 build-android/install
+
+使用 opencv 这类第三方库的时候，也要指定 android 版本的路径
+```
+set(OpenCV_DIR "/home/nihui/opencv-2.4.11/sdk/native/jni")
+find_package(OpenCV REQUIRED)
+```
+
+CMakeLists.txt 里头可以用 if(ANDROID) .... endif() 来判断是否给 android 编译
+
+android 并不全是 arm 架构，如果要编译 neon 优化的源码文件，还要判断下处理器架构
+当然最好还是别分成两个文件，在同一个 cpp 里用 __ARM_NEON 围起来
+```
+set(XXX_SRCS matrix_test.cpp)
+if((ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "armv7-a"))
+    OR (ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64")))
+    # 这里是 arm 专门的源代码文件
+    set(XXX_SRCS ${XXX_SRCS} matrix_mul_neon.cpp)
+else()
+    set(XXX_SRCS ${XXX_SRCS} matrix_mul_c.cpp)
+endif()
+```
\ No newline at end of file
diff --git a/docs/how-to-build/build-for-ios.zh.md b/docs/how-to-build/build-for-ios.zh.md
new file mode 100644
index 000000000..95dbeac21
--- /dev/null
+++ b/docs/how-to-build/build-for-ios.zh.md
@@ -0,0 +1,123 @@
+### 安装 xcode 和 cmake
+
+传送门 https://developer.apple.com/xcode/download
+
+传送门 https://cmake.org/download
+
+默认情况 cmake 命令行可能用不了，需要手工加在 PATH 里面
+```
+export PATH=/Applications/CMake.app/Contents/bin/:$PATH
+```
+ 
+### 准备 ios toolchain 文件
+
+把 ios.toolchain.cmake 放到和 CMakeLists.txt 同一级的项目目录里
+可以去 opencv 的 github 上弄来，自己稍微调整下。
+传送门 https://github.com/Itseez/opencv/tree/master/platforms/ios/cmake
+
+### 编译方法
+```
+mkdir build-ios
+cd build-ios
+cmake -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_TOOLCHAIN_FILE=../ios.toolchain.cmake \
+    -DIOS_PLATFORM=iPhoneOS \
+    -DCMAKE_OSX_ARCHITECTURES=armv7 ..
+make
+make install
+make package
+```
+没有遇到错误的话，手机平台 armv7 库已经编好了
+
+这里简要介绍几个参数
+
+IOS_PLATFORM 是平台名字，iPhoneOS 是真实机器的 ios，iPhoneSimulator 是模拟器平台
+
+CMAKE_OSX_ARCHITECTURES 指定架构，iPhoneOS 配套 armv7 armv7s arm64，iPhoneSimulator 配套 i386 x86_64
+
+### 打包成 framework
+
+手工新建目录 XXX.framework/Versions/A，还有些软链接
+```
+mkdir -p XXX.framework/Versions/A/Headers
+mkdir -p XXX.framework/Versions/A/Resources
+ln -s A XXX.framework/Versions/Current
+ln -s Versions/Current/Headers XXX.framework/Headers
+ln -s Versions/Current/Resources XXX.framework/Resources
+ln -s Versions/Current/XXX XXX.framework/XXX
+```
+framework 里面的库是多架构的，得先把 5 种架构都编译出来，比如分别编译在
+
+build-iPhoneOS-armv7
+
+build-iPhoneOS-armv7s
+
+build-iPhoneOS-arm64
+
+build-iPhoneSimulator-i386
+
+build-iPhoneSimulator-x86_64
+
+### 合成胖子库(fat)
+```
+lipo -create \
+    build-iPhoneOS-armv7/install/lib/libXXX.a \
+    build-iPhoneOS-armv7s/install/lib/libXXX.a \
+    build-iPhoneOS-arm64/install/lib/libXXX.a \
+    build-iPhoneSimulator-i386/install/lib/libXXX.a \
+    build-iPhoneSimulator-x86_64/install/lib/libXXX.a \
+    -o XXX.framework/Versions/A/XXX
+```
+复制头文件和 Info.plist
+```
+cp -r build-iPhoneOS-armv7/install/include/* XXX.framework/Versions/A/Headers/
+cp Info.plist XXX.framework/Versions/A/Resources/
+```
+压缩成 zip
+```
+zip -y -r XXX.framework.zip XXX.framework
+```
+
+### CMakeLists.txt 要注意的地方
+
+开头 project(XXX) 之前要加
+```
+if(CMAKE_TOOLCHAIN_FILE)
+set(CMAKE_INSTALL_PREFIX ${CMAKE_BINARY_DIR}/install CACHE PATH "Installation Directory")
+endif()
+```
+交叉编译通常不需要把文件装在主机上的，所以 CMAKE_INSTALL_PREFIX 设置为 build-ios/install 这里
+
+使用 opencv2.framework 这类第三方 sdk，需要指定这些 framework 的位置
+```
+set(CMAKE_FRAMEWORK_PATH "/Users/nihui/Downloads")
+add_definitions(-F ${CMAKE_FRAMEWORK_PATH})
+```
+
+ios 平台默认不允许生成动态库
+
+add_library(XXX SHARED ${XXX_SRCS}) 并没有效果，越狱设备除外
+
+本文中的 ios toolchain 文件默认指定使用 libc++，最低系统需求为 ios 6.0
+
+如果要修改这个配置，在 ios.toolchain.cmake 文件里
+```
+set (CMAKE_C_FLAGS_INIT "-isysroot ${CMAKE_OSX_SYSROOT} -miphoneos-version-min=6.0")
+set (CMAKE_CXX_FLAGS_INIT "-stdlib=libc++ -fvisibility=hidden -fvisibility-inlines-hidden -isysroot ${CMAKE_OSX_SYSROOT} -miphoneos-version-min=6.0")
+```
+
+CMakeLists.txt 里头可以用 if(IOS) .... endif() 来判断是否给 ios 编译
+
+ios 并不全是 arm 架构，如果要编译 neon 优化的源码文件，还要判断下处理器架构
+当然最好还是别分成两个文件，在同一个 cpp 里用 __ARM_NEON 围起来
+```
+set(XXX_SRCS matrix_test.cpp)
+if((IOS AND ("${CMAKE_OSX_ARCHITECTURES}" STREQUAL "armv7"))
+    OR (IOS AND ("${CMAKE_OSX_ARCHITECTURES}" STREQUAL "armv7s"))
+    OR (IOS AND ("${CMAKE_OSX_ARCHITECTURES}" STREQUAL "arm64")))
+    # 这里是 arm 专门的源代码文件
+    set(XXX_SRCS ${XXX_SRCS} matrix_mul_neon.cpp)
+else()
+    set(XXX_SRCS ${XXX_SRCS} matrix_mul_c.cpp)
+endif()
+```
diff --git a/docs/how-to-build/build.md b/docs/how-to-build/build.md
new file mode 100644
index 000000000..0f3aa5775
--- /dev/null
+++ b/docs/how-to-build/build.md
@@ -0,0 +1,444 @@
+* [Build for Linux x86](build.md#build-for-linux-x86)
+* [Build for Windows x64 using VS2017](build-for-VS2017.zh.md)
+* [Build for MacOSX](build.md#build-for-macosx)
+* [Build for Raspberry Pi 3](build.md#build-for-raspberry-pi-3)
+* [Build for NVIDIA Jetson](build.md#build-for-nvidia-jetson)
+* [Build for ARM Cortex-A family with cross-compiling](build.md#build-for-arm-cortex-a-family-with-cross-compiling)
+* [Build for Android](build.md#build-for-android)
+* [Build for iOS on MacOSX with xcode](build.md#build-for-ios-on-macosx-with-xcode)
+* [Build for iOS on Linux with cctools-port](build.md#build-for-ios-on-linux-with-cctools-port)
+* [Build for Hisilicon platform with cross-compiling](build.md#build-for-hisilicon-platform-with-cross-compiling)
+
+***
+
+### Build for Linux x86
+install g++ cmake protobuf
+
+(optional) download and install vulkan-sdk from https://vulkan.lunarg.com/sdk/home
+```
+$ wget https://sdk.lunarg.com/sdk/download/1.1.92.1/linux/vulkansdk-linux-x86_64-1.1.92.1.tar.gz?Human=true -O vulkansdk-linux-x86_64-1.1.92.1.tar.gz
+$ tar -xf vulkansdk-linux-x86_64-1.1.92.1.tar.gz
+
+# setup env
+$ export VULKAN_SDK=`pwd`/1.1.92.1/x86_64
+```
+
+```
+$ cd <ncnn-root-dir>
+$ mkdir -p build
+$ cd build
+
+# cmake option NCNN_VULKAN for enabling vulkan
+$ cmake -DNCNN_VULKAN=OFF ..
+
+$ make -j4
+```
+install opencv for building example
+```
+$ cd <ncnn-root-dir>
+
+uncomment add_subdirectory(examples)
+ in CMakeLists.txt with your favourite editor
+
+$ mkdir -p build
+$ cd build
+$ cmake ..
+$ make -j4
+
+copy examples/squeezenet_v1.1.param to build/examples
+copy examples/squeezenet_v1.1.bin to build/examples
+
+$ cd examples
+$ ./squeezenet yourimage.jpg 
+
+output top-3 class-id and score
+you may refer to examples/synset_words.txt to find the class name
+404 = 0.990290
+908 = 0.004464
+405 = 0.003941
+```
+
+***
+
+### Build for Windows x64 using Visual Studio Community 2017
+
+install Visual Studio Community 2017
+```
+download Visual Studio Community 2017 from https://visualstudio.microsoft.com/vs/community/
+install it
+Start → Programs → Visual Studio 2017 → Visual Studio Tools → x64 Native Tools Command Prompt for VS 2017
+```
+build protobuf library
+```
+download protobuf-3.4.0 from https://github.com/google/protobuf/archive/v3.4.0.zip
+> cd <protobuf-root-dir>
+> mkdir build-vs2017
+> cd build-vs2017
+> cmake -G"NMake Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=%cd%/install -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
+> nmake
+> nmake install
+```
+(optional) download and install vulkan-sdk from https://vulkan.lunarg.com/sdk/home
+
+launch VulkanSDK-1.1.92.1-Installer.exe and install
+
+build ncnn library (replace <protobuf-root-dir> with your path)
+```
+> cd <ncnn-root-dir>
+> mkdir build-vs2017
+> cd build-vs2017
+
+# cmake option NCNN_VULKAN for enabling vulkan
+> cmake -G"NMake Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=%cd%/install -DProtobuf_INCLUDE_DIR=<protobuf-root-dir>/build-vs2017/install/include -DProtobuf_LIBRARIES=<protobuf-root-dir>/build-vs2017/install/lib/libprotobuf.lib -DProtobuf_PROTOC_EXECUTABLE=<protobuf-root-dir>/build-vs2017/install/bin/protoc.exe -DNCNN_VULKAN=OFF ..
+
+> nmake
+> nmake install
+
+pick build-vs2017/install folder for further usage
+```
+
+***
+
+### Build for MacOSX
+install xcode and protobuf
+
+**Because the compiler bundled with xcode does not support the openmp feature, you cannot enable the multithreading inference feature of the ncnn library if you build with xcode.**
+
+```
+# install protobuf via homebrew
+$ brew install protobuf
+```
+
+(optional) download and install vulkan-sdk from https://vulkan.lunarg.com/sdk/home
+```
+$ wget https://sdk.lunarg.com/sdk/download/1.1.92.1/mac/vulkansdk-macos-1.1.92.1.tar.gz?Human=true -O vulkansdk-macos-1.1.92.1.tar.gz
+$ tar -xf vulkansdk-macos-1.1.92.1.tar.gz
+
+# setup env
+$ export VULKAN_SDK=`pwd`/vulkansdk-macos-1.1.92.1/macOS
+```
+
+```
+$ cd <ncnn-root-dir>
+$ mkdir -p build
+$ cd build
+
+# cmake option NCNN_VULKAN for enabling vulkan
+$ cmake -DNCNN_VULKAN=OFF ..
+
+$ make -j4
+$ make install
+```
+
+pick build/install folder for further usage
+
+***
+
+### Build for Raspberry Pi 3
+install g++ cmake protobuf
+```
+$ cd <ncnn-root-dir>
+$ mkdir -p build
+$ cd build
+$ cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/pi3.toolchain.cmake -DPI3=ON ..
+$ make -j4
+$ make install
+```
+
+pick build/install folder for further usage
+
+***
+
+### Build for NVIDIA Jetson
+#### download Vulkan SDK from NVIDIA
+please click the `Vulkan SDK File` link on [https://developer.nvidia.com/embedded/vulkan](https://developer.nvidia.com/embedded/vulkan), at the time of writing we got `Vulkan_loader_demos_1.1.100.tar.gz`
+
+scp the downloaded SDK to your Jetson device
+
+```bash
+scp Vulkan_loader_demos_1.1.100.tar.gz USERNAME@JETSON_IP:~/
+```
+
+from this moment on, we will work on the Jetson device
+```bash
+ssh USERNAME@JETSON_IP
+```
+
+#### install Vulkan SDK
+
+```bash
+cd ~/Vulkanloader_demos_1.1.100
+sudo cp loader/libvulkan.so.1.1.100 /usr/lib/aarch64-linux-gnu/
+cd /usr/lib/aarch64-linux-gnu/
+sudo rm -rf libvulkan.so.1 libvulkan.so
+sudo ln -s libvulkan.so.1.1.100 libvulkan.so
+sudo ln -s libvulkan.so.1.1.100 libvulkan.so.1
+cd ~/
+```
+
+#### install glslang dependency
+```
+# glslang is a dependency of Tencent/ncnn
+git clone --depth=1 https://github.com/KhronosGroup/glslang.git
+cd glslang
+# assure that SPIR-V generated from HLSL is legal for Vulkan
+./update_glslang_sources.py
+mkdir -p build && cd build
+cmake -DCMAKE_BUILD_TYPE=Release ..
+sudo make -j`nproc` install && cd ..
+```
+
+#### compile ncnn
+```
+git clone https://github.com/Tencent/ncnn.git
+# while aarch64-linux-gnu.toolchain.cmake would compile Tencent/ncnn as well
+# but why not compile with more native features?
+cd ncnn && mkdir -p build && cd build
+cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/jetson.toolchain.cmake -DNCNN_VULKAN=ON -DCMAKE_BUILD_TYPE=Release ..
+make -j`nproc`
+sudo make install
+```
+
+***
+
+### Build for ARM Cortex-A family with cross-compiling
+download ARM toolchain from https://developer.arm.com/open-source/gnu-toolchain/gnu-a/downloads
+```
+$ export PATH=<your-toolchain-compiler-path>:$PATH
+```
+AArch32 target with soft float (arm-linux-gnueabi)
+```
+$ cd <ncnn-root-dir>
+$ mkdir -p build-arm-linux-gnueabi
+$ cd build-arm-linux-gnueabi
+$ cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabi.toolchain.cmake ..
+$ make -j4
+$ make install
+```
+AArch32 target with hard float (arm-linux-gnueabihf)
+```
+$ cd <ncnn-root-dir>
+$ mkdir -p build-arm-linux-gnueabihf
+$ cd build-arm-linux-gnueabihf
+$ cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf.toolchain.cmake ..
+$ make -j4
+$ make install
+```
+AArch64 GNU/Linux target (aarch64-linux-gnu)
+```
+$ cd <ncnn-root-dir>
+$ mkdir -p build-aarch64-linux-gnu
+$ cd build-aarch64-linux-gnu
+$ cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake ..
+$ make -j4
+$ make install
+```
+
+pick build-XXXXX/install folder for further usage
+
+***
+
+### Build for Android
+you can use the prebuilt ncnn-android-lib.zip from https://github.com/Tencent/ncnn/releases
+
+install android-ndk
+```
+download android-ndk from http://developer.android.com/ndk/downloads/index.html
+$ unzip android-ndk-r18b-linux-x86_64.zip
+$ export ANDROID_NDK=<your-ndk-root-path>
+```
+(optional) drop debug compile flag to reduce binary size due to [android-ndk issue](https://github.com/android-ndk/ndk/issues/243)
+```
+# edit $ANDROID_NDK/build/cmake/android.toolchain.cmake with your favorite editor
+# remove "-g" line
+list(APPEND ANDROID_COMPILER_FLAGS
+  -g
+  -DANDROID
+```
+
+(optional) download and install vulkan-sdk from https://vulkan.lunarg.com/sdk/home
+```
+$ wget https://sdk.lunarg.com/sdk/download/1.1.92.1/linux/vulkansdk-linux-x86_64-1.1.92.1.tar.gz?Human=true -O vulkansdk-linux-x86_64-1.1.92.1.tar.gz
+$ tar -xf vulkansdk-linux-x86_64-1.1.92.1.tar.gz
+
+# setup env
+$ export VULKAN_SDK=`pwd`/1.1.92.1/x86_64
+```
+
+build armv7 library
+```
+$ cd <ncnn-root-dir>
+$ mkdir -p build-android-armv7
+$ cd build-android-armv7
+
+$ cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
+    -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON \
+    -DANDROID_PLATFORM=android-14 ..
+
+# if you want to enable vulkan, platform api version >= android-24 is needed
+$ cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
+    -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON \
+    -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
+
+$ make -j4
+$ make install
+
+pick build-android-armv7/install folder for further jni usage
+```
+build aarch64 library
+```
+$ cd <ncnn-root-dir>
+$ mkdir -p build-android-aarch64
+$ cd build-android-aarch64
+
+$ cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
+    -DANDROID_ABI="arm64-v8a" \
+    -DANDROID_PLATFORM=android-21 ..
+
+# if you want to enable vulkan, platform api version >= android-24 is needed
+$ cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
+    -DANDROID_ABI="arm64-v8a" \
+    -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
+
+$ make -j4
+$ make install
+
+pick build-android-aarch64/install folder for further jni usage
+```
+
+***
+
+### Build for iOS on MacOSX with xcode
+you can use the prebuilt ncnn.framework and openmp.framework from https://github.com/Tencent/ncnn/releases
+
+install xcode
+
+**Because the compiler bundled with xcode does not support the openmp feature, you cannot enable the multithreading inference feature of the ncnn library if you build with xcode.**
+
+(optional) download and install vulkan-sdk from https://vulkan.lunarg.com/sdk/home
+```
+$ wget https://sdk.lunarg.com/sdk/download/1.1.92.1/mac/vulkansdk-macos-1.1.92.1.tar.gz?Human=true -O vulkansdk-macos-1.1.92.1.tar.gz
+$ tar -xf vulkansdk-macos-1.1.92.1.tar.gz
+
+# setup env
+$ export VULKAN_SDK=`pwd`/vulkansdk-macos-1.1.92.1/macOS
+```
+
+build library for iPhoneOS
+```
+$ cd <ncnn-root-dir>
+$ mkdir build-ios
+$ cd build-ios
+
+$ cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DIOS_PLATFORM=OS ..
+
+# vulkan is only available on arm64 devices
+$ cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DIOS_PLATFORM=OS64 -DVulkan_INCLUDE_DIR=`pwd`/vulkansdk-macos-1.1.92.1/MoltenVK/include -DVulkan_LIBRARY=`pwd`/vulkansdk-macos-1.1.92.1/MoltenVK/iOS/dynamic/libMoltenVK.dylib -DNCNN_VULKAN=ON ..
+
+$ make -j4
+$ make install
+```
+
+build library for iPhoneSimulator
+```
+$ cd <ncnn-root-dir>
+$ mkdir build-ios-sim
+$ cd build-ios-sim
+$ cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DIOS_PLATFORM=SIMULATOR ..
+$ make -j4
+$ make install
+```
+package framework
+```
+$ cd <ncnn-root-dir>
+$ mkdir -p ncnn.framework/Versions/A/Headers
+$ mkdir -p ncnn.framework/Versions/A/Resources
+$ ln -s A ncnn.framework/Versions/Current
+$ ln -s Versions/Current/Headers ncnn.framework/Headers
+$ ln -s Versions/Current/Resources ncnn.framework/Resources
+$ ln -s Versions/Current/ncnn ncnn.framework/ncnn
+$ lipo -create \
+    build-ios/install/lib/libncnn.a \
+    build-ios-sim/install/lib/libncnn.a \
+    -o ncnn.framework/Versions/A/ncnn
+$ cp -r build-ios/install/include/* ncnn.framework/Versions/A/Headers/
+$ cp Info.plist ncnn.framework/Versions/A/Resources/
+
+pick ncnn.framework folder for app development
+```
+
+***
+
+### Build for iOS on Linux with cctools-port
+you can use the prebuilt ncnn.framework and openmp.framework from https://github.com/Tencent/ncnn/releases
+
+setup cross-compiling environment with https://github.com/tpoechtrager/cctools-port
+
+**you can enable the multithreading inference feature of ncnn library, if you build with cctools-port.**
+
+```
+$ cd <ncnn-root-dir>
+
+change CMAKE_IOS_SDK_ROOT variable to your cctools-port target path
+ in iosxc.toolchain.cmake and iossimxc.toolchain.cmake with your favourite editor
+```
+build armv7 arm64 library
+```
+$ cd <ncnn-root-dir>
+$ mkdir -p build-ios
+$ cd build-ios
+$ cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/iosxc.toolchain.cmake ..
+$ make
+$ make install
+```
+build i386 x86_64 simulator library
+```
+$ cd <ncnn-root-dir>
+$ mkdir -p build-ios-sim
+$ cd build-ios-sim
+$ cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/iossimxc.toolchain.cmake ..
+$ make
+$ make install
+```
+package framework
+```
+$ cd <ncnn-root-dir>
+$ mkdir -p ncnn.framework/Versions/A/Headers
+$ mkdir -p ncnn.framework/Versions/A/Resources
+$ ln -s A ncnn.framework/Versions/Current
+$ ln -s Versions/Current/Headers ncnn.framework/Headers
+$ ln -s Versions/Current/Resources ncnn.framework/Resources
+$ ln -s Versions/Current/ncnn ncnn.framework/ncnn
+$ lipo -create \
+    build-ios/install/lib/libncnn.a \
+    build-ios-sim/install/lib/libncnn.a \
+    -o ncnn.framework/Versions/A/ncnn
+$ cp -r build-ios/install/include/* ncnn.framework/Versions/A/Headers/
+$ cp Info.plist ncnn.framework/Versions/A/Resources/
+
+pick ncnn.framework folder for app development
+```
+
+***
+
+### Build for Hisilicon platform with cross-compiling
+download and install Hisilicon SDK
+```
+# the path that toolchain should be installed in
+$ ls /opt/hisi-linux/x86-arm
+```
+```
+$ cd <ncnn-root-dir>
+$ mkdir -p build
+$ cd build
+
+# choose one cmake toolchain file depending on your target platform
+$ cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/hisiv300.toolchain.cmake ..
+$ cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/hisiv500.toolchain.cmake ..
+$ cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/himix100.toolchain.cmake ..
+$ cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/himix200.toolchain.cmake ..
+
+$ make -j4
+$ make install
+```
+
+pick build/install folder for further usage
diff --git a/docs/how-to-build/enable-openmp-for-ios.zh.md b/docs/how-to-build/enable-openmp-for-ios.zh.md
new file mode 100644
index 000000000..c80db36af
--- /dev/null
+++ b/docs/how-to-build/enable-openmp-for-ios.zh.md
@@ -0,0 +1,289 @@
+### 背景知识
+
+目前，最新版本的 Xcode-8.2.1 携带的 clang 编译器不具有 openmp 特性，Apple 方面比较推 GCD 技术，网络上能找到的大多是教你如何把 openmp 代码改用 GCD 实现，而关于如何真正用 openmp，这是第一篇吧
+
+这篇文章记录 up 主在 Linux 上用 clang 编译器和交叉编译方式实现 ios 的 openmp 加速
+
+### 准备交叉编译环境
+
+系统：fedora 25
+
+工具链：llvm 3.8.1，clang 3.8.0，fuse 2.9.7
+
+从 Apple 官网上下载 Xcode_7.3.1.dmg，因为 cctools-port 还没有支持最新版的 Xcode
+https://developer.apple.com/download/more/
+
+安装 Xcode dmg 挂载工具
+https://github.com/darlinghq/darling-dmg
+```
+$ mkdir xcode
+$ ./darling-dmg/build/darling-dmg Xcode_7.3.1.dmg xcode
+Skipping partition of type Primary GPT Header
+Skipping partition of type Primary GPT Table
+Skipping partition of type Apple_Free
+Skipping partition of type C12A7328-F81F-11D2-BA4B-00A0C93EC93B
+Using partition #4 of type Apple_HFS
+Everything looks OK, disk mounted
+```
+提取 ios sdk，卸载 dmg
+```
+$ mkdir -p iPhoneSDK/iPhoneOS9.3.sdk
+$ cp -r xcode/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/* iPhoneSDK/iPhoneOS9.3.sdk
+$ cp -r xcode/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/include/c++/* iPhoneSDK/iPhoneOS9.3.sdk/usr/include/c++
+$ fusermount -u xcode  # unmount the image
+```
+打包 ios sdk
+```
+$ cd iPhoneSDK
+$ tar -cf - iPhoneOS9.3.sdk | xz -9 -c - > iPhoneOS9.3.sdk.tar.xz
+```
+安装 cctools 移植版本，感谢 cjacker 和 tpoechtrager 的辛勤付出！
+https://github.com/tpoechtrager/cctools-port
+```
+$ cd cctools-port/usage_examples/ios_toolchain
+$ ./build.sh iPhoneOS9.3.sdk.tar.xz armv7
+```
+交叉编译的工具链在 cctools-port/usage_examples/ios_toolchain/target/bin 目录下
+
+### 编译 libomp
+
+下载 llvm 官网上的 openmp 运行时
+```
+svn co http://llvm.org/svn/llvm-project/openmp/trunk openmp
+```
+准备好 ios 交叉编译的 cmake toolchain，文件名 iosxc.toolchain.cmake，放在 openmp 目录中
+```
+# standard settings
+set (CMAKE_SYSTEM_NAME Darwin)
+set (CMAKE_SYSTEM_VERSION 1)
+set (UNIX True)
+set (APPLE True)
+set (IOS True)
+
+set(CMAKE_C_COMPILER arm-apple-darwin11-clang)
+set(CMAKE_CXX_COMPILER arm-apple-darwin11-clang++)
+
+set(_CMAKE_TOOLCHAIN_PREFIX arm-apple-darwin11-)
+
+# 这里指定交叉编译工具链中的 sdk 目录
+set(CMAKE_IOS_SDK_ROOT "/home/nihui/osd/cctools-port/usage_examples/ios_toolchain/target/SDK/")
+
+# set the sysroot default to the most recent SDK
+set(CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support")
+
+# set the architecture for iOS 双架构
+set(IOS_ARCH armv7;arm64)
+set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS")
+
+# set the find root to the iOS developer roots and to user defined paths
+set(CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE string "iOS find search path root")
+
+# searching for frameworks only
+set(CMAKE_FIND_FRAMEWORK FIRST)
+
+# set up the default search directories for frameworks
+set(CMAKE_SYSTEM_FRAMEWORK_PATH
+    ${CMAKE_IOS_SDK_ROOT}/System/Library/Frameworks
+)
+```
+直接编译的话会失败，所以 up 主自己弄了下面三个补丁，放在附件里可以下载
+编译补丁A，ios sdk 没有 crt_externs.h 头文件，改为经典声明
+```
+diff --git a/runtime/src/kmp_environment.cpp b/runtime/src/kmp_environment.cpp
+index d4d95df..c8c2970 100644
+--- a/runtime/src/kmp_environment.cpp
++++ b/runtime/src/kmp_environment.cpp
+@@ -64,12 +64,12 @@
+ #if KMP_OS_UNIX
+     #include <stdlib.h>    // getenv, setenv, unsetenv.
+     #include <string.h>    // strlen, strcpy.
+-    #if KMP_OS_DARWIN
+-        #include <crt_externs.h>
+-        #define environ (*_NSGetEnviron())
+-    #else
++//     #if KMP_OS_DARWIN
++//         #include <crt_externs.h>
++//         #define environ (*_NSGetEnviron())
++//     #else
+         extern char * * environ;
+-    #endif
++//     #endif
+ #elif KMP_OS_WINDOWS
+     #include <windows.h>   // GetEnvironmentVariable, SetEnvironmentVariable, GetLastError.
+ #else
+
+```
+```
+$ patch -p1 -i openmp-ios-classic-environ.patch
+```
+编译补丁B，clang 不支持 .size 的语法，删除，并为符号名字前补上额外下划线，不然会链接失败
+```
+diff --git a/runtime/src/z_Linux_asm.s b/runtime/src/z_Linux_asm.s
+index d6e1c0b..69f94ef 100644
+--- a/runtime/src/z_Linux_asm.s
++++ b/runtime/src/z_Linux_asm.s
+@@ -1781,10 +1781,10 @@ __kmp_invoke_microtask:
+     .comm .gomp_critical_user_,32,8
+     .data
+     .align 4
+-    .global __kmp_unnamed_critical_addr
+-__kmp_unnamed_critical_addr:
++    .global ___kmp_unnamed_critical_addr
++___kmp_unnamed_critical_addr:
+     .4byte .gomp_critical_user_
+-    .size __kmp_unnamed_critical_addr,4
++//    .size __kmp_unnamed_critical_addr,4
+ #endif /* KMP_ARCH_ARM */
+ 
+ #if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
+@@ -1792,10 +1792,10 @@ __kmp_unnamed_critical_addr:
+     .comm .gomp_critical_user_,32,8
+     .data
+     .align 8
+-    .global __kmp_unnamed_critical_addr
+-__kmp_unnamed_critical_addr:
++    .global ___kmp_unnamed_critical_addr
++___kmp_unnamed_critical_addr:
+     .8byte .gomp_critical_user_
+-    .size __kmp_unnamed_critical_addr,8
++//    .size __kmp_unnamed_critical_addr,8
+ #endif /* KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 */
+ 
+ #if KMP_OS_LINUX
+
+```
+```
+$ patch -p1 -i openmp-kmp_unnamed_critical_addr-clang-arm-build-fix.patch
+```
+编译补丁C，交叉编译的工具链会把 complex 类型的除法放在 compiler-rt builtin library 实现，但是 ios sdk 本身没有，为了免去麻烦就直接删掉了。这个补丁也许在正常的 macos 下不需要，不过 up 主不用 macos 也就不管了
+```
+diff --git a/runtime/src/kmp_atomic.cpp b/runtime/src/kmp_atomic.cpp
+index 3831165..b969175 100644
+--- a/runtime/src/kmp_atomic.cpp
++++ b/runtime/src/kmp_atomic.cpp
+@@ -1139,23 +1139,23 @@ ATOMIC_CRITICAL( float16, div, QUAD_LEGACY,     /, 16r,   1 )            // __km
+ ATOMIC_CMPXCHG_WORKAROUND( cmplx4, add, kmp_cmplx32, 64, +, 8c, 7, 1 )   // __kmpc_atomic_cmplx4_add
+ ATOMIC_CMPXCHG_WORKAROUND( cmplx4, sub, kmp_cmplx32, 64, -, 8c, 7, 1 )   // __kmpc_atomic_cmplx4_sub
+ ATOMIC_CMPXCHG_WORKAROUND( cmplx4, mul, kmp_cmplx32, 64, *, 8c, 7, 1 )   // __kmpc_atomic_cmplx4_mul
+-ATOMIC_CMPXCHG_WORKAROUND( cmplx4, div, kmp_cmplx32, 64, /, 8c, 7, 1 )   // __kmpc_atomic_cmplx4_div
++// ATOMIC_CMPXCHG_WORKAROUND( cmplx4, div, kmp_cmplx32, 64, /, 8c, 7, 1 )   // __kmpc_atomic_cmplx4_div
+ // end of the workaround for C78287
+ #else
+ ATOMIC_CRITICAL( cmplx4,  add, kmp_cmplx32,     +,  8c,   1 )            // __kmpc_atomic_cmplx4_add
+ ATOMIC_CRITICAL( cmplx4,  sub, kmp_cmplx32,     -,  8c,   1 )            // __kmpc_atomic_cmplx4_sub
+ ATOMIC_CRITICAL( cmplx4,  mul, kmp_cmplx32,     *,  8c,   1 )            // __kmpc_atomic_cmplx4_mul
+-ATOMIC_CRITICAL( cmplx4,  div, kmp_cmplx32,     /,  8c,   1 )            // __kmpc_atomic_cmplx4_div
++// ATOMIC_CRITICAL( cmplx4,  div, kmp_cmplx32,     /,  8c,   1 )            // __kmpc_atomic_cmplx4_div
+ #endif // USE_CMPXCHG_FIX
+ 
+ ATOMIC_CRITICAL( cmplx8,  add, kmp_cmplx64,     +, 16c,   1 )            // __kmpc_atomic_cmplx8_add
+ ATOMIC_CRITICAL( cmplx8,  sub, kmp_cmplx64,     -, 16c,   1 )            // __kmpc_atomic_cmplx8_sub
+ ATOMIC_CRITICAL( cmplx8,  mul, kmp_cmplx64,     *, 16c,   1 )            // __kmpc_atomic_cmplx8_mul
+-ATOMIC_CRITICAL( cmplx8,  div, kmp_cmplx64,     /, 16c,   1 )            // __kmpc_atomic_cmplx8_div
++// ATOMIC_CRITICAL( cmplx8,  div, kmp_cmplx64,     /, 16c,   1 )            // __kmpc_atomic_cmplx8_div
+ ATOMIC_CRITICAL( cmplx10, add, kmp_cmplx80,     +, 20c,   1 )            // __kmpc_atomic_cmplx10_add
+ ATOMIC_CRITICAL( cmplx10, sub, kmp_cmplx80,     -, 20c,   1 )            // __kmpc_atomic_cmplx10_sub
+ ATOMIC_CRITICAL( cmplx10, mul, kmp_cmplx80,     *, 20c,   1 )            // __kmpc_atomic_cmplx10_mul
+-ATOMIC_CRITICAL( cmplx10, div, kmp_cmplx80,     /, 20c,   1 )            // __kmpc_atomic_cmplx10_div
++// ATOMIC_CRITICAL( cmplx10, div, kmp_cmplx80,     /, 20c,   1 )            // __kmpc_atomic_cmplx10_div
+ #if KMP_HAVE_QUAD
+ ATOMIC_CRITICAL( cmplx16, add, CPLX128_LEG,     +, 32c,   1 )            // __kmpc_atomic_cmplx16_add
+ ATOMIC_CRITICAL( cmplx16, sub, CPLX128_LEG,     -, 32c,   1 )            // __kmpc_atomic_cmplx16_sub
+@@ -1541,7 +1541,7 @@ ATOMIC_BEGIN_MIX(TYPE_ID,TYPE,OP_ID,RTYPE_ID,RTYPE)
+ ATOMIC_CMPXCHG_CMPLX( cmplx4, kmp_cmplx32, add, 64, +, cmplx8,  kmp_cmplx64,  8c, 7, KMP_ARCH_X86 ) // __kmpc_atomic_cmplx4_add_cmplx8
+ ATOMIC_CMPXCHG_CMPLX( cmplx4, kmp_cmplx32, sub, 64, -, cmplx8,  kmp_cmplx64,  8c, 7, KMP_ARCH_X86 ) // __kmpc_atomic_cmplx4_sub_cmplx8
+ ATOMIC_CMPXCHG_CMPLX( cmplx4, kmp_cmplx32, mul, 64, *, cmplx8,  kmp_cmplx64,  8c, 7, KMP_ARCH_X86 ) // __kmpc_atomic_cmplx4_mul_cmplx8
+-ATOMIC_CMPXCHG_CMPLX( cmplx4, kmp_cmplx32, div, 64, /, cmplx8,  kmp_cmplx64,  8c, 7, KMP_ARCH_X86 ) // __kmpc_atomic_cmplx4_div_cmplx8
++// ATOMIC_CMPXCHG_CMPLX( cmplx4, kmp_cmplx32, div, 64, /, cmplx8,  kmp_cmplx64,  8c, 7, KMP_ARCH_X86 ) // __kmpc_atomic_cmplx4_div_cmplx8
+ 
+ // READ, WRITE, CAPTURE are supported only on IA-32 architecture and Intel(R) 64
+ #if KMP_ARCH_X86 || KMP_ARCH_X86_64
+diff --git a/runtime/src/kmp_atomic.h b/runtime/src/kmp_atomic.h
+index 7a98de6..d3d37c2 100644
+--- a/runtime/src/kmp_atomic.h
++++ b/runtime/src/kmp_atomic.h
+@@ -573,15 +573,15 @@ void __kmpc_atomic_float16_div( ident_t *id_ref, int gtid, QUAD_LEGACY * lhs, QU
+ void __kmpc_atomic_cmplx4_add(  ident_t *id_ref, int gtid, kmp_cmplx32 * lhs, kmp_cmplx32 rhs );
+ void __kmpc_atomic_cmplx4_sub(  ident_t *id_ref, int gtid, kmp_cmplx32 * lhs, kmp_cmplx32 rhs );
+ void __kmpc_atomic_cmplx4_mul(  ident_t *id_ref, int gtid, kmp_cmplx32 * lhs, kmp_cmplx32 rhs );
+-void __kmpc_atomic_cmplx4_div(  ident_t *id_ref, int gtid, kmp_cmplx32 * lhs, kmp_cmplx32 rhs );
++// void __kmpc_atomic_cmplx4_div(  ident_t *id_ref, int gtid, kmp_cmplx32 * lhs, kmp_cmplx32 rhs );
+ void __kmpc_atomic_cmplx8_add(  ident_t *id_ref, int gtid, kmp_cmplx64 * lhs, kmp_cmplx64 rhs );
+ void __kmpc_atomic_cmplx8_sub(  ident_t *id_ref, int gtid, kmp_cmplx64 * lhs, kmp_cmplx64 rhs );
+ void __kmpc_atomic_cmplx8_mul(  ident_t *id_ref, int gtid, kmp_cmplx64 * lhs, kmp_cmplx64 rhs );
+-void __kmpc_atomic_cmplx8_div(  ident_t *id_ref, int gtid, kmp_cmplx64 * lhs, kmp_cmplx64 rhs );
++// void __kmpc_atomic_cmplx8_div(  ident_t *id_ref, int gtid, kmp_cmplx64 * lhs, kmp_cmplx64 rhs );
+ void __kmpc_atomic_cmplx10_add( ident_t *id_ref, int gtid, kmp_cmplx80 * lhs, kmp_cmplx80 rhs );
+ void __kmpc_atomic_cmplx10_sub( ident_t *id_ref, int gtid, kmp_cmplx80 * lhs, kmp_cmplx80 rhs );
+ void __kmpc_atomic_cmplx10_mul( ident_t *id_ref, int gtid, kmp_cmplx80 * lhs, kmp_cmplx80 rhs );
+-void __kmpc_atomic_cmplx10_div( ident_t *id_ref, int gtid, kmp_cmplx80 * lhs, kmp_cmplx80 rhs );
++// void __kmpc_atomic_cmplx10_div( ident_t *id_ref, int gtid, kmp_cmplx80 * lhs, kmp_cmplx80 rhs );
+ #if KMP_HAVE_QUAD
+ void __kmpc_atomic_cmplx16_add( ident_t *id_ref, int gtid, CPLX128_LEG * lhs, CPLX128_LEG rhs );
+ void __kmpc_atomic_cmplx16_sub( ident_t *id_ref, int gtid, CPLX128_LEG * lhs, CPLX128_LEG rhs );
+@@ -753,7 +753,7 @@ void __kmpc_atomic_float10_div_rev_fp( ident_t *id_ref, int gtid, long double *
+ void __kmpc_atomic_cmplx4_add_cmplx8( ident_t *id_ref, int gtid, kmp_cmplx32 * lhs, kmp_cmplx64 rhs );
+ void __kmpc_atomic_cmplx4_sub_cmplx8( ident_t *id_ref, int gtid, kmp_cmplx32 * lhs, kmp_cmplx64 rhs );
+ void __kmpc_atomic_cmplx4_mul_cmplx8( ident_t *id_ref, int gtid, kmp_cmplx32 * lhs, kmp_cmplx64 rhs );
+-void __kmpc_atomic_cmplx4_div_cmplx8( ident_t *id_ref, int gtid, kmp_cmplx32 * lhs, kmp_cmplx64 rhs );
++// void __kmpc_atomic_cmplx4_div_cmplx8( ident_t *id_ref, int gtid, kmp_cmplx32 * lhs, kmp_cmplx64 rhs );
+ 
+ // generic atomic routines
+ void __kmpc_atomic_1(  ident_t *id_ref, int gtid, void* lhs, void* rhs, void (*f)( void *, void *, void * ) );
+
+```
+```
+$ patch -p1 -i openmp-atomic-drop-complex-div.patch
+```
+编译 libomp 静态库
+```
+$ mkdir build-ios
+$ cd build-ios
+$ cmake -DCMAKE_TOOLCHAIN_FILE=../iosxc.toolchain.cmake -DLIBOMP_ENABLE_SHARED=off ..
+$ make
+```
+编译完成后，libomp.a 和 omp.h 在 openmp/build-ios/runtime/src 目录里
+把这两个文件分别放在 cctools-port/usage_examples/ios_toolchain/target/SDK/usr/lib 和 cctools-port/usage_examples/ios_toolchain/target/SDK/usr/include 里面，成为 sdk 的一部分
+
+### openmp 测试程序
+```
+#include <stdio.h>
+#include "omp.h"
+
+int main()
+{
+    #pragma omp parallel for
+    for (int i=0; i<20; i++)
+    {
+        fprintf(stderr, "%d\n", i);
+    }
+}
+```
+编译方法，增加 -fopenmp 参数
+```
+$ cctools-port/usage_examples/ios_toolchain/target/bin/arm-apple-darwin11-clang -fopenmp testomp.c -o testomp
+```
+找一台双核cpu的越狱设备，比如这个 ipad2，把程序上传后运行，乱序输出表明 openmp 是可用的了
+```
+Chengjiede-iPad:~ root# ./testomp
+0
+1
+2
+3
+10
+4
+11
+5
+12
+6
+13
+7
+14
+8
+15
+9
+16
+17
+18
+19
+```
diff --git a/docs/how-to-use-and-FAQ/FAQ-ncnn-produce-wrong-result.md b/docs/how-to-use-and-FAQ/FAQ-ncnn-produce-wrong-result.md
new file mode 100644
index 000000000..379112ea8
--- /dev/null
+++ b/docs/how-to-use-and-FAQ/FAQ-ncnn-produce-wrong-result.md
@@ -0,0 +1,128 @@
+### caffemodel should be row-major
+
+`caffe2ncnn` tool assumes the caffemodel is row-major (produced by c++ caffe train command).
+
+The kernel 3x3 weights should be stored as
+```
+a b c
+d e f
+g h i
+```
+
+However, matlab caffe produces a col-major caffemodel.
+
+You have to transpose all the kernel weights by yourself or retrain using the c++ caffe train command.
+
+Besides, you may be interested in https://github.com/conanhujinming/matcaffe2caffe
+
+### check input is RGB or BGR
+
+If your caffemodel is trained using c++ caffe and opencv, then the input image should be BGR order.
+
+If your model is trained using matlab caffe or mxnet or tensorflow, the input image would probably be RGB order.
+
+The channel order can be changed on-the-fly through proper pixel type enum
+```
+// construct RGB blob from rgb image
+ncnn::Mat in_rgb = ncnn::Mat::from_pixels(rgb_data, ncnn::Mat::PIXEL_RGB, w, h);
+
+// construct BGR blob from bgr image
+ncnn::Mat in_bgr = ncnn::Mat::from_pixels(bgr_data, ncnn::Mat::PIXEL_BGR, w, h);
+
+// construct BGR blob from rgb image
+ncnn::Mat in_bgr = ncnn::Mat::from_pixels(rgb_data, ncnn::Mat::PIXEL_RGB2BGR, w, h);
+
+// construct RGB blob from bgr image
+ncnn::Mat in_rgb = ncnn::Mat::from_pixels(bgr_data, ncnn::Mat::PIXEL_BGR2RGB, w, h);
+```
+
+### Mat::from_pixels/from_pixels_resize assume that the pixel data is continuous
+
+You shall pass a continuous pixel buffer to the from_pixels family.
+
+If your image is an opencv submat from an image roi, call clone() to get a continuous one.
+```
+cv::Mat image;// the image
+cv::Rect facerect;// the face rectangle
+
+cv::Mat faceimage = image(facerect).clone();// get a continuous sub image
+
+ncnn::Mat in = ncnn::Mat::from_pixels(faceimage.data, ncnn::Mat::PIXEL_BGR, faceimage.cols, faceimage.rows);
+```
+
+### pre process
+Apply pre process according to your training configuration
+
+Different models have different pre process configs, you may find the following transform config in the Data layer section
+```
+transform_param {
+    mean_value: 103.94
+    mean_value: 116.78
+    mean_value: 123.68
+    scale: 0.017
+}
+```
+Then the corresponding code for ncnn pre process is
+```
+const float mean_vals[3] = { 103.94f, 116.78f, 123.68f };
+const float norm_vals[3] = { 0.017f, 0.017f, 0.017f };
+in.substract_mean_normalize(mean_vals, norm_vals);
+```
+
+Mean file is not supported currently
+
+So you have to pre process the input data by yourself (use opencv or something)
+```
+transform_param {
+    mean_file: "imagenet_mean.binaryproto"
+}
+```
+
+### use the desired blob
+The blob names for input and extract differ among models.
+
+For example, squeezenet v1.1 uses "data" as input blob and "prob" as output blob while mobilenet-ssd uses "data" as input blob and "detection_out" as output blob.
+
+Some models may need multiple input or produce multiple output.
+
+```
+ncnn::Extractor ex = net.create_extractor();
+
+ex.input("data", in);// change "data" to yours
+ex.input("mask", mask);// change "mask" to yours
+
+ex.extract("output1", out1);// change "output1" to yours
+ex.extract("output2", out2);// change "output2" to yours
+```
+
+### blob may have channel gap
+Each channel pointer is aligned by 128bit in ncnn Mat structure.
+
+blob may have gaps between channels if (width x height) cannot be divided exactly by 4
+
+Prefer using ncnn::Mat::from_pixels or ncnn::Mat::from_pixels_resize for constructing input blob from image data
+
+If you do need a continuous blob buffer, reshape the output.
+```
+// out is the output blob extracted
+ncnn::Mat flattened_out = out.reshape(out.w * out.h * out.c);
+
+// plain array, C-H-W
+const float* outptr = flattened_out;
+```
+
+### create new Extractor for each image
+The `ncnn::Extractor` object is stateful: if you reuse it for a different input, you will always get exactly the same result cached inside.
+
+Always create new Extractor to process images in loop unless you do know how the stateful Extractor works.
+```
+for (int i=0; i<count; i++)
+{
+    // always create Extractor
+    // it's cheap and almost instant!
+    ncnn::Extractor ex = net.create_extractor();
+
+    // use
+    ex.input(your_data[i]);
+}
+```
diff --git a/docs/how-to-use-and-FAQ/FAQ-ncnn-throw-error.md b/docs/how-to-use-and-FAQ/FAQ-ncnn-throw-error.md
new file mode 100644
index 000000000..80e845b6a
--- /dev/null
+++ b/docs/how-to-use-and-FAQ/FAQ-ncnn-throw-error.md
@@ -0,0 +1,82 @@
+### param is too old, please regenerate
+
+Your model file is in the old format, converted by an old caffe2ncnn tool.
+
+Checkout the latest ncnn code, build it and regenerate param and model binary files, and that should work.
+
+Make sure that your param file starts with the magic number 7767517.
+
+You may find more info in [use-ncnn-with-alexnet](use-ncnn-with-alexnet.md).
+
+### find_blob_index_by_name XYZ failed
+
+That means ncnn couldn't find the XYZ blob in the network. 
+
+You shall call Extractor::input()/extract() by blob name instead of layer name.
+
+For models loaded from binary param file or external memory, you shall call Extractor::input()/extract() by the enum defined in xxx.id.h because all the visible string literals have been stripped in binary form.
+
+This error usually happens when the input layer is not properly converted.
+
+You shall upgrade caffe prototxt/caffemodel before converting it to ncnn. The following snippet type shall be ok.
+
+```
+layer {
+  name: "data"
+  type: "Input"
+  top: "data"
+  input_param { shape: { dim: 1 dim: 3 dim: 227 dim: 227 } }
+}
+```
+
+You may find more info in [use-ncnn-with-alexnet](use-ncnn-with-alexnet.md).
+
+### layer XYZ not exists or registered
+
+Your network contains some operations that are not implemented in ncnn.
+
+You may implement them as custom layers by following [how-to-implement-custom-layer-step-by-step](how-to-implement-custom-layer-step-by-step.md).
+
+Or you could simply register them as no-op if you are sure those operations make no sense.
+
+```
+class Noop : public ncnn::Layer {};
+DEFINE_LAYER_CREATOR(Noop)
+
+net.register_custom_layer("LinearRegressionOutput", Noop_layer_creator);
+net.register_custom_layer("MAERegressionOutput", Noop_layer_creator);
+```
+
+### fopen XYZ.param/XYZ.bin failed
+
+File not found or not readable. Make sure that XYZ.param/XYZ.bin is accessible.
+
+### network graph not ready
+
+You shall call Net::load_param() first, then Net::load_model().
+
+This error may also happen when Net::load_param() failed but the failure was not properly handled.
+
+### memory not 32-bit aligned at XYZ
+
+The pointer passed to Net::load_param() or Net::load_model() is not 32bit aligned.
+
+In practice, the head pointer of std::vector<unsigned char> is not guaranteed to be 32bit aligned.
+
+you can store your binary buffer in ncnn::Mat structure, its internal memory is aligned.
+
+### Why I get so many XYZ unsupported yet errors when converting tensorflow model
+
+~~Sorry, I decided to give up maintaining this tool after struggling quite lots time.~~
+I have given up maintaining this tool!
+
+The tensorflow model usually contains huge amount of operations which are not implemented in ncnn, and the operations are too tiny and piecemeal that makes big runtime overhead which is not suitable for deploying on devices.
+
+### undefined reference to '__kmpc_XYZ_XYZ'
+
+use clang for building android shared library
+
+comment the following line in your Application.mk
+```
+NDK_TOOLCHAIN_VERSION := 4.9
+```
diff --git a/docs/how-to-use-and-FAQ/FAQ-ncnn-vulkan.md b/docs/how-to-use-and-FAQ/FAQ-ncnn-vulkan.md
new file mode 100644
index 000000000..3496af83d
--- /dev/null
+++ b/docs/how-to-use-and-FAQ/FAQ-ncnn-vulkan.md
@@ -0,0 +1,92 @@
+### how to enable ncnn vulkan capability
+
+follow [the build and install instruction](../how-to-build/build.md)
+
+make sure you have installed vulkan sdk from [lunarg vulkan sdk website](https://vulkan.lunarg.com/sdk/home)
+
+Usually, you can enable the vulkan compute inference feature by adding only three lines of code to your application.
+
+```
+// initialize when app starts
+ncnn::create_gpu_instance();// line1
+
+// enable vulkan compute feature before loading
+ncnn::Net net;
+net.opt.use_vulkan_compute = 1;// line2
+
+// deinitialize when app exits
+ncnn::destroy_gpu_instance();// line3
+```
+
+### does my graphics device support vulkan
+
+Some platforms have been tested and are known to work. In theory, if your platform supports the vulkan api, either 1.0 or 1.1, it shall work.
+
+* Y = known work
+* ? = shall work, not confirmed
+* / = not applied
+
+|    |windows|linux|android|mac|ios|
+|---|---|---|---|---|---|
+|intel|Y|Y|?|?|/|
+|amd|Y|Y|/|?|/|
+|nvidia|Y|Y|?|/|/|
+|qcom|/|/|Y|/|/|
+|apple|/|/|/|?|Y|
+|arm|/|?|Y|/|/|
+
+You can search [the vulkan database](https://vulkan.gpuinfo.org) to see if your device supports vulkan.
+
+Some old buggy drivers may produce wrong results; they are blacklisted in ncnn and treated as non-vulkan capable devices.
+You could check if your device and driver have this issue with  [my conformance test here](../benchmark/vulkan-conformance-test.md).
+Most of these systems are android with version lower than 8.1.
+
+### why using vulkan over cuda/opencl/metal
+
+In the beginning, I had no GPGPU programming experience, and I had to learn one.
+
+vulkan is considered more portable, is well supported by vendors, and is the cross-platform low-overhead graphics api. In contrast, cuda is only available on nvidia devices, metal is only available on macos and ios, while loading the opencl library is banned in android 7.0+ and does not work on ios.
+
+### I got errors like "vkCreateComputePipelines failed -1000012000" or random stalls or crashes
+
+Upgrade your vulkan driver.
+
+[intel https://downloadcenter.intel.com/product/80939/Graphics-Drivers](https://downloadcenter.intel.com/product/80939/Graphics-Drivers)
+
+[amd https://www.amd.com/en/support](https://www.amd.com/en/support)
+
+[nvidia https://www.nvidia.com/Download/index.aspx](https://www.nvidia.com/Download/index.aspx)
+
+### how to use ncnn vulkan on android
+
+minimum android ndk version: android-ndk-r18b
+
+minimum sdk platform api version: android-24
+
+link your jni project with libvulkan.so
+
+[The squeezencnn example](https://github.com/Tencent/ncnn/tree/master/examples/squeezencnn) is equipped with gpu inference, you could take it as reference.
+
+### how to use ncnn vulkan on ios
+
+setup vulkan sdk (https://vulkan.lunarg.com/sdk/home#mac)
+
+metal only works on real device with arm64 cpu (iPhone 5s and later)
+
+link your project with MoltenVK framework and Metal
+
+### what about the layers without vulkan support
+
+These layers have vulkan support currently
+
+AbsVal, BatchNorm, BinaryOp(no broadcasting), Cast, Clip, Concat, Convolution(pad -233 not supported), ConvolutionDepthWise(pad -233 not supported), Crop, Deconvolution, DeconvolutionDepthWise, Dropout, Eltwise, Flatten, HardSigmoid, InnerProduct, Interp, LRN, Packing, Padding, Permute, Pooling(pad SAME not supported), PReLU, PriorBox, ReLU, Reorg, Reshape, Scale, ShuffleChannel, Sigmoid, Softmax, TanH, UnaryOp
+
+For these layers without vulkan support, ncnn inference engine will automatically fallback to cpu path.
+
+Thus, it is usually not a serious issue if your network only has some special head layers like SSD or YOLO. All examples in ncnn are known working properly with vulkan enabled.
+
+### my model runs slower on gpu than cpu
+
+The current vulkan inference implementation is far from the preferred state. Many handy optimization techniques are planned, such as winograd convolution, operator fusion, fp16 storage and arithmetic etc.
+
+It is common that your model runs slower on gpu than cpu on arm devices like mobile phones, since we have quite good arm optimization in ncnn ;)
diff --git a/docs/how-to-use-and-FAQ/quantized-int8-inference.md b/docs/how-to-use-and-FAQ/quantized-int8-inference.md
new file mode 100644
index 000000000..2e948f6a3
--- /dev/null
+++ b/docs/how-to-use-and-FAQ/quantized-int8-inference.md
@@ -0,0 +1,45 @@
+under construction ...
+
+## caffe-int8-convert-tools
+https://github.com/BUG1989/caffe-int8-convert-tools
+
+## convert caffe model to ncnn quantized int8 model
+### the offline way, reduce model binary size down to 25%
+
+|sample model binary|size|
+|---|---|
+|squeezenet.bin|4.7M|
+|squeezenet-int8.bin|1.2M|
+|mobilenet_ssd_voc.bin|22.1M|
+|mobilenet_ssd_voc-int8.bin|5.6M|
+
+```
+./caffe2ncnn resnet.prototxt resnet.caffemodel resnet-int8.param resnet-int8.bin 256 resnet.table
+```
+### the runtime way, no model binary reduction
+```
+./caffe2ncnn resnet.prototxt resnet.caffemodel resnet-fp32-int8.param resnet-fp32-int8.bin 0 resnet.table
+```
+
+## use ncnn int8 inference
+the ncnn library would use int8 inference automatically, nothing changed in your code
+```
+ncnn::Net resnet;
+resnet.load_param("resnet-int8.param");
+resnet.load_model("resnet-int8.bin");
+```
+### turn off int8 inference, the runtime model only
+```
+ncnn::Net resnet;
+resnet.use_int8_inference = 0;// set the switch before loading, force int8 inference off
+resnet.load_param("resnet-fp32-int8.param");
+resnet.load_model("resnet-fp32-int8.bin");
+```
+
+## mixed precision inference
+before converting your model files, delete the layer weight scale line in table file, and that layer will do the float32 inference
+```
+conv1_param_0 156.639840536
+```
+```
+```
diff --git a/docs/how-to-use-and-FAQ/use-ncnn-with-alexnet.md b/docs/how-to-use-and-FAQ/use-ncnn-with-alexnet.md
new file mode 100644
index 000000000..bdd6b8963
--- /dev/null
+++ b/docs/how-to-use-and-FAQ/use-ncnn-with-alexnet.md
@@ -0,0 +1,161 @@
+We use alexnet as an example
+
+### prepare caffe prototxt and model
+
+These files will usually be generated when training with caffe
+```
+train.prototxt
+deploy.prototxt
+snapshot_10000.caffemodel
+```
+deploy.prototxt and caffemodel file are enough for TEST phase
+
+alexnet deploy.prototxt can be downloaded here
+
+https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet
+
+alexnet caffemodel can be downloaded here
+
+http://dl.caffe.berkeleyvision.org/bvlc_alexnet.caffemodel
+
+### convert to ncnn model
+
+Convert old caffe prototxt and caffemodel to new ones using tools in caffe
+
+because the ncnn convert tool needs the new format
+```
+upgrade_net_proto_text [old prototxt] [new prototxt]
+upgrade_net_proto_binary [old caffemodel] [new caffemodel]
+```
+
+Use Input layer as input, set N dim as 1 since only one image can be processed each time
+```
+layer {
+  name: "data"
+  type: "Input"
+  top: "data"
+  input_param { shape: { dim: 1 dim: 3 dim: 227 dim: 227 } }
+}
+```
+Use caffe2ncnn tool to convert caffe model to ncnn model
+```
+caffe2ncnn deploy.prototxt bvlc_alexnet.caffemodel alexnet.param alexnet.bin
+```
+
+### strip visible string
+
+It is already enough for deploying with param and bin file only, but there are visible strings in param file, it may not be suitable to distribute plain neural network information in your APP.
+
+You can use ncnn2mem tool to convert plain model file to binary representation. It will generate alexnet.param.bin and two static array code files.
+```
+ncnn2mem alexnet.param alexnet.bin alexnet.id.h alexnet.mem.h
+```
+
+### load model
+
+Load param and bin file, the easy way
+```
+ncnn::Net net;
+net.load_param("alexnet.param");
+net.load_model("alexnet.bin");
+```
+Load binary param.bin and bin file, no visible strings included, suitable for bundled as APP resource
+```
+ncnn::Net net;
+net.load_param_bin("alexnet.param.bin");
+net.load_model("alexnet.bin");
+```
+Load network and model from external memory, no visible strings included, no external resource files bundled, the whole model is hardcoded in your program
+
+You may use this way to load from android asset resource
+```
+#include "alexnet.mem.h"
+ncnn::Net net;
+net.load_param(alexnet_param_bin);
+net.load_model(alexnet_bin);
+```
+You can choose either way to load model. Loading from external memory is zero-copy, which means you must keep your memory buffer during processing
+
+### unload model
+```
+net.clear();
+```
+
+### input and output
+
+ncnn Mat is the data structure for input and output data
+
+Input image should be converted to Mat, with mean values subtracted and normalized when needed
+
+```
+#include "mat.h"
+unsigned char* rgbdata;// data pointer to RGB image pixels
+int w;// image width
+int h;// image height
+ncnn::Mat in = ncnn::Mat::from_pixels(rgbdata, ncnn::Mat::PIXEL_RGB, w, h);
+
+const float mean_vals[3] = {104.f, 117.f, 123.f};
+in.substract_mean_normalize(mean_vals, 0);
+```
+Execute the network inference and retrieve the result
+```
+#include "net.h"
+ncnn::Mat in;// input blob as above
+ncnn::Mat out;
+ncnn::Extractor ex = net.create_extractor();
+ex.set_light_mode(true);
+ex.input("data", in);
+ex.extract("prob", out);
+```
+If you load model with binary param.bin file, you should use the enum value in alexnet.id.h file instead of the blob name
+```
+#include "net.h"
+#include "alexnet.id.h"
+ncnn::Mat in;// input blob as above
+ncnn::Mat out;
+ncnn::Extractor ex = net.create_extractor();
+ex.set_light_mode(true);
+ex.input(alexnet_param_id::BLOB_data, in);
+ex.extract(alexnet_param_id::BLOB_prob, out);
+```
+Read the data in the output Mat. Iterate data to get all classification scores.
+```
+ncnn::Mat out_flatterned = out.reshape(out.w * out.h * out.c);
+std::vector<float> scores;
+scores.resize(out_flatterned.w);
+for (int j=0; j<out_flatterned.w; j++)
+{
+    scores[j] = out_flatterned[j];
+}
+```
+
+### some tricks
+
+Set multithreading thread number with Extractor
+```
+ex.set_num_threads(4);
+```
+Convert image colorspace and resize image with Mat convenient function, these functions are well optimized
+
+Support RGB2GRAY GRAY2RGB RGB2BGR etc, support scale up and scale down
+```
+#include "mat.h"
+unsigned char* rgbdata;// data pointer to RGB image pixels
+int w;// image width
+int h;// image height
+int target_width = 227;// target resized width
+int target_height = 227;// target resized height
+ncnn::Mat in = ncnn::Mat::from_pixels_resize(rgbdata, ncnn::Mat::PIXEL_RGB2GRAY, w, h, target_width, target_height);
+```
+You can concat multiple model files into one, and load this single file from FILE* interface.
+
+It should ease the distribution of param and model files.
+```
+$ cat alexnet.param.bin alexnet.bin > alexnet-all.bin
+
+#include "net.h"
+FILE* fp = fopen("alexnet-all.bin", "rb");
+net.load_param_bin(fp);
+net.load_model(fp);
+fclose(fp);
+```
diff --git a/docs/how-to-use-and-FAQ/use-ncnn-with-alexnet.zh.md b/docs/how-to-use-and-FAQ/use-ncnn-with-alexnet.zh.md
new file mode 100644
index 000000000..c391b1196
--- /dev/null
+++ b/docs/how-to-use-and-FAQ/use-ncnn-with-alexnet.zh.md
@@ -0,0 +1,148 @@
+首先，非常感谢大家对 ncnn 组件的关注
+为了方便大家使用 ncnn 组件，up主特意写了这篇使用指北，以烂大街的 alexnet 作为例子
+
+
+### 准备caffe网络和模型
+
+caffe 的网络和模型通常是搞深度学习的研究者训练出来的，一般来说训练完会有
+```
+train.prototxt
+deploy.prototxt
+snapshot_10000.caffemodel
+```
+部署的时候只需要 TEST 过程，所以有 deploy.prototxt 和 caffemodel 就足够了
+
+alexnet 的 deploy.prototxt 可以在这里下载
+https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet
+
+alexnet 的 caffemodel 可以在这里下载
+http://dl.caffe.berkeleyvision.org/bvlc_alexnet.caffemodel
+
+### 转换ncnn网络和模型
+
+caffe 自带了工具可以把老版本的 caffe 网络和模型转换为新版（ncnn的工具只认识新版
+```
+upgrade_net_proto_text [老prototxt] [新prototxt]
+upgrade_net_proto_binary [老caffemodel] [新caffemodel]
+```
+输入层改用 Input，因为每次只需要做一个图片，所以第一个 dim 设为 1
+```
+layer {
+  name: "data"
+  type: "Input"
+  top: "data"
+  input_param { shape: { dim: 1 dim: 3 dim: 227 dim: 227 } }
+}
+```
+使用 caffe2ncnn 工具转换为 ncnn 的网络描述和模型
+```
+caffe2ncnn deploy.prototxt bvlc_alexnet.caffemodel alexnet.param alexnet.bin
+```
+### 去除可见字符串
+
+有 param 和 bin 文件其实已经可以用了，但是 param 描述文件是明文的，如果放在 APP 分发出去容易被窥探到网络结构（说得好像不明文就看不到一样
+使用 ncnn2mem 工具转换为二进制描述文件和内存模型，生成 alexnet.param.bin 和两个静态数组的代码文件
+```
+ncnn2mem alexnet.param alexnet.bin alexnet.id.h alexnet.mem.h
+```
+### 加载模型
+
+直接加载 param 和 bin，适合快速验证效果使用
+```
+ncnn::Net net;
+net.load_param("alexnet.param");
+net.load_model("alexnet.bin");
+```
+加载二进制的 param.bin 和 bin，没有可见字符串，适合 APP 分发模型资源
+```
+ncnn::Net net;
+net.load_param_bin("alexnet.param.bin");
+net.load_model("alexnet.bin");
+```
+从内存引用加载网络和模型，没有可见字符串，模型数据全在代码里头，没有任何外部文件
+另外，android apk 打包的资源文件读出来也是内存块
+```
+#include "alexnet.mem.h"
+ncnn::Net net;
+net.load_param(alexnet_param_bin);
+net.load_model(alexnet_bin);
+```
+以上三种都可以加载模型，其中内存引用方式加载是 zero-copy 的，所以使用 net 模型的来源内存块必须存在
+
+### 卸载模型
+```
+net.clear();
+```
+
+### 输入和输出
+
+ncnn 用自己的数据结构 Mat 来存放输入和输出数据
+输入图像的数据要转换为 Mat，依需要减去均值和乘系数
+```
+#include "mat.h"
+unsigned char* rgbdata;// data pointer to RGB image pixels
+int w;// image width
+int h;// image height
+ncnn::Mat in = ncnn::Mat::from_pixels(rgbdata, ncnn::Mat::PIXEL_RGB, w, h);
+
+const float mean_vals[3] = {104.f, 117.f, 123.f};
+in.substract_mean_normalize(mean_vals, 0);
+```
+执行前向网络，获得计算结果
+```
+#include "net.h"
+ncnn::Mat in;// input blob as above
+ncnn::Mat out;
+ncnn::Extractor ex = net.create_extractor();
+ex.set_light_mode(true);
+ex.input("data", in);
+ex.extract("prob", out);
+```
+如果是二进制的 param.bin 方式，没有可见字符串，利用 alexnet.id.h 的枚举来代替 blob 的名字
+```
+#include "net.h"
+#include "alexnet.id.h"
+ncnn::Mat in;// input blob as above
+ncnn::Mat out;
+ncnn::Extractor ex = net.create_extractor();
+ex.set_light_mode(true);
+ex.input(alexnet_param_id::BLOB_data, in);
+ex.extract(alexnet_param_id::BLOB_prob, out);
+```
+获取 Mat 中的输出数据，Mat 内部的数据通常是三维的，c / h / w，遍历所有获得全部分类的分数
+```
+ncnn::Mat out_flatterned = out.reshape(out.w * out.h * out.c);
+std::vector<float> scores;
+scores.resize(out_flatterned.w);
+for (int j=0; j<out_flatterned.w; j++)
+{
+    scores[j] = out_flatterned[j];
+}
+```
+### 某些使用技巧
+
+Extractor 有个多线程加速的开关，设置线程数能加快计算
+```
+ex.set_num_threads(4);
+```
+Mat 转换图像的时候可以顺便转换颜色和缩放大小，这些顺带的操作也是有优化的
+支持 RGB2GRAY GRAY2RGB RGB2BGR 等常用转换，支持缩小和放大
+```
+#include "mat.h"
+unsigned char* rgbdata;// data pointer to RGB image pixels
+int w;// image width
+int h;// image height
+int target_width = 227;// target resized width
+int target_height = 227;// target resized height
+ncnn::Mat in = ncnn::Mat::from_pixels_resize(rgbdata, ncnn::Mat::PIXEL_RGB2GRAY, w, h, target_width, target_height);
+```
+Net 有从 FILE* 文件描述加载的接口，可以利用这点把多个网络和模型文件合并为一个，分发时能方便些，内存引用就无所谓了
+```
+$ cat alexnet.param.bin alexnet.bin > alexnet-all.bin
+
+#include "net.h"
+FILE* fp = fopen("alexnet-all.bin", "rb");
+net.load_param_bin(fp);
+net.load_model(fp);
+fclose(fp);
+```
diff --git a/docs/how-to-use-and-FAQ/use-ncnn-with-pytorch-or-onnx.md b/docs/how-to-use-and-FAQ/use-ncnn-with-pytorch-or-onnx.md
new file mode 100644
index 000000000..f3c23860f
--- /dev/null
+++ b/docs/how-to-use-and-FAQ/use-ncnn-with-pytorch-or-onnx.md
@@ -0,0 +1,55 @@
+Here is a practical guide for converting pytorch model to ncnn
+
+resnet18 is used as the example
+
+## pytorch to onnx
+
+The official pytorch tutorial for exporting onnx model
+
+https://pytorch.org/tutorials/advanced/super_resolution_with_caffe2.html
+
+```
+import torch
+import torchvision
+import torch.onnx
+
+# An instance of your model
+model = torchvision.models.resnet18()
+
+# An example input you would normally provide to your model's forward() method
+x = torch.rand(1, 3, 224, 224)
+
+# Export the model
+torch_out = torch.onnx._export(model, x, "resnet18.onnx", export_params=True)
+```
+
+## simplify onnx model
+
+The exported resnet18.onnx model may contain many redundant operators such as Shape, Gather and Unsqueeze that are not supported in ncnn
+
+```
+Shape not supported yet!
+Gather not supported yet!
+  # axis=0
+Unsqueeze not supported yet!
+  # axes 7
+Unsqueeze not supported yet!
+  # axes 7
+```
+
+Fortunately, daquexian developed a handy tool to eliminate them. cheers!
+
+https://github.com/daquexian/onnx-simplifier
+
+```
+python3 -m onnxsim resnet18.onnx resnet18-sim.onnx
+```
+
+## onnx to ncnn
+
+Finally, you can convert the model to ncnn using tools/onnx2ncnn
+
+```
+onnx2ncnn resnet18-sim.onnx resnet18.param resnet18.bin
+```
+
diff --git a/docs/how-to-use-and-FAQ/use-ncnnoptmize-to-optimize-model.md b/docs/how-to-use-and-FAQ/use-ncnnoptmize-to-optimize-model.md
new file mode 100644
index 000000000..66cdf3c1e
--- /dev/null
+++ b/docs/how-to-use-and-FAQ/use-ncnnoptmize-to-optimize-model.md
@@ -0,0 +1,43 @@
+### Non ARM Linux Platform
+
+the typical usage
+```
+ncnnoptimize mobilenet.param mobilenet.bin mobilenet-opt.param mobilenet-opt.bin 65536 
+```
+
+operator fusion
+* batchnorm - scale
+* convolution - batchnorm
+* convolutiondepthwise - batchnorm
+* deconvolution - batchnorm
+* deconvolutiondepthwise - batchnorm
+* innerproduct - batchnorm
+* convolution - relu
+* convolutiondepthwise - relu
+* deconvolution - relu
+* deconvolutiondepthwise - relu
+* innerproduct - relu
+
+eliminate noop operator
+* innerproduct - dropout
+* flatten after global pooling
+
+prefer better operator
+* replace convolution with innerproduct after global pooling
+
+### ARM Linux Platform
+usage
+```
+ncnnoptimize squeezenet.param squeezenet.bin squeezenet-opt.param squeezenet-opt.bin 0 data 227 224 3
+```
+
+explanation
+
+|parameter|meaning|
+|---|---|
+|data|input data node, currently support one input|
+|227|input width|
+|224|input height|
+|3|input channel|
+
+this feature would auto choose the fastest convolution implementation, normally speedup 10%.
diff --git a/docs/how-to-use-and-FAQ/vulkan-notes.md b/docs/how-to-use-and-FAQ/vulkan-notes.md
new file mode 100644
index 000000000..4e559c7f5
--- /dev/null
+++ b/docs/how-to-use-and-FAQ/vulkan-notes.md
@@ -0,0 +1,184 @@
+# supported platform
+
+* Y = known work
+* ? = shall work, not confirmed
+* / = not applied
+
+|    |windows|linux|android|mac|ios|
+|---|---|---|---|---|---|
+|intel|Y|Y|?|?|/|
+|amd|Y|Y|/|?|/|
+|nvidia|Y|Y|?|/|/|
+|qcom|/|/|Y|/|/|
+|apple|/|/|/|?|Y|
+|arm|/|?|?|/|/|
+
+# enable vulkan compute support
+```
+$ sudo dnf install vulkan-devel
+$ cmake -DNCNN_VULKAN=ON ..
+```
+
+# initialize vulkan runtime
+```
+ncnn::create_gpu_instance();
+
+{
+...
+}
+
+ncnn::destroy_gpu_instance();
+```
+
+# enable vulkan compute inference
+```
+ncnn::Net net;
+net.opt.use_vulkan_compute = 1;
+```
+
+# proper allocator usage
+```
+ncnn::VkAllocator* blob_vkallocator = vkdev.acquire_blob_allocator();
+ncnn::VkAllocator* staging_vkallocator = vkdev.acquire_staging_allocator();
+
+net.opt.blob_vkallocator = blob_vkallocator;
+net.opt.workspace_vkallocator = blob_vkallocator;
+net.opt.staging_vkallocator = staging_vkallocator;
+
+// ....
+
+// after inference
+vkdev.reclaim_blob_allocator(blob_vkallocator);
+vkdev.reclaim_staging_allocator(staging_vkallocator);
+```
+
+# select gpu device
+```
+// get gpu count
+int gpu_count = ncnn::get_gpu_count();
+
+// set specified vulkan device before loading param and model
+net.set_vulkan_device(0); // use device-0
+net.set_vulkan_device(1); // use device-1
+```
+
+# zero-copy on unified memory device
+```
+ncnn::VkMat blob_gpu;
+ncnn::Mat mapped = blob_gpu.mapped();
+
+// use mapped.data directly
+```
+
+# hybrid cpu/gpu inference
+```
+ncnn::Extractor ex_cpu = net.create_extractor();
+ncnn::Extractor ex_gpu = net.create_extractor();
+ex_cpu.set_vulkan_compute(false);
+ex_gpu.set_vulkan_compute(true);
+
+#pragma omp parallel sections
+{
+    #pragma omp section
+    {
+        ex_cpu.input();
+        ex_cpu.extract();
+    }
+    #pragma omp section
+    {
+        ex_gpu.input();
+        ex_gpu.extract();
+    }
+}
+```
+
+# zero-copy gpu inference chaining
+```
+ncnn::Extractor ex1 = net1.create_extractor();
+ncnn::Extractor ex2 = net2.create_extractor();
+
+ncnn::VkCompute cmd(&vkdev);
+
+ncnn::VkMat conv1;
+ncnn::VkMat conv2;
+ncnn::VkMat conv3;
+
+ex1.input("conv1", conv1);
+ex1.extract("conv2", conv2, cmd);
+
+ex2.input("conv2", conv2);
+ex2.extract("conv3", conv3, cmd);
+
+cmd.submit();
+
+cmd.wait();
+
+```
+
+# batch inference
+```
+int max_batch_size = vkdev->info.compute_queue_count;
+
+ncnn::Mat inputs[1000];
+ncnn::Mat outputs[1000];
+
+#pragma omp parallel for num_threads(max_batch_size)
+for (int i=0; i<1000; i++)
+{
+    ncnn::Extractor ex = net1.create_extractor();
+    ex.input("data", inputs[i]);
+    ex.extract("prob", outputs[i]);
+}
+```
+
+# control storage and arithmetic precision
+
+disable all lower-precision optimizations, get full fp32 precision
+
+```
+ncnn::Net net;
+net.opt.use_fp16_packed = false;
+net.opt.use_fp16_storage = false;
+net.opt.use_fp16_arithmetic = false;
+net.opt.use_int8_storage = false;
+net.opt.use_int8_arithmetic = false;
+```
+
+# debugging tips
+```
+#define ENABLE_VALIDATION_LAYER 1 // modify to 1 in gpu.cpp
+```
+
+# add vulkan compute support to layer
+1. add vulkan shader in src/layer/shader/
+
+2. upload model weight data in Layer::upload_model()
+
+3. setup pipeline in Layer::create_pipeline()
+
+4. destroy pipeline in Layer::destroy_pipeline()
+
+5. record command in Layer::forward()
+
+# add optimized shader path
+1. add vulkan shader in src/layer/shader/ named XXX_abc.comp
+
+2. create pipeline with "XXX_abc"
+
+3. record command using XXX_abc pipeline
+
+# low-level op api
+1. create layer
+
+2. load param and load model
+
+3. upload model
+
+4. create pipeline
+
+5. new command
+
+6. record
+
+7. submit and wait
+