add ncnn source qwq

9 years ago · b7db8be4f6
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -0,0 +1,62 @@

 # Cross builds only: when a toolchain file was given, publish the library output
 # root as a cache entry and turn the toolchain file path into an absolute one.
 if(CMAKE_TOOLCHAIN_FILE)
 set(LIBRARY_OUTPUT_PATH_ROOT ${CMAKE_BINARY_DIR} CACHE PATH "root for library output, set this to change where android libs are compiled to")
 # We want an absolute path, but get_filename_component(... ABSOLUTE) resolves only
 # relative to the source dir, so the bare file name is looked up with find_file instead.
 get_filename_component(CMAKE_TOOLCHAIN_FILE_NAME ${CMAKE_TOOLCHAIN_FILE} NAME)
 # Search only the top-level source directory (NO_DEFAULT_PATH disables all other locations).
 # NOTE(review): the result variable is CMAKE_TOOLCHAIN_FILE itself, which is already set
 # at this point -- confirm find_file really re-resolves it on every supported CMake version.
 find_file(CMAKE_TOOLCHAIN_FILE ${CMAKE_TOOLCHAIN_FILE_NAME} PATHS ${CMAKE_SOURCE_DIR} NO_DEFAULT_PATH)
 message(STATUS "CMAKE_TOOLCHAIN_FILE = ${CMAKE_TOOLCHAIN_FILE}")
 endif()

 # Has to be called before project() so that the policy defaults are in place
 # when the languages and toolchain are set up.
 cmake_minimum_required(VERSION 2.8.10)

 # Default the install location to <build dir>/install unless the user picked one.
 # Kept ahead of project(), as in the original ordering.
 if(NOT DEFINED CMAKE_INSTALL_PREFIX)
 set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/install" CACHE PATH "Installation Directory")
 endif()
 message(STATUS "CMAKE_INSTALL_PREFIX = ${CMAKE_INSTALL_PREFIX}")

 project(ncnn)

 # Build optimized by default, but do not override an explicit choice such as
 # -DCMAKE_BUILD_TYPE=debug or -DCMAKE_BUILD_TYPE=relwithdebinfo.
 if(NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE release)
 endif()

 # Feature switches; each one can be overridden with -D<NAME>=ON|OFF.
 option(NCNN_OPENMP "openmp support" ON)
 option(NCNN_STDIO "load model from external file" ON)
 option(NCNN_STRING "plain and verbose string" ON)
 option(NCNN_OPENCV "minimal opencv structure emulation" OFF)

 # With OpenMP enabled, append the flags reported by find_package(OpenMP)
 # to both the C and the C++ compiler flags.
 if(NCNN_OPENMP)
     find_package(OpenMP)
     foreach(_ncnn_lang C CXX)
         set(CMAKE_${_ncnn_lang}_FLAGS "${CMAKE_${_ncnn_lang}_FLAGS} ${OpenMP_${_ncnn_lang}_FLAGS}")
     endforeach()
 endif()

 # Compiler flags for this directory and everything below it:
 # warnings, position independent code, aggressive optimization and hidden
 # symbol visibility by default.
 # Left disabled on purpose: -march=native, -flto
 add_definitions(
     -Wall -Wextra
     -fPIC
     -Ofast
     -ffast-math
     -fvisibility=hidden -fvisibility-inlines-hidden
 )

 # Android and iOS (Xcode) get the same treatment: shared libraries are
 # disabled, and C++ is compiled without RTTI and without exceptions.
 if(ANDROID OR IOS)
     set_property(GLOBAL PROPERTY TARGET_SUPPORTS_SHARED_LIBS FALSE)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti -fno-exceptions")
 endif()

 ##############################################

 # src/ is always built; examples/ is currently switched off.
 # add_subdirectory(examples)
 add_subdirectory(src)

 # tools/ is left out of Android and iOS builds.
 if(NOT (ANDROID OR IOS))
     add_subdirectory(tools)
 endif()
--- a/Info.plist
+++ b/Info.plist
@@ -0,0 +1,18 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
 <plist version="1.0">
 <dict>
    <key>CFBundleName</key>
    <string>ncnn</string>
    <key>CFBundleIdentifier</key>
    <string>com.tencent.ncnn</string>
    <key>CFBundleVersion</key>
    <string>1.0</string>
    <key>CFBundleShortVersionString</key>
    <string>1.0</string>
    <key>CFBundleSignature</key>
    <string>????</string>
    <key>CFBundlePackageType</key>
    <string>FMWK</string>
 </dict>
 </plist>
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -0,0 +1,86 @@
 Tencent is pleased to support the open source community by making ncnn available.
 Copyright (C) 2017 THL A29 Limited, a Tencent company.  All rights reserved.
 If you have downloaded a copy of the ncnn binary from Tencent, please note that the ncnn binary is licensed under the BSD 3-Clause License.
 If you have downloaded a copy of the ncnn source code from Tencent, please note that ncnn source code is licensed under the BSD 3-Clause License, except for the third-party components listed below which are subject to different license terms.  Your integration of ncnn into your own projects may require compliance with the BSD 3-Clause License, as well as the other licenses applicable to the third-party components included within ncnn.
 A copy of the BSD 3-Clause License is included in this file.

 Other dependencies and licenses:

 Open Source Software Licensed Under the zlib License:
 The below software in this distribution may have been modified by THL A29 Limited (“Tencent Modifications”). All Tencent Modifications are Copyright (C) 2017 THL A29 Limited.
 ----------------------------------------------------------------------------------------
 1. neon_mathfun.h
 Copyright (C) 2011 Julien Pommier

 2. sse_mathfun.h
 Copyright (C) 2007 Julien Pommier

 3. avx_mathfun.h
 Copyright (C) 2012 Giovanni Garberoglio
 Interdisciplinary Laboratory for Computational Science (LISC)
 Fondazione Bruno Kessler and University of Trento
 via Sommarive, 18
 I-38123 Trento (Italy)


 Terms of the zlib License:
 ---------------------------------------------------
 Copyright (c) <year> <copyright holders>

 This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software.

 Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions:

 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.



 Open Source Software Licensed Under the BSD 2-Clause License:
 The below software in this distribution may have been modified by THL A29 Limited (“Tencent Modifications”). All Tencent Modifications are Copyright (C) 2017 THL A29 Limited.
 ----------------------------------------------------------------------------------------
 1. squeezenet  1.1
 Copyright (c) 2016 Forrest N. Iandola and Matthew W. Moskewicz and Khalid Ashraf and Song Han and William J. Dally and Kurt Keutzer
 All rights reserved.

 2. caffe.proto  master
 All contributions by the University of California:
 Copyright (c) 2014-2017 The Regents of the University of California (Regents)
 All rights reserved.

 All other contributions:
 Copyright (c) 2014-2017, the respective contributors
 All rights reserved.


 Terms of the BSD 2-Clause License:
 --------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

 Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.



 Open Source Software Licensed Under the BSD 3-Clause License:
 The below software in this distribution may have been modified by THL A29 Limited (“Tencent Modifications”). All Tencent Modifications are Copyright (C) 2017 THL A29 Limited.
 ----------------------------------------------------------------------------------------
 1. android.toolchain.cmake  master
 Copyright (c) 2010-2011, Ethan Rublee
 Copyright (c) 2011-2014, Andrey Kamaev
 All rights reserved.


 Terms of the BSD 3-Clause License:
 --------------------------------------------------------------------

 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

 Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 Neither the name of [copyright holder] nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/README.md
+++ b/README.md
@@ -0,0 +1,44 @@
 # ncnn

 ---

 ncnn 是一个为手机端极致优化的高性能神经网络前向计算框架。ncnn 从设计之初深刻考虑手机端的部署和使用。无第三方依赖，跨平台，手机端 cpu 的速度快于目前所有已知的开源框架。基于 ncnn，开发者能够将深度学习算法轻松移植到手机端高效执行，开发出人工智能 APP，将 AI 带到你的指尖。ncnn 目前已在腾讯多款应用中使用，如 QQ，Qzone，微信，天天P图等。

 ncnn is a high-performance neural network inference framework optimized for mobile platforms. It was designed from the outset with deployment and use on mobile phones in mind. ncnn has no third-party dependencies, is cross-platform, and runs faster than all known open-source frameworks on mobile phone CPUs. Using the efficient ncnn implementation, developers can easily deploy deep learning models to mobile platforms, create intelligent apps, and bring artificial intelligence to your fingertips. ncnn is currently used in many Tencent applications, such as QQ, Qzone, WeChat, Pitu and so on.

 ---

 ### 功能概述

 * 支持卷积神经网络，支持多输入和多分支结构，可计算部分分支
 * 无任何第三方库依赖，不依赖 BLAS/NNPACK 等计算框架
 * 纯 C++ 实现，跨平台，支持 android ios 等
 * ARM NEON 汇编级良心优化，计算速度极快
 * 精细的内存管理和数据结构设计，内存占用极低
 * 支持多核并行计算加速，ARM big.LITTLE cpu 调度优化
 * 整体库体积小于 500K，并可轻松精简到小于 300K
 * 可扩展的模型设计，支持 8bit 量化和半精度浮点存储，可导入 caffe 模型
 * 支持直接内存零拷贝引用加载网络模型
 * 可注册自定义层实现并扩展
 * 恩，很强就是了，不怕被塞卷 QvQ

 ### Features

 * Supports convolutional neural networks, including multi-input and multi-branch structures; can compute only part of the branches
 * No third-party library dependencies; does not rely on BLAS/NNPACK or other computing frameworks
 * Pure C++ implementation, cross-platform, supports Android, iOS and so on
 * Carefully optimized ARM NEON assembly; extremely fast computation
 * Sophisticated memory management and data structure design; very low memory footprint
 * Supports multi-core parallel computing acceleration and ARM big.LITTLE CPU scheduling optimization
 * The overall library size is less than 500K, and can easily be reduced to less than 300K
 * Extensible model design; supports 8-bit quantization and half-precision floating-point storage; can import Caffe models
 * Supports loading network models by direct zero-copy memory reference
 * Custom layer implementations can be registered and extended
 * Well, it is strong, not afraid of being stuffed with 卷   QvQ

 ---

 ### License

 BSD 3 Clause

--- a/android.toolchain.cmake
+++ b/android.toolchain.cmake
--- a/build.sh
+++ b/build.sh
@@ -0,0 +1,33 @@
 #!/usr/bin/env bash
 #
 # Configure, build and install ncnn for every supported mobile target.
 # Each target uses its own build directory below the current directory.
 # A target that fails is reported on stderr and the remaining targets are
 # still attempted; the script exits non-zero if any target failed.

 # build_target <build-dir> [cmake-args...]
 #   Creates <build-dir> and runs cmake (with the given arguments), make and
 #   make install inside it. As soon as one step fails the later steps are
 #   skipped and a non-zero status is returned.
 build_target()
 {
     local build_dir="$1"
     shift

     mkdir -p "$build_dir" || return 1

     # Run in a subshell so the directory change never leaks into the next target.
     (
         cd "$build_dir" &&
         cmake "$@" .. &&
         make &&
         make install
     )
 }

 status=0

 ##### android armv7
 build_target build-android-armv7 \
     -DCMAKE_TOOLCHAIN_FILE=../android.toolchain.cmake \
     -DANDROID_ABI="armeabi-v7a with NEON" \
     -DANDROID_NATIVE_API_LEVEL=android-9 \
     -DANDROID_FORCE_ARM_BUILD=OFF \
     -DANDROID_STL_FORCE_FEATURES=OFF \
     || { echo "build-android-armv7 failed" >&2; status=1; }

 ##### android aarch64
 build_target build-android-aarch64 \
     -DCMAKE_TOOLCHAIN_FILE=../android.toolchain.cmake \
     -DANDROID_ABI="arm64-v8a" \
     -DANDROID_NATIVE_API_LEVEL=android-21 \
     -DANDROID_FORCE_ARM_BUILD=OFF \
     -DANDROID_STL_FORCE_FEATURES=OFF \
     || { echo "build-android-aarch64 failed" >&2; status=1; }

 ##### ios armv7 arm64
 build_target build-ios \
     -DCMAKE_TOOLCHAIN_FILE=../iosxc.toolchain.cmake \
     || { echo "build-ios failed" >&2; status=1; }

 ##### ios simulator i386 x86_64
 build_target build-ios-sim \
     -DCMAKE_TOOLCHAIN_FILE=../iossimxc.toolchain.cmake \
     || { echo "build-ios-sim failed" >&2; status=1; }

 exit "$status"
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -0,0 +1,9 @@

 # The example needs OpenCV (core, highgui and imgproc components).
 find_package(OpenCV REQUIRED core highgui imgproc)

 # ncnn headers: the src directory in the source tree, then its counterpart
 # in the build tree.
 include_directories(
     ${CMAKE_CURRENT_SOURCE_DIR}/../src
     ${CMAKE_CURRENT_BINARY_DIR}/../src
 )

 # squeezenet example program, linked against ncnn and OpenCV
 add_executable(squeezenet squeezenet.cpp)
 target_link_libraries(squeezenet ncnn ${OpenCV_LIBS})
--- a/examples/squeezencnn/AndroidManifest.xml
+++ b/examples/squeezencnn/AndroidManifest.xml
@@ -0,0 +1,15 @@
 <?xml version="1.0" encoding="utf-8"?>
 <manifest xmlns:android="http://schemas.android.com/apk/res/android"
      package="com.tencent.squeezencnn"
      android:versionCode="1"
      android:versionName="1.1">
    <application android:label="@string/app_name" >
        <activity android:name="MainActivity"
                  android:label="@string/app_name">
            <intent-filter>
                <action android:name="android.intent.action.MAIN" />
                <category android:name="android.intent.category.LAUNCHER" />
            </intent-filter>
        </activity>
    </application>
 </manifest> 
--- a/examples/squeezencnn/ant.properties
+++ b/examples/squeezencnn/ant.properties
@@ -0,0 +1,21 @@
 # This file is used to override default values used by the Ant build system.
 #
 # This file must be checked into Version Control Systems, as it is
 # integral to the build system of your project.

 # This file is only used by the Ant script.

 # You can use this to override default values such as
 #  'source.dir' for the location of your java source folder and
 #  'out.dir' for the location of your output folder.

 # You can also use it define how the release builds are signed by declaring
 # the following properties:
 #  'key.store' for the location of your keystore and
 #  'key.alias' for the name of the key to use.
 # The password will be asked during the build when you use the 'release' target.

 # NOTE(review): the keystore path below points into one developer's home directory,
 # and the keystore/alias passwords are stored in plain text in a file that is meant
 # to be checked in. The header above states that the password is asked for during a
 # 'release' build, so the two password entries can probably be dropped -- TODO confirm
 # with whoever owns the release key.
 key.store=/home/nihui/osd/nihuini-release-key.keystore
 key.alias=nihuini
 key.store.password=nihuini
 key.alias.password=nihuini
--- a/examples/squeezencnn/assets/squeezenet_v1.1.bin
+++ b/examples/squeezencnn/assets/squeezenet_v1.1.bin
@@ -0,0 +1 @@
 ../../squeezenet_v1.1.bin
--- a/examples/squeezencnn/assets/squeezenet_v1.1.param.bin
+++ b/examples/squeezencnn/assets/squeezenet_v1.1.param.bin
--- a/examples/squeezencnn/assets/synset_words.txt
+++ b/examples/squeezencnn/assets/synset_words.txt
@@ -0,0 +1 @@
 ../../synset_words.txt
--- a/examples/squeezencnn/build.xml
+++ b/examples/squeezencnn/build.xml
@@ -0,0 +1,92 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project name="squeezencnn" default="help">

    <!-- The local.properties file is created and updated by the 'android' tool.
         It contains the path to the SDK. It should *NOT* be checked into
         Version Control Systems. -->
    <property file="local.properties" />

    <!-- The ant.properties file can be created by you. It is only edited by the
         'android' tool to add properties to it.
         This is the place to change some Ant specific build properties.
         Here are some properties you may want to change/update:

         source.dir
             The name of the source directory. Default is 'src'.
         out.dir
             The name of the output directory. Default is 'bin'.

         For other overridable properties, look at the beginning of the rules
         files in the SDK, at tools/ant/build.xml

         Properties related to the SDK location or the project target should
         be updated using the 'android' tool with the 'update' action.

         This file is an integral part of the build system for your
         application and should be checked into Version Control Systems.

         -->
    <property file="ant.properties" />

    <!-- if sdk.dir was not set from one of the property file, then
         get it from the ANDROID_HOME env var.
         This must be done before we load project.properties since
         the proguard config can use sdk.dir -->
    <property environment="env" />
    <condition property="sdk.dir" value="${env.ANDROID_HOME}">
        <isset property="env.ANDROID_HOME" />
    </condition>

    <!-- The project.properties file is created and updated by the 'android'
         tool, as well as ADT.

         This contains project specific properties such as project target, and library
         dependencies. Lower level build properties are stored in ant.properties
         (or in .classpath for Eclipse projects).

         This file is an integral part of the build system for your
         application and should be checked into Version Control Systems. -->
    <loadproperties srcFile="project.properties" />

    <!-- quick check on sdk.dir -->
    <fail
            message="sdk.dir is missing. Make sure to generate local.properties using 'android update project' or to inject it through the ANDROID_HOME environment variable."
            unless="sdk.dir"
    />

    <!--
        Import per project custom build rules if present at the root of the project.
        This is the place to put custom intermediary targets such as:
            -pre-build
            -pre-compile
            -post-compile (This is typically used for code obfuscation.
                           Compiled code location: ${out.classes.absolute.dir}
                           If this is not done in place, override ${out.dex.input.absolute.dir})
            -post-package
            -post-build
            -pre-clean
    -->
    <import file="custom_rules.xml" optional="true" />

    <!-- Import the actual build file.

         To customize existing targets, there are two options:
         - Customize only one target:
             - copy/paste the target into this file, *before* the
               <import> task.
             - customize it to your needs.
         - Customize the whole content of build.xml
             - copy/paste the content of the rules files (minus the top node)
               into this file, replacing the <import> task.
             - customize to your needs.

         ***********************
         ****** IMPORTANT ******
         ***********************
         In all cases you must update the value of version-tag below to read 'custom' instead of an integer,
         in order to avoid having your file be overridden by tools such as "android update project"
    -->
    <!-- version-tag: 1 -->
    <import file="${sdk.dir}/tools/ant/build.xml" />

 </project>
--- a/examples/squeezencnn/jni/Android.mk
+++ b/examples/squeezencnn/jni/Android.mk
@@ -0,0 +1,30 @@
 LOCAL_PATH := $(call my-dir)

 # Root of the installed ncnn package; lib/libncnn.a and include/ are taken from it.
 # The value is only a default: set NCNN_INSTALL_PATH in the environment or on the
 # ndk-build command line to use your own location without editing this file.
 NCNN_INSTALL_PATH ?= /home/nihui/dev/qqfacecnn/ncnn/build-android-armv7/install

 # --- prebuilt static ncnn library -------------------------------------------
 include $(CLEAR_VARS)
 LOCAL_MODULE := ncnn
 LOCAL_SRC_FILES := $(NCNN_INSTALL_PATH)/lib/libncnn.a
 include $(PREBUILT_STATIC_LIBRARY)

 # --- squeezencnn JNI shared library -----------------------------------------
 include $(CLEAR_VARS)

 LOCAL_MODULE := squeezencnn
 LOCAL_SRC_FILES := squeezencnn_jni.cpp

 LOCAL_C_INCLUDES := $(NCNN_INSTALL_PATH)/include

 LOCAL_STATIC_LIBRARIES := ncnn

 # size/speed oriented flags; unreferenced sections are dropped at link time
 LOCAL_CFLAGS := -O2 -fvisibility=hidden -fomit-frame-pointer -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math
 LOCAL_CPPFLAGS := -O2 -fvisibility=hidden -fvisibility-inlines-hidden -fomit-frame-pointer -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math
 LOCAL_LDFLAGS += -Wl,--gc-sections

 # OpenMP has to be enabled for compiling and for linking
 LOCAL_CFLAGS += -fopenmp
 LOCAL_CPPFLAGS += -fopenmp
 LOCAL_LDFLAGS += -fopenmp

 LOCAL_LDLIBS := -lz -llog -ljnigraphics

 include $(BUILD_SHARED_LIBRARY)
--- a/examples/squeezencnn/jni/Application.mk
+++ b/examples/squeezencnn/jni/Application.mk
@@ -0,0 +1,7 @@

 # Application-wide ndk-build settings for the squeezencnn example.

 # C++ runtime: gnustl_static is selected; stlport_static is the disabled alternative.
 # APP_STL := stlport_static
 APP_STL := gnustl_static
 # Target ABI: only armeabi-v7a is built; the line adding armeabi is disabled.
 # APP_ABI := armeabi armeabi-v7a
 APP_ABI := armeabi-v7a
 # Platform level and toolchain version used for the build.
 APP_PLATFORM := android-9
 NDK_TOOLCHAIN_VERSION := 4.9
--- a/examples/squeezencnn/jni/squeezencnn_jni.cpp
+++ b/examples/squeezencnn/jni/squeezencnn_jni.cpp
@@ -0,0 +1,181 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include <android/bitmap.h>
 #include <android/log.h>

 #include <jni.h>

 #include <string>
 #include <vector>

 // ncnn
 #include "net.h"

 #include "squeezenet_v1.1.id.h"

 #include <sys/time.h>
 #include <unistd.h>

 // Timestamp recorded by the most recent bench_start() call.
 static struct timeval tv_begin;

 // Marks the beginning of a timed section by storing the current time.
 static void bench_start()
 {
     gettimeofday(&tv_begin, NULL);
 }

 // Logs the time elapsed since the last bench_start(), in milliseconds, followed
 // by `comment`. The message goes to the Android log at debug level under the
 // "SqueezeNcnn" tag.
 static void bench_end(const char* comment)
 {
     struct timeval tv_end;
     gettimeofday(&tv_end, NULL);

     // All arithmetic is done in double; float constants would limit the result
     // to float precision.
     const double elapsed_ms = (tv_end.tv_sec - tv_begin.tv_sec) * 1000.0
                             + (tv_end.tv_usec - tv_begin.tv_usec) / 1000.0;

     __android_log_print(ANDROID_LOG_DEBUG, "SqueezeNcnn", "%.2fms   %s", elapsed_ms, comment);
 }

 // State shared by the JNI entry points; all of it is filled in by Init().
 // squeezenet_param / squeezenet_bin hold the raw bytes handed to load_param() /
 // load_model(). NOTE(review): presumably kept at file scope because the net keeps
 // referencing these buffers after loading -- confirm against the ncnn API.
 static std::vector<unsigned char> squeezenet_param;
 static std::vector<unsigned char> squeezenet_bin;
 // one entry per line of the words file passed to Init()
 static std::vector<std::string> squeezenet_words;
 static ncnn::Net squeezenet;

 // Splits `str` at every occurrence of `delimiter` and returns the pieces in order.
 //
 // Empty pieces are kept (leading, trailing or adjacent delimiters), so the result
 // always contains at least one element. The delimiter may be longer than one
 // character. An empty delimiter never matches: the whole string is returned as
 // the single piece.
 static std::vector<std::string> split_string(const std::string& str, const std::string& delimiter)
 {
     std::vector<std::string> pieces;

     if (delimiter.empty())
     {
         pieces.push_back(str);
         return pieces;
     }

     std::string::size_type start = 0;
     for (;;)
     {
         const std::string::size_type hit = str.find(delimiter, start);
         if (hit == std::string::npos)
             break;

         pieces.push_back(str.substr(start, hit - start));
         // continue after the whole delimiter, not just after its first character
         start = hit + delimiter.size();
     }

     // the remainder after the last delimiter (or everything, if none was found)
     pieces.push_back(str.substr(start));

     return pieces;
 }

 extern "C" {

 // public native boolean Init(byte[] param, byte[] bin, byte[] words);
 //
 // Copies the three byte arrays into file-scope storage, loads the network
 // parameters and weights from those copies, and splits the words data into one
 // label per line. Returns JNI_FALSE when any array is null, JNI_TRUE otherwise.
 // The values returned by load_param()/load_model() are only logged.
 JNIEXPORT jboolean JNICALL Java_com_tencent_squeezencnn_SqueezeNcnn_Init(JNIEnv* env, jobject thiz, jbyteArray param, jbyteArray bin, jbyteArray words)
 {
     if (!param || !bin || !words)
         return JNI_FALSE;

     // init param
     {
         int len = env->GetArrayLength(param);
         squeezenet_param.resize(len);
         env->GetByteArrayRegion(param, 0, len, (jbyte*)squeezenet_param.data());
         int ret = squeezenet.load_param(squeezenet_param.data());
         __android_log_print(ANDROID_LOG_DEBUG, "SqueezeNcnn", "load_param %d %d", ret, len);
     }

     // init bin
     {
         int len = env->GetArrayLength(bin);
         squeezenet_bin.resize(len);
         env->GetByteArrayRegion(bin, 0, len, (jbyte*)squeezenet_bin.data());
         int ret = squeezenet.load_model(squeezenet_bin.data());
         __android_log_print(ANDROID_LOG_DEBUG, "SqueezeNcnn", "load_model %d %d", ret, len);
     }

     // init words
     {
         int len = env->GetArrayLength(words);
         std::string words_buffer;
         words_buffer.resize(len);
         if (len > 0)
         {
             // fill through operator[]: the pointer from std::string::data() is
             // read-only before C++17 and must not be written through
             env->GetByteArrayRegion(words, 0, len, (jbyte*)&words_buffer[0]);
         }
         squeezenet_words = split_string(words_buffer, "\n");
     }

     return JNI_TRUE;
 }

 // public native String Detect(Bitmap bitmap);
 //
 // Runs the loaded squeezenet on a 227x227 RGBA_8888 bitmap and returns
 // "<label> = <score>" for the highest scoring class, where <label> is the
 // matching line of the words table without its leading 10-character id prefix.
 // Returns NULL when the bitmap info or pixels cannot be obtained, or when the
 // bitmap has a different size or pixel format.
 JNIEXPORT jstring JNICALL Java_com_tencent_squeezencnn_SqueezeNcnn_Detect(JNIEnv* env, jobject thiz, jobject bitmap)
 {
     bench_start();

     // ncnn from bitmap
     ncnn::Mat in;
     {
         AndroidBitmapInfo info;
         // negative return values signal an error; info would be unusable then
         if (AndroidBitmap_getInfo(env, bitmap, &info) < 0)
             return NULL;

         int width = info.width;
         int height = info.height;
         if (width != 227 || height != 227)
             return NULL;
         if (info.format != ANDROID_BITMAP_FORMAT_RGBA_8888)
             return NULL;

         void* indata = NULL;
         if (AndroidBitmap_lockPixels(env, bitmap, &indata) < 0 || !indata)
             return NULL;

         in = ncnn::Mat::from_pixels((const unsigned char*)indata, ncnn::Mat::PIXEL_RGBA2BGR, width, height);

         AndroidBitmap_unlockPixels(env, bitmap);
     }

     // squeezenet
     std::vector<float> cls_scores;
     {
         const float mean_vals[3] = {104.f, 117.f, 123.f};
         in.substract_mean_normalize(mean_vals, 0);

         ncnn::Extractor ex = squeezenet.create_extractor();
         ex.set_light_mode(true);
         ex.set_num_threads(4);

         ex.input(squeezenet_v1_1_param_id::BLOB_data, in);

         ncnn::Mat out;
         ex.extract(squeezenet_v1_1_param_id::BLOB_prob, out);

         // one score per output channel: the first value of each channel
         cls_scores.resize(out.c);
         for (int j=0; j<out.c; j++)
         {
             const float* prob = out.data + out.cstep * j;
             cls_scores[j] = prob[0];
         }
     }

     // pick the top class (class 0 with score 0 when nothing scores above 0)
     int top_class = 0;
     float max_score = 0.f;
     for (size_t i=0; i<cls_scores.size(); i++)
     {
         float s = cls_scores[i];
         if (s > max_score)
         {
             top_class = (int)i;
             max_score = s;
         }
     }

     // Look up the label; the table may hold fewer entries than there are classes.
     std::string label;
     if ((size_t)top_class < squeezenet_words.size())
     {
         const std::string& word = squeezenet_words[top_class];
         // skip the leading id such as "n03179701 " (10 characters) when the
         // line is long enough to contain it
         label = word.size() >= 10 ? word.substr(10) : word;
     }

     char tmp[32];
     snprintf(tmp, sizeof(tmp), "%.3f", max_score);
     std::string result_str = label + " = " + tmp;

     jstring result = env->NewStringUTF(result_str.c_str());

     bench_end("detect");

     return result;
 }

 }
--- a/examples/squeezencnn/jni/squeezenet_v1.1.id.h
+++ b/examples/squeezencnn/jni/squeezenet_v1.1.id.h
@@ -0,0 +1,163 @@
 // Layer (LAYER_*) and blob (BLOB_*) index constants for the squeezenet v1.1 network.
 // squeezencnn_jni.cpp passes BLOB_data / BLOB_prob from this namespace to the
 // extractor's input() / extract() calls.
 // NOTE(review): this table looks auto-generated together with
 // squeezenet_v1.1.param.bin -- regenerate it rather than editing by hand; TODO confirm
 // which tool produces it.
 #ifndef NCNN_INCLUDE_GUARD_squeezenet_v1_1_id_h
 #define NCNN_INCLUDE_GUARD_squeezenet_v1_1_id_h
 namespace squeezenet_v1_1_param_id {
 const int LAYER_data = 0;
 const int BLOB_data = 0;
 const int LAYER_conv1 = 1;
 const int BLOB_conv1 = 1;
 const int LAYER_relu_conv1 = 2;
 const int BLOB_conv1_relu_conv1 = 2;
 const int LAYER_pool1 = 3;
 const int BLOB_pool1 = 3;
 const int LAYER_fire2_squeeze1x1 = 4;
 const int BLOB_fire2_squeeze1x1 = 4;
 const int LAYER_fire2_relu_squeeze1x1 = 5;
 const int BLOB_fire2_squeeze1x1_fire2_relu_squeeze1x1 = 5;
 const int LAYER_splitncnn_0 = 6;
 const int BLOB_fire2_squeeze1x1_fire2_relu_squeeze1x1_splitncnn_0 = 6;
 const int BLOB_fire2_squeeze1x1_fire2_relu_squeeze1x1_splitncnn_1 = 7;
 const int LAYER_fire2_expand1x1 = 7;
 const int BLOB_fire2_expand1x1 = 8;
 const int LAYER_fire2_relu_expand1x1 = 8;
 const int BLOB_fire2_expand1x1_fire2_relu_expand1x1 = 9;
 const int LAYER_fire2_expand3x3 = 9;
 const int BLOB_fire2_expand3x3 = 10;
 const int LAYER_fire2_relu_expand3x3 = 10;
 const int BLOB_fire2_expand3x3_fire2_relu_expand3x3 = 11;
 const int LAYER_fire2_concat = 11;
 const int BLOB_fire2_concat = 12;
 const int LAYER_fire3_squeeze1x1 = 12;
 const int BLOB_fire3_squeeze1x1 = 13;
 const int LAYER_fire3_relu_squeeze1x1 = 13;
 const int BLOB_fire3_squeeze1x1_fire3_relu_squeeze1x1 = 14;
 const int LAYER_splitncnn_1 = 14;
 const int BLOB_fire3_squeeze1x1_fire3_relu_squeeze1x1_splitncnn_0 = 15;
 const int BLOB_fire3_squeeze1x1_fire3_relu_squeeze1x1_splitncnn_1 = 16;
 const int LAYER_fire3_expand1x1 = 15;
 const int BLOB_fire3_expand1x1 = 17;
 const int LAYER_fire3_relu_expand1x1 = 16;
 const int BLOB_fire3_expand1x1_fire3_relu_expand1x1 = 18;
 const int LAYER_fire3_expand3x3 = 17;
 const int BLOB_fire3_expand3x3 = 19;
 const int LAYER_fire3_relu_expand3x3 = 18;
 const int BLOB_fire3_expand3x3_fire3_relu_expand3x3 = 20;
 const int LAYER_fire3_concat = 19;
 const int BLOB_fire3_concat = 21;
 const int LAYER_pool3 = 20;
 const int BLOB_pool3 = 22;
 const int LAYER_fire4_squeeze1x1 = 21;
 const int BLOB_fire4_squeeze1x1 = 23;
 const int LAYER_fire4_relu_squeeze1x1 = 22;
 const int BLOB_fire4_squeeze1x1_fire4_relu_squeeze1x1 = 24;
 const int LAYER_splitncnn_2 = 23;
 const int BLOB_fire4_squeeze1x1_fire4_relu_squeeze1x1_splitncnn_0 = 25;
 const int BLOB_fire4_squeeze1x1_fire4_relu_squeeze1x1_splitncnn_1 = 26;
 const int LAYER_fire4_expand1x1 = 24;
 const int BLOB_fire4_expand1x1 = 27;
 const int LAYER_fire4_relu_expand1x1 = 25;
 const int BLOB_fire4_expand1x1_fire4_relu_expand1x1 = 28;
 const int LAYER_fire4_expand3x3 = 26;
 const int BLOB_fire4_expand3x3 = 29;
 const int LAYER_fire4_relu_expand3x3 = 27;
 const int BLOB_fire4_expand3x3_fire4_relu_expand3x3 = 30;
 const int LAYER_fire4_concat = 28;
 const int BLOB_fire4_concat = 31;
 const int LAYER_fire5_squeeze1x1 = 29;
 const int BLOB_fire5_squeeze1x1 = 32;
 const int LAYER_fire5_relu_squeeze1x1 = 30;
 const int BLOB_fire5_squeeze1x1_fire5_relu_squeeze1x1 = 33;
 const int LAYER_splitncnn_3 = 31;
 const int BLOB_fire5_squeeze1x1_fire5_relu_squeeze1x1_splitncnn_0 = 34;
 const int BLOB_fire5_squeeze1x1_fire5_relu_squeeze1x1_splitncnn_1 = 35;
 const int LAYER_fire5_expand1x1 = 32;
 const int BLOB_fire5_expand1x1 = 36;
 const int LAYER_fire5_relu_expand1x1 = 33;
 const int BLOB_fire5_expand1x1_fire5_relu_expand1x1 = 37;
 const int LAYER_fire5_expand3x3 = 34;
 const int BLOB_fire5_expand3x3 = 38;
 const int LAYER_fire5_relu_expand3x3 = 35;
 const int BLOB_fire5_expand3x3_fire5_relu_expand3x3 = 39;
 const int LAYER_fire5_concat = 36;
 const int BLOB_fire5_concat = 40;
 const int LAYER_pool5 = 37;
 const int BLOB_pool5 = 41;
 const int LAYER_fire6_squeeze1x1 = 38;
 const int BLOB_fire6_squeeze1x1 = 42;
 const int LAYER_fire6_relu_squeeze1x1 = 39;
 const int BLOB_fire6_squeeze1x1_fire6_relu_squeeze1x1 = 43;
 const int LAYER_splitncnn_4 = 40;
 const int BLOB_fire6_squeeze1x1_fire6_relu_squeeze1x1_splitncnn_0 = 44;
 const int BLOB_fire6_squeeze1x1_fire6_relu_squeeze1x1_splitncnn_1 = 45;
 const int LAYER_fire6_expand1x1 = 41;
 const int BLOB_fire6_expand1x1 = 46;
 const int LAYER_fire6_relu_expand1x1 = 42;
 const int BLOB_fire6_expand1x1_fire6_relu_expand1x1 = 47;
 const int LAYER_fire6_expand3x3 = 43;
 const int BLOB_fire6_expand3x3 = 48;
 const int LAYER_fire6_relu_expand3x3 = 44;
 const int BLOB_fire6_expand3x3_fire6_relu_expand3x3 = 49;
 const int LAYER_fire6_concat = 45;
 const int BLOB_fire6_concat = 50;
 const int LAYER_fire7_squeeze1x1 = 46;
 const int BLOB_fire7_squeeze1x1 = 51;
 const int LAYER_fire7_relu_squeeze1x1 = 47;
 const int BLOB_fire7_squeeze1x1_fire7_relu_squeeze1x1 = 52;
 const int LAYER_splitncnn_5 = 48;
 const int BLOB_fire7_squeeze1x1_fire7_relu_squeeze1x1_splitncnn_0 = 53;
 const int BLOB_fire7_squeeze1x1_fire7_relu_squeeze1x1_splitncnn_1 = 54;
 const int LAYER_fire7_expand1x1 = 49;
 const int BLOB_fire7_expand1x1 = 55;
 const int LAYER_fire7_relu_expand1x1 = 50;
 const int BLOB_fire7_expand1x1_fire7_relu_expand1x1 = 56;
 const int LAYER_fire7_expand3x3 = 51;
 const int BLOB_fire7_expand3x3 = 57;
 const int LAYER_fire7_relu_expand3x3 = 52;
 const int BLOB_fire7_expand3x3_fire7_relu_expand3x3 = 58;
 const int LAYER_fire7_concat = 53;
 const int BLOB_fire7_concat = 59;
 const int LAYER_fire8_squeeze1x1 = 54;
 const int BLOB_fire8_squeeze1x1 = 60;
 const int LAYER_fire8_relu_squeeze1x1 = 55;
 const int BLOB_fire8_squeeze1x1_fire8_relu_squeeze1x1 = 61;
 const int LAYER_splitncnn_6 = 56;
 const int BLOB_fire8_squeeze1x1_fire8_relu_squeeze1x1_splitncnn_0 = 62;
 const int BLOB_fire8_squeeze1x1_fire8_relu_squeeze1x1_splitncnn_1 = 63;
 const int LAYER_fire8_expand1x1 = 57;
 const int BLOB_fire8_expand1x1 = 64;
 const int LAYER_fire8_relu_expand1x1 = 58;
 const int BLOB_fire8_expand1x1_fire8_relu_expand1x1 = 65;
 const int LAYER_fire8_expand3x3 = 59;
 const int BLOB_fire8_expand3x3 = 66;
 const int LAYER_fire8_relu_expand3x3 = 60;
 const int BLOB_fire8_expand3x3_fire8_relu_expand3x3 = 67;
 const int LAYER_fire8_concat = 61;
 const int BLOB_fire8_concat = 68;
 const int LAYER_fire9_squeeze1x1 = 62;
 const int BLOB_fire9_squeeze1x1 = 69;
 const int LAYER_fire9_relu_squeeze1x1 = 63;
 const int BLOB_fire9_squeeze1x1_fire9_relu_squeeze1x1 = 70;
 const int LAYER_splitncnn_7 = 64;
 const int BLOB_fire9_squeeze1x1_fire9_relu_squeeze1x1_splitncnn_0 = 71;
 const int BLOB_fire9_squeeze1x1_fire9_relu_squeeze1x1_splitncnn_1 = 72;
 const int LAYER_fire9_expand1x1 = 65;
 const int BLOB_fire9_expand1x1 = 73;
 const int LAYER_fire9_relu_expand1x1 = 66;
 const int BLOB_fire9_expand1x1_fire9_relu_expand1x1 = 74;
 const int LAYER_fire9_expand3x3 = 67;
 const int BLOB_fire9_expand3x3 = 75;
 const int LAYER_fire9_relu_expand3x3 = 68;
 const int BLOB_fire9_expand3x3_fire9_relu_expand3x3 = 76;
 const int LAYER_fire9_concat = 69;
 const int BLOB_fire9_concat = 77;
 const int LAYER_drop9 = 70;
 const int BLOB_fire9_concat_drop9 = 78;
 const int LAYER_conv10 = 71;
 const int BLOB_conv10 = 79;
 const int LAYER_relu_conv10 = 72;
 const int BLOB_conv10_relu_conv10 = 80;
 const int LAYER_pool10 = 73;
 const int BLOB_pool10 = 81;
 const int LAYER_prob = 74;
 const int BLOB_prob = 82;
 } // namespace squeezenet_v1_1_param_id
 #endif // NCNN_INCLUDE_GUARD_squeezenet_v1_1_id_h
--- a/examples/squeezencnn/local.properties
+++ b/examples/squeezencnn/local.properties
@@ -0,0 +1,10 @@
 # This file is automatically generated by Android Tools.
 # Do not modify this file -- YOUR CHANGES WILL BE ERASED!
 #
 # This file must *NOT* be checked into Version Control Systems,
 # as it contains information specific to your local configuration.

 # location of the SDK. This is only used by Ant
 # For customization when using a Version Control System, please read the
 # header note.
 sdk.dir=/home/nihui/osd/android-sdk-linux
--- a/examples/squeezencnn/proguard-project.txt
+++ b/examples/squeezencnn/proguard-project.txt
@@ -0,0 +1,20 @@
 # To enable ProGuard in your project, edit project.properties
 # to define the proguard.config property as described in that file.
 #
 # Add project specific ProGuard rules here.
 # By default, the flags in this file are appended to flags specified
 # in ${sdk.dir}/tools/proguard/proguard-android.txt
 # You can edit the include path and order by changing the ProGuard
 # include property in project.properties.
 #
 # For more details, see
 #   http://developer.android.com/guide/developing/tools/proguard.html

 # Add any project specific keep options here:

 # If your project uses WebView with JS, uncomment the following
 # and specify the fully qualified class name to the JavaScript interface
 # class:
 #-keepclassmembers class fqcn.of.javascript.interface.for.webview {
 #   public *;
 #}
--- a/examples/squeezencnn/project.properties
+++ b/examples/squeezencnn/project.properties
@@ -0,0 +1,14 @@
 # This file is automatically generated by Android Tools.
 # Do not modify this file -- YOUR CHANGES WILL BE ERASED!
 #
 # This file must be checked in Version Control Systems.
 #
 # To customize properties used by the Ant build system edit
 # "ant.properties", and override values to adapt the script to your
 # project structure.
 #
 # To enable ProGuard to shrink and obfuscate your code, uncomment this (available properties: sdk.dir, user.home):
 #proguard.config=${sdk.dir}/tools/proguard/proguard-android.txt:proguard-project.txt

 # Project target.
 target=android-9
--- a/examples/squeezencnn/res/layout/main.xml
+++ b/examples/squeezencnn/res/layout/main.xml
@@ -0,0 +1,36 @@
 <?xml version="1.0" encoding="utf-8"?>
 <!--
    Main screen layout: a button row on top, a text line for the
    classification result, and the selected image filling the rest.
    Uses match_parent, the API 8+ name for the deprecated fill_parent
    (both denote the same layout constant).
 -->
 <LinearLayout xmlns:android="http://schemas.android.com/apk/res/android"
    android:orientation="vertical"
    android:layout_width="match_parent"
    android:layout_height="match_parent">

    <!-- Button row: pick an image, then run recognition on it. -->
    <LinearLayout
        android:orientation="horizontal"
        android:layout_width="match_parent"
        android:layout_height="wrap_content">

        <Button
            android:id="@+id/buttonImage"
            android:layout_width="wrap_content"
            android:layout_height="wrap_content"
            android:text="选图" />
        <Button
            android:id="@+id/buttonDetect"
            android:layout_width="wrap_content"
            android:layout_height="wrap_content"
            android:text="识别" />
    </LinearLayout>

    <!-- Receives the recognition result text (or a failure message). -->
    <TextView
        android:id="@+id/infoResult"
        android:layout_width="match_parent"
        android:layout_height="wrap_content"
        android:text="" />

    <!-- Shows the currently selected image; the weight lets it take the remaining height. -->
    <ImageView
        android:id="@+id/imageView"
        android:layout_width="match_parent"
        android:layout_height="match_parent"
        android:layout_weight="1" />

 </LinearLayout>
--- a/examples/squeezencnn/res/values/strings.xml
+++ b/examples/squeezencnn/res/values/strings.xml
@@ -0,0 +1,4 @@
 <?xml version="1.0" encoding="utf-8"?>
 <resources>
    <string name="app_name">squeezencnn</string>
 </resources>
--- a/examples/squeezencnn/src/com/tencent/squeezencnn/MainActivity.java
+++ b/examples/squeezencnn/src/com/tencent/squeezencnn/MainActivity.java
@@ -0,0 +1,189 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 package com.tencent.squeezencnn;

 import android.app.Activity;
 import android.os.Bundle;

 import android.content.Context;
 import android.content.Intent;
 import android.database.Cursor;
 import android.graphics.Bitmap;
 import android.graphics.BitmapFactory;
 import android.net.Uri;
 import android.provider.MediaStore;
 import android.util.Log;
 import android.view.View;
 import android.widget.Button;
 import android.widget.ImageView;
 import android.widget.TextView;

 import java.io.FileNotFoundException;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;

 import com.tencent.squeezencnn.SqueezeNcnn;

 public class MainActivity extends Activity
 {
    private static final int SELECT_IMAGE = 1;

    private TextView infoResult;
    private ImageView imageView;
    private Bitmap yourSelectedImage = null;

    private SqueezeNcnn squeezencnn = new SqueezeNcnn();

    /** Called when the activity is first created. */
    @Override
    public void onCreate(Bundle savedInstanceState)
    {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.main);

        try
        {
            initSqueezeNcnn();
        }
        catch (IOException e)
        {
            Log.e("MainActivity", "initSqueezeNcnn error");
        }

        infoResult = (TextView) findViewById(R.id.infoResult);
        imageView = (ImageView) findViewById(R.id.imageView);

        Button buttonImage = (Button) findViewById(R.id.buttonImage);
        buttonImage.setOnClickListener(new View.OnClickListener() {
            @Override
            public void onClick(View arg0) {
                Intent i = new Intent(Intent.ACTION_PICK);
                i.setType("image/*");
                startActivityForResult(i, SELECT_IMAGE);
            }
        });

        Button buttonDetect = (Button) findViewById(R.id.buttonDetect);
        buttonDetect.setOnClickListener(new View.OnClickListener() {
            @Override
            public void onClick(View arg0) {
                if (yourSelectedImage == null)
                    return;

                String result = squeezencnn.Detect(yourSelectedImage);

                if (result == null)
                {
                    infoResult.setText("detect failed");
                }
                else
                {
                    infoResult.setText(result);
                }
            }
        });
    }

    private void initSqueezeNcnn() throws IOException
    {
        byte[] param = null;
        byte[] bin = null;
        byte[] words = null;

        {
            InputStream assetsInputStream = getAssets().open("squeezenet_v1.1.param.bin");
            int available = assetsInputStream.available();
            param = new byte[available];
            int byteCode = assetsInputStream.read(param);
            assetsInputStream.close();
        }
        {
            InputStream assetsInputStream = getAssets().open("squeezenet_v1.1.bin");
            int available = assetsInputStream.available();
            bin = new byte[available];
            int byteCode = assetsInputStream.read(bin);
            assetsInputStream.close();
        }
        {
            InputStream assetsInputStream = getAssets().open("synset_words.txt");
            int available = assetsInputStream.available();
            words = new byte[available];
            int byteCode = assetsInputStream.read(words);
            assetsInputStream.close();
        }

        squeezencnn.Init(param, bin, words);
    }

    @Override
    protected void onActivityResult(int requestCode, int resultCode, Intent data)
    {
        super.onActivityResult(requestCode, resultCode, data);

        if (resultCode == RESULT_OK && null != data) {
            Uri selectedImage = data.getData();

            try
            {
                if (requestCode == SELECT_IMAGE) {
                    Bitmap bitmap = decodeUri(selectedImage);

                    Bitmap rgba = bitmap.copy(Bitmap.Config.ARGB_8888, true);

                    // resize to 227x227
                    yourSelectedImage = Bitmap.createScaledBitmap(rgba, 227, 227, false);

                    imageView.setImageBitmap(yourSelectedImage);
                }
            }
            catch (FileNotFoundException e)
            {
                Log.e("MainActivity", "FileNotFoundException");
                return;
            }
        }
    }

    private Bitmap decodeUri(Uri selectedImage) throws FileNotFoundException
    {
        // Decode image size
        BitmapFactory.Options o = new BitmapFactory.Options();
        o.inJustDecodeBounds = true;
        BitmapFactory.decodeStream(getContentResolver().openInputStream(selectedImage), null, o);

        // The new size we want to scale to
        final int REQUIRED_SIZE = 400;

        // Find the correct scale value. It should be the power of 2.
        int width_tmp = o.outWidth, height_tmp = o.outHeight;
        int scale = 1;
        while (true) {
            if (width_tmp / 2 < REQUIRED_SIZE
               || height_tmp / 2 < REQUIRED_SIZE) {
                break;
            }
            width_tmp /= 2;
            height_tmp /= 2;
            scale *= 2;
        }

        // Decode with inSampleSize
        BitmapFactory.Options o2 = new BitmapFactory.Options();
        o2.inSampleSize = scale;
        return BitmapFactory.decodeStream(getContentResolver().openInputStream(selectedImage), null, o2);
    }

 }
--- a/examples/squeezencnn/src/com/tencent/squeezencnn/SqueezeNcnn.java
+++ b/examples/squeezencnn/src/com/tencent/squeezencnn/SqueezeNcnn.java
@@ -0,0 +1,29 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 package com.tencent.squeezencnn;

 import android.graphics.Bitmap;
 import android.content.Context;

 /**
  * Java-side declaration of the native "squeezencnn" library.
  * The library is loaded once, when this class is initialized.
  */
 public class SqueezeNcnn
 {
    static {
        System.loadLibrary("squeezencnn");
    }

    /**
     * Native initialization. MainActivity passes the raw contents of
     * squeezenet_v1.1.param.bin, squeezenet_v1.1.bin and synset_words.txt.
     * NOTE(review): the boolean presumably signals success; confirm against the JNI side.
     */
    public native boolean Init(byte[] param, byte[] bin, byte[] words);

    /**
     * Native detection on a bitmap. MainActivity treats a null return as
     * failure and otherwise displays the returned string; it passes a
     * 227x227 ARGB_8888 bitmap (whether that is required is decided by the
     * native code, which is not visible here).
     */
    public native String Detect(Bitmap bitmap);
 }
--- a/examples/squeezenet.cpp
+++ b/examples/squeezenet.cpp
@@ -0,0 +1,95 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include <stdio.h>
 #include <algorithm>
 #include <vector>
 #include <opencv2/core/core.hpp>
 #include <opencv2/highgui/highgui.hpp>

 #include "net.h"

 // Run SqueezeNet v1.1 on a BGR image and collect one score per output channel.
 //
 // bgr         input image; resized to 227x227 before inference
 // cls_scores  receives the first value of each channel of the "prob" blob;
 //             left empty when the function fails
 //
 // Returns 0 on success, -1 if the network files cannot be loaded or the
 // output blob cannot be extracted.
 static int detect_squeezenet(const cv::Mat& bgr, std::vector<float>& cls_scores)
 {
    cls_scores.clear();

    ncnn::Net squeezenet;
    if (squeezenet.load_param("squeezenet_v1.1.param") != 0)
    {
        fprintf(stderr, "load_param squeezenet_v1.1.param failed\n");
        return -1;
    }
    if (squeezenet.load_model("squeezenet_v1.1.bin") != 0)
    {
        fprintf(stderr, "load_model squeezenet_v1.1.bin failed\n");
        return -1;
    }

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, 227, 227);

    // per-channel mean subtraction only; no normalization factors are passed
    const float mean_vals[3] = {104.f, 117.f, 123.f};
    in.substract_mean_normalize(mean_vals, 0);

    ncnn::Extractor ex = squeezenet.create_extractor();
    ex.set_light_mode(true);

    ex.input("data", in);

    ncnn::Mat out;
    if (ex.extract("prob", out) != 0)
    {
        fprintf(stderr, "extract prob failed\n");
        return -1;
    }

    cls_scores.resize(out.c);
    for (int j=0; j<out.c; j++)
    {
        // channel j starts cstep elements after channel j-1
        const float* prob = out.data + out.cstep * j;
        cls_scores[j] = prob[0];
    }

    return 0;
 }

 // Print the topk highest scores to stderr as "index = score" lines,
 // best score first.
 //
 // cls_scores  score per class index
 // topk        number of entries to print; clamped to [0, cls_scores.size()]
 //             so that a short or empty score list is never read out of range
 //
 // Always returns 0.
 static int print_topk(const std::vector<float>& cls_scores, int topk)
 {
    const int size = static_cast<int>(cls_scores.size());

    if (topk > size)
        topk = size;
    if (topk < 0)
        topk = 0;

    // pair each score with its index so the index survives sorting
    std::vector< std::pair<float, int> > vec;
    vec.resize(size);
    for (int i=0; i<size; i++)
    {
        vec[i] = std::make_pair(cls_scores[i], i);
    }

    // only the first topk elements need to be ordered (descending)
    std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
                      std::greater< std::pair<float, int> >());

    // print topk and score
    for (int i=0; i<topk; i++)
    {
        float score = vec[i].first;
        int index = vec[i].second;
        fprintf(stderr, "%d = %f\n", index, score);
    }

    return 0;
 }

 // Usage: <program> imagepath
 // Loads the image, classifies it and prints the three best class indices
 // with their scores. Returns 0 on success and -1 on any failure.
 int main(int argc, char** argv)
 {
    // argv[1] is only valid when an argument was actually supplied
    if (argc < 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, CV_LOAD_IMAGE_COLOR);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<float> cls_scores;
    if (detect_squeezenet(m, cls_scores) != 0)
    {
        fprintf(stderr, "detect_squeezenet failed\n");
        return -1;
    }

    print_topk(cls_scores, 3);

    return 0;
 }

--- a/examples/squeezenet_v1.1.bin
+++ b/examples/squeezenet_v1.1.bin
--- a/examples/squeezenet_v1.1.caffemodel
+++ b/examples/squeezenet_v1.1.caffemodel
--- a/examples/squeezenet_v1.1.param
+++ b/examples/squeezenet_v1.1.param
@@ -0,0 +1,76 @@
 75 83
 Input            data             0 1 data 3 227 227
 Convolution      conv1            1 1 data conv1 64 3 1 2 0 1 1728
 ReLU             relu_conv1       1 1 conv1 conv1_relu_conv1 0.000000
 Pooling          pool1            1 1 conv1_relu_conv1 pool1 0 3 2 0 0
 Convolution      fire2/squeeze1x1 1 1 pool1 fire2/squeeze1x1 16 1 1 1 0 1 1024
 ReLU             fire2/relu_squeeze1x1 1 1 fire2/squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1 0.000000
 Split            splitncnn_0      1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1
 Convolution      fire2/expand1x1  1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1 64 1 1 1 0 1 1024
 ReLU             fire2/relu_expand1x1 1 1 fire2/expand1x1 fire2/expand1x1_fire2/relu_expand1x1 0.000000
 Convolution      fire2/expand3x3  1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3 64 3 1 1 1 1 9216
 ReLU             fire2/relu_expand3x3 1 1 fire2/expand3x3 fire2/expand3x3_fire2/relu_expand3x3 0.000000
 Concat           fire2/concat     2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat
 Convolution      fire3/squeeze1x1 1 1 fire2/concat fire3/squeeze1x1 16 1 1 1 0 1 2048
 ReLU             fire3/relu_squeeze1x1 1 1 fire3/squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1 0.000000
 Split            splitncnn_1      1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1
 Convolution      fire3/expand1x1  1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1 64 1 1 1 0 1 1024
 ReLU             fire3/relu_expand1x1 1 1 fire3/expand1x1 fire3/expand1x1_fire3/relu_expand1x1 0.000000
 Convolution      fire3/expand3x3  1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3 64 3 1 1 1 1 9216
 ReLU             fire3/relu_expand3x3 1 1 fire3/expand3x3 fire3/expand3x3_fire3/relu_expand3x3 0.000000
 Concat           fire3/concat     2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat
 Pooling          pool3            1 1 fire3/concat pool3 0 3 2 0 0
 Convolution      fire4/squeeze1x1 1 1 pool3 fire4/squeeze1x1 32 1 1 1 0 1 4096
 ReLU             fire4/relu_squeeze1x1 1 1 fire4/squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1 0.000000
 Split            splitncnn_2      1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1
 Convolution      fire4/expand1x1  1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1 128 1 1 1 0 1 4096
 ReLU             fire4/relu_expand1x1 1 1 fire4/expand1x1 fire4/expand1x1_fire4/relu_expand1x1 0.000000
 Convolution      fire4/expand3x3  1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3 128 3 1 1 1 1 36864
 ReLU             fire4/relu_expand3x3 1 1 fire4/expand3x3 fire4/expand3x3_fire4/relu_expand3x3 0.000000
 Concat           fire4/concat     2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat
 Convolution      fire5/squeeze1x1 1 1 fire4/concat fire5/squeeze1x1 32 1 1 1 0 1 8192
 ReLU             fire5/relu_squeeze1x1 1 1 fire5/squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1 0.000000
 Split            splitncnn_3      1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1
 Convolution      fire5/expand1x1  1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1 128 1 1 1 0 1 4096
 ReLU             fire5/relu_expand1x1 1 1 fire5/expand1x1 fire5/expand1x1_fire5/relu_expand1x1 0.000000
 Convolution      fire5/expand3x3  1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3 128 3 1 1 1 1 36864
 ReLU             fire5/relu_expand3x3 1 1 fire5/expand3x3 fire5/expand3x3_fire5/relu_expand3x3 0.000000
 Concat           fire5/concat     2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat
 Pooling          pool5            1 1 fire5/concat pool5 0 3 2 0 0
 Convolution      fire6/squeeze1x1 1 1 pool5 fire6/squeeze1x1 48 1 1 1 0 1 12288
 ReLU             fire6/relu_squeeze1x1 1 1 fire6/squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1 0.000000
 Split            splitncnn_4      1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1
 Convolution      fire6/expand1x1  1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1 192 1 1 1 0 1 9216
 ReLU             fire6/relu_expand1x1 1 1 fire6/expand1x1 fire6/expand1x1_fire6/relu_expand1x1 0.000000
 Convolution      fire6/expand3x3  1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3 192 3 1 1 1 1 82944
 ReLU             fire6/relu_expand3x3 1 1 fire6/expand3x3 fire6/expand3x3_fire6/relu_expand3x3 0.000000
 Concat           fire6/concat     2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat
 Convolution      fire7/squeeze1x1 1 1 fire6/concat fire7/squeeze1x1 48 1 1 1 0 1 18432
 ReLU             fire7/relu_squeeze1x1 1 1 fire7/squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1 0.000000
 Split            splitncnn_5      1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1
 Convolution      fire7/expand1x1  1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1 192 1 1 1 0 1 9216
 ReLU             fire7/relu_expand1x1 1 1 fire7/expand1x1 fire7/expand1x1_fire7/relu_expand1x1 0.000000
 Convolution      fire7/expand3x3  1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3 192 3 1 1 1 1 82944
 ReLU             fire7/relu_expand3x3 1 1 fire7/expand3x3 fire7/expand3x3_fire7/relu_expand3x3 0.000000
 Concat           fire7/concat     2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat
 Convolution      fire8/squeeze1x1 1 1 fire7/concat fire8/squeeze1x1 64 1 1 1 0 1 24576
 ReLU             fire8/relu_squeeze1x1 1 1 fire8/squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1 0.000000
 Split            splitncnn_6      1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1
 Convolution      fire8/expand1x1  1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1 256 1 1 1 0 1 16384
 ReLU             fire8/relu_expand1x1 1 1 fire8/expand1x1 fire8/expand1x1_fire8/relu_expand1x1 0.000000
 Convolution      fire8/expand3x3  1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3 256 3 1 1 1 1 147456
 ReLU             fire8/relu_expand3x3 1 1 fire8/expand3x3 fire8/expand3x3_fire8/relu_expand3x3 0.000000
 Concat           fire8/concat     2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat
 Convolution      fire9/squeeze1x1 1 1 fire8/concat fire9/squeeze1x1 64 1 1 1 0 1 32768
 ReLU             fire9/relu_squeeze1x1 1 1 fire9/squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1 0.000000
 Split            splitncnn_7      1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1
 Convolution      fire9/expand1x1  1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1 256 1 1 1 0 1 16384
 ReLU             fire9/relu_expand1x1 1 1 fire9/expand1x1 fire9/expand1x1_fire9/relu_expand1x1 0.000000
 Convolution      fire9/expand3x3  1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3 256 3 1 1 1 1 147456
 ReLU             fire9/relu_expand3x3 1 1 fire9/expand3x3 fire9/expand3x3_fire9/relu_expand3x3 0.000000
 Concat           fire9/concat     2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat
 Dropout          drop9            1 1 fire9/concat fire9/concat_drop9
 Convolution      conv10           1 1 fire9/concat_drop9 conv10 1000 1 1 1 1 1 512000
 ReLU             relu_conv10      1 1 conv10 conv10_relu_conv10 0.000000
 Pooling          pool10           1 1 conv10_relu_conv10 pool10 1 0 1 0 1
 Softmax          prob             1 1 pool10 prob
--- a/examples/squeezenet_v1.1.prototxt
+++ b/examples/squeezenet_v1.1.prototxt
@@ -0,0 +1,548 @@
 name: "squeezenet_v1.1_deploy"

 layer {
  name: "data"
  type: "Input"
  top: "data"
  input_param { shape: { dim: 1 dim: 3 dim: 227 dim: 227 } }
 }
 layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"
  convolution_param {
    num_output: 64
    kernel_size: 3
    stride: 2
  }
 }
 layer {
  name: "relu_conv1"
  type: "ReLU"
  bottom: "conv1"
  top: "conv1"
 }
 layer {
  name: "pool1"
  type: "Pooling"
  bottom: "conv1"
  top: "pool1"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
 }
 layer {
  name: "fire2/squeeze1x1"
  type: "Convolution"
  bottom: "pool1"
  top: "fire2/squeeze1x1"
  convolution_param {
    num_output: 16
    kernel_size: 1
  }
 }
 layer {
  name: "fire2/relu_squeeze1x1"
  type: "ReLU"
  bottom: "fire2/squeeze1x1"
  top: "fire2/squeeze1x1"
 }
 layer {
  name: "fire2/expand1x1"
  type: "Convolution"
  bottom: "fire2/squeeze1x1"
  top: "fire2/expand1x1"
  convolution_param {
    num_output: 64
    kernel_size: 1
  }
 }
 layer {
  name: "fire2/relu_expand1x1"
  type: "ReLU"
  bottom: "fire2/expand1x1"
  top: "fire2/expand1x1"
 }
 layer {
  name: "fire2/expand3x3"
  type: "Convolution"
  bottom: "fire2/squeeze1x1"
  top: "fire2/expand3x3"
  convolution_param {
    num_output: 64
    pad: 1
    kernel_size: 3
  }
 }
 layer {
  name: "fire2/relu_expand3x3"
  type: "ReLU"
  bottom: "fire2/expand3x3"
  top: "fire2/expand3x3"
 }
 layer {
  name: "fire2/concat"
  type: "Concat"
  bottom: "fire2/expand1x1"
  bottom: "fire2/expand3x3"
  top: "fire2/concat"
 }
 layer {
  name: "fire3/squeeze1x1"
  type: "Convolution"
  bottom: "fire2/concat"
  top: "fire3/squeeze1x1"
  convolution_param {
    num_output: 16
    kernel_size: 1
  }
 }
 layer {
  name: "fire3/relu_squeeze1x1"
  type: "ReLU"
  bottom: "fire3/squeeze1x1"
  top: "fire3/squeeze1x1"
 }
 layer {
  name: "fire3/expand1x1"
  type: "Convolution"
  bottom: "fire3/squeeze1x1"
  top: "fire3/expand1x1"
  convolution_param {
    num_output: 64
    kernel_size: 1
  }
 }
 layer {
  name: "fire3/relu_expand1x1"
  type: "ReLU"
  bottom: "fire3/expand1x1"
  top: "fire3/expand1x1"
 }
 layer {
  name: "fire3/expand3x3"
  type: "Convolution"
  bottom: "fire3/squeeze1x1"
  top: "fire3/expand3x3"
  convolution_param {
    num_output: 64
    pad: 1
    kernel_size: 3
  }
 }
 layer {
  name: "fire3/relu_expand3x3"
  type: "ReLU"
  bottom: "fire3/expand3x3"
  top: "fire3/expand3x3"
 }
 layer {
  name: "fire3/concat"
  type: "Concat"
  bottom: "fire3/expand1x1"
  bottom: "fire3/expand3x3"
  top: "fire3/concat"
 }
 layer {
  name: "pool3"
  type: "Pooling"
  bottom: "fire3/concat"
  top: "pool3"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
 }
 layer {
  name: "fire4/squeeze1x1"
  type: "Convolution"
  bottom: "pool3"
  top: "fire4/squeeze1x1"
  convolution_param {
    num_output: 32
    kernel_size: 1
  }
 }
 layer {
  name: "fire4/relu_squeeze1x1"
  type: "ReLU"
  bottom: "fire4/squeeze1x1"
  top: "fire4/squeeze1x1"
 }
 layer {
  name: "fire4/expand1x1"
  type: "Convolution"
  bottom: "fire4/squeeze1x1"
  top: "fire4/expand1x1"
  convolution_param {
    num_output: 128
    kernel_size: 1
  }
 }
 layer {
  name: "fire4/relu_expand1x1"
  type: "ReLU"
  bottom: "fire4/expand1x1"
  top: "fire4/expand1x1"
 }
 layer {
  name: "fire4/expand3x3"
  type: "Convolution"
  bottom: "fire4/squeeze1x1"
  top: "fire4/expand3x3"
  convolution_param {
    num_output: 128
    pad: 1
    kernel_size: 3
  }
 }
 layer {
  name: "fire4/relu_expand3x3"
  type: "ReLU"
  bottom: "fire4/expand3x3"
  top: "fire4/expand3x3"
 }
 layer {
  name: "fire4/concat"
  type: "Concat"
  bottom: "fire4/expand1x1"
  bottom: "fire4/expand3x3"
  top: "fire4/concat"
 }
 layer {
  name: "fire5/squeeze1x1"
  type: "Convolution"
  bottom: "fire4/concat"
  top: "fire5/squeeze1x1"
  convolution_param {
    num_output: 32
    kernel_size: 1
  }
 }
 layer {
  name: "fire5/relu_squeeze1x1"
  type: "ReLU"
  bottom: "fire5/squeeze1x1"
  top: "fire5/squeeze1x1"
 }
 layer {
  name: "fire5/expand1x1"
  type: "Convolution"
  bottom: "fire5/squeeze1x1"
  top: "fire5/expand1x1"
  convolution_param {
    num_output: 128
    kernel_size: 1
  }
 }
 layer {
  name: "fire5/relu_expand1x1"
  type: "ReLU"
  bottom: "fire5/expand1x1"
  top: "fire5/expand1x1"
 }
 layer {
  name: "fire5/expand3x3"
  type: "Convolution"
  bottom: "fire5/squeeze1x1"
  top: "fire5/expand3x3"
  convolution_param {
    num_output: 128
    pad: 1
    kernel_size: 3
  }
 }
 layer {
  name: "fire5/relu_expand3x3"
  type: "ReLU"
  bottom: "fire5/expand3x3"
  top: "fire5/expand3x3"
 }
 layer {
  name: "fire5/concat"
  type: "Concat"
  bottom: "fire5/expand1x1"
  bottom: "fire5/expand3x3"
  top: "fire5/concat"
 }
 layer {
  name: "pool5"
  type: "Pooling"
  bottom: "fire5/concat"
  top: "pool5"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
 }
 layer {
  name: "fire6/squeeze1x1"
  type: "Convolution"
  bottom: "pool5"
  top: "fire6/squeeze1x1"
  convolution_param {
    num_output: 48
    kernel_size: 1
  }
 }
 layer {
  name: "fire6/relu_squeeze1x1"
  type: "ReLU"
  bottom: "fire6/squeeze1x1"
  top: "fire6/squeeze1x1"
 }
 layer {
  name: "fire6/expand1x1"
  type: "Convolution"
  bottom: "fire6/squeeze1x1"
  top: "fire6/expand1x1"
  convolution_param {
    num_output: 192
    kernel_size: 1
  }
 }
 layer {
  name: "fire6/relu_expand1x1"
  type: "ReLU"
  bottom: "fire6/expand1x1"
  top: "fire6/expand1x1"
 }
 layer {
  name: "fire6/expand3x3"
  type: "Convolution"
  bottom: "fire6/squeeze1x1"
  top: "fire6/expand3x3"
  convolution_param {
    num_output: 192
    pad: 1
    kernel_size: 3
  }
 }
 layer {
  name: "fire6/relu_expand3x3"
  type: "ReLU"
  bottom: "fire6/expand3x3"
  top: "fire6/expand3x3"
 }
 layer {
  name: "fire6/concat"
  type: "Concat"
  bottom: "fire6/expand1x1"
  bottom: "fire6/expand3x3"
  top: "fire6/concat"
 }
 layer {
  name: "fire7/squeeze1x1"
  type: "Convolution"
  bottom: "fire6/concat"
  top: "fire7/squeeze1x1"
  convolution_param {
    num_output: 48
    kernel_size: 1
  }
 }
 layer {
  name: "fire7/relu_squeeze1x1"
  type: "ReLU"
  bottom: "fire7/squeeze1x1"
  top: "fire7/squeeze1x1"
 }
 layer {
  name: "fire7/expand1x1"
  type: "Convolution"
  bottom: "fire7/squeeze1x1"
  top: "fire7/expand1x1"
  convolution_param {
    num_output: 192
    kernel_size: 1
  }
 }
 layer {
  name: "fire7/relu_expand1x1"
  type: "ReLU"
  bottom: "fire7/expand1x1"
  top: "fire7/expand1x1"
 }
 layer {
  name: "fire7/expand3x3"
  type: "Convolution"
  bottom: "fire7/squeeze1x1"
  top: "fire7/expand3x3"
  convolution_param {
    num_output: 192
    pad: 1
    kernel_size: 3
  }
 }
 layer {
  name: "fire7/relu_expand3x3"
  type: "ReLU"
  bottom: "fire7/expand3x3"
  top: "fire7/expand3x3"
 }
 layer {
  name: "fire7/concat"
  type: "Concat"
  bottom: "fire7/expand1x1"
  bottom: "fire7/expand3x3"
  top: "fire7/concat"
 }
 layer {
  name: "fire8/squeeze1x1"
  type: "Convolution"
  bottom: "fire7/concat"
  top: "fire8/squeeze1x1"
  convolution_param {
    num_output: 64
    kernel_size: 1
  }
 }
 layer {
  name: "fire8/relu_squeeze1x1"
  type: "ReLU"
  bottom: "fire8/squeeze1x1"
  top: "fire8/squeeze1x1"
 }
 layer {
  name: "fire8/expand1x1"
  type: "Convolution"
  bottom: "fire8/squeeze1x1"
  top: "fire8/expand1x1"
  convolution_param {
    num_output: 256
    kernel_size: 1
  }
 }
 layer {
  name: "fire8/relu_expand1x1"
  type: "ReLU"
  bottom: "fire8/expand1x1"
  top: "fire8/expand1x1"
 }
 layer {
  name: "fire8/expand3x3"
  type: "Convolution"
  bottom: "fire8/squeeze1x1"
  top: "fire8/expand3x3"
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
  }
 }
 layer {
  name: "fire8/relu_expand3x3"
  type: "ReLU"
  bottom: "fire8/expand3x3"
  top: "fire8/expand3x3"
 }
 layer {
  name: "fire8/concat"
  type: "Concat"
  bottom: "fire8/expand1x1"
  bottom: "fire8/expand3x3"
  top: "fire8/concat"
 }
 layer {
  name: "fire9/squeeze1x1"
  type: "Convolution"
  bottom: "fire8/concat"
  top: "fire9/squeeze1x1"
  convolution_param {
    num_output: 64
    kernel_size: 1
  }
 }
 layer {
  name: "fire9/relu_squeeze1x1"
  type: "ReLU"
  bottom: "fire9/squeeze1x1"
  top: "fire9/squeeze1x1"
 }
 layer {
  name: "fire9/expand1x1"
  type: "Convolution"
  bottom: "fire9/squeeze1x1"
  top: "fire9/expand1x1"
  convolution_param {
    num_output: 256
    kernel_size: 1
  }
 }
 layer {
  name: "fire9/relu_expand1x1"
  type: "ReLU"
  bottom: "fire9/expand1x1"
  top: "fire9/expand1x1"
 }
 layer {
  name: "fire9/expand3x3"
  type: "Convolution"
  bottom: "fire9/squeeze1x1"
  top: "fire9/expand3x3"
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
  }
 }
 layer {
  name: "fire9/relu_expand3x3"
  type: "ReLU"
  bottom: "fire9/expand3x3"
  top: "fire9/expand3x3"
 }
 layer {
  name: "fire9/concat"
  type: "Concat"
  bottom: "fire9/expand1x1"
  bottom: "fire9/expand3x3"
  top: "fire9/concat"
 }
 layer {
  name: "drop9"
  type: "Dropout"
  bottom: "fire9/concat"
  top: "fire9/concat"
  dropout_param {
    dropout_ratio: 0.5
  }
 }
 layer {
  name: "conv10"
  type: "Convolution"
  bottom: "fire9/concat"
  top: "conv10"
  convolution_param {
    num_output: 1000
    pad: 1
    kernel_size: 1
  }
 }
 layer {
  name: "relu_conv10"
  type: "ReLU"
  bottom: "conv10"
  top: "conv10"
 }
 layer {
  name: "pool10"
  type: "Pooling"
  bottom: "conv10"
  top: "pool10"
  pooling_param {
    pool: AVE
    global_pooling: true
  }
 }
 layer {
  name: "prob"
  type: "Softmax"
  bottom: "pool10"
  top: "prob"
 }
--- a/examples/synset_words.txt
+++ b/examples/synset_words.txt
--- a/ios.toolchain.cmake
+++ b/ios.toolchain.cmake
@@ -0,0 +1,193 @@
 # This file is based off of the Platform/Darwin.cmake and Platform/UnixPaths.cmake
 # files which are included with CMake 2.8.4
 # It has been altered for iOS development

 # Options:
 #
 # IOS_PLATFORM = iPhoneOS (default) or iPhoneSimulator
 #   This decides if SDKS will be selected from the iPhoneOS.platform or iPhoneSimulator.platform folders
 #   iPhoneOS - the default, used to build for iPhone and iPad physical devices, which have an arm arch.
 #   iPhoneSimulator - used to build for the Simulator platforms, which have an x86 arch.
 #
 # CMAKE_IOS_DEVELOPER_ROOT = automatic(default) or /path/to/platform/Developer folder
 #   By default this location is automatically chosen based on the IOS_PLATFORM value above.
 #   If set manually, it will override the default location and force the use of a particular Developer Platform
 #
 # CMAKE_IOS_SDK_ROOT = automatic(default) or /path/to/platform/Developer/SDKs/SDK folder
 #   By default this location is automatically chosen based on the CMAKE_IOS_DEVELOPER_ROOT value.
 #   In this case it will always be the most up-to-date SDK found in the CMAKE_IOS_DEVELOPER_ROOT path.
 #   If set manually, this will force the use of a specific SDK version

 # Macros:
 #
 # set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE)
 #  A convenience macro for setting xcode specific properties on targets
 #  example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1")
 #
 # find_host_package (PROGRAM ARGS)
 #  A macro used to find executable programs on the host system, not within the iOS environment.
 #  Thanks to the android-cmake project for providing the command

 # Standard settings
 # Describe the target system as Darwin and raise the UNIX / APPLE / IOS flags
 # that the rest of the build can test.
 set (CMAKE_SYSTEM_NAME Darwin)
 set (CMAKE_SYSTEM_VERSION 1)
 set (UNIX True)
 set (APPLE True)
 set (IOS True)

 # Required as of cmake 2.8.10
 # FORCE overwrites any cached deployment target with an empty value on every run.
 set (CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING "Force unset of the deployment target for iOS" FORCE)

 # Determine the cmake host system version so we know where to find the iOS SDKs.
 # `uname -r` prints the kernel release of the host; its leading number is kept
 # as DARWIN_MAJOR_VERSION.
 find_program (CMAKE_UNAME uname /bin /usr/bin /usr/local/bin)
 if (CMAKE_UNAME)
 	# execute_process replaces the deprecated exec_program and runs exactly the
 	# binary located by find_program above instead of a bare `uname` lookup.
 	execute_process (COMMAND "${CMAKE_UNAME}" -r
 		OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_VERSION
 		OUTPUT_STRIP_TRAILING_WHITESPACE)
 	string (REGEX REPLACE "^([0-9]+)\\.([0-9]+).*$" "\\1" DARWIN_MAJOR_VERSION "${CMAKE_HOST_SYSTEM_VERSION}")
 endif ()

 # Force the compilers to clang for iOS
 # (the CMAKE_FORCE_*_COMPILER macros come from the CMakeForceCompiler module;
 # the second argument is the compiler id to record)
 include (CMakeForceCompiler)
 CMAKE_FORCE_C_COMPILER (/usr/bin/clang Apple)
 CMAKE_FORCE_CXX_COMPILER (/usr/bin/clang++ Apple)
 # FORCE overwrites any archiver already stored in the cache
 set(CMAKE_AR ar CACHE FILEPATH "" FORCE)

 # Skip the platform compiler checks for cross compiling
 set (CMAKE_CXX_COMPILER_WORKS TRUE)
 set (CMAKE_C_COMPILER_WORKS TRUE)

 # All iOS/Darwin specific settings - some may be redundant
 # Naming of shared libraries (lib*.dylib) and loadable modules (lib*.so).
 set (CMAKE_SHARED_LIBRARY_PREFIX "lib")
 set (CMAKE_SHARED_LIBRARY_SUFFIX ".dylib")
 set (CMAKE_SHARED_MODULE_PREFIX "lib")
 set (CMAKE_SHARED_MODULE_SUFFIX ".so")
 set (CMAKE_MODULE_EXISTS 1)
 set (CMAKE_DL_LIBS "")

 # Linker flags carrying the compatibility / current version; the C++ flags reuse the C ones.
 set (CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ")
 set (CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ")
 set (CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}")
 set (CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}")

 # Hidden visibility is required for cxx on iOS
 # NOTE(review): CMAKE_OSX_SYSROOT is expanded here but only assigned further down
 # in this file; unless it already comes from the cache or the command line, the
 # -isysroot argument is presumably empty the first time this file is read - confirm.
 set (CMAKE_C_FLAGS_INIT "-isysroot ${CMAKE_OSX_SYSROOT} -miphoneos-version-min=6.0")
 set (CMAKE_CXX_FLAGS_INIT "-stdlib=libc++ -fvisibility=hidden -fvisibility-inlines-hidden -isysroot ${CMAKE_OSX_SYSROOT} -miphoneos-version-min=6.0")

 # Prepend -search_paths_first to whatever link flags are already set.
 set (CMAKE_C_LINK_FLAGS "-Wl,-search_paths_first ${CMAKE_C_LINK_FLAGS}")
 set (CMAKE_CXX_LINK_FLAGS "-Wl,-search_paths_first ${CMAKE_CXX_LINK_FLAGS}")

 # Flags used to create shared libraries and bundles, and the library suffixes
 # tried by find_library (in this order).
 set (CMAKE_PLATFORM_HAS_INSTALLNAME 1)
 set (CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -headerpad_max_install_names")
 set (CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -headerpad_max_install_names")
 set (CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,")
 set (CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,")
 set (CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" ".so" ".a")

 # Compatibility hack (Alex): a newer cmake relies on CMAKE_INSTALL_NAME_TOOL, but an
 # old build tree may have been created when install_name_tool was still hardcoded,
 # so the variable is missing from its cache and CMakeFindBinUtils.cmake is not
 # rerun to provide it. Look the tool up here so such trees behave as before.
 if (NOT DEFINED CMAKE_INSTALL_NAME_TOOL)
 	find_program (CMAKE_INSTALL_NAME_TOOL install_name_tool)
 endif ()

 # Setup iOS platform unless specified manually with IOS_PLATFORM
 if (NOT DEFINED IOS_PLATFORM)
 	set (IOS_PLATFORM "iPhoneOS")
 endif ()
 set (IOS_PLATFORM "${IOS_PLATFORM}" CACHE STRING "Type of iOS Platform")

 # Check the platform selection and setup for developer root.
 # The value is quoted so that an empty or unexpected IOS_PLATFORM reaches the
 # FATAL_ERROR below instead of producing a malformed if() expression.
 if ("${IOS_PLATFORM}" STREQUAL "iPhoneOS")
 	set (IOS_PLATFORM_LOCATION "iPhoneOS.platform")

 	# This causes the installers to properly locate the output libraries
 	set (CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphoneos")
 elseif ("${IOS_PLATFORM}" STREQUAL "iPhoneSimulator")
 	set (IOS_PLATFORM_LOCATION "iPhoneSimulator.platform")

 	# This causes the installers to properly locate the output libraries
 	set (CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphonesimulator")
 else ()
 	message (FATAL_ERROR "Unsupported IOS_PLATFORM value selected. Please choose iPhoneOS or iPhoneSimulator")
 endif ()

 # Locate the platform's Developer directory unless CMAKE_IOS_DEVELOPER_ROOT is given.
 # Xcode 4.3 changed the installation location; the newer location is preferred
 # and the older one is only used when the newer one does not exist.
 set (XCODE_POST_43_ROOT "/Applications/Xcode.app/Contents/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
 set (XCODE_PRE_43_ROOT "/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
 if (NOT DEFINED CMAKE_IOS_DEVELOPER_ROOT)
 	if (EXISTS "${XCODE_POST_43_ROOT}")
 		set (CMAKE_IOS_DEVELOPER_ROOT "${XCODE_POST_43_ROOT}")
 	elseif (EXISTS "${XCODE_PRE_43_ROOT}")
 		set (CMAKE_IOS_DEVELOPER_ROOT "${XCODE_PRE_43_ROOT}")
 	endif ()
 endif ()
 # publish the result (possibly empty when neither location exists) in the cache
 set (CMAKE_IOS_DEVELOPER_ROOT ${CMAKE_IOS_DEVELOPER_ROOT} CACHE PATH "Location of iOS Platform")

 # Find and use the most recent iOS sdk unless specified manually with CMAKE_IOS_SDK_ROOT
 if (NOT DEFINED CMAKE_IOS_SDK_ROOT)
 	file (GLOB _CMAKE_IOS_SDKS "${CMAKE_IOS_DEVELOPER_ROOT}/SDKs/*")
 	if (NOT _CMAKE_IOS_SDKS)
 		message (FATAL_ERROR "No iOS SDK's found in default search path ${CMAKE_IOS_DEVELOPER_ROOT}. Manually set CMAKE_IOS_SDK_ROOT or install the iOS SDK.")
 	endif ()

 	# Fallback: the lexicographically last entry, used when no entry name
 	# carries a version number.
 	list (SORT _CMAKE_IOS_SDKS)
 	list (REVERSE _CMAKE_IOS_SDKS)
 	list (GET _CMAKE_IOS_SDKS 0 CMAKE_IOS_SDK_ROOT)

 	# Prefer the entry with the highest version number in its name. A plain
 	# string sort would rank "9.3" above "10.0", so versions are compared with
 	# VERSION_GREATER instead.
 	set (_CMAKE_IOS_SDK_NEWEST "")
 	foreach (_CMAKE_IOS_SDK ${_CMAKE_IOS_SDKS})
 		get_filename_component (_CMAKE_IOS_SDK_NAME "${_CMAKE_IOS_SDK}" NAME)
 		string (REGEX MATCH "[0-9]+(\\.[0-9]+)*" _CMAKE_IOS_SDK_VERSION "${_CMAKE_IOS_SDK_NAME}")
 		if (NOT "${_CMAKE_IOS_SDK_VERSION}" STREQUAL "")
 			if ("${_CMAKE_IOS_SDK_NEWEST}" STREQUAL "" OR "${_CMAKE_IOS_SDK_VERSION}" VERSION_GREATER "${_CMAKE_IOS_SDK_NEWEST}")
 				set (_CMAKE_IOS_SDK_NEWEST "${_CMAKE_IOS_SDK_VERSION}")
 				set (CMAKE_IOS_SDK_ROOT "${_CMAKE_IOS_SDK}")
 			endif ()
 		endif ()
 	endforeach ()

 	message (STATUS "Toolchain using default iOS SDK: ${CMAKE_IOS_SDK_ROOT}")
 endif ()
 set (CMAKE_IOS_SDK_ROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Location of the selected iOS SDK")

 # Set the sysroot default to the most recent SDK
 set (CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support")

 # set the architecture for iOS
 # NOTE: Currently both ARCHS_STANDARD_32_BIT and ARCHS_UNIVERSAL_IPHONE_OS set armv7 only, so set both manually
 # (device builds use armv7, every other platform value falls back to i386)
 if ("${IOS_PLATFORM}" STREQUAL "iPhoneOS")
 	set (IOS_ARCH armv7)
 else ()
 	set (IOS_ARCH i386)
 endif ()

 # STRING is the valid cache entry type; the lowercase spelling is not one
 set (CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS")

 # Set the find root to the iOS developer roots and to user defined paths
 set (CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE STRING "iOS find search path root")

 # default to searching for frameworks first
 set (CMAKE_FIND_FRAMEWORK FIRST)

 # set up the default search directories for frameworks
 # (all three live below the SDK selected via CMAKE_IOS_SDK_ROOT)
 set (CMAKE_SYSTEM_FRAMEWORK_PATH
 	${CMAKE_IOS_SDK_ROOT}/System/Library/Frameworks
 	${CMAKE_IOS_SDK_ROOT}/System/Library/PrivateFrameworks
 	${CMAKE_IOS_SDK_ROOT}/Developer/Library/Frameworks
 )

 # only search the iOS sdks, not the remainder of the host filesystem
 # (find_host_package below lifts this restriction temporarily)
 set (CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)
 set (CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
 set (CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)


 # set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE)
 # Stores XCODE_VALUE in the XCODE_ATTRIBUTE_<XCODE_PROPERTY> property of TARGET.
 macro (set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE)
 	set_property (
 		TARGET ${TARGET}
 		PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY}
 		${XCODE_VALUE}
 	)
 endmacro ()


 # find_host_package (<find_package arguments>)
 # Forwards its arguments to find_package() while the search is not confined to
 # the find root and the IOS flag is switched off; afterwards the find-root
 # modes are set back to ONLY and IOS back to TRUE.
 macro (find_host_package)
 	# search the host: drop the IOS flag and the find-root restriction
 	set (IOS FALSE)
 	set (CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER)
 	set (CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER)
 	set (CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)

 	find_package (${ARGN})

 	# back to the cross-compilation settings used by the rest of this file
 	set (CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
 	set (CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
 	set (CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)
 	set (IOS TRUE)
 endmacro ()

--- a/iossimxc.toolchain.cmake
+++ b/iossimxc.toolchain.cmake
@@ -0,0 +1,40 @@
 # Toolchain for cross compiling iOS Simulator binaries with an
 # i386-apple-darwin11 clang toolchain.
 #
 # Options:
 #   CMAKE_IOS_SDK_ROOT = path to the simulator SDK; when not defined, the
 #   location below is used.

 # Standard settings
 set (CMAKE_SYSTEM_NAME Darwin)
 set (CMAKE_SYSTEM_VERSION 1)
 set (UNIX True)
 set (APPLE True)
 set (IOS True)

 # suppress -rdynamic
 # set(CMAKE_SYSTEM_NAME Generic)

 set(CMAKE_C_COMPILER i386-apple-darwin11-clang)
 set(CMAKE_CXX_COMPILER i386-apple-darwin11-clang++)

 set(_CMAKE_TOOLCHAIN_PREFIX i386-apple-darwin11-)

 # keep a user supplied SDK location, fall back to the original default otherwise
 if(NOT DEFINED CMAKE_IOS_SDK_ROOT)
    set(CMAKE_IOS_SDK_ROOT "/home/nihui/osd/cctools-port/usage_examples/ios_toolchain/target-sim/SDK/")
 endif()

 # Set the sysroot default to the selected SDK
 set(CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS Simulator support")

 # set the architecture for iOS
 # set(IOS_ARCH i386)
 # set(IOS_ARCH x86_64)
 set(IOS_ARCH i386;x86_64)

 # STRING is the valid cache entry type; the lowercase spelling is not one
 set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS Simulator")

 # Set the find root to the iOS developer roots and to user defined paths
 set(CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE STRING "iOS Simulator find search path root")

 # search for frameworks first
 set(CMAKE_FIND_FRAMEWORK FIRST)

 # set up the default search directories for frameworks
 set(CMAKE_SYSTEM_FRAMEWORK_PATH
    ${CMAKE_IOS_SDK_ROOT}/System/Library/Frameworks
 )
--- a/iosxc.toolchain.cmake
+++ b/iosxc.toolchain.cmake
@@ -0,0 +1,39 @@
 # Toolchain for cross compiling iOS device binaries with an
 # arm-apple-darwin11 clang toolchain.
 #
 # Options:
 #   CMAKE_IOS_SDK_ROOT = path to the device SDK; when not defined, the
 #   location below is used.

 # Standard settings
 set (CMAKE_SYSTEM_NAME Darwin)
 set (CMAKE_SYSTEM_VERSION 1)
 set (UNIX True)
 set (APPLE True)
 set (IOS True)

 # suppress -rdynamic
 # set(CMAKE_SYSTEM_NAME Generic)

 set(CMAKE_C_COMPILER arm-apple-darwin11-clang)
 set(CMAKE_CXX_COMPILER arm-apple-darwin11-clang++)

 set(_CMAKE_TOOLCHAIN_PREFIX arm-apple-darwin11-)

 # keep a user supplied SDK location, fall back to the original default otherwise
 if(NOT DEFINED CMAKE_IOS_SDK_ROOT)
    set(CMAKE_IOS_SDK_ROOT "/home/nihui/osd/cctools-port/usage_examples/ios_toolchain/target/SDK/")
 endif()

 # Set the sysroot default to the selected SDK
 set(CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support")

 # set the architecture for iOS
 # set(IOS_ARCH arm64)
 set(IOS_ARCH armv7;arm64)

 # STRING is the valid cache entry type; the lowercase spelling is not one
 set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS")

 # Set the find root to the iOS developer roots and to user defined paths
 set(CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE STRING "iOS find search path root")

 # search for frameworks first
 set(CMAKE_FIND_FRAMEWORK FIRST)

 # set up the default search directories for frameworks
 set(CMAKE_SYSTEM_FRAMEWORK_PATH
    ${CMAKE_IOS_SDK_ROOT}/System/Library/Frameworks
 )
--- a/package.sh
+++ b/package.sh
@@ -0,0 +1,35 @@
 #!/usr/bin/env bash
 #
 # Packages the already built and installed libraries into two archives:
 #   <NAME>-android-lib.zip  static libraries for armeabi-v7a and arm64-v8a plus headers
 #   <NAME>.framework.zip    framework holding one library merged from the device
 #                           and simulator builds, plus headers and Info.plist
 # Reads from build-android-armv7, build-android-aarch64, build-ios and
 # build-ios-sim (each with an install/ tree) and Info.plist, all relative to
 # the current directory.
 #
 # The interpreter is looked up through env because bash is not installed at
 # /usr/bin/bash on every system; all expansions are quoted so that a changed
 # NAME cannot be split into several words.

 NAME=ncnn

 ##### package android lib
 ANDROIDPKGNAME="${NAME}-android-lib"
 rm -rf "$ANDROIDPKGNAME"
 mkdir -p "$ANDROIDPKGNAME"
 mkdir -p "$ANDROIDPKGNAME/armeabi-v7a"
 mkdir -p "$ANDROIDPKGNAME/arm64-v8a"
 mkdir -p "$ANDROIDPKGNAME/include"
 cp "build-android-armv7/install/lib/lib${NAME}.a" "$ANDROIDPKGNAME/armeabi-v7a/"
 cp "build-android-aarch64/install/lib/lib${NAME}.a" "$ANDROIDPKGNAME/arm64-v8a/"
 cp build-android-aarch64/install/include/* "$ANDROIDPKGNAME/include/"
 rm -f "$ANDROIDPKGNAME.zip"
 zip -9 -r "$ANDROIDPKGNAME.zip" "$ANDROIDPKGNAME"

 ##### package ios framework
 IOSPKGNAME="${NAME}.framework"
 rm -rf "$IOSPKGNAME"
 mkdir -p "$IOSPKGNAME/Versions/A/Headers"
 mkdir -p "$IOSPKGNAME/Versions/A/Resources"
 # relative symlinks: Versions/Current -> A, top-level entries -> Versions/Current/...
 ln -s A "$IOSPKGNAME/Versions/Current"
 ln -s Versions/Current/Headers "$IOSPKGNAME/Headers"
 ln -s Versions/Current/Resources "$IOSPKGNAME/Resources"
 ln -s "Versions/Current/${NAME}" "$IOSPKGNAME/${NAME}"
 # merge the device and simulator static libraries into one file
 lipo -create \
    "build-ios/install/lib/lib${NAME}.a" \
    "build-ios-sim/install/lib/lib${NAME}.a" \
    -o "$IOSPKGNAME/Versions/A/${NAME}"
 cp -r build-ios/install/include/* "$IOSPKGNAME/Versions/A/Headers/"
 cp Info.plist "${IOSPKGNAME}/Versions/A/Resources/"
 rm -f "$IOSPKGNAME.zip"
 # -y stores the symlinks created above as links instead of following them
 zip -9 -y -r "$IOSPKGNAME.zip" "$IOSPKGNAME"

--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -0,0 +1,135 @@

 ##############################################

 # Generate platform.h in the build directory from the platform.h.in template.
 configure_file(platform.h.in ${CMAKE_CURRENT_BINARY_DIR}/platform.h)

 # Headers are looked up in the source dir, in the build dir (for the generated
 # files) and in the layer sub directory.
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 include_directories(${CMAKE_CURRENT_BINARY_DIR})
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/layer)

 # Sources that are always built; ncnn_add_layer() below appends the layer sources.
 set(ncnn_SRCS
    blob.cpp
    cpu.cpp
    layer.cpp
    mat.cpp
    mat_pixel.cpp
    net.cpp
    opencv.cpp
 )

 # ncnn_add_layer(<Class> [ON|OFF])
 #
 # Registers one layer type with the build. The lower-cased class name selects
 # the source file layer/<name>.cpp and the cache option WITH_LAYER_<name>,
 # whose default is the optional second argument (ON when it is omitted).
 #
 # Enabled layer: the generic source is appended to ncnn_SRCS, followed by
 # layer/arm/<name>_arm.cpp (android armv7-a / aarch64, ios armv7 / arm64) or
 # layer/x86/<name>_x86.cpp (every other target) when that file exists. A
 # creator declaration is appended to layer_declaration.h and a matching entry
 # to layer_registry.h, naming the arm, x86 or generic creator in that order of
 # preference.
 # Disabled layer: only a null entry is appended to layer_registry.h.
 #
 # Being a macro, it appends to ncnn_SRCS directly in the caller's scope.
 macro(ncnn_add_layer class)
    string(TOLOWER ${class} name)

    # WITH_LAYER_xxx option; an explicit second argument overrides the ON default
    if(NOT ${ARGC} EQUAL 2)
        option(WITH_LAYER_${name} "build with layer ${name}" ON)
    else()
        option(WITH_LAYER_${name} "build with layer ${name}" ${ARGV1})
    endif()

    message("WITH_LAYER_${name} = ${WITH_LAYER_${name}}")

    if(NOT WITH_LAYER_${name})
        # layer switched off: null creator in the registry, nothing to compile
        file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/layer_registry.h "#if NCNN_STRING\n{\"${class}\",0},\n#else\n{0},\n#endif\n")
    else()
        list(APPEND ncnn_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/layer/${name}.cpp")

        # pick up the arch specific implementation when one exists
        if((ANDROID AND (("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "armv7-a")
                      OR ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64")))
            OR (IOS AND (("${CMAKE_OSX_ARCHITECTURES}" STREQUAL "armv7")
                      OR ("${CMAKE_OSX_ARCHITECTURES}" STREQUAL "arm64")
                      OR ("${CMAKE_OSX_ARCHITECTURES}" STREQUAL "armv7;arm64"))))
            # armv7 / aarch64 optimized implementation
            if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/layer/arm/${name}_arm.cpp")
                list(APPEND ncnn_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/layer/arm/${name}_arm.cpp")
                set(WITH_LAYER_${name}_arm 1)
            endif()
        else()
            # every other target is offered the x86 implementation
            if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/layer/x86/${name}_x86.cpp")
                list(APPEND ncnn_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/layer/x86/${name}_x86.cpp")
                set(WITH_LAYER_${name}_x86 1)
            endif()
        endif()

        # generate the layer_declaration and layer_registry entries
        if(WITH_LAYER_${name}_arm)
            file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/layer_declaration.h
                "extern Layer* ${class}_arm_layer_creator();\n")
            file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/layer_registry.h
                "#if NCNN_STRING\n{\"${class}\",${class}_arm_layer_creator},\n#else\n{${class}_arm_layer_creator},\n#endif\n")
        elseif(WITH_LAYER_${name}_x86)
            file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/layer_declaration.h
                "extern Layer* ${class}_x86_layer_creator();\n")
            file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/layer_registry.h
                "#if NCNN_STRING\n{\"${class}\",${class}_x86_layer_creator},\n#else\n{${class}_x86_layer_creator},\n#endif\n")
        else()
            file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/layer_declaration.h
                "extern Layer* ${class}_layer_creator();\n")
            file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/layer_registry.h
                "#if NCNN_STRING\n{\"${class}\",${class}_layer_creator},\n#else\n{${class}_layer_creator},\n#endif\n")
        endif()
    endif()
 endmacro()

 # create new
 # ncnn_add_layer() only ever appends to these two generated headers, so they
 # are removed first to start from empty files on every configure run.
 file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/layer_declaration.h)
 file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/layer_registry.h)

 # layer implementation
 # Every call appends one entry to layer_registry.h, so the order of the calls
 # is the order of the entries. Layers marked OFF are disabled by default and
 # contribute a null entry.
 # NOTE(review): presumably the entry position is meaningful to the code that
 # consumes the registry - confirm before reordering.
 ncnn_add_layer(AbsVal)
 ncnn_add_layer(ArgMax OFF)
 ncnn_add_layer(BatchNorm)
 ncnn_add_layer(Bias)
 ncnn_add_layer(BNLL)
 ncnn_add_layer(Concat)
 ncnn_add_layer(Convolution)
 ncnn_add_layer(Crop)
 ncnn_add_layer(Deconvolution)
 ncnn_add_layer(Dropout)
 ncnn_add_layer(Eltwise)
 ncnn_add_layer(ELU)
 ncnn_add_layer(Embed OFF)
 ncnn_add_layer(Exp)
 ncnn_add_layer(Flatten)
 ncnn_add_layer(InnerProduct)
 ncnn_add_layer(Input)
 ncnn_add_layer(Log)
 ncnn_add_layer(LRN)
 ncnn_add_layer(MemoryData OFF)
 ncnn_add_layer(MVN)
 ncnn_add_layer(Pooling)
 ncnn_add_layer(Power)
 ncnn_add_layer(PReLU)
 ncnn_add_layer(Proposal OFF)
 ncnn_add_layer(Reduction OFF)
 ncnn_add_layer(ReLU)
 ncnn_add_layer(Reshape OFF)
 ncnn_add_layer(ROIPooling OFF)
 ncnn_add_layer(Scale)
 ncnn_add_layer(Sigmoid)
 ncnn_add_layer(Slice)
 ncnn_add_layer(Softmax)
 ncnn_add_layer(Split)
 ncnn_add_layer(SPP OFF)
 ncnn_add_layer(TanH)
 ncnn_add_layer(Threshold)
 ncnn_add_layer(Tile OFF)
 ncnn_add_layer(RNN OFF)
 ncnn_add_layer(LSTM OFF)

 # The static library is built from the base sources plus everything the
 # ncnn_add_layer() calls appended to ncnn_SRCS.
 add_library(ncnn STATIC ${ncnn_SRCS})

 # Install the archive into lib/ and the public headers, including the
 # generated platform.h, into include/.
 install(TARGETS ncnn ARCHIVE DESTINATION lib)
 install(FILES
    blob.h
    cpu.h
    layer.h
    mat.h
    net.h
    opencv.h
    ${CMAKE_CURRENT_BINARY_DIR}/platform.h
    DESTINATION include
 )
--- a/src/blob.cpp
+++ b/src/blob.cpp
@@ -0,0 +1,24 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "blob.h"

 namespace ncnn {

 // A new blob has no producing layer yet, which is recorded as index -1.
 Blob::Blob()
    : producer(-1)
 {
 }

 } // namespace ncnn
--- a/src/blob.h
+++ b/src/blob.h
@@ -0,0 +1,43 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef NCNN_BLOB_H
 #define NCNN_BLOB_H

 #include <string>
 #include <vector>
 #include "platform.h"

 namespace ncnn {

 // Bookkeeping record for one blob: which layer produces it and which layers
 // consume it. Layers are referred to by index.
 class Blob
 {
 public:
    // creates a blob without a producer (producer == -1)
    Blob();

 public:
 #if NCNN_STRING
    // blob name, only present when NCNN_STRING is enabled
    std::string name;
 #endif // NCNN_STRING
    // index of the layer which produces this blob as output
    int producer;
    // indices of the layers which need this blob as input
    std::vector<int> consumers;
 };

 } // namespace ncnn

 #endif // NCNN_BLOB_H
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -0,0 +1,471 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "cpu.h"

 #include <stdio.h>
 #include <string.h>
 #include <vector>

 #ifdef _OPENMP
 #include <omp.h>
 #endif

 #ifdef __ANDROID__
 #include <sys/syscall.h>
 #include <unistd.h>
 #endif

 #if __APPLE__
 #include "TargetConditionals.h"
 #if TARGET_OS_IPHONE
 #include <sys/types.h>
 #include <sys/sysctl.h>
 #include <mach/machine.h>
 #define __IOS__ 1
 #endif
 #endif

 namespace ncnn {

 #ifdef __ANDROID__

 // Extract the ELF hardware capability bitmap (AT_HWCAP) from /proc/self/auxv.
 //
 // The file is a sequence of (id, value) entries, each field being one
 // unsigned long of the running process, terminated by an entry of two zeros.
 // Returns the AT_HWCAP value (truncated to unsigned int), or 0 when the file
 // cannot be opened or contains no AT_HWCAP entry.
 static unsigned int get_elf_hwcap_from_proc_self_auxv()
 {
    FILE* fp = fopen("/proc/self/auxv", "rb");
    if (!fp)
    {
        return 0;
    }

 #ifndef AT_HWCAP
 #define AT_HWCAP 16
 #endif
 #ifndef AT_HWCAP2
 #define AT_HWCAP2 26
 #endif

    // unsigned long follows the entry width of the process: 4 byte fields on
    // 32 bit targets, 8 byte fields on 64 bit targets. Fixed 4 byte fields
    // would split every 64 bit entry in two and never see the real value.
    struct { unsigned long tag; unsigned long value; } entry;

    unsigned int result = 0;
    while (fread((char*)&entry, sizeof(entry), 1, fp) == 1)
    {
        // terminating entry
        if (entry.tag == 0 && entry.value == 0)
            break;

        if (entry.tag == AT_HWCAP)
        {
            result = (unsigned int)entry.value;
            break;
        }
    }

    fclose(fp);

    return result;
 }

 // hardware capability bitmap, read once during static initialization
 static unsigned int g_hwcaps = get_elf_hwcap_from_proc_self_auxv();

 // bit masks tested against g_hwcaps by the cpu_support_arm_* functions
 #if __aarch64__
 // from arch/arm64/include/uapi/asm/hwcap.h
 #define HWCAP_ASIMD     (1 << 1)
 #define HWCAP_ASIMDHP   (1 << 10)
 #else
 // from arch/arm/include/uapi/asm/hwcap.h
 #define HWCAP_NEON      (1 << 12)
 #define HWCAP_VFPv4     (1 << 16)
 #endif

 #endif // __ANDROID__

 #if __IOS__
 // Read the "hw.cputype" sysctl. The result of sysctlbyname is not checked, so
 // the initial 0 is returned when the call does not fill in the value.
 static cpu_type_t get_hw_cputype()
 {
    cpu_type_t cputype = 0;
    size_t cputype_size = sizeof(cputype);

    sysctlbyname("hw.cputype", &cputype, &cputype_size, NULL, 0);

    return cputype;
 }

 // Read the "hw.cpusubtype" sysctl. The result of sysctlbyname is not checked,
 // so the initial 0 is returned when the call does not fill in the value.
 static cpu_subtype_t get_hw_cpusubtype()
 {
    cpu_subtype_t cpusubtype = 0;
    size_t cpusubtype_size = sizeof(cpusubtype);

    sysctlbyname("hw.cpusubtype", &cpusubtype, &cpusubtype_size, NULL, 0);

    return cpusubtype;
 }

 // cpu type and subtype of the device, queried once during static initialization
 static cpu_type_t g_hw_cputype = get_hw_cputype();
 static cpu_subtype_t g_hw_cpusubtype = get_hw_cpusubtype();
 #endif // __IOS__

 // Report whether the cpu offers neon (armv7) / asimd (aarch64).
 // Returns nonzero if supported and 0 otherwise; on android the value is the
 // masked capability bit itself, not a normalized 1. Always 0 on targets that
 // are neither android nor ios.
 int cpu_support_arm_neon()
 {
 #ifdef __ANDROID__
 #if __aarch64__
    return g_hwcaps & HWCAP_ASIMD;
 #else
    return g_hwcaps & HWCAP_NEON;
 #endif
 #elif __IOS__
 #if __aarch64__
    return g_hw_cputype == CPU_TYPE_ARM64;
 #else
    // NOTE(review): the strict '>' rejects a subtype equal to CPU_SUBTYPE_ARM_V7 - confirm this is intended
    return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7;
 #endif
 #else
    return 0;
 #endif
 }

 // Report whether the cpu offers vfpv4 (fp16 + fma).
 // Returns nonzero if supported and 0 otherwise; on android the value is the
 // masked capability bit itself, not a normalized 1. Always 0 on targets that
 // are neither android nor ios.
 int cpu_support_arm_vfpv4()
 {
 #ifdef __ANDROID__
 #if __aarch64__
    // neon always enable fma and fp16
    return g_hwcaps & HWCAP_ASIMD;
 #else
    return g_hwcaps & HWCAP_VFPv4;
 #endif
 #elif __IOS__
 #if __aarch64__
    return g_hw_cputype == CPU_TYPE_ARM64;
 #else
    // NOTE(review): the strict '>' rejects a subtype equal to CPU_SUBTYPE_ARM_V7S - confirm this is intended
    return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7S;
 #endif
 #else
    return 0;
 #endif
 }

 // Report whether the cpu offers asimd half precision.
 // Only android aarch64 performs a real test (the HWCAP_ASIMDHP bit, returned
 // unnormalized); every other configuration returns 0.
 int cpu_support_arm_asimdhp()
 {
 #ifdef __ANDROID__
 #if __aarch64__
    return g_hwcaps & HWCAP_ASIMDHP;
 #else
    return 0;
 #endif
 #elif __IOS__
    // no detection on ios: both branches report "not supported"
 #if __aarch64__
    return 0;
 #else
    return 0;
 #endif
 #else
    return 0;
 #endif
 }

 // Count the processors of the device.
 //   android: number of lines in /proc/cpuinfo starting with "processor"
 //   ios:     value of the "hw.ncpu" sysctl
 //   other:   1
 // The result is never smaller than 1; 1 is also returned when /proc/cpuinfo
 // cannot be opened.
 static int get_cpucount()
 {
 #ifdef __ANDROID__
    // get cpu count from /proc/cpuinfo
    FILE* fp = fopen("/proc/cpuinfo", "rb");
    if (!fp)
        return 1;

    int count = 0;
    char line[1024];
    // fgets returns NULL at end of file and on error, which ends the scan
    while (fgets(line, sizeof(line), fp))
    {
        // strncmp stops at the terminator of a short line, whereas memcmp
        // would compare 9 bytes no matter how much of the buffer was filled
        if (strncmp(line, "processor", 9) == 0)
        {
            count++;
        }
    }

    fclose(fp);

    if (count < 1)
        count = 1;

    return count;
 #elif __IOS__
    int count = 0;
    size_t len = sizeof(count);
    sysctlbyname("hw.ncpu", &count, &len, NULL, 0);

    if (count < 1)
        count = 1;

    return count;
 #else
    return 1;
 #endif
 }

 // processor count, determined once during static initialization
 static int g_cpucount = get_cpucount();

 // Return the cached processor count (at least 1, see get_cpucount).
 int get_cpu_count()
 {
    return g_cpucount;
 }

 #ifdef __ANDROID__
 // Return the largest frequency listed in the cpufreq time_in_state file of
 // the given cpu. Every record is read as two integers of which the first is
 // taken as the frequency in khz and the second is skipped.
 // Returns -1 when the file cannot be opened and 0 when it holds no record.
 static int get_max_freq_khz(int cpuid)
 {
    char stats_path[256];
    sprintf(stats_path, "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);

    FILE* stats = fopen(stats_path, "rb");
    if (stats == NULL)
        return -1;

    int best_khz = 0;
    int current_khz = 0;
    // stop at the first record that cannot be parsed, which includes end of file
    while (fscanf(stats, "%d %*d", &current_khz) == 1)
    {
        if (best_khz < current_khz)
            best_khz = current_khz;
    }

    fclose(stats);

    return best_khz;
 }

 // Restrict the calling thread to the cpus listed in cpuids by issuing the
 // sched_setaffinity system call directly.
 // Returns 0 on success; on failure the syscall result is printed to stderr
 // and -1 is returned.
 static int set_sched_affinity(const std::vector<int>& cpuids)
 {
    // cpu_set_t definition
    // ref http://stackoverflow.com/questions/16319725/android-set-thread-affinity
    // the mask type and its helper macros are declared locally: a bitmap with
    // room for CPU_SETSIZE cpus stored in unsigned long words
 #define CPU_SETSIZE 1024
 #define __NCPUBITS  (8 * sizeof (unsigned long))
 typedef struct
 {
   unsigned long __bits[CPU_SETSIZE / __NCPUBITS];
 } cpu_set_t;

 // set the bit of one cpu; the index is not range checked against CPU_SETSIZE
 #define CPU_SET(cpu, cpusetp) \
  ((cpusetp)->__bits[(cpu)/__NCPUBITS] |= (1UL << ((cpu) % __NCPUBITS)))

 // clear the whole mask
 #define CPU_ZERO(cpusetp) \
  memset((cpusetp), 0, sizeof(cpu_set_t))

    // set affinity for thread
    // gettid identifies the calling thread, so only that thread is affected
    pid_t pid = gettid();

    cpu_set_t mask;
    CPU_ZERO(&mask);
    for (int i=0; i<(int)cpuids.size(); i++)
    {
        CPU_SET(cpuids[i], &mask);
    }

    int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask);
    if (syscallret)
    {
        fprintf(stderr, "syscall error %d\n", syscallret);
        return -1;
    }

    return 0;
 }

 static int sort_cpuid_by_max_frequency(std::vector<int>& cpuids, int* little_cluster_offset)
 {
    const int cpu_count = cpuids.size();

    *little_cluster_offset = 0;

    if (cpu_count == 0)
        return 0;

    std::vector<int> cpu_max_freq_khz;
    cpu_max_freq_khz.resize(cpu_count);

    for (int i=0; i<cpu_count; i++)
    {
        int max_freq_khz = get_max_freq_khz(i);

 //         printf("%d max freq = %d khz\n", i, max_freq_khz);

        cpuids[i] = i;
        cpu_max_freq_khz[i] = max_freq_khz;
    }

    // sort cpuid as big core first
    // simple bubble sort
    for (int i=0; i<cpu_count; i++)
    {
        for (int j=i+1; j<cpu_count; j++)
        {
            if (cpu_max_freq_khz[i] < cpu_max_freq_khz[j])
            {
                // swap
                int tmp = cpuids[i];
                cpuids[i] = cpuids[j];
                cpuids[j] = tmp;

                tmp = cpu_max_freq_khz[i];
                cpu_max_freq_khz[i] = cpu_max_freq_khz[j];
                cpu_max_freq_khz[j] = tmp;
            }
        }
    }

    // SMP
    int mid_max_freq_khz = (cpu_max_freq_khz.front() + cpu_max_freq_khz.back()) / 2;
    if (mid_max_freq_khz == cpu_max_freq_khz.back())
        return 0;

    for (int i=0; i<cpu_count; i++)
    {
        if (cpu_max_freq_khz[i] < mid_max_freq_khz)
        {
            *little_cluster_offset = i;
            break;
        }
    }

    return 0;
 }
 #endif // __ANDROID__

 // powersave mode most recently applied successfully by set_cpu_powersave, 0 initially
 static int g_powersave = 0;

 // Return the current powersave mode.
 int get_cpu_powersave()
 {
    return g_powersave;
 }

 // Bind the threads to a subset of the cores.
 //   powersave 0 = all cores
 //   powersave 1 = cores from the little cluster offset onwards
 //   powersave 2 = cores before the little cluster offset
 // With OpenMP the thread count is set to the size of the subset and the
 // affinity is applied from inside a parallel loop; without OpenMP only the
 // calling thread is bound.
 // Returns 0 on success. Returns -1 when no little cluster was found, when
 // powersave is outside 0..2, when setting the affinity fails, and always on
 // targets other than android.
 int set_cpu_powersave(int powersave)
 {
 #ifdef __ANDROID__
    static std::vector<int> sorted_cpuids;
    static int little_cluster_offset = 0;

    // the frequency sorted core list is built on the first call only
    if (sorted_cpuids.empty())
    {
        // 0 ~ g_cpucount
        sorted_cpuids.resize(g_cpucount);
        for (int id = 0; id < g_cpucount; id++)
        {
            sorted_cpuids[id] = id;
        }

        // descent sort by max frequency
        sort_cpuid_by_max_frequency(sorted_cpuids, &little_cluster_offset);
    }

    if (little_cluster_offset == 0)
    {
        fprintf(stderr, "SMP cpu powersave not supported\n");
        return -1;
    }

    // prepare affinity cpuid
    std::vector<int> cpuids;
    switch (powersave)
    {
    case 0:
        cpuids = sorted_cpuids;
        break;
    case 1:
        cpuids.assign(sorted_cpuids.begin() + little_cluster_offset, sorted_cpuids.end());
        break;
    case 2:
        cpuids.assign(sorted_cpuids.begin(), sorted_cpuids.begin() + little_cluster_offset);
        break;
    default:
        fprintf(stderr, "powersave %d not supported\n", powersave);
        return -1;
    }

 #ifdef _OPENMP
    // set affinity for each thread
    int num_threads = cpuids.size();
    omp_set_num_threads(num_threads);
    std::vector<int> ssarets(num_threads, 0);
    #pragma omp parallel for
    for (int t = 0; t < num_threads; t++)
    {
        ssarets[t] = set_sched_affinity(cpuids);
    }
    for (int t = 0; t < num_threads; t++)
    {
        if (ssarets[t] != 0)
            return -1;
    }
 #else
    if (set_sched_affinity(cpuids) != 0)
        return -1;
 #endif

    // remember the mode only after it has been applied
    g_powersave = powersave;

    return 0;
 #elif __IOS__
    // thread affinity not supported on ios
    return -1;
 #else
    // TODO
    return -1;
 #endif
 }

 // Wrapper for omp_get_num_threads(); returns 1 when built without OpenMP.
 int get_omp_num_threads()
 {
 #ifdef _OPENMP
    return omp_get_num_threads();
 #else
    return 1;
 #endif
 }

 // Wrapper for omp_set_num_threads(); does nothing when built without OpenMP.
 void set_omp_num_threads(int num_threads)
 {
 #ifdef _OPENMP
    omp_set_num_threads(num_threads);
 #else
    // silence the unused parameter warning
    (void)num_threads;
 #endif
 }

 // Wrapper for omp_get_dynamic(); returns 0 when built without OpenMP.
 int get_omp_dynamic()
 {
 #ifdef _OPENMP
    return omp_get_dynamic();
 #else
    return 0;
 #endif
 }

 // Wrapper for omp_set_dynamic(); does nothing when built without OpenMP.
 void set_omp_dynamic(int dynamic)
 {
 #ifdef _OPENMP
    omp_set_dynamic(dynamic);
 #else
    // silence the unused parameter warning
    (void)dynamic;
 #endif
 }

 } // namespace ncnn
--- a/src/cpu.h
+++ b/src/cpu.h
@@ -0,0 +1,51 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef NCNN_CPU_H
 #define NCNN_CPU_H

 namespace ncnn {

 // test optional cpu features, each returns nonzero if the feature is available
 // neon = armv7 neon or aarch64 asimd
 int cpu_support_arm_neon();
 // vfpv4 = armv7 fp16 + fma
 int cpu_support_arm_vfpv4();
 // asimdhp = aarch64 asimd half precision
 int cpu_support_arm_asimdhp();

 // cpu info
 // number of processors
 int get_cpu_count();

 // bind all threads on little clusters if powersave enabled
 // affects HMP arch cpu like ARM big.LITTLE
 // only implemented on android at the moment
 // switching powersave is expensive and not thread-safe
 // 0 = all cores enabled(default)
 // 1 = only little clusters enabled
 // 2 = only big clusters enabled
 // the setter returns 0 on success
 int get_cpu_powersave();
 int set_cpu_powersave(int powersave);

 // misc function wrapper for openmp routines
 int get_omp_num_threads();
 void set_omp_num_threads(int num_threads);

 int get_omp_dynamic();
 void set_omp_dynamic(int dynamic);

 } // namespace ncnn

 #endif // NCNN_CPU_H
--- a/src/layer.cpp
+++ b/src/layer.cpp
@@ -0,0 +1,130 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "layer.h"

 #include <stdio.h>
 #include <string.h>

 namespace ncnn {

 // Construct a layer with both capability flags cleared: it is treated as a
 // multi-blob layer without in-place support until a subclass says otherwise.
 Layer::Layer()
     : one_blob_only(false), support_inplace(false)
 {
 }

 // Out-of-line definition of the virtual destructor; the base class has nothing to release explicitly.
 Layer::~Layer()
 {
 }

 #if NCNN_STDIO
 #if NCNN_STRING
 // Default plain-text parameter loader: reads nothing from the file and
 // reports success (0). Layers that have parameters override it.
 int Layer::load_param(FILE* /*paramfp*/)
 {
    return 0;
 }
 #endif // NCNN_STRING

 // Default binary parameter loader: reads nothing from the file and
 // reports success (0). Layers that have parameters override it.
 int Layer::load_param_bin(FILE* /*paramfp*/)
 {
    return 0;
 }

 // Default weight loader for file input: reads nothing and reports
 // success (0). Layers that carry weights override it.
 int Layer::load_model(FILE* /*binfp*/)
 {
    return 0;
 }
 #endif // NCNN_STDIO

 // Default in-memory parameter loader: leaves the mem cursor untouched and
 // reports success (0). Layers that have parameters override it.
 int Layer::load_param(const unsigned char*& /*mem*/)
 {
    return 0;
 }

 // Default in-memory weight loader: leaves the mem cursor untouched and
 // reports success (0). Layers that carry weights override it.
 int Layer::load_model(const unsigned char*& /*mem*/)
 {
    return 0;
 }

 // Default multi-blob inference: not implemented in the base class, so it
 // always fails with -1. Layers override the forward variant they support.
 int Layer::forward(const std::vector<Mat>& /*bottom_blobs*/, std::vector<Mat>& /*top_blobs*/) const
 {
    return -1;
 }

 // Default single-blob inference: not implemented in the base class, so it
 // always fails with -1. Layers override the forward variant they support.
 int Layer::forward(const Mat& /*bottom_blob*/, Mat& /*top_blob*/) const
 {
    return -1;
 }

 // Fallback in-place inference for multi-blob layers: run the regular
 // forward into a temporary list, then replace the caller's blobs with it.
 // The forward status is returned unchanged; the replacement happens
 // regardless of that status.
 int Layer::forward_inplace(std::vector<Mat>& bottom_top_blobs) const
 {
    std::vector<Mat> outputs;
    const int status = forward(bottom_top_blobs, outputs);

    bottom_top_blobs = outputs;

    return status;
 }

 // Fallback in-place inference for single-blob layers: run the regular
 // forward into a temporary Mat, then assign it over the caller's blob.
 // The forward status is returned unchanged; the assignment happens
 // regardless of that status.
 int Layer::forward_inplace(Mat& bottom_top_blob) const
 {
    Mat output;
    const int status = forward(bottom_top_blob, output);

    bottom_top_blob = output;

    return status;
 }

 // included inside namespace ncnn; presumably declares the per-layer creator
 // functions referenced by the table below (header not shown here, confirm)
 #include "layer_declaration.h"

 // table of registered layers; its entries are supplied by layer_registry.h
 // and are addressed by index in layer_to_index() and create_layer()
 static const layer_registry_entry layer_registry[] =
 {
 #include "layer_registry.h"
 };

 // number of entries in layer_registry
 static const int layer_registry_entry_count = sizeof(layer_registry) / sizeof(layer_registry_entry);

 #if NCNN_STRING
 // Look up a layer type name in the registry.
 // Returns the index of the first entry whose name equals type; when no
 // entry matches, logs a message to stderr and returns -1.
 int layer_to_index(const char* type)
 {
    int index = 0;
    while (index < layer_registry_entry_count)
    {
        const bool same_name = strcmp(type, layer_registry[index].name) == 0;
        if (same_name)
            return index;

        index++;
    }

    fprintf(stderr, "layer %s not exists\n", type);
    return -1;
 }
 #endif // NCNN_STRING

 // Instantiate the layer registered at the given registry index.
 // Returns a new layer from the entry's creator function, or 0 (after
 // logging to stderr) when the index is outside the registry or the entry
 // has no creator.
 Layer* create_layer(int index)
 {
    const bool in_range = index >= 0 && index < layer_registry_entry_count;
    if (!in_range)
    {
        fprintf(stderr, "layer index %d not exists\n", index);
        return 0;
    }

    const layer_creator_func make_layer = layer_registry[index].creator;
    if (make_layer)
        return make_layer();

    // the slot exists but carries a null creator
    fprintf(stderr, "layer index %d not enabled\n", index);
    return 0;
 }

 } // namespace ncnn
--- a/src/layer.h
+++ b/src/layer.h
@@ -0,0 +1,163 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef NCNN_LAYER_H
 #define NCNN_LAYER_H

 #include <stdio.h>
 #include <string>
 #include <vector>
 #include "mat.h"
 #include "platform.h"

 namespace ncnn {

 // Base class of every network layer.
 // The base implementations of the loaders report success without reading
 // anything, and the base forward() overloads fail with -1; a concrete layer
 // overrides the loaders and the forward variant it supports.
 class Layer
 {
 public:
    // clears one_blob_only and support_inplace
    Layer();
    // virtual destructor
    virtual ~Layer();

 #if NCNN_STDIO
 #if NCNN_STRING
    // load layer specific parameter from plain param file
    // return 0 if success
    virtual int load_param(FILE* paramfp);
 #endif // NCNN_STRING
    // load layer specific parameter from binary param file
    // return 0 if success
    virtual int load_param_bin(FILE* paramfp);

    // load layer specific weight data from model file
    // return 0 if success
    virtual int load_model(FILE* binfp);
 #endif // NCNN_STDIO

    // load layer specific parameter from memory
    // memory pointer is 32-bit aligned
    // mem is passed by reference so an implementation can advance it past what it read
    // return 0 if success
    virtual int load_param(const unsigned char*& mem);

    // load layer specific weight data from memory
    // memory pointer is 32-bit aligned
    // mem is passed by reference so an implementation can advance it past what it read
    // return 0 if success
    virtual int load_model(const unsigned char*& mem);

 public:
    // one input and one output blob
    bool one_blob_only;

    // support inplace inference
    bool support_inplace;

 public:
    // implement inference
    // the first overload takes lists of blobs, the second a single input and output blob
    // return 0 if success
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

    // implement inplace inference
    // the base versions call forward() into a temporary and assign the result back
    // return 0 if success
    virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs) const;
    virtual int forward_inplace(Mat& bottom_top_blob) const;

 public:
 #if NCNN_STRING
    // layer type name
    std::string type;
    // layer name
    std::string name;
 #endif // NCNN_STRING
    // blob index which this layer needs as input
    std::vector<int> bottoms;
    // blob index which this layer produces as output
    std::vector<int> tops;
 };

 // Integer identifiers of the built-in layer types.
 // NOTE(review): presumably each value is the index of that layer in the
 // registry table consumed by create_layer(); confirm against layer_registry.h
 namespace LayerType {
 enum
 {
    AbsVal      = 0,
    ArgMax      = 1,
    BatchNorm   = 2,
    Bias        = 3,
    BNLL        = 4,
    Concat      = 5,
    Convolution = 6,
    Crop        = 7,
    Deconvolution = 8,
    Dropout     = 9,
    ELU         = 10,
    Eltwise     = 11,
    Embed       = 12,
    Exp         = 13,
    Flatten     = 14,
    InnerProduct = 15,
    Input       = 16,
    Log         = 17,
    LRN         = 18,
    MemoryData  = 19,
    MVN         = 20,
    Pooling     = 21,
    Power       = 22,
    PReLU       = 23,
    Proposal    = 24,
    Reduction   = 25,
    ReLU        = 26,
    Reshape     = 27,
    ROIPooling  = 28,
    Scale       = 29,
    Sigmoid     = 30,
    Slice       = 31,
    Softmax     = 32,
    Split       = 33,
    SPP         = 34,
    TanH        = 35,
    Threshold   = 36,
    Tile        = 37,
    RNN         = 38,
    LSTM        = 39,

    // single bit above the built-in id range
    // NOTE(review): presumably marks user-defined layer types; confirm at the use sites
    CustomBit   = (1<<8),
 };
 } // namespace LayerType

 // layer factory function
 // takes no arguments and returns a pointer to a newly created Layer
 typedef Layer* (*layer_creator_func)();

 // One row of the layer registry table.
 struct layer_registry_entry
 {
 #if NCNN_STRING
    // layer type name
    // compared against the requested type by layer_to_index()
    const char* name;
 #endif // NCNN_STRING
    // layer factory entry
    // create_layer() treats a null creator as "layer not enabled"
    layer_creator_func creator;
 };

 #if NCNN_STRING
 // get layer type from type name
 // returns the registry index, or -1 when the name is not registered
 int layer_to_index(const char* type);
 #endif // NCNN_STRING
 // create layer from layer type
 // returns 0 when the index is out of range or the layer is not enabled
 Layer* create_layer(int index);

 // Defines the factory function <name>_layer_creator(), which returns a
 // heap-allocated instance of class <name> as a Layer*.
 #define DEFINE_LAYER_CREATOR(name) \
    Layer* name##_layer_creator() { return new name; }

 } // namespace ncnn

 #endif // NCNN_LAYER_H
--- a/src/layer/absval.cpp
+++ b/src/layer/absval.cpp
@@ -0,0 +1,76 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "absval.h"

 namespace ncnn {

 // defines AbsVal_layer_creator(), the factory function returning a new AbsVal
 DEFINE_LAYER_CREATOR(AbsVal)

 // AbsVal maps one input blob to one output blob and also provides an
 // in-place implementation, so both capability flags are enabled.
 AbsVal::AbsVal()
 {
    one_blob_only = true;
    support_inplace = true;
 }

 int AbsVal::forward(const Mat& bottom_blob, Mat& top_blob) const
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int size = w * h;

    top_blob.create(w, h, channels);

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        const float* ptr = bottom_blob.channel(q);
        float* outptr = top_blob.channel(q);

        for (int i=0; i<size; i++)
        {
            if (ptr[i] < 0)
                outptr[i] = -ptr[i];
            else
                outptr[i] = ptr[i];
        }
    }

    return 0;
 }

 // Replace every negative element of bottom_top_blob with its negation,
 // leaving all other elements untouched. Always returns 0.
 int AbsVal::forward_inplace(Mat& bottom_top_blob) const
 {
    const int channels = bottom_top_blob.c;
    const int plane = bottom_top_blob.w * bottom_top_blob.h;

    // one channel per loop iteration; channels do not overlap
    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* cur = bottom_top_blob.channel(q);
        float* const end = cur + plane;

        for (; cur < end; ++cur)
        {
            const float v = *cur;
            if (v < 0)
                *cur = -v;
        }
    }

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/absval.h
+++ b/src/layer/absval.h
@@ -0,0 +1,36 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_ABSVAL_H
 #define LAYER_ABSVAL_H

 #include "layer.h"

 namespace ncnn {

 // Element-wise absolute value layer (one input blob, one output blob).
 class AbsVal : public Layer
 {
 public:
    // enables one_blob_only and support_inplace
    AbsVal();

    // writes |x| of every input element into a newly created top_blob; returns 0 if success
    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

    // same operation applied directly to the given blob; returns 0
    virtual int forward_inplace(Mat& bottom_top_blob) const;

 public:
    // no layer specific parameters
 };

 } // namespace ncnn

 #endif // LAYER_ABSVAL_H
--- a/src/layer/argmax.cpp
+++ b/src/layer/argmax.cpp
@@ -0,0 +1,108 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "argmax.h"
 #include <algorithm>
 #include <functional>

 namespace ncnn {

 // defines ArgMax_layer_creator(), the factory function returning a new ArgMax
 DEFINE_LAYER_CREATOR(ArgMax)

 // ArgMax only implements the single-blob forward(), so it is flagged as a
 // one-blob layer; the multi-blob forward() of the base class always fails.
 // The parameters get defined defaults so forward() never reads
 // indeterminate values if no loader has run: indices only, top 1
 // (the Caffe ArgMax defaults).
 ArgMax::ArgMax()
 {
    one_blob_only = true;

    out_max_val = 0;
    topk = 1;
 }

 #if NCNN_STDIO
 #if NCNN_STRING
 // Read out_max_val and topk as two whitespace-separated integers from the
 // plain-text param file. Returns 0 when both were parsed; otherwise logs
 // the fscanf result to stderr and returns -1.
 int ArgMax::load_param(FILE* paramfp)
 {
    const int fields_read = fscanf(paramfp, "%d %d", &out_max_val, &topk);
    if (fields_read == 2)
        return 0;

    fprintf(stderr, "ArgMax load_param failed %d\n", fields_read);
    return -1;
 }
 #endif // NCNN_STRING
 // Read out_max_val and topk as two raw ints from the binary param file.
 // Returns 0 when both values were read; a short read or I/O error is
 // logged to stderr and reported as -1 instead of being silently ignored.
 int ArgMax::load_param_bin(FILE* paramfp)
 {
    // fread returns the number of complete items read; anything but 1 is a failure
    if (fread(&out_max_val, sizeof(int), 1, paramfp) != 1)
    {
        fprintf(stderr, "ArgMax load_param_bin failed reading out_max_val\n");
        return -1;
    }

    if (fread(&topk, sizeof(int), 1, paramfp) != 1)
    {
        fprintf(stderr, "ArgMax load_param_bin failed reading topk\n");
        return -1;
    }

    return 0;
 }
 #endif // NCNN_STDIO

 // Read out_max_val and topk from memory as two consecutive ints and
 // advance the mem cursor by the 8 bytes consumed. Always returns 0.
 int ArgMax::load_param(const unsigned char*& mem)
 {
    const int* fields = (const int*)mem;

    out_max_val = fields[0];
    topk = fields[1];

    // two fields of 4 bytes each
    mem += 8;

    return 0;
 }

 // Select the topk largest values of bottom_blob, viewed as a flat array of
 // bottom_blob.total() floats.
 // Output layout:
 //   out_max_val == 0: top_blob is topk x 1 and holds the indices
 //   out_max_val != 0: top_blob is topk x 2; the first topk floats are the
 //                     values and the next topk floats are their indices
 // Entries are ordered from largest to smallest value; equal values are
 // ordered by descending index because (value, index) pairs are compared.
 // Returns 0 on success, -1 when topk is not within [1, size], or -100 when
 // the output blob is empty after create().
 int ArgMax::forward(const Mat& bottom_blob, Mat& top_blob) const
 {
    int size = bottom_blob.total();

    // std::partial_sort needs begin() + topk to stay inside the vector and
    // the output loops read vec[0..topk), so an out-of-range topk would be
    // undefined behaviour; reject it up front
    if (topk <= 0 || topk > size)
    {
        fprintf(stderr, "ArgMax topk %d out of range for blob size %d\n", topk, size);
        return -1;
    }

    if (out_max_val)
        top_blob.create(topk, 2);
    else
        top_blob.create(topk, 1);
    if (top_blob.empty())
        return -100;

    const float* ptr = bottom_blob;

    // partial sort topk with index
    // optional value
    std::vector< std::pair<float, int> > vec;
    vec.resize(size);
    for (int i=0; i<size; i++)
    {
        vec[i] = std::make_pair(ptr[i], i);
    }

    // only the first topk entries end up sorted (descending)
    std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
                        std::greater< std::pair<float, int> >());

    float* outptr = top_blob;
    if (out_max_val)
    {
        // second row of the output; it receives the indices
        float* valptr = outptr + topk;
        for (int i=0; i<topk; i++)
        {
            outptr[i] = vec[i].first;
            valptr[i] = vec[i].second;
        }
    }
    else
    {
        for (int i=0; i<topk; i++)
        {
            outptr[i] = vec[i].second;
        }
    }

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/argmax.h
+++ b/src/layer/argmax.h
@@ -0,0 +1,44 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_ARGMAX_H
 #define LAYER_ARGMAX_H

 #include "layer.h"

 namespace ncnn {

 // Layer that reports the topk largest elements of its input blob, as
 // indices and optionally also as values.
 class ArgMax : public Layer
 {
 public:
    ArgMax();

 #if NCNN_STDIO
 #if NCNN_STRING
    // reads "out_max_val topk" as two integers from the plain param file; returns 0 if success
    virtual int load_param(FILE* paramfp);
 #endif // NCNN_STRING
    // reads out_max_val and topk as raw ints from the binary param file
    virtual int load_param_bin(FILE* paramfp);
 #endif // NCNN_STDIO
    // reads out_max_val and topk from memory and advances mem by 8 bytes
    virtual int load_param(const unsigned char*& mem);

    // fills top_blob with the topk result; returns 0 if success
    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

 public:
    // non-zero: the output carries the values in addition to the indices
    int out_max_val;
    // number of largest elements to report
    int topk;
 };

 } // namespace ncnn

 #endif // LAYER_ARGMAX_H
--- a/src/layer/arm/absval_arm.cpp
+++ b/src/layer/arm/absval_arm.cpp
@@ -0,0 +1,152 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "absval_arm.h"

 #if __ARM_NEON
 #include <arm_neon.h>
 #endif // __ARM_NEON

 namespace ncnn {

 // defines AbsVal_arm_layer_creator(), the factory function returning a new AbsVal_arm
 DEFINE_LAYER_CREATOR(AbsVal_arm)

 // Element-wise absolute value of bottom_blob written to a newly created
 // top_blob of the same w, h and channel count.
 // With NEON, each channel is processed four floats at a time (intrinsics on
 // aarch64, inline assembly on armv7) and the leftover elements are handled
 // by the scalar loop; without NEON only the scalar loop runs.
 // Returns 0 on success, -100 when top_blob is empty after create().
 int AbsVal_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int size = w * h;

    top_blob.create(w, h, channels);
    if (top_blob.empty())
        return -100;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        const float* ptr = bottom_blob.channel(q);
        float* outptr = top_blob.channel(q);

 #if __ARM_NEON
        // nn = number of 4-float groups, remain = elements left for the scalar loop
        int nn = size >> 2;
        int remain = size - (nn << 2);
 #else
        int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
 #if __aarch64__
        for (; nn>0; nn--)
        {
            float32x4_t _p = vld1q_f32(ptr);
            float32x4_t _outp = vabsq_f32(_p);
            vst1q_f32(outptr, _outp);

            ptr += 4;
            outptr += 4;
        }
 #else
        // the asm loop tests its counter only after the first pass, so it must not be entered with nn == 0
        if (nn > 0)
        {
        asm volatile(
            "0:                             \n"
            "vld1.f32   {d0-d1}, [%1]!      \n" // q0 = 4 floats from ptr, ptr advances
            "vabs.f32   q0, q0              \n" // q0 = |q0|
            "subs       %0, #1              \n" // nn--, sets the flags used by bne
            "vst1.f32   {d0-d1}, [%2]!      \n" // store q0 to outptr, outptr advances
            "bne        0b                  \n" // loop while nn != 0
            : "=r"(nn),     // %0
              "=r"(ptr),    // %1
              "=r"(outptr)  // %2
            : "0"(nn),
              "1"(ptr),
              "2"(outptr)
            : "cc", "memory", "q0"
        );
        }
 #endif // __aarch64__
 #endif // __ARM_NEON
        // scalar tail; ptr and outptr already point past the vectorized part
        // NOTE(review): an input of exactly +0.0f takes the negation branch here
        for (; remain>0; remain--)
        {
            *outptr = *ptr > 0 ? *ptr : -*ptr;

            ptr++;
            outptr++;
        }
    }

    return 0;
 }

 // In-place element-wise absolute value of bottom_top_blob.
 // With NEON, each channel is processed four floats at a time (intrinsics on
 // aarch64, inline assembly on armv7) and the leftover elements are handled
 // by the scalar loop; without NEON only the scalar loop runs.
 // Always returns 0.
 int AbsVal_arm::forward_inplace(Mat& bottom_top_blob) const
 {
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

 #if __ARM_NEON
        // nn = number of 4-float groups, remain = elements left for the scalar loop
        int nn = size >> 2;
        int remain = size - (nn << 2);
 #else
        int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
 #if __aarch64__
        for (; nn>0; nn--)
        {
            float32x4_t _p = vld1q_f32(ptr);
            _p = vabsq_f32(_p);
            vst1q_f32(ptr, _p);

            ptr += 4;
        }
 #else
        // the asm loop tests its counter only after the first pass, so it must not be entered with nn == 0
        if (nn > 0)
        {
        asm volatile(
            "0:                             \n"
            "vld1.f32   {d0-d1}, [%1]       \n" // q0 = 4 floats from ptr (no advance yet)
            "vabs.f32   q0, q0              \n" // q0 = |q0|
            "subs       %0, #1              \n" // nn--, sets the flags used by bne
            "vst1.f32   {d0-d1}, [%1]!      \n" // store back to the same address, then ptr advances
            "bne        0b                  \n" // loop while nn != 0
            : "=r"(nn),     // %0
              "=r"(ptr)     // %1
            : "0"(nn),
              "1"(ptr)
            : "cc", "memory", "q0"
        );
        }
 #endif // __aarch64__
 #endif // __ARM_NEON
        // scalar tail; ptr already points past the vectorized part
        // NOTE(review): an input of exactly +0.0f takes the negation branch here
        for (; remain>0; remain--)
        {
            *ptr = *ptr > 0 ? *ptr : -*ptr;

            ptr++;
        }
    }

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/arm/absval_arm.h
+++ b/src/layer/arm/absval_arm.h
@@ -0,0 +1,34 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_ABSVAL_ARM_H
 #define LAYER_ABSVAL_ARM_H

 #include "absval.h"

 namespace ncnn {

 // ARM variant of AbsVal; overrides both inference entry points with
 // implementations that use NEON when __ARM_NEON is defined.
 class AbsVal_arm : public AbsVal
 {
 public:
    // writes |x| of every input element into a newly created top_blob; returns 0 if success
    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

    // same operation applied directly to the given blob; returns 0
    virtual int forward_inplace(Mat& bottom_top_blob) const;

 public:
    // no additional members
 };

 } // namespace ncnn

 #endif // LAYER_ABSVAL_ARM_H
--- a/src/layer/arm/batchnorm_arm.cpp
+++ b/src/layer/arm/batchnorm_arm.cpp
@@ -0,0 +1,186 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "batchnorm_arm.h"

 #if __ARM_NEON
 #include <arm_neon.h>
 #endif // __ARM_NEON

 namespace ncnn {

 // defines BatchNorm_arm_layer_creator(), the factory function returning a new BatchNorm_arm
 DEFINE_LAYER_CREATOR(BatchNorm_arm)

 // Apply the per-channel affine transform value = b * value + a to
 // bottom_blob and write the result to a newly created top_blob.
 // a and b are read per channel from a_data and b_data; the channel count is
 // the layer's own `channels` member, not bottom_blob.c.
 // With NEON, each channel is processed four floats at a time (intrinsics on
 // aarch64, inline assembly on armv7) and the leftover elements are handled
 // by the scalar loop.
 // Returns 0 on success, -100 when top_blob is empty after create().
 int BatchNorm_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
 {
    // a = bias - slope * mean / sqrt(var)
    // b = slope / sqrt(var)
    // value = b * value + a

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int size = w * h;

    top_blob.create(w, h, channels);
    if (top_blob.empty())
        return -100;

    const float* a_data_ptr = a_data;
    const float* b_data_ptr = b_data;
    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        const float* ptr = bottom_blob.channel(q);
        float* outptr = top_blob.channel(q);

        float a = a_data_ptr[q];
        float b = b_data_ptr[q];

 #if __ARM_NEON
        // nn = number of 4-float groups, remain = elements left for the scalar loop
        int nn = size >> 2;
        int remain = size - (nn << 2);
 #else
        int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
 #if __aarch64__
        float32x4_t _a = vdupq_n_f32(a);
        float32x4_t _b = vdupq_n_f32(b);
        for (; nn>0; nn--)
        {
            float32x4_t _p = vld1q_f32(ptr);
            float32x4_t _outp = _a;
            _outp = vfmaq_f32(_outp, _p, _b);
            vst1q_f32(outptr, _outp);

            ptr += 4;
            outptr += 4;
        }
 #else
        // the asm loop tests its counter only after the first pass, so it must not be entered with nn == 0
        // NOTE(review): the ":128" qualifiers require 16-byte aligned addresses; assumes
        // every channel pointer is aligned that way, confirm against Mat
        if (nn > 0)
        {
        asm volatile(
            "vdup.f32   q1, %6              \n" // q1 = a in all lanes
            "vdup.f32   q2, %7              \n" // q2 = b in all lanes
            "0:                             \n"
            "pld        [%1, #128]          \n" // prefetch ahead of the input pointer
            "vld1.f32   {d0-d1}, [%1 :128]! \n" // q0 = 4 floats from ptr, ptr advances
            "vorr.32    q3, q1, q1          \n" // q3 = q1 (copy of a)
            "vmla.f32   q3, q0, q2          \n" // q3 += q0 * q2
            "subs       %0, #1              \n" // nn--, sets the flags used by bne
            "vst1.f32   {d6-d7}, [%2 :128]! \n" // store q3 to outptr, outptr advances
            "bne        0b                  \n" // loop while nn != 0
            : "=r"(nn),     // %0
              "=r"(ptr),    // %1
              "=r"(outptr)  // %2
            : "0"(nn),
              "1"(ptr),
              "2"(outptr),
              "r"(a),       // %6
              "r"(b)        // %7
            : "cc", "memory", "q0", "q1", "q2", "q3"
        );
        }
 #endif // __aarch64__
 #endif // __ARM_NEON
        // scalar tail; ptr and outptr already point past the vectorized part
        for (; remain>0; remain--)
        {
            *outptr = b * *ptr + a;

            ptr++;
            outptr++;
        }
    }

    return 0;
 }

 // Apply the per-channel affine transform value = b * value + a directly to
 // bottom_top_blob.
 // a and b are read per channel from a_data and b_data; the channel count is
 // the layer's own `channels` member, not bottom_top_blob.c.
 // With NEON, each channel is processed four floats at a time (intrinsics on
 // aarch64, inline assembly on armv7) and the leftover elements are handled
 // by the scalar loop. Always returns 0.
 int BatchNorm_arm::forward_inplace(Mat& bottom_top_blob) const
 {
    // a = bias - slope * mean / sqrt(var)
    // b = slope / sqrt(var)
    // value = b * value + a

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int size = w * h;

    const float* a_data_ptr = a_data;
    const float* b_data_ptr = b_data;
    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        float a = a_data_ptr[q];
        float b = b_data_ptr[q];

 #if __ARM_NEON
        // nn = number of 4-float groups, remain = elements left for the scalar loop
        int nn = size >> 2;
        int remain = size - (nn << 2);
 #else
        int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
 #if __aarch64__
        float32x4_t _a = vdupq_n_f32(a);
        float32x4_t _b = vdupq_n_f32(b);
        for (; nn>0; nn--)
        {
            float32x4_t _p = vld1q_f32(ptr);
            float32x4_t _outp = _a;
            _outp = vfmaq_f32(_outp, _p, _b);
            vst1q_f32(ptr, _outp);

            ptr += 4;
        }
 #else
        // the asm loop tests its counter only after the first pass, so it must not be entered with nn == 0
        // NOTE(review): the ":128" qualifiers require 16-byte aligned addresses; assumes
        // every channel pointer is aligned that way, confirm against Mat
        if (nn > 0)
        {
        asm volatile(
            "vdup.f32   q1, %4              \n" // q1 = a in all lanes
            "vdup.f32   q2, %5              \n" // q2 = b in all lanes
            "0:                             \n"
            "pld        [%1, #128]          \n" // prefetch ahead of the data pointer
            "vld1.f32   {d0-d1}, [%1 :128]  \n" // q0 = 4 floats from ptr (no advance yet)
            "vorr.32    q3, q1, q1          \n" // q3 = q1 (copy of a)
            "vmla.f32   q3, q0, q2          \n" // q3 += q0 * q2
            "subs       %0, #1              \n" // nn--, sets the flags used by bne
            "vst1.f32   {d6-d7}, [%1 :128]! \n" // store q3 back to the same address, then ptr advances
            "bne        0b                  \n" // loop while nn != 0
            : "=r"(nn),     // %0
              "=r"(ptr)     // %1
            : "0"(nn),
              "1"(ptr),
              "r"(a),       // %4
              "r"(b)        // %5
            : "cc", "memory", "q0", "q1", "q2", "q3"
        );
        }
 #endif // __aarch64__
 #endif // __ARM_NEON
        // scalar tail; ptr already points past the vectorized part
        for (; remain>0; remain--)
        {
            *ptr = b * *ptr + a;

            ptr++;
        }
    }

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/arm/batchnorm_arm.h
+++ b/src/layer/arm/batchnorm_arm.h
@@ -0,0 +1,32 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_BATCHNORM_ARM_H
 #define LAYER_BATCHNORM_ARM_H

 #include "batchnorm.h"

 namespace ncnn {

 // ARM variant of BatchNorm; overrides both inference entry points with
 // implementations that use NEON when __ARM_NEON is defined.
 class BatchNorm_arm : public BatchNorm
 {
 public:
    // computes b * value + a per channel into a newly created top_blob; returns 0 if success
    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

    // same transform applied directly to the given blob; returns 0
    virtual int forward_inplace(Mat& bottom_top_blob) const;
 };

 } // namespace ncnn

 #endif // LAYER_BATCHNORM_ARM_H
--- a/src/layer/arm/bias_arm.cpp
+++ b/src/layer/arm/bias_arm.cpp
@@ -0,0 +1,122 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "bias_arm.h"

 #if __ARM_NEON
 #include <arm_neon.h>
 #endif // __ARM_NEON

 namespace ncnn {

 // defines Bias_arm_layer_creator(), the factory function returning a new Bias_arm
 DEFINE_LAYER_CREATOR(Bias_arm)

 // Add the per-channel bias from bias_data to every element of bottom_blob
 // and write the sums to a newly created top_blob of the same dimensions.
 // With NEON, four floats are handled per step and the remaining elements
 // fall through to the scalar loop.
 // Returns 0 on success, -100 when top_blob is empty after create().
 int Bias_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
 {
    const int channels = bottom_blob.c;
    const int size = bottom_blob.w * bottom_blob.h;

    top_blob.create(bottom_blob.w, bottom_blob.h, channels);
    if (top_blob.empty())
        return -100;

    const float* bias_ptr = bias_data;
    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        const float* src = bottom_blob.channel(q);
        float* dst = top_blob.channel(q);

        const float bias = bias_ptr[q];

        // i counts the elements of this channel already written
        int i = 0;

 #if __ARM_NEON
        const float32x4_t _bias = vdupq_n_f32(bias);
        for (; i + 3 < size; i += 4)
        {
            vst1q_f32(dst + i, vaddq_f32(vld1q_f32(src + i), _bias));
        }
 #endif // __ARM_NEON

        // scalar loop: everything without NEON, the last size % 4 elements with it
        for (; i < size; i++)
        {
            dst[i] = src[i] + bias;
        }
    }

    return 0;
 }

 // Add the per-channel bias from bias_data to every element of
 // bottom_top_blob in place.
 // With NEON, four floats are handled per step and the remaining elements
 // fall through to the scalar loop. Always returns 0.
 int Bias_arm::forward_inplace(Mat& bottom_top_blob) const
 {
    const int channels = bottom_top_blob.c;
    const int size = bottom_top_blob.w * bottom_top_blob.h;

    const float* bias_ptr = bias_data;
    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* data = bottom_top_blob.channel(q);

        const float bias = bias_ptr[q];

        // i counts the elements of this channel already updated
        int i = 0;

 #if __ARM_NEON
        const float32x4_t _bias = vdupq_n_f32(bias);
        for (; i + 3 < size; i += 4)
        {
            vst1q_f32(data + i, vaddq_f32(vld1q_f32(data + i), _bias));
        }
 #endif // __ARM_NEON

        // scalar loop: everything without NEON, the last size % 4 elements with it
        for (; i < size; i++)
        {
            data[i] = data[i] + bias;
        }
    }

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/arm/bias_arm.h
+++ b/src/layer/arm/bias_arm.h
@@ -0,0 +1,32 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_BIAS_ARM_H
 #define LAYER_BIAS_ARM_H

 #include "bias.h"

 namespace ncnn {

 // ARM variant of Bias; overrides both inference entry points with
 // implementations that use NEON when __ARM_NEON is defined.
 class Bias_arm : public Bias
 {
 public:
    // adds the per-channel bias into a newly created top_blob; returns 0 if success
    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

    // same addition applied directly to the given blob; returns 0
    virtual int forward_inplace(Mat& bottom_top_blob) const;
 };

 } // namespace ncnn

 #endif // LAYER_BIAS_ARM_H
--- a/src/layer/arm/convolution_1x1.h
+++ b/src/layer/arm/convolution_1x1.h
@@ -0,0 +1,543 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #if __ARM_NEON
 #include <arm_neon.h>
 #endif // __ARM_NEON

 // 1x1 convolution with stride 1.
 //
 // For every output channel p the output plane is first filled with the bias
 // (bias[p], or 0 when _bias converts to a null pointer). Each input channel q
 // is then accumulated into it, scaled by the single weight kernel[p*inch + q]:
 //
 //     out[p][i] = bias[p] + sum_q in[q][i] * kernel[p*inch + q]
 //
 // Input channels are consumed four at a time, with a one-at-a-time loop for
 // the leftover channels. Inside each of those, the plane is walked as a flat
 // run of outw*outh floats: 8 floats per iteration in the NEON paths
 // (intrinsics on AArch64, inline asm on ARMv7), then a scalar tail.
 //
 // NOTE(review): because the plane is treated as one flat run, this presumably
 // requires the input plane to have the same w/h as the output and no padding
 // between rows - confirm with the caller.
 //
 // bottom_blob : input blob (read only)
 // top_blob    : output blob, must already be sized; it is overwritten
 // _kernel     : weights, indexed as [p*inch + q]
 // _bias       : per-output-channel bias, may be empty
 static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
 {
    // w and h are read here but not referenced anywhere below.
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const float* kernel = _kernel;
    const float* bias = _bias;

    // Each output channel is independent, so the channels are split across threads.
    #pragma omp parallel for
    for (int p=0; p<outch; p++)
    {
        Mat out = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;

        // Start from the bias; every input channel below adds onto this plane.
        out.fill(bias0);

        int q = 0;

        // Main loop: four input channels per pass over the output plane.
        for (; q+3<inch; q+=4)
        {
            float* outptr = out;

            const float* img0 = bottom_blob.channel(q);
            const float* img1 = bottom_blob.channel(q+1);
            const float* img2 = bottom_blob.channel(q+2);
            const float* img3 = bottom_blob.channel(q+3);

            // The four weights for (p, q..q+3) are adjacent in the kernel array.
            const float* kernel0 = kernel + p*inch  + q;
            const float k0 = kernel0[0];
            const float k1 = kernel0[1];
            const float k2 = kernel0[2];
            const float k3 = kernel0[3];

            const float* r0 = img0;
            const float* r1 = img1;
            const float* r2 = img2;
            const float* r3 = img3;

            int size = outw * outh;

 #if __ARM_NEON
            // nn = number of 8-float blocks, remain = scalar tail (size % 8).
            int nn = size >> 3;
            int remain = size & 7;
 #else
            int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
            // Broadcast each scalar weight to all four lanes.
            float32x4_t _k0 = vdupq_n_f32(k0);
            float32x4_t _k1 = vdupq_n_f32(k1);
            float32x4_t _k2 = vdupq_n_f32(k2);
            float32x4_t _k3 = vdupq_n_f32(k3);
 #if __aarch64__
            // AArch64: two 4-lane accumulators (_outp, _outpn) cover 8 outputs.
            for (; nn>0; nn--)
            {
                float32x4_t _p = vld1q_f32(r0);
                float32x4_t _pn = vld1q_f32(r0+4);

                float32x4_t _outp = vld1q_f32(outptr);
                float32x4_t _outpn = vld1q_f32(outptr+4);

                _outp = vfmaq_f32(_outp, _p, _k0);
                _outpn = vfmaq_f32(_outpn, _pn, _k0);

                float32x4_t _p1 = vld1q_f32(r1);
                float32x4_t _p1n = vld1q_f32(r1+4);

                _outp = vfmaq_f32(_outp, _p1, _k1);
                _outpn = vfmaq_f32(_outpn, _p1n, _k1);

                float32x4_t _p2 = vld1q_f32(r2);
                float32x4_t _p2n = vld1q_f32(r2+4);

                _outp = vfmaq_f32(_outp, _p2, _k2);
                _outpn = vfmaq_f32(_outpn, _p2n, _k2);

                float32x4_t _p3 = vld1q_f32(r3);
                float32x4_t _p3n = vld1q_f32(r3+4);

                _outp = vfmaq_f32(_outp, _p3, _k3);
                _outpn = vfmaq_f32(_outpn, _p3n, _k3);

                vst1q_f32(outptr, _outp);
                vst1q_f32(outptr+4, _outpn);

                r0 += 8;
                r1 += 8;
                r2 += 8;
                r3 += 8;
                outptr += 8;
            }
 #else
            // ARMv7: hand-scheduled loop. The asm body is a do-while (the
            // count is tested after the first iteration), hence the nn > 0 guard.
            if (nn > 0)
            {
            // Operands: %0 = nn, %1 = outptr, %2..%5 = r0..r3 (read/write via
            // the matching "0".."5" inputs, which take numbers %6..%11), and
            // %12..%15 = the broadcast weights _k0.._k3.
            //
            // q0/q1 hold 8 output floats, q2/q3 hold 8 input floats.
            // The first 8 floats of r0 are loaded before the loop and the next
            // r0 block is loaded at the bottom of every iteration, so on exit
            // r0 has been advanced one block (32 bytes) too far; the final
            // "sub %2, #32" undoes that before the scalar tail runs.
            //
            // NOTE(review): the ":128" qualifiers are alignment hints for
            // 16-byte-aligned addresses - confirm every channel pointer and the
            // 32-byte stride satisfy that, and that the trailing r0 pre-load
            // cannot run past the end of the allocation.
            asm volatile(
                "pld        [%2, #256]          \n"
                "vld1.f32   {d4-d7}, [%2 :128]! \n"
                "0:                             \n"
                "pld        [%1, #256]          \n"
                "vld1.f32   {d0-d3}, [%1 :128]  \n"
                "vmla.f32   q0, q2, %q12        \n"
                "vmla.f32   q1, q3, %q12        \n"
                "pld        [%3, #256]          \n"
                "vld1.f32   {d4-d7}, [%3 :128]! \n"
                "vmla.f32   q0, q2, %q13        \n"
                "vmla.f32   q1, q3, %q13        \n"
                "pld        [%4, #256]          \n"
                "vld1.f32   {d4-d7}, [%4 :128]! \n"
                "vmla.f32   q0, q2, %q14        \n"
                "vmla.f32   q1, q3, %q14        \n"
                "pld        [%5, #256]          \n"
                "vld1.f32   {d4-d7}, [%5 :128]! \n"
                "vmla.f32   q0, q2, %q15        \n"
                "vmla.f32   q1, q3, %q15        \n"
                "pld        [%2, #256]          \n"
                "vld1.f32   {d4-d7}, [%2 :128]! \n"
                "subs       %0, #1              \n"
                "vst1.f32   {d0-d3}, [%1 :128]! \n"
                "bne        0b                  \n"
                "sub        %2, #32             \n"
                : "=r"(nn),     // %0
                  "=r"(outptr), // %1
                  "=r"(r0),     // %2
                  "=r"(r1),     // %3
                  "=r"(r2),     // %4
                  "=r"(r3)      // %5
                : "0"(nn),
                  "1"(outptr),
                  "2"(r0),
                  "3"(r1),
                  "4"(r2),
                  "5"(r3),
                  "w"(_k0),     // %12
                  "w"(_k1),     // %13
                  "w"(_k2),     // %14
                  "w"(_k3)      // %15
                : "cc", "memory", "q0", "q1", "q2", "q3"
            );
            }
 #endif // __aarch64__
 #endif // __ARM_NEON
            // Scalar tail (or the whole plane when NEON is unavailable).
            for (; remain>0; remain--)
            {
                float sum = *r0 * k0;
                float sum1 = *r1 * k1;
                float sum2 = *r2 * k2;
                float sum3 = *r3 * k3;

                *outptr += sum + sum1 + sum2 + sum3;

                r0++;
                r1++;
                r2++;
                r3++;
                outptr++;
            }

        }

        // Leftover input channels (inch % 4), one channel per pass.
        for (; q<inch; q++)
        {
            float* outptr = out;

            const float* img0 = bottom_blob.channel(q);

            const float* kernel0 = kernel + p*inch  + q;
            const float k0 = kernel0[0];

            const float* r0 = img0;

            int size = outw * outh;

 #if __ARM_NEON
            int nn = size >> 3;
            int remain = size & 7;
 #else
            int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
            float32x4_t _k0 = vdupq_n_f32(k0);
 #if __aarch64__
            for (; nn>0; nn--)
            {
                float32x4_t _p = vld1q_f32(r0);
                float32x4_t _outp = vld1q_f32(outptr);

                float32x4_t _pn = vld1q_f32(r0+4);
                float32x4_t _outpn = vld1q_f32(outptr+4);

                _outp = vfmaq_f32(_outp, _p, _k0);
                _outpn = vfmaq_f32(_outpn, _pn, _k0);

                vst1q_f32(outptr, _outp);
                vst1q_f32(outptr+4, _outpn);

                r0 += 8;
                outptr += 8;
            }
 #else
            if (nn > 0)
            {
            // Single-channel version of the pipelined loop above:
            // %0 = nn, %1 = outptr, %2 = r0, %6 = broadcast weight _k0.
            // The trailing "sub %2, #32" again rewinds the extra r0 pre-load.
            asm volatile(
                "pld        [%2, #256]          \n"
                "vld1.f32   {d4-d7}, [%2 :128]! \n"
                "0:                             \n"
                "pld        [%1, #256]          \n"
                "vld1.f32   {d0-d3}, [%1 :128]  \n"
                "vmla.f32   q0, q2, %q6         \n"
                "vmla.f32   q1, q3, %q6         \n"
                "pld        [%2, #256]          \n"
                "vld1.f32   {d4-d7}, [%2 :128]! \n"
                "subs       %0, #1              \n"
                "vst1.f32   {d0-d3}, [%1 :128]! \n"
                "bne        0b                  \n"
                "sub        %2, #32             \n"
                : "=r"(nn),     // %0
                  "=r"(outptr), // %1
                  "=r"(r0)      // %2
                : "0"(nn),
                  "1"(outptr),
                  "2"(r0),
                  "w"(_k0)      // %6
                : "cc", "memory", "q0", "q1", "q2", "q3"
            );
            }
 #endif // __aarch64__
 #endif // __ARM_NEON
            for (; remain>0; remain--)
            {
                float sum = *r0 * k0;

                *outptr += sum;

                r0++;
                outptr++;
            }

        }
    }

 }

 // 1x1 convolution with stride 2.
 //
 // For every output channel p the output plane is first filled with the bias
 // (bias[p], or 0 when _bias converts to a null pointer). Each input channel q
 // is then accumulated into it, sampling every second input column and every
 // second input row:
 //
 //     out[p][y][x] = bias[p] + sum_q in[q][2*y][2*x] * kernel[p*inch + q]
 //
 // Input channels are consumed four at a time, with a one-at-a-time loop for
 // the leftover channels. Each output row is produced 8 outputs per iteration
 // in the NEON paths (intrinsics on AArch64, inline asm on ARMv7), followed by
 // a scalar tail.
 //
 // bottom_blob : input blob (read only)
 // top_blob    : output blob, must already be sized; it is overwritten
 // _kernel     : weights, indexed as [p*inch + q]
 // _bias       : per-output-channel bias, may be empty
 static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
 {
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    // After one output row the input pointer has moved 2*outw floats.
    // Adding tailstep (= 2*w - 2*outw) makes the total advance exactly two
    // input rows, i.e. it skips the rest of this row plus the odd row.
    const int tailstep = w - 2*outw + w;

    const float* kernel = _kernel;
    const float* bias = _bias;

    // Each output channel is independent, so the channels are split across threads.
    #pragma omp parallel for
    for (int p=0; p<outch; p++)
    {
        Mat out = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;

        // Start from the bias; every input channel below adds onto this plane.
        out.fill(bias0);

        int q = 0;

        // Main loop: four input channels per pass over the output plane.
        for (; q+3<inch; q+=4)
        {
            float* outptr = out;

            const float* img0 = bottom_blob.channel(q);
            const float* img1 = bottom_blob.channel(q+1);
            const float* img2 = bottom_blob.channel(q+2);
            const float* img3 = bottom_blob.channel(q+3);

            // The four weights for (p, q..q+3) are adjacent in the kernel array.
            const float* kernel0 = kernel + p*inch  + q;
            const float k0 = kernel0[0];
            const float k1 = kernel0[1];
            const float k2 = kernel0[2];
            const float k3 = kernel0[3];

            const float* r0 = img0;
            const float* r1 = img1;
            const float* r2 = img2;
            const float* r3 = img3;

            for (int i = 0; i < outh; i++)
            {
 #if __ARM_NEON
                // nn = number of 8-output blocks, remain = scalar tail (outw % 8).
                int nn = outw >> 3;
                int remain = outw & 7;
 #else
                int remain = outw;
 #endif // __ARM_NEON

 #if __ARM_NEON
                // Broadcast each scalar weight to all four lanes.
                float32x4_t _k0 = vdupq_n_f32(k0);
                float32x4_t _k1 = vdupq_n_f32(k1);
                float32x4_t _k2 = vdupq_n_f32(k2);
                float32x4_t _k3 = vdupq_n_f32(k3);
 #if __aarch64__
                // vld2q_f32 de-interleaves 8 consecutive floats; val[0] holds
                // the even-indexed ones, which are the stride-2 samples we need.
                for (; nn>0; nn--)
                {
                    float32x4x2_t _px2 = vld2q_f32(r0);
                    float32x4_t _p = _px2.val[0];
                    float32x4_t _outp = vld1q_f32(outptr);

                    float32x4x2_t _pnx2 = vld2q_f32(r0+8);
                    float32x4_t _pn = _pnx2.val[0];
                    float32x4_t _outpn = vld1q_f32(outptr+4);

                    _outp = vmlaq_f32(_outp, _p, _k0);
                    _outpn = vmlaq_f32(_outpn, _pn, _k0);

                    float32x4x2_t _p1x2 = vld2q_f32(r1);
                    float32x4_t _p1 = _p1x2.val[0];
                    float32x4x2_t _p1nx2 = vld2q_f32(r1+8);
                    float32x4_t _p1n = _p1nx2.val[0];

                    _outp = vmlaq_f32(_outp, _p1, _k1);
                    _outpn = vmlaq_f32(_outpn, _p1n, _k1);

                    float32x4x2_t _p2x2 = vld2q_f32(r2);
                    float32x4_t _p2 = _p2x2.val[0];
                    float32x4x2_t _p2nx2 = vld2q_f32(r2+8);
                    float32x4_t _p2n = _p2nx2.val[0];

                    _outp = vmlaq_f32(_outp, _p2, _k2);
                    _outpn = vmlaq_f32(_outpn, _p2n, _k2);

                    float32x4x2_t _p3x2 = vld2q_f32(r3);
                    float32x4_t _p3 = _p3x2.val[0];
                    float32x4x2_t _p3nx2 = vld2q_f32(r3+8);
                    float32x4_t _p3n = _p3nx2.val[0];

                    _outp = vmlaq_f32(_outp, _p3, _k3);
                    _outpn = vmlaq_f32(_outpn, _p3n, _k3);

                    // _outpn was loaded from outptr+4 and must be written back
                    // to the same place. Storing it at outptr+8 would drop the
                    // update for outputs 4..7 and corrupt the following block.
                    vst1q_f32(outptr, _outp);
                    vst1q_f32(outptr+4, _outpn);

                    r0 += 16;
                    r1 += 16;
                    r2 += 16;
                    r3 += 16;
                    outptr += 8;
                }
 #else
                // ARMv7: hand-scheduled loop. The asm body is a do-while (the
                // count is tested after the first iteration), hence the guard.
                if (nn > 0)
                {
                // Operands: %0 = nn, %1 = outptr, %2..%5 = r0..r3 (read/write
                // via the matching "0".."5" inputs, numbered %6..%11), and
                // %12..%15 = the broadcast weights _k0.._k3.
                //
                // Each pair of vld2.f32 loads reads 16 input floats; the
                // even-indexed ones land in q2 and q8 and are the only ones
                // used. q0/q1 hold the 8 output floats.
                // The first r0 block is loaded before the loop and the next
                // one at the bottom of every iteration, so on exit r0 is one
                // block (64 bytes) too far; "sub %2, #64" undoes that.
                asm volatile(
                    "pld        [%2, #512]          \n"
                    "vld2.f32   {d4-d7}, [%2]!      \n"
                    "vld2.f32   {d16-d19}, [%2]!    \n"
                    "0:                             \n"
                    "pld        [%1, #256]          \n"
                    "vld1.f32   {d0-d3}, [%1]       \n"
                    "vmla.f32   q0, q2, %q12        \n"
                    "vmla.f32   q1, q8, %q12        \n"
                    "pld        [%3, #512]          \n"
                    "vld2.f32   {d4-d7}, [%3]!      \n"
                    "vld2.f32   {d16-d19}, [%3]!    \n"
                    "vmla.f32   q0, q2, %q13        \n"
                    "vmla.f32   q1, q8, %q13        \n"
                    "pld        [%4, #512]          \n"
                    "vld2.f32   {d4-d7}, [%4]!      \n"
                    "vld2.f32   {d16-d19}, [%4]!    \n"
                    "vmla.f32   q0, q2, %q14        \n"
                    "vmla.f32   q1, q8, %q14        \n"
                    "pld        [%5, #512]          \n"
                    "vld2.f32   {d4-d7}, [%5]!      \n"
                    "vld2.f32   {d16-d19}, [%5]!    \n"
                    "vmla.f32   q0, q2, %q15        \n"
                    "vmla.f32   q1, q8, %q15        \n"
                    "pld        [%2, #512]          \n"
                    "vld2.f32   {d4-d7}, [%2]!      \n"
                    "vld2.f32   {d16-d19}, [%2]!    \n"
                    "subs       %0, #1              \n"
                    "vst1.f32   {d0-d3}, [%1]!      \n"
                    "bne        0b                  \n"
                    "sub        %2, #64             \n"
                    : "=r"(nn),     // %0
                      "=r"(outptr), // %1
                      "=r"(r0),     // %2
                      "=r"(r1),     // %3
                      "=r"(r2),     // %4
                      "=r"(r3)      // %5
                    : "0"(nn),
                      "1"(outptr),
                      "2"(r0),
                      "3"(r1),
                      "4"(r2),
                      "5"(r3),
                      "w"(_k0),     // %12
                      "w"(_k1),     // %13
                      "w"(_k2),     // %14
                      "w"(_k3)      // %15
                    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9"
                );
                }
 #endif // __aarch64__
 #endif // __ARM_NEON
                // Scalar tail (or the whole row when NEON is unavailable).
                for (; remain>0; remain--)
                {
                    float sum = *r0 * k0;
                    float sum1 = *r1 * k1;
                    float sum2 = *r2 * k2;
                    float sum3 = *r3 * k3;

                    *outptr += sum + sum1 + sum2 + sum3;

                    r0 += 2;
                    r1 += 2;
                    r2 += 2;
                    r3 += 2;
                    outptr++;
                }

                // Jump to the start of the input row two rows down.
                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
                r3 += tailstep;
            }

        }

        // Leftover input channels (inch % 4), one channel per pass.
        for (; q<inch; q++)
        {
            float* outptr = out;

            const float* img0 = bottom_blob.channel(q);

            const float* kernel0 = kernel + p*inch  + q;
            const float k0 = kernel0[0];

            const float* r0 = img0;

            for (int i = 0; i < outh; i++)
            {
 #if __ARM_NEON
                int nn = outw >> 3;
                int remain = outw & 7;
 #else
                int remain = outw;
 #endif // __ARM_NEON

 #if __ARM_NEON
                float32x4_t _k0 = vdupq_n_f32(k0);
 #if __aarch64__
                for (; nn>0; nn--)
                {
                    float32x4x2_t _px2 = vld2q_f32(r0);
                    float32x4_t _p = _px2.val[0];
                    float32x4_t _outp = vld1q_f32(outptr);

                    float32x4x2_t _pnx2 = vld2q_f32(r0+8);
                    float32x4_t _pn = _pnx2.val[0];
                    float32x4_t _outpn = vld1q_f32(outptr+4);

                    _outp = vmlaq_f32(_outp, _p, _k0);
                    _outpn = vmlaq_f32(_outpn, _pn, _k0);

                    vst1q_f32(outptr, _outp);
                    vst1q_f32(outptr+4, _outpn);

                    r0 += 16;
                    outptr += 8;
                }
 #else
                if (nn > 0)
                {
                // Single-channel version of the pipelined loop above:
                // %0 = nn, %1 = outptr, %2 = r0, %6 = broadcast weight _k0.
                // The trailing "sub %2, #64" again rewinds the extra pre-load.
                asm volatile(
                    "pld        [%2, #512]          \n"
                    "vld2.f32   {d4-d7}, [%2]!      \n"
                    "vld2.f32   {d16-d19}, [%2]!    \n"
                    "0:                             \n"
                    "pld        [%1, #256]          \n"
                    "vld1.f32   {d0-d3}, [%1]       \n"
                    "vmla.f32   q0, q2, %q6         \n"
                    "vmla.f32   q1, q8, %q6         \n"
                    "pld        [%2, #512]          \n"
                    "vld2.f32   {d4-d7}, [%2]!      \n"
                    "vld2.f32   {d16-d19}, [%2]!    \n"
                    "subs       %0, #1              \n"
                    "vst1.f32   {d0-d3}, [%1]!      \n"
                    "bne        0b                  \n"
                    "sub        %2, #64             \n"
                    : "=r"(nn),     // %0
                      "=r"(outptr), // %1
                      "=r"(r0)      // %2
                    : "0"(nn),
                      "1"(outptr),
                      "2"(r0),
                      "w"(_k0)      // %6
                    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9"
                );
                }
 #endif // __aarch64__
 #endif // __ARM_NEON
                for (; remain>0; remain--)
                {
                    float sum = *r0 * k0;

                    *outptr += sum;

                    r0 += 2;
                    outptr++;
                }

                r0 += tailstep;
            }

        }
    }

 }
--- a/src/layer/arm/convolution_2x2.h
+++ b/src/layer/arm/convolution_2x2.h
@@ -0,0 +1,381 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #if __ARM_NEON
 #include <arm_neon.h>
 #endif // __ARM_NEON

 // 2x2 convolution with stride 1.
 //
 // For every output channel p the output plane is first filled with the bias
 // (bias[p], or 0 when _bias converts to a null pointer). Each input channel q
 // is then accumulated into it using the four weights stored at
 // kernel + (p*inch + q)*4, laid out as [top-left, top-right, bottom-left,
 // bottom-right] (see the scalar fallback in the tail loops):
 //
 //     out[y][x] += in[y][x]*k[0] + in[y][x+1]*k[1]
 //                + in[y+1][x]*k[2] + in[y+1][x+1]*k[3]
 //
 // Input channels are consumed two at a time, with a one-at-a-time loop for a
 // leftover channel. Each output row is produced 4 outputs per iteration in
 // the NEON paths (intrinsics on AArch64, inline asm on ARMv7), followed by a
 // per-output tail.
 //
 // NOTE(review): the row pointers are advanced by outw + 1 per output row, so
 // this presumably requires bottom_blob.w == outw + 1 - confirm with the caller.
 //
 // bottom_blob : input blob (read only)
 // top_blob    : output blob, must already be sized; it is overwritten
 // _kernel     : weights, 4 floats per (output channel, input channel) pair
 // _bias       : per-output-channel bias, may be empty
 static void conv2x2s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
 {
    int w = bottom_blob.w;
    // h is read here but not referenced anywhere below.
    int h = bottom_blob.h;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const float* kernel = _kernel;
    const float* bias = _bias;

    // Each output channel is independent, so the channels are split across threads.
    #pragma omp parallel for
    for (int p=0; p<outch; p++)
    {
        Mat out = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;

        // Start from the bias; every input channel below adds onto this plane.
        out.fill(bias0);

        int q = 0;

        // Main loop: two input channels per pass over the output plane.
        for (; q+1<inch; q+=2)
        {
            float* outptr = out;

            const float* img0 = bottom_blob.channel(q);
            const float* img1 = bottom_blob.channel(q+1);

            // 2x2 weights for channel q, immediately followed by those for q+1.
            const float* kernel0 = kernel + p*inch*4  + q*4;
            const float* kernel1 = kernel0 + 4;

            // rXY: channel X (0 = q, 1 = q+1), input row Y (0 = top, 1 = bottom).
            const float* r00 = img0;
            const float* r01 = img0 + w;

            const float* r10 = img1;
            const float* r11 = img1 + w;

 #if __ARM_NEON
            // All four weights of each channel in one vector; individual
            // weights are picked out by lane below.
            float32x4_t _k0 = vld1q_f32(kernel0);
            float32x4_t _k1 = vld1q_f32(kernel1);
 #endif // __ARM_NEON

            for (int i = 0; i < outh; i++)
            {
 #if __ARM_NEON
                // nn = number of 4-output blocks, remain = tail (outw % 4).
                int nn = outw >> 2;
                int remain = outw & 3;
 #else
                int remain = outw;
 #endif // __ARM_NEON

 #if __ARM_NEON
 #if __aarch64__
                for (; nn>0; nn--)
                {
                    // _rXY0 = columns x..x+3, _rXY1 = columns x+1..x+4.
                    float32x4_t _r000 = vld1q_f32(r00);
                    float32x4_t _r010 = vld1q_f32(r01);
                    float32x4_t _r001 = vld1q_f32(r00 + 1);
                    float32x4_t _r011 = vld1q_f32(r01 + 1);

                    float32x4_t _r100 = vld1q_f32(r10);
                    float32x4_t _r110 = vld1q_f32(r11);
                    float32x4_t _r101 = vld1q_f32(r10 + 1);
                    float32x4_t _r111 = vld1q_f32(r11 + 1);

                    float32x4_t _sum = vld1q_f32(outptr);

                    _sum = vmlaq_lane_f32(_sum, _r000, vget_low_f32(_k0), 0);
                    _sum = vmlaq_lane_f32(_sum, _r001, vget_low_f32(_k0), 1);
                    _sum = vmlaq_lane_f32(_sum, _r010, vget_high_f32(_k0), 0);
                    _sum = vmlaq_lane_f32(_sum, _r011, vget_high_f32(_k0), 1);

                    _sum = vmlaq_lane_f32(_sum, _r100, vget_low_f32(_k1), 0);
                    _sum = vmlaq_lane_f32(_sum, _r101, vget_low_f32(_k1), 1);
                    _sum = vmlaq_lane_f32(_sum, _r110, vget_high_f32(_k1), 0);
                    _sum = vmlaq_lane_f32(_sum, _r111, vget_high_f32(_k1), 1);

                    vst1q_f32(outptr, _sum);

                    r00 += 4;
                    r01 += 4;
                    r10 += 4;
                    r11 += 4;
                    outptr += 4;
                }
 #else
                // ARMv7: hand-scheduled loop. The asm body is a do-while (the
                // count is tested after the first iteration), hence the guard.
                if (nn > 0)
                {
                // Operands: %0 = nn, %1..%4 = r00, r01, r10, r11, %5 = outptr
                // (read/write via the matching "0".."5" inputs, numbered
                // %6..%11), %12 = _k0, %13 = _k1. %eN / %fN select the low /
                // high D half of the Q register holding operand N.
                //
                // q0, q2, q12, q14 hold the current 4 floats of each row and
                // q1, q3, q13, q15 the next 4; vext #1 combines them into the
                // "shifted by one column" vectors. Two partial sums are kept
                // (q8 started with vmul, q9 started from the stored output)
                // and added before the store. At the bottom of the loop the
                // "next" registers become "current" (vorr).
                // Because the rows are loaded one block ahead, every row
                // pointer ends 16 bytes too far; the trailing subs undo that.
                //
                // NOTE(review): the look-ahead loads read up to 3 floats past
                // the last column needed - confirm the blob has enough slack
                // after the final row.
                asm volatile(
                    "pld        [%1, #128]          \n"
                    "vld1.f32   {d0-d1}, [%1]!      \n"
                    "pld        [%2, #128]          \n"
                    "vld1.f32   {d4-d5}, [%2]!      \n"

                    "pld        [%3, #128]          \n"
                    "vld1.f32   {d24-d25}, [%3]!    \n"
                    "pld        [%4, #128]          \n"
                    "vld1.f32   {d28-d29}, [%4]!    \n"

                    "0:                             \n"
                    "pld        [%5, #128]          \n"
                    "vld1.f32   {d18-d19}, [%5]     \n"// q9 = sum

                    "vmul.f32   q8, q0, %e12[0]     \n"
                    "vmla.f32   q9, q2, %f12[0]     \n"

                    "pld        [%1, #128]          \n"
                    "vld1.f32   {d2-d3}, [%1]!      \n"

                    "pld        [%2, #128]          \n"
                    "vld1.f32   {d6-d7}, [%2]!      \n"

                    "vext.f32   q10, q0, q1, #1     \n"
                    "vext.f32   q11, q2, q3, #1     \n"

                    "vmla.f32   q8, q12, %e13[0]    \n"
                    "vmla.f32   q9, q14, %f13[0]    \n"

                    "pld        [%3, #128]          \n"
                    "vld1.f32   {d26-d27}, [%3]!    \n"

                    "pld        [%4, #128]          \n"
                    "vld1.f32   {d30-d31}, [%4]!    \n"

                    "vmla.f32   q8, q10, %e12[1]    \n"
                    "vmla.f32   q9, q11, %f12[1]    \n"

                    "vext.f32   q10, q12, q13, #1   \n"
                    "vext.f32   q11, q14, q15, #1   \n"

                    "vmla.f32   q8, q10, %e13[1]    \n"
                    "vmla.f32   q9, q11, %f13[1]    \n"

                    "vorr       q0, q1, q1          \n"
                    "vorr       q2, q3, q3          \n"

                    "vadd.f32   q8, q8, q9          \n"

                    "vorr       q12, q13, q13       \n"
                    "vorr       q14, q15, q15       \n"

                    "subs       %0, #1              \n"

                    "vst1.f32   {d16-d17}, [%5]!    \n"

                    "bne        0b                  \n"
                    "sub        %1, #16             \n"
                    "sub        %2, #16             \n"
                    "sub        %3, #16             \n"
                    "sub        %4, #16             \n"
                    : "=r"(nn),     // %0
                      "=r"(r00),    // %1
                      "=r"(r01),    // %2
                      "=r"(r10),    // %3
                      "=r"(r11),    // %4
                      "=r"(outptr)  // %5
                    : "0"(nn),
                      "1"(r00),
                      "2"(r01),
                      "3"(r10),
                      "4"(r11),
                      "5"(outptr),
                      "w"(_k0),     // %12
                      "w"(_k1)      // %13
                    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
                );
                }
 #endif // __aarch64__
 #endif // __ARM_NEON

                // Tail: one output at a time.
                for (; remain>0; remain--)
                {
 #if __ARM_NEON
                    // Pack the 2x2 window of each channel into one vector
                    // (top pair, bottom pair) so it lines up with the weight
                    // vector, multiply, then sum the four lanes.
                    float32x2_t _r00 = vld1_f32(r00);
                    float32x2_t _r01 = vld1_f32(r01);
                    float32x4_t _r00r1 = vcombine_f32(_r00, _r01);
                    float32x4_t _s0s1 = vmulq_f32(_r00r1, _k0);

                    float32x2_t _r10 = vld1_f32(r10);
                    float32x2_t _r11 = vld1_f32(r11);
                    float32x4_t _r10r1 = vcombine_f32(_r10, _r11);
                    _s0s1 = vmlaq_f32(_s0s1, _r10r1, _k1);

                    float32x2_t _s = vadd_f32(vget_low_f32(_s0s1), vget_high_f32(_s0s1));
                    _s = vpadd_f32(_s, _s);
                    *outptr += vget_lane_f32(_s, 0);
 #else
                    float sum = 0.f;

                    sum += r00[0] * kernel0[0];
                    sum += r00[1] * kernel0[1];
                    sum += r01[0] * kernel0[2];
                    sum += r01[1] * kernel0[3];

                    sum += r10[0] * kernel1[0];
                    sum += r10[1] * kernel1[1];
                    sum += r11[0] * kernel1[2];
                    sum += r11[1] * kernel1[3];

                    *outptr += sum;
 #endif // __ARM_NEON

                    r00 += 1;
                    r01 += 1;
                    r10 += 1;
                    r11 += 1;
                    outptr++;
                }

                // Step over the one extra input column at the end of the row.
                r00 += 1;
                r01 += 1;
                r10 += 1;
                r11 += 1;
            }
        }

        // Leftover input channel (inch odd), one channel per pass.
        for (; q<inch; q++)
        {
            float* outptr = out;

            const float* img0 = bottom_blob.channel(q);

            const float* kernel0 = kernel + p*inch*4  + q*4;

            // r0 = top input row, r1 = bottom input row of the 2x2 window.
            const float* r0 = img0;
            const float* r1 = img0 + w;

 #if __ARM_NEON
            // Each weight broadcast to all four lanes.
            float32x4_t _k0 = vdupq_n_f32(kernel0[0]);
            float32x4_t _k1 = vdupq_n_f32(kernel0[1]);
            float32x4_t _k2 = vdupq_n_f32(kernel0[2]);
            float32x4_t _k3 = vdupq_n_f32(kernel0[3]);
 #endif // __ARM_NEON

            for (int i = 0; i < outh; i++)
            {
 #if __ARM_NEON
                int nn = outw >> 2;
                int remain = outw & 3;
 #else
                int remain = outw;
 #endif // __ARM_NEON

 #if __ARM_NEON
 #if __aarch64__
                for (; nn>0; nn--)
                {
                    float32x4_t _r00 = vld1q_f32(r0);
                    float32x4_t _r10 = vld1q_f32(r1);
                    float32x4_t _r01 = vld1q_f32(r0 + 1);
                    float32x4_t _r11 = vld1q_f32(r1 + 1);

                    float32x4_t _sum = vld1q_f32(outptr);
                    float32x4_t _sum2;

                    // Two independent accumulators, merged before the store.
                    _sum = vmlaq_f32(_sum, _r00, _k0);
                    _sum2 = vmulq_f32(_r01, _k1);
                    _sum = vmlaq_f32(_sum, _r10, _k2);
                    _sum2 = vmlaq_f32(_sum2, _r11, _k3);

                    _sum = vaddq_f32(_sum, _sum2);

                    vst1q_f32(outptr, _sum);

                    r0 += 4;
                    r1 += 4;
                    outptr += 4;
                }
 #else
                if (nn > 0)
                {
                // Single-channel version of the pipelined loop above:
                // %0 = nn, %1 = r0, %2 = r1, %3 = outptr, %8..%11 = _k0.._k3.
                // q0/q2 hold the current 4 floats of each row, q1/q3 the next
                // 4; both row pointers are rewound by 16 bytes after the loop.
                asm volatile(
                    "pld        [%1, #128]          \n"
                    "vld1.f32   {d0-d1}, [%1]!      \n"
                    "pld        [%2, #128]          \n"
                    "vld1.f32   {d4-d5}, [%2]!      \n"

                    "0:                             \n"
                    "pld        [%3, #128]          \n"
                    "vld1.f32   {d18-d19}, [%3]     \n"// q9 = sum

                    "vmul.f32   q8, q0, %q8         \n"
                    "vmla.f32   q9, q2, %q10        \n"

                    "pld        [%1, #128]          \n"
                    "vld1.f32   {d2-d3}, [%1]!      \n"
                    "vext.f32   q10, q0, q1, #1     \n"

                    "vmla.f32   q8, q10, %q9        \n"

                    "pld        [%2, #128]          \n"
                    "vld1.f32   {d6-d7}, [%2]!      \n"
                    "vext.f32   q11, q2, q3, #1     \n"

                    "vmla.f32   q9, q11, %q11       \n"

                    "vorr       q0, q1, q1          \n"
                    "vadd.f32   q8, q8, q9          \n"
                    "vorr       q2, q3, q3          \n"

                    "subs       %0, #1              \n"
                    "vst1.f32   {d16-d17}, [%3]!    \n"
                    "bne        0b                  \n"
                    "sub        %1, #16             \n"
                    "sub        %2, #16             \n"
                    : "=r"(nn),     // %0
                      "=r"(r0),     // %1
                      "=r"(r1),     // %2
                      "=r"(outptr)  // %3
                    : "0"(nn),
                      "1"(r0),
                      "2"(r1),
                      "3"(outptr),
                      "w"(_k0),     // %8
                      "w"(_k1),     // %9
                      "w"(_k2),     // %10
                      "w"(_k3)      // %11
                    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
                );
                }
 #endif // __aarch64__
 #endif // __ARM_NEON

 #if __ARM_NEON
                // The tail wants the four weights side by side in one vector.
                float32x4_t _k0123 = vld1q_f32(kernel0);
 #endif

                // Tail: one output at a time.
                for (; remain>0; remain--)
                {
 #if __ARM_NEON
                    float32x2_t _r0 = vld1_f32(r0);
                    float32x2_t _r1 = vld1_f32(r1);
                    float32x4_t _r0r1 = vcombine_f32(_r0, _r1);
                    float32x4_t _s0s1 = vmulq_f32(_r0r1, _k0123);
                    float32x2_t _s = vadd_f32(vget_low_f32(_s0s1), vget_high_f32(_s0s1));
                    _s = vpadd_f32(_s, _s);
                    *outptr += vget_lane_f32(_s, 0);
 #else
                    float sum = 0.f;
                    sum += r0[0] * kernel0[0];
                    sum += r0[1] * kernel0[1];
                    sum += r1[0] * kernel0[2];
                    sum += r1[1] * kernel0[3];
                    *outptr += sum;
 #endif

                    r0 += 1;
                    r1 += 1;
                    outptr++;
                }

                // Step over the one extra input column at the end of the row.
                r0 += 1;
                r1 += 1;

            }

        }
    }

 }
--- a/src/layer/arm/convolution_3x3.h
+++ b/src/layer/arm/convolution_3x3.h
@@ -0,0 +1,753 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #if __ARM_NEON
 #include <arm_neon.h>
 #endif // __ARM_NEON

 // Stride-1 3x3 convolution that accumulates into an already-sized top_blob.
 //
 // For each output channel p the output plane is filled with bias[p] (0.f when
 // the bias pointer is null) and then, for every input channel q, the 3x3
 // filter stored at kernel + (p*inch + q)*9 is slid over input channel q and
 // added to the plane.
 //
 // bottom_blob : input; its w and c are read, rows are addressed as img0 + w*row
 // top_blob    : output; its w/h/c are read and its data is overwritten
 // _kernel     : weights, read as a flat float array of outch*inch*9 values
 // _bias       : per-output-channel bias, or a Mat that converts to a null pointer
 //
 // The row stepping (outw outputs, then +2) assumes w == outw + 2; padding is
 // not handled here.
 // NOTE(review): the 4-wide NEON loads read a little past what the 3x3 window
 // needs (e.g. vld1q_f32(kernel0+6) touches kernel0[9], and the row loads touch
 // floats past the last used column); those extra lanes never reach the
 // result -- confirm the buffers always carry enough slack for those reads.
 static void conv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
 {
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const float* kernel = _kernel;
    const float* bias = _bias;

    // each output channel is independent, so channels are spread over threads
    #pragma omp parallel for
    for (int p=0; p<outch; p++)
    {
        Mat out = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;

        // start from the bias; every input channel below adds on top of it
        out.fill(bias0);

        // first filter of this output channel; advanced by 9 per input channel
        const float* kernel0 = kernel + p*inch*9;

        for (int q=0; q<inch; q++)
        {
            // outptr / outptr2 walk two consecutive output rows
            float* outptr = out;
            float* outptr2 = outptr + outw;

            const float* img0 = bottom_blob.channel(q);

            // four consecutive input rows feed two output rows
            const float* r0 = img0;
            const float* r1 = img0 + w;
            const float* r2 = img0 + w*2;
            const float* r3 = img0 + w*3;

 #if __ARM_NEON
            // one vector per filter row; only lanes 0..2 carry weights that are used
            float32x4_t _k0123 = vld1q_f32(kernel0);
            float32x4_t _k3456 = vld1q_f32(kernel0+3);
            float32x4_t _k6789 = vld1q_f32(kernel0+6);
 #else
            // scalar path only: per-row views of the 3x3 filter
            const float* k0 = kernel0;
            const float* k1 = kernel0 + 3;
            const float* k2 = kernel0 + 6;
 #endif // __ARM_NEON

            int i = 0;

            // main loop: two output rows per iteration
            for (; i+1 < outh; i+=2)
            {

 #if __ARM_NEON
                // nn blocks of 4 outputs are vectorised, the rest go one by one
                int nn = outw >> 2;
                int remain = outw & 3;
 #else
                int remain = outw;
 #endif // __ARM_NEON

 #if __ARM_NEON
 #if __aarch64__
                for (; nn>0; nn--)
                {
                    // _sum1/_sum3 start from the stored outputs, _sum2/_sum4 are
                    // second accumulators that are folded in before the store
                    float32x4_t _sum1 = vld1q_f32(outptr);
                    float32x4_t _sum2 = vdupq_n_f32(0.f);
                    float32x4_t _sum3 = vld1q_f32(outptr2);
                    float32x4_t _sum4 = vdupq_n_f32(0.f);

                    // _rX0 / _rX1 / _rX2 = row X shifted by 0 / 1 / 2 columns
                    float32x4_t _r00 = vld1q_f32(r0);
                    float32x4_t _r00n = vld1q_f32(r0 + 4);
                    float32x4_t _r01 = vextq_f32(_r00, _r00n, 1);
                    float32x4_t _r02 = vextq_f32(_r00, _r00n, 2);

                    float32x4_t _r10 = vld1q_f32(r1);
                    float32x4_t _r10n = vld1q_f32(r1 + 4);
                    float32x4_t _r11 = vextq_f32(_r10, _r10n, 1);
                    float32x4_t _r12 = vextq_f32(_r10, _r10n, 2);

                    float32x4_t _r20 = vld1q_f32(r2);
                    float32x4_t _r20n = vld1q_f32(r2 + 4);
                    float32x4_t _r21 = vextq_f32(_r20, _r20n, 1);
                    float32x4_t _r22 = vextq_f32(_r20, _r20n, 2);

                    float32x4_t _r30 = vld1q_f32(r3);
                    float32x4_t _r30n = vld1q_f32(r3 + 4);
                    float32x4_t _r31 = vextq_f32(_r30, _r30n, 1);
                    float32x4_t _r32 = vextq_f32(_r30, _r30n, 2);

                    // output row i: input rows r0, r1, r2
                    _sum1 = vfmaq_laneq_f32(_sum1, _r00, _k0123, 0);
                    _sum2 = vfmaq_laneq_f32(_sum2, _r01, _k0123, 1);
                    _sum1 = vfmaq_laneq_f32(_sum1, _r02, _k0123, 2);
                    _sum2 = vfmaq_laneq_f32(_sum2, _r10, _k3456, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _r11, _k3456, 1);
                    _sum2 = vfmaq_laneq_f32(_sum2, _r12, _k3456, 2);
                    _sum1 = vfmaq_laneq_f32(_sum1, _r20, _k6789, 0);
                    _sum2 = vfmaq_laneq_f32(_sum2, _r21, _k6789, 1);
                    _sum1 = vfmaq_laneq_f32(_sum1, _r22, _k6789, 2);

                    // output row i+1: input rows r1, r2, r3
                    _sum3 = vfmaq_laneq_f32(_sum3, _r10, _k0123, 0);
                    _sum4 = vfmaq_laneq_f32(_sum4, _r11, _k0123, 1);
                    _sum3 = vfmaq_laneq_f32(_sum3, _r12, _k0123, 2);
                    _sum4 = vfmaq_laneq_f32(_sum4, _r20, _k3456, 0);
                    _sum3 = vfmaq_laneq_f32(_sum3, _r21, _k3456, 1);
                    _sum4 = vfmaq_laneq_f32(_sum4, _r22, _k3456, 2);
                    _sum3 = vfmaq_laneq_f32(_sum3, _r30, _k6789, 0);
                    _sum4 = vfmaq_laneq_f32(_sum4, _r31, _k6789, 1);
                    _sum3 = vfmaq_laneq_f32(_sum3, _r32, _k6789, 2);

                    _sum1 = vaddq_f32(_sum1, _sum2);
                    _sum3 = vaddq_f32(_sum3, _sum4);

                    vst1q_f32(outptr, _sum1);
                    vst1q_f32(outptr2, _sum3);

                    r0 += 4;
                    r1 += 4;
                    r2 += 4;
                    r3 += 4;
                    outptr += 4;
                    outptr2 += 4;
                }
 #else
                if (nn > 0)
                {
                // ARMv7 version of the loop above.
                //   q9,q10   : current input row (6 floats), q11/q12 = row shifted by 1/2
                //   q7       : accumulator loaded from outptr  (+ partials q6, q13)
                //   q8       : accumulator loaded from outptr2 (+ partials q14, q15)
                //   %eN[0], %eN[1], %fN[0] : weights 0, 1, 2 of filter-row operand N
                // r0 for the next iteration is loaded at the bottom of the loop, so
                // %3 runs 16 bytes ahead and is pulled back after the loop.
                asm volatile(
                    "veor       q6, q6              \n"
                    "veor       q15, q15            \n"

                    "pld        [%3, #192]          \n"
                    "vld1.f32   {d18-d20}, [%3 :64] \n"// r0
                    "add        %3, #16             \n"

                    "veor       q13, q13            \n"
                    "veor       q14, q14            \n"

                    "vext.32    q11, q9, q10, #1    \n"
                    "vext.32    q12, q9, q10, #2    \n"

                    "0:                             \n"

                    "pld        [%1, #128]          \n"
                    "vld1.f32   {d14-d15}, [%1 :64] \n"// _sum

                    "vmla.f32   q7, q9, %e14[0]     \n"
                    "vmla.f32   q6, q11, %e14[1]    \n"
                    "vmla.f32   q13, q12, %f14[0]   \n"

                    "pld        [%4, #192]          \n"
                    "vld1.f32   {d18-d20}, [%4]     \n"// r1
                    "add        %4, #16             \n"

                    "vmla.f32   q7, q9, %e15[0]     \n"

                    "vext.32    q11, q9, q10, #1    \n"
                    "vext.32    q12, q9, q10, #2    \n"

                    "vmla.f32   q6, q11, %e15[1]    \n"
                    "vmla.f32   q13, q12, %f15[0]   \n"

                    "pld        [%2, #128]          \n"
                    "vld1.f32   {d16-d17}, [%2]     \n"// _sum2

                    "vmla.f32   q8, q9, %e14[0]     \n"
                    "vmla.f32   q14, q11, %e14[1]   \n"
                    "vmla.f32   q15, q12, %f14[0]   \n"

                    "pld        [%5, #192]          \n"
                    "vld1.f32   {d18-d20}, [%5 :64] \n"// r2
                    "add        %5, #16             \n"

                    "vmla.f32   q7, q9, %e16[0]     \n"

                    "vext.32    q11, q9, q10, #1    \n"
                    "vext.32    q12, q9, q10, #2    \n"

                    "vmla.f32   q6, q11, %e16[1]    \n"
                    "vmla.f32   q13, q12, %f16[0]   \n"

                    "vmla.f32   q8, q9, %e15[0]     \n"
                    "vmla.f32   q14, q11, %e15[1]   \n"
                    "vmla.f32   q15, q12, %f15[0]   \n"

                    "pld        [%6, #192]          \n"
                    "vld1.f32   {d18-d20}, [%6]     \n"// r3
                    "add        %6, #16             \n"

                    "vmla.f32   q8, q9, %e16[0]     \n"

                    "vext.32    q11, q9, q10, #1    \n"
                    "vext.32    q12, q9, q10, #2    \n"

                    "vmla.f32   q14, q11, %e16[1]   \n"
                    "vmla.f32   q15, q12, %f16[0]   \n"

                    "vadd.f32   q7, q7, q6          \n"
                    "veor       q6, q6              \n"

                    "pld        [%3, #192]          \n"
                    "vld1.f32   {d18-d20}, [%3 :64] \n"// r0

                    "vadd.f32   q8, q8, q14         \n"
                    "veor       q14, q14            \n"
                    "vadd.f32   q7, q7, q13         \n"
                    "veor       q13, q13            \n"
                    "vadd.f32   q8, q8, q15         \n"
                    "veor       q15, q15            \n"

                    "vext.32    q11, q9, q10, #1    \n"
                    "vext.32    q12, q9, q10, #2    \n"

                    "add        %3, #16             \n"

                    "vst1.f32   {d14-d15}, [%1]!    \n"
                    "vst1.f32   {d16-d17}, [%2]!    \n"

                    "subs       %0, #1              \n"
                    "bne        0b                  \n"

                    "sub        %3, #16             \n"
                    : "=r"(nn),         // %0
                      "=r"(outptr),     // %1
                      "=r"(outptr2),    // %2
                      "=r"(r0),         // %3
                      "=r"(r1),         // %4
                      "=r"(r2),         // %5
                      "=r"(r3)          // %6
                    : "0"(nn),
                      "1"(outptr),
                      "2"(outptr2),
                      "3"(r0),
                      "4"(r1),
                      "5"(r2),
                      "6"(r3),
                      "w"(_k0123),      // %14
                      "w"(_k3456),      // %15
                      "w"(_k6789)       // %16
                    : "cc", "memory", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
                );
                }
 #endif // __aarch64__
 #endif // __ARM_NEON
                // leftover columns (all columns when NEON is unavailable)
                for (; remain>0; remain--)
                {
 #if __ARM_NEON
                    float32x4_t _r00 = vld1q_f32(r0);
                    float32x4_t _r10 = vld1q_f32(r1);
                    float32x4_t _r20 = vld1q_f32(r2);
                    float32x4_t _r30 = vld1q_f32(r3);

                    float32x4_t _sum = vmulq_f32(_r00, _k0123);
                    _sum = vmlaq_f32(_sum, _r10, _k3456);
                    _sum = vmlaq_f32(_sum, _r20, _k6789);

                    float32x4_t _sum2 = vmulq_f32(_r10, _k0123);
                    _sum2 = vmlaq_f32(_sum2, _r20, _k3456);
                    _sum2 = vmlaq_f32(_sum2, _r30, _k6789);

                    // lane 3 holds a product outside the 3x3 window; replace it with
                    // the current output so the horizontal add yields old + new
                    _sum = vsetq_lane_f32(*outptr, _sum, 3);
                    _sum2 = vsetq_lane_f32(*outptr2, _sum2, 3);

 #if __aarch64__
                    *outptr = vaddvq_f32(_sum);
                    *outptr2 = vaddvq_f32(_sum2);
 #else
                    float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
                    float32x2_t _ss2 = vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2));

                    // pairwise add reduces both rows at once: lane 0 = row i, lane 1 = row i+1
                    float32x2_t _sss2 = vpadd_f32(_ss, _ss2);

                    *outptr = vget_lane_f32(_sss2, 0);
                    *outptr2 = vget_lane_f32(_sss2, 1);
 #endif // __aarch64__
 #else
                    float sum = 0;
                    float sum2 = 0;

                    sum += r0[0] * k0[0];
                    sum += r0[1] * k0[1];
                    sum += r0[2] * k0[2];
                    sum += r1[0] * k1[0];
                    sum += r1[1] * k1[1];
                    sum += r1[2] * k1[2];
                    sum += r2[0] * k2[0];
                    sum += r2[1] * k2[1];
                    sum += r2[2] * k2[2];

                    sum2 += r1[0] * k0[0];
                    sum2 += r1[1] * k0[1];
                    sum2 += r1[2] * k0[2];
                    sum2 += r2[0] * k1[0];
                    sum2 += r2[1] * k1[1];
                    sum2 += r2[2] * k1[2];
                    sum2 += r3[0] * k2[0];
                    sum2 += r3[1] * k2[1];
                    sum2 += r3[2] * k2[2];

                    *outptr += sum;
                    *outptr2 += sum2;
 #endif
                    r0++;
                    r1++;
                    r2++;
                    r3++;
                    outptr++;
                    outptr2++;
                }

                // skip the 2 trailing columns of this row plus one whole row,
                // because two output rows were produced
                r0 += 2 + w;
                r1 += 2 + w;
                r2 += 2 + w;
                r3 += 2 + w;

                // likewise jump over the output row already written via the other pointer
                outptr += outw;
                outptr2 += outw;
            }

            // tail: the single remaining output row when outh is odd
            for (; i < outh; i++)
            {

 #if __ARM_NEON
                int nn = outw >> 2;
                int remain = outw & 3;
 #else
                int remain = outw;
 #endif // __ARM_NEON

 #if __ARM_NEON
 #if __aarch64__
                for (; nn>0; nn--)
                {
                    float32x4_t _sum1 = vld1q_f32(outptr);
                    float32x4_t _sum2 = vdupq_n_f32(0.f);

                    float32x4_t _r00 = vld1q_f32(r0);
                    float32x4_t _r00n = vld1q_f32(r0 + 4);
                    float32x4_t _r01 = vextq_f32(_r00, _r00n, 1);
                    float32x4_t _r02 = vextq_f32(_r00, _r00n, 2);

                    float32x4_t _r10 = vld1q_f32(r1);
                    float32x4_t _r10n = vld1q_f32(r1 + 4);
                    float32x4_t _r11 = vextq_f32(_r10, _r10n, 1);
                    float32x4_t _r12 = vextq_f32(_r10, _r10n, 2);

                    float32x4_t _r20 = vld1q_f32(r2);
                    float32x4_t _r20n = vld1q_f32(r2 + 4);
                    float32x4_t _r21 = vextq_f32(_r20, _r20n, 1);
                    float32x4_t _r22 = vextq_f32(_r20, _r20n, 2);

                    _sum1 = vfmaq_laneq_f32(_sum1, _r00, _k0123, 0);
                    _sum2 = vfmaq_laneq_f32(_sum2, _r01, _k0123, 1);
                    _sum1 = vfmaq_laneq_f32(_sum1, _r02, _k0123, 2);
                    _sum2 = vfmaq_laneq_f32(_sum2, _r10, _k3456, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _r11, _k3456, 1);
                    _sum2 = vfmaq_laneq_f32(_sum2, _r12, _k3456, 2);
                    _sum1 = vfmaq_laneq_f32(_sum1, _r20, _k6789, 0);
                    _sum2 = vfmaq_laneq_f32(_sum2, _r21, _k6789, 1);
                    _sum1 = vfmaq_laneq_f32(_sum1, _r22, _k6789, 2);

                    _sum1 = vaddq_f32(_sum1, _sum2);

                    vst1q_f32(outptr, _sum1);

                    r0 += 4;
                    r1 += 4;
                    r2 += 4;
                    outptr += 4;
                }
 #else
                if (nn > 0)
                {
                // ARMv7 single-row loop.
                //   q8 + d18 : current input row (6 floats), q10/q11 = row shifted by 1/2
                //   q7       : accumulator loaded from outptr (+ partials q13, q14)
                // r0 is pre-loaded for the next iteration, hence the final "sub %2, #16".
                asm volatile(
                    "pld        [%2, #192]          \n"
                    "vld1.f32   {d16-d18}, [%2]     \n"// r0
                    "add        %2, #16             \n"

                    "veor       q13, q13            \n"
                    "veor       q14, q14            \n"

                    "vext.32    q10, q8, q9, #1     \n"
                    "vext.32    q11, q8, q9, #2     \n"

                    "0:                             \n"

                    "pld        [%1, #128]          \n"
                    "vld1.f32   {d14-d15}, [%1]     \n"// _sum

                    "vmla.f32   q7, q8, %e10[0]     \n"
                    "vmla.f32   q13, q10, %e10[1]   \n"
                    "vmla.f32   q14, q11, %f10[0]   \n"

                    "pld        [%3, #192]          \n"
                    "vld1.f32   {d16-d18}, [%3]     \n"// r1
                    "add        %3, #16             \n"

                    "vmla.f32   q7, q8, %e11[0]     \n"

                    "vext.32    q10, q8, q9, #1     \n"
                    "vext.32    q11, q8, q9, #2     \n"

                    "vmla.f32   q13, q10, %e11[1]   \n"
                    "vmla.f32   q14, q11, %f11[0]   \n"

                    "pld        [%4, #192]          \n"
                    "vld1.f32   {d16-d18}, [%4]     \n"// r2
                    "add        %4, #16             \n"

                    "vmla.f32   q7, q8, %e12[0]     \n"

                    "vext.32    q10, q8, q9, #1     \n"
                    "vext.32    q11, q8, q9, #2     \n"

                    "vmla.f32   q13, q10, %e12[1]   \n"
                    "vmla.f32   q14, q11, %f12[0]   \n"

                    "pld        [%2, #192]          \n"
                    "vld1.f32   {d16-d18}, [%2]     \n"// r0
                    "add        %2, #16             \n"

                    "vadd.f32   q7, q7, q13         \n"
                    "veor       q13, q13            \n"
                    "vadd.f32   q7, q7, q14         \n"
                    "veor       q14, q14            \n"

                    "vext.32    q10, q8, q9, #1     \n"
                    "vext.32    q11, q8, q9, #2     \n"

                    "vst1.f32   {d14-d15}, [%1]!    \n"

                    "subs       %0, #1              \n"
                    "bne        0b                  \n"

                    "sub        %2, #16             \n"
                    : "=r"(nn),         // %0
                      "=r"(outptr),     // %1
                      "=r"(r0),         // %2
                      "=r"(r1),         // %3
                      "=r"(r2)          // %4
                    : "0"(nn),
                      "1"(outptr),
                      "2"(r0),
                      "3"(r1),
                      "4"(r2),
                      "w"(_k0123),      // %10
                      "w"(_k3456),      // %11
                      "w"(_k6789)       // %12
                    : "cc", "memory", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
                );
                }
 #endif // __aarch64__
 #endif // __ARM_NEON
                // leftover columns (all columns when NEON is unavailable)
                for (; remain>0; remain--)
                {
 #if __ARM_NEON
                    float32x4_t _r00 = vld1q_f32(r0);
                    float32x4_t _r10 = vld1q_f32(r1);
                    float32x4_t _r20 = vld1q_f32(r2);

                    float32x4_t _sum = vmulq_f32(_r00, _k0123);
                    _sum = vmlaq_f32(_sum, _r10, _k3456);
                    _sum = vmlaq_f32(_sum, _r20, _k6789);

                    // swap the out-of-window lane for the current output value
                    _sum = vsetq_lane_f32(*outptr, _sum, 3);

 #if __aarch64__
                    *outptr = vaddvq_f32(_sum);
 #else
                    float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
                    _ss = vpadd_f32(_ss, _ss);

                    *outptr = vget_lane_f32(_ss, 0);
 #endif // __aarch64__
 #else
                    float sum = 0;

                    sum += r0[0] * k0[0];
                    sum += r0[1] * k0[1];
                    sum += r0[2] * k0[2];
                    sum += r1[0] * k1[0];
                    sum += r1[1] * k1[1];
                    sum += r1[2] * k1[2];
                    sum += r2[0] * k2[0];
                    sum += r2[1] * k2[1];
                    sum += r2[2] * k2[2];

                    *outptr += sum;
 #endif
                    r0++;
                    r1++;
                    r2++;
                    outptr++;
                }

                // step over the 2 trailing columns to reach the next input row
                r0 += 2;
                r1 += 2;
                r2 += 2;
            }

            // next input channel uses the next 9 weights
            kernel0 += 9;
        }
    }

 }

 // Stride-2 3x3 convolution that accumulates into an already-sized top_blob.
 //
 // For each output channel p the output plane is filled with bias[p] (0.f when
 // the bias pointer is null) and then, for every input channel q, the 3x3
 // filter stored at kernel + (p*inch + q)*9 is applied at every second column
 // and every second row of input channel q and added to the plane.
 //
 // bottom_blob : input; its w and c are read, rows are addressed as img0 + w*row
 // top_blob    : output; its w/h/c are read and its data is overwritten
 // _kernel     : weights, read as a flat float array of outch*inch*9 values
 // _bias       : per-output-channel bias, or a Mat that converts to a null pointer
 //
 // NOTE(review): the vectorised paths load more input than the window needs
 // (a second vld2 of 8 floats is issued only to obtain column 8, and the 4-wide
 // loads in the remainder loop / of kernel0+6 reach one float past the 3 used);
 // the surplus lanes never reach the result -- confirm the buffers always carry
 // enough slack for those reads.
 static void conv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
 {
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    // after outw outputs the row pointers have moved 2*outw floats; this skips
    // the rest of that row plus one more full row (vertical stride of 2)
    const int tailstep = w - 2*outw + w;

    const float* kernel = _kernel;
    const float* bias = _bias;

    // each output channel is independent, so channels are spread over threads
    #pragma omp parallel for
    for (int p=0; p<outch; p++)
    {
        Mat out = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;

        // start from the bias; every input channel below adds on top of it
        out.fill(bias0);

        // first filter of this output channel; advanced by 9 per input channel
        const float* kernel0 = kernel + p*inch*9;

        for (int q=0; q<inch; q++)
        {
            float* outptr = out;
            float* outptr2 = outptr + outw;

            const float* img0 = bottom_blob.channel(q);

            // the three input rows under the current output row
            const float* r0 = img0;
            const float* r1 = img0 + w;
            const float* r2 = img0 + w*2;

            // per-row views of the 3x3 filter
            const float* k0 = kernel0;
            const float* k1 = kernel0 + 3;
            const float* k2 = kernel0 + 6;

 #if __ARM_NEON
            // one vector per filter row; only lanes 0..2 carry weights that are used
            float32x4_t _k0123 = vld1q_f32(k0);
            float32x4_t _k3456 = vld1q_f32(k1);
            float32x4_t _k6789 = vld1q_f32(k2);
 #endif // __ARM_NEON

            int i = 0;

            for (; i < outh; i++)
            {
 #if __ARM_NEON
                // nn blocks of 4 outputs are vectorised, the rest go one by one
                int nn = outw >> 2;
                int remain = outw & 3;
 #else
                int remain = outw;
 #endif // __ARM_NEON

 #if __ARM_NEON
 #if __aarch64__
                for (; nn>0; nn--)
                {
                    float32x4_t _outp = vld1q_f32(outptr);

                    // vld2q de-interleaves: val[0] = even columns, val[1] = odd columns
                    float32x4x2_t _r0 = vld2q_f32(r0);
                    float32x4x2_t _r0n = vld2q_f32(r0+8);

                    float32x4_t _r00 = _r0.val[0];// 0 2 4 6
                    float32x4_t _r01 = _r0.val[1];// 1 3 5 7
                    float32x4_t _r02 = vextq_f32(_r00, _r0n.val[0], 1);// 2 4 6 8

                    _outp = vfmaq_laneq_f32(_outp, _r00, _k0123, 0);
                    _outp = vfmaq_laneq_f32(_outp, _r01, _k0123, 1);
                    _outp = vfmaq_laneq_f32(_outp, _r02, _k0123, 2);

                    float32x4x2_t _r1 = vld2q_f32(r1);
                    float32x4x2_t _r1n = vld2q_f32(r1+8);

                    float32x4_t _r10 = _r1.val[0];
                    float32x4_t _r11 = _r1.val[1];
                    float32x4_t _r12 = vextq_f32(_r10, _r1n.val[0], 1);

                    _outp = vfmaq_laneq_f32(_outp, _r10, _k3456, 0);
                    _outp = vfmaq_laneq_f32(_outp, _r11, _k3456, 1);
                    _outp = vfmaq_laneq_f32(_outp, _r12, _k3456, 2);

                    float32x4x2_t _r2 = vld2q_f32(r2);
                    float32x4x2_t _r2n = vld2q_f32(r2+8);

                    float32x4_t _r20 = _r2.val[0];
                    float32x4_t _r21 = _r2.val[1];
                    float32x4_t _r22 = vextq_f32(_r20, _r2n.val[0], 1);

                    _outp = vfmaq_laneq_f32(_outp, _r20, _k6789, 0);
                    _outp = vfmaq_laneq_f32(_outp, _r21, _k6789, 1);
                    _outp = vfmaq_laneq_f32(_outp, _r22, _k6789, 2);

                    vst1q_f32(outptr, _outp);

                    // 4 outputs consume 8 input columns at stride 2
                    r0 += 8;
                    r1 += 8;
                    r2 += 8;
                    outptr += 4;
                }
 #else
                if (nn > 0)
                {
                // ARMv7 version of the loop above.
                //   q2 / q3 : even / odd columns of the current row (vld2 de-interleave)
                //   q8      : even columns of the following 8 floats, used only by vext
                //   q1      : columns 2 4 6 8
                //   q0      : accumulator loaded from outptr (+ partials q10, q11)
                // r0 for the next iteration is loaded with post-increment at the
                // bottom of the loop, so %2 is pulled back by 32 bytes afterwards.
                asm volatile(
                    "pld        [%2, #256]          \n"
                    "vld2.f32   {d4-d7}, [%2]!      \n"

                    "veor       q10, q10            \n"
                    "veor       q11, q11            \n"

                    "0:                             \n"
                    "pld        [%1, #128]          \n"
                    "vld1.f32   {d0-d1}, [%1]       \n"

                    "vmla.f32   q0, q2, %e10[0]     \n"
                    "vmla.f32   q10, q3, %e10[1]    \n"

                    "pld        [%2, #256]          \n"
                    "vld2.f32   {d16-d19}, [%2]     \n"
                    "vext.32    q1, q2, q8, #1      \n"

                    "vmla.f32   q11, q1, %f10[0]    \n"

                    "pld        [%3, #256]          \n"
                    "vld2.f32   {d4-d7}, [%3]!      \n"

                    "vmla.f32   q0, q2, %e11[0]     \n"
                    "vmla.f32   q10, q3, %e11[1]    \n"

                    "pld        [%3, #256]          \n"
                    "vld2.f32   {d16-d19}, [%3]     \n"
                    "vext.32    q1, q2, q8, #1      \n"

                    "vmla.f32   q11, q1, %f11[0]    \n"

                    "pld        [%4, #256]          \n"
                    "vld2.f32   {d4-d7}, [%4]!      \n"

                    "vmla.f32   q0, q2, %e12[0]     \n"
                    "vmla.f32   q10, q3, %e12[1]    \n"

                    "pld        [%4, #256]          \n"
                    "vld2.f32   {d16-d19}, [%4]     \n"
                    "vext.32    q1, q2, q8, #1      \n"

                    "vmla.f32   q11, q1, %f12[0]    \n"

                    "pld        [%2, #256]          \n"
                    "vld2.f32   {d4-d7}, [%2]!      \n"

                    "vadd.f32   q0, q0, q10         \n"
                    "veor       q10, q10            \n"
                    "vadd.f32   q0, q0, q11         \n"
                    "veor       q11, q11            \n"

                    "subs       %0, #1              \n"
                    "vst1.f32   {d0-d1}, [%1]!      \n"
                    "bne        0b                  \n"
                    "sub        %2, #32             \n"
                    : "=r"(nn),     // %0
                      "=r"(outptr), // %1
                      "=r"(r0),     // %2
                      "=r"(r1),     // %3
                      "=r"(r2)      // %4
                    : "0"(nn),
                      "1"(outptr),
                      "2"(r0),
                      "3"(r1),
                      "4"(r2),
                      "w"(_k0123),  // %10
                      "w"(_k3456),  // %11
                      "w"(_k6789)   // %12
                    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
                );
                }
 #endif // __aarch64__
 #endif // __ARM_NEON
                // leftover columns (all columns when NEON is unavailable)
                for (; remain>0; remain--)
                {
 #if __ARM_NEON
                    float32x4_t _r00 = vld1q_f32(r0);
                    float32x4_t _r10 = vld1q_f32(r1);
                    float32x4_t _r20 = vld1q_f32(r2);

                    float32x4_t _sum = vmulq_f32(_r00, _k0123);
                    _sum = vmlaq_f32(_sum, _r10, _k3456);
                    _sum = vmlaq_f32(_sum, _r20, _k6789);

                    // lane 3 holds a product outside the 3x3 window; replace it with
                    // the current output so the horizontal add yields old + new
                    _sum = vsetq_lane_f32(*outptr, _sum, 3);

 #if __aarch64__
                    *outptr = vaddvq_f32(_sum);
 #else
                    float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
                    _ss = vpadd_f32(_ss, _ss);

                    *outptr = vget_lane_f32(_ss, 0);
 #endif // __aarch64__
 #else
                    float sum = 0;

                    sum += r0[0] * k0[0];
                    sum += r0[1] * k0[1];
                    sum += r0[2] * k0[2];
                    sum += r1[0] * k1[0];
                    sum += r1[1] * k1[1];
                    sum += r1[2] * k1[2];
                    sum += r2[0] * k2[0];
                    sum += r2[1] * k2[1];
                    sum += r2[2] * k2[2];

                    *outptr += sum;
 #endif // __ARM_NEON

                    // horizontal stride of 2
                    r0 += 2;
                    r1 += 2;
                    r2 += 2;
                    outptr++;
                }

                // move to the window start of the next output row
                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
            }

            // next input channel uses the next 9 weights
            kernel0 += 9;
        }
    }
 }
--- a/src/layer/arm/convolution_4x4.h
+++ b/src/layer/arm/convolution_4x4.h
@@ -0,0 +1,340 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #if __ARM_NEON
 #include <arm_neon.h>
 #endif // __ARM_NEON

 // 4x4 convolution with stride 4 (non-overlapping 4x4 windows).
 //
 // bottom_blob : input, already padded by the caller
 // top_blob    : output, already allocated; each output channel is first filled
 //               with its bias and then accumulates one contribution per input channel
 // _kernel     : weights, 16 floats per (output channel, input channel) pair,
 //               laid out as kernel[p*inch*16 + q*16 + ky*4 + kx]
 // _bias       : per-output-channel bias, or an empty Mat (null pointer) for no bias
 static void conv4x4s4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
 {
     int w = bottom_blob.w;
     int inch = bottom_blob.c;

     int outw = top_blob.w;
     int outh = top_blob.h;
     int outch = top_blob.c;

     // After one output row the row pointers have advanced 4*outw floats, while the
     // next output row starts 4*w floats after the current one.
     // This equals w*3 only when w == 4*outw.
     const int tailstep = (w - outw) * 4;

     const float* kernel = _kernel;
     const float* bias = _bias;

     #pragma omp parallel for
     for (int p=0; p<outch; p++)
     {
         Mat out = top_blob.channel(p);

         const float bias0 = bias ? bias[p] : 0.f;

         out.fill(bias0);

         for (int q=0; q<inch; q++)
         {
             float* outptr = out;

             const float* img0 = bottom_blob.channel(q);

             const float* kernel0 = kernel + p*inch*16  + q*16;

             // four consecutive input rows covered by the 4x4 window
             const float* r0 = img0;
             const float* r1 = img0 + w;
             const float* r2 = img0 + w*2;
             const float* r3 = img0 + w*3;

 #if __ARM_NEON
             // one vector per kernel row
             float32x4_t _k0123 = vld1q_f32(kernel0);
             float32x4_t _k4567 = vld1q_f32(kernel0+4);
             float32x4_t _k891011 = vld1q_f32(kernel0+8);
             float32x4_t _k12131415 = vld1q_f32(kernel0+12);
 #else
             const float* k0 = kernel0;
             const float* k1 = kernel0 + 4;
             const float* k2 = kernel0 + 8;
             const float* k3 = kernel0 + 12;
 #endif // __ARM_NEON

             for (int i = 0; i < outh; i++)
             {
 #if __ARM_NEON
                 // nn = groups of 4 outputs handled by the vector loop, remain = leftover outputs
                 int nn = outw >> 2;
                 int remain = outw - (nn << 2);
 #else
                 int remain = outw;
 #endif // __ARM_NEON

 #if __ARM_NEON
 #if __aarch64__
                 for (; nn>0; nn--)
                 {
                     float32x4_t _r00 = vld1q_f32(r0);
                     float32x4_t _r10 = vld1q_f32(r1);
                     float32x4_t _r20 = vld1q_f32(r2);
                     float32x4_t _r30 = vld1q_f32(r3);

                     float32x4_t _r01 = vld1q_f32(r0 + 4);
                     float32x4_t _r11 = vld1q_f32(r1 + 4);
                     float32x4_t _r21 = vld1q_f32(r2 + 4);
                     float32x4_t _r31 = vld1q_f32(r3 + 4);

                     float32x4_t _r02 = vld1q_f32(r0 + 8);
                     float32x4_t _r12 = vld1q_f32(r1 + 8);
                     float32x4_t _r22 = vld1q_f32(r2 + 8);
                     float32x4_t _r32 = vld1q_f32(r3 + 8);

                     float32x4_t _r03 = vld1q_f32(r0 + 12);
                     float32x4_t _r13 = vld1q_f32(r1 + 12);
                     float32x4_t _r23 = vld1q_f32(r2 + 12);
                     float32x4_t _r33 = vld1q_f32(r3 + 12);

                     float32x4_t _sum0 = vmulq_f32(_r00, _k0123);
                     float32x4_t _sum1 = vmulq_f32(_r01, _k0123);
                     float32x4_t _sum2 = vmulq_f32(_r02, _k0123);
                     float32x4_t _sum3 = vmulq_f32(_r03, _k0123);

                     _sum0 = vfmaq_f32(_sum0, _r10, _k4567);
                     _sum1 = vfmaq_f32(_sum1, _r11, _k4567);
                     _sum2 = vfmaq_f32(_sum2, _r12, _k4567);
                     _sum3 = vfmaq_f32(_sum3, _r13, _k4567);

                     _sum0 = vfmaq_f32(_sum0, _r20, _k891011);
                     _sum1 = vfmaq_f32(_sum1, _r21, _k891011);
                     _sum2 = vfmaq_f32(_sum2, _r22, _k891011);
                     _sum3 = vfmaq_f32(_sum3, _r23, _k891011);

                     _sum0 = vfmaq_f32(_sum0, _r30, _k12131415);
                     _sum1 = vfmaq_f32(_sum1, _r31, _k12131415);
                     _sum2 = vfmaq_f32(_sum2, _r32, _k12131415);
                     _sum3 = vfmaq_f32(_sum3, _r33, _k12131415);

                     // two rounds of pairwise adds reduce each _sumN to one lane:
                     // _sum = { hsum(_sum0), hsum(_sum1), hsum(_sum2), hsum(_sum3) }
                     float32x4_t _s01 = vpaddq_f32(_sum0, _sum1);
                     float32x4_t _s23 = vpaddq_f32(_sum2, _sum3);
                     float32x4_t _sum = vpaddq_f32(_s01, _s23);

                     float32x4_t _outp = vld1q_f32(outptr);

                     _outp = vaddq_f32(_outp, _sum);

                     // store the accumulated value; storing _sum would drop the bias
                     // and the contributions of previous input channels
                     vst1q_f32(outptr, _outp);

                     r0 += 16;
                     r1 += 16;
                     r2 += 16;
                     r3 += 16;
                     outptr += 4;
                 }
 #else
                 if (nn > 0)
                 {
                 asm volatile(

                     "pld        [%1, #128]          \n"

                     "0:                             \n"

                     "pld        [%2, #512]          \n"
                     "pld        [%3, #512]          \n"

                     "vld1.f32   {d14-d15}, [%1]     \n"// q7 = outptr

                     "vld1.f32   {d16-d17}, [%2]!    \n"// q8  = r0
                     "vld1.f32   {d18-d19}, [%3]!    \n"// q9  = r1

                     "pld        [%4, #512]          \n"
                     "pld        [%5, #512]          \n"

                     "vmul.f32   q12, q8, %q12       \n"
                     "vmul.f32   q13, q9, %q13       \n"

                     "vld1.f32   {d20-d21}, [%4]!    \n"// q10 = r2
                     "vld1.f32   {d22-d23}, [%5]!    \n"// q11 = r3

                     "vmla.f32   q12, q10, %q14      \n"
                     "vmla.f32   q13, q11, %q15      \n"

                     "vadd.f32   q5, q12, q13        \n"

                     "vld1.f32   {d16-d17}, [%2]!    \n"// q8  = r0
                     "vld1.f32   {d18-d19}, [%3]!    \n"// q9  = r1

                     "vmul.f32   q12, q8, %q12       \n"
                     "vmul.f32   q13, q9, %q13       \n"

                     "vld1.f32   {d20-d21}, [%4]!    \n"// q10 = r2
                     "vld1.f32   {d22-d23}, [%5]!    \n"// q11 = r3

                     "vmla.f32   q12, q10, %q14      \n"
                     "vmla.f32   q13, q11, %q15      \n"

                     "vadd.f32   q6, q12, q13        \n"

                     "vld1.f32   {d16-d17}, [%2]!    \n"// q8  = r0
                     "vld1.f32   {d18-d19}, [%3]!    \n"// q9  = r1

                     "vmul.f32   q12, q8, %q12       \n"
                     "vmul.f32   q13, q9, %q13       \n"

                     "vld1.f32   {d20-d21}, [%4]!    \n"// q10 = r2
                     "vld1.f32   {d22-d23}, [%5]!    \n"// q11 = r3

                     "vmla.f32   q12, q10, %q14      \n"
                     "vmla.f32   q13, q11, %q15      \n"

                     "vadd.f32   q14, q12, q13       \n"

                     "vld1.f32   {d16-d17}, [%2]!    \n"// q8  = r0
                     "vld1.f32   {d18-d19}, [%3]!    \n"// q9  = r1

                     "vmul.f32   q12, q8, %q12       \n"
                     "vmul.f32   q13, q9, %q13       \n"

                     "vld1.f32   {d20-d21}, [%4]!    \n"// q10 = r2
                     "vld1.f32   {d22-d23}, [%5]!    \n"// q11 = r3

                     "vmla.f32   q12, q10, %q14      \n"
                     "vmla.f32   q13, q11, %q15      \n"

                     "vadd.f32   q15, q12, q13       \n"

                     // horizontal reduction of the four window sums (q5, q6, q14, q15) into q5
                     "vadd.f32   d10, d10, d11       \n"
                     "vadd.f32   d28, d28, d29       \n"
                     "vadd.f32   d11, d12, d13       \n"
                     "vadd.f32   d29, d30, d31       \n"

                     "vpadd.f32  d10, d10, d11       \n"
                     "vpadd.f32  d11, d28, d29       \n"

                     "vadd.f32   q7, q7, q5          \n"

                     "vst1.f32   {d14-d15}, [%1]!    \n"

                     "pld        [%1, #128]          \n"

                     "subs       %0, #1              \n"
                     "bne        0b                  \n"
                     : "=r"(nn),         // %0
                       "=r"(outptr),     // %1
                       "=r"(r0),         // %2
                       "=r"(r1),         // %3
                       "=r"(r2),         // %4
                       "=r"(r3)          // %5
                     : "0"(nn),
                       "1"(outptr),
                       "2"(r0),
                       "3"(r1),
                       "4"(r2),
                       "5"(r3),
                       "w"(_k0123),      // %12
                       "w"(_k4567),      // %13
                       "w"(_k891011),    // %14
                       "w"(_k12131415)   // %15
                     : "cc", "memory", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
                 );
                 }
 #endif // __aarch64__
 #endif // __ARM_NEON
                 for (; remain>0; remain--)
                 {
 #if __ARM_NEON
 #if __aarch64__
                     float32x4_t _r0 = vld1q_f32(r0);
                     float32x4_t _r1 = vld1q_f32(r1);
                     float32x4_t _r2 = vld1q_f32(r2);
                     float32x4_t _r3 = vld1q_f32(r3);

                     float32x4_t _sum = vmulq_f32(_r0, _k0123);
                     _sum = vmlaq_f32(_sum, _r1, _k4567);
                     _sum = vmlaq_f32(_sum, _r2, _k891011);
                     _sum = vmlaq_f32(_sum, _r3, _k12131415);

                     *outptr += vaddvq_f32(_sum);
 #else
                     float sum = 0.f;

                     // The loads deliberately use no address writeback: the row pointers
                     // are advanced once, by the shared "+= 4" below. Post-increment
                     // loads here would advance them twice per output.
                     asm volatile(
                         "vld1.f32   {d16-d17}, [%0]     \n"// q8  = r0
                         "vld1.f32   {d18-d19}, [%1]     \n"// q9  = r1

                         "vmul.f32   q12, q8, %q9        \n"
                         "vmul.f32   q13, q9, %q10       \n"

                         "vld1.f32   {d20-d21}, [%2]     \n"// q10 = r2
                         "vld1.f32   {d22-d23}, [%3]     \n"// q11 = r3

                         "vmla.f32   q12, q10, %q11      \n"
                         "vmla.f32   q13, q11, %q12      \n"

                         "vadd.f32   q5, q12, q13        \n"
                         "vadd.f32   d10, d10, d11       \n"
                         "vpadd.f32  d10, d10, d10       \n"
                         "vmov.f32   %4, d10[0]          \n"
                         : "=r"(r0),         // %0
                           "=r"(r1),         // %1
                           "=r"(r2),         // %2
                           "=r"(r3),         // %3
                           "=r"(sum)         // %4
                         : "0"(r0),
                           "1"(r1),
                           "2"(r2),
                           "3"(r3),
                           "w"(_k0123),      // %9
                           "w"(_k4567),      // %10
                           "w"(_k891011),    // %11
                           "w"(_k12131415)   // %12
                         : "cc", "memory", "q5", "q6", "q8", "q9", "q10", "q11", "q12", "q13"
                     );

                     *outptr += sum;
 #endif // __aarch64__
 #else
                     float sum = 0;

                     sum += r0[0] * k0[0];
                     sum += r0[1] * k0[1];
                     sum += r0[2] * k0[2];
                     sum += r0[3] * k0[3];

                     sum += r1[0] * k1[0];
                     sum += r1[1] * k1[1];
                     sum += r1[2] * k1[2];
                     sum += r1[3] * k1[3];

                     sum += r2[0] * k2[0];
                     sum += r2[1] * k2[1];
                     sum += r2[2] * k2[2];
                     sum += r2[3] * k2[3];

                     sum += r3[0] * k3[0];
                     sum += r3[1] * k3[1];
                     sum += r3[2] * k3[2];
                     sum += r3[3] * k3[3];

                     *outptr += sum;
 #endif // __ARM_NEON
                     // stride 4: step to the next window (single advance for all code paths)
                     r0 += 4;
                     r1 += 4;
                     r2 += 4;
                     r3 += 4;
                     outptr++;
                 }

                 // skip unused trailing columns plus three full rows to reach the next window row
                 r0 += tailstep;
                 r1 += tailstep;
                 r2 += tailstep;
                 r3 += tailstep;
             }

         }
     }

 }

--- a/src/layer/arm/convolution_5x5.h
+++ b/src/layer/arm/convolution_5x5.h
--- a/src/layer/arm/convolution_7x7.h
+++ b/src/layer/arm/convolution_7x7.h
--- a/src/layer/arm/convolution_arm.cpp
+++ b/src/layer/arm/convolution_arm.cpp
@@ -0,0 +1,120 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "convolution_arm.h"

 namespace ncnn {

 // The kernel headers are included inside namespace ncnn on purpose: they provide
 // the convNxNsM_neon functions referenced by the dispatch table in forward()
 // (convolution_4x4.h, for example, defines conv4x4s4_neon as a file-local static function).
 #include "convolution_1x1.h"
 #include "convolution_2x2.h"
 #include "convolution_3x3.h"
 #include "convolution_4x4.h"
 #include "convolution_5x5.h"
 #include "convolution_7x7.h"

 // NOTE(review): presumably emits the factory hook used to instantiate this layer;
 // confirm against the DEFINE_LAYER_CREATOR macro definition.
 DEFINE_LAYER_CREATOR(Convolution_arm)

 // Runs the convolution through a NEON-optimized kernel when one exists for the
 // layer's (kernel_size, stride) pair; otherwise defers to Convolution::forward.
 //
 // bottom_blob : input blob
 // top_blob    : output blob, created here as (outw, outh, num_output)
 // returns     : 0 on success, -100 if the padded input or the output blob is
 //               empty after allocation, or whatever Convolution::forward returns
 //               on the fallback path
 int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
 {
     // convolv with NxN kernel
     // value = value + bias

     // Anything outside the table's index range (including non-positive values,
     // which would index before the table) or with dilation goes to the generic path.
     if (kernel_size < 1 || kernel_size > 7 || stride < 1 || stride > 4 || dilation != 1)
     {
         return Convolution::forward(bottom_blob, top_blob);
     }

     typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&);

     // kernel_size x stride
     // A null entry means no optimized kernel exists for that combination.
     static const conv_func conv_func_table[7][4] =
     {
         {
             conv1x1s1_neon,
             conv1x1s2_neon,
             0,
             0
         }, // kernel_size = 1
         {
             conv2x2s1_neon,
             0,
             0,
             0
         }, // kernel_size = 2
         {
             conv3x3s1_neon,
             conv3x3s2_neon,
             0,
             0
         }, // kernel_size = 3
         {
             0,
             0,
             0,
             conv4x4s4_neon
         }, // kernel_size = 4
         {
             conv5x5s1_neon,
             conv5x5s2_neon,
             0,
             0
         }, // kernel_size = 5
         {
             0,
             0,
             0,
             0
         }, // kernel_size = 6
         {
             conv7x7s1_neon,
             conv7x7s2_neon,
             0,
             0
         }  // kernel_size = 7
     };

     conv_func conv = conv_func_table[kernel_size-1][stride-1];
     if (!conv)
     {
         return Convolution::forward(bottom_blob, top_blob);
     }

     int w = bottom_blob.w;
     int h = bottom_blob.h;

     // The kernels expect an already padded input, so add the zero border up front.
     Mat bottom_blob_bordered = bottom_blob;
     if (pad > 0)
     {
         copy_make_border(bottom_blob, bottom_blob_bordered, pad, pad, pad, pad, BORDER_CONSTANT, 0.f);
         if (bottom_blob_bordered.empty())
             return -100;

         w = bottom_blob_bordered.w;
         h = bottom_blob_bordered.h;
     }

     int outw = (w - kernel_size) / stride + 1;
     int outh = (h - kernel_size) / stride + 1;

     top_blob.create(outw, outh, num_output);
     if (top_blob.empty())
         return -100;

     conv(bottom_blob_bordered, top_blob, weight_data, bias_data);

     return 0;
 }

 } // namespace ncnn
--- a/src/layer/arm/convolution_arm.h
+++ b/src/layer/arm/convolution_arm.h
@@ -0,0 +1,30 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_CONVOLUTION_ARM_H
 #define LAYER_CONVOLUTION_ARM_H

 #include "convolution.h"

 namespace ncnn {

 // ARM variant of the convolution layer: overrides forward() to dispatch to
 // NEON kernels for supported kernel-size/stride pairs and falls back to the
 // base Convolution implementation otherwise.
 class Convolution_arm : public Convolution
 {
 public:
     // Single-input, single-output forward pass (parameter names match the definition).
     // Returns 0 on success and -100 when a required blob is empty after allocation.
     virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
 };

 } // namespace ncnn

 #endif // LAYER_CONVOLUTION_ARM_H
--- a/src/layer/arm/eltwise_arm.cpp
+++ b/src/layer/arm/eltwise_arm.cpp
@@ -0,0 +1,574 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "eltwise_arm.h"

 #if __ARM_NEON
 #include <arm_neon.h>
 #endif // __ARM_NEON

 namespace ncnn {

 // NOTE(review): presumably emits the factory hook used to instantiate this layer;
 // confirm against the DEFINE_LAYER_CREATOR macro definition.
 DEFINE_LAYER_CREATOR(Eltwise_arm)

 // Element-wise combination of all input blobs into top_blobs[0].
 //
 // Supported op_type values:
 //   Operation_PROD : out = b0 * b1 * b2 * ...
 //   Operation_SUM  : out = b0 + b1 + ...                        (num_coeff == 0)
 //                    out = b0*coeffs[0] + b1*coeffs[1] + ...    (otherwise)
 //   Operation_MAX  : out = max(b0, b1, ...)
 //
 // Each supported op first combines bottom_blobs[0] and bottom_blobs[1] into the
 // output, then folds every further blob into the output in place, so at least
 // two inputs are read. The output takes its shape from bottom_blobs[0].
 //
 // Every channel is processed four floats at a time on NEON builds (intrinsics on
 // AArch64, inline assembly on ARMv7) with a scalar loop for the leftover elements.
 // NOTE(review): the ARMv7 assembly uses ":128" address hints, so it assumes every
 // channel pointer is 16-byte aligned - confirm against Mat's allocation.
 //
 // Returns 0 on success, -100 if the output blob is empty after create().
 int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
 {
     const Mat& bottom_blob = bottom_blobs[0];
     int w = bottom_blob.w;
     int h = bottom_blob.h;
     int channels = bottom_blob.c;
     int size = w * h;

     Mat& top_blob = top_blobs[0];
     top_blob.create(w, h, channels);
     if (top_blob.empty())
         return -100;

     if (op_type == Operation_PROD)
     {
         // first blob
         const Mat& bottom_blob1 = bottom_blobs[1];
         #pragma omp parallel for
         for (int q=0; q<channels; q++)
         {
             const float* ptr = bottom_blob.channel(q);
             const float* ptr1 = bottom_blob1.channel(q);
             float* outptr = top_blob.channel(q);

 #if __ARM_NEON
             int nn = size >> 2;
             int remain = size - (nn << 2);
 #else
             int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
 #if __aarch64__
             for (; nn>0; nn--)
             {
                 float32x4_t _ptr = vld1q_f32(ptr);
                 float32x4_t _ptr1 = vld1q_f32(ptr1);
                 float32x4_t _p = vmulq_f32(_ptr, _ptr1);
                 vst1q_f32(outptr, _p);

                 ptr += 4;
                 ptr1 += 4;
                 outptr += 4;
             }
 #else
             if (nn > 0)
             {
             asm volatile(
                 "0:                             \n"
                 "pld        [%1, #128]          \n"
                 "pld        [%2, #128]          \n"
                 "vld1.f32   {d0-d1}, [%1 :128]! \n"
                 "vld1.f32   {d2-d3}, [%2 :128]! \n"
                 "vmul.f32   q0, q0, q1          \n"
                 "subs       %0, #1              \n"
                 "vst1.f32   {d0-d1}, [%3 :128]! \n"
                 "bne        0b                  \n"
                 : "=r"(nn),     // %0
                   "=r"(ptr),    // %1
                   "=r"(ptr1),   // %2
                   "=r"(outptr)  // %3
                 : "0"(nn),
                   "1"(ptr),
                   "2"(ptr1),
                   "3"(outptr)
                 : "cc", "memory", "q0", "q1"
             );
             }
 #endif // __aarch64__
 #endif // __ARM_NEON
             for (; remain>0; remain--)
             {
                 *outptr = *ptr * *ptr1;

                 ptr++;
                 ptr1++;
                 outptr++;
             }
         }

         // remaining blobs are multiplied into the output in place
         for (size_t b=2; b<bottom_blobs.size(); b++)
         {
             const Mat& bottom_blob1 = bottom_blobs[b];
             #pragma omp parallel for
             for (int q=0; q<channels; q++)
             {
                 const float* ptr = bottom_blob1.channel(q);
                 float* outptr = top_blob.channel(q);

 #if __ARM_NEON
                 int nn = size >> 2;
                 int remain = size - (nn << 2);
 #else
                 int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
 #if __aarch64__
                 for (; nn>0; nn--)
                 {
                     float32x4_t _ptr = vld1q_f32(ptr);
                     float32x4_t _p = vld1q_f32(outptr);
                     _p = vmulq_f32(_ptr, _p);
                     vst1q_f32(outptr, _p);

                     ptr += 4;
                     outptr += 4;
                 }
 #else
                 if (nn > 0)
                 {
                 asm volatile(
                     "0:                             \n"
                     "pld        [%1, #128]          \n"
                     "pld        [%2, #128]          \n"
                     "vld1.f32   {d0-d1}, [%1 :128]! \n"
                     "vld1.f32   {d2-d3}, [%2 :128]  \n"
                     "vmul.f32   q0, q0, q1          \n"
                     "subs       %0, #1              \n"
                     "vst1.f32   {d0-d1}, [%2 :128]! \n"
                     "bne        0b                  \n"
                     : "=r"(nn),     // %0
                       "=r"(ptr),    // %1
                       "=r"(outptr)  // %2
                     : "0"(nn),
                       "1"(ptr),
                       "2"(outptr)
                     : "cc", "memory", "q0", "q1"
                 );
                 }
 #endif // __aarch64__
 #endif // __ARM_NEON
                 for (; remain>0; remain--)
                 {
                     *outptr *= *ptr;

                     ptr++;
                     outptr++;
                 }
             }
         }
     }
     else if (op_type == Operation_SUM)
     {
         if (num_coeff == 0)
         {
             // first blob
             const Mat& bottom_blob1 = bottom_blobs[1];
             #pragma omp parallel for
             for (int q=0; q<channels; q++)
             {
                 const float* ptr = bottom_blob.channel(q);
                 const float* ptr1 = bottom_blob1.channel(q);
                 float* outptr = top_blob.channel(q);

 #if __ARM_NEON
                 int nn = size >> 2;
                 int remain = size - (nn << 2);
 #else
                 int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
 #if __aarch64__
                 for (; nn>0; nn--)
                 {
                     float32x4_t _ptr = vld1q_f32(ptr);
                     float32x4_t _ptr1 = vld1q_f32(ptr1);
                     float32x4_t _p = vaddq_f32(_ptr, _ptr1);
                     vst1q_f32(outptr, _p);

                     ptr += 4;
                     ptr1 += 4;
                     outptr += 4;
                 }
 #else
                 if (nn > 0)
                 {
                 asm volatile(
                     "0:                             \n"
                     "pld        [%1, #128]          \n"
                     "pld        [%2, #128]          \n"
                     "vld1.f32   {d0-d1}, [%1 :128]! \n"
                     "vld1.f32   {d2-d3}, [%2 :128]! \n"
                     "vadd.f32   q0, q0, q1          \n"
                     "subs       %0, #1              \n"
                     "vst1.f32   {d0-d1}, [%3 :128]! \n"
                     "bne        0b                  \n"
                     : "=r"(nn),     // %0
                       "=r"(ptr),    // %1
                       "=r"(ptr1),   // %2
                       "=r"(outptr)  // %3
                     : "0"(nn),
                       "1"(ptr),
                       "2"(ptr1),
                       "3"(outptr)
                     : "cc", "memory", "q0", "q1"
                 );
                 }
 #endif // __aarch64__
 #endif // __ARM_NEON
                 for (; remain>0; remain--)
                 {
                     *outptr = *ptr + *ptr1;

                     ptr++;
                     ptr1++;
                     outptr++;
                 }
             }

             // remaining blobs are added into the output in place
             for (size_t b=2; b<bottom_blobs.size(); b++)
             {
                 const Mat& bottom_blob1 = bottom_blobs[b];
                 #pragma omp parallel for
                 for (int q=0; q<channels; q++)
                 {
                     const float* ptr = bottom_blob1.channel(q);
                     float* outptr = top_blob.channel(q);

 #if __ARM_NEON
                     int nn = size >> 2;
                     int remain = size - (nn << 2);
 #else
                     int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
 #if __aarch64__
                     for (; nn>0; nn--)
                     {
                         float32x4_t _ptr = vld1q_f32(ptr);
                         float32x4_t _p = vld1q_f32(outptr);
                         _p = vaddq_f32(_ptr, _p);
                         vst1q_f32(outptr, _p);

                         ptr += 4;
                         outptr += 4;
                     }
 #else
                     if (nn > 0)
                     {
                     asm volatile(
                         "0:                             \n"
                         "pld        [%1, #128]          \n"
                         "pld        [%2, #128]          \n"
                         "vld1.f32   {d0-d1}, [%1 :128]! \n"
                         "vld1.f32   {d2-d3}, [%2 :128]  \n"
                         "vadd.f32   q0, q0, q1          \n"
                         "subs       %0, #1              \n"
                         "vst1.f32   {d0-d1}, [%2 :128]! \n"
                         "bne        0b                  \n"
                         : "=r"(nn),     // %0
                           "=r"(ptr),    // %1
                           "=r"(outptr)  // %2
                         : "0"(nn),
                           "1"(ptr),
                           "2"(outptr)
                         : "cc", "memory", "q0", "q1"
                     );
                     }
 #endif // __aarch64__
 #endif // __ARM_NEON
                     for (; remain>0; remain--)
                     {
                         *outptr += *ptr;

                         ptr++;
                         outptr++;
                     }
                 }
             }
         }
         else
         {
             const float* coeffs_ptr = coeffs;

             // first blob
             const Mat& bottom_blob1 = bottom_blobs[1];
             float coeff0 = coeffs_ptr[0];
             float coeff1 = coeffs_ptr[1];
             #pragma omp parallel for
             for (int q=0; q<channels; q++)
             {
                 const float* ptr = bottom_blob.channel(q);
                 const float* ptr1 = bottom_blob1.channel(q);
                 float* outptr = top_blob.channel(q);

 #if __ARM_NEON
                 int nn = size >> 2;
                 int remain = size - (nn << 2);
 #else
                 int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
                 float32x4_t _coeff0 = vdupq_n_f32(coeff0);
                 float32x4_t _coeff1 = vdupq_n_f32(coeff1);
 #if __aarch64__
                 for (; nn>0; nn--)
                 {
                     float32x4_t _ptr = vld1q_f32(ptr);
                     float32x4_t _ptr1 = vld1q_f32(ptr1);
                     float32x4_t _p = vmulq_f32(_ptr, _coeff0);
                     _p = vmlaq_f32(_p, _ptr1, _coeff1);
                     vst1q_f32(outptr, _p);

                     ptr += 4;
                     ptr1 += 4;
                     outptr += 4;
                 }
 #else
                 if (nn > 0)
                 {
                 asm volatile(
                     "0:                             \n"
                     "pld        [%1, #128]          \n"
                     "pld        [%2, #128]          \n"
                     "vld1.f32   {d0-d1}, [%1 :128]! \n"
                     "vld1.f32   {d2-d3}, [%2 :128]! \n"
                     "vmul.f32   q0, q0, %q8         \n"
                     "vmla.f32   q0, q1, %q9         \n"
                     "subs       %0, #1              \n"
                     "vst1.f32   {d0-d1}, [%3 :128]! \n"
                     "bne        0b                  \n"
                     : "=r"(nn),     // %0
                       "=r"(ptr),    // %1
                       "=r"(ptr1),   // %2
                       "=r"(outptr)  // %3
                     : "0"(nn),
                       "1"(ptr),
                       "2"(ptr1),
                       "3"(outptr),
                       "w"(_coeff0), // %8
                       "w"(_coeff1)  // %9
                     : "cc", "memory", "q0", "q1"
                 );
                 }
 #endif // __aarch64__
 #endif // __ARM_NEON
                 for (; remain>0; remain--)
                 {
                     *outptr = *ptr * coeff0 + *ptr1 * coeff1;

                     ptr++;
                     ptr1++;
                     outptr++;
                 }
             }

             // remaining blobs: out += blob * coeffs[b], in place
             for (size_t b=2; b<bottom_blobs.size(); b++)
             {
                 const Mat& bottom_blob1 = bottom_blobs[b];
                 float coeff = coeffs_ptr[b];
                 #pragma omp parallel for
                 for (int q=0; q<channels; q++)
                 {
                     const float* ptr = bottom_blob1.channel(q);
                     float* outptr = top_blob.channel(q);

 #if __ARM_NEON
                     int nn = size >> 2;
                     int remain = size - (nn << 2);
 #else
                     int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
                     float32x4_t _coeff = vdupq_n_f32(coeff);
 #if __aarch64__
                     for (; nn>0; nn--)
                     {
                         float32x4_t _ptr = vld1q_f32(ptr);
                         float32x4_t _p = vld1q_f32(outptr);
                         _p = vmlaq_f32(_p, _ptr, _coeff);
                         vst1q_f32(outptr, _p);

                         ptr += 4;
                         outptr += 4;
                     }
 #else
                     if (nn > 0)
                     {
                     // q0 = input, q1 = current output; the multiply-accumulate result
                     // lands in q1, so q1 (d2-d3) is the register that must be stored.
                     asm volatile(
                         "0:                             \n"
                         "pld        [%1, #128]          \n"
                         "pld        [%2, #128]          \n"
                         "vld1.f32   {d0-d1}, [%1 :128]! \n"
                         "vld1.f32   {d2-d3}, [%2 :128]  \n"
                         "vmla.f32   q1, q0, %q6         \n"
                         "subs       %0, #1              \n"
                         "vst1.f32   {d2-d3}, [%2 :128]! \n"
                         "bne        0b                  \n"
                         : "=r"(nn),     // %0
                           "=r"(ptr),    // %1
                           "=r"(outptr)  // %2
                         : "0"(nn),
                           "1"(ptr),
                           "2"(outptr),
                           "w"(_coeff)   // %6
                         : "cc", "memory", "q0", "q1"
                     );
                     }
 #endif // __aarch64__
 #endif // __ARM_NEON
                     for (; remain>0; remain--)
                     {
                         *outptr += *ptr * coeff;

                         ptr++;
                         outptr++;
                     }
                 }
             }
         }
     }
     else if (op_type == Operation_MAX)
     {
         // first blob
         const Mat& bottom_blob1 = bottom_blobs[1];
         #pragma omp parallel for
         for (int q=0; q<channels; q++)
         {
             const float* ptr = bottom_blob.channel(q);
             const float* ptr1 = bottom_blob1.channel(q);
             float* outptr = top_blob.channel(q);

 #if __ARM_NEON
             int nn = size >> 2;
             int remain = size - (nn << 2);
 #else
             int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
 #if __aarch64__
             for (; nn>0; nn--)
             {
                 float32x4_t _ptr = vld1q_f32(ptr);
                 float32x4_t _ptr1 = vld1q_f32(ptr1);
                 float32x4_t _p = vmaxq_f32(_ptr, _ptr1);
                 vst1q_f32(outptr, _p);

                 ptr += 4;
                 ptr1 += 4;
                 outptr += 4;
             }
 #else
             if (nn > 0)
             {
             asm volatile(
                 "0:                             \n"
                 "pld        [%1, #128]          \n"
                 "pld        [%2, #128]          \n"
                 "vld1.f32   {d0-d1}, [%1 :128]! \n"
                 "vld1.f32   {d2-d3}, [%2 :128]! \n"
                 "vmax.f32   q0, q0, q1          \n"
                 "subs       %0, #1              \n"
                 "vst1.f32   {d0-d1}, [%3 :128]! \n"
                 "bne        0b                  \n"
                 : "=r"(nn),     // %0
                   "=r"(ptr),    // %1
                   "=r"(ptr1),   // %2
                   "=r"(outptr)  // %3
                 : "0"(nn),
                   "1"(ptr),
                   "2"(ptr1),
                   "3"(outptr)
                 : "cc", "memory", "q0", "q1"
             );
             }
 #endif // __aarch64__
 #endif // __ARM_NEON
             for (; remain>0; remain--)
             {
                 *outptr = std::max(*ptr, *ptr1);

                 ptr++;
                 ptr1++;
                 outptr++;
             }
         }

         // remaining blobs: out = max(out, blob), in place
         for (size_t b=2; b<bottom_blobs.size(); b++)
         {
             const Mat& bottom_blob1 = bottom_blobs[b];
             #pragma omp parallel for
             for (int q=0; q<channels; q++)
             {
                 const float* ptr = bottom_blob1.channel(q);
                 float* outptr = top_blob.channel(q);

 #if __ARM_NEON
                 int nn = size >> 2;
                 int remain = size - (nn << 2);
 #else
                 int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
 #if __aarch64__
                 for (; nn>0; nn--)
                 {
                     float32x4_t _ptr = vld1q_f32(ptr);
                     float32x4_t _p = vld1q_f32(outptr);
                     _p = vmaxq_f32(_ptr, _p);
                     vst1q_f32(outptr, _p);

                     ptr += 4;
                     outptr += 4;
                 }
 #else
                 if (nn > 0)
                 {
                 asm volatile(
                     "0:                             \n"
                     "pld        [%1, #128]          \n"
                     "pld        [%2, #128]          \n"
                     "vld1.f32   {d0-d1}, [%1 :128]! \n"
                     "vld1.f32   {d2-d3}, [%2 :128]  \n"
                     "vmax.f32   q0, q0, q1          \n"
                     "subs       %0, #1              \n"
                     "vst1.f32   {d0-d1}, [%2 :128]! \n"
                     "bne        0b                  \n"
                     : "=r"(nn),     // %0
                       "=r"(ptr),    // %1
                       "=r"(outptr)  // %2
                     : "0"(nn),
                       "1"(ptr),
                       "2"(outptr)
                     : "cc", "memory", "q0", "q1"
                 );
                 }
 #endif // __aarch64__
 #endif // __ARM_NEON
                 for (; remain>0; remain--)
                 {
                     *outptr = std::max(*ptr, *outptr);

                     ptr++;
                     outptr++;
                 }
             }
         }
     }

     return 0;
 }

 } // namespace ncnn
--- a/src/layer/arm/eltwise_arm.h
+++ b/src/layer/arm/eltwise_arm.h
@@ -0,0 +1,30 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_ELTWISE_ARM_H
 #define LAYER_ELTWISE_ARM_H

 #include "eltwise.h"

 namespace ncnn {

 // ARM variant of the Eltwise layer declared in eltwise.h.
 // It only re-declares forward(); everything else is inherited from Eltwise.
 class Eltwise_arm : public Eltwise
 {
 public:
    // Multi-input forward pass: reads bottom_blobs, writes top_blobs.
    // Declared here only; the definition lives outside this header.
    // NOTE(review): presumably overrides a virtual Eltwise::forward with the
    // same signature and returns 0 on success - confirm against eltwise.h.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
 };

 } // namespace ncnn

 #endif // LAYER_ELTWISE_ARM_H
--- a/src/layer/arm/innerproduct_arm.cpp
+++ b/src/layer/arm/innerproduct_arm.cpp
@@ -0,0 +1,136 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "innerproduct_arm.h"

 #if __ARM_NEON
 #include <arm_neon.h>
 #endif // __ARM_NEON

 namespace ncnn {

 DEFINE_LAYER_CREATOR(InnerProduct_arm)

 // Fully connected (inner product) forward pass with NEON acceleration.
 //
 // For every output index p this accumulates the dot product of all
 // channels * w * h input values with the p-th run of channels * w * h
 // weights in weight_data. The accumulator starts at bias_data[p] when
 // bias_term is set and at 0 otherwise.
 //
 // bottom_blob: input blob, read channel by channel.
 // top_blob:    (re)created here as a 1 x 1 x num_output blob; one value is
 //              written per channel.
 // Returns 0 on success, or -100 if top_blob is empty after create().
 int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
 {
    int channels = bottom_blob.c;
    int size = bottom_blob.w * bottom_blob.h;

    top_blob.create(1, 1, num_output);
    if (top_blob.empty())
        return -100;

    // num_output
    const float* weight_data_ptr = weight_data;
    #pragma omp parallel for
    for (int p=0; p<num_output; p++)
    {
        float* outptr = top_blob.channel(p);
        float sum = 0.f;

        if (bias_term)
            sum = bias_data.data[p];

        // weights of output p; this pointer keeps advancing across all input
        // channels, so it is deliberately declared outside the channel loop
        const float* wptr = weight_data_ptr + size * channels * p;

 #if __ARM_NEON
        // two independent vector accumulators, reduced after the channel loop
        float32x4_t _sum = vdupq_n_f32(0.f);
        float32x4_t _sum2 = vdupq_n_f32(0.f);
 #endif // __ARM_NEON

        // channels
        for (int q=0; q<channels; q++)
        {
            const float* m = bottom_blob.channel(q);

 #if __ARM_NEON
            // nn blocks of 8 floats go through NEON, the rest through the scalar tail
            int nn = size >> 3;
            int remain = size & 7;
 #else
            int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
 #if __aarch64__
            for (; nn>0; nn--)
            {
                float32x4_t _m = vld1q_f32(m);
                float32x4_t _w = vld1q_f32(wptr);
                _sum = vfmaq_f32(_sum, _m, _w);

                _m = vld1q_f32(m + 4);
                _w = vld1q_f32(wptr + 4);
                _sum2 = vfmaq_f32(_sum2, _m, _w);

                m += 8;
                wptr += 8;
            }
 #else
            // the asm loop decrements before it tests, so it must not be
            // entered with nn == 0
            if (nn > 0)
            {
            // NOTE(review): the ":128" qualifier on %1 asserts a 16-byte
            // aligned input pointer - confirm Mat::channel() guarantees that.
            asm volatile(
                "0:                             \n"
                "pld        [%1, #256]          \n"
                "vld1.f32   {d0-d3}, [%1 :128]! \n"
                "pld        [%2, #256]          \n"
                "vld1.f32   {d4-d7}, [%2]!      \n"
                "vmla.f32   %q3, q0, q2         \n"
                "subs       %0, #1              \n"
                "vmla.f32   %q4, q1, q3         \n"
                "bne        0b                  \n"
                : "=r"(nn),     // %0
                  "=r"(m),      // %1
                  "=r"(wptr),   // %2
                  "=w"(_sum),   // %3
                  "=w"(_sum2)   // %4
                : "0"(nn),
                  "1"(m),
                  "2"(wptr),
                  "3"(_sum),
                  "4"(_sum2)
                : "cc", "memory", "q0", "q1", "q2", "q3"
            );
            }
 #endif // __aarch64__
 #endif // __ARM_NEON
            for (; remain>0; remain--)
            {
                sum += *m * *wptr;

                m++;
                wptr++;
            }
        }

 #if __ARM_NEON
        // horizontal reduction of the two vector accumulators into the scalar sum
        _sum = vaddq_f32(_sum, _sum2);
 #if __aarch64__
        sum += vaddvq_f32(_sum);
 #else
        float32x2_t _sumss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
        _sumss = vpadd_f32(_sumss, _sumss);
        sum += vget_lane_f32(_sumss, 0);
 #endif // __aarch64__
 #endif // __ARM_NEON

        outptr[0] = sum;
    }

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/arm/innerproduct_arm.h
+++ b/src/layer/arm/innerproduct_arm.h
@@ -0,0 +1,30 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_INNERPRODUCT_ARM_H
 #define LAYER_INNERPRODUCT_ARM_H

 #include "innerproduct.h"

 namespace ncnn {

 // ARM variant of the InnerProduct layer declared in innerproduct.h.
 // It only re-declares forward(); everything else is inherited from InnerProduct.
 class InnerProduct_arm : public InnerProduct
 {
 public:
    // Single-input forward pass: reads bottom_blob, writes top_blob.
    // Declared here only; the definition lives outside this header.
    // NOTE(review): presumably overrides a virtual InnerProduct::forward with
    // the same signature - confirm against innerproduct.h.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
 };

 } // namespace ncnn

 #endif // LAYER_INNERPRODUCT_ARM_H
--- a/src/layer/arm/lrn_arm.cpp
+++ b/src/layer/arm/lrn_arm.cpp
@@ -0,0 +1,227 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "lrn_arm.h"
 #include <math.h>

 #if __ARM_NEON
 #include <arm_neon.h>
 #include "neon_mathfun.h"
 #endif // __ARM_NEON

 namespace ncnn {

 DEFINE_LAYER_CREATOR(LRN_arm)

 // Local response normalization, in place, with NEON acceleration.
 //
 // Every value x becomes x * (1 + alpha / local_size * S)^(-beta), where S is
 // a sum of squared inputs over a window selected by region_type:
 //   NormRegion_ACROSS_CHANNELS  channels q - local_size/2 .. q + local_size/2
 //                               at the same position; channels outside the
 //                               blob are skipped
 //   NormRegion_WITHIN_CHANNEL   a local_size x local_size spatial window in
 //                               the same channel, zero padded at the borders
 // Any other region_type leaves the blob untouched.
 //
 // bottom_top_blob: input, overwritten with the normalized values.
 // Returns 0 on success, -100 when a temporary blob could not be allocated.
 int LRN_arm::forward_inplace(Mat& bottom_top_blob) const
 {
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    // element-wise squares of the input, same shape as the input
    Mat square_blob;
    square_blob.create(w, h, channels);
    if (square_blob.empty())
        return -100;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        const float* ptr = bottom_top_blob.channel(q);
        float* outptr = square_blob.channel(q);

 #if __ARM_NEON
        // nn blocks of 4 floats go through NEON, the rest through the scalar tail
        int nn = size >> 2;
        int remain = size - (nn << 2);
 #else
        int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
        for (; nn>0; nn--)
        {
            float32x4_t _p = vld1q_f32(ptr);
            float32x4_t _outp = vmulq_f32(_p, _p);
            vst1q_f32(outptr, _outp);

            ptr += 4;
            outptr += 4;
        }
 #endif // __ARM_NEON
        for (; remain>0; remain--)
        {
            *outptr = *ptr * *ptr;

            ptr++;
            outptr++;
        }
    }

    float alpha_div_size = alpha / local_size;

    if (region_type == NormRegion_ACROSS_CHANNELS)
    {
        // per-channel accumulator for the windowed sum of squares
        Mat square_sum;
        square_sum.create(w, h, channels);
        if (square_sum.empty())
            return -100;
        square_sum.fill(0.f);

        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            // square sum over the channel window centred on q
            // (the upper bound used to be q + local_size, which pulled in
            // channels beyond the window)
            for (int p=q - local_size / 2; p<=q + local_size / 2; p++)
            {
                if (p < 0 || p >= channels)
                    continue;

                const float* sptr = square_blob.channel(p);
                float* ssptr = square_sum.channel(q);

 #if __ARM_NEON
                int nn = size >> 2;
                int remain = size - (nn << 2);
 #else
                int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
                for (; nn>0; nn--)
                {
                    float32x4_t _sp = vld1q_f32(sptr);
                    float32x4_t _ssp = vld1q_f32(ssptr);
                    _ssp = vaddq_f32(_ssp, _sp);
                    vst1q_f32(ssptr, _ssp);

                    sptr += 4;
                    ssptr += 4;
                }
 #endif // __ARM_NEON
                for (; remain>0; remain--)
                {
                    *ssptr += *sptr;
                    sptr++;
                    ssptr++;
                }
            }

            float* ptr = bottom_top_blob.channel(q);
            float* ssptr = square_sum.channel(q);

 #if __ARM_NEON
            int nn = size >> 2;
            int remain = size - (nn << 2);
 #else
            int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
            // x * (1 + alpha/local_size * sum)^(-beta), four values at a time;
            // pow_ps is the exp/log based approximation from neon_mathfun.h
            float32x4_t _v1 = vdupq_n_f32(1.f);
            float32x4_t _ads = vdupq_n_f32(alpha_div_size);
            float32x4_t _mb = vdupq_n_f32(-beta);
            for (; nn>0; nn--)
            {
                float32x4_t _p = vld1q_f32(ptr);
                float32x4_t _ssp = vld1q_f32(ssptr);
                _ssp = vmulq_f32(_ssp, _ads);
                _ssp = vaddq_f32(_ssp, _v1);
                _ssp = pow_ps(_ssp, _mb);
                _p = vmulq_f32(_p, _ssp);
                vst1q_f32(ptr, _p);

                ssptr += 4;
                ptr += 4;
            }
 #endif // __ARM_NEON
            for (; remain>0; remain--)
            {
                *ptr = *ptr * pow(1.f + alpha_div_size * *ssptr, -beta);

                ssptr++;
                ptr++;
            }
        }
    }
    else if (region_type == NormRegion_WITHIN_CHANNEL)
    {
        int outw = w;
        int outh = h;

        // zero-pad the squares so every output position has a full window;
        // from here on w/h describe the padded blob
        Mat square_blob_bordered = square_blob;
        int pad = local_size / 2;
        if (pad > 0)
        {
            copy_make_border(square_blob, square_blob_bordered, pad, local_size - pad - 1, pad, local_size - pad - 1, BORDER_CONSTANT, 0.f);
            if (square_blob_bordered.empty())
                return -100;

            w = square_blob_bordered.w;
            h = square_blob_bordered.h;
        }

        const int maxk = local_size * local_size;

        // norm window offsets, relative to the window's top-left element
        std::vector<int> _space_ofs(maxk);
        int* space_ofs = &_space_ofs[0];
        {
            int p1 = 0;
            int p2 = 0;
            int gap = w - local_size;
            for (int i = 0; i < local_size; i++)
            {
                for (int j = 0; j < local_size; j++)
                {
                    space_ofs[p1] = p2;
                    p1++;
                    p2++;
                }
                p2 += gap;
            }
        }

        // NOTE(review): the window sums local_size * local_size values but is
        // scaled by alpha / local_size - confirm against the reference LRN
        // definition and the generic LRN implementation.
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);
            const float* sptr = square_blob_bordered.channel(q);

            for (int i = 0; i < outh; i++)
            {
                for (int j = 0; j < outw; j++)
                {
                    float ss = 0.f;

                    // the window for output column j starts at padded column j;
                    // without the "+ j" every column reused the left-most window
                    for (int k = 0; k < maxk; k++)
                    {
                        float val = sptr[ j + space_ofs[k] ];
                        ss += val;
                    }

                    ptr[j] = ptr[j] * pow(1.f + alpha_div_size * ss, -beta);
                }

                ptr += outw;
                sptr += w;
            }
        }
    }

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/arm/lrn_arm.h
+++ b/src/layer/arm/lrn_arm.h
@@ -0,0 +1,30 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_LRN_ARM_H
 #define LAYER_LRN_ARM_H

 #include "lrn.h"

 namespace ncnn {

 // ARM variant of the LRN layer declared in lrn.h.
 // It only re-declares forward_inplace(); everything else is inherited from LRN.
 class LRN_arm : public LRN
 {
 public:
    // In-place forward pass: bottom_top_blob is both input and output.
    // Declared here only; the definition lives outside this header.
    // NOTE(review): presumably overrides a virtual LRN::forward_inplace with
    // the same signature - confirm against lrn.h.
    virtual int forward_inplace(Mat& bottom_top_blob) const;
 };

 } // namespace ncnn

 #endif // LAYER_LRN_ARM_H
--- a/src/layer/arm/neon_mathfun.h
+++ b/src/layer/arm/neon_mathfun.h
@@ -0,0 +1,316 @@
 /* NEON implementation of sin, cos, exp and log
 *
 *   Inspired by Intel Approximate Math library, and based on the
 *   corresponding algorithms of the cephes math library
 */

 /* Copyright (C) 2011  Julien Pommier
 *
 *  This software is provided 'as-is', without any express or implied
 *  warranty.  In no event will the authors be held liable for any damages
 *  arising from the use of this software.
 *
 *  Permission is granted to anyone to use this software for any purpose,
 *  including commercial applications, and to alter it and redistribute it
 *  freely, subject to the following restrictions:
 *
 *  1. The origin of this software must not be misrepresented; you must not
 *     claim that you wrote the original software. If you use this software
 *     in a product, an acknowledgment in the product documentation would be
 *     appreciated but is not required.
 *  2. Altered source versions must be plainly marked as such, and must not be
 *     misrepresented as being the original software.
 *  3. This notice may not be removed or altered from any source distribution.
 *
 *  (this is the zlib license)
 */

 #include <arm_neon.h>

 /* Constants used by log_ps. */
 /* all bits set except bits 23..30; AND-ing with it clears those bits */
 #define c_inv_mant_mask ~0x7f800000u
 /* threshold the mantissa is compared against in log_ps (about sqrt(0.5)) */
 #define c_cephes_SQRTHF 0.707106781186547524
 /* polynomial coefficients, applied from p0 down to p8 in log_ps */
 #define c_cephes_log_p0 7.0376836292E-2
 #define c_cephes_log_p1 - 1.1514610310E-1
 #define c_cephes_log_p2 1.1676998740E-1
 #define c_cephes_log_p3 - 1.2420140846E-1
 #define c_cephes_log_p4 + 1.4249322787E-1
 #define c_cephes_log_p5 - 1.6668057665E-1
 #define c_cephes_log_p6 + 2.0000714765E-1
 #define c_cephes_log_p7 - 2.4999993993E-1
 #define c_cephes_log_p8 + 3.3333331174E-1
 /* q1 + q2 is approximately ln(2); log_ps multiplies each by the exponent separately */
 #define c_cephes_log_q1 -2.12194440e-4
 #define c_cephes_log_q2 0.693359375

 /* Natural logarithm of four packed floats.
 *   Lanes whose input is <= 0 are returned with all bits set (a NaN pattern).
 */
 static inline float32x4_t log_ps(float32x4_t x)
 {
    const float32x4_t v_one = vdupq_n_f32(1);

    /* clamp negative lanes to zero, then remember which lanes are <= 0 */
    x = vmaxq_f32(x, vdupq_n_f32(0));
    const uint32x4_t nonpositive = vcleq_f32(x, vdupq_n_f32(0));

    int32x4_t bits = vreinterpretq_s32_f32(x);

    /* raw biased exponent field */
    int32x4_t expo = vshrq_n_s32(bits, 23);

    /* keep only the fractional part and give it the exponent of 0.5 */
    bits = vandq_s32(bits, vdupq_n_s32(c_inv_mant_mask));
    bits = vorrq_s32(bits, vreinterpretq_s32_f32(vdupq_n_f32(0.5f)));
    x = vreinterpretq_f32_s32(bits);

    /* unbiased exponent as float, plus one */
    expo = vsubq_s32(expo, vdupq_n_s32(0x7f));
    float32x4_t e = vaddq_f32(vcvtq_f32_s32(expo), v_one);

    /* if (x < SQRTHF) { e -= 1; x = x + x - 1.0; } else { x = x - 1.0; }
     * done branch-free: the compare mask zeroes the extra terms in lanes
     * where the condition is false
     */
    const uint32x4_t below = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF));
    const float32x4_t x_if_below = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), below));
    const float32x4_t one_if_below = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v_one), below));
    x = vsubq_f32(x, v_one);
    e = vsubq_f32(e, one_if_below);
    x = vaddq_f32(x, x_if_below);

    const float32x4_t z = vmulq_f32(x, x);

    /* polynomial in x: multiply, then add the next coefficient */
    static const float log_poly[9] = { c_cephes_log_p0, c_cephes_log_p1, c_cephes_log_p2, c_cephes_log_p3, c_cephes_log_p4, c_cephes_log_p5, c_cephes_log_p6, c_cephes_log_p7, c_cephes_log_p8 };
    float32x4_t y = vdupq_n_f32(log_poly[0]);
    for (int i = 1; i < 9; i++)
    {
        y = vmulq_f32(y, x);
        y = vaddq_f32(y, vdupq_n_f32(log_poly[i]));
    }
    y = vmulq_f32(y, x);
    y = vmulq_f32(y, z);

    /* y += e * q1;  y -= 0.5 * z */
    y = vaddq_f32(y, vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1)));
    y = vsubq_f32(y, vmulq_f32(z, vdupq_n_f32(0.5f)));

    /* result = x + y + e * q2 */
    const float32x4_t e_q2 = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2));
    x = vaddq_f32(x, y);
    x = vaddq_f32(x, e_q2);

    /* lanes flagged as <= 0 get every bit set */
    return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), nonpositive));
 }

 /* Constants used by exp_ps. */
 /* the argument of exp_ps is clamped to [c_exp_lo, c_exp_hi] */
 #define c_exp_hi 88.3762626647949f
 #define c_exp_lo -88.3762626647949f

 /* log2(e) */
 #define c_cephes_LOG2EF 1.44269504088896341
 /* C1 + C2 is approximately ln(2); exp_ps subtracts n*C1 and n*C2 separately */
 #define c_cephes_exp_C1 0.693359375
 #define c_cephes_exp_C2 -2.12194440e-4

 /* polynomial coefficients, applied from p0 down to p5 in exp_ps */
 #define c_cephes_exp_p0 1.9875691500E-4
 #define c_cephes_exp_p1 1.3981999507E-3
 #define c_cephes_exp_p2 8.3334519073E-3
 #define c_cephes_exp_p3 4.1665795894E-2
 #define c_cephes_exp_p4 1.6666665459E-1
 #define c_cephes_exp_p5 5.0000001201E-1

 /* exp() of four packed floats; the argument is clamped to [c_exp_lo, c_exp_hi] first */
 static inline float32x4_t exp_ps(float32x4_t x)
 {
    const float32x4_t v_one = vdupq_n_f32(1);

    x = vminq_f32(x, vdupq_n_f32(c_exp_hi));
    x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo));

    /* express exp(x) as exp(g + n*log(2)): n = floor(x * log2(e) + 0.5) */
    const float32x4_t scaled = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF));

    /* floorf: truncate through an integer round trip, then subtract 1 in
     * the lanes where truncation ended up greater than the input
     */
    const float32x4_t truncated = vcvtq_f32_s32(vcvtq_s32_f32(scaled));
    const uint32x4_t too_big = vcgtq_f32(truncated, scaled);
    const uint32x4_t one_if_too_big = vandq_u32(too_big, vreinterpretq_u32_f32(v_one));
    const float32x4_t n = vsubq_f32(truncated, vreinterpretq_f32_u32(one_if_too_big));

    /* g = x - n*C1 - n*C2 */
    const float32x4_t n_c1 = vmulq_f32(n, vdupq_n_f32(c_cephes_exp_C1));
    const float32x4_t n_c2 = vmulq_f32(n, vdupq_n_f32(c_cephes_exp_C2));
    x = vsubq_f32(x, n_c1);
    x = vsubq_f32(x, n_c2);

    /* polynomial in g: multiply, then add the next coefficient */
    static const float exp_poly[6] = { c_cephes_exp_p0, c_cephes_exp_p1, c_cephes_exp_p2, c_cephes_exp_p3, c_cephes_exp_p4, c_cephes_exp_p5 };
    float32x4_t y = vld1q_dup_f32(exp_poly);
    for (int i = 1; i < 6; i++)
    {
        y = vmulq_f32(y, x);
        y = vaddq_f32(y, vld1q_dup_f32(exp_poly + i));
    }

    /* y = y * g^2 + g + 1 */
    const float32x4_t g2 = vmulq_f32(x, x);
    y = vmulq_f32(y, g2);
    y = vaddq_f32(y, x);
    y = vaddq_f32(y, v_one);

    /* build 2^n by writing the biased exponent straight into the float bits */
    const int32x4_t biased = vaddq_s32(vcvtq_s32_f32(n), vdupq_n_s32(0x7f));
    const float32x4_t pow2n = vreinterpretq_f32_s32(vshlq_n_s32(biased, 23));

    return vmulq_f32(y, pow2n);
 }

 /* Constants used by sincos_ps. */
 /* DP1 + DP2 + DP3 is approximately -Pi/4, split into three parts that
  * sincos_ps applies one after another
  */
 #define c_minus_cephes_DP1 -0.78515625
 #define c_minus_cephes_DP2 -2.4187564849853515625e-4
 #define c_minus_cephes_DP3 -3.77489497744594108e-8
 /* coefficients of the polynomial evaluated into y2 in sincos_ps */
 #define c_sincof_p0 -1.9515295891E-4
 #define c_sincof_p1  8.3321608736E-3
 #define c_sincof_p2 -1.6666654611E-1
 /* coefficients of the polynomial evaluated into y1 in sincos_ps */
 #define c_coscof_p0  2.443315711809948E-005
 #define c_coscof_p1 -1.388731625493765E-003
 #define c_coscof_p2  4.166664568298827E-002
 #define c_cephes_FOPI 1.27323954473516 // 4 / M_PI

 /* evaluation of 4 sines & cosines at once.
 *
 *   x     four input angles
 *   ysin  receives the four sines
 *   ycos  receives the four cosines
 *
 *   The code is the exact rewriting of the cephes sinf function.
 *   Precision is excellent as long as x < 8192 (I did not bother to
 *   take into account the special handling they have for greater values
 *   -- it does not return garbage for arguments over 8192, though, but
 *   the extra precision is missing).
 *
 *   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
 *   surprising but correct result.
 *
 *   Note also that when you compute sin(x), cos(x) is available at
 *   almost no extra price so both sin_ps and cos_ps make use of
 *   sincos_ps.
 */
 static inline void sincos_ps(float32x4_t x, float32x4_t *ysin, float32x4_t *ycos)
 {
    // any x
    float32x4_t xmm1, xmm2, xmm3, y;

    uint32x4_t emm2;

    /* the sine starts out negated in lanes where x < 0; the rest of the
     * computation works on |x|
     */
    uint32x4_t sign_mask_sin, sign_mask_cos;
    sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0));
    x = vabsq_f32(x);

    /* scale by 4/Pi */
    y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI));

    /* store the integer part of y in emm2 */
    emm2 = vcvtq_u32_f32(y);
    /* j=(j+1) & (~1) (see the cephes sources) */
    emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
    emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
    y = vcvtq_f32_u32(emm2);

    /* get the polynomial selection mask
     *     there is one polynomial for 0 <= x <= Pi/4
     *     and another one for Pi/4<x<=Pi/2
     *
     *     Both branches will be computed.
     */
    uint32x4_t poly_mask = vtstq_u32(emm2, vdupq_n_u32(2));

    /* The magic pass: "Extended precision modular arithmetic"
     *     x = ((x - y * DP1) - y * DP2) - y * DP3;
     *     (the constants are stored negated, hence the additions below) */
    xmm1 = vmulq_n_f32(y, c_minus_cephes_DP1);
    xmm2 = vmulq_n_f32(y, c_minus_cephes_DP2);
    xmm3 = vmulq_n_f32(y, c_minus_cephes_DP3);
    x = vaddq_f32(x, xmm1);
    x = vaddq_f32(x, xmm2);
    x = vaddq_f32(x, xmm3);

    /* sign masks: the sine sign flips where bit 2 of j is set; the cosine
     * mask tests bit 2 of (j - 2)
     */
    sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, vdupq_n_u32(4)));
    sign_mask_cos = vtstq_u32(vsubq_u32(emm2, vdupq_n_u32(2)), vdupq_n_u32(4));

    /* Evaluate the coscof polynomial in y1 and the sincof polynomial in y2;
     *     the two evaluations are interleaved statement by statement */
    float32x4_t z = vmulq_f32(x,x);
    float32x4_t y1, y2;

    y1 = vmulq_n_f32(z, c_coscof_p0);
    y2 = vmulq_n_f32(z, c_sincof_p0);
    y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p1));
    y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p1));
    y1 = vmulq_f32(y1, z);
    y2 = vmulq_f32(y2, z);
    y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p2));
    y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p2));
    y1 = vmulq_f32(y1, z);
    y2 = vmulq_f32(y2, z);
    y1 = vmulq_f32(y1, z);
    y2 = vmulq_f32(y2, x);
    y1 = vsubq_f32(y1, vmulq_f32(z, vdupq_n_f32(0.5f)));
    y2 = vaddq_f32(y2, x);
    y1 = vaddq_f32(y1, vdupq_n_f32(1));

    /* select the correct result from the two polynomials, then apply the
     * sign masks: the sine is negated where its mask is set, the cosine is
     * negated where its mask is clear
     */
    float32x4_t ys = vbslq_f32(poly_mask, y1, y2);
    float32x4_t yc = vbslq_f32(poly_mask, y2, y1);
    *ysin = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
    *ycos = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));
 }

 /* Sine of four packed floats; runs sincos_ps and drops the cosine half. */
 static inline float32x4_t sin_ps(float32x4_t x)
 {
    float32x4_t sine;
    float32x4_t cosine_unused;
    sincos_ps(x, &sine, &cosine_unused);
    return sine;
 }

 /* Cosine of four packed floats; runs sincos_ps and drops the sine half. */
 static inline float32x4_t cos_ps(float32x4_t x)
 {
    float32x4_t sine_unused;
    float32x4_t cosine;
    sincos_ps(x, &sine_unused, &cosine);
    return cosine;
 }

 /* Approximate a / b for four packed floats: a is multiplied by a reciprocal
 * estimate of b that is refined with a single vrecps step.
 */
 static inline float32x4_t div_ps(float32x4_t a, float32x4_t b)
 {
    const float32x4_t estimate = vrecpeq_f32(b);
    const float32x4_t refined = vmulq_f32(vrecpsq_f32(b, estimate), estimate);
    return vmulq_f32(a, refined);
 }

 /* a raised to the power b for four packed floats, via pow(x, m) = exp(m * log(x)). */
 static inline float32x4_t pow_ps(float32x4_t a, float32x4_t b)
 {
    const float32x4_t ln_a = log_ps(a);
    const float32x4_t exponent = vmulq_f32(b, ln_a);
    return exp_ps(exponent);
 }
--- a/src/layer/arm/pooling_2x2.h
+++ b/src/layer/arm/pooling_2x2.h
@@ -0,0 +1,112 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #if __ARM_NEON
 #include <arm_neon.h>
 #endif // __ARM_NEON

 // 2x2 max pooling with stride 2.
 //
 // bottom_blob: input; two adjacent rows are read per output row.
 // top_blob:    output; it is not allocated here - its w/h are read to size
 //              the loops and the results are written through channel(q).
 // One output channel is produced per input channel.
 static void pooling2x2s2_max_neon(const Mat& bottom_blob, Mat& top_blob)
 {
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;

    // After one output row the row pointers have advanced by 2*outw floats.
    // tailstep skips the unused remainder of that row plus the whole second
    // row of the pair; it equals w when w == 2*outw.
    const int tailstep = w - 2*outw + w;

    #pragma omp parallel for
    for (int q=0; q<inch; q++)
    {
        const float* img0 = bottom_blob.channel(q);
        float* outptr = top_blob.channel(q);

        const float* r0 = img0;
        const float* r1 = img0 + w;

        for (int i = 0; i < outh; i++)
        {
 #if __ARM_NEON
            // nn blocks of 4 outputs go through NEON, the rest through the scalar tail
            int nn = outw >> 2;
            int remain = outw - (nn << 2);
 #else
            int remain = outw;
 #endif // __ARM_NEON

 #if __ARM_NEON
 #if __aarch64__
            for (; nn>0; nn--)
            {
                float32x4_t _r00 = vld1q_f32(r0);
                float32x4_t _r10 = vld1q_f32(r1);
                float32x4_t _r01 = vld1q_f32(r0 + 4);
                float32x4_t _r11 = vld1q_f32(r1 + 4);

                // vertical max of the two rows, then pairwise max of neighbours
                float32x4_t _max0 = vmaxq_f32(_r00, _r10);
                float32x4_t _max1 = vmaxq_f32(_r01, _r11);

                float32x4_t _max = vpmaxq_f32(_max0, _max1);

                vst1q_f32(outptr, _max);

                r0 += 8;
                r1 += 8;
                outptr += 4;
            }
 #else
            // the asm loop decrements before it tests, so it must not be
            // entered with nn == 0
            if (nn > 0)
            {
            asm volatile(
                "0:                             \n"
                "pld        [%1, #256]          \n"
                "pld        [%2, #256]          \n"
                "vld1.f32   {d0-d3}, [%1]!      \n"
                "vld1.f32   {d4-d7}, [%2]!      \n"
                "vmax.f32   q0, q0, q2          \n"
                "vmax.f32   q1, q1, q3          \n"
                "vpmax.f32  d4, d0, d1          \n"
                "vpmax.f32  d5, d2, d3          \n"
                "subs       %0, #1              \n"
                "vst1.f32   {d4-d5}, [%3]!      \n"
                "bne        0b                  \n"
                : "=r"(nn),     // %0
                  "=r"(r0),     // %1
                  "=r"(r1),     // %2
                  "=r"(outptr)  // %3
                : "0"(nn),
                  "1"(r0),
                  "2"(r1),
                  "3"(outptr)
                : "cc", "memory", "q0", "q1", "q2", "q3"
            );
            }
 #endif // __aarch64__
 #endif // __ARM_NEON
            for (; remain>0; remain--)
            {
                float max0 = std::max(r0[0], r0[1]);
                float max1 = std::max(r1[0], r1[1]);

                *outptr = std::max(max0, max1);

                r0 += 2;
                r1 += 2;
                outptr++;
            }

            r0 += tailstep;
            r1 += tailstep;
        }
    }
 }
--- a/src/layer/arm/pooling_3x3.h
+++ b/src/layer/arm/pooling_3x3.h
@@ -0,0 +1,170 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #if __ARM_NEON
 #include <arm_neon.h>
 #endif // __ARM_NEON

 // 3x3 max pooling with stride 2.
 //
 // bottom_blob: input; three adjacent rows are read per output row.
 // top_blob:    output; it is not allocated here - its w/h are read to size
 //              the loops and the results are written through channel(q).
 // One output channel is produced per input channel.
 //
 // NOTE(review): both vector paths load the next 8 floats of every row ahead
 // of use, so the last vector iteration of a row reads beyond the 9 floats
 // its 4 outputs need - confirm the input buffer has readable slack after
 // the last row.
 static void pooling3x3s2_max_neon(const Mat& bottom_blob, Mat& top_blob)
 {
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;

    // After one output row the row pointers have advanced by 2*outw floats.
    // tailstep skips the rest of that row plus one full row, so the window
    // moves down by two input rows.
    const int tailstep = w - 2*outw + w;

    #pragma omp parallel for
    for (int q=0; q<inch; q++)
    {
        const float* img0 = bottom_blob.channel(q);
        float* outptr = top_blob.channel(q);

        const float* r0 = img0;
        const float* r1 = img0 + w;
        const float* r2 = img0 + w*2;

        for (int i = 0; i < outh; i++)
        {
 #if __ARM_NEON
            // nn blocks of 4 outputs go through NEON, the rest through the scalar tail
            int nn = outw >> 2;
            int remain = outw - (nn << 2);
 #else
            int remain = outw;
 #endif // __ARM_NEON

 #if __ARM_NEON
 #if __aarch64__
            // only touch memory when at least one vector block exists; the
            // preload used to run even for nn == 0
            if (nn > 0)
            {
                // de-interleaved load: val[0] = even columns, val[1] = odd columns
                float32x4x2_t _r0 = vld2q_f32(r0);
                float32x4x2_t _r1 = vld2q_f32(r1);
                float32x4x2_t _r2 = vld2q_f32(r2);
                for (; nn>0; nn--)
                {
                    float32x4x2_t _r0n = vld2q_f32(r0+8);
                    float32x4x2_t _r1n = vld2q_f32(r1+8);
                    float32x4x2_t _r2n = vld2q_f32(r2+8);

                    // max of window columns 0 and 1 in every row
                    float32x4_t _max0 = vmaxq_f32(_r0.val[0], _r0.val[1]);
                    float32x4_t _max1 = vmaxq_f32(_r1.val[0], _r1.val[1]);
                    float32x4_t _max2 = vmaxq_f32(_r2.val[0], _r2.val[1]);

                    // window column 2 = even columns shifted by one lane
                    float32x4_t _r02 = vextq_f32(_r0.val[0], _r0n.val[0], 1);
                    float32x4_t _r12 = vextq_f32(_r1.val[0], _r1n.val[0], 1);
                    float32x4_t _r22 = vextq_f32(_r2.val[0], _r2n.val[0], 1);

                    _max0 = vmaxq_f32(_max0, _r02);
                    _max1 = vmaxq_f32(_max1, _r12);
                    _max2 = vmaxq_f32(_max2, _r22);

                    float32x4_t _max = vmaxq_f32(vmaxq_f32(_max0, _max1), _max2);

                    vst1q_f32(outptr, _max);

                    _r0 = _r0n;
                    _r1 = _r1n;
                    _r2 = _r2n;

                    r0 += 8;
                    r1 += 8;
                    r2 += 8;
                    outptr += 4;
                }
            }
 #else
            // the asm loop decrements before it tests, so it must not be
            // entered with nn == 0
            if (nn > 0)
            {
            asm volatile(
                "pld        [%1, #256]          \n"
                "vld2.f32   {d0-d3}, [%1]!      \n"// q0 = 0 2 4 6  q1 = 1 3 5 7
                "pld        [%2, #256]          \n"
                "vld2.f32   {d4-d7}, [%2]!      \n"
                "pld        [%3, #256]          \n"
                "vld2.f32   {d8-d11}, [%3]!     \n"
                "0:                             \n"
                "pld        [%1, #256]          \n"
                "vld2.f32   {d12-d15}, [%1]!    \n"// q6 = 8 10 12 14  q7 = 9 11 13 15

                "vmax.f32   q12, q0, q1         \n"
                "vmax.f32   q13, q2, q3         \n"

                "pld        [%2, #256]          \n"
                "vld2.f32   {d16-d19}, [%2]!    \n"

                "vmax.f32   q14, q4, q5         \n"
                "vext.32    q0, q0, q6, #1      \n"

                "pld        [%3, #256]          \n"
                "vld2.f32   {d20-d23}, [%3]!    \n"

                "vext.32    q2, q2, q8, #1      \n"

                "vmax.f32   q12, q12, q0        \n"
                "vext.32    q4, q4, q10, #1     \n"

                "vmax.f32   q13, q13, q2        \n"
                "vmax.f32   q14, q14, q4        \n"
                "vmax.f32   q12, q12, q13       \n"

                "vorr       q0, q6, q6          \n"
                "vorr       q1, q7, q7          \n"
                "vmax.f32   q12, q12, q14       \n"

                "vorr       q2, q8, q8          \n"
                "vorr       q3, q9, q9          \n"
                "vorr       q4, q10, q10        \n"
                "vorr       q5, q11, q11        \n"

                "subs       %0, #1              \n"
                "vst1.f32   {d24-d25}, [%4]!    \n"
                "bne        0b                  \n"
                "sub        %1, #32             \n"
                "sub        %2, #32             \n"
                "sub        %3, #32             \n"
                : "=r"(nn),     // %0
                  "=r"(r0),     // %1
                  "=r"(r1),     // %2
                  "=r"(r2),     // %3
                  "=r"(outptr)  // %4
                : "0"(nn),
                  "1"(r0),
                  "2"(r1),
                  "3"(r2),
                  "4"(outptr)
                : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14"
            );
            }
 #endif // __aarch64__
 #endif // __ARM_NEON
            for (; remain>0; remain--)
            {
                float max0 = std::max(std::max(r0[0], r0[1]), r0[2]);
                float max1 = std::max(std::max(r1[0], r1[1]), r1[2]);
                float max2 = std::max(std::max(r2[0], r2[1]), r2[2]);

                *outptr = std::max(std::max(max0, max1), max2);

                r0 += 2;
                r1 += 2;
                r2 += 2;
                outptr++;
            }

            r0 += tailstep;//1 + w;
            r1 += tailstep;//1 + w;
            r2 += tailstep;//1 + w;
        }
    }
 }
--- a/src/layer/arm/pooling_arm.cpp
+++ b/src/layer/arm/pooling_arm.cpp
@@ -0,0 +1,96 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "pooling_arm.h"

 namespace ncnn {

 #include "pooling_2x2.h"
 #include "pooling_3x3.h"

 DEFINE_LAYER_CREATOR(Pooling_arm)

 int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
 {
    // max value in NxN window
    // avg value in NxN window

    if (pooling_type != PoolMethod_MAX || stride != 2 || global_pooling == 1)
    {
        return Pooling::forward(bottom_blob, top_blob);
    }

    if (kernel_size != 2 && kernel_size != 3)
    {
        return Pooling::forward(bottom_blob, top_blob);
    }

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;

    Mat bottom_blob_bordered = bottom_blob;
    if (pad > 0)
    {
        copy_make_border(bottom_blob, bottom_blob_bordered, pad, pad, pad, pad, BORDER_CONSTANT, 0.f);
        if (bottom_blob_bordered.empty())
            return -100;

        w = bottom_blob_bordered.w;
        h = bottom_blob_bordered.h;
    }

    int outw = (w - kernel_size) / stride + 1;
    int outh = (h - kernel_size) / stride + 1;

    int wtail = (w - kernel_size) % stride;
    int htail = (h - kernel_size) % stride;
    if (wtail != 0 || htail != 0)
    {
        int wtailpad = 0;
        int htailpad = 0;
        if (wtail != 0)
            wtailpad = kernel_size - wtail;
        if (htail != 0)
            htailpad = kernel_size - htail;

        Mat bottom_blob_bordered2;
        copy_make_border(bottom_blob_bordered, bottom_blob_bordered2, 0, htailpad, 0, wtailpad, BORDER_REPLICATE, 0.f);
        if (bottom_blob_bordered2.empty())
            return -100;

        bottom_blob_bordered = bottom_blob_bordered2;

        w = bottom_blob_bordered.w;
        h = bottom_blob_bordered.h;

        if (wtail != 0)
            outw += 1;
        if (htail != 0)
            outh += 1;
    }

    top_blob.create(outw, outh, channels);
    if (top_blob.empty())
        return -100;

    if (kernel_size == 2)
        pooling2x2s2_max_neon(bottom_blob_bordered, top_blob);
    if (kernel_size == 3)
        pooling3x3s2_max_neon(bottom_blob_bordered, top_blob);

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/arm/pooling_arm.h
+++ b/src/layer/arm/pooling_arm.h
@@ -0,0 +1,30 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_POOLING_ARM_H
 #define LAYER_POOLING_ARM_H

 #include "pooling.h"

 namespace ncnn {

 // ARM specialization of the Pooling layer; only forward() is overridden.
 class Pooling_arm : public Pooling
 {
 public:
    // Pools bottom_blob into top_blob. The definition in pooling_arm.cpp uses
    // NEON kernels for non-global, stride-2 max pooling with a 2x2 or 3x3 window
    // and defers to Pooling::forward for every other configuration.
    // On the NEON path it returns 0 on success and -100 when a padded copy or
    // the output blob is empty after allocation.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
 };

 } // namespace ncnn

 #endif // LAYER_POOLING_ARM_H
--- a/src/layer/arm/prelu_arm.cpp
+++ b/src/layer/arm/prelu_arm.cpp
@@ -0,0 +1,182 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "prelu_arm.h"

 #if __ARM_NEON
 #include <arm_neon.h>
 #endif // __ARM_NEON

 namespace ncnn {

 DEFINE_LAYER_CREATOR(PReLU_arm)

 // Out-of-place PReLU.
 //
 // top_blob is created with the same w/h/c as bottom_blob. For every channel q
 // the slope is slope_data[q] when num_slope > 1, otherwise the shared
 // slope_data[0]. The vector paths multiply inputs that are <= 0 by the slope
 // and the scalar tail multiplies inputs that are < 0; the two only disagree on
 // the sign of a zero result.
 //
 // Returns 0 on success, or -100 when top_blob is empty after create().
 int PReLU_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int size = w * h;

    top_blob.create(w, h, channels);
    if (top_blob.empty())
        return -100;

    const float* slope_data_ptr = slope_data;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        const float* ptr = bottom_blob.channel(q);
        float* outptr = top_blob.channel(q);
        float slope = num_slope > 1 ? slope_data_ptr[q] : slope_data_ptr[0];

 #if __ARM_NEON
        // nn = number of 4-float blocks, remain = leftover scalars
        int nn = size >> 2;
        int remain = size - (nn << 2);
 #else
        int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
 #if __aarch64__
        float32x4_t _zero = vdupq_n_f32(0.f);
        float32x4_t _slope = vdupq_n_f32(slope);
        for (; nn>0; nn--)
        {
            float32x4_t _p = vld1q_f32(ptr);
            uint32x4_t _lemask = vcleq_f32(_p, _zero);
            float32x4_t _ps = vmulq_f32(_p, _slope);
            // pick the scaled lane where the input was <= 0
            float32x4_t _outp = vbslq_f32(_lemask, _ps, _p);
            vst1q_f32(outptr, _outp);

            ptr += 4;
            outptr += 4;
        }
 #else
        // The loop below is a do-while, so it must not be entered with nn == 0.
        if (nn > 0)
        {
        asm volatile(
            "veor       q1, q0, q0          \n" // q1 = 0
            "vdup.f32   q2, %6              \n" // q2 = slope in every lane
            "0:                             \n"
            "pld        [%1, #128]          \n"
            // The input pointer needs its own post-increment: input and output
            // live in different registers, so the store's writeback on %2 does
            // not advance %1. Without the '!' every iteration would reload the
            // first four floats and the scalar tail would start from the
            // beginning of the channel.
            "vld1.f32   {d0-d1}, [%1 :128]! \n"
            "vcle.f32   q3, q0, q1          \n" // q3 = (x <= 0) lane mask
            "vmul.f32   q4, q0, q2          \n" // q4 = x * slope
            "vbit.32    q0, q4, q3          \n" // insert q4 where the mask is set
            "subs       %0, #1              \n"
            "vst1.f32   {d0-d1}, [%2 :128]! \n"
            "bne        0b                  \n"
            : "=r"(nn),     // %0
              "=r"(ptr),    // %1
              "=r"(outptr)  // %2
            : "0"(nn),
              "1"(ptr),
              "2"(outptr),
              "r"(slope)    // %6
            : "cc", "memory", "q0", "q1", "q2", "q3", "q4"
        );
        }
 #endif // __aarch64__
 #endif // __ARM_NEON
        // scalar tail (or the whole channel when NEON is unavailable)
        for (; remain>0; remain--)
        {
            if (*ptr < 0)
                *outptr = *ptr * slope;
            else
                *outptr = *ptr;

            ptr++;
            outptr++;
        }
    }

    return 0;
 }

 // In-place PReLU over every channel of bottom_top_blob.
 //
 // The slope for channel q is slope_data[q] when num_slope > 1, otherwise the
 // shared slope_data[0]. The vector paths scale inputs that are <= 0 and the
 // scalar tail scales inputs that are < 0. Always returns 0.
 int PReLU_arm::forward_inplace(Mat& bottom_top_blob) const
 {
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    const float* slope_data_ptr = slope_data;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
        float slope = num_slope > 1 ? slope_data_ptr[q] : slope_data_ptr[0];

 #if __ARM_NEON
        // nn = number of 4-float blocks, remain = leftover scalars
        int nn = size >> 2;
        int remain = size - (nn << 2);
 #else
        int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
 #if __aarch64__
        float32x4_t _zero = vdupq_n_f32(0.f);
        float32x4_t _slope = vdupq_n_f32(slope);
        for (; nn>0; nn--)
        {
            float32x4_t _p = vld1q_f32(ptr);
            uint32x4_t _lemask = vcleq_f32(_p, _zero);
            float32x4_t _ps = vmulq_f32(_p, _slope);
            // take the scaled lane where the input was <= 0
            _p = vbslq_f32(_lemask, _ps, _p);
            vst1q_f32(ptr, _p);

            ptr += 4;
        }
 #else
        // The asm loop decrements before testing, so it must not run with nn == 0.
        if (nn > 0)
        {
        // q1 holds zero, q2 holds the slope in every lane. The load has no
        // writeback on purpose: the store goes through the same register %1 and
        // its '!' advances the pointer once per iteration.
        // NOTE(review): the :128 qualifier presumably relies on channel
        // pointers being 16-byte aligned - confirm against Mat's allocation.
        asm volatile(
            "veor       q1, q0, q0          \n"
            "vdup.f32   q2, %4              \n"
            "0:                             \n"
            "pld        [%1, #128]          \n"
            "vld1.f32   {d0-d1}, [%1 :128]  \n"
            "vcle.f32   q3, q0, q1          \n"
            "vmul.f32   q4, q0, q2          \n"
            "vbit.32    q0, q4, q3          \n"
            "subs       %0, #1              \n"
            "vst1.f32   {d0-d1}, [%1 :128]! \n"
            "bne        0b                  \n"
            : "=r"(nn),     // %0
              "=r"(ptr)     // %1
            : "0"(nn),
              "1"(ptr),
              "r"(slope)    // %4
            : "cc", "memory", "q0", "q1", "q2", "q3", "q4"
        );
        }
 #endif // __aarch64__
 #endif // __ARM_NEON
        // scalar tail (or the whole channel when NEON is unavailable)
        for (; remain>0; remain--)
        {
            if (*ptr < 0)
                *ptr *= slope;

            ptr++;
        }
    }

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/arm/prelu_arm.h
+++ b/src/layer/arm/prelu_arm.h
@@ -0,0 +1,32 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_PRELU_ARM_H
 #define LAYER_PRELU_ARM_H

 #include "prelu.h"

 namespace ncnn {

 // ARM specialization of the PReLU layer; overrides both forward variants.
 class PReLU_arm : public PReLU
 {
 public:
    // Out-of-place PReLU: creates top_blob with the shape of bottom_blob and
    // writes the activated values into it. Returns 0, or -100 when top_blob is
    // empty after creation.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

    // In-place PReLU: overwrites bottom_top_blob with the activated values.
    // Returns 0.
    virtual int forward_inplace(Mat& bottom_top_blob) const;
 };

 } // namespace ncnn

 #endif // LAYER_PRELU_ARM_H
--- a/src/layer/arm/relu_arm.cpp
+++ b/src/layer/arm/relu_arm.cpp
@@ -0,0 +1,295 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "relu_arm.h"

 #if __ARM_NEON
 #include <arm_neon.h>
 #endif // __ARM_NEON

 namespace ncnn {

 DEFINE_LAYER_CREATOR(ReLU_arm)

 // Out-of-place ReLU / leaky ReLU.
 //
 // top_blob is created with the same w/h/c as bottom_blob. With slope == 0 the
 // output is max(x, 0). Otherwise the vector paths multiply inputs that are
 // <= 0 by slope and the scalar tail multiplies inputs that are < 0; the two
 // only disagree on the sign of a zero result.
 //
 // Returns 0 on success, or -100 when top_blob is empty after create().
 int ReLU_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int size = w * h;

    top_blob.create(w, h, channels);
    if (top_blob.empty())
        return -100;

    if (slope == 0.f)
    {
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);

 #if __ARM_NEON
            // nn = number of 4-float blocks, remain = leftover scalars
            int nn = size >> 2;
            int remain = size - (nn << 2);
 #else
            int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
 #if __aarch64__
            float32x4_t _zero = vdupq_n_f32(0.f);
            for (; nn>0; nn--)
            {
                float32x4_t _p = vld1q_f32(ptr);
                float32x4_t _outp = vmaxq_f32(_p, _zero);
                vst1q_f32(outptr, _outp);

                ptr += 4;
                outptr += 4;
            }
 #else
            // The asm loop decrements before testing, so skip it when nn == 0.
            if (nn > 0)
            {
            asm volatile(
                "veor       q1, q0, q0          \n" // q1 = 0
                "0:                             \n"
                "pld        [%1, #128]          \n"
                "vld1.f32   {d0-d1}, [%1 :128]! \n"
                "vmax.f32   q0, q0, q1          \n"
                "subs       %0, #1              \n"
                "vst1.f32   {d0-d1}, [%2 :128]! \n"
                "bne        0b                  \n"
                : "=r"(nn),     // %0
                  "=r"(ptr),    // %1
                  "=r"(outptr)  // %2
                : "0"(nn),
                  "1"(ptr),
                  "2"(outptr)
                : "cc", "memory", "q0", "q1"
            );
            }
 #endif // __aarch64__
 #endif // __ARM_NEON
            // scalar tail (or the whole channel when NEON is unavailable)
            for (; remain>0; remain--)
            {
                *outptr = std::max(*ptr, 0.f);

                ptr++;
                outptr++;
            }
        }
    }
    else
    {
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);

 #if __ARM_NEON
            int nn = size >> 2;
            int remain = size - (nn << 2);
 #else
            int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
 #if __aarch64__
            float32x4_t _zero = vdupq_n_f32(0.f);
            float32x4_t _slope = vdupq_n_f32(slope);
            for (; nn>0; nn--)
            {
                float32x4_t _p = vld1q_f32(ptr);
                uint32x4_t _lemask = vcleq_f32(_p, _zero);
                float32x4_t _ps = vmulq_f32(_p, _slope);
                // pick the scaled lane where the input was <= 0
                float32x4_t _outp = vbslq_f32(_lemask, _ps, _p);
                vst1q_f32(outptr, _outp);

                ptr += 4;
                outptr += 4;
            }
 #else
            if (nn > 0)
            {
            asm volatile(
                "veor       q1, q0, q0          \n" // q1 = 0
                "vdup.f32   q2, %6              \n" // q2 = slope in every lane
                "0:                             \n"
                "pld        [%1, #128]          \n"
                // Input and output use different registers here, so the load
                // needs its own post-increment, exactly like the slope == 0
                // path above. Without the '!' the input pointer never moves:
                // every block re-reads the first four floats and the scalar
                // tail starts from the beginning of the channel.
                "vld1.f32   {d0-d1}, [%1 :128]! \n"
                "vcle.f32   q3, q0, q1          \n" // q3 = (x <= 0) lane mask
                "vmul.f32   q4, q0, q2          \n" // q4 = x * slope
                "vbit.32    q0, q4, q3          \n" // insert q4 where the mask is set
                "subs       %0, #1              \n"
                "vst1.f32   {d0-d1}, [%2 :128]! \n"
                "bne        0b                  \n"
                : "=r"(nn),     // %0
                  "=r"(ptr),    // %1
                  "=r"(outptr)  // %2
                : "0"(nn),
                  "1"(ptr),
                  "2"(outptr),
                  "r"(slope)    // %6
                : "cc", "memory", "q0", "q1", "q2", "q3", "q4"
            );
            }
 #endif // __aarch64__
 #endif // __ARM_NEON
            for (; remain>0; remain--)
            {
                if (*ptr < 0)
                    *outptr = *ptr * slope;
                else
                    *outptr = *ptr;

                ptr++;
                outptr++;
            }
        }
    }

    return 0;
 }

 // In-place ReLU / leaky ReLU over every channel of bottom_top_blob.
 //
 // With slope == 0 each value becomes max(x, 0). Otherwise the vector paths
 // multiply inputs that are <= 0 by slope and the scalar tail multiplies inputs
 // that are < 0. Always returns 0.
 int ReLU_arm::forward_inplace(Mat& bottom_top_blob) const
 {
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    if (slope == 0.f)
    {
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);

 #if __ARM_NEON
            // nn = number of 4-float blocks, remain = leftover scalars
            int nn = size >> 2;
            int remain = size - (nn << 2);
 #else
            int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
 #if __aarch64__
            float32x4_t _zero = vdupq_n_f32(0.f);
            for (; nn>0; nn--)
            {
                float32x4_t _p = vld1q_f32(ptr);
                _p = vmaxq_f32(_p, _zero);
                vst1q_f32(ptr, _p);

                ptr += 4;
            }
 #else
            // The asm loop decrements before testing, so skip it when nn == 0.
            if (nn > 0)
            {
            // q1 holds zero. The load has no writeback on purpose: the store
            // goes through the same register %1 and its '!' advances the
            // pointer once per iteration.
            asm volatile(
                "veor       q1, q0, q0          \n"
                "0:                             \n"
                "pld        [%1, #128]          \n"
                "vld1.f32   {d0-d1}, [%1 :128]  \n"
                "vmax.f32   q0, q0, q1          \n"
                "subs       %0, #1              \n"
                "vst1.f32   {d0-d1}, [%1 :128]! \n"
                "bne        0b                  \n"
                : "=r"(nn),     // %0
                  "=r"(ptr)     // %1
                : "0"(nn),
                  "1"(ptr)
                : "cc", "memory", "q0", "q1"
            );
            }
 #endif // __aarch64__
 #endif // __ARM_NEON
            // scalar tail (or the whole channel when NEON is unavailable)
            for (; remain>0; remain--)
            {
                *ptr = std::max(*ptr, 0.f);

                ptr++;
            }
        }
    }
    else
    {
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);

 #if __ARM_NEON
            int nn = size >> 2;
            int remain = size - (nn << 2);
 #else
            int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
 #if __aarch64__
            float32x4_t _zero = vdupq_n_f32(0.f);
            float32x4_t _slope = vdupq_n_f32(slope);
            for (; nn>0; nn--)
            {
                float32x4_t _p = vld1q_f32(ptr);
                uint32x4_t _lemask = vcleq_f32(_p, _zero);
                float32x4_t _ps = vmulq_f32(_p, _slope);
                // take the scaled lane where the input was <= 0
                _p = vbslq_f32(_lemask, _ps, _p);
                vst1q_f32(ptr, _p);

                ptr += 4;
            }
 #else
            if (nn > 0)
            {
            // q1 holds zero, q2 holds slope in every lane; q3 is the (x <= 0)
            // mask and vbit inserts the scaled value q4 into q0 where the mask
            // is set. As above, only the store advances the shared pointer %1.
            asm volatile(
                "veor       q1, q0, q0          \n"
                "vdup.f32   q2, %4              \n"
                "0:                             \n"
                "pld        [%1, #128]          \n"
                "vld1.f32   {d0-d1}, [%1 :128]  \n"
                "vcle.f32   q3, q0, q1          \n"
                "vmul.f32   q4, q0, q2          \n"
                "vbit.32    q0, q4, q3          \n"
                "subs       %0, #1              \n"
                "vst1.f32   {d0-d1}, [%1 :128]! \n"
                "bne        0b                  \n"
                : "=r"(nn),     // %0
                  "=r"(ptr)     // %1
                : "0"(nn),
                  "1"(ptr),
                  "r"(slope)    // %4
                : "cc", "memory", "q0", "q1", "q2", "q3", "q4"
            );
            }
 #endif // __aarch64__
 #endif // __ARM_NEON
            for (; remain>0; remain--)
            {
                if (*ptr < 0)
                    *ptr *= slope;

                ptr++;
            }
        }
    }

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/arm/relu_arm.h
+++ b/src/layer/arm/relu_arm.h
@@ -0,0 +1,32 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_RELU_ARM_H
 #define LAYER_RELU_ARM_H

 #include "relu.h"

 namespace ncnn {

 // ARM specialization of the ReLU layer; overrides both forward variants.
 class ReLU_arm : public ReLU
 {
 public:
    // Out-of-place ReLU (or leaky ReLU when slope != 0): creates top_blob with
    // the shape of bottom_blob and writes the activated values into it.
    // Returns 0, or -100 when top_blob is empty after creation.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

    // In-place variant: overwrites bottom_top_blob with the activated values.
    // Returns 0.
    virtual int forward_inplace(Mat& bottom_top_blob) const;
 };

 } // namespace ncnn

 #endif // LAYER_RELU_ARM_H
--- a/src/layer/arm/scale_arm.cpp
+++ b/src/layer/arm/scale_arm.cpp
@@ -0,0 +1,211 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "scale_arm.h"

 #if __ARM_NEON
 #include <arm_neon.h>
 #endif // __ARM_NEON

 namespace ncnn {

 DEFINE_LAYER_CREATOR(Scale_arm)

 int Scale_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int size = w * h;

    top_blob.create(w, h, channels);
    if (top_blob.empty())
        return -100;

    if (bias_term)
    {
        const float* scale_ptr = scale_data;
        const float* bias_ptr = bias_data;
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);

            float s = scale_ptr[q];
            float bias = bias_ptr[q];

 #if __ARM_NEON
            int nn = size >> 2;
            int remain = size - (nn << 2);
 #else
            int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
            float32x4_t _s = vdupq_n_f32(s);
            float32x4_t _bias = vdupq_n_f32(bias);
            for (; nn>0; nn--)
            {
                float32x4_t _p = vld1q_f32(ptr);
                _p = vmlaq_f32(_bias, _p, _s);
                vst1q_f32(outptr, _p);

                ptr += 4;
                outptr += 4;
            }
 #endif // __ARM_NEON

            for (; remain>0; remain--)
            {
                *outptr = *ptr * s + bias;

                ptr++;
                outptr++;
            }
        }
    }
    else
    {
        const float* scale_ptr = scale_data;
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);

            float s = scale_ptr[q];

 #if __ARM_NEON
            int nn = size >> 2;
            int remain = size - (nn << 2);
 #else
            int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
            float32x4_t _s = vdupq_n_f32(s);
            for (; nn>0; nn--)
            {
                float32x4_t _p = vld1q_f32(ptr);
                _p = vmulq_f32(_p, _s);
                vst1q_f32(outptr, _p);

                ptr += 4;
                outptr += 4;
            }
 #endif // __ARM_NEON

            for (; remain>0; remain--)
            {
                *outptr = *ptr * s;

                ptr++;
                outptr++;
            }
        }
    }

    return 0;
 }

 // In-place per-channel scaling.
 //
 // Every element of channel q is multiplied by scale_data[q]; when bias_term is
 // set, bias_data[q] is added afterwards. Always returns 0.
 int Scale_arm::forward_inplace(Mat& bottom_top_blob) const
 {
    const int w = bottom_top_blob.w;
    const int h = bottom_top_blob.h;
    const int channels = bottom_top_blob.c;
    const int size = w * h;

    if (bias_term)
    {
        const float* scales = scale_data;
        const float* biases = bias_data;
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            float* data = bottom_top_blob.channel(q);

            const float s = scales[q];
            const float b = biases[q];

            int i = 0;

 #if __ARM_NEON
            // four floats at a time: data = b + data * s
            const float32x4_t _s = vdupq_n_f32(s);
            const float32x4_t _b = vdupq_n_f32(b);
            for (; size - i >= 4; i += 4)
            {
                const float32x4_t _v = vld1q_f32(data + i);
                vst1q_f32(data + i, vmlaq_f32(_b, _v, _s));
            }
 #endif // __ARM_NEON

            // leftover elements (everything when NEON is unavailable)
            for (; i < size; i++)
            {
                data[i] = data[i] * s + b;
            }
        }
    }
    else
    {
        const float* scales = scale_data;
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            float* data = bottom_top_blob.channel(q);

            const float s = scales[q];

            int i = 0;

 #if __ARM_NEON
            // four floats at a time: data = data * s
            const float32x4_t _s = vdupq_n_f32(s);
            for (; size - i >= 4; i += 4)
            {
                const float32x4_t _v = vld1q_f32(data + i);
                vst1q_f32(data + i, vmulq_f32(_v, _s));
            }
 #endif // __ARM_NEON

            // leftover elements (everything when NEON is unavailable)
            for (; i < size; i++)
            {
                data[i] *= s;
            }
        }
    }

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/arm/scale_arm.h
+++ b/src/layer/arm/scale_arm.h
@@ -0,0 +1,32 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_SCALE_ARM_H
 #define LAYER_SCALE_ARM_H

 #include "scale.h"

 namespace ncnn {

 // ARM specialization of the Scale layer; overrides both forward variants.
 class Scale_arm : public Scale
 {
 public:
    // Out-of-place scaling: creates top_blob with the shape of bottom_blob and
    // writes x * scale (plus bias when bias_term is set) per channel.
    // Returns 0, or -100 when top_blob is empty after creation.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

    // In-place variant: overwrites bottom_top_blob with the scaled values.
    // Returns 0.
    virtual int forward_inplace(Mat& bottom_top_blob) const;
 };

 } // namespace ncnn

 #endif // LAYER_SCALE_ARM_H
--- a/src/layer/arm/sigmoid_arm.cpp
+++ b/src/layer/arm/sigmoid_arm.cpp
@@ -0,0 +1,127 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "sigmoid_arm.h"

 #if __ARM_NEON
 #include <arm_neon.h>
 #include "neon_mathfun.h"
 #endif // __ARM_NEON

 #include <math.h>

 namespace ncnn {

 DEFINE_LAYER_CREATOR(Sigmoid_arm)

 // Out-of-place logistic sigmoid, 1 / (1 + exp(-x)), applied element-wise.
 //
 // top_blob is created with the same w/h/c as bottom_blob. The NEON path
 // approximates the reciprocal with vrecpe plus one Newton-Raphson refinement;
 // the scalar tail uses an exact division.
 //
 // Returns 0 on success, or -100 when top_blob is empty after create().
 int Sigmoid_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
 {
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int channels = bottom_blob.c;
    const int size = w * h;

    top_blob.create(w, h, channels);
    if (top_blob.empty())
        return -100;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        const float* src = bottom_blob.channel(q);
        float* dst = top_blob.channel(q);

        int i = 0;

 #if __ARM_NEON
        const float32x4_t _one = vdupq_n_f32(1.f);
        for (; size - i >= 4; i += 4)
        {
            // denominator: 1 + exp(-x)
            const float32x4_t _x = vld1q_f32(src + i);
            const float32x4_t _denom = vaddq_f32(exp_ps(vnegq_f32(_x)), _one);

            // reciprocal estimate, refined once: r = r * (2 - denom * r)
            float32x4_t _recip = vrecpeq_f32(_denom);
            _recip = vmulq_f32(vrecpsq_f32(_denom, _recip), _recip);

            vst1q_f32(dst + i, _recip);
        }
 #endif // __ARM_NEON

        // leftover elements (everything when NEON is unavailable)
        for (; i < size; i++)
        {
            dst[i] = 1.f / (1.f + exp(-src[i]));
        }
    }

    return 0;
 }

 // In-place logistic sigmoid, 1 / (1 + exp(-x)), applied element-wise to every
 // channel of bottom_top_blob.
 //
 // The NEON path approximates the reciprocal with vrecpe plus one
 // Newton-Raphson refinement; the scalar tail uses an exact division.
 // Always returns 0.
 int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob) const
 {
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

 #if __ARM_NEON
        // nn = number of 4-float blocks, remain = leftover scalars
        int nn = size >> 2;
        int remain = size - (nn << 2);
 #else
        int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
        float32x4_t _one = vdupq_n_f32(1.f);
        for (; nn>0; nn--)
        {
            // _p = 1 + exp(-x), the denominator
            float32x4_t _p = vld1q_f32(ptr);
            _p = vnegq_f32(_p);
            _p = exp_ps(_p);
            _p = vaddq_f32(_p, _one);

            // The refinement step r' = r * (2 - d * r) needs both the
            // denominator d and the current estimate r, so the estimate must
            // live in its own variable. Overwriting _p with the estimate would
            // turn the step into r * (2 - r * r), which is not a reciprocal.
            float32x4_t _outp = vrecpeq_f32(_p);
            _outp = vmulq_f32(vrecpsq_f32(_p, _outp), _outp);
            vst1q_f32(ptr, _outp);

            ptr += 4;
        }
 #endif // __ARM_NEON
        // scalar tail (or the whole channel when NEON is unavailable)
        for (; remain>0; remain--)
        {
            *ptr = 1.f / (1.f + exp(-*ptr));

            ptr++;
        }
    }

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/arm/sigmoid_arm.h
+++ b/src/layer/arm/sigmoid_arm.h
@@ -0,0 +1,32 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_SIGMOID_ARM_H
 #define LAYER_SIGMOID_ARM_H

 #include "sigmoid.h"

 namespace ncnn {

 // ARM specialization of the Sigmoid layer; overrides both forward variants.
 class Sigmoid_arm : public Sigmoid
 {
 public:
    // Out-of-place sigmoid: creates top_blob with the shape of bottom_blob and
    // writes 1 / (1 + exp(-x)) for every element.
    // Returns 0, or -100 when top_blob is empty after creation.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

    // In-place variant: overwrites bottom_top_blob with the sigmoid values.
    // Returns 0.
    virtual int forward_inplace(Mat& bottom_top_blob) const;
 };

 } // namespace ncnn

 #endif // LAYER_SIGMOID_ARM_H
--- a/src/layer/arm/slice_arm.cpp
+++ b/src/layer/arm/slice_arm.cpp
@@ -0,0 +1,102 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "slice_arm.h"
 #if __ARM_NEON
 #include <arm_neon.h>
 #endif // __ARM_NEON

 namespace ncnn {

 DEFINE_LAYER_CREATOR(Slice_arm)

 // Splits bottom_blobs[0] along the channel axis into top_blobs.size() outputs.
 //
 // slices holds one requested channel count per output. The sentinel -233 means
 // "divide the channels that have not been consumed yet evenly among the
 // outputs that are still to be produced".
 //
 // Returns 0 on success, or -100 when an output blob is empty after create().
 int Slice_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
 {
    const Mat& bottom_blob = bottom_blobs[0];
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;

    // q is the index of the first input channel not yet handed to an output
    int q = 0;
    const int* slices_ptr = (const int*)slices.data;
    for (size_t i=0; i<top_blobs.size(); i++)
    {
        int slice = slices_ptr[i];
        if (slice == -233)
        {
            slice = (channels - q) / (top_blobs.size() - i);
        }

        Mat& top_blob = top_blobs[i];
        top_blob.create(w, h, slice);
        if (top_blob.empty())
            return -100;

        // number of floats to copy: whole channels including their cstep padding
        int size = bottom_blob.cstep * slice;

        const float* ptr = bottom_blob.channel(q);
        float* outptr = top_blob.data;

 #if __ARM_NEON
        // the vector path moves 8 floats per iteration, the scalar loop the rest
        int nn = size >> 3;
        int remain = size - (nn << 3);
 #else
        int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
 #if __aarch64__
        for (; nn>0; nn--)
        {
            float32x4_t _p = vld1q_f32(ptr);
            float32x4_t _p2 = vld1q_f32(ptr+4);
            vst1q_f32(outptr, _p);
            vst1q_f32(outptr+4, _p2);

            ptr += 8;
            outptr += 8;
        }
 #else
        // the asm loop is do-while shaped, so it must not be entered with nn == 0
        if (nn > 0)
        {
        asm volatile(
            "0:                             \n"
            "pld        [%1, #256]          \n"
            "vld1.f32   {d0-d3}, [%1 :128]! \n"
            "subs       %0, #1              \n"
            "vst1.f32   {d0-d3}, [%2 :128]! \n"
            "bne        0b                  \n"
            : "=r"(nn),     // %0
              "=r"(ptr),    // %1
              "=r"(outptr)  // %2
            : "0"(nn),
              "1"(ptr),
              "2"(outptr)
            // d0-d3 alias q0 and q1, so both quad registers must be declared as
            // clobbered or the compiler may keep a live value in q1 across the asm
            : "cc", "memory", "q0", "q1"
        );
        }
 #endif // __aarch64__
 #endif // __ARM_NEON
        for (; remain>0; remain--)
        {
            *outptr++ = *ptr++;
        }

        q += slice;
    }

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/arm/slice_arm.h
+++ b/src/layer/arm/slice_arm.h
@@ -0,0 +1,30 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_SLICE_ARM_H
 #define LAYER_SLICE_ARM_H

 #include "slice.h"

 namespace ncnn {

 // Slice layer variant living under layer/arm. It keeps the Slice interface and
 // declares its own multi-blob forward().
 class Slice_arm : public Slice
 {
 public:
    // Splits bottom_blobs[0] by channel into the blobs of top_blobs.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
 };

 } // namespace ncnn

 #endif // LAYER_SLICE_ARM_H
--- a/src/layer/arm/softmax_arm.cpp
+++ b/src/layer/arm/softmax_arm.cpp
@@ -0,0 +1,302 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "softmax_arm.h"
 #include <float.h>
 #include <math.h>

 #if __ARM_NEON
 #include <arm_neon.h>
 #include "neon_mathfun.h"
 #endif // __ARM_NEON

 namespace ncnn {

 DEFINE_LAYER_CREATOR(Softmax_arm)

 int Softmax_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
 {
    // value = exp( value - global max value )
    // sum all value
    // value = value / sum

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int size = w * h;

    top_blob.create(w, h, channels);
    if (top_blob.empty())
        return -100;

    Mat max;
    max.create(w, h);
    if (max.empty())
        return -100;
    max.fill(-FLT_MAX);
    for (int q=0; q<channels; q++)
    {
        const float* ptr = bottom_blob.channel(q);
        float* maxptr = max;

        for (int i=0; i<size; i++)
        {
            maxptr[i] = std::max(maxptr[i], ptr[i]);
        }
    }

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        const float* ptr = bottom_blob.channel(q);
        float* outptr = top_blob.channel(q);
        float* maxptr = max;

 #if __ARM_NEON
        int nn = size >> 2;
        int remain = size - (nn << 2);
 #else
        int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
        for (; nn>0; nn--)
        {
            float32x4_t _p = vld1q_f32(ptr);
            float32x4_t _max = vld1q_f32(maxptr);

            _p = exp_ps(vsubq_f32(_p, _max));

            vst1q_f32(outptr, _p);

            ptr += 4;
            maxptr += 4;
            outptr += 4;
        }
 #endif // __ARM_NEON

        for (; remain>0; remain--)
        {
            *outptr = exp(*ptr - *maxptr);

            ptr++;
            maxptr++;
            outptr++;
        }
    }

    Mat sum;
    sum.create(w, h);
    if (sum.empty())
        return -100;
    sum.fill(0.f);
    for (int q=0; q<channels; q++)
    {
        const float* outptr = top_blob.channel(q);
        float* sumptr = sum;

        for (int i=0; i<size; i++)
        {
            sumptr[i] += outptr[i];
        }
    }

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* outptr = top_blob.channel(q);
        float* sumptr = sum;

 #if __ARM_NEON
        int nn = size >> 2;
        int remain = size - (nn << 2);
 #else
        int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
        for (; nn>0; nn--)
        {
            float32x4_t _p = vld1q_f32(outptr);
            float32x4_t _sum = vld1q_f32(sumptr);
 #if __aarch64__
            _p = vdivq_f32(_p, _sum);
 #else
            _p = div_ps(_p, _sum);
 #endif // __aarch64__
            vst1q_f32(outptr, _p);

            outptr += 4;
            sumptr += 4;
        }
 #endif // __ARM_NEON

        for (; remain>0; remain--)
        {
            *outptr /= *sumptr;

            outptr++;
            sumptr++;
        }
    }

    return 0;
 }

 // Softmax across channels, overwriting bottom_top_blob.
 //
 // For every spatial position (x, y):
 //   1. m   = max over channels of blob(x, y, c)
 //   2. blob(x, y, c) = exp(blob(x, y, c) - m)
 //   3. blob(x, y, c) /= sum over channels of blob(x, y, c)
 //
 // Returns 0 on success, -100 if a scratch map is empty after create().
 int Softmax_arm::forward_inplace(Mat& bottom_top_blob) const
 {
    const int w = bottom_top_blob.w;
    const int h = bottom_top_blob.h;
    const int channels = bottom_top_blob.c;
    const int size = w * h;

    // step 1: per-position maximum over all channels
    Mat max_map;
    max_map.create(w, h);
    if (max_map.empty())
        return -100;
    max_map.fill(-FLT_MAX);
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
        float* maxptr = max_map;

        for (int i=0; i<size; i++)
        {
            if (maxptr[i] < ptr[i])
                maxptr[i] = ptr[i];
        }
    }

    // step 2: blob = exp(blob - max)
    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
        const float* maxptr = max_map;

        int i = 0;
 #if __ARM_NEON
        // four lanes at a time while a full vector is left
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _v = vld1q_f32(ptr + i);
            float32x4_t _m = vld1q_f32(maxptr + i);

            vst1q_f32(ptr + i, exp_ps(vsubq_f32(_v, _m)));
        }
 #endif // __ARM_NEON

        // scalar tail (or everything when NEON is unavailable)
        for (; i < size; i++)
        {
            ptr[i] = exp(ptr[i] - maxptr[i]);
        }
    }

    // step 3a: per-position sum of the exponentials
    Mat sum_map;
    sum_map.create(w, h);
    if (sum_map.empty())
        return -100;
    sum_map.fill(0.f);
    for (int q=0; q<channels; q++)
    {
        const float* ptr = bottom_top_blob.channel(q);
        float* sumptr = sum_map;

        int i = 0;
 #if __ARM_NEON
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _v = vld1q_f32(ptr + i);
            float32x4_t _acc = vld1q_f32(sumptr + i);

            vst1q_f32(sumptr + i, vaddq_f32(_acc, _v));
        }
 #endif // __ARM_NEON

        for (; i < size; i++)
        {
            sumptr[i] += ptr[i];
        }
    }

    // step 3b: normalise by the sum
    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
        const float* sumptr = sum_map;

        int i = 0;
 #if __ARM_NEON
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _v = vld1q_f32(ptr + i);
            float32x4_t _s = vld1q_f32(sumptr + i);
 #if __aarch64__
            _v = vdivq_f32(_v, _s);
 #else
            _v = div_ps(_v, _s);
 #endif // __aarch64__
            vst1q_f32(ptr + i, _v);
        }
 #endif // __ARM_NEON

        for (; i < size; i++)
        {
            ptr[i] /= sumptr[i];
        }
    }

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/arm/softmax_arm.h
+++ b/src/layer/arm/softmax_arm.h
@@ -0,0 +1,32 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_SOFTMAX_ARM_H
 #define LAYER_SOFTMAX_ARM_H

 #include "softmax.h"

 namespace ncnn {

 // Softmax layer variant living under layer/arm. It keeps the Softmax interface
 // and declares its own versions of both forward entry points.
 class Softmax_arm : public Softmax
 {
 public:
    // Out-of-place pass: reads bottom_blob and writes the result to top_blob.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

    // In-place pass: overwrites bottom_top_blob with the result.
    virtual int forward_inplace(Mat& bottom_top_blob) const;
 };

 } // namespace ncnn

 #endif // LAYER_SOFTMAX_ARM_H
--- a/src/layer/batchnorm.cpp
+++ b/src/layer/batchnorm.cpp
@@ -0,0 +1,227 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "batchnorm.h"
 #include <math.h>

 namespace ncnn {

 DEFINE_LAYER_CREATOR(BatchNorm)

 // Sets the two layer flags: in-place execution is supported and the layer
 // works on one blob.
 BatchNorm::BatchNorm()
 {
    support_inplace = true;
    one_blob_only = true;
 }

 // The destructor body performs no explicit cleanup.
 BatchNorm::~BatchNorm()
 {
 }

 #if NCNN_STDIO
 #if NCNN_STRING
 // Reads the channel count from a text param stream.
 // Returns 0 on success, -1 (after logging to stderr) if the integer cannot be parsed.
 int BatchNorm::load_param(FILE* paramfp)
 {
    const int matched = fscanf(paramfp, "%d", &channels);
    if (matched == 1)
        return 0;

    fprintf(stderr, "BatchNorm load_param failed %d\n", matched);
    return -1;
 }
 #endif // NCNN_STRING
 // Reads the channel count as one raw int from a binary param stream.
 // Returns 0 on success, -1 (after logging to stderr) on a short read, so a
 // truncated file is reported instead of leaving channels unset.
 int BatchNorm::load_param_bin(FILE* paramfp)
 {
    // fread returns the number of complete items read; anything but 1 is a failure
    int nread = fread(&channels, sizeof(int), 1, paramfp);
    if (nread != 1)
    {
        fprintf(stderr, "BatchNorm load_param_bin failed %d\n", nread);
        return -1;
    }

    return 0;
 }

 // Loads slope, mean, var and bias (channels floats each, in that order) from
 // binfp, then precomputes the per-channel affine coefficients
 //   a = bias - slope * mean / sqrt(var)
 //   b = slope / sqrt(var)
 // Returns 0 on success, -100 if a blob is empty after create(), -1 on a short read.
 int BatchNorm::load_model(FILE* binfp)
 {
    const size_t blob_bytes = channels * sizeof(float);
    int nread;

    slope_data.create(channels);
    if (slope_data.empty())
        return -100;
    nread = fread(slope_data, blob_bytes, 1, binfp);
    if (nread != 1)
    {
        fprintf(stderr, "BatchNorm read slope_data failed %d\n", nread);
        return -1;
    }

    mean_data.create(channels);
    if (mean_data.empty())
        return -100;
    nread = fread(mean_data, blob_bytes, 1, binfp);
    if (nread != 1)
    {
        fprintf(stderr, "BatchNorm read mean_data failed %d\n", nread);
        return -1;
    }

    var_data.create(channels);
    if (var_data.empty())
        return -100;
    nread = fread(var_data, blob_bytes, 1, binfp);
    if (nread != 1)
    {
        fprintf(stderr, "BatchNorm read var_data failed %d\n", nread);
        return -1;
    }

    bias_data.create(channels);
    if (bias_data.empty())
        return -100;
    nread = fread(bias_data, blob_bytes, 1, binfp);
    if (nread != 1)
    {
        fprintf(stderr, "BatchNorm read bias_data failed %d\n", nread);
        return -1;
    }

    a_data.create(channels);
    if (a_data.empty())
        return -100;
    b_data.create(channels);
    if (b_data.empty())
        return -100;

    // fold the four statistics into one multiply-add per value
    const float* slope = slope_data;
    const float* mean = mean_data;
    const float* var = var_data;
    const float* bias = bias_data;
    float* a = a_data;
    float* b = b_data;
    for (int c=0; c<channels; c++)
    {
        const float sqrt_var = sqrt(var[c]);
        a[c] = bias[c] - slope[c] * mean[c] / sqrt_var;
        b[c] = slope[c] / sqrt_var;
    }

    return 0;
 }
 #endif // NCNN_STDIO

 // Reads the channel count from an in-memory param buffer and advances mem
 // past the 4 bytes consumed. Always returns 0.
 int BatchNorm::load_param(const unsigned char*& mem)
 {
    int* param_ptr = (int*)(mem);
    channels = param_ptr[0];

    mem += 4;
    return 0;
 }

 // Wraps slope, mean, var and bias (channels floats each, in that order) around
 // the in-memory model buffer, advances mem past them, then precomputes
 //   a = bias - slope * mean / sqrt(var)
 //   b = slope / sqrt(var)
 // Returns 0 on success, -100 if a_data or b_data is empty after create().
 int BatchNorm::load_model(const unsigned char*& mem)
 {
    const size_t blob_bytes = channels * sizeof(float);

    slope_data = Mat(channels, (float*)mem);
    mem += blob_bytes;

    mean_data = Mat(channels, (float*)mem);
    mem += blob_bytes;

    var_data = Mat(channels, (float*)mem);
    mem += blob_bytes;

    bias_data = Mat(channels, (float*)mem);
    mem += blob_bytes;

    a_data.create(channels);
    if (a_data.empty())
        return -100;
    b_data.create(channels);
    if (b_data.empty())
        return -100;

    // fold the four statistics into one multiply-add per value
    const float* slope = slope_data;
    const float* mean = mean_data;
    const float* var = var_data;
    const float* bias = bias_data;
    float* a = a_data;
    float* b = b_data;
    for (int c=0; c<channels; c++)
    {
        const float sqrt_var = sqrt(var[c]);
        a[c] = bias[c] - slope[c] * mean[c] / sqrt_var;
        b[c] = slope[c] / sqrt_var;
    }

    return 0;
 }

 int BatchNorm::forward(const Mat& bottom_blob, Mat& top_blob) const
 {
    // a = bias - slope * mean / sqrt(var)
    // b = slope / sqrt(var)
    // value = b * value + a

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int size = w * h;

    top_blob.create(w, h, channels);
    if (top_blob.empty())
        return -100;

    const float* a_data_ptr = a_data;
    const float* b_data_ptr = b_data;
    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        const float* ptr = bottom_blob.channel(q);
        float* outptr = top_blob.channel(q);

        float a = a_data_ptr[q];
        float b = b_data_ptr[q];

        for (int i=0; i<size; i++)
        {
            outptr[i] = b * ptr[i] + a;
        }
    }

    return 0;
 }

 // Applies the precomputed per-channel affine transform in place:
 //   value = b[q] * value + a[q]
 // where a = bias - slope * mean / sqrt(var) and b = slope / sqrt(var).
 // Always returns 0.
 int BatchNorm::forward_inplace(Mat& bottom_top_blob) const
 {
    const int w = bottom_top_blob.w;
    const int h = bottom_top_blob.h;
    const int size = w * h;

    const float* offsets = a_data;
    const float* scales = b_data;
    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        const float a = offsets[q];
        const float b = scales[q];

        for (int remain=size; remain>0; remain--)
        {
            *ptr = b * *ptr + a;

            ptr++;
        }
    }

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/batchnorm.h
+++ b/src/layer/batchnorm.h
@@ -0,0 +1,58 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_BATCHNORM_H
 #define LAYER_BATCHNORM_H

 #include "layer.h"

 namespace ncnn {

 // Batch normalisation layer. At load time the per-channel statistics are folded
 // into two coefficient vectors so that the forward pass is value = b * value + a.
 class BatchNorm : public Layer
 {
 public:
    BatchNorm();
    virtual ~BatchNorm();

 #if NCNN_STDIO
 #if NCNN_STRING
    // Reads channels from a text param stream.
    virtual int load_param(FILE* paramfp);
 #endif // NCNN_STRING
    // Reads channels as a raw int from a binary param stream.
    virtual int load_param_bin(FILE* paramfp);
    // Reads slope/mean/var/bias from binfp and computes a_data/b_data.
    virtual int load_model(FILE* binfp);
 #endif // NCNN_STDIO
    // In-memory counterparts; both advance mem past the bytes they consume.
    virtual int load_param(const unsigned char*& mem);
    virtual int load_model(const unsigned char*& mem);

    // Out-of-place pass: top = b * bottom + a, per channel.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

    // In-place pass: value = b * value + a, per channel.
    virtual int forward_inplace(Mat& bottom_top_blob) const;

 public:
    // param
    // number of channels; also the length of every model vector below
    int channels;

    // model
    Mat slope_data;
    Mat mean_data;
    Mat var_data;
    Mat bias_data;

    // derived in load_model:
    //   a = bias - slope * mean / sqrt(var)
    //   b = slope / sqrt(var)
    Mat a_data;
    Mat b_data;
 };

 } // namespace ncnn

 #endif // LAYER_BATCHNORM_H
--- a/src/layer/bias.cpp
+++ b/src/layer/bias.cpp
@@ -0,0 +1,139 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "bias.h"

 namespace ncnn {

 DEFINE_LAYER_CREATOR(Bias)

 // Sets the two layer flags: in-place execution is supported and the layer
 // works on one blob.
 Bias::Bias()
 {
    support_inplace = true;
    one_blob_only = true;
 }

 // The destructor body performs no explicit cleanup.
 Bias::~Bias()
 {
 }

 #if NCNN_STDIO
 #if NCNN_STRING
 // Reads bias_data_size from a text param stream.
 // Returns 0 on success, -1 (after logging to stderr) if the integer cannot be parsed.
 int Bias::load_param(FILE* paramfp)
 {
    const int matched = fscanf(paramfp, "%d", &bias_data_size);
    if (matched == 1)
        return 0;

    fprintf(stderr, "Bias load_param failed %d\n", matched);
    return -1;
 }
 #endif // NCNN_STRING
 // Reads bias_data_size as one raw int from a binary param stream.
 // Returns 0 on success, -1 (after logging to stderr) on a short read, so a
 // truncated file is reported instead of leaving bias_data_size unset.
 int Bias::load_param_bin(FILE* paramfp)
 {
    // fread returns the number of complete items read; anything but 1 is a failure
    int nread = fread(&bias_data_size, sizeof(int), 1, paramfp);
    if (nread != 1)
    {
        fprintf(stderr, "Bias load_param_bin failed %d\n", nread);
        return -1;
    }

    return 0;
 }

 // Allocates bias_data and fills it with bias_data_size floats read from binfp.
 // Returns 0 on success, -100 if bias_data is empty after create(), -1 on a short read.
 int Bias::load_model(FILE* binfp)
 {
    bias_data.create(bias_data_size);
    if (bias_data.empty())
        return -100;

    int nread = fread(bias_data, bias_data_size * sizeof(float), 1, binfp);
    if (nread == 1)
        return 0;

    fprintf(stderr, "Bias read bias_data failed %d\n", nread);
    return -1;
 }
 #endif // NCNN_STDIO

 // Reads bias_data_size from an in-memory param buffer and advances mem past
 // the 4 bytes consumed. Always returns 0.
 int Bias::load_param(const unsigned char*& mem)
 {
    int* param_ptr = (int*)(mem);
    bias_data_size = param_ptr[0];

    mem += 4;
    return 0;
 }

 // Wraps bias_data around bias_data_size floats at mem and advances mem past
 // them. Always returns 0.
 int Bias::load_model(const unsigned char*& mem)
 {
    const size_t blob_bytes = bias_data_size * sizeof(float);

    bias_data = Mat(bias_data_size, (float*)mem);
    mem += blob_bytes;

    return 0;
 }

 int Bias::forward(const Mat& bottom_blob, Mat& top_blob) const
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int size = w * h;

    top_blob.create(w, h, channels);
    if (top_blob.empty())
        return -100;

    const float* bias_ptr = bias_data;
    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        const float* ptr = bottom_blob.channel(q);
        float* outptr = top_blob.channel(q);

        float bias = bias_ptr[q];

        for (int i=0; i<size; i++)
        {
            outptr[i] = ptr[i] + bias;
        }
    }

    return 0;
 }

 // In-place pass: value += bias_data[q] for every value of channel q.
 // Always returns 0.
 int Bias::forward_inplace(Mat& bottom_top_blob) const
 {
    const int w = bottom_top_blob.w;
    const int h = bottom_top_blob.h;
    const int channels = bottom_top_blob.c;
    const int size = w * h;

    const float* biases = bias_data;
    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        const float bias = biases[q];

        for (int remain=size; remain>0; remain--)
        {
            *ptr += bias;

            ptr++;
        }
    }

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/bias.h
+++ b/src/layer/bias.h
@@ -0,0 +1,52 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_BIAS_H
 #define LAYER_BIAS_H

 #include "layer.h"

 namespace ncnn {

 // Layer that adds one bias value per channel to every value of that channel.
 class Bias : public Layer
 {
 public:
    Bias();
    virtual ~Bias();

 #if NCNN_STDIO
 #if NCNN_STRING
    // Reads bias_data_size from a text param stream.
    virtual int load_param(FILE* paramfp);
 #endif // NCNN_STRING
    // Reads bias_data_size as a raw int from a binary param stream.
    virtual int load_param_bin(FILE* paramfp);
    // Reads bias_data_size floats from binfp into bias_data.
    virtual int load_model(FILE* binfp);
 #endif // NCNN_STDIO
    // In-memory counterparts; both advance mem past the bytes they consume.
    virtual int load_param(const unsigned char*& mem);
    virtual int load_model(const unsigned char*& mem);

    // Out-of-place pass: top = bottom + bias_data[channel].
    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

    // In-place pass: value += bias_data[channel].
    virtual int forward_inplace(Mat& bottom_top_blob) const;

 public:
    // param
    // number of floats in bias_data
    int bias_data_size;

    // model
    // indexed by channel in the forward passes
    Mat bias_data;
 };

 } // namespace ncnn

 #endif // LAYER_BIAS_H
--- a/src/layer/bnll.cpp
+++ b/src/layer/bnll.cpp
@@ -0,0 +1,81 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "bnll.h"
 #include <math.h>

 namespace ncnn {

 DEFINE_LAYER_CREATOR(BNLL)

 // Sets the two layer flags: in-place execution is supported and the layer
 // works on one blob.
 BNLL::BNLL()
 {
    support_inplace = true;
    one_blob_only = true;
 }

 // Out-of-place pass computing, per value x:
 //   x > 0 :  x + log(1 + exp(-x))
 //   else  :  log(1 + exp(x))
 // Both branches only ever exponentiate a non-positive number.
 // Returns 0 on success, -100 if top_blob is empty after create().
 int BNLL::forward(const Mat& bottom_blob, Mat& top_blob) const
 {
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int channels = bottom_blob.c;
    const int size = w * h;

    top_blob.create(w, h, channels);
    if (top_blob.empty())
        return -100;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        const float* ptr = bottom_blob.channel(q);
        float* outptr = top_blob.channel(q);

        for (int remain=size; remain>0; remain--)
        {
            const float x = *ptr;
            if (x > 0)
                *outptr = x + log(1.f + exp(-x));
            else
                *outptr = log(1.f + exp(x));

            ptr++;
            outptr++;
        }
    }

    return 0;
 }

 // In-place pass replacing each value x with:
 //   x > 0 :  x + log(1 + exp(-x))
 //   else  :  log(1 + exp(x))
 // Both branches only ever exponentiate a non-positive number.
 // Always returns 0.
 int BNLL::forward_inplace(Mat& bottom_top_blob) const
 {
    const int w = bottom_top_blob.w;
    const int h = bottom_top_blob.h;
    const int channels = bottom_top_blob.c;
    const int size = w * h;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        for (int remain=size; remain>0; remain--)
        {
            const float x = *ptr;
            if (x > 0)
                *ptr = x + log(1.f + exp(-x));
            else
                *ptr = log(1.f + exp(x));

            ptr++;
        }
    }

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/bnll.h
+++ b/src/layer/bnll.h
@@ -0,0 +1,36 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_BNLL_H
 #define LAYER_BNLL_H

 #include "layer.h"

 namespace ncnn {

 // Element-wise layer computing x + log(1 + exp(-x)) for x > 0 and
 // log(1 + exp(x)) otherwise. It has no parameters or model data.
 class BNLL : public Layer
 {
 public:
    BNLL();

    // Out-of-place pass: reads bottom_blob and writes the result to top_blob.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

    // In-place pass: overwrites bottom_top_blob with the result.
    virtual int forward_inplace(Mat& bottom_top_blob) const;

 public:
 };

 } // namespace ncnn

 #endif // LAYER_BNLL_H
--- a/src/layer/concat.cpp
+++ b/src/layer/concat.cpp
@@ -0,0 +1,64 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "concat.h"

 namespace ncnn {

 DEFINE_LAYER_CREATOR(Concat)

 // The constructor body sets no layer flags.
 Concat::Concat()
 {
 }

 int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
 {
    int w = bottom_blobs[0].w;
    int h = bottom_blobs[0].h;

    // total channels
    int top_channels = 0;
    for (size_t b=0; b<bottom_blobs.size(); b++)
    {
        const Mat& bottom_blob = bottom_blobs[b];
        top_channels += bottom_blob.c;
    }

    Mat& top_blob = top_blobs[0];
    top_blob.create(w, h, top_channels);
    if (top_blob.empty())
        return -100;

    int q = 0;
    for (size_t b=0; b<bottom_blobs.size(); b++)
    {
        const Mat& bottom_blob = bottom_blobs[b];

        int channels = bottom_blob.c;
        int size = bottom_blob.cstep * channels;

        const float* ptr = bottom_blob;
        float* outptr = top_blob.channel(q);
        for (int i=0; i<size; i++)
        {
            outptr[i] = ptr[i];
        }

        q += channels;
    }

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/concat.h
+++ b/src/layer/concat.h
@@ -0,0 +1,34 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_CONCAT_H
 #define LAYER_CONCAT_H

 #include "layer.h"

 namespace ncnn {

 // Layer that joins several input blobs along the channel axis into one output
 // blob. It has no parameters or model data.
 class Concat : public Layer
 {
 public:
    Concat();

    // Writes all of bottom_blobs, in order, into top_blobs[0].
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;

 public:
 };

 } // namespace ncnn

 #endif // LAYER_CONCAT_H
--- a/src/layer/convolution.cpp
+++ b/src/layer/convolution.cpp
@@ -0,0 +1,350 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "convolution.h"

 namespace ncnn {

 // NOTE(review): DEFINE_LAYER_CREATOR is declared outside this file; presumably it emits
 // the factory function used to instantiate this layer -- confirm in layer.h.
 DEFINE_LAYER_CREATOR(Convolution)

 // Marks the layer as single-blob (one_blob_only) and as not supporting in-place
 // execution (support_inplace).
 Convolution::Convolution()
 {
    support_inplace = false;
    one_blob_only = true;
 }

 // Nothing is released explicitly here; the members clean up after themselves.
 Convolution::~Convolution()
 {
 }

 #if NCNN_STDIO
 #if NCNN_STRING
 // Parse the seven whitespace-separated integers of a text param file, in the order
 // num_output, kernel_size, dilation, stride, pad, bias_term, weight_data_size.
 // Returns 0 on success; logs to stderr and returns -1 when fewer than seven were parsed.
 int Convolution::load_param(FILE* paramfp)
 {
    const int expected_fields = 7;

    const int parsed = fscanf(paramfp, "%d %d %d %d %d %d %d",
                              &num_output, &kernel_size, &dilation, &stride, &pad, &bias_term,
                              &weight_data_size);
    if (parsed == expected_fields)
        return 0;

    fprintf(stderr, "Convolution load_param failed %d\n", parsed);
    return -1;
 }
 #endif // NCNN_STRING
 // Read the seven integer parameters from a binary param stream, in the order
 // num_output, kernel_size, dilation, stride, pad, bias_term, weight_data_size.
 // Returns 0 on success. Logs to stderr and returns -1 as soon as one field cannot be
 // read, instead of silently continuing with uninitialized parameters.
 int Convolution::load_param_bin(FILE* paramfp)
 {
    // destination of each field, in on-disk order
    int* const fields[] =
    {
        &num_output, &kernel_size, &dilation, &stride, &pad, &bias_term,
        &weight_data_size
    };
    const int field_count = sizeof(fields) / sizeof(fields[0]);

    for (int i = 0; i < field_count; i++)
    {
        if (fread(fields[i], sizeof(int), 1, paramfp) != 1)
        {
            fprintf(stderr, "Convolution load_param_bin failed %d\n", i);
            return -1;
        }
    }

    return 0;
 }

 int Convolution::load_model(FILE* binfp)
 {
    int nread;

    union
    {
        struct
        {
            unsigned char f0;
            unsigned char f1;
            unsigned char f2;
            unsigned char f3;
        };
        unsigned int tag;
    } flag_struct;

    nread = fread(&flag_struct, sizeof(flag_struct), 1, binfp);
    if (nread != 1)
    {
        fprintf(stderr, "Convolution read flag_struct failed %d\n", nread);
        return -1;
    }

    unsigned int flag = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3;

    weight_data.create(weight_data_size);
    if (weight_data.empty())
        return -100;

    if (flag_struct.tag == 0x01306B47)
    {
        // half-precision weight data
        int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned short), 4);
        std::vector<unsigned short> float16_weights;
        float16_weights.resize(align_weight_data_size);
        nread = fread(float16_weights.data(), align_weight_data_size, 1, binfp);
        if (nread != 1)
        {
            fprintf(stderr, "Convolution read float16_weights failed %d\n", nread);
            return -1;
        }

        weight_data = Mat::from_float16(float16_weights.data(), weight_data_size);
        if (weight_data.empty())
            return -100;
    }
    else if (flag != 0)
    {
        // quantized weight data
        float quantization_value[256];
        nread = fread(quantization_value, 256 * sizeof(float), 1, binfp);
        if (nread != 1)
        {
            fprintf(stderr, "Convolution read quantization_value failed %d\n", nread);
            return -1;
        }

        int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned char), 4);
        std::vector<unsigned char> index_array;
        index_array.resize(align_weight_data_size);
        nread = fread(index_array.data(), align_weight_data_size, 1, binfp);
        if (nread != 1)
        {
            fprintf(stderr, "Convolution read index_array failed %d\n", nread);
            return -1;
        }

        float* weight_data_ptr = weight_data;
        for (int i = 0; i < weight_data_size; i++)
        {
            weight_data_ptr[i] = quantization_value[ index_array[i] ];
        }
    }
    else if (flag_struct.f0 == 0)
    {
        // raw weight data
        nread = fread(weight_data, weight_data_size * sizeof(float), 1, binfp);
        if (nread != 1)
        {
            fprintf(stderr, "Convolution read weight_data failed %d\n", nread);
            return -1;
        }
    }

    if (bias_term)
    {
        bias_data.create(num_output);
        if (bias_data.empty())
            return -100;
        nread = fread(bias_data, num_output * sizeof(float), 1, binfp);
        if (nread != 1)
        {
            fprintf(stderr, "Convolution read bias_data failed %d\n", nread);
            return -1;
        }
    }

    return 0;
 }
 #endif // NCNN_STDIO

 // Read the seven integer parameters from an in-memory param blob, in the order
 // num_output, kernel_size, dilation, stride, pad, bias_term, weight_data_size,
 // and advance mem past them (4 bytes per field). Always returns 0.
 int Convolution::load_param(const unsigned char*& mem)
 {
    // destination of each field, in storage order
    int* const fields[] =
    {
        &num_output, &kernel_size, &dilation, &stride, &pad, &bias_term,
        &weight_data_size
    };
    const int field_count = sizeof(fields) / sizeof(fields[0]);

    for (int i = 0; i < field_count; i++)
    {
        // memcpy rather than *(int*)mem: it places no alignment requirement on mem and
        // does not read the byte buffer through an incompatible pointer type
        memcpy(fields[i], mem, sizeof(int));
        mem += 4;
    }

    return 0;
 }

 // Load the weights (and the optional bias) from an in-memory model blob and advance mem
 // past everything consumed. The leading 4-byte flag selects the encoding: the tag
 // 0x01306B47 means half-precision, any other non-zero flag byte means table-quantized,
 // all-zero means raw floats. Returns 0 on success, -100 when an allocation fails.
 int Convolution::load_model(const unsigned char*& mem)
 {
    union
    {
        struct
        {
            unsigned char f0;
            unsigned char f1;
            unsigned char f2;
            unsigned char f3;
        };
        unsigned int tag;
    } flag_struct;

    memcpy(&flag_struct, mem, sizeof(flag_struct));
    mem += sizeof(flag_struct);

    const unsigned int flag_byte_sum = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3;
    const bool is_half_precision = (flag_struct.tag == 0x01306B47);

    if (is_half_precision)
    {
        // 16-bit weights, converted by Mat::from_float16
        weight_data = Mat::from_float16((unsigned short*)mem, weight_data_size);
        mem += alignSize(weight_data_size * sizeof(unsigned short), 4);
        if (weight_data.empty())
            return -100;
    }
    else if (flag_byte_sum != 0)
    {
        // 256-entry float table followed by one index byte per weight
        const float* table = (const float*)mem;
        mem += 256 * sizeof(float);

        const unsigned char* indices = (const unsigned char*)mem;
        mem += alignSize(weight_data_size * sizeof(unsigned char), 4);

        weight_data.create(weight_data_size);
        if (weight_data.empty())
            return -100;

        float* dst = weight_data;
        for (int n = 0; n < weight_data_size; n++)
        {
            dst[n] = table[ indices[n] ];
        }
    }
    else if (flag_struct.f0 == 0)
    {
        // plain floats, handed to Mat together with the source pointer
        weight_data = Mat(weight_data_size, (float*)mem);
        mem += weight_data_size * sizeof(float);
    }

    if (bias_term)
    {
        // num_output bias floats follow the weights
        bias_data = Mat(num_output, (float*)mem);
        mem += num_output * sizeof(float);
    }

    return 0;
 }

 // Direct 2D convolution with a square kernel_size x kernel_size kernel.
 //
 // When pad > 0 the input is first surrounded by pad zeros on every side. Each of the
 // num_output output channels is then computed element by element as the bias (when
 // bias_term is set) plus the sum, over all input channels and kernel taps, of
 // input value * weight. Kernel taps are dilation elements apart in both directions and
 // the window moves by stride between output elements.
 //
 // Returns 0 on success, -100 when an allocation fails.
 int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const
 {
    // convolv with NxN kernel
    // value = value + bias

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;

 //     fprintf(stderr, "Convolution input %d x %d  pad = %d  ksize=%d  stride=%d\n", w, h, pad, kernel_size, stride);

    Mat bottom_blob_bordered = bottom_blob;
    if (pad > 0)
    {
        copy_make_border(bottom_blob, bottom_blob_bordered, pad, pad, pad, pad, BORDER_CONSTANT, 0.f);
        if (bottom_blob_bordered.empty())
            return -100;

        w = bottom_blob_bordered.w;
        h = bottom_blob_bordered.h;
    }

    // span covered by the dilated kernel along one axis
    const int kernel_extent = dilation * (kernel_size - 1) + 1;

    int outw = (w - kernel_extent) / stride + 1;
    int outh = (h - kernel_extent) / stride + 1;

    top_blob.create(outw, outh, num_output);
    if (top_blob.empty())
        return -100;

    const int maxk = kernel_size * kernel_size;

    // kernel offsets: space_ofs[k] is the distance from the window origin to tap k
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        // Walking one kernel row advances p2 by kernel_size * dilation, and the next
        // kernel row starts w * dilation after the current one, so the remainder to skip
        // is the difference of the two. (Using kernel_extent here is only right for
        // dilation == 1 and shifts every following row by dilation - 1 otherwise.)
        int gap = w * dilation - kernel_size * dilation;
        for (int i = 0; i < kernel_size; i++)
        {
            for (int j = 0; j < kernel_size; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation;
            }
            p2 += gap;
        }
    }

    // num_output
    const float* weight_data_ptr = weight_data;
    #pragma omp parallel for
    for (int p=0; p<num_output; p++)
    {
        float* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                float sum = 0.f;

                if (bias_term)
                    sum = bias_data.data[p];

                // weights of output channel p: maxk values per input channel
                const float* kptr = weight_data_ptr + maxk * channels * p;

                // channels
                for (int q=0; q<channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    // top-left element of the window for output element (i, j)
                    const float* sptr = m.data + m.w * i*stride + j*stride;

                    for (int k = 0; k < maxk; k++)
                    {
                        float val = sptr[ space_ofs[k] ];
                        float wt = kptr[k];
                        sum += val * wt;
                    }

                    kptr += maxk;
                }

                outptr[j] = sum;
            }

            outptr += outw;
        }
    }

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/convolution.h
+++ b/src/layer/convolution.h
@@ -0,0 +1,58 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_CONVOLUTION_H
 #define LAYER_CONVOLUTION_H

 #include "layer.h"

 namespace ncnn {

 // 2D convolution layer with a square kernel, optional zero padding, stride, dilation and
 // an optional per-output-channel bias.
 class Convolution : public Layer
 {
 public:
    Convolution();
    virtual ~Convolution();

 #if NCNN_STDIO
 #if NCNN_STRING
    // read the parameters from a text param file
    virtual int load_param(FILE* paramfp);
 #endif // NCNN_STRING
    // read the parameters from a binary param file
    virtual int load_param_bin(FILE* paramfp);
    // read weight_data and bias_data from a binary model file
    virtual int load_model(FILE* binfp);
 #endif // NCNN_STDIO
    // read the parameters / the model from memory; mem is advanced past the bytes consumed
    virtual int load_param(const unsigned char*& mem);
    virtual int load_model(const unsigned char*& mem);

    // single-blob forward pass: convolves bottom_blob into top_blob
    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

 public:
    // param
    int num_output;     // number of output channels
    int kernel_size;    // side length of the square kernel
    int dilation;       // distance between neighbouring kernel taps
    int stride;         // step of the kernel window between output elements
    int pad;            // zero padding added on each of the four sides of the input
    int bias_term;      // non-zero when bias_data is present and added to the output

    int weight_data_size;   // number of values in weight_data

    // model
    Mat weight_data;
    Mat bias_data;      // num_output values, only loaded when bias_term is set
 };

 } // namespace ncnn

 #endif // LAYER_CONVOLUTION_H
--- a/src/layer/crop.cpp
+++ b/src/layer/crop.cpp
@@ -0,0 +1,85 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "crop.h"

 namespace ncnn {

 // NOTE(review): DEFINE_LAYER_CREATOR is declared outside this file; presumably it emits
 // the factory function used to instantiate this layer -- confirm in layer.h.
 DEFINE_LAYER_CREATOR(Crop)

 // The constructor sets nothing; woffset and hoffset are filled in by load_param.
 Crop::Crop()
 {
 }

 #if NCNN_STDIO
 #if NCNN_STRING
 // Parse woffset and hoffset, in that order, from a text param file.
 // Returns 0 on success; logs to stderr and returns -1 when fewer than two were parsed.
 int Crop::load_param(FILE* paramfp)
 {
    const int parsed = fscanf(paramfp, "%d %d", &woffset, &hoffset);
    if (parsed == 2)
        return 0;

    fprintf(stderr, "Crop load_param failed %d\n", parsed);
    return -1;
 }
 #endif // NCNN_STRING
 // Read woffset and hoffset, in that order, from a binary param stream.
 // Returns 0 on success. Logs to stderr and returns -1 on a short read, instead of
 // silently continuing with uninitialized offsets.
 int Crop::load_param_bin(FILE* paramfp)
 {
    // || short-circuits, so hoffset is only read after woffset was read successfully
    if (fread(&woffset, sizeof(int), 1, paramfp) != 1
            || fread(&hoffset, sizeof(int), 1, paramfp) != 1)
    {
        fprintf(stderr, "Crop load_param_bin failed\n");
        return -1;
    }

    return 0;
 }
 #endif // NCNN_STDIO

 // Read woffset and hoffset, in that order, from an in-memory param blob and advance mem
 // past them (4 bytes per field). Always returns 0.
 int Crop::load_param(const unsigned char*& mem)
 {
    // memcpy rather than *(int*)mem: it places no alignment requirement on mem and does
    // not read the byte buffer through an incompatible pointer type
    memcpy(&woffset, mem, sizeof(int));
    mem += 4;

    memcpy(&hoffset, mem, sizeof(int));
    mem += 4;

    return 0;
 }

 int Crop::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
 {
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& reference_blob = bottom_blobs[1];

    int w = bottom_blob.w;
    int h = bottom_blob.h;

    int outw = reference_blob.w;
    int outh = reference_blob.h;

    int top = hoffset;
    int bottom = h - outh - hoffset;
    int left = woffset;
    int right = w - outw - woffset;

    Mat& top_blob = top_blobs[0];

    copy_cut_border(bottom_blob, top_blob, top, bottom, left, right);
    if (top_blob.empty())
        return -100;

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/crop.h
+++ b/src/layer/crop.h
@@ -0,0 +1,44 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_CROP_H
 #define LAYER_CROP_H

 #include "layer.h"

 namespace ncnn {

 // Layer that crops its first input blob to the width and height of its second input
 // blob, starting at a configurable offset.
 class Crop : public Layer
 {
 public:
    Crop();

 #if NCNN_STDIO
 #if NCNN_STRING
    // read woffset and hoffset from a text param file
    virtual int load_param(FILE* paramfp);
 #endif // NCNN_STRING
    // read woffset and hoffset from a binary param file
    virtual int load_param_bin(FILE* paramfp);
 #endif // NCNN_STDIO
    // read woffset and hoffset from memory; mem is advanced past the bytes consumed
    virtual int load_param(const unsigned char*& mem);

    // bottom_blobs[0] is the blob to crop, bottom_blobs[1] supplies the target size
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;

 public:
    // number of columns skipped on the left of the source blob
    int woffset;
    // number of rows skipped at the top of the source blob
    int hoffset;
 };

 } // namespace ncnn

 #endif // LAYER_CROP_H
--- a/src/layer/deconvolution.cpp
+++ b/src/layer/deconvolution.cpp
@@ -0,0 +1,348 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "deconvolution.h"

 namespace ncnn {

 // NOTE(review): DEFINE_LAYER_CREATOR is declared outside this file; presumably it emits
 // the factory function used to instantiate this layer -- confirm in layer.h.
 DEFINE_LAYER_CREATOR(Deconvolution)

 // Marks the layer as single-blob (one_blob_only) and as not supporting in-place
 // execution (support_inplace).
 Deconvolution::Deconvolution()
 {
    support_inplace = false;
    one_blob_only = true;
 }

 // Nothing is released explicitly here; the members clean up after themselves.
 Deconvolution::~Deconvolution()
 {
 }

 #if NCNN_STDIO
 #if NCNN_STRING
 // Parse the seven whitespace-separated integers of a text param file, in the order
 // num_output, kernel_size, dilation, stride, pad, bias_term, weight_data_size.
 // Returns 0 on success; logs to stderr and returns -1 when fewer than seven were parsed.
 int Deconvolution::load_param(FILE* paramfp)
 {
    const int expected_fields = 7;

    const int parsed = fscanf(paramfp, "%d %d %d %d %d %d %d",
                              &num_output, &kernel_size, &dilation, &stride, &pad, &bias_term,
                              &weight_data_size);
    if (parsed == expected_fields)
        return 0;

    fprintf(stderr, "Deconvolution load_param failed %d\n", parsed);
    return -1;
 }
 #endif // NCNN_STRING
 // Read the seven integer parameters from a binary param stream, in the order
 // num_output, kernel_size, dilation, stride, pad, bias_term, weight_data_size.
 // Returns 0 on success. Logs to stderr and returns -1 as soon as one field cannot be
 // read, instead of silently continuing with uninitialized parameters.
 int Deconvolution::load_param_bin(FILE* paramfp)
 {
    // destination of each field, in on-disk order
    int* const fields[] =
    {
        &num_output, &kernel_size, &dilation, &stride, &pad, &bias_term,
        &weight_data_size
    };
    const int field_count = sizeof(fields) / sizeof(fields[0]);

    for (int i = 0; i < field_count; i++)
    {
        if (fread(fields[i], sizeof(int), 1, paramfp) != 1)
        {
            fprintf(stderr, "Deconvolution load_param_bin failed %d\n", i);
            return -1;
        }
    }

    return 0;
 }

 int Deconvolution::load_model(FILE* binfp)
 {
    int nread;

    union
    {
        struct
        {
            unsigned char f0;
            unsigned char f1;
            unsigned char f2;
            unsigned char f3;
        };
        unsigned int tag;
    } flag_struct;

    nread = fread(&flag_struct, sizeof(flag_struct), 1, binfp);
    if (nread != 1)
    {
        fprintf(stderr, "Deconvolution read flag_struct failed %d\n", nread);
        return -1;
    }

    unsigned int flag = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3;

    weight_data.create(weight_data_size);
    if (weight_data.empty())
        return -100;

    if (flag_struct.tag == 0x01306B47)
    {
        // half-precision weight data
        int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned short), 4);
        std::vector<unsigned short> float16_weights;
        float16_weights.resize(align_weight_data_size);
        nread = fread(float16_weights.data(), align_weight_data_size, 1, binfp);
        if (nread != 1)
        {
            fprintf(stderr, "Deconvolution read float16_weights failed %d\n", nread);
            return -1;
        }

        weight_data = Mat::from_float16(float16_weights.data(), weight_data_size);
        if (weight_data.empty())
            return -100;
    }
    else if (flag != 0)
    {
        // quantized weight data
        float quantization_value[256];
        nread = fread(quantization_value, 256 * sizeof(float), 1, binfp);
        if (nread != 1)
        {
            fprintf(stderr, "Deconvolution read quantization_value failed %d\n", nread);
            return -1;
        }

        int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned char), 4);
        std::vector<unsigned char> index_array;
        index_array.resize(align_weight_data_size);
        nread = fread(index_array.data(), align_weight_data_size, 1, binfp);
        if (nread != 1)
        {
            fprintf(stderr, "Deconvolution read index_array failed %d\n", nread);
            return -1;
        }

        float* weight_data_ptr = weight_data;
        for (int i = 0; i < weight_data_size; i++)
        {
            weight_data_ptr[i] = quantization_value[ index_array[i] ];
        }
    }
    else if (flag_struct.f0 == 0)
    {
        // raw weight data
        nread = fread(weight_data, weight_data_size * sizeof(float), 1, binfp);
        if (nread != 1)
        {
            fprintf(stderr, "Deconvolution read weight_data failed %d\n", nread);
            return -1;
        }
    }

    if (bias_term)
    {
        bias_data.create(num_output);
        if (bias_data.empty())
            return -100;
        nread = fread(bias_data, num_output * sizeof(float), 1, binfp);
        if (nread != 1)
        {
            fprintf(stderr, "Deconvolution read bias_data failed %d\n", nread);
            return -1;
        }
    }

    return 0;
 }
 #endif // NCNN_STDIO

 // Read the seven integer parameters from an in-memory param blob, in the order
 // num_output, kernel_size, dilation, stride, pad, bias_term, weight_data_size,
 // and advance mem past them (4 bytes per field). Always returns 0.
 int Deconvolution::load_param(const unsigned char*& mem)
 {
    // destination of each field, in storage order
    int* const fields[] =
    {
        &num_output, &kernel_size, &dilation, &stride, &pad, &bias_term,
        &weight_data_size
    };
    const int field_count = sizeof(fields) / sizeof(fields[0]);

    for (int i = 0; i < field_count; i++)
    {
        // memcpy rather than *(int*)mem: it places no alignment requirement on mem and
        // does not read the byte buffer through an incompatible pointer type
        memcpy(fields[i], mem, sizeof(int));
        mem += 4;
    }

    return 0;
 }

 // Load the weights (and the optional bias) from an in-memory model blob and advance mem
 // past everything consumed. The leading 4-byte flag selects the encoding: the tag
 // 0x01306B47 means half-precision, any other non-zero flag byte means table-quantized,
 // all-zero means raw floats. Returns 0 on success, -100 when an allocation fails.
 int Deconvolution::load_model(const unsigned char*& mem)
 {
    union
    {
        struct
        {
            unsigned char f0;
            unsigned char f1;
            unsigned char f2;
            unsigned char f3;
        };
        unsigned int tag;
    } flag_struct;

    memcpy(&flag_struct, mem, sizeof(flag_struct));
    mem += sizeof(flag_struct);

    const unsigned int flag_byte_sum = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3;
    const bool is_half_precision = (flag_struct.tag == 0x01306B47);

    if (is_half_precision)
    {
        // 16-bit weights, converted by Mat::from_float16
        weight_data = Mat::from_float16((unsigned short*)mem, weight_data_size);
        mem += alignSize(weight_data_size * sizeof(unsigned short), 4);
        if (weight_data.empty())
            return -100;
    }
    else if (flag_byte_sum != 0)
    {
        // 256-entry float table followed by one index byte per weight
        const float* table = (const float*)mem;
        mem += 256 * sizeof(float);

        const unsigned char* indices = (const unsigned char*)mem;
        mem += alignSize(weight_data_size * sizeof(unsigned char), 4);

        weight_data.create(weight_data_size);
        if (weight_data.empty())
            return -100;

        float* dst = weight_data;
        for (int n = 0; n < weight_data_size; n++)
        {
            dst[n] = table[ indices[n] ];
        }
    }
    else if (flag_struct.f0 == 0)
    {
        // plain floats, handed to Mat together with the source pointer
        weight_data = Mat(weight_data_size, (float*)mem);
        mem += weight_data_size * sizeof(float);
    }

    if (bias_term)
    {
        // num_output bias floats follow the weights
        bias_data = Mat(num_output, (float*)mem);
        mem += num_output * sizeof(float);
    }

    return 0;
 }

 // Direct 2D deconvolution (backward strided convolution) with a square
 // kernel_size x kernel_size kernel.
 //
 // Every output channel is first filled with its bias (0 when bias_term is unset). Each
 // input element is then multiplied by the kernel and accumulated into the output window
 // whose origin is (i*stride, j*stride); kernel taps are dilation elements apart in both
 // directions. When pad > 0, pad elements are finally cut from every side of the result.
 //
 // Returns 0 on success, -100 when an allocation fails.
 int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const
 {
    // backward strided convolv with NxN kernel
    // value = value + bias

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;

 //     fprintf(stderr, "Deconvolution input %d x %d  pad = %d  ksize=%d  stride=%d\n", w, h, pad, kernel_size, stride);

    // span covered by the dilated kernel along one axis
    const int kernel_extent = dilation * (kernel_size - 1) + 1;

    // size of the output before the padding is cut away
    int outw = (w - 1) * stride + kernel_extent;
    int outh = (h - 1) * stride + kernel_extent;

    Mat top_blob_bordered;
    top_blob_bordered.create(outw, outh, num_output);
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_size * kernel_size;

    // kernel offsets: space_ofs[k] is the distance from the window origin to tap k
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        // Walking one kernel row advances p2 by kernel_size * dilation, and the next
        // kernel row starts outw * dilation after the current one, so the remainder to
        // skip is the difference of the two. (Using kernel_extent here is only right for
        // dilation == 1 and shifts every following row by dilation - 1 otherwise.)
        int gap = outw * dilation - kernel_size * dilation;
        for (int i = 0; i < kernel_size; i++)
        {
            for (int j = 0; j < kernel_size; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation;
            }
            p2 += gap;
        }
    }

    // num_output
    const float* weight_data_ptr = weight_data;
    #pragma omp parallel for
    for (int p=0; p<num_output; p++)
    {
        Mat out = top_blob_bordered.channel(p);

        const float bias = bias_term ? bias_data.data[p] : 0.f;

        // start from the bias so the loops below only have to accumulate
        out.fill(bias);

        for (int i = 0; i < h; i++)
        {
            for (int j = 0; j < w; j++)
            {
                // origin of the output window that input element (i, j) contributes to
                float* outptr = out.data + out.w * i*stride + j*stride;

                // weights of output channel p: maxk values per input channel
                const float* kptr = weight_data_ptr + maxk * channels * p;

                // channels
                for (int q=0; q<channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    float val = *(m.data + m.w * i + j);

                    for (int k = 0; k < maxk; k++)
                    {
                        float wt = kptr[k];
                        outptr[ space_ofs[k] ] += val * wt;
                    }

                    kptr += maxk;
                }
            }
        }
    }

    top_blob = top_blob_bordered;

    if (pad > 0)
    {
        copy_cut_border(top_blob_bordered, top_blob, pad, pad, pad, pad);
        if (top_blob.empty())
            return -100;
    }

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/deconvolution.h
+++ b/src/layer/deconvolution.h
@@ -0,0 +1,58 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_DECONVOLUTION_H
 #define LAYER_DECONVOLUTION_H

 #include "layer.h"

 namespace ncnn {

 // 2D deconvolution (backward strided convolution) layer with a square kernel, stride,
 // dilation, an optional per-output-channel bias and optional cropping of the result.
 class Deconvolution : public Layer
 {
 public:
    Deconvolution();
    virtual ~Deconvolution();

 #if NCNN_STDIO
 #if NCNN_STRING
    // read the parameters from a text param file
    virtual int load_param(FILE* paramfp);
 #endif // NCNN_STRING
    // read the parameters from a binary param file
    virtual int load_param_bin(FILE* paramfp);
    // read weight_data and bias_data from a binary model file
    virtual int load_model(FILE* binfp);
 #endif // NCNN_STDIO
    // read the parameters / the model from memory; mem is advanced past the bytes consumed
    virtual int load_param(const unsigned char*& mem);
    virtual int load_model(const unsigned char*& mem);

    // single-blob forward pass: deconvolves bottom_blob into top_blob
    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

 public:
    // param
    int num_output;     // number of output channels
    int kernel_size;    // side length of the square kernel
    int dilation;       // distance between neighbouring kernel taps
    int stride;         // distance between the output windows of neighbouring input elements
    int pad;            // number of elements cut from each of the four sides of the output
    int bias_term;      // non-zero when bias_data is present and added to the output

    int weight_data_size;   // number of values in weight_data

    // model
    Mat weight_data;
    Mat bias_data;      // num_output values, only loaded when bias_term is set
 };

 } // namespace ncnn

 #endif // LAYER_DECONVOLUTION_H
--- a/src/layer/dropout.cpp
+++ b/src/layer/dropout.cpp
@@ -0,0 +1,38 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "dropout.h"

 namespace ncnn {

 // NOTE(review): DEFINE_LAYER_CREATOR is declared outside this file; presumably it emits
 // the factory function used to instantiate this layer -- confirm in layer.h.
 DEFINE_LAYER_CREATOR(Dropout)

 // Marks the layer as single-blob (one_blob_only) and as supporting in-place
 // execution (support_inplace).
 Dropout::Dropout()
 {
    support_inplace = true;
    one_blob_only = true;
 }

 // Pass the input through unchanged: top_blob is assigned from bottom_blob, so the layer
 // acts as an identity here. Always returns 0.
 int Dropout::forward(const Mat& bottom_blob, Mat& top_blob) const
 {
    top_blob = bottom_blob;
    return 0;
 }

 // In-place variant of forward: the blob is left untouched. Always returns 0.
 int Dropout::forward_inplace(Mat& /*bottom_top_blob*/) const
 {
    return 0;
 }

 } // namespace ncnn
--- a/src/layer/dropout.h
+++ b/src/layer/dropout.h
@@ -0,0 +1,35 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_DROPOUT_H
 #define LAYER_DROPOUT_H

 #include "layer.h"

 namespace ncnn {

 // Dropout layer. Both forward variants leave the data unchanged, so the layer behaves
 // as an identity in this library.
 class Dropout : public Layer
 {
 public:
    Dropout();

    // assigns bottom_blob to top_blob
    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

    // leaves bottom_top_blob as it is
    virtual int forward_inplace(Mat& bottom_top_blob) const;

 };

 } // namespace ncnn

 #endif // LAYER_DROPOUT_H
--- a/src/layer/eltwise.cpp
+++ b/src/layer/eltwise.cpp
@@ -0,0 +1,246 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "eltwise.h"

 namespace ncnn {

 // NOTE(review): DEFINE_LAYER_CREATOR is declared outside this file; presumably it emits
 // the factory function used to instantiate this layer -- confirm in layer.h.
 DEFINE_LAYER_CREATOR(Eltwise)

 // The constructor sets nothing; op_type, num_coeff and coeffs are filled in by load_param.
 Eltwise::Eltwise()
 {
 }

 #if NCNN_STDIO
 #if NCNN_STRING
 // Parse op_type and num_coeff from a text param file, followed by num_coeff float
 // coefficients when num_coeff is positive.
 // Returns 0 on success, -1 (after logging to stderr) on a parse failure and -100 when
 // the coefficient blob cannot be allocated.
 int Eltwise::load_param(FILE* paramfp)
 {
    const int header_fields = fscanf(paramfp, "%d %d", &op_type, &num_coeff);
    if (header_fields != 2)
    {
        fprintf(stderr, "Eltwise load_param failed %d\n", header_fields);
        return -1;
    }

    // no coefficients to read
    if (num_coeff <= 0)
        return 0;

    coeffs.create(num_coeff);
    if (coeffs.empty())
        return -100;

    float* dst = coeffs;
    for (int n=0; n<num_coeff; n++)
    {
        const int got = fscanf(paramfp, "%f", &dst[n]);
        if (got != 1)
        {
            fprintf(stderr, "Eltwise load_param failed %d\n", got);
            return -1;
        }
    }

    return 0;
 }
 #endif // NCNN_STRING
 // Read op_type and num_coeff from a binary param stream, followed by num_coeff float
 // coefficients when num_coeff is positive.
 // Returns 0 on success, -1 (after logging to stderr) on a short read, instead of
 // silently continuing with uninitialized values, and -100 when the coefficient blob
 // cannot be allocated.
 int Eltwise::load_param_bin(FILE* paramfp)
 {
    // || short-circuits, so num_coeff is only read after op_type was read successfully
    if (fread(&op_type, sizeof(int), 1, paramfp) != 1
            || fread(&num_coeff, sizeof(int), 1, paramfp) != 1)
    {
        fprintf(stderr, "Eltwise load_param_bin failed\n");
        return -1;
    }

    if (num_coeff > 0)
    {
        coeffs.create(num_coeff);
        if (coeffs.empty())
            return -100;
        float* coeffs_ptr = coeffs;
        size_t nread = fread(coeffs_ptr, sizeof(float), num_coeff, paramfp);
        if (nread != (size_t)num_coeff)
        {
            fprintf(stderr, "Eltwise load_param_bin read coeffs failed %d\n", (int)nread);
            return -1;
        }
    }

    return 0;
 }
 #endif // NCNN_STDIO

 // Read op_type and num_coeff from an in-memory param blob, hand the num_coeff floats
 // that follow to coeffs, and advance mem past everything consumed. Always returns 0.
 int Eltwise::load_param(const unsigned char*& mem)
 {
    // memcpy rather than *(int*)mem: it places no alignment requirement on mem and does
    // not read the byte buffer through an incompatible pointer type
    memcpy(&op_type, mem, sizeof(int));
    mem += 4;

    memcpy(&num_coeff, mem, sizeof(int));
    mem += 4;

    // NOTE(review): Mat(int, float*) presumably references mem without copying, so the
    // buffer has to outlive this layer -- confirm in mat.h
    coeffs = Mat(num_coeff, (float*)mem);
    mem += num_coeff * sizeof(float);

    return 0;
 }

 int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
 {
    const Mat& bottom_blob = bottom_blobs[0];
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int size = w * h;

    Mat& top_blob = top_blobs[0];
    top_blob.create(w, h, channels);
    if (top_blob.empty())
        return -100;

    if (op_type == Operation_PROD)
    {
        // first blob
        const Mat& bottom_blob1 = bottom_blobs[1];
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            const float* ptr1 = bottom_blob1.channel(q);
            float* outptr = top_blob.channel(q);

            for (int i=0; i<size; i++)
            {
                outptr[i] = ptr[i] * ptr1[i];
            }
        }

        for (size_t b=2; b<bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob1 = bottom_blobs[b];
            #pragma omp parallel for
            for (int q=0; q<channels; q++)
            {
                const float* ptr = bottom_blob1.channel(q);
                float* outptr = top_blob.channel(q);

                for (int i=0; i<size; i++)
                {
                    outptr[i] *= ptr[i];
                }
            }
        }
    }
    else if (op_type == Operation_SUM)
    {
        if (num_coeff == 0)
        {
            // first blob
            const Mat& bottom_blob1 = bottom_blobs[1];
            #pragma omp parallel for
            for (int q=0; q<channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
                const float* ptr1 = bottom_blob1.channel(q);
                float* outptr = top_blob.channel(q);

                for (int i=0; i<size; i++)
                {
                    outptr[i] = ptr[i] + ptr1[i];
                }
            }

            for (size_t b=2; b<bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob1 = bottom_blobs[b];
                #pragma omp parallel for
                for (int q=0; q<channels; q++)
                {
                    const float* ptr = bottom_blob1.channel(q);
                    float* outptr = top_blob.channel(q);

                    for (int i=0; i<size; i++)
                    {
                        outptr[i] += ptr[i];
                    }
                }
            }
        }
        else
        {
            const float* coeffs_ptr = coeffs;

            // first blob
            const Mat& bottom_blob1 = bottom_blobs[1];
            float coeff0 = coeffs_ptr[0];
            float coeff1 = coeffs_ptr[1];
            #pragma omp parallel for
            for (int q=0; q<channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
                const float* ptr1 = bottom_blob1.channel(q);
                float* outptr = top_blob.channel(q);

                for (int i=0; i<size; i++)
                {
                    outptr[i] = ptr[i] * coeff0 + ptr1[i] * coeff1;
                }
            }

            for (size_t b=2; b<bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob1 = bottom_blobs[b];
                float coeff = coeffs_ptr[b];
                #pragma omp parallel for
                for (int q=0; q<channels; q++)
                {
                    const float* ptr = bottom_blob1.channel(q);
                    float* outptr = top_blob.channel(q);

                    for (int i=0; i<size; i++)
                    {
                        outptr[i] += ptr[i] * coeff;
                    }
                }
            }
        }
    }
    else if (op_type == Operation_MAX)
    {
        // first blob
        const Mat& bottom_blob1 = bottom_blobs[1];
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            const float* ptr1 = bottom_blob1.channel(q);
            float* outptr = top_blob.channel(q);

            for (int i=0; i<size; i++)
            {
                outptr[i] = std::max(ptr[i], ptr1[i]);
            }
        }

        for (size_t b=2; b<bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob1 = bottom_blobs[b];
            #pragma omp parallel for
            for (int q=0; q<channels; q++)
            {
                const float* ptr = bottom_blob1.channel(q);
                float* outptr = top_blob.channel(q);

                for (int i=0; i<size; i++)
                {
                    outptr[i] = std::max(outptr[i], ptr[i]);
                }
            }
        }
    }

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/eltwise.h
+++ b/src/layer/eltwise.h
@@ -0,0 +1,48 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_ELTWISE_H
 #define LAYER_ELTWISE_H

 #include "layer.h"

 namespace ncnn {

 // Layer that combines two or more equally shaped input blobs element by element,
 // using a product, an (optionally weighted) sum or a maximum.
 class Eltwise : public Layer
 {
 public:
    Eltwise();

 #if NCNN_STDIO
 #if NCNN_STRING
    // read op_type, num_coeff and the coefficients from a text param file
    virtual int load_param(FILE* paramfp);
 #endif // NCNN_STRING
    // read op_type, num_coeff and the coefficients from a binary param file
    virtual int load_param_bin(FILE* paramfp);
 #endif // NCNN_STDIO
    // read the same parameters from memory; mem is advanced past the bytes consumed
    virtual int load_param(const unsigned char*& mem);

    // combines all bottom_blobs into top_blobs[0] according to op_type
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;

    // values accepted for op_type
    enum { Operation_PROD = 0, Operation_SUM = 1, Operation_MAX = 2 };

 public:
    // param
    // one of the Operation_* values above
    int op_type;
    // number of entries in coeffs; 0 selects the unweighted sum
    int num_coeff;
    // per-input scale factors, used by Operation_SUM when num_coeff is non-zero
    Mat coeffs;
 };

 } // namespace ncnn

 #endif // LAYER_ELTWISE_H