Browse Source

add ncnn source qwq

tags/20170724
nihuini 9 years ago
commit
b7db8be4f6
100 changed files with 16317 additions and 0 deletions
  1. +62
    -0
      CMakeLists.txt
  2. +18
    -0
      Info.plist
  3. +86
    -0
      LICENSE.txt
  4. +44
    -0
      README.md
  5. +1735
    -0
      android.toolchain.cmake
  6. +33
    -0
      build.sh
  7. +9
    -0
      examples/CMakeLists.txt
  8. +15
    -0
      examples/squeezencnn/AndroidManifest.xml
  9. +21
    -0
      examples/squeezencnn/ant.properties
  10. +1
    -0
      examples/squeezencnn/assets/squeezenet_v1.1.bin
  11. BIN
      examples/squeezencnn/assets/squeezenet_v1.1.param.bin
  12. +1
    -0
      examples/squeezencnn/assets/synset_words.txt
  13. +92
    -0
      examples/squeezencnn/build.xml
  14. +30
    -0
      examples/squeezencnn/jni/Android.mk
  15. +7
    -0
      examples/squeezencnn/jni/Application.mk
  16. +181
    -0
      examples/squeezencnn/jni/squeezencnn_jni.cpp
  17. +163
    -0
      examples/squeezencnn/jni/squeezenet_v1.1.id.h
  18. +10
    -0
      examples/squeezencnn/local.properties
  19. +20
    -0
      examples/squeezencnn/proguard-project.txt
  20. +14
    -0
      examples/squeezencnn/project.properties
  21. +36
    -0
      examples/squeezencnn/res/layout/main.xml
  22. +4
    -0
      examples/squeezencnn/res/values/strings.xml
  23. +189
    -0
      examples/squeezencnn/src/com/tencent/squeezencnn/MainActivity.java
  24. +29
    -0
      examples/squeezencnn/src/com/tencent/squeezencnn/SqueezeNcnn.java
  25. +95
    -0
      examples/squeezenet.cpp
  26. BIN
      examples/squeezenet_v1.1.bin
  27. BIN
      examples/squeezenet_v1.1.caffemodel
  28. +76
    -0
      examples/squeezenet_v1.1.param
  29. +548
    -0
      examples/squeezenet_v1.1.prototxt
  30. +1000
    -0
      examples/synset_words.txt
  31. +193
    -0
      ios.toolchain.cmake
  32. +40
    -0
      iossimxc.toolchain.cmake
  33. +39
    -0
      iosxc.toolchain.cmake
  34. +35
    -0
      package.sh
  35. +135
    -0
      src/CMakeLists.txt
  36. +24
    -0
      src/blob.cpp
  37. +43
    -0
      src/blob.h
  38. +471
    -0
      src/cpu.cpp
  39. +51
    -0
      src/cpu.h
  40. +130
    -0
      src/layer.cpp
  41. +163
    -0
      src/layer.h
  42. +76
    -0
      src/layer/absval.cpp
  43. +36
    -0
      src/layer/absval.h
  44. +108
    -0
      src/layer/argmax.cpp
  45. +44
    -0
      src/layer/argmax.h
  46. +152
    -0
      src/layer/arm/absval_arm.cpp
  47. +34
    -0
      src/layer/arm/absval_arm.h
  48. +186
    -0
      src/layer/arm/batchnorm_arm.cpp
  49. +32
    -0
      src/layer/arm/batchnorm_arm.h
  50. +122
    -0
      src/layer/arm/bias_arm.cpp
  51. +32
    -0
      src/layer/arm/bias_arm.h
  52. +543
    -0
      src/layer/arm/convolution_1x1.h
  53. +381
    -0
      src/layer/arm/convolution_2x2.h
  54. +753
    -0
      src/layer/arm/convolution_3x3.h
  55. +340
    -0
      src/layer/arm/convolution_4x4.h
  56. +1251
    -0
      src/layer/arm/convolution_5x5.h
  57. +1073
    -0
      src/layer/arm/convolution_7x7.h
  58. +120
    -0
      src/layer/arm/convolution_arm.cpp
  59. +30
    -0
      src/layer/arm/convolution_arm.h
  60. +574
    -0
      src/layer/arm/eltwise_arm.cpp
  61. +30
    -0
      src/layer/arm/eltwise_arm.h
  62. +136
    -0
      src/layer/arm/innerproduct_arm.cpp
  63. +30
    -0
      src/layer/arm/innerproduct_arm.h
  64. +227
    -0
      src/layer/arm/lrn_arm.cpp
  65. +30
    -0
      src/layer/arm/lrn_arm.h
  66. +316
    -0
      src/layer/arm/neon_mathfun.h
  67. +112
    -0
      src/layer/arm/pooling_2x2.h
  68. +170
    -0
      src/layer/arm/pooling_3x3.h
  69. +96
    -0
      src/layer/arm/pooling_arm.cpp
  70. +30
    -0
      src/layer/arm/pooling_arm.h
  71. +182
    -0
      src/layer/arm/prelu_arm.cpp
  72. +32
    -0
      src/layer/arm/prelu_arm.h
  73. +295
    -0
      src/layer/arm/relu_arm.cpp
  74. +32
    -0
      src/layer/arm/relu_arm.h
  75. +211
    -0
      src/layer/arm/scale_arm.cpp
  76. +32
    -0
      src/layer/arm/scale_arm.h
  77. +127
    -0
      src/layer/arm/sigmoid_arm.cpp
  78. +32
    -0
      src/layer/arm/sigmoid_arm.h
  79. +102
    -0
      src/layer/arm/slice_arm.cpp
  80. +30
    -0
      src/layer/arm/slice_arm.h
  81. +302
    -0
      src/layer/arm/softmax_arm.cpp
  82. +32
    -0
      src/layer/arm/softmax_arm.h
  83. +227
    -0
      src/layer/batchnorm.cpp
  84. +58
    -0
      src/layer/batchnorm.h
  85. +139
    -0
      src/layer/bias.cpp
  86. +52
    -0
      src/layer/bias.h
  87. +81
    -0
      src/layer/bnll.cpp
  88. +36
    -0
      src/layer/bnll.h
  89. +64
    -0
      src/layer/concat.cpp
  90. +34
    -0
      src/layer/concat.h
  91. +350
    -0
      src/layer/convolution.cpp
  92. +58
    -0
      src/layer/convolution.h
  93. +85
    -0
      src/layer/crop.cpp
  94. +44
    -0
      src/layer/crop.h
  95. +348
    -0
      src/layer/deconvolution.cpp
  96. +58
    -0
      src/layer/deconvolution.h
  97. +38
    -0
      src/layer/dropout.cpp
  98. +35
    -0
      src/layer/dropout.h
  99. +246
    -0
      src/layer/eltwise.cpp
  100. +48
    -0
      src/layer/eltwise.h

+ 62
- 0
CMakeLists.txt View File

@@ -0,0 +1,62 @@

# Cross-compiling setup: only runs when a toolchain file was passed to cmake.
if(CMAKE_TOOLCHAIN_FILE)
# Cache entry naming the root directory for library output; defaults to the build directory.
set(LIBRARY_OUTPUT_PATH_ROOT ${CMAKE_BINARY_DIR} CACHE PATH "root for library output, set this to change where android libs are compiled to")
# get absolute path, but get_filename_component ABSOLUTE only refer with source dir, so find_file here :(
# Step 1: strip any directory part and keep just the toolchain file name.
get_filename_component(CMAKE_TOOLCHAIN_FILE_NAME ${CMAKE_TOOLCHAIN_FILE} NAME)
# Step 2: look that name up in the top-level source directory only (NO_DEFAULT_PATH),
# i.e. the toolchain file is expected to live next to this CMakeLists.txt.
# NOTE(review): the result variable is CMAKE_TOOLCHAIN_FILE itself, which is already set here;
# presumably this relies on find_file normalising an existing cache entry - confirm.
find_file(CMAKE_TOOLCHAIN_FILE ${CMAKE_TOOLCHAIN_FILE_NAME} PATHS ${CMAKE_SOURCE_DIR} NO_DEFAULT_PATH)
message(STATUS "CMAKE_TOOLCHAIN_FILE = ${CMAKE_TOOLCHAIN_FILE}")
endif()

# Install into <build dir>/install unless an install prefix has already been defined.
if(NOT DEFINED CMAKE_INSTALL_PREFIX)
    set(CMAKE_INSTALL_PREFIX
        "${CMAKE_BINARY_DIR}/install"
        CACHE PATH "Installation Directory")
endif()
# Always report the prefix that will be used by "make install".
message(STATUS "CMAKE_INSTALL_PREFIX = ${CMAKE_INSTALL_PREFIX}")

# The minimum version (and with it the policy defaults) must be declared
# before project() so that project() runs with the intended policies.
cmake_minimum_required(VERSION 2.8.10)

project(ncnn)

# Default to an optimised build, but respect a build type chosen by the user
# (e.g. -DCMAKE_BUILD_TYPE=debug or relwithdebinfo). Multi-config generators
# do not use CMAKE_BUILD_TYPE, so nothing is set for them.
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
    set(CMAKE_BUILD_TYPE release)
endif()

# User-visible feature switches.
option(NCNN_OPENMP "openmp support" ON)
option(NCNN_STDIO "load model from external file" ON)
option(NCNN_STRING "plain and verbose string" ON)
option(NCNN_OPENCV "minimal opencv structure emulation" OFF)

# Add the OpenMP compiler flags when requested and actually available.
if(NCNN_OPENMP)
    find_package(OpenMP)
    # Depending on the CMake version the module reports success through
    # OpenMP_FOUND or the legacy OPENMP_FOUND, so accept either spelling.
    if(OpenMP_FOUND OR OPENMP_FOUND)
        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
    else()
        # Make the fallback visible instead of silently appending empty flags.
        message(STATUS "NCNN_OPENMP is ON but OpenMP was not found, no OpenMP flags added")
    endif()
endif()

# Compiler flags applied to everything configured from this directory downwards.
add_definitions(
    # warnings
    -Wall -Wextra
    # position independent code
    -fPIC
    # optimisation
    -Ofast
    -ffast-math
    # keep symbols private unless explicitly exported
    -fvisibility=hidden -fvisibility-inlines-hidden
)

# Currently disabled, kept for reference:
# add_definitions(-march=native)
# add_definitions(-flto)

# Android and Xcode/iOS builds share the same restrictions:
# shared libraries are disabled and C++ is compiled without RTTI and exceptions.
if(ANDROID OR IOS)
    set_property(GLOBAL PROPERTY TARGET_SUPPORTS_SHARED_LIBS FALSE)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti -fno-exceptions")
endif()

##############################################

# The examples directory is currently not part of the build:
# add_subdirectory(examples)

# The library itself is always built.
add_subdirectory(src)

# The tools directory is only added for builds that target neither Android nor iOS.
if(NOT (ANDROID OR IOS))
    add_subdirectory(tools)
endif()

+ 18
- 0
Info.plist View File

@@ -0,0 +1,18 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>CFBundleName</key>
<string>ncnn</string>
<key>CFBundleIdentifier</key>
<string>com.tencent.ncnn</string>
<key>CFBundleVersion</key>
<string>1.0</string>
<key>CFBundleShortVersionString</key>
<string>1.0</string>
<key>CFBundleSignature</key>
<string>????</string>
<key>CFBundlePackageType</key>
<string>FMWK</string>
</dict>
</plist>

+ 86
- 0
LICENSE.txt View File

@@ -0,0 +1,86 @@
Tencent is pleased to support the open source community by making ncnn available.
Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
If you have downloaded a copy of the ncnn binary from Tencent, please note that the ncnn binary is licensed under the BSD 3-Clause License.
If you have downloaded a copy of the ncnn source code from Tencent, please note that ncnn source code is licensed under the BSD 3-Clause License, except for the third-party components listed below which are subject to different license terms. Your integration of ncnn into your own projects may require compliance with the BSD 3-Clause License, as well as the other licenses applicable to the third-party components included within ncnn.
A copy of the BSD 3-Clause License is included in this file.

Other dependencies and licenses:

Open Source Software Licensed Under the zlib License:
The below software in this distribution may have been modified by THL A29 Limited (“Tencent Modifications”). All Tencent Modifications are Copyright (C) 2017 THL A29 Limited.
----------------------------------------------------------------------------------------
1. neon_mathfun.h
Copyright (C) 2011 Julien Pommier

2. sse_mathfun.h
Copyright (C) 2007 Julien Pommier

3. avx_mathfun.h
Copyright (C) 2012 Giovanni Garberoglio
Interdisciplinary Laboratory for Computational Science (LISC)
Fondazione Bruno Kessler and University of Trento
via Sommarive, 18
I-38123 Trento (Italy)


Terms of the zlib License:
---------------------------------------------------
Copyright (c) <year> <copyright holders>

This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software.

Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.



Open Source Software Licensed Under the BSD 2-Clause License:
The below software in this distribution may have been modified by THL A29 Limited (“Tencent Modifications”). All Tencent Modifications are Copyright (C) 2017 THL A29 Limited.
----------------------------------------------------------------------------------------
1. squeezenet 1.1
Copyright (c) 2016 Forrest N. Iandola and Matthew W. Moskewicz and Khalid Ashraf and Song Han and William J. Dally and Kurt Keutzer
All rights reserved.

2. caffe.proto master
All contributions by the University of California:
Copyright (c) 2014-2017 The Regents of the University of California (Regents)
All rights reserved.

All other contributions:
Copyright (c) 2014-2017, the respective contributors
All rights reserved.


Terms of the BSD 2-Clause License:
--------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.



Open Source Software Licensed Under the BSD 3-Clause License:
The below software in this distribution may have been modified by THL A29 Limited (“Tencent Modifications”). All Tencent Modifications are Copyright (C) 2017 THL A29 Limited.
----------------------------------------------------------------------------------------
1. android.toolchain.cmake master
Copyright (c) 2010-2011, Ethan Rublee
Copyright (c) 2011-2014, Andrey Kamaev
All rights reserved.


Terms of the BSD 3-Clause License:
--------------------------------------------------------------------

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
Neither the name of [copyright holder] nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+ 44
- 0
README.md View File

@@ -0,0 +1,44 @@
# ncnn

---

ncnn 是一个为手机端极致优化的高性能神经网络前向计算框架。ncnn 从设计之初深刻考虑手机端的部署和使用。无第三方依赖,跨平台,手机端 cpu 的速度快于目前所有已知的开源框架。基于 ncnn,开发者能够将深度学习算法轻松移植到手机端高效执行,开发出人工智能 APP,将 AI 带到你的指尖。ncnn 目前已在腾讯多款应用中使用,如 QQ,Qzone,微信,天天P图等。

ncnn is a high-performance neural network inference framework optimized for mobile platforms. From the very beginning of its design, ncnn has carefully considered deployment and use on mobile phones. ncnn has no third-party dependencies, is cross-platform, and runs faster than all known open-source frameworks on mobile phone CPUs. With ncnn's efficient implementation, developers can easily deploy deep learning models to mobile platforms, create intelligent apps, and bring artificial intelligence to your fingertips. ncnn is currently used in many Tencent applications, such as QQ, Qzone, WeChat, Pitu and so on.

---

### 功能概述

* 支持卷积神经网络,支持多输入和多分支结构,可计算部分分支
* 无任何第三方库依赖,不依赖 BLAS/NNPACK 等计算框架
* 纯 C++ 实现,跨平台,支持 android ios 等
* ARM NEON 汇编级良心优化,计算速度极快
* 精细的内存管理和数据结构设计,内存占用极低
* 支持多核并行计算加速,ARM big.LITTLE cpu 调度优化
* 整体库体积小于 500K,并可轻松精简到小于 300K
* 可扩展的模型设计,支持 8bit 量化和半精度浮点存储,可导入 caffe 模型
* 支持直接内存零拷贝引用加载网络模型
* 可注册自定义层实现并扩展
* 恩,很强就是了,不怕被塞卷 QvQ

### Features

* Supports convolutional neural networks, multiple inputs and multi-branch structures; can compute only part of the branches
* No third-party library dependencies; does not rely on BLAS/NNPACK or other computing frameworks
* Pure C++ implementation, cross-platform, supports Android, iOS and so on
* Careful ARM NEON assembly-level optimization; computation is extremely fast
* Sophisticated memory management and data structure design, very low memory footprint
* Supports multi-core parallel computing acceleration and ARM big.LITTLE CPU scheduling optimization
* The overall library size is less than 500K, and can be easily reduced to less than 300K
* Extensible model design, supports 8-bit quantization and half-precision floating point storage, can import caffe models
* Supports loading network models by direct zero-copy memory reference
* Custom layer implementations can be registered and extended
* Well, it is strong, not afraid of being stuffed with 卷 QvQ

---

### License

BSD 3 Clause


+ 1735
- 0
android.toolchain.cmake
File diff suppressed because it is too large
View File


+ 33
- 0
build.sh View File

@@ -0,0 +1,33 @@
#!/usr/bin/env bash
#
# Configures, builds and installs ncnn for every mobile target, each in its
# own build-* directory below the current directory.
#
# A target that fails is reported on stderr and the remaining targets are
# still attempted; the script exits non-zero if any target failed.

failed=0

# build_target <build-dir> <cmake-argument>...
#
# Creates <build-dir>, runs cmake with the given arguments against the parent
# directory, then "make" and "make install". Every step only runs if the
# previous one succeeded, and the whole sequence runs in a subshell so a
# failure can never leave the caller in the wrong directory.
build_target()
{
    local build_dir="$1"
    shift

    if ! (
        mkdir -p "$build_dir" &&
        cd "$build_dir" &&
        cmake "$@" .. &&
        make &&
        make install
    ); then
        echo "build.sh: ${build_dir} failed" >&2
        failed=1
    fi
}

##### android armv7
build_target build-android-armv7 \
    -DCMAKE_TOOLCHAIN_FILE=../android.toolchain.cmake \
    -DANDROID_ABI="armeabi-v7a with NEON" \
    -DANDROID_NATIVE_API_LEVEL=android-9 \
    -DANDROID_FORCE_ARM_BUILD=OFF \
    -DANDROID_STL_FORCE_FEATURES=OFF

##### android aarch64
build_target build-android-aarch64 \
    -DCMAKE_TOOLCHAIN_FILE=../android.toolchain.cmake \
    -DANDROID_ABI="arm64-v8a" \
    -DANDROID_NATIVE_API_LEVEL=android-21 \
    -DANDROID_FORCE_ARM_BUILD=OFF \
    -DANDROID_STL_FORCE_FEATURES=OFF

##### ios armv7 arm64
build_target build-ios \
    -DCMAKE_TOOLCHAIN_FILE=../iosxc.toolchain.cmake

##### ios simulator i386 x86_64
build_target build-ios-sim \
    -DCMAKE_TOOLCHAIN_FILE=../iossimxc.toolchain.cmake

exit "$failed"

+ 9
- 0
examples/CMakeLists.txt View File

@@ -0,0 +1,9 @@

# The squeezenet example needs these OpenCV components.
find_package(OpenCV REQUIRED core highgui imgproc)

# Make the library headers visible: the src directory of the source tree and
# its counterpart in the build tree (presumably for generated headers).
include_directories(
    ${CMAKE_CURRENT_SOURCE_DIR}/../src
    ${CMAKE_CURRENT_BINARY_DIR}/../src
)

# Example executable, linked against ncnn and OpenCV.
add_executable(squeezenet squeezenet.cpp)
target_link_libraries(squeezenet ncnn ${OpenCV_LIBS})

+ 15
- 0
examples/squeezencnn/AndroidManifest.xml View File

@@ -0,0 +1,15 @@
<?xml version="1.0" encoding="utf-8"?>
<manifest xmlns:android="http://schemas.android.com/apk/res/android"
package="com.tencent.squeezencnn"
android:versionCode="1"
android:versionName="1.1">
<application android:label="@string/app_name" >
<activity android:name="MainActivity"
android:label="@string/app_name">
<intent-filter>
<action android:name="android.intent.action.MAIN" />
<category android:name="android.intent.category.LAUNCHER" />
</intent-filter>
</activity>
</application>
</manifest>

+ 21
- 0
examples/squeezencnn/ant.properties View File

@@ -0,0 +1,21 @@
# This file is used to override default values used by the Ant build system.
#
# This file must be checked into Version Control Systems, as it is
# integral to the build system of your project.

# This file is only used by the Ant script.

# You can use this to override default values such as
# 'source.dir' for the location of your java source folder and
# 'out.dir' for the location of your output folder.

# You can also use it define how the release builds are signed by declaring
# the following properties:
# 'key.store' for the location of your keystore and
# 'key.alias' for the name of the key to use.
# The password will be asked during the build when you use the 'release' target.

# Release signing configuration.
# NOTE(review): the keystore path below is specific to one developer's machine, and the
# keystore/alias passwords are stored in plain text in a file that, per the header above,
# is checked into version control. Consider removing the password entries (the header says
# the password is asked for during a 'release' build) and keeping the path out of the repository.
key.store=/home/nihui/osd/nihuini-release-key.keystore
key.alias=nihuini
key.store.password=nihuini
key.alias.password=nihuini

+ 1
- 0
examples/squeezencnn/assets/squeezenet_v1.1.bin View File

@@ -0,0 +1 @@
../../squeezenet_v1.1.bin

BIN
examples/squeezencnn/assets/squeezenet_v1.1.param.bin View File


+ 1
- 0
examples/squeezencnn/assets/synset_words.txt View File

@@ -0,0 +1 @@
../../synset_words.txt

+ 92
- 0
examples/squeezencnn/build.xml View File

@@ -0,0 +1,92 @@
<?xml version="1.0" encoding="UTF-8"?>
<project name="squeezencnn" default="help">

<!-- The local.properties file is created and updated by the 'android' tool.
It contains the path to the SDK. It should *NOT* be checked into
Version Control Systems. -->
<property file="local.properties" />

<!-- The ant.properties file can be created by you. It is only edited by the
'android' tool to add properties to it.
This is the place to change some Ant specific build properties.
Here are some properties you may want to change/update:

source.dir
The name of the source directory. Default is 'src'.
out.dir
The name of the output directory. Default is 'bin'.

For other overridable properties, look at the beginning of the rules
files in the SDK, at tools/ant/build.xml

Properties related to the SDK location or the project target should
be updated using the 'android' tool with the 'update' action.

This file is an integral part of the build system for your
application and should be checked into Version Control Systems.

-->
<property file="ant.properties" />

<!-- if sdk.dir was not set from one of the property file, then
get it from the ANDROID_HOME env var.
This must be done before we load project.properties since
the proguard config can use sdk.dir -->
<property environment="env" />
<condition property="sdk.dir" value="${env.ANDROID_HOME}">
<isset property="env.ANDROID_HOME" />
</condition>

<!-- The project.properties file is created and updated by the 'android'
tool, as well as ADT.

This contains project specific properties such as project target, and library
dependencies. Lower level build properties are stored in ant.properties
(or in .classpath for Eclipse projects).

This file is an integral part of the build system for your
application and should be checked into Version Control Systems. -->
<loadproperties srcFile="project.properties" />

<!-- quick check on sdk.dir -->
<fail
message="sdk.dir is missing. Make sure to generate local.properties using 'android update project' or to inject it through the ANDROID_HOME environment variable."
unless="sdk.dir"
/>

<!--
Import per project custom build rules if present at the root of the project.
This is the place to put custom intermediary targets such as:
-pre-build
-pre-compile
-post-compile (This is typically used for code obfuscation.
Compiled code location: ${out.classes.absolute.dir}
If this is not done in place, override ${out.dex.input.absolute.dir})
-post-package
-post-build
-pre-clean
-->
<import file="custom_rules.xml" optional="true" />

<!-- Import the actual build file.

To customize existing targets, there are two options:
- Customize only one target:
- copy/paste the target into this file, *before* the
<import> task.
- customize it to your needs.
- Customize the whole content of build.xml
- copy/paste the content of the rules files (minus the top node)
into this file, replacing the <import> task.
- customize to your needs.

***********************
****** IMPORTANT ******
***********************
In all cases you must update the value of version-tag below to read 'custom' instead of an integer,
in order to avoid having your file be overridden by tools such as "android update project"
-->
<!-- version-tag: 1 -->
<import file="${sdk.dir}/tools/ant/build.xml" />

</project>

+ 30
- 0
examples/squeezencnn/jni/Android.mk View File

@@ -0,0 +1,30 @@
# ndk-build module definitions for the squeezencnn example:
# a prebuilt static ncnn library plus the JNI shared library that uses it.
LOCAL_PATH := $(call my-dir)

# change this folder path to yours
# (install directory of an ncnn build; its lib/ and include/ subdirectories are used below)
NCNN_INSTALL_PATH := /home/nihui/dev/qqfacecnn/ncnn/build-android-armv7/install

# Module 1: import the already built libncnn.a as a prebuilt static library.
include $(CLEAR_VARS)
LOCAL_MODULE := ncnn
LOCAL_SRC_FILES := $(NCNN_INSTALL_PATH)/lib/libncnn.a
include $(PREBUILT_STATIC_LIBRARY)

# Module 2: the JNI library built from squeezencnn_jni.cpp.
include $(CLEAR_VARS)

LOCAL_MODULE := squeezencnn
LOCAL_SRC_FILES := squeezencnn_jni.cpp

# ncnn public headers
LOCAL_C_INCLUDES := $(NCNN_INSTALL_PATH)/include

# link against the prebuilt module declared above
LOCAL_STATIC_LIBRARIES := ncnn

# Optimisation, hidden symbol visibility and per-function/per-data sections;
# the sections are what --gc-sections below can discard when unused.
LOCAL_CFLAGS := -O2 -fvisibility=hidden -fomit-frame-pointer -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math
LOCAL_CPPFLAGS := -O2 -fvisibility=hidden -fvisibility-inlines-hidden -fomit-frame-pointer -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math
LOCAL_LDFLAGS += -Wl,--gc-sections

# OpenMP has to be enabled for compiling and for linking.
LOCAL_CFLAGS += -fopenmp
LOCAL_CPPFLAGS += -fopenmp
LOCAL_LDFLAGS += -fopenmp

# System libraries: zlib, Android logging, and the bitmap access API used by the JNI code.
LOCAL_LDLIBS := -lz -llog -ljnigraphics

include $(BUILD_SHARED_LIBRARY)

+ 7
- 0
examples/squeezencnn/jni/Application.mk View File

@@ -0,0 +1,7 @@

# ndk-build application-wide settings for the squeezencnn example.
# C++ runtime selection (the commented line is an alternative kept for reference).
# APP_STL := stlport_static
APP_STL := gnustl_static
# Target ABI selection (the commented line is an alternative kept for reference).
# APP_ABI := armeabi armeabi-v7a
APP_ABI := armeabi-v7a
# Same API level as ANDROID_NATIVE_API_LEVEL in the android armv7 section of build.sh.
APP_PLATFORM := android-9
# Toolchain version requested from the NDK.
NDK_TOOLCHAIN_VERSION := 4.9

+ 181
- 0
examples/squeezencnn/jni/squeezencnn_jni.cpp View File

@@ -0,0 +1,181 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include <android/bitmap.h>
#include <android/log.h>

#include <jni.h>

#include <string>
#include <vector>

// ncnn
#include "net.h"

#include "squeezenet_v1.1.id.h"

#include <sys/time.h>
#include <unistd.h>

// State of the most recent bench_start()/bench_end() measurement.
static struct timeval tv_begin;
static struct timeval tv_end;
static double elasped; // milliseconds between the last bench_start() and bench_end()

// Records the current time as the start of a measurement.
static void bench_start()
{
    gettimeofday(&tv_begin, NULL);
}

// Records the current time as the end of the measurement, stores the elapsed
// milliseconds since bench_start() in `elasped` and writes it to the Android
// debug log followed by `comment`.
static void bench_end(const char* comment)
{
    gettimeofday(&tv_end, NULL);

    // Compute in double: float literals would force single-precision
    // arithmetic, which cannot represent microsecond counts exactly once the
    // interval grows beyond a few seconds.
    const double seconds_us = (tv_end.tv_sec - tv_begin.tv_sec) * 1000000.0;
    const double remainder_us = (double)(tv_end.tv_usec - tv_begin.tv_usec);
    elasped = (seconds_us + remainder_us) / 1000.0;

    // fprintf(stderr, "%.2fms %s\n", elasped, comment);
    __android_log_print(ANDROID_LOG_DEBUG, "SqueezeNcnn", "%.2fms %s", elasped, comment);
}

// Model state shared by the JNI entry points; filled by Init() and read by Detect().
// NOTE(review): the two byte buffers are kept at file scope after loading - presumably
// because the net may keep referencing the memory it was loaded from; confirm against net.h.
static std::vector<unsigned char> squeezenet_param; // bytes handed to load_param()
static std::vector<unsigned char> squeezenet_bin; // bytes handed to load_model()
static std::vector<std::string> squeezenet_words; // class labels, one entry per line of the words file
static ncnn::Net squeezenet; // the network used for classification

// Splits `str` at every occurrence of `delimiter` and returns the pieces in
// their original order.
//
// - Pieces may be empty (adjacent delimiters, or a delimiter at either end).
// - If `delimiter` does not occur, the result holds `str` as its only element.
// - An empty `delimiter` never splits: the result holds `str` as its only element.
static std::vector<std::string> split_string(const std::string& str, const std::string& delimiter)
{
    std::vector<std::string> strings;

    // An empty delimiter "matches" at every position, which would walk past
    // the end of the string; treat it as "nothing to split on".
    if (delimiter.empty())
    {
        strings.push_back(str);
        return strings;
    }

    std::string::size_type prev = 0;
    std::string::size_type pos = 0;
    while ((pos = str.find(delimiter, prev)) != std::string::npos)
    {
        strings.push_back(str.substr(prev, pos - prev));
        // continue after the whole delimiter, not just after its first character
        prev = pos + delimiter.size();
    }

    // To get the last substring (or only, if delimiter is not found)
    strings.push_back(str.substr(prev));

    return strings;
}

extern "C" {

// public native boolean Init(byte[] param, byte[] bin, byte[] words);
//
// Copies the three byte arrays into native memory, loads the network
// structure from `param` and the weights from `bin`, and splits `words` into
// one label per line.
//
// Returns JNI_FALSE if any of the arrays is null, JNI_TRUE otherwise. The
// values returned by load_param()/load_model() are only logged.
JNIEXPORT jboolean JNICALL Java_com_tencent_squeezencnn_SqueezeNcnn_Init(JNIEnv* env, jobject thiz, jbyteArray param, jbyteArray bin, jbyteArray words)
{
    // A null array must not reach GetArrayLength()/GetByteArrayRegion().
    if (param == NULL || bin == NULL || words == NULL)
    {
        __android_log_print(ANDROID_LOG_ERROR, "SqueezeNcnn", "Init called with a null array");
        return JNI_FALSE;
    }

    // init param
    {
        int len = env->GetArrayLength(param);
        squeezenet_param.resize(len);
        env->GetByteArrayRegion(param, 0, len, (jbyte*)squeezenet_param.data());
        int ret = squeezenet.load_param(squeezenet_param.data());
        __android_log_print(ANDROID_LOG_DEBUG, "SqueezeNcnn", "load_param %d %d", ret, len);
    }

    // init bin
    {
        int len = env->GetArrayLength(bin);
        squeezenet_bin.resize(len);
        env->GetByteArrayRegion(bin, 0, len, (jbyte*)squeezenet_bin.data());
        int ret = squeezenet.load_model(squeezenet_bin.data());
        __android_log_print(ANDROID_LOG_DEBUG, "SqueezeNcnn", "load_model %d %d", ret, len);
    }

    // init words
    {
        int len = env->GetArrayLength(words);
        std::string words_buffer;
        words_buffer.resize(len);
        if (len > 0)
        {
            // Write through &words_buffer[0]: std::string::data() is a pointer
            // to const and must not be used as a write target.
            env->GetByteArrayRegion(words, 0, len, (jbyte*)&words_buffer[0]);
        }
        squeezenet_words = split_string(words_buffer, "\n");
    }

    return JNI_TRUE;
}

// public native String Detect(Bitmap bitmap);
//
// Classifies `bitmap` with the network loaded by Init() and returns
// "<label> = <score>" for the highest scoring class (score with three decimals).
//
// Returns NULL when the bitmap cannot be queried or locked, is not 227x227
// RGBA_8888, when the network yields no scores, or when no label is available
// for the winning class. The elapsed time is logged on success.
JNIEXPORT jstring JNICALL Java_com_tencent_squeezencnn_SqueezeNcnn_Detect(JNIEnv* env, jobject thiz, jobject bitmap)
{
    bench_start();

    // ncnn from bitmap
    ncnn::Mat in;
    {
        AndroidBitmapInfo info;
        // the bitmap API reports errors as negative values
        if (AndroidBitmap_getInfo(env, bitmap, &info) < 0)
            return NULL;

        int width = info.width;
        int height = info.height;
        if (width != 227 || height != 227)
            return NULL;
        if (info.format != ANDROID_BITMAP_FORMAT_RGBA_8888)
            return NULL;

        void* indata = NULL;
        if (AndroidBitmap_lockPixels(env, bitmap, &indata) < 0 || indata == NULL)
            return NULL;

        in = ncnn::Mat::from_pixels((const unsigned char*)indata, ncnn::Mat::PIXEL_RGBA2BGR, width, height);

        AndroidBitmap_unlockPixels(env, bitmap);
    }

    // squeezenet
    std::vector<float> cls_scores;
    {
        const float mean_vals[3] = {104.f, 117.f, 123.f};
        in.substract_mean_normalize(mean_vals, 0);

        ncnn::Extractor ex = squeezenet.create_extractor();
        ex.set_light_mode(true);
        ex.set_num_threads(4);

        ex.input(squeezenet_v1_1_param_id::BLOB_data, in);

        ncnn::Mat out;
        ex.extract(squeezenet_v1_1_param_id::BLOB_prob, out);

        // one score per output channel: the first value of each channel
        cls_scores.resize(out.c);
        for (int j=0; j<out.c; j++)
        {
            const float* prob = out.data + out.cstep * j;
            cls_scores[j] = prob[0];
        }
    }

    // without scores there is no class to report
    if (cls_scores.empty())
        return NULL;

    // return top class
    int top_class = 0;
    float max_score = 0.f;
    for (size_t i=0; i<cls_scores.size(); i++)
    {
        float s = cls_scores[i];
//         __android_log_print(ANDROID_LOG_DEBUG, "SqueezeNcnn", "%d %f", i, s);
        if (s > max_score)
        {
            top_class = (int)i;
            max_score = s;
        }
    }

    // the label list comes from Init(); it may be missing or shorter than the score list
    if ((size_t)top_class >= squeezenet_words.size())
        return NULL;

    const std::string& word = squeezenet_words[top_class];

    char tmp[32];
    snprintf(tmp, sizeof(tmp), "%.3f", max_score);

    // skip the first 10 characters (leading id such as n03179701 plus separator),
    // but only when the label is long enough to have them
    const std::string label = word.size() >= 10 ? word.substr(10) : word;
    std::string result_str = label + " = " + tmp;

    jstring result = env->NewStringUTF(result_str.c_str());

    bench_end("detect");

    return result;
}

}

+ 163
- 0
examples/squeezencnn/jni/squeezenet_v1.1.id.h View File

@@ -0,0 +1,163 @@
#ifndef NCNN_INCLUDE_GUARD_squeezenet_v1_1_id_h
#define NCNN_INCLUDE_GUARD_squeezenet_v1_1_id_h
namespace squeezenet_v1_1_param_id {
const int LAYER_data = 0;
const int BLOB_data = 0;
const int LAYER_conv1 = 1;
const int BLOB_conv1 = 1;
const int LAYER_relu_conv1 = 2;
const int BLOB_conv1_relu_conv1 = 2;
const int LAYER_pool1 = 3;
const int BLOB_pool1 = 3;
const int LAYER_fire2_squeeze1x1 = 4;
const int BLOB_fire2_squeeze1x1 = 4;
const int LAYER_fire2_relu_squeeze1x1 = 5;
const int BLOB_fire2_squeeze1x1_fire2_relu_squeeze1x1 = 5;
const int LAYER_splitncnn_0 = 6;
const int BLOB_fire2_squeeze1x1_fire2_relu_squeeze1x1_splitncnn_0 = 6;
const int BLOB_fire2_squeeze1x1_fire2_relu_squeeze1x1_splitncnn_1 = 7;
const int LAYER_fire2_expand1x1 = 7;
const int BLOB_fire2_expand1x1 = 8;
const int LAYER_fire2_relu_expand1x1 = 8;
const int BLOB_fire2_expand1x1_fire2_relu_expand1x1 = 9;
const int LAYER_fire2_expand3x3 = 9;
const int BLOB_fire2_expand3x3 = 10;
const int LAYER_fire2_relu_expand3x3 = 10;
const int BLOB_fire2_expand3x3_fire2_relu_expand3x3 = 11;
const int LAYER_fire2_concat = 11;
const int BLOB_fire2_concat = 12;
const int LAYER_fire3_squeeze1x1 = 12;
const int BLOB_fire3_squeeze1x1 = 13;
const int LAYER_fire3_relu_squeeze1x1 = 13;
const int BLOB_fire3_squeeze1x1_fire3_relu_squeeze1x1 = 14;
const int LAYER_splitncnn_1 = 14;
const int BLOB_fire3_squeeze1x1_fire3_relu_squeeze1x1_splitncnn_0 = 15;
const int BLOB_fire3_squeeze1x1_fire3_relu_squeeze1x1_splitncnn_1 = 16;
const int LAYER_fire3_expand1x1 = 15;
const int BLOB_fire3_expand1x1 = 17;
const int LAYER_fire3_relu_expand1x1 = 16;
const int BLOB_fire3_expand1x1_fire3_relu_expand1x1 = 18;
const int LAYER_fire3_expand3x3 = 17;
const int BLOB_fire3_expand3x3 = 19;
const int LAYER_fire3_relu_expand3x3 = 18;
const int BLOB_fire3_expand3x3_fire3_relu_expand3x3 = 20;
const int LAYER_fire3_concat = 19;
const int BLOB_fire3_concat = 21;
const int LAYER_pool3 = 20;
const int BLOB_pool3 = 22;
const int LAYER_fire4_squeeze1x1 = 21;
const int BLOB_fire4_squeeze1x1 = 23;
const int LAYER_fire4_relu_squeeze1x1 = 22;
const int BLOB_fire4_squeeze1x1_fire4_relu_squeeze1x1 = 24;
const int LAYER_splitncnn_2 = 23;
const int BLOB_fire4_squeeze1x1_fire4_relu_squeeze1x1_splitncnn_0 = 25;
const int BLOB_fire4_squeeze1x1_fire4_relu_squeeze1x1_splitncnn_1 = 26;
const int LAYER_fire4_expand1x1 = 24;
const int BLOB_fire4_expand1x1 = 27;
const int LAYER_fire4_relu_expand1x1 = 25;
const int BLOB_fire4_expand1x1_fire4_relu_expand1x1 = 28;
const int LAYER_fire4_expand3x3 = 26;
const int BLOB_fire4_expand3x3 = 29;
const int LAYER_fire4_relu_expand3x3 = 27;
const int BLOB_fire4_expand3x3_fire4_relu_expand3x3 = 30;
const int LAYER_fire4_concat = 28;
const int BLOB_fire4_concat = 31;
const int LAYER_fire5_squeeze1x1 = 29;
const int BLOB_fire5_squeeze1x1 = 32;
const int LAYER_fire5_relu_squeeze1x1 = 30;
const int BLOB_fire5_squeeze1x1_fire5_relu_squeeze1x1 = 33;
const int LAYER_splitncnn_3 = 31;
const int BLOB_fire5_squeeze1x1_fire5_relu_squeeze1x1_splitncnn_0 = 34;
const int BLOB_fire5_squeeze1x1_fire5_relu_squeeze1x1_splitncnn_1 = 35;
const int LAYER_fire5_expand1x1 = 32;
const int BLOB_fire5_expand1x1 = 36;
const int LAYER_fire5_relu_expand1x1 = 33;
const int BLOB_fire5_expand1x1_fire5_relu_expand1x1 = 37;
const int LAYER_fire5_expand3x3 = 34;
const int BLOB_fire5_expand3x3 = 38;
const int LAYER_fire5_relu_expand3x3 = 35;
const int BLOB_fire5_expand3x3_fire5_relu_expand3x3 = 39;
const int LAYER_fire5_concat = 36;
const int BLOB_fire5_concat = 40;
const int LAYER_pool5 = 37;
const int BLOB_pool5 = 41;
const int LAYER_fire6_squeeze1x1 = 38;
const int BLOB_fire6_squeeze1x1 = 42;
const int LAYER_fire6_relu_squeeze1x1 = 39;
const int BLOB_fire6_squeeze1x1_fire6_relu_squeeze1x1 = 43;
const int LAYER_splitncnn_4 = 40;
const int BLOB_fire6_squeeze1x1_fire6_relu_squeeze1x1_splitncnn_0 = 44;
const int BLOB_fire6_squeeze1x1_fire6_relu_squeeze1x1_splitncnn_1 = 45;
const int LAYER_fire6_expand1x1 = 41;
const int BLOB_fire6_expand1x1 = 46;
const int LAYER_fire6_relu_expand1x1 = 42;
const int BLOB_fire6_expand1x1_fire6_relu_expand1x1 = 47;
const int LAYER_fire6_expand3x3 = 43;
const int BLOB_fire6_expand3x3 = 48;
const int LAYER_fire6_relu_expand3x3 = 44;
const int BLOB_fire6_expand3x3_fire6_relu_expand3x3 = 49;
const int LAYER_fire6_concat = 45;
const int BLOB_fire6_concat = 50;
const int LAYER_fire7_squeeze1x1 = 46;
const int BLOB_fire7_squeeze1x1 = 51;
const int LAYER_fire7_relu_squeeze1x1 = 47;
const int BLOB_fire7_squeeze1x1_fire7_relu_squeeze1x1 = 52;
const int LAYER_splitncnn_5 = 48;
const int BLOB_fire7_squeeze1x1_fire7_relu_squeeze1x1_splitncnn_0 = 53;
const int BLOB_fire7_squeeze1x1_fire7_relu_squeeze1x1_splitncnn_1 = 54;
const int LAYER_fire7_expand1x1 = 49;
const int BLOB_fire7_expand1x1 = 55;
const int LAYER_fire7_relu_expand1x1 = 50;
const int BLOB_fire7_expand1x1_fire7_relu_expand1x1 = 56;
const int LAYER_fire7_expand3x3 = 51;
const int BLOB_fire7_expand3x3 = 57;
const int LAYER_fire7_relu_expand3x3 = 52;
const int BLOB_fire7_expand3x3_fire7_relu_expand3x3 = 58;
const int LAYER_fire7_concat = 53;
const int BLOB_fire7_concat = 59;
const int LAYER_fire8_squeeze1x1 = 54;
const int BLOB_fire8_squeeze1x1 = 60;
const int LAYER_fire8_relu_squeeze1x1 = 55;
const int BLOB_fire8_squeeze1x1_fire8_relu_squeeze1x1 = 61;
const int LAYER_splitncnn_6 = 56;
const int BLOB_fire8_squeeze1x1_fire8_relu_squeeze1x1_splitncnn_0 = 62;
const int BLOB_fire8_squeeze1x1_fire8_relu_squeeze1x1_splitncnn_1 = 63;
const int LAYER_fire8_expand1x1 = 57;
const int BLOB_fire8_expand1x1 = 64;
const int LAYER_fire8_relu_expand1x1 = 58;
const int BLOB_fire8_expand1x1_fire8_relu_expand1x1 = 65;
const int LAYER_fire8_expand3x3 = 59;
const int BLOB_fire8_expand3x3 = 66;
const int LAYER_fire8_relu_expand3x3 = 60;
const int BLOB_fire8_expand3x3_fire8_relu_expand3x3 = 67;
const int LAYER_fire8_concat = 61;
const int BLOB_fire8_concat = 68;
const int LAYER_fire9_squeeze1x1 = 62;
const int BLOB_fire9_squeeze1x1 = 69;
const int LAYER_fire9_relu_squeeze1x1 = 63;
const int BLOB_fire9_squeeze1x1_fire9_relu_squeeze1x1 = 70;
const int LAYER_splitncnn_7 = 64;
const int BLOB_fire9_squeeze1x1_fire9_relu_squeeze1x1_splitncnn_0 = 71;
const int BLOB_fire9_squeeze1x1_fire9_relu_squeeze1x1_splitncnn_1 = 72;
const int LAYER_fire9_expand1x1 = 65;
const int BLOB_fire9_expand1x1 = 73;
const int LAYER_fire9_relu_expand1x1 = 66;
const int BLOB_fire9_expand1x1_fire9_relu_expand1x1 = 74;
const int LAYER_fire9_expand3x3 = 67;
const int BLOB_fire9_expand3x3 = 75;
const int LAYER_fire9_relu_expand3x3 = 68;
const int BLOB_fire9_expand3x3_fire9_relu_expand3x3 = 76;
const int LAYER_fire9_concat = 69;
const int BLOB_fire9_concat = 77;
const int LAYER_drop9 = 70;
const int BLOB_fire9_concat_drop9 = 78;
const int LAYER_conv10 = 71;
const int BLOB_conv10 = 79;
const int LAYER_relu_conv10 = 72;
const int BLOB_conv10_relu_conv10 = 80;
const int LAYER_pool10 = 73;
const int BLOB_pool10 = 81;
const int LAYER_prob = 74;
const int BLOB_prob = 82;
} // namespace squeezenet_v1_1_param_id
#endif // NCNN_INCLUDE_GUARD_squeezenet_v1_1_id_h

+ 10
- 0
examples/squeezencnn/local.properties View File

@@ -0,0 +1,10 @@
# This file is automatically generated by Android Tools.
# Do not modify this file -- YOUR CHANGES WILL BE ERASED!
#
# This file must *NOT* be checked into Version Control Systems,
# as it contains information specific to your local configuration.

# location of the SDK. This is only used by Ant
# For customization when using a Version Control System, please read the
# header note.
sdk.dir=/home/nihui/osd/android-sdk-linux

+ 20
- 0
examples/squeezencnn/proguard-project.txt View File

@@ -0,0 +1,20 @@
# To enable ProGuard in your project, edit project.properties
# to define the proguard.config property as described in that file.
#
# Add project specific ProGuard rules here.
# By default, the flags in this file are appended to flags specified
# in ${sdk.dir}/tools/proguard/proguard-android.txt
# You can edit the include path and order by changing the ProGuard
# include property in project.properties.
#
# For more details, see
# http://developer.android.com/guide/developing/tools/proguard.html

# Add any project specific keep options here:

# If your project uses WebView with JS, uncomment the following
# and specify the fully qualified class name to the JavaScript interface
# class:
#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
# public *;
#}

+ 14
- 0
examples/squeezencnn/project.properties View File

@@ -0,0 +1,14 @@
# This file is automatically generated by Android Tools.
# Do not modify this file -- YOUR CHANGES WILL BE ERASED!
#
# This file must be checked in Version Control Systems.
#
# To customize properties used by the Ant build system edit
# "ant.properties", and override values to adapt the script to your
# project structure.
#
# To enable ProGuard to shrink and obfuscate your code, uncomment this (available properties: sdk.dir, user.home):
#proguard.config=${sdk.dir}/tools/proguard/proguard-android.txt:proguard-project.txt

# Project target.
target=android-9

+ 36
- 0
examples/squeezencnn/res/layout/main.xml View File

@@ -0,0 +1,36 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- Main screen layout: a row of two buttons, a result text line and an image preview, stacked vertically. -->
<LinearLayout xmlns:android="http://schemas.android.com/apk/res/android"
android:orientation="vertical"
android:layout_width="fill_parent"
android:layout_height="fill_parent">

<!-- Top row holding the two action buttons side by side. -->
<LinearLayout
android:orientation="horizontal"
android:layout_width="fill_parent"
android:layout_height="wrap_content">

<!-- Button label means "choose image"; MainActivity wires it to an ACTION_PICK image intent. -->
<Button
android:id="@+id/buttonImage"
android:layout_width="wrap_content"
android:layout_height="wrap_content"
android:text="选图" />
<!-- Button label means "recognize"; MainActivity wires it to SqueezeNcnn.Detect on the selected image. -->
<Button
android:id="@+id/buttonDetect"
android:layout_width="wrap_content"
android:layout_height="wrap_content"
android:text="识别" />
</LinearLayout>

<!-- Text line that MainActivity fills with the detection result (or "detect failed"). -->
<TextView
android:id="@+id/infoResult"
android:layout_width="fill_parent"
android:layout_height="wrap_content"
android:text="" />

<!-- Preview of the selected image; layout_weight="1" lets it take the remaining vertical space. -->
<ImageView
android:id="@+id/imageView"
android:layout_width="fill_parent"
android:layout_height="fill_parent"
android:layout_weight="1" />

</LinearLayout>

+ 4
- 0
examples/squeezencnn/res/values/strings.xml View File

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<resources>
<string name="app_name">squeezencnn</string>
</resources>

+ 189
- 0
examples/squeezencnn/src/com/tencent/squeezencnn/MainActivity.java View File

@@ -0,0 +1,189 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

package com.tencent.squeezencnn;

import android.app.Activity;
import android.os.Bundle;

import android.content.Context;
import android.content.Intent;
import android.database.Cursor;
import android.graphics.Bitmap;
import android.graphics.BitmapFactory;
import android.net.Uri;
import android.provider.MediaStore;
import android.util.Log;
import android.view.View;
import android.widget.Button;
import android.widget.ImageView;
import android.widget.TextView;

import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;

import com.tencent.squeezencnn.SqueezeNcnn;

public class MainActivity extends Activity
{
private static final int SELECT_IMAGE = 1;

private TextView infoResult;
private ImageView imageView;
private Bitmap yourSelectedImage = null;

private SqueezeNcnn squeezencnn = new SqueezeNcnn();

/** Called when the activity is first created. */
@Override
public void onCreate(Bundle savedInstanceState)
{
super.onCreate(savedInstanceState);
setContentView(R.layout.main);

try
{
initSqueezeNcnn();
}
catch (IOException e)
{
Log.e("MainActivity", "initSqueezeNcnn error");
}

infoResult = (TextView) findViewById(R.id.infoResult);
imageView = (ImageView) findViewById(R.id.imageView);

Button buttonImage = (Button) findViewById(R.id.buttonImage);
buttonImage.setOnClickListener(new View.OnClickListener() {
@Override
public void onClick(View arg0) {
Intent i = new Intent(Intent.ACTION_PICK);
i.setType("image/*");
startActivityForResult(i, SELECT_IMAGE);
}
});

Button buttonDetect = (Button) findViewById(R.id.buttonDetect);
buttonDetect.setOnClickListener(new View.OnClickListener() {
@Override
public void onClick(View arg0) {
if (yourSelectedImage == null)
return;

String result = squeezencnn.Detect(yourSelectedImage);

if (result == null)
{
infoResult.setText("detect failed");
}
else
{
infoResult.setText(result);
}
}
});
}

private void initSqueezeNcnn() throws IOException
{
byte[] param = null;
byte[] bin = null;
byte[] words = null;

{
InputStream assetsInputStream = getAssets().open("squeezenet_v1.1.param.bin");
int available = assetsInputStream.available();
param = new byte[available];
int byteCode = assetsInputStream.read(param);
assetsInputStream.close();
}
{
InputStream assetsInputStream = getAssets().open("squeezenet_v1.1.bin");
int available = assetsInputStream.available();
bin = new byte[available];
int byteCode = assetsInputStream.read(bin);
assetsInputStream.close();
}
{
InputStream assetsInputStream = getAssets().open("synset_words.txt");
int available = assetsInputStream.available();
words = new byte[available];
int byteCode = assetsInputStream.read(words);
assetsInputStream.close();
}

squeezencnn.Init(param, bin, words);
}

@Override
protected void onActivityResult(int requestCode, int resultCode, Intent data)
{
super.onActivityResult(requestCode, resultCode, data);

if (resultCode == RESULT_OK && null != data) {
Uri selectedImage = data.getData();

try
{
if (requestCode == SELECT_IMAGE) {
Bitmap bitmap = decodeUri(selectedImage);

Bitmap rgba = bitmap.copy(Bitmap.Config.ARGB_8888, true);

// resize to 227x227
yourSelectedImage = Bitmap.createScaledBitmap(rgba, 227, 227, false);

imageView.setImageBitmap(yourSelectedImage);
}
}
catch (FileNotFoundException e)
{
Log.e("MainActivity", "FileNotFoundException");
return;
}
}
}

private Bitmap decodeUri(Uri selectedImage) throws FileNotFoundException
{
// Decode image size
BitmapFactory.Options o = new BitmapFactory.Options();
o.inJustDecodeBounds = true;
BitmapFactory.decodeStream(getContentResolver().openInputStream(selectedImage), null, o);

// The new size we want to scale to
final int REQUIRED_SIZE = 400;

// Find the correct scale value. It should be the power of 2.
int width_tmp = o.outWidth, height_tmp = o.outHeight;
int scale = 1;
while (true) {
if (width_tmp / 2 < REQUIRED_SIZE
|| height_tmp / 2 < REQUIRED_SIZE) {
break;
}
width_tmp /= 2;
height_tmp /= 2;
scale *= 2;
}

// Decode with inSampleSize
BitmapFactory.Options o2 = new BitmapFactory.Options();
o2.inSampleSize = scale;
return BitmapFactory.decodeStream(getContentResolver().openInputStream(selectedImage), null, o2);
}

}

+ 29
- 0
examples/squeezencnn/src/com/tencent/squeezencnn/SqueezeNcnn.java View File

@@ -0,0 +1,29 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

package com.tencent.squeezencnn;

import android.graphics.Bitmap;
import android.content.Context;

/**
 * Java-side declaration of the native squeezencnn interface.
 *
 * Both methods are declared native; the "squeezencnn" library loaded in the static initializer
 * is presumably where they are implemented (see the jni sources of this example to confirm).
 */
public class SqueezeNcnn
{
    // Load the native library once, when this class is initialized.
    static
    {
        System.loadLibrary("squeezencnn");
    }

    /**
     * Runs detection on a bitmap.
     *
     * @param bitmap image to process
     * @return a result string; callers in this example treat null as failure
     */
    public native String Detect(Bitmap bitmap);

    /**
     * Initializes the native side from in-memory buffers.
     *
     * @param param contents of the network parameter file
     * @param bin   contents of the network weight file
     * @param words contents of the label list
     * @return a success flag reported by the native code
     */
    public native boolean Init(byte[] param, byte[] bin, byte[] words);
}

+ 95
- 0
examples/squeezenet.cpp View File

@@ -0,0 +1,95 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include <stdio.h>
#include <algorithm>
#include <vector>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>

#include "net.h"

// Classify one BGR image with SqueezeNet v1.1.
//
// The network files "squeezenet_v1.1.param" and "squeezenet_v1.1.bin" are loaded from the
// current working directory. The image is resized to 227x227 and the per-channel mean
// (104, 117, 123) is subtracted before it is fed to the "data" blob. One score per output
// channel of the "prob" blob is written to cls_scores.
//
// Returns 0 on success and -1 when the model cannot be loaded or the "prob" blob cannot be
// extracted; cls_scores is left empty in that case.
static int detect_squeezenet(const cv::Mat& bgr, std::vector<float>& cls_scores)
{
    cls_scores.clear();

    ncnn::Net squeezenet;
    if (squeezenet.load_param("squeezenet_v1.1.param") != 0)
    {
        fprintf(stderr, "load_param squeezenet_v1.1.param failed\n");
        return -1;
    }
    if (squeezenet.load_model("squeezenet_v1.1.bin") != 0)
    {
        fprintf(stderr, "load_model squeezenet_v1.1.bin failed\n");
        return -1;
    }

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, 227, 227);

    // second argument 0: subtract the mean only, no per-channel scaling
    const float mean_vals[3] = {104.f, 117.f, 123.f};
    in.substract_mean_normalize(mean_vals, 0);

    ncnn::Extractor ex = squeezenet.create_extractor();
    ex.set_light_mode(true);

    ex.input("data", in);

    ncnn::Mat out;
    if (ex.extract("prob", out) != 0)
    {
        fprintf(stderr, "extract prob failed\n");
        return -1;
    }

    // take the first value of every output channel as that class's score
    cls_scores.resize(out.c);
    for (int j=0; j<out.c; j++)
    {
        const float* prob = out.data + out.cstep * j;
        cls_scores[j] = prob[0];
    }

    return 0;
}

// Print the topk highest scores to stderr, one "index = score" line each, best first.
//
// topk is clamped to [0, cls_scores.size()]: asking for more entries than there are scores
// (for example when detection produced no output) prints what is available instead of
// running std::partial_sort and the print loop past the end of the vector.
//
// Always returns 0.
static int print_topk(const std::vector<float>& cls_scores, int topk)
{
    // partial sort topk with index
    int size = cls_scores.size();
    if (topk > size)
        topk = size;
    if (topk < 0)
        topk = 0;

    std::vector< std::pair<float, int> > vec;
    vec.resize(size);
    for (int i=0; i<size; i++)
    {
        vec[i] = std::make_pair(cls_scores[i], i);
    }

    std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
                      std::greater< std::pair<float, int> >());

    // print topk and score
    for (int i=0; i<topk; i++)
    {
        float score = vec[i].first;
        int index = vec[i].second;
        fprintf(stderr, "%d = %f\n", index, score);
    }

    return 0;
}

// Entry point: classify the image given as the first command line argument and print the
// three best class indices with their scores.
//
// Returns 0 on success and -1 on a usage error, an unreadable image or a failed detection.
int main(int argc, char** argv)
{
    // argv[1] is only valid when an argument was actually supplied
    if (argc < 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argc > 0 ? argv[0] : "squeezenet");
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, CV_LOAD_IMAGE_COLOR);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<float> cls_scores;
    if (detect_squeezenet(m, cls_scores) != 0)
    {
        fprintf(stderr, "detect_squeezenet failed\n");
        return -1;
    }

    print_topk(cls_scores, 3);

    return 0;
}


BIN
examples/squeezenet_v1.1.bin View File


BIN
examples/squeezenet_v1.1.caffemodel View File


+ 76
- 0
examples/squeezenet_v1.1.param View File

@@ -0,0 +1,76 @@
75 83
Input data 0 1 data 3 227 227
Convolution conv1 1 1 data conv1 64 3 1 2 0 1 1728
ReLU relu_conv1 1 1 conv1 conv1_relu_conv1 0.000000
Pooling pool1 1 1 conv1_relu_conv1 pool1 0 3 2 0 0
Convolution fire2/squeeze1x1 1 1 pool1 fire2/squeeze1x1 16 1 1 1 0 1 1024
ReLU fire2/relu_squeeze1x1 1 1 fire2/squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1 0.000000
Split splitncnn_0 1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1
Convolution fire2/expand1x1 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1 64 1 1 1 0 1 1024
ReLU fire2/relu_expand1x1 1 1 fire2/expand1x1 fire2/expand1x1_fire2/relu_expand1x1 0.000000
Convolution fire2/expand3x3 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3 64 3 1 1 1 1 9216
ReLU fire2/relu_expand3x3 1 1 fire2/expand3x3 fire2/expand3x3_fire2/relu_expand3x3 0.000000
Concat fire2/concat 2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat
Convolution fire3/squeeze1x1 1 1 fire2/concat fire3/squeeze1x1 16 1 1 1 0 1 2048
ReLU fire3/relu_squeeze1x1 1 1 fire3/squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1 0.000000
Split splitncnn_1 1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1
Convolution fire3/expand1x1 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1 64 1 1 1 0 1 1024
ReLU fire3/relu_expand1x1 1 1 fire3/expand1x1 fire3/expand1x1_fire3/relu_expand1x1 0.000000
Convolution fire3/expand3x3 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3 64 3 1 1 1 1 9216
ReLU fire3/relu_expand3x3 1 1 fire3/expand3x3 fire3/expand3x3_fire3/relu_expand3x3 0.000000
Concat fire3/concat 2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat
Pooling pool3 1 1 fire3/concat pool3 0 3 2 0 0
Convolution fire4/squeeze1x1 1 1 pool3 fire4/squeeze1x1 32 1 1 1 0 1 4096
ReLU fire4/relu_squeeze1x1 1 1 fire4/squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1 0.000000
Split splitncnn_2 1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1
Convolution fire4/expand1x1 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1 128 1 1 1 0 1 4096
ReLU fire4/relu_expand1x1 1 1 fire4/expand1x1 fire4/expand1x1_fire4/relu_expand1x1 0.000000
Convolution fire4/expand3x3 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3 128 3 1 1 1 1 36864
ReLU fire4/relu_expand3x3 1 1 fire4/expand3x3 fire4/expand3x3_fire4/relu_expand3x3 0.000000
Concat fire4/concat 2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat
Convolution fire5/squeeze1x1 1 1 fire4/concat fire5/squeeze1x1 32 1 1 1 0 1 8192
ReLU fire5/relu_squeeze1x1 1 1 fire5/squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1 0.000000
Split splitncnn_3 1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1
Convolution fire5/expand1x1 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1 128 1 1 1 0 1 4096
ReLU fire5/relu_expand1x1 1 1 fire5/expand1x1 fire5/expand1x1_fire5/relu_expand1x1 0.000000
Convolution fire5/expand3x3 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3 128 3 1 1 1 1 36864
ReLU fire5/relu_expand3x3 1 1 fire5/expand3x3 fire5/expand3x3_fire5/relu_expand3x3 0.000000
Concat fire5/concat 2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat
Pooling pool5 1 1 fire5/concat pool5 0 3 2 0 0
Convolution fire6/squeeze1x1 1 1 pool5 fire6/squeeze1x1 48 1 1 1 0 1 12288
ReLU fire6/relu_squeeze1x1 1 1 fire6/squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1 0.000000
Split splitncnn_4 1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1
Convolution fire6/expand1x1 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1 192 1 1 1 0 1 9216
ReLU fire6/relu_expand1x1 1 1 fire6/expand1x1 fire6/expand1x1_fire6/relu_expand1x1 0.000000
Convolution fire6/expand3x3 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3 192 3 1 1 1 1 82944
ReLU fire6/relu_expand3x3 1 1 fire6/expand3x3 fire6/expand3x3_fire6/relu_expand3x3 0.000000
Concat fire6/concat 2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat
Convolution fire7/squeeze1x1 1 1 fire6/concat fire7/squeeze1x1 48 1 1 1 0 1 18432
ReLU fire7/relu_squeeze1x1 1 1 fire7/squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1 0.000000
Split splitncnn_5 1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1
Convolution fire7/expand1x1 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1 192 1 1 1 0 1 9216
ReLU fire7/relu_expand1x1 1 1 fire7/expand1x1 fire7/expand1x1_fire7/relu_expand1x1 0.000000
Convolution fire7/expand3x3 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3 192 3 1 1 1 1 82944
ReLU fire7/relu_expand3x3 1 1 fire7/expand3x3 fire7/expand3x3_fire7/relu_expand3x3 0.000000
Concat fire7/concat 2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat
Convolution fire8/squeeze1x1 1 1 fire7/concat fire8/squeeze1x1 64 1 1 1 0 1 24576
ReLU fire8/relu_squeeze1x1 1 1 fire8/squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1 0.000000
Split splitncnn_6 1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1
Convolution fire8/expand1x1 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1 256 1 1 1 0 1 16384
ReLU fire8/relu_expand1x1 1 1 fire8/expand1x1 fire8/expand1x1_fire8/relu_expand1x1 0.000000
Convolution fire8/expand3x3 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3 256 3 1 1 1 1 147456
ReLU fire8/relu_expand3x3 1 1 fire8/expand3x3 fire8/expand3x3_fire8/relu_expand3x3 0.000000
Concat fire8/concat 2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat
Convolution fire9/squeeze1x1 1 1 fire8/concat fire9/squeeze1x1 64 1 1 1 0 1 32768
ReLU fire9/relu_squeeze1x1 1 1 fire9/squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1 0.000000
Split splitncnn_7 1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1
Convolution fire9/expand1x1 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1 256 1 1 1 0 1 16384
ReLU fire9/relu_expand1x1 1 1 fire9/expand1x1 fire9/expand1x1_fire9/relu_expand1x1 0.000000
Convolution fire9/expand3x3 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3 256 3 1 1 1 1 147456
ReLU fire9/relu_expand3x3 1 1 fire9/expand3x3 fire9/expand3x3_fire9/relu_expand3x3 0.000000
Concat fire9/concat 2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat
Dropout drop9 1 1 fire9/concat fire9/concat_drop9
Convolution conv10 1 1 fire9/concat_drop9 conv10 1000 1 1 1 1 1 512000
ReLU relu_conv10 1 1 conv10 conv10_relu_conv10 0.000000
Pooling pool10 1 1 conv10_relu_conv10 pool10 1 0 1 0 1
Softmax prob 1 1 pool10 prob

+ 548
- 0
examples/squeezenet_v1.1.prototxt View File

@@ -0,0 +1,548 @@
name: "squeezenet_v1.1_deploy"

layer {
name: "data"
type: "Input"
top: "data"
input_param { shape: { dim: 1 dim: 3 dim: 227 dim: 227 } }
}
layer {
name: "conv1"
type: "Convolution"
bottom: "data"
top: "conv1"
convolution_param {
num_output: 64
kernel_size: 3
stride: 2
}
}
layer {
name: "relu_conv1"
type: "ReLU"
bottom: "conv1"
top: "conv1"
}
layer {
name: "pool1"
type: "Pooling"
bottom: "conv1"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "fire2/squeeze1x1"
type: "Convolution"
bottom: "pool1"
top: "fire2/squeeze1x1"
convolution_param {
num_output: 16
kernel_size: 1
}
}
layer {
name: "fire2/relu_squeeze1x1"
type: "ReLU"
bottom: "fire2/squeeze1x1"
top: "fire2/squeeze1x1"
}
layer {
name: "fire2/expand1x1"
type: "Convolution"
bottom: "fire2/squeeze1x1"
top: "fire2/expand1x1"
convolution_param {
num_output: 64
kernel_size: 1
}
}
layer {
name: "fire2/relu_expand1x1"
type: "ReLU"
bottom: "fire2/expand1x1"
top: "fire2/expand1x1"
}
layer {
name: "fire2/expand3x3"
type: "Convolution"
bottom: "fire2/squeeze1x1"
top: "fire2/expand3x3"
convolution_param {
num_output: 64
pad: 1
kernel_size: 3
}
}
layer {
name: "fire2/relu_expand3x3"
type: "ReLU"
bottom: "fire2/expand3x3"
top: "fire2/expand3x3"
}
layer {
name: "fire2/concat"
type: "Concat"
bottom: "fire2/expand1x1"
bottom: "fire2/expand3x3"
top: "fire2/concat"
}
layer {
name: "fire3/squeeze1x1"
type: "Convolution"
bottom: "fire2/concat"
top: "fire3/squeeze1x1"
convolution_param {
num_output: 16
kernel_size: 1
}
}
layer {
name: "fire3/relu_squeeze1x1"
type: "ReLU"
bottom: "fire3/squeeze1x1"
top: "fire3/squeeze1x1"
}
layer {
name: "fire3/expand1x1"
type: "Convolution"
bottom: "fire3/squeeze1x1"
top: "fire3/expand1x1"
convolution_param {
num_output: 64
kernel_size: 1
}
}
layer {
name: "fire3/relu_expand1x1"
type: "ReLU"
bottom: "fire3/expand1x1"
top: "fire3/expand1x1"
}
layer {
name: "fire3/expand3x3"
type: "Convolution"
bottom: "fire3/squeeze1x1"
top: "fire3/expand3x3"
convolution_param {
num_output: 64
pad: 1
kernel_size: 3
}
}
layer {
name: "fire3/relu_expand3x3"
type: "ReLU"
bottom: "fire3/expand3x3"
top: "fire3/expand3x3"
}
layer {
name: "fire3/concat"
type: "Concat"
bottom: "fire3/expand1x1"
bottom: "fire3/expand3x3"
top: "fire3/concat"
}
layer {
name: "pool3"
type: "Pooling"
bottom: "fire3/concat"
top: "pool3"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "fire4/squeeze1x1"
type: "Convolution"
bottom: "pool3"
top: "fire4/squeeze1x1"
convolution_param {
num_output: 32
kernel_size: 1
}
}
layer {
name: "fire4/relu_squeeze1x1"
type: "ReLU"
bottom: "fire4/squeeze1x1"
top: "fire4/squeeze1x1"
}
layer {
name: "fire4/expand1x1"
type: "Convolution"
bottom: "fire4/squeeze1x1"
top: "fire4/expand1x1"
convolution_param {
num_output: 128
kernel_size: 1
}
}
layer {
name: "fire4/relu_expand1x1"
type: "ReLU"
bottom: "fire4/expand1x1"
top: "fire4/expand1x1"
}
layer {
name: "fire4/expand3x3"
type: "Convolution"
bottom: "fire4/squeeze1x1"
top: "fire4/expand3x3"
convolution_param {
num_output: 128
pad: 1
kernel_size: 3
}
}
layer {
name: "fire4/relu_expand3x3"
type: "ReLU"
bottom: "fire4/expand3x3"
top: "fire4/expand3x3"
}
layer {
name: "fire4/concat"
type: "Concat"
bottom: "fire4/expand1x1"
bottom: "fire4/expand3x3"
top: "fire4/concat"
}
layer {
name: "fire5/squeeze1x1"
type: "Convolution"
bottom: "fire4/concat"
top: "fire5/squeeze1x1"
convolution_param {
num_output: 32
kernel_size: 1
}
}
layer {
name: "fire5/relu_squeeze1x1"
type: "ReLU"
bottom: "fire5/squeeze1x1"
top: "fire5/squeeze1x1"
}
layer {
name: "fire5/expand1x1"
type: "Convolution"
bottom: "fire5/squeeze1x1"
top: "fire5/expand1x1"
convolution_param {
num_output: 128
kernel_size: 1
}
}
layer {
name: "fire5/relu_expand1x1"
type: "ReLU"
bottom: "fire5/expand1x1"
top: "fire5/expand1x1"
}
layer {
name: "fire5/expand3x3"
type: "Convolution"
bottom: "fire5/squeeze1x1"
top: "fire5/expand3x3"
convolution_param {
num_output: 128
pad: 1
kernel_size: 3
}
}
layer {
name: "fire5/relu_expand3x3"
type: "ReLU"
bottom: "fire5/expand3x3"
top: "fire5/expand3x3"
}
layer {
name: "fire5/concat"
type: "Concat"
bottom: "fire5/expand1x1"
bottom: "fire5/expand3x3"
top: "fire5/concat"
}
layer {
name: "pool5"
type: "Pooling"
bottom: "fire5/concat"
top: "pool5"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "fire6/squeeze1x1"
type: "Convolution"
bottom: "pool5"
top: "fire6/squeeze1x1"
convolution_param {
num_output: 48
kernel_size: 1
}
}
layer {
name: "fire6/relu_squeeze1x1"
type: "ReLU"
bottom: "fire6/squeeze1x1"
top: "fire6/squeeze1x1"
}
layer {
name: "fire6/expand1x1"
type: "Convolution"
bottom: "fire6/squeeze1x1"
top: "fire6/expand1x1"
convolution_param {
num_output: 192
kernel_size: 1
}
}
layer {
name: "fire6/relu_expand1x1"
type: "ReLU"
bottom: "fire6/expand1x1"
top: "fire6/expand1x1"
}
layer {
name: "fire6/expand3x3"
type: "Convolution"
bottom: "fire6/squeeze1x1"
top: "fire6/expand3x3"
convolution_param {
num_output: 192
pad: 1
kernel_size: 3
}
}
layer {
name: "fire6/relu_expand3x3"
type: "ReLU"
bottom: "fire6/expand3x3"
top: "fire6/expand3x3"
}
layer {
name: "fire6/concat"
type: "Concat"
bottom: "fire6/expand1x1"
bottom: "fire6/expand3x3"
top: "fire6/concat"
}
layer {
name: "fire7/squeeze1x1"
type: "Convolution"
bottom: "fire6/concat"
top: "fire7/squeeze1x1"
convolution_param {
num_output: 48
kernel_size: 1
}
}
layer {
name: "fire7/relu_squeeze1x1"
type: "ReLU"
bottom: "fire7/squeeze1x1"
top: "fire7/squeeze1x1"
}
layer {
name: "fire7/expand1x1"
type: "Convolution"
bottom: "fire7/squeeze1x1"
top: "fire7/expand1x1"
convolution_param {
num_output: 192
kernel_size: 1
}
}
layer {
name: "fire7/relu_expand1x1"
type: "ReLU"
bottom: "fire7/expand1x1"
top: "fire7/expand1x1"
}
layer {
name: "fire7/expand3x3"
type: "Convolution"
bottom: "fire7/squeeze1x1"
top: "fire7/expand3x3"
convolution_param {
num_output: 192
pad: 1
kernel_size: 3
}
}
layer {
name: "fire7/relu_expand3x3"
type: "ReLU"
bottom: "fire7/expand3x3"
top: "fire7/expand3x3"
}
layer {
name: "fire7/concat"
type: "Concat"
bottom: "fire7/expand1x1"
bottom: "fire7/expand3x3"
top: "fire7/concat"
}
layer {
name: "fire8/squeeze1x1"
type: "Convolution"
bottom: "fire7/concat"
top: "fire8/squeeze1x1"
convolution_param {
num_output: 64
kernel_size: 1
}
}
layer {
name: "fire8/relu_squeeze1x1"
type: "ReLU"
bottom: "fire8/squeeze1x1"
top: "fire8/squeeze1x1"
}
layer {
name: "fire8/expand1x1"
type: "Convolution"
bottom: "fire8/squeeze1x1"
top: "fire8/expand1x1"
convolution_param {
num_output: 256
kernel_size: 1
}
}
layer {
name: "fire8/relu_expand1x1"
type: "ReLU"
bottom: "fire8/expand1x1"
top: "fire8/expand1x1"
}
layer {
name: "fire8/expand3x3"
type: "Convolution"
bottom: "fire8/squeeze1x1"
top: "fire8/expand3x3"
convolution_param {
num_output: 256
pad: 1
kernel_size: 3
}
}
layer {
name: "fire8/relu_expand3x3"
type: "ReLU"
bottom: "fire8/expand3x3"
top: "fire8/expand3x3"
}
layer {
name: "fire8/concat"
type: "Concat"
bottom: "fire8/expand1x1"
bottom: "fire8/expand3x3"
top: "fire8/concat"
}
layer {
name: "fire9/squeeze1x1"
type: "Convolution"
bottom: "fire8/concat"
top: "fire9/squeeze1x1"
convolution_param {
num_output: 64
kernel_size: 1
}
}
layer {
name: "fire9/relu_squeeze1x1"
type: "ReLU"
bottom: "fire9/squeeze1x1"
top: "fire9/squeeze1x1"
}
layer {
name: "fire9/expand1x1"
type: "Convolution"
bottom: "fire9/squeeze1x1"
top: "fire9/expand1x1"
convolution_param {
num_output: 256
kernel_size: 1
}
}
layer {
name: "fire9/relu_expand1x1"
type: "ReLU"
bottom: "fire9/expand1x1"
top: "fire9/expand1x1"
}
layer {
name: "fire9/expand3x3"
type: "Convolution"
bottom: "fire9/squeeze1x1"
top: "fire9/expand3x3"
convolution_param {
num_output: 256
pad: 1
kernel_size: 3
}
}
layer {
name: "fire9/relu_expand3x3"
type: "ReLU"
bottom: "fire9/expand3x3"
top: "fire9/expand3x3"
}
layer {
name: "fire9/concat"
type: "Concat"
bottom: "fire9/expand1x1"
bottom: "fire9/expand3x3"
top: "fire9/concat"
}
layer {
name: "drop9"
type: "Dropout"
bottom: "fire9/concat"
top: "fire9/concat"
dropout_param {
dropout_ratio: 0.5
}
}
layer {
name: "conv10"
type: "Convolution"
bottom: "fire9/concat"
top: "conv10"
convolution_param {
num_output: 1000
pad: 1
kernel_size: 1
}
}
layer {
name: "relu_conv10"
type: "ReLU"
bottom: "conv10"
top: "conv10"
}
layer {
name: "pool10"
type: "Pooling"
bottom: "conv10"
top: "pool10"
pooling_param {
pool: AVE
global_pooling: true
}
}
layer {
name: "prob"
type: "Softmax"
bottom: "pool10"
top: "prob"
}

+ 1000
- 0
examples/synset_words.txt
File diff suppressed because it is too large
View File


+ 193
- 0
ios.toolchain.cmake View File

@@ -0,0 +1,193 @@
# This file is based off of the Platform/Darwin.cmake and Platform/UnixPaths.cmake
# files which are included with CMake 2.8.4
# It has been altered for iOS development

# Options:
#
# IOS_PLATFORM = iPhoneOS (default) or iPhoneSimulator
# This decides if SDKS will be selected from the iPhoneOS.platform or iPhoneSimulator.platform folders
# iPhoneOS - the default, used to build for iPhone and iPad physical devices, which have an arm arch.
# iPhoneSimulator - used to build for the Simulator platforms, which have an x86 arch.
#
# CMAKE_IOS_DEVELOPER_ROOT = automatic(default) or /path/to/platform/Developer folder
# By default this location is automatically chosen based on the IOS_PLATFORM value above.
# If set manually, it will override the default location and force the use of a particular Developer Platform
#
# CMAKE_IOS_SDK_ROOT = automatic(default) or /path/to/platform/Developer/SDKs/SDK folder
# By default this location is automatically chosen based on the CMAKE_IOS_DEVELOPER_ROOT value.
# In this case it will always be the most up-to-date SDK found in the CMAKE_IOS_DEVELOPER_ROOT path.
# If set manually, this will force the use of a specific SDK version

# Macros:
#
# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE)
# A convenience macro for setting xcode specific properties on targets
# example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1")
#
# find_host_package (PROGRAM ARGS)
# A macro used to find executable programs on the host system, not within the iOS environment.
# Thanks to the android-cmake project for providing the command

# Standard settings
set (CMAKE_SYSTEM_NAME Darwin)
set (CMAKE_SYSTEM_VERSION 1)
set (UNIX True)
set (APPLE True)
set (IOS True)

# Required as of cmake 2.8.10
set (CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING "Force unset of the deployment target for iOS" FORCE)

# Determine the cmake host system version so we know where to find the iOS SDKs.
# Runs `uname -r` on the host and stores its output in CMAKE_HOST_SYSTEM_VERSION;
# DARWIN_MAJOR_VERSION receives the leading number of that "major.minor..." string.
# execute_process() replaces the deprecated exec_program() command.
find_program (CMAKE_UNAME uname /bin /usr/bin /usr/local/bin)
if (CMAKE_UNAME)
  execute_process (
    COMMAND "${CMAKE_UNAME}" -r
    OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_VERSION
    OUTPUT_STRIP_TRAILING_WHITESPACE
  )
  string (REGEX REPLACE "^([0-9]+)\\.([0-9]+).*$" "\\1" DARWIN_MAJOR_VERSION "${CMAKE_HOST_SYSTEM_VERSION}")
endif ()

# Force the compilers to the host clang/clang++ for iOS (compiler id "Apple"),
# and force `ar` as the archiver.
# NOTE(review): the CMakeForceCompiler module is deprecated in newer CMake
# releases — confirm the supported CMake range before changing this.
include (CMakeForceCompiler)
CMAKE_FORCE_C_COMPILER (/usr/bin/clang Apple)
CMAKE_FORCE_CXX_COMPILER (/usr/bin/clang++ Apple)
set(CMAKE_AR ar CACHE FILEPATH "" FORCE)

# Skip the platform compiler checks for cross compiling
set (CMAKE_CXX_COMPILER_WORKS TRUE)
set (CMAKE_C_COMPILER_WORKS TRUE)

# All iOS/Darwin specific settings - some may be redundant
set (CMAKE_SHARED_LIBRARY_PREFIX "lib")
set (CMAKE_SHARED_LIBRARY_SUFFIX ".dylib")
set (CMAKE_SHARED_MODULE_PREFIX "lib")
set (CMAKE_SHARED_MODULE_SUFFIX ".so")
set (CMAKE_MODULE_EXISTS 1)
set (CMAKE_DL_LIBS "")

set (CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ")
set (CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ")
set (CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}")
set (CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}")

# Hidden visibility is required for cxx on iOS.
# Both flag sets point the compiler at the SDK sysroot and set the minimum
# deployment target to iOS 6.0; C++ additionally selects libc++.
# NOTE(review): CMAKE_OSX_SYSROOT is only assigned further down in this file,
# so this expansion relies on the value already being in the cache — confirm
# the flags are correct on a fresh configure.
set (CMAKE_C_FLAGS_INIT "-isysroot ${CMAKE_OSX_SYSROOT} -miphoneos-version-min=6.0")
set (CMAKE_CXX_FLAGS_INIT "-stdlib=libc++ -fvisibility=hidden -fvisibility-inlines-hidden -isysroot ${CMAKE_OSX_SYSROOT} -miphoneos-version-min=6.0")

set (CMAKE_C_LINK_FLAGS "-Wl,-search_paths_first ${CMAKE_C_LINK_FLAGS}")
set (CMAKE_CXX_LINK_FLAGS "-Wl,-search_paths_first ${CMAKE_CXX_LINK_FLAGS}")

set (CMAKE_PLATFORM_HAS_INSTALLNAME 1)
set (CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -headerpad_max_install_names")
set (CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -headerpad_max_install_names")
set (CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,")
set (CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,")
set (CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" ".so" ".a")

# hack: if a new cmake (which uses CMAKE_INSTALL_NAME_TOOL) runs on an old build tree
# (where install_name_tool was hardcoded) and where CMAKE_INSTALL_NAME_TOOL isn't in the cache
# and still cmake didn't fail in CMakeFindBinUtils.cmake (because it isn't rerun)
# hardcode CMAKE_INSTALL_NAME_TOOL here to install_name_tool, so it behaves as it did before, Alex
if (NOT DEFINED CMAKE_INSTALL_NAME_TOOL)
find_program(CMAKE_INSTALL_NAME_TOOL install_name_tool)
endif (NOT DEFINED CMAKE_INSTALL_NAME_TOOL)

# Setup iOS platform unless specified manually with IOS_PLATFORM
if (NOT DEFINED IOS_PLATFORM)
set (IOS_PLATFORM "iPhoneOS")
endif (NOT DEFINED IOS_PLATFORM)
set (IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform")

# Check the platform selection and setup for developer root.
# The expansions are quoted so that an empty or unusual IOS_PLATFORM value
# reaches the FATAL_ERROR below instead of producing an if() syntax error or
# being dereferenced as another variable name.
if ("${IOS_PLATFORM}" STREQUAL "iPhoneOS")
  set (IOS_PLATFORM_LOCATION "iPhoneOS.platform")

  # This causes the installers to properly locate the output libraries
  set (CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphoneos")
elseif ("${IOS_PLATFORM}" STREQUAL "iPhoneSimulator")
  set (IOS_PLATFORM_LOCATION "iPhoneSimulator.platform")

  # This causes the installers to properly locate the output libraries
  set (CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphonesimulator")
else ()
  message (FATAL_ERROR "Unsupported IOS_PLATFORM value selected. Please choose iPhoneOS or iPhoneSimulator")
endif ()

# Choose the iOS developer root unless CMAKE_IOS_DEVELOPER_ROOT was given.
# Xcode 4.3 moved the platform folders into the Xcode.app bundle; the newer
# location is tried first, then the pre-4.3 one. The result is cached.
set (XCODE_POST_43_ROOT "/Applications/Xcode.app/Contents/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
set (XCODE_PRE_43_ROOT "/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
if (NOT DEFINED CMAKE_IOS_DEVELOPER_ROOT)
  if (EXISTS ${XCODE_POST_43_ROOT})
    set (CMAKE_IOS_DEVELOPER_ROOT ${XCODE_POST_43_ROOT})
  elseif (EXISTS ${XCODE_PRE_43_ROOT})
    set (CMAKE_IOS_DEVELOPER_ROOT ${XCODE_PRE_43_ROOT})
  endif ()
endif ()
set (
  CMAKE_IOS_DEVELOPER_ROOT ${CMAKE_IOS_DEVELOPER_ROOT}
  CACHE PATH "Location of iOS Platform"
)

# Find and use the most recent iOS sdk unless specified manually with CMAKE_IOS_SDK_ROOT.
# The SDK folders under <developer root>/SDKs are sorted and the last one wins.
# A plain list(SORT) is lexicographic and would rank "iPhoneOS9.3.sdk" above
# "iPhoneOS10.0.sdk"; natural ordering (CMake >= 3.18) compares the embedded
# version numbers numerically. Older CMake keeps the previous behaviour.
if (NOT DEFINED CMAKE_IOS_SDK_ROOT)
  file (GLOB _CMAKE_IOS_SDKS "${CMAKE_IOS_DEVELOPER_ROOT}/SDKs/*")
  if (_CMAKE_IOS_SDKS)
    if (CMAKE_VERSION VERSION_LESS 3.18)
      list (SORT _CMAKE_IOS_SDKS)
    else ()
      list (SORT _CMAKE_IOS_SDKS COMPARE NATURAL)
    endif ()
    list (REVERSE _CMAKE_IOS_SDKS)
    list (GET _CMAKE_IOS_SDKS 0 CMAKE_IOS_SDK_ROOT)
  else ()
    message (FATAL_ERROR "No iOS SDK's found in default search path ${CMAKE_IOS_DEVELOPER_ROOT}. Manually set CMAKE_IOS_SDK_ROOT or install the iOS SDK.")
  endif ()
  message (STATUS "Toolchain using default iOS SDK: ${CMAKE_IOS_SDK_ROOT}")
endif ()
set (CMAKE_IOS_SDK_ROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Location of the selected iOS SDK")

# Set the sysroot default to the most recent SDK
set (CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support")

# set the architecture for iOS
# NOTE: Currently both ARCHS_STANDARD_32_BIT and ARCHS_UNIVERSAL_IPHONE_OS set armv7 only, so set both manually
# Device builds target armv7; every other platform value (the simulator) targets i386.
# The comparison is quoted so IOS_PLATFORM is always compared as a string.
if ("${IOS_PLATFORM}" STREQUAL "iPhoneOS")
  set (IOS_ARCH armv7)
else ()
  set (IOS_ARCH i386)
endif ()

# STRING is the valid cache entry type; the lowercase spelling is not a cache type.
set (CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS")

# Set the find root to the iOS developer roots and to user defined paths.
# STRING is the valid cache entry type; the lowercase spelling is not a cache type.
set (CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE STRING "iOS find search path root")

# default to searching for frameworks first
set (CMAKE_FIND_FRAMEWORK FIRST)

# set up the default search directories for frameworks
set (CMAKE_SYSTEM_FRAMEWORK_PATH
${CMAKE_IOS_SDK_ROOT}/System/Library/Frameworks
${CMAKE_IOS_SDK_ROOT}/System/Library/PrivateFrameworks
${CMAKE_IOS_SDK_ROOT}/Developer/Library/Frameworks
)

# only search the iOS sdks, not the remainder of the host filesystem
set (CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)
set (CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set (CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)


# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE)
# Sets the XCODE_ATTRIBUTE_<XCODE_PROPERTY> property of TARGET to XCODE_VALUE.
macro (set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE)
  set_property (
    TARGET ${TARGET}
    PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY} ${XCODE_VALUE}
  )
endmacro ()


# find_host_package (<find_package arguments...>)
# Runs find_package() against the host system: the find-root restrictions are
# switched to NEVER and IOS is cleared for the duration of the call, then the
# iOS-only settings (ONLY / IOS TRUE) are put back.
macro (find_host_package)
  set (IOS FALSE)
  set (CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
  set (CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER)
  set (CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER)

  find_package (${ARGN})

  set (CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
  set (CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
  set (CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)
  set (IOS TRUE)
endmacro ()


+ 40
- 0
iossimxc.toolchain.cmake View File

@@ -0,0 +1,40 @@
# Cross-compilation toolchain for the iOS Simulator (i386 + x86_64) using
# i386-apple-darwin11-clang.
#
# Options:
#   CMAKE_IOS_SDK_ROOT  path to the simulator SDK. Defaults to the original
#                       author's cctools-port location; override it with
#                       -DCMAKE_IOS_SDK_ROOT=/path/to/SDK on other machines.

# Standard settings
set(CMAKE_SYSTEM_NAME Darwin)
set(CMAKE_SYSTEM_VERSION 1)
set(UNIX True)
set(APPLE True)
set(IOS True)

set(CMAKE_C_COMPILER i386-apple-darwin11-clang)
set(CMAKE_CXX_COMPILER i386-apple-darwin11-clang++)

set(_CMAKE_TOOLCHAIN_PREFIX i386-apple-darwin11-)

# A cache entry (instead of a plain variable) lets a user-supplied value win
# over the built-in default path.
set(CMAKE_IOS_SDK_ROOT "/home/nihui/osd/cctools-port/usage_examples/ios_toolchain/target-sim/SDK/"
    CACHE PATH "Location of the iOS Simulator SDK")

# Forward the chosen SDK root to try_compile() projects (honoured by CMake >= 3.6).
list(APPEND CMAKE_TRY_COMPILE_PLATFORM_VARIABLES CMAKE_IOS_SDK_ROOT)
list(REMOVE_DUPLICATES CMAKE_TRY_COMPILE_PLATFORM_VARIABLES)

# Set the sysroot default to the selected SDK
set(CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS Simulator support")

# set the architecture for iOS (use a single entry to build one slice only)
set(IOS_ARCH "i386;x86_64")

# STRING is the valid cache entry type; the lowercase spelling is not a cache type.
set(CMAKE_OSX_ARCHITECTURES "${IOS_ARCH}" CACHE STRING "Build architecture for iOS Simulator")

# Set the find root to the iOS developer roots and to user defined paths
set(CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE STRING "iOS Simulator find search path root")

# search for frameworks first
set(CMAKE_FIND_FRAMEWORK FIRST)

# set up the default search directories for frameworks
set(CMAKE_SYSTEM_FRAMEWORK_PATH
    ${CMAKE_IOS_SDK_ROOT}/System/Library/Frameworks
)

+ 39
- 0
iosxc.toolchain.cmake View File

@@ -0,0 +1,39 @@
# Cross-compilation toolchain for iOS devices (armv7 + arm64) using
# arm-apple-darwin11-clang.
#
# Options:
#   CMAKE_IOS_SDK_ROOT  path to the device SDK. Defaults to the original
#                       author's cctools-port location; override it with
#                       -DCMAKE_IOS_SDK_ROOT=/path/to/SDK on other machines.

# Standard settings
set(CMAKE_SYSTEM_NAME Darwin)
set(CMAKE_SYSTEM_VERSION 1)
set(UNIX True)
set(APPLE True)
set(IOS True)

set(CMAKE_C_COMPILER arm-apple-darwin11-clang)
set(CMAKE_CXX_COMPILER arm-apple-darwin11-clang++)

set(_CMAKE_TOOLCHAIN_PREFIX arm-apple-darwin11-)

# A cache entry (instead of a plain variable) lets a user-supplied value win
# over the built-in default path.
set(CMAKE_IOS_SDK_ROOT "/home/nihui/osd/cctools-port/usage_examples/ios_toolchain/target/SDK/"
    CACHE PATH "Location of the iOS SDK")

# Forward the chosen SDK root to try_compile() projects (honoured by CMake >= 3.6).
list(APPEND CMAKE_TRY_COMPILE_PLATFORM_VARIABLES CMAKE_IOS_SDK_ROOT)
list(REMOVE_DUPLICATES CMAKE_TRY_COMPILE_PLATFORM_VARIABLES)

# Set the sysroot default to the selected SDK
set(CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support")

# set the architecture for iOS (use a single entry to build one slice only)
set(IOS_ARCH "armv7;arm64")

# STRING is the valid cache entry type; the lowercase spelling is not a cache type.
set(CMAKE_OSX_ARCHITECTURES "${IOS_ARCH}" CACHE STRING "Build architecture for iOS")

# Set the find root to the iOS developer roots and to user defined paths
set(CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE STRING "iOS find search path root")

# search for frameworks first
set(CMAKE_FIND_FRAMEWORK FIRST)

# set up the default search directories for frameworks
set(CMAKE_SYSTEM_FRAMEWORK_PATH
    ${CMAKE_IOS_SDK_ROOT}/System/Library/Frameworks
)

+ 35
- 0
package.sh View File

@@ -0,0 +1,35 @@
#!/usr/bin/env bash
#
# Package prebuilt ncnn artifacts from the build-* directories:
#   - <name>-android-lib.zip : static libs for armeabi-v7a / arm64-v8a + headers
#   - <name>.framework.zip   : universal iOS framework (device + simulator)
# `env bash` is used because bash is not installed at /usr/bin/bash on every
# system; expansions are quoted so paths survive word splitting.

NAME=ncnn

##### package android lib
ANDROIDPKGNAME="${NAME}-android-lib"
rm -rf "$ANDROIDPKGNAME"
mkdir -p "$ANDROIDPKGNAME"
mkdir -p "$ANDROIDPKGNAME/armeabi-v7a"
mkdir -p "$ANDROIDPKGNAME/arm64-v8a"
mkdir -p "$ANDROIDPKGNAME/include"
cp "build-android-armv7/install/lib/lib${NAME}.a" "$ANDROIDPKGNAME/armeabi-v7a/"
cp "build-android-aarch64/install/lib/lib${NAME}.a" "$ANDROIDPKGNAME/arm64-v8a/"
cp build-android-aarch64/install/include/* "$ANDROIDPKGNAME/include/"
rm -f "$ANDROIDPKGNAME.zip"
zip -9 -r "$ANDROIDPKGNAME.zip" "$ANDROIDPKGNAME"

##### package ios framework
IOSPKGNAME="${NAME}.framework"
rm -rf "$IOSPKGNAME"
mkdir -p "$IOSPKGNAME/Versions/A/Headers"
mkdir -p "$IOSPKGNAME/Versions/A/Resources"
# framework layout: top-level entries are symlinks into Versions/Current
ln -s A "$IOSPKGNAME/Versions/Current"
ln -s Versions/Current/Headers "$IOSPKGNAME/Headers"
ln -s Versions/Current/Resources "$IOSPKGNAME/Resources"
ln -s "Versions/Current/${NAME}" "$IOSPKGNAME/${NAME}"
# merge the device and simulator static libraries into one universal binary
lipo -create \
    "build-ios/install/lib/lib${NAME}.a" \
    "build-ios-sim/install/lib/lib${NAME}.a" \
    -o "$IOSPKGNAME/Versions/A/${NAME}"
cp -r build-ios/install/include/* "$IOSPKGNAME/Versions/A/Headers/"
cp Info.plist "${IOSPKGNAME}/Versions/A/Resources/"
rm -f "$IOSPKGNAME.zip"
# -y stores the symlinks as symlinks instead of following them
zip -9 -y -r "$IOSPKGNAME.zip" "$IOSPKGNAME"


+ 135
- 0
src/CMakeLists.txt View File

@@ -0,0 +1,135 @@

##############################################

# Generate platform.h from its template into the build directory.
configure_file(
    ${CMAKE_CURRENT_SOURCE_DIR}/platform.h.in
    ${CMAKE_CURRENT_BINARY_DIR}/platform.h
)

# Header search paths: the sources, the generated headers, the layer sources.
include_directories(
    ${CMAKE_CURRENT_SOURCE_DIR}
    ${CMAKE_CURRENT_BINARY_DIR}
    ${CMAKE_CURRENT_SOURCE_DIR}/layer
)

# Core (non-layer) sources; ncnn_add_layer() appends the layer sources.
set(ncnn_SRCS blob.cpp cpu.cpp layer.cpp mat.cpp mat_pixel.cpp net.cpp opencv.cpp)

# ncnn_add_layer(<Class> [ON|OFF])
#
# Registers one layer type:
#   - declares the option WITH_LAYER_<class lowercased> (default ON, or the
#     value given as the optional second argument);
#   - when enabled, appends layer/<name>.cpp to ncnn_SRCS plus the arm or x86
#     specialisation if such a source file exists for the target architecture;
#   - appends the matching creator declaration to layer_declaration.h and one
#     entry to layer_registry.h (a null creator when the layer is disabled, so
#     registry indices stay stable).
#
# This is a macro on purpose: it modifies ncnn_SRCS in the caller's scope.
# Example: ncnn_add_layer(ArgMax OFF)
macro(ncnn_add_layer class)
    string(TOLOWER ${class} name)

    # WITH_LAYER_xxx option
    if(${ARGC} EQUAL 2)
        option(WITH_LAYER_${name} "build with layer ${name}" ${ARGV1})
    else()
        option(WITH_LAYER_${name} "build with layer ${name}" ON)
    endif()

    # STATUS keeps the per-layer report on the normal configure log instead of
    # emitting one stderr notice per layer.
    message(STATUS "WITH_LAYER_${name} = ${WITH_LAYER_${name}}")

    if(WITH_LAYER_${name})
        list(APPEND ncnn_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/layer/${name}.cpp")

        # look for arch specific implementation and append source
        # optimized implementation for armv7 aarch64
        if((ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "armv7-a"))
            OR (ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64"))
            OR (IOS AND ("${CMAKE_OSX_ARCHITECTURES}" STREQUAL "armv7"))
            OR (IOS AND ("${CMAKE_OSX_ARCHITECTURES}" STREQUAL "arm64"))
            OR (IOS AND ("${CMAKE_OSX_ARCHITECTURES}" STREQUAL "armv7;arm64")))
            if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/layer/arm/${name}_arm.cpp")
                list(APPEND ncnn_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/layer/arm/${name}_arm.cpp")
                set(WITH_LAYER_${name}_arm 1)
            endif()
        else()
            if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/layer/x86/${name}_x86.cpp")
                list(APPEND ncnn_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/layer/x86/${name}_x86.cpp")
                set(WITH_LAYER_${name}_x86 1)
            endif()
        endif()
    endif()

    # generate layer_declaration and layer_registry file
    if(WITH_LAYER_${name})
        # pick the most specialised creator that was compiled in
        if(WITH_LAYER_${name}_arm)
            set(ncnn_layer_creator "${class}_arm_layer_creator")
        elseif(WITH_LAYER_${name}_x86)
            set(ncnn_layer_creator "${class}_x86_layer_creator")
        else()
            set(ncnn_layer_creator "${class}_layer_creator")
        endif()

        file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/layer_declaration.h
            "extern Layer* ${ncnn_layer_creator}();\n")
        file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/layer_registry.h
            "#if NCNN_STRING\n{\"${class}\",${ncnn_layer_creator}},\n#else\n{${ncnn_layer_creator}},\n#endif\n")
    else()
        file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/layer_registry.h "#if NCNN_STRING\n{\"${class}\",0},\n#else\n{0},\n#endif\n")
    endif()
endmacro()

# create new
file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/layer_declaration.h)
file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/layer_registry.h)

# layer implementation
ncnn_add_layer(AbsVal)
ncnn_add_layer(ArgMax OFF)
ncnn_add_layer(BatchNorm)
ncnn_add_layer(Bias)
ncnn_add_layer(BNLL)
ncnn_add_layer(Concat)
ncnn_add_layer(Convolution)
ncnn_add_layer(Crop)
ncnn_add_layer(Deconvolution)
ncnn_add_layer(Dropout)
ncnn_add_layer(Eltwise)
ncnn_add_layer(ELU)
ncnn_add_layer(Embed OFF)
ncnn_add_layer(Exp)
ncnn_add_layer(Flatten)
ncnn_add_layer(InnerProduct)
ncnn_add_layer(Input)
ncnn_add_layer(Log)
ncnn_add_layer(LRN)
ncnn_add_layer(MemoryData OFF)
ncnn_add_layer(MVN)
ncnn_add_layer(Pooling)
ncnn_add_layer(Power)
ncnn_add_layer(PReLU)
ncnn_add_layer(Proposal OFF)
ncnn_add_layer(Reduction OFF)
ncnn_add_layer(ReLU)
ncnn_add_layer(Reshape OFF)
ncnn_add_layer(ROIPooling OFF)
ncnn_add_layer(Scale)
ncnn_add_layer(Sigmoid)
ncnn_add_layer(Slice)
ncnn_add_layer(Softmax)
ncnn_add_layer(Split)
ncnn_add_layer(SPP OFF)
ncnn_add_layer(TanH)
ncnn_add_layer(Threshold)
ncnn_add_layer(Tile OFF)
ncnn_add_layer(RNN OFF)
ncnn_add_layer(LSTM OFF)

# Build everything collected in ncnn_SRCS into the static library "ncnn".
add_library(ncnn STATIC ${ncnn_SRCS})

# Install the archive under lib/ and the public headers (including the
# generated platform.h from the build directory) under include/.
install(TARGETS ncnn ARCHIVE DESTINATION lib)
install(FILES
blob.h
cpu.h
layer.h
mat.h
net.h
opencv.h
${CMAKE_CURRENT_BINARY_DIR}/platform.h
DESTINATION include
)

+ 24
- 0
src/blob.cpp View File

@@ -0,0 +1,24 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "blob.h"

namespace ncnn {

// A fresh blob has no producing layer yet; -1 marks "no producer".
Blob::Blob()
    : producer(-1)
{
}

} // namespace ncnn

+ 43
- 0
src/blob.h View File

@@ -0,0 +1,43 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef NCNN_BLOB_H
#define NCNN_BLOB_H

#include <string>
#include <vector>
#include "platform.h"

namespace ncnn {

// Graph bookkeeping for one blob: which layer produces it and which layers
// consume it. Only indices (and optionally the name) are stored here.
class Blob
{
public:
// empty blob; the constructor sets producer to -1
Blob();

public:
#if NCNN_STRING
// blob name (only present when built with NCNN_STRING)
std::string name;
#endif // NCNN_STRING
// index of the layer which produces this blob as output
int producer;
// indices of the layers which need this blob as input
std::vector<int> consumers;
};

} // namespace ncnn

#endif // NCNN_BLOB_H

+ 471
- 0
src/cpu.cpp View File

@@ -0,0 +1,471 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "cpu.h"

#include <stdio.h>
#include <string.h>
#include <vector>

#ifdef _OPENMP
#include <omp.h>
#endif

#ifdef __ANDROID__
#include <sys/syscall.h>
#include <unistd.h>
#endif

#if __APPLE__
#include "TargetConditionals.h"
#if TARGET_OS_IPHONE
#include <sys/types.h>
#include <sys/sysctl.h>
#include <mach/machine.h>
#define __IOS__ 1
#endif
#endif

namespace ncnn {

#ifdef __ANDROID__

// Extract the ELF HW capabilities bitmap (AT_HWCAP) from /proc/self/auxv.
// Returns 0 when the file cannot be opened or no AT_HWCAP entry is found.
//
// The auxiliary vector is a sequence of (tag, value) pairs of native machine
// words: 32-bit on armv7, 64-bit on aarch64. Reading the pairs as two
// `unsigned long` fields matches both layouts; a fixed pair of 32-bit fields
// would misparse every entry on 64-bit targets.
static unsigned int get_elf_hwcap_from_proc_self_auxv()
{
    FILE* fp = fopen("/proc/self/auxv", "rb");
    if (!fp)
    {
        return 0;
    }

#define AT_HWCAP 16
#define AT_HWCAP2 26

    struct { unsigned long tag; unsigned long value; } entry;

    unsigned int result = 0;
    while (!feof(fp))
    {
        size_t nread = fread((char*)&entry, sizeof(entry), 1, fp);
        if (nread != 1)
            break;

        // an all-zero pair terminates the vector
        if (entry.tag == 0 && entry.value == 0)
            break;

        if (entry.tag == AT_HWCAP)
        {
            // the HWCAP_* bits tested in this file all fit in the low 32 bits
            result = (unsigned int)entry.value;
            break;
        }
    }

    fclose(fp);

    return result;
}

// HW capability bitmap, read once during static initialization.
static unsigned int g_hwcaps = get_elf_hwcap_from_proc_self_auxv();

// Bit masks tested against g_hwcaps; the set differs per architecture.
#if __aarch64__
// from arch/arm64/include/uapi/asm/hwcap.h
#define HWCAP_ASIMD (1 << 1)
#define HWCAP_ASIMDHP (1 << 10)
#else
// from arch/arm/include/uapi/asm/hwcap.h
#define HWCAP_NEON (1 << 12)
#define HWCAP_VFPv4 (1 << 16)
#endif

#endif // __ANDROID__

#if __IOS__
// Query the "hw.cputype" sysctl; returns 0 if the call leaves value untouched.
static cpu_type_t get_hw_cputype()
{
cpu_type_t value = 0;
size_t len = sizeof(value);
sysctlbyname("hw.cputype", &value, &len, NULL, 0);
return value;
}

// Query the "hw.cpusubtype" sysctl; returns 0 if the call leaves value untouched.
static cpu_subtype_t get_hw_cpusubtype()
{
cpu_subtype_t value = 0;
size_t len = sizeof(value);
sysctlbyname("hw.cpusubtype", &value, &len, NULL, 0);
return value;
}

// Both values are read once during static initialization.
static cpu_type_t g_hw_cputype = get_hw_cputype();
static cpu_subtype_t g_hw_cpusubtype = get_hw_cpusubtype();
#endif // __IOS__

// Returns non-zero when the CPU reports NEON (armv7) or ASIMD (aarch64).
// Android: tests the hwcap bitmap, so the non-zero value is the mask bit, not 1.
// iOS: derived from the sysctl cpu type / subtype.
// Any other platform: always 0.
int cpu_support_arm_neon()
{
#ifdef __ANDROID__
#if __aarch64__
return g_hwcaps & HWCAP_ASIMD;
#else
return g_hwcaps & HWCAP_NEON;
#endif
#elif __IOS__
#if __aarch64__
return g_hw_cputype == CPU_TYPE_ARM64;
#else
// NOTE(review): the strict '>' excludes CPU_SUBTYPE_ARM_V7 itself — confirm that is intended
return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7;
#endif
#else
return 0;
#endif
}

// Returns non-zero when the CPU reports VFPv4 (fp16 + fma).
// Android: tests the hwcap bitmap, so the non-zero value is the mask bit, not 1.
// iOS: derived from the sysctl cpu type / subtype.
// Any other platform: always 0.
int cpu_support_arm_vfpv4()
{
#ifdef __ANDROID__
#if __aarch64__
// neon always enable fma and fp16
return g_hwcaps & HWCAP_ASIMD;
#else
return g_hwcaps & HWCAP_VFPv4;
#endif
#elif __IOS__
#if __aarch64__
return g_hw_cputype == CPU_TYPE_ARM64;
#else
// NOTE(review): the strict '>' excludes CPU_SUBTYPE_ARM_V7S itself — confirm that is intended
return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7S;
#endif
#else
return 0;
#endif
}

// Reports aarch64 ASIMD half-precision support.
// Only 64-bit Android consults the hwcap bitmap; every other configuration
// (32-bit Android, iOS, anything else) reports 0.
int cpu_support_arm_asimdhp()
{
#if defined(__ANDROID__) && __aarch64__
    return g_hwcaps & HWCAP_ASIMDHP;
#else
    return 0;
#endif
}

// Number of CPUs, never less than 1.
// Android counts the lines of /proc/cpuinfo that start with "processor";
// iOS asks the "hw.ncpu" sysctl; any other platform reports 1.
static int get_cpucount()
{
#ifdef __ANDROID__
    FILE* cpuinfo = fopen("/proc/cpuinfo", "rb");
    if (!cpuinfo)
        return 1;

    int nprocessors = 0;
    char buf[1024];
    while (!feof(cpuinfo) && fgets(buf, 1024, cpuinfo))
    {
        if (memcmp(buf, "processor", 9) == 0)
            nprocessors++;
    }

    fclose(cpuinfo);

    return nprocessors < 1 ? 1 : nprocessors;
#elif __IOS__
    int ncpu = 0;
    size_t ncpu_size = sizeof(ncpu);
    sysctlbyname("hw.ncpu", &ncpu, &ncpu_size, NULL, 0);

    return ncpu < 1 ? 1 : ncpu;
#else
    return 1;
#endif
}

// CPU count, computed once during static initialization.
static int g_cpucount = get_cpucount();

// Returns the cached CPU count.
int get_cpu_count()
{
return g_cpucount;
}

#ifdef __ANDROID__
// Highest frequency listed in the cpufreq time_in_state table of one CPU.
// Each record is "<freq_khz> <time>"; only the first column is used.
// Returns -1 when the table cannot be opened, 0 when it holds no record.
static int get_max_freq_khz(int cpuid)
{
    char stats_path[256];
    sprintf(stats_path, "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);

    FILE* stats = fopen(stats_path, "rb");
    if (!stats)
        return -1;

    int highest_khz = 0;
    for (;;)
    {
        if (feof(stats))
            break;

        int khz = 0;
        if (fscanf(stats, "%d %*d", &khz) != 1)
            break;

        if (khz > highest_khz)
            highest_khz = khz;
    }

    fclose(stats);

    return highest_khz;
}

// Restrict the calling thread to the given cpu ids.
// Builds a 1024-bit cpu mask locally and issues the sched_setaffinity syscall
// directly for the thread id returned by gettid().
// Returns 0 on success, -1 (after logging to stderr) when the syscall fails.
static int set_sched_affinity(const std::vector<int>& cpuids)
{
// cpu_set_t definition
// ref http://stackoverflow.com/questions/16319725/android-set-thread-affinity
#define CPU_SETSIZE 1024
#define __NCPUBITS (8 * sizeof (unsigned long))
typedef struct
{
unsigned long __bits[CPU_SETSIZE / __NCPUBITS];
} cpu_set_t;

// set the bit of one cpu in the mask
#define CPU_SET(cpu, cpusetp) \
((cpusetp)->__bits[(cpu)/__NCPUBITS] |= (1UL << ((cpu) % __NCPUBITS)))

// clear the whole mask
#define CPU_ZERO(cpusetp) \
memset((cpusetp), 0, sizeof(cpu_set_t))

// set affinity for thread
pid_t pid = gettid();

cpu_set_t mask;
CPU_ZERO(&mask);
for (int i=0; i<(int)cpuids.size(); i++)
{
CPU_SET(cpuids[i], &mask);
}

int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask);
if (syscallret)
{
fprintf(stderr, "syscall error %d\n", syscallret);
return -1;
}

return 0;
}

// Fill cpuids with 0..size-1 ordered by descending max frequency and locate
// where the slower ("little") cluster starts.
// cpuids: its size selects how many cpus are probed; its contents are overwritten.
// little_cluster_offset: receives the index of the first cpu whose max
// frequency is below the midpoint of the fastest and slowest cpu; stays 0
// when no such split is found (all cpus equal, or no cpus).
// Always returns 0.
static int sort_cpuid_by_max_frequency(std::vector<int>& cpuids, int* little_cluster_offset)
{
const int cpu_count = cpuids.size();

*little_cluster_offset = 0;

if (cpu_count == 0)
return 0;

std::vector<int> cpu_max_freq_khz;
cpu_max_freq_khz.resize(cpu_count);

for (int i=0; i<cpu_count; i++)
{
int max_freq_khz = get_max_freq_khz(i);

// printf("%d max freq = %d khz\n", i, max_freq_khz);

cpuids[i] = i;
cpu_max_freq_khz[i] = max_freq_khz;
}

// sort cpuid as big core first
// simple exchange sort; ids and frequencies are swapped together
for (int i=0; i<cpu_count; i++)
{
for (int j=i+1; j<cpu_count; j++)
{
if (cpu_max_freq_khz[i] < cpu_max_freq_khz[j])
{
// swap
int tmp = cpuids[i];
cpuids[i] = cpuids[j];
cpuids[j] = tmp;

tmp = cpu_max_freq_khz[i];
cpu_max_freq_khz[i] = cpu_max_freq_khz[j];
cpu_max_freq_khz[j] = tmp;
}
}
}

// SMP: the midpoint equals the slowest frequency, so there is nothing to split
int mid_max_freq_khz = (cpu_max_freq_khz.front() + cpu_max_freq_khz.back()) / 2;
if (mid_max_freq_khz == cpu_max_freq_khz.back())
return 0;

// first cpu below the midpoint starts the little cluster
for (int i=0; i<cpu_count; i++)
{
if (cpu_max_freq_khz[i] < mid_max_freq_khz)
{
*little_cluster_offset = i;
break;
}
}

return 0;
}
#endif // __ANDROID__

// Last powersave mode applied successfully by set_cpu_powersave (0 initially).
static int g_powersave = 0;

// Returns the currently recorded powersave mode.
int get_cpu_powersave()
{
return g_powersave;
}

// Bind the worker threads to a subset of the cpus.
// powersave: 0 = all cpus, 1 = little cluster only, 2 = big cluster only.
// Returns 0 on success and records the mode in g_powersave; returns -1 when
// no big/little split was detected (for every mode, including 0), when the
// mode is unknown, when setting the affinity fails, or on non-Android builds.
int set_cpu_powersave(int powersave)
{
#ifdef __ANDROID__
// computed on the first call and reused afterwards
static std::vector<int> sorted_cpuids;
static int little_cluster_offset = 0;

if (sorted_cpuids.empty())
{
// 0 ~ g_cpucount
sorted_cpuids.resize(g_cpucount);
for (int i=0; i<g_cpucount; i++)
{
sorted_cpuids[i] = i;
}

// descent sort by max frequency
sort_cpuid_by_max_frequency(sorted_cpuids, &little_cluster_offset);
}

if (little_cluster_offset == 0)
{
fprintf(stderr, "SMP cpu powersave not supported\n");
return -1;
}

// prepare affinity cpuid
std::vector<int> cpuids;
if (powersave == 0)
{
cpuids = sorted_cpuids;
}
else if (powersave == 1)
{
// little cluster: the tail of the frequency-sorted list
cpuids = std::vector<int>(sorted_cpuids.begin() + little_cluster_offset, sorted_cpuids.end());
}
else if (powersave == 2)
{
// big cluster: the head of the frequency-sorted list
cpuids = std::vector<int>(sorted_cpuids.begin(), sorted_cpuids.begin() + + little_cluster_offset);
}
else
{
fprintf(stderr, "powersave %d not supported\n", powersave);
return -1;
}

#ifdef _OPENMP
// set affinity for each thread
// the thread count is set to the number of selected cpus and every loop
// iteration applies the same mask from whichever thread executes it
int num_threads = cpuids.size();
omp_set_num_threads(num_threads);
std::vector<int> ssarets(num_threads, 0);
#pragma omp parallel for
for (int i=0; i<num_threads; i++)
{
ssarets[i] = set_sched_affinity(cpuids);
}
for (int i=0; i<num_threads; i++)
{
if (ssarets[i] != 0)
{
return -1;
}
}
#else
int ssaret = set_sched_affinity(cpuids);
if (ssaret != 0)
{
return -1;
}
#endif

g_powersave = powersave;

return 0;
#elif __IOS__
// thread affinity not supported on ios
return -1;
#else
// TODO
return -1;
#endif
}

// Number of threads OpenMP will use for the next parallel region, or 1 when
// built without OpenMP.
// omp_get_max_threads() is queried because omp_get_num_threads() reports the
// size of the *current* team, which is always 1 when called from sequential
// code, so it would never reflect a value set via set_omp_num_threads().
int get_omp_num_threads()
{
#ifdef _OPENMP
    return omp_get_max_threads();
#else
    return 1;
#endif
}

// Forwards to omp_set_num_threads; a no-op when built without OpenMP.
void set_omp_num_threads(int num_threads)
{
#ifdef _OPENMP
omp_set_num_threads(num_threads);
#else
(void)num_threads;
#endif
}

// Forwards to omp_get_dynamic; returns 0 when built without OpenMP.
int get_omp_dynamic()
{
#ifdef _OPENMP
return omp_get_dynamic();
#else
return 0;
#endif
}

// Forwards to omp_set_dynamic; a no-op when built without OpenMP.
void set_omp_dynamic(int dynamic)
{
#ifdef _OPENMP
omp_set_dynamic(dynamic);
#else
(void)dynamic;
#endif
}

} // namespace ncnn

+ 51
- 0
src/cpu.h View File

@@ -0,0 +1,51 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef NCNN_CPU_H
#define NCNN_CPU_H

namespace ncnn {

// test optional cpu features
// each function returns non-zero if the feature is available, 0 otherwise
// neon = armv7 neon or aarch64 asimd
int cpu_support_arm_neon();
// vfpv4 = armv7 fp16 + fma
int cpu_support_arm_vfpv4();
// asimdhp = aarch64 asimd half precision
int cpu_support_arm_asimdhp();

// cpu info
// number of cpus, at least 1
int get_cpu_count();

// bind all threads on little clusters if powersave enabled
// affects HMP arch cpu like ARM big.LITTLE
// only implemented on android at the moment
// switching powersave is expensive and not thread-safe
// 0 = all cores enabled(default)
// 1 = only little clusters enabled
// 2 = only big clusters enabled
// return 0 if success for setter function
int get_cpu_powersave();
int set_cpu_powersave(int powersave);

// misc function wrapper for openmp routines
// without openmp the getters return a fixed value and the setters do nothing
int get_omp_num_threads();
void set_omp_num_threads(int num_threads);

int get_omp_dynamic();
void set_omp_dynamic(int dynamic);

} // namespace ncnn

#endif // NCNN_CPU_H

+ 130
- 0
src/layer.cpp View File

@@ -0,0 +1,130 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "layer.h"

#include <stdio.h>
#include <string.h>

namespace ncnn {

// By default a layer takes blob vectors and does not run in place;
// subclasses flip these flags as needed.
Layer::Layer()
    : one_blob_only(false), support_inplace(false)
{
}

// Nothing to release in the base class.
Layer::~Layer()
{
}

// Default loaders: layers without parameters or weights inherit these no-op
// implementations, which read nothing and report success (0).
#if NCNN_STDIO
#if NCNN_STRING
// plain-text param file variant
int Layer::load_param(FILE* /*paramfp*/)
{
return 0;
}
#endif // NCNN_STRING

// binary param file variant
int Layer::load_param_bin(FILE* /*paramfp*/)
{
return 0;
}

// weight file variant
int Layer::load_model(FILE* /*binfp*/)
{
return 0;
}
#endif // NCNN_STDIO

// in-memory param variant; the pointer is left where it was
int Layer::load_param(const unsigned char*& /*mem*/)
{
return 0;
}

// in-memory weight variant; the pointer is left where it was
int Layer::load_model(const unsigned char*& /*mem*/)
{
return 0;
}

// Default multi-blob inference: not implemented in the base class, returns -1.
int Layer::forward(const std::vector<Mat>& /*bottom_blobs*/, std::vector<Mat>& /*top_blobs*/) const
{
return -1;
}

// Default single-blob inference: not implemented in the base class, returns -1.
int Layer::forward(const Mat& /*bottom_blob*/, Mat& /*top_blob*/) const
{
return -1;
}

// Fallback in-place inference for blob vectors: run the regular forward()
// into a scratch vector, then overwrite the inputs with whatever it produced.
// The forward() status is passed through unchanged.
int Layer::forward_inplace(std::vector<Mat>& bottom_top_blobs) const
{
    std::vector<Mat> outputs;
    const int status = forward(bottom_top_blobs, outputs);
    bottom_top_blobs = outputs;
    return status;
}

// Fallback in-place inference for a single blob: run the regular forward()
// into a scratch Mat, then overwrite the input with whatever it produced.
// The forward() status is passed through unchanged.
int Layer::forward_inplace(Mat& bottom_top_blob) const
{
    Mat output;
    const int status = forward(bottom_top_blob, output);
    bottom_top_blob = output;
    return status;
}

// Creator function declarations, generated by the build system.
#include "layer_declaration.h"

// Table of all known layer types; the entries come from the generated
// layer_registry.h and a layer's position in the table is its index.
static const layer_registry_entry layer_registry[] =
{
#include "layer_registry.h"
};

// Number of entries in layer_registry.
static const int layer_registry_entry_count = sizeof(layer_registry) / sizeof(layer_registry_entry);

#if NCNN_STRING
// Look up a layer type name in the registry.
// Returns the registry index of the first exact match, or -1 (after logging
// to stderr) when the name is unknown.
int layer_to_index(const char* type)
{
    int index = 0;
    while (index < layer_registry_entry_count)
    {
        if (strcmp(type, layer_registry[index].name) == 0)
            return index;

        index++;
    }

    fprintf(stderr, "layer %s not exists\n", type);
    return -1;
}
#endif // NCNN_STRING

// Instantiate the layer registered at the given index.
// Returns 0 (after logging to stderr) when the index is out of range or the
// registry entry has no creator.
Layer* create_layer(int index)
{
    const bool in_range = index >= 0 && index < layer_registry_entry_count;
    if (!in_range)
    {
        fprintf(stderr, "layer index %d not exists\n", index);
        return 0;
    }

    layer_creator_func creator = layer_registry[index].creator;
    if (creator)
        return creator();

    fprintf(stderr, "layer index %d not enabled\n", index);
    return 0;
}

} // namespace ncnn

+ 163
- 0
src/layer.h View File

@@ -0,0 +1,163 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef NCNN_LAYER_H
#define NCNN_LAYER_H

#include <stdio.h>
#include <string>
#include <vector>
#include "mat.h"
#include "platform.h"

namespace ncnn {

// Base class of all layers. It declares the parameter / weight loading
// hooks and the inference entry points as virtual functions, plus the
// bookkeeping data (blob indexes, optional names) attached to a layer.
class Layer
{
public:
// empty
Layer();
// virtual destructor
virtual ~Layer();

#if NCNN_STDIO
#if NCNN_STRING
// load layer specific parameter from plain param file
// return 0 if success
virtual int load_param(FILE* paramfp);
#endif // NCNN_STRING
// load layer specific parameter from binary param file
// return 0 if success
virtual int load_param_bin(FILE* paramfp);

// load layer specific weight data from model file
// return 0 if success
virtual int load_model(FILE* binfp);
#endif // NCNN_STDIO

// load layer specific parameter from memory
// memory pointer is 32-bit aligned
// mem is taken by reference; ArgMax::load_param, for example, advances it
// past the bytes it consumed
// return 0 if success
virtual int load_param(const unsigned char*& mem);

// load layer specific weight data from memory
// memory pointer is 32-bit aligned
// the base implementation reads nothing and returns 0
// return 0 if success
virtual int load_model(const unsigned char*& mem);

public:
// one input and one output blob
bool one_blob_only;

// support inplace inference
bool support_inplace;

public:
// implement inference
// the base implementations do nothing and return -1
// return 0 if success
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

// implement inplace inference
// the base implementations call forward() into a temporary and copy the
// result back over the input
// return 0 if success
virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs) const;
virtual int forward_inplace(Mat& bottom_top_blob) const;

public:
#if NCNN_STRING
// layer type name
std::string type;
// layer name
std::string name;
#endif // NCNN_STRING
// blob index which this layer needs as input
std::vector<int> bottoms;
// blob index which this layer produces as output
std::vector<int> tops;
};

// Integer ids for the built-in layer types, numbered consecutively from 0.
// NOTE(review): presumably each value is the row position of that layer in
// the table consumed by create_layer(index) - confirm against
// layer_registry.h before reordering anything here.
namespace LayerType {
enum
{
AbsVal = 0,
ArgMax = 1,
BatchNorm = 2,
Bias = 3,
BNLL = 4,
Concat = 5,
Convolution = 6,
Crop = 7,
Deconvolution = 8,
Dropout = 9,
ELU = 10,
Eltwise = 11,
Embed = 12,
Exp = 13,
Flatten = 14,
InnerProduct = 15,
Input = 16,
Log = 17,
LRN = 18,
MemoryData = 19,
MVN = 20,
Pooling = 21,
Power = 22,
PReLU = 23,
Proposal = 24,
Reduction = 25,
ReLU = 26,
Reshape = 27,
ROIPooling = 28,
Scale = 29,
Sigmoid = 30,
Slice = 31,
Softmax = 32,
Split = 33,
SPP = 34,
TanH = 35,
Threshold = 36,
Tile = 37,
RNN = 38,
LSTM = 39,

// value 256, above every built-in id
// (presumably a flag bit marking user-defined layer types - TODO confirm)
CustomBit = (1<<8),
};
} // namespace LayerType

// layer factory function
// takes no arguments and returns a Layer pointer
typedef Layer* (*layer_creator_func)();

// One row of the layer registry table.
// The name field only exists when NCNN_STRING is enabled, so the struct
// layout depends on that switch.
struct layer_registry_entry
{
#if NCNN_STRING
// layer type name
const char* name;
#endif // NCNN_STRING
// layer factory entry
// create_layer() treats a null creator as "layer not enabled"
layer_creator_func creator;
};

#if NCNN_STRING
// get layer type from type name
// returns the registry index, or -1 when the name is unknown
int layer_to_index(const char* type);
#endif // NCNN_STRING
// create layer from layer type
// returns 0 when the index is out of range or the layer has no creator
Layer* create_layer(int index);

// Defines the factory function <name>_layer_creator(), which returns a
// heap-allocated instance of class <name>. Ownership of the returned
// object passes to the caller.
#define DEFINE_LAYER_CREATOR(name) \
Layer* name##_layer_creator() { return new name; }

} // namespace ncnn

#endif // NCNN_LAYER_H

+ 76
- 0
src/layer/absval.cpp View File

@@ -0,0 +1,76 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "absval.h"

namespace ncnn {

// emits the AbsVal_layer_creator() factory function
DEFINE_LAYER_CREATOR(AbsVal)

// Marks the layer as single-input / single-output and as supporting
// in-place inference.
AbsVal::AbsVal()
{
one_blob_only = true;
support_inplace = true;
}

int AbsVal::forward(const Mat& bottom_blob, Mat& top_blob) const
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
int size = w * h;

top_blob.create(w, h, channels);

#pragma omp parallel for
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);
float* outptr = top_blob.channel(q);

for (int i=0; i<size; i++)
{
if (ptr[i] < 0)
outptr[i] = -ptr[i];
else
outptr[i] = ptr[i];
}
}

return 0;
}

// Element-wise absolute value, in place.
// Every strictly negative element of every channel is replaced by its
// negation; other elements are left untouched. Always returns 0.
int AbsVal::forward_inplace(Mat& bottom_top_blob) const
{
    const int channels = bottom_top_blob.c;
    const int size = bottom_top_blob.w * bottom_top_blob.h;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* cur = bottom_top_blob.channel(q);
        float* const end = cur + size;

        for (; cur < end; ++cur)
        {
            if (*cur < 0)
                *cur = -*cur;
        }
    }

    return 0;
}

} // namespace ncnn

+ 36
- 0
src/layer/absval.h View File

@@ -0,0 +1,36 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_ABSVAL_H
#define LAYER_ABSVAL_H

#include "layer.h"

namespace ncnn {

// Layer computing the element-wise absolute value of a single blob.
// It has no parameters or weights of its own.
class AbsVal : public Layer
{
public:
// flags the layer as one_blob_only and support_inplace
AbsVal();

// out-of-place version: top_blob receives |bottom_blob|
virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

// in-place version: negative elements are negated where they are
virtual int forward_inplace(Mat& bottom_top_blob) const;

public:
};

} // namespace ncnn

#endif // LAYER_ABSVAL_H

+ 108
- 0
src/layer/argmax.cpp View File

@@ -0,0 +1,108 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "argmax.h"
#include <algorithm>
#include <functional>

namespace ncnn {

// emits the ArgMax_layer_creator() factory function
DEFINE_LAYER_CREATOR(ArgMax)

// ArgMax only implements the single-blob forward(), so it is flagged as
// one_blob_only in the same way AbsVal is; the multi-blob forward() would
// otherwise resolve to the base class stub that returns -1.
// The two parameters get defined values so that a layer used before
// load_param() does not read indeterminate memory: no value row, top-1.
ArgMax::ArgMax()
{
    one_blob_only = true;

    out_max_val = 0;
    topk = 1;
}

#if NCNN_STDIO
#if NCNN_STRING
// Read the two integer parameters (out_max_val, then topk) from the text
// param file. Returns 0 when both were parsed; otherwise logs the fscanf
// result to stderr and returns -1.
int ArgMax::load_param(FILE* paramfp)
{
    const int fields_read = fscanf(paramfp, "%d %d", &out_max_val, &topk);
    if (fields_read == 2)
        return 0;

    fprintf(stderr, "ArgMax load_param failed %d\n", fields_read);
    return -1;
}
#endif // NCNN_STRING
// Read the two integer parameters (out_max_val, then topk) from the binary
// param file, one raw int each.
// Returns 0 on success. A short read is reported on stderr and yields -1,
// matching the failure code of the text loader, instead of silently
// leaving a parameter unset.
int ArgMax::load_param_bin(FILE* paramfp)
{
    size_t nread = fread(&out_max_val, sizeof(int), 1, paramfp);
    if (nread != 1)
    {
        fprintf(stderr, "ArgMax read out_max_val failed %d\n", (int)nread);
        return -1;
    }

    nread = fread(&topk, sizeof(int), 1, paramfp);
    if (nread != 1)
    {
        fprintf(stderr, "ArgMax read topk failed %d\n", (int)nread);
        return -1;
    }

    return 0;
}
#endif // NCNN_STDIO

// Read the two integer parameters from an in-memory param blob:
// out_max_val at byte offset 0 and topk at byte offset 4. mem is then
// advanced by the 8 bytes consumed. Always returns 0.
int ArgMax::load_param(const unsigned char*& mem)
{
    const unsigned char* const base = mem;

    out_max_val = *(const int*)(base);
    topk = *(const int*)(base + 4);

    // step over both fields
    mem = base + 8;

    return 0;
}

// Select the topk largest elements of bottom_blob.
// Output layout, as written below:
//   out_max_val == 0 : one row of topk entries holding the element indexes
//   out_max_val != 0 : two rows of topk entries, the first holding the
//                      values and the second the matching element indexes
// Entries are ordered from largest to smallest value.
// Returns 0 on success, -1 when topk is not in [1, element count], and
// -100 when the output blob could not be allocated.
int ArgMax::forward(const Mat& bottom_blob, Mat& top_blob) const
{
    int size = bottom_blob.total();

    // vec.begin() + topk below must stay inside the vector
    if (topk <= 0 || topk > size)
    {
        fprintf(stderr, "ArgMax topk %d out of range for %d elements\n", topk, size);
        return -1;
    }

    if (out_max_val)
        top_blob.create(topk, 2);
    else
        top_blob.create(topk, 1);
    if (top_blob.empty())
        return -100;

    const float* ptr = bottom_blob;

    // partial sort topk with index
    // optional value
    std::vector< std::pair<float, int> > vec;
    vec.resize(size);
    for (int i=0; i<size; i++)
    {
        vec[i] = std::make_pair(ptr[i], i);
    }

    // descending order on (value, index) pairs
    std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
                      std::greater< std::pair<float, int> >());

    float* outptr = top_blob;
    if (out_max_val)
    {
        // second row starts topk floats after the first
        float* valptr = outptr + topk;
        for (int i=0; i<topk; i++)
        {
            outptr[i] = vec[i].first;
            valptr[i] = vec[i].second;
        }
    }
    else
    {
        for (int i=0; i<topk; i++)
        {
            outptr[i] = vec[i].second;
        }
    }

    return 0;
}

} // namespace ncnn

+ 44
- 0
src/layer/argmax.h View File

@@ -0,0 +1,44 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_ARGMAX_H
#define LAYER_ARGMAX_H

#include "layer.h"

namespace ncnn {

// Layer that reports the topk largest elements of a blob, as indexes and
// optionally as values too.
class ArgMax : public Layer
{
public:
ArgMax();

#if NCNN_STDIO
#if NCNN_STRING
// reads "out_max_val topk" as two integers from the text param file
virtual int load_param(FILE* paramfp);
#endif // NCNN_STRING
// reads out_max_val and topk as two raw ints from the binary param file
virtual int load_param_bin(FILE* paramfp);
#endif // NCNN_STDIO
// reads out_max_val and topk from memory and advances mem by 8 bytes
virtual int load_param(const unsigned char*& mem);

virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

public:
// non-zero: forward() produces a two-row output (values and indexes);
// zero: a single row of indexes
int out_max_val;
// number of top entries forward() writes per row
int topk;
};

} // namespace ncnn

#endif // LAYER_ARGMAX_H

+ 152
- 0
src/layer/arm/absval_arm.cpp View File

@@ -0,0 +1,152 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "absval_arm.h"

#include <math.h>

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

namespace ncnn {

// emits the AbsVal_arm_layer_creator() factory function
DEFINE_LAYER_CREATOR(AbsVal_arm)

// Element-wise absolute value, out of place, with a NEON fast path.
// top_blob is (re)created with the shape of bottom_blob. With NEON each
// channel is processed four floats at a time (intrinsics on aarch64,
// inline asm otherwise) and the leftover elements go through the scalar
// loop; without NEON the scalar loop handles everything.
// Returns 0 on success, -100 when top_blob could not be allocated.
int AbsVal_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int size = w * h;

    top_blob.create(w, h, channels);
    if (top_blob.empty())
        return -100;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        const float* ptr = bottom_blob.channel(q);
        float* outptr = top_blob.channel(q);

#if __ARM_NEON
        // nn groups of four floats, remain leftover floats
        int nn = size >> 2;
        int remain = size - (nn << 2);
#else
        int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        for (; nn>0; nn--)
        {
            float32x4_t _p = vld1q_f32(ptr);
            float32x4_t _outp = vabsq_f32(_p);
            vst1q_f32(outptr, _outp);

            ptr += 4;
            outptr += 4;
        }
#else
        if (nn > 0)
        {
        // load four floats, take vabs, store; the loop counter lives in %0
        asm volatile(
            "0: \n"
            "vld1.f32 {d0-d1}, [%1]! \n"
            "vabs.f32 q0, q0 \n"
            "subs %0, #1 \n"
            "vst1.f32 {d0-d1}, [%2]! \n"
            "bne 0b \n"
            : "=r"(nn), // %0
              "=r"(ptr), // %1
              "=r"(outptr) // %2
            : "0"(nn),
              "1"(ptr),
              "2"(outptr)
            : "cc", "memory", "q0"
        );
        }
#endif // __aarch64__
#endif // __ARM_NEON
        for (; remain>0; remain--)
        {
            // fabsf clears the sign bit like vabs does; the former
            // "x > 0 ? x : -x" turned +0.0 into -0.0
            *outptr = fabsf(*ptr);

            ptr++;
            outptr++;
        }
    }

    return 0;
}

// Element-wise absolute value, in place, with a NEON fast path.
// With NEON each channel is processed four floats at a time (intrinsics on
// aarch64, inline asm otherwise) and the leftover elements go through the
// scalar loop; without NEON the scalar loop handles everything.
// Always returns 0.
int AbsVal_arm::forward_inplace(Mat& bottom_top_blob) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

#if __ARM_NEON
        // nn groups of four floats, remain leftover floats
        int nn = size >> 2;
        int remain = size - (nn << 2);
#else
        int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        for (; nn>0; nn--)
        {
            float32x4_t _p = vld1q_f32(ptr);
            _p = vabsq_f32(_p);
            vst1q_f32(ptr, _p);

            ptr += 4;
        }
#else
        if (nn > 0)
        {
        // load without post-increment, take vabs, store back to the same
        // address with post-increment
        asm volatile(
            "0: \n"
            "vld1.f32 {d0-d1}, [%1] \n"
            "vabs.f32 q0, q0 \n"
            "subs %0, #1 \n"
            "vst1.f32 {d0-d1}, [%1]! \n"
            "bne 0b \n"
            : "=r"(nn), // %0
              "=r"(ptr) // %1
            : "0"(nn),
              "1"(ptr)
            : "cc", "memory", "q0"
        );
        }
#endif // __aarch64__
#endif // __ARM_NEON
        for (; remain>0; remain--)
        {
            // fabsf clears the sign bit like vabs does; the former
            // "x > 0 ? x : -x" turned +0.0 into -0.0
            *ptr = fabsf(*ptr);

            ptr++;
        }
    }

    return 0;
}

} // namespace ncnn

+ 34
- 0
src/layer/arm/absval_arm.h View File

@@ -0,0 +1,34 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_ABSVAL_ARM_H
#define LAYER_ABSVAL_ARM_H

#include "absval.h"

namespace ncnn {

// ARM variant of AbsVal. It overrides both inference entry points with
// implementations that use NEON when __ARM_NEON is defined and inherits
// everything else (including the constructor flags) from AbsVal.
class AbsVal_arm : public AbsVal
{
public:
virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

virtual int forward_inplace(Mat& bottom_top_blob) const;

public:
};

} // namespace ncnn

#endif // LAYER_ABSVAL_ARM_H

+ 186
- 0
src/layer/arm/batchnorm_arm.cpp View File

@@ -0,0 +1,186 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "batchnorm_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

namespace ncnn {

// emits the BatchNorm_arm_layer_creator() factory function
DEFINE_LAYER_CREATOR(BatchNorm_arm)

// Per-channel affine transform out = b * in + a, out of place, with a NEON
// fast path. a and b are read per channel from a_data / b_data; channels,
// a_data and b_data are not declared here (presumably BatchNorm members -
// confirm in batchnorm.h).
// Returns 0 on success, -100 when top_blob could not be allocated.
int BatchNorm_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
{
// a = bias - slope * mean / sqrt(var)
// b = slope / sqrt(var)
// value = b * value + a

int w = bottom_blob.w;
int h = bottom_blob.h;
int size = w * h;

top_blob.create(w, h, channels);
if (top_blob.empty())
return -100;

const float* a_data_ptr = a_data;
const float* b_data_ptr = b_data;
#pragma omp parallel for
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);
float* outptr = top_blob.channel(q);

float a = a_data_ptr[q];
float b = b_data_ptr[q];

#if __ARM_NEON
// nn groups of four floats, remain leftover floats
int nn = size >> 2;
int remain = size - (nn << 2);
#else
int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
float32x4_t _a = vdupq_n_f32(a);
float32x4_t _b = vdupq_n_f32(b);
for (; nn>0; nn--)
{
float32x4_t _p = vld1q_f32(ptr);
float32x4_t _outp = _a;
_outp = vfmaq_f32(_outp, _p, _b);
vst1q_f32(outptr, _outp);

ptr += 4;
outptr += 4;
}
#else
if (nn > 0)
{
// q1 = a and q2 = b broadcast; per iteration q3 = q1, then q3 += q0 * q2
// the loads and stores carry a :128 alignment hint, so both channel
// pointers have to be 16-byte aligned
asm volatile(
"vdup.f32 q1, %6 \n"
"vdup.f32 q2, %7 \n"
"0: \n"
"pld [%1, #128] \n"
"vld1.f32 {d0-d1}, [%1 :128]! \n"
"vorr.32 q3, q1, q1 \n"
"vmla.f32 q3, q0, q2 \n"
"subs %0, #1 \n"
"vst1.f32 {d6-d7}, [%2 :128]! \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(ptr), // %1
"=r"(outptr) // %2
: "0"(nn),
"1"(ptr),
"2"(outptr),
"r"(a), // %6
"r"(b) // %7
: "cc", "memory", "q0", "q1", "q2", "q3"
);
}
#endif // __aarch64__
#endif // __ARM_NEON
// scalar path for the leftover elements (or all of them without NEON)
for (; remain>0; remain--)
{
*outptr = b * *ptr + a;

ptr++;
outptr++;
}
}

return 0;
}

// Per-channel affine transform value = b * value + a, in place, with a
// NEON fast path. a and b are read per channel from a_data / b_data;
// channels, a_data and b_data are not declared here (presumably BatchNorm
// members - confirm in batchnorm.h).
// Always returns 0.
int BatchNorm_arm::forward_inplace(Mat& bottom_top_blob) const
{
// a = bias - slope * mean / sqrt(var)
// b = slope / sqrt(var)
// value = b * value + a

int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int size = w * h;

const float* a_data_ptr = a_data;
const float* b_data_ptr = b_data;
#pragma omp parallel for
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);

float a = a_data_ptr[q];
float b = b_data_ptr[q];

#if __ARM_NEON
// nn groups of four floats, remain leftover floats
int nn = size >> 2;
int remain = size - (nn << 2);
#else
int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
float32x4_t _a = vdupq_n_f32(a);
float32x4_t _b = vdupq_n_f32(b);
for (; nn>0; nn--)
{
float32x4_t _p = vld1q_f32(ptr);
float32x4_t _outp = _a;
_outp = vfmaq_f32(_outp, _p, _b);
vst1q_f32(ptr, _outp);

ptr += 4;
}
#else
if (nn > 0)
{
// q1 = a and q2 = b broadcast; per iteration q3 = q1, then q3 += q0 * q2
// the load has no post-increment; the store to the same address advances
// the pointer. Both carry a :128 alignment hint, so the channel pointer
// has to be 16-byte aligned
asm volatile(
"vdup.f32 q1, %4 \n"
"vdup.f32 q2, %5 \n"
"0: \n"
"pld [%1, #128] \n"
"vld1.f32 {d0-d1}, [%1 :128] \n"
"vorr.32 q3, q1, q1 \n"
"vmla.f32 q3, q0, q2 \n"
"subs %0, #1 \n"
"vst1.f32 {d6-d7}, [%1 :128]! \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(ptr) // %1
: "0"(nn),
"1"(ptr),
"r"(a), // %4
"r"(b) // %5
: "cc", "memory", "q0", "q1", "q2", "q3"
);
}
#endif // __aarch64__
#endif // __ARM_NEON
// scalar path for the leftover elements (or all of them without NEON)
for (; remain>0; remain--)
{
*ptr = b * *ptr + a;

ptr++;
}
}

return 0;
}

} // namespace ncnn

+ 32
- 0
src/layer/arm/batchnorm_arm.h View File

@@ -0,0 +1,32 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_BATCHNORM_ARM_H
#define LAYER_BATCHNORM_ARM_H

#include "batchnorm.h"

namespace ncnn {

// ARM variant of BatchNorm. It overrides both inference entry points with
// implementations that use NEON when __ARM_NEON is defined; everything
// else is inherited from BatchNorm.
class BatchNorm_arm : public BatchNorm
{
public:
virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

virtual int forward_inplace(Mat& bottom_top_blob) const;
};

} // namespace ncnn

#endif // LAYER_BATCHNORM_ARM_H

+ 122
- 0
src/layer/arm/bias_arm.cpp View File

@@ -0,0 +1,122 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "bias_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

namespace ncnn {

// emits the Bias_arm_layer_creator() factory function
DEFINE_LAYER_CREATOR(Bias_arm)

// Add the per-channel bias to every element, out of place.
// top_blob is (re)created with the shape of bottom_blob. With NEON, full
// groups of four floats are handled by vector adds and the rest by the
// scalar loop; without NEON the scalar loop does all the work.
// Returns 0 on success, -100 when top_blob could not be allocated.
int Bias_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
{
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int channels = bottom_blob.c;
    const int size = w * h;

    top_blob.create(w, h, channels);
    if (top_blob.empty())
        return -100;

    const float* bias_ptr = bias_data;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        const float* src = bottom_blob.channel(q);
        float* dst = top_blob.channel(q);

        const float bias = bias_ptr[q];

        int i = 0;

#if __ARM_NEON
        // vector part: as many complete groups of four as fit in size
        const float32x4_t _bias = vdupq_n_f32(bias);
        for (; i+3<size; i+=4)
        {
            const float32x4_t _in = vld1q_f32(src + i);
            vst1q_f32(dst + i, vaddq_f32(_in, _bias));
        }
#endif // __ARM_NEON

        // scalar part: whatever the vector loop left over
        for (; i<size; i++)
        {
            dst[i] = src[i] + bias;
        }
    }

    return 0;
}

// Add the per-channel bias to every element, in place.
// With NEON, full groups of four floats are handled by vector adds and the
// rest by the scalar loop; without NEON the scalar loop does all the work.
// Always returns 0.
int Bias_arm::forward_inplace(Mat& bottom_top_blob) const
{
    const int channels = bottom_top_blob.c;
    const int size = bottom_top_blob.w * bottom_top_blob.h;

    const float* bias_ptr = bias_data;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* data = bottom_top_blob.channel(q);

        const float bias = bias_ptr[q];

        int i = 0;

#if __ARM_NEON
        // vector part: as many complete groups of four as fit in size
        const float32x4_t _bias = vdupq_n_f32(bias);
        for (; i+3<size; i+=4)
        {
            const float32x4_t _in = vld1q_f32(data + i);
            vst1q_f32(data + i, vaddq_f32(_in, _bias));
        }
#endif // __ARM_NEON

        // scalar part: whatever the vector loop left over
        for (; i<size; i++)
        {
            data[i] = data[i] + bias;
        }
    }

    return 0;
}

} // namespace ncnn

+ 32
- 0
src/layer/arm/bias_arm.h View File

@@ -0,0 +1,32 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_BIAS_ARM_H
#define LAYER_BIAS_ARM_H

#include "bias.h"

namespace ncnn {

// ARM variant of Bias. It overrides both inference entry points with
// implementations that use NEON when __ARM_NEON is defined; everything
// else is inherited from Bias.
class Bias_arm : public Bias
{
public:
virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

virtual int forward_inplace(Mat& bottom_top_blob) const;
};

} // namespace ncnn

#endif // LAYER_BIAS_ARM_H

+ 543
- 0
src/layer/arm/convolution_1x1.h View File

@@ -0,0 +1,543 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

// 1x1 convolution with stride 1.
// top_blob has to be allocated by the caller; its w / h / c are read here
// and never set. For each output channel p the plane is first filled with
// the bias (0 when _bias is empty) and then every input channel q is
// accumulated into it, scaled by the weight kernel[p*inch + q].
// Input channels are consumed four at a time, then one at a time.
static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
{
// NOTE(review): w and h are read but not used below
int w = bottom_blob.w;
int h = bottom_blob.h;
int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const float* kernel = _kernel;
const float* bias = _bias;

#pragma omp parallel for
for (int p=0; p<outch; p++)
{
Mat out = top_blob.channel(p);

const float bias0 = bias ? bias[p] : 0.f;

// start the accumulation from the bias
out.fill(bias0);

int q = 0;

// main loop: four input channels per pass
for (; q+3<inch; q+=4)
{
float* outptr = out;

const float* img0 = bottom_blob.channel(q);
const float* img1 = bottom_blob.channel(q+1);
const float* img2 = bottom_blob.channel(q+2);
const float* img3 = bottom_blob.channel(q+3);

// the four weights for (p, q..q+3) are contiguous
const float* kernel0 = kernel + p*inch + q;
const float k0 = kernel0[0];
const float k1 = kernel0[1];
const float k2 = kernel0[2];
const float k3 = kernel0[3];

const float* r0 = img0;
const float* r1 = img1;
const float* r2 = img2;
const float* r3 = img3;

// the whole plane is walked as one flat run of outw * outh floats
int size = outw * outh;

#if __ARM_NEON
// nn groups of eight floats, remain leftover floats
int nn = size >> 3;
int remain = size & 7;
#else
int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
float32x4_t _k0 = vdupq_n_f32(k0);
float32x4_t _k1 = vdupq_n_f32(k1);
float32x4_t _k2 = vdupq_n_f32(k2);
float32x4_t _k3 = vdupq_n_f32(k3);
#if __aarch64__
for (; nn>0; nn--)
{
float32x4_t _p = vld1q_f32(r0);
float32x4_t _pn = vld1q_f32(r0+4);

float32x4_t _outp = vld1q_f32(outptr);
float32x4_t _outpn = vld1q_f32(outptr+4);

_outp = vfmaq_f32(_outp, _p, _k0);
_outpn = vfmaq_f32(_outpn, _pn, _k0);

float32x4_t _p1 = vld1q_f32(r1);
float32x4_t _p1n = vld1q_f32(r1+4);

_outp = vfmaq_f32(_outp, _p1, _k1);
_outpn = vfmaq_f32(_outpn, _p1n, _k1);

float32x4_t _p2 = vld1q_f32(r2);
float32x4_t _p2n = vld1q_f32(r2+4);

_outp = vfmaq_f32(_outp, _p2, _k2);
_outpn = vfmaq_f32(_outpn, _p2n, _k2);

float32x4_t _p3 = vld1q_f32(r3);
float32x4_t _p3n = vld1q_f32(r3+4);

_outp = vfmaq_f32(_outp, _p3, _k3);
_outpn = vfmaq_f32(_outpn, _p3n, _k3);

vst1q_f32(outptr, _outp);
vst1q_f32(outptr+4, _outpn);

r0 += 8;
r1 += 8;
r2 += 8;
r3 += 8;
outptr += 8;
}
#else
if (nn > 0)
{
// The first eight floats of r0 are loaded before the loop and the next
// eight at the end of every iteration, so the final iteration loads one
// group too many; the trailing "sub %2, #32" rewinds r0 by those 32 bytes.
// NOTE(review): that last load reads eight floats past the data consumed
// by the vector loop - presumably covered by padding, confirm.
// All loads and stores carry a :128 alignment hint.
asm volatile(
"pld [%2, #256] \n"
"vld1.f32 {d4-d7}, [%2 :128]! \n"
"0: \n"
"pld [%1, #256] \n"
"vld1.f32 {d0-d3}, [%1 :128] \n"
"vmla.f32 q0, q2, %q12 \n"
"vmla.f32 q1, q3, %q12 \n"
"pld [%3, #256] \n"
"vld1.f32 {d4-d7}, [%3 :128]! \n"
"vmla.f32 q0, q2, %q13 \n"
"vmla.f32 q1, q3, %q13 \n"
"pld [%4, #256] \n"
"vld1.f32 {d4-d7}, [%4 :128]! \n"
"vmla.f32 q0, q2, %q14 \n"
"vmla.f32 q1, q3, %q14 \n"
"pld [%5, #256] \n"
"vld1.f32 {d4-d7}, [%5 :128]! \n"
"vmla.f32 q0, q2, %q15 \n"
"vmla.f32 q1, q3, %q15 \n"
"pld [%2, #256] \n"
"vld1.f32 {d4-d7}, [%2 :128]! \n"
"subs %0, #1 \n"
"vst1.f32 {d0-d3}, [%1 :128]! \n"
"bne 0b \n"
"sub %2, #32 \n"
: "=r"(nn), // %0
"=r"(outptr), // %1
"=r"(r0), // %2
"=r"(r1), // %3
"=r"(r2), // %4
"=r"(r3) // %5
: "0"(nn),
"1"(outptr),
"2"(r0),
"3"(r1),
"4"(r2),
"5"(r3),
"w"(_k0), // %12
"w"(_k1), // %13
"w"(_k2), // %14
"w"(_k3) // %15
: "cc", "memory", "q0", "q1", "q2", "q3"
);
}
#endif // __aarch64__
#endif // __ARM_NEON
// scalar path for the leftover elements (or all of them without NEON)
for (; remain>0; remain--)
{
float sum = *r0 * k0;
float sum1 = *r1 * k1;
float sum2 = *r2 * k2;
float sum3 = *r3 * k3;

*outptr += sum + sum1 + sum2 + sum3;

r0++;
r1++;
r2++;
r3++;
outptr++;
}

}

// tail loop: remaining input channels, one per pass
for (; q<inch; q++)
{
float* outptr = out;

const float* img0 = bottom_blob.channel(q);

const float* kernel0 = kernel + p*inch + q;
const float k0 = kernel0[0];

const float* r0 = img0;

int size = outw * outh;

#if __ARM_NEON
int nn = size >> 3;
int remain = size & 7;
#else
int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
float32x4_t _k0 = vdupq_n_f32(k0);
#if __aarch64__
for (; nn>0; nn--)
{
float32x4_t _p = vld1q_f32(r0);
float32x4_t _outp = vld1q_f32(outptr);

float32x4_t _pn = vld1q_f32(r0+4);
float32x4_t _outpn = vld1q_f32(outptr+4);

_outp = vfmaq_f32(_outp, _p, _k0);
_outpn = vfmaq_f32(_outpn, _pn, _k0);

vst1q_f32(outptr, _outp);
vst1q_f32(outptr+4, _outpn);

r0 += 8;
outptr += 8;
}
#else
if (nn > 0)
{
// same preload-and-rewind scheme as the four-channel loop above
asm volatile(
"pld [%2, #256] \n"
"vld1.f32 {d4-d7}, [%2 :128]! \n"
"0: \n"
"pld [%1, #256] \n"
"vld1.f32 {d0-d3}, [%1 :128] \n"
"vmla.f32 q0, q2, %q6 \n"
"vmla.f32 q1, q3, %q6 \n"
"pld [%2, #256] \n"
"vld1.f32 {d4-d7}, [%2 :128]! \n"
"subs %0, #1 \n"
"vst1.f32 {d0-d3}, [%1 :128]! \n"
"bne 0b \n"
"sub %2, #32 \n"
: "=r"(nn), // %0
"=r"(outptr), // %1
"=r"(r0) // %2
: "0"(nn),
"1"(outptr),
"2"(r0),
"w"(_k0) // %6
: "cc", "memory", "q0", "q1", "q2", "q3"
);
}
#endif // __aarch64__
#endif // __ARM_NEON
for (; remain>0; remain--)
{
float sum = *r0 * k0;

*outptr += sum;

r0++;
outptr++;
}

}
}

}

// 1x1 convolution with stride 2.
// top_blob has to be allocated by the caller; its w / h / c are read here
// and never set. For each output channel p the plane is first filled with
// the bias (0 when _bias is empty) and then every input channel q is
// accumulated into it, scaled by the weight kernel[p*inch + q]. Only every
// second input column of every second input row is sampled.
// Input channels are consumed four at a time, then one at a time.
static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    // after one output row the input pointer has advanced 2*outw floats;
    // skip the rest of that input row plus one whole row
    const int tailstep = w - 2*outw + w;

    const float* kernel = _kernel;
    const float* bias = _bias;

    #pragma omp parallel for
    for (int p=0; p<outch; p++)
    {
        Mat out = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;

        // start the accumulation from the bias
        out.fill(bias0);

        int q = 0;

        // main loop: four input channels per pass
        for (; q+3<inch; q+=4)
        {
            float* outptr = out;

            const float* img0 = bottom_blob.channel(q);
            const float* img1 = bottom_blob.channel(q+1);
            const float* img2 = bottom_blob.channel(q+2);
            const float* img3 = bottom_blob.channel(q+3);

            // the four weights for (p, q..q+3) are contiguous
            const float* kernel0 = kernel + p*inch + q;
            const float k0 = kernel0[0];
            const float k1 = kernel0[1];
            const float k2 = kernel0[2];
            const float k3 = kernel0[3];

            const float* r0 = img0;
            const float* r1 = img1;
            const float* r2 = img2;
            const float* r3 = img3;

            for (int i = 0; i < outh; i++)
            {
#if __ARM_NEON
                // nn groups of eight outputs, remain leftover outputs
                int nn = outw >> 3;
                int remain = outw & 7;
#else
                int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
                float32x4_t _k0 = vdupq_n_f32(k0);
                float32x4_t _k1 = vdupq_n_f32(k1);
                float32x4_t _k2 = vdupq_n_f32(k2);
                float32x4_t _k3 = vdupq_n_f32(k3);
#if __aarch64__
                for (; nn>0; nn--)
                {
                    // vld2q de-interleaves; val[0] holds the even columns
                    float32x4x2_t _px2 = vld2q_f32(r0);
                    float32x4_t _p = _px2.val[0];
                    float32x4_t _outp = vld1q_f32(outptr);

                    float32x4x2_t _pnx2 = vld2q_f32(r0+8);
                    float32x4_t _pn = _pnx2.val[0];
                    float32x4_t _outpn = vld1q_f32(outptr+4);

                    _outp = vmlaq_f32(_outp, _p, _k0);
                    _outpn = vmlaq_f32(_outpn, _pn, _k0);

                    float32x4x2_t _p1x2 = vld2q_f32(r1);
                    float32x4_t _p1 = _p1x2.val[0];
                    float32x4x2_t _p1nx2 = vld2q_f32(r1+8);
                    float32x4_t _p1n = _p1nx2.val[0];

                    _outp = vmlaq_f32(_outp, _p1, _k1);
                    _outpn = vmlaq_f32(_outpn, _p1n, _k1);

                    float32x4x2_t _p2x2 = vld2q_f32(r2);
                    float32x4_t _p2 = _p2x2.val[0];
                    float32x4x2_t _p2nx2 = vld2q_f32(r2+8);
                    float32x4_t _p2n = _p2nx2.val[0];

                    _outp = vmlaq_f32(_outp, _p2, _k2);
                    _outpn = vmlaq_f32(_outpn, _p2n, _k2);

                    float32x4x2_t _p3x2 = vld2q_f32(r3);
                    float32x4_t _p3 = _p3x2.val[0];
                    float32x4x2_t _p3nx2 = vld2q_f32(r3+8);
                    float32x4_t _p3n = _p3nx2.val[0];

                    _outp = vmlaq_f32(_outp, _p3, _k3);
                    _outpn = vmlaq_f32(_outpn, _p3n, _k3);

                    vst1q_f32(outptr, _outp);
                    // _outpn was loaded from outptr+4 and goes back there;
                    // the old outptr+8 store skipped four outputs and wrote
                    // past the current group
                    vst1q_f32(outptr+4, _outpn);

                    r0 += 16;
                    r1 += 16;
                    r2 += 16;
                    r3 += 16;
                    outptr += 8;
                }
#else
                if (nn > 0)
                {
                // Sixteen floats of r0 are loaded before the loop and again
                // at the end of every iteration, so the final iteration
                // loads one group too many; the trailing "sub %2, #64"
                // rewinds r0 by those 64 bytes.
                asm volatile(
                    "pld [%2, #512] \n"
                    "vld2.f32 {d4-d7}, [%2]! \n"
                    "vld2.f32 {d16-d19}, [%2]! \n"
                    "0: \n"
                    "pld [%1, #256] \n"
                    "vld1.f32 {d0-d3}, [%1] \n"
                    "vmla.f32 q0, q2, %q12 \n"
                    "vmla.f32 q1, q8, %q12 \n"
                    "pld [%3, #512] \n"
                    "vld2.f32 {d4-d7}, [%3]! \n"
                    "vld2.f32 {d16-d19}, [%3]! \n"
                    "vmla.f32 q0, q2, %q13 \n"
                    "vmla.f32 q1, q8, %q13 \n"
                    "pld [%4, #512] \n"
                    "vld2.f32 {d4-d7}, [%4]! \n"
                    "vld2.f32 {d16-d19}, [%4]! \n"
                    "vmla.f32 q0, q2, %q14 \n"
                    "vmla.f32 q1, q8, %q14 \n"
                    "pld [%5, #512] \n"
                    "vld2.f32 {d4-d7}, [%5]! \n"
                    "vld2.f32 {d16-d19}, [%5]! \n"
                    "vmla.f32 q0, q2, %q15 \n"
                    "vmla.f32 q1, q8, %q15 \n"
                    "pld [%2, #512] \n"
                    "vld2.f32 {d4-d7}, [%2]! \n"
                    "vld2.f32 {d16-d19}, [%2]! \n"
                    "subs %0, #1 \n"
                    "vst1.f32 {d0-d3}, [%1]! \n"
                    "bne 0b \n"
                    "sub %2, #64 \n"
                    : "=r"(nn), // %0
                      "=r"(outptr), // %1
                      "=r"(r0), // %2
                      "=r"(r1), // %3
                      "=r"(r2), // %4
                      "=r"(r3) // %5
                    : "0"(nn),
                      "1"(outptr),
                      "2"(r0),
                      "3"(r1),
                      "4"(r2),
                      "5"(r3),
                      "w"(_k0), // %12
                      "w"(_k1), // %13
                      "w"(_k2), // %14
                      "w"(_k3) // %15
                    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9"
                );
                }
#endif // __aarch64__
#endif // __ARM_NEON
                // scalar path for the leftover outputs (or all of them
                // without NEON); inputs advance two columns per output
                for (; remain>0; remain--)
                {
                    float sum = *r0 * k0;
                    float sum1 = *r1 * k1;
                    float sum2 = *r2 * k2;
                    float sum3 = *r3 * k3;

                    *outptr += sum + sum1 + sum2 + sum3;

                    r0 += 2;
                    r1 += 2;
                    r2 += 2;
                    r3 += 2;
                    outptr++;
                }

                // jump to the start of the input row two below
                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
                r3 += tailstep;
            }

        }

        // tail loop: remaining input channels, one per pass
        for (; q<inch; q++)
        {
            float* outptr = out;

            const float* img0 = bottom_blob.channel(q);

            const float* kernel0 = kernel + p*inch + q;
            const float k0 = kernel0[0];

            const float* r0 = img0;

            for (int i = 0; i < outh; i++)
            {
#if __ARM_NEON
                int nn = outw >> 3;
                int remain = outw & 7;
#else
                int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
                float32x4_t _k0 = vdupq_n_f32(k0);
#if __aarch64__
                for (; nn>0; nn--)
                {
                    float32x4x2_t _px2 = vld2q_f32(r0);
                    float32x4_t _p = _px2.val[0];
                    float32x4_t _outp = vld1q_f32(outptr);

                    float32x4x2_t _pnx2 = vld2q_f32(r0+8);
                    float32x4_t _pn = _pnx2.val[0];
                    float32x4_t _outpn = vld1q_f32(outptr+4);

                    _outp = vmlaq_f32(_outp, _p, _k0);
                    _outpn = vmlaq_f32(_outpn, _pn, _k0);

                    vst1q_f32(outptr, _outp);
                    vst1q_f32(outptr+4, _outpn);

                    r0 += 16;
                    outptr += 8;
                }
#else
                if (nn > 0)
                {
                // same preload-and-rewind scheme as the four-channel loop
                asm volatile(
                    "pld [%2, #512] \n"
                    "vld2.f32 {d4-d7}, [%2]! \n"
                    "vld2.f32 {d16-d19}, [%2]! \n"
                    "0: \n"
                    "pld [%1, #256] \n"
                    "vld1.f32 {d0-d3}, [%1] \n"
                    "vmla.f32 q0, q2, %q6 \n"
                    "vmla.f32 q1, q8, %q6 \n"
                    "pld [%2, #512] \n"
                    "vld2.f32 {d4-d7}, [%2]! \n"
                    "vld2.f32 {d16-d19}, [%2]! \n"
                    "subs %0, #1 \n"
                    "vst1.f32 {d0-d3}, [%1]! \n"
                    "bne 0b \n"
                    "sub %2, #64 \n"
                    : "=r"(nn), // %0
                      "=r"(outptr), // %1
                      "=r"(r0) // %2
                    : "0"(nn),
                      "1"(outptr),
                      "2"(r0),
                      "w"(_k0) // %6
                    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9"
                );
                }
#endif // __aarch64__
#endif // __ARM_NEON
                for (; remain>0; remain--)
                {
                    float sum = *r0 * k0;

                    *outptr += sum;

                    r0 += 2;
                    outptr++;
                }

                r0 += tailstep;
            }

        }
    }

}

+ 381
- 0
src/layer/arm/convolution_2x2.h View File

@@ -0,0 +1,381 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

// Accumulating 2x2 convolution with stride 1 over float data.
//
// bottom_blob : input; each of its `inch` channels is read as rows of `w` floats.
// top_blob    : output; every channel is first filled with its bias value and the
//               per-input-channel results are then added to it in place.
// _kernel     : weights read as kernel[(p*inch + q)*4 + k] for output channel p,
//               input channel q and tap k.  Taps 0,1 multiply the upper input row
//               (columns x, x+1), taps 2,3 the row below it.
// _bias       : read as one float per output channel; 0 is used when the pointer is null.
//
// Input channels are consumed two at a time, with a second loop for a leftover channel.
// NOTE(review): the "+= 1" at the end of every row assumes w == outw + 1 — confirm
// that callers only pass blobs with that relationship.
static void conv2x2s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
{
int w = bottom_blob.w;
// h is not referenced anywhere else in this function
int h = bottom_blob.h;
int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const float* kernel = _kernel;
const float* bias = _bias;

// each output channel is independent, so the outer loop is split across OpenMP threads
#pragma omp parallel for
for (int p=0; p<outch; p++)
{
Mat out = top_blob.channel(p);

const float bias0 = bias ? bias[p] : 0.f;

// start from the bias; everything below accumulates on top of it
out.fill(bias0);

int q = 0;

// ---- two input channels per pass ----
for (; q+1<inch; q+=2)
{
float* outptr = out;

const float* img0 = bottom_blob.channel(q);
const float* img1 = bottom_blob.channel(q+1);

// 4 weights for channel q followed by 4 weights for channel q+1
const float* kernel0 = kernel + p*inch*4 + q*4;
const float* kernel1 = kernel0 + 4;

// upper/lower input row of channel q
const float* r00 = img0;
const float* r01 = img0 + w;

// upper/lower input row of channel q+1
const float* r10 = img1;
const float* r11 = img1 + w;

#if __ARM_NEON
// lanes 0..3 hold taps 0..3 of the respective 2x2 kernel
float32x4_t _k0 = vld1q_f32(kernel0);
float32x4_t _k1 = vld1q_f32(kernel1);
#endif // __ARM_NEON

for (int i = 0; i < outh; i++)
{
#if __ARM_NEON
// nn = number of 4-wide output groups, remain = outputs left for the tail loop
int nn = outw >> 2;
int remain = outw & 3;
#else
int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
for (; nn>0; nn--)
{
// "...0" vectors start at column x, "...1" vectors at column x+1
float32x4_t _r000 = vld1q_f32(r00);
float32x4_t _r010 = vld1q_f32(r01);
float32x4_t _r001 = vld1q_f32(r00 + 1);
float32x4_t _r011 = vld1q_f32(r01 + 1);

float32x4_t _r100 = vld1q_f32(r10);
float32x4_t _r110 = vld1q_f32(r11);
float32x4_t _r101 = vld1q_f32(r10 + 1);
float32x4_t _r111 = vld1q_f32(r11 + 1);

float32x4_t _sum = vld1q_f32(outptr);

// channel q: taps 0,1 on the upper row, taps 2,3 on the lower row
_sum = vmlaq_lane_f32(_sum, _r000, vget_low_f32(_k0), 0);
_sum = vmlaq_lane_f32(_sum, _r001, vget_low_f32(_k0), 1);
_sum = vmlaq_lane_f32(_sum, _r010, vget_high_f32(_k0), 0);
_sum = vmlaq_lane_f32(_sum, _r011, vget_high_f32(_k0), 1);

// channel q+1, same tap layout
_sum = vmlaq_lane_f32(_sum, _r100, vget_low_f32(_k1), 0);
_sum = vmlaq_lane_f32(_sum, _r101, vget_low_f32(_k1), 1);
_sum = vmlaq_lane_f32(_sum, _r110, vget_high_f32(_k1), 0);
_sum = vmlaq_lane_f32(_sum, _r111, vget_high_f32(_k1), 1);

vst1q_f32(outptr, _sum);

r00 += 4;
r01 += 4;
r10 += 4;
r11 += 4;
outptr += 4;
}
#else
if (nn > 0)
{
// armv7 version of the loop above.  %e12/%f12 are the low/high halves of _k0
// (taps 0,1 / taps 2,3), %e13/%f13 the same for _k1.
asm volatile(
// preload the first 4 floats of each row: q0=r00, q2=r01, q12=r10, q14=r11
"pld [%1, #128] \n"
"vld1.f32 {d0-d1}, [%1]! \n"
"pld [%2, #128] \n"
"vld1.f32 {d4-d5}, [%2]! \n"

"pld [%3, #128] \n"
"vld1.f32 {d24-d25}, [%3]! \n"
"pld [%4, #128] \n"
"vld1.f32 {d28-d29}, [%4]! \n"

"0: \n"
"pld [%5, #128] \n"
"vld1.f32 {d18-d19}, [%5] \n"// q9 = sum

// two independent accumulators: q8 starts from a product, q9 from the stored output
"vmul.f32 q8, q0, %e12[0] \n"
"vmla.f32 q9, q2, %f12[0] \n"

// look ahead: next 4 floats of r00/r01 into q1/q3
"pld [%1, #128] \n"
"vld1.f32 {d2-d3}, [%1]! \n"

"pld [%2, #128] \n"
"vld1.f32 {d6-d7}, [%2]! \n"

// q10/q11 = r00/r01 shifted by one column
"vext.f32 q10, q0, q1, #1 \n"
"vext.f32 q11, q2, q3, #1 \n"

"vmla.f32 q8, q12, %e13[0] \n"
"vmla.f32 q9, q14, %f13[0] \n"

// look ahead: next 4 floats of r10/r11 into q13/q15
"pld [%3, #128] \n"
"vld1.f32 {d26-d27}, [%3]! \n"

"pld [%4, #128] \n"
"vld1.f32 {d30-d31}, [%4]! \n"

"vmla.f32 q8, q10, %e12[1] \n"
"vmla.f32 q9, q11, %f12[1] \n"

// q10/q11 = r10/r11 shifted by one column
"vext.f32 q10, q12, q13, #1 \n"
"vext.f32 q11, q14, q15, #1 \n"

"vmla.f32 q8, q10, %e13[1] \n"
"vmla.f32 q9, q11, %f13[1] \n"

// the look-ahead data becomes the current data of the next iteration
"vorr q0, q1, q1 \n"
"vorr q2, q3, q3 \n"

"vadd.f32 q8, q8, q9 \n"

"vorr q12, q13, q13 \n"
"vorr q14, q15, q15 \n"

"subs %0, #1 \n"

"vst1.f32 {d16-d17}, [%5]! \n"

"bne 0b \n"
// undo the last look-ahead load (16 bytes = 4 floats) on every input pointer
"sub %1, #16 \n"
"sub %2, #16 \n"
"sub %3, #16 \n"
"sub %4, #16 \n"
: "=r"(nn), // %0
"=r"(r00), // %1
"=r"(r01), // %2
"=r"(r10), // %3
"=r"(r11), // %4
"=r"(outptr) // %5
: "0"(nn),
"1"(r00),
"2"(r01),
"3"(r10),
"4"(r11),
"5"(outptr),
"w"(_k0), // %12
"w"(_k1) // %13
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // __aarch64__
#endif // __ARM_NEON

// tail: one output at a time
for (; remain>0; remain--)
{
#if __ARM_NEON
// pack [r00[0], r00[1], r01[0], r01[1]] so it lines up with the 4 taps in _k0
float32x2_t _r00 = vld1_f32(r00);
float32x2_t _r01 = vld1_f32(r01);
float32x4_t _r00r1 = vcombine_f32(_r00, _r01);
float32x4_t _s0s1 = vmulq_f32(_r00r1, _k0);

float32x2_t _r10 = vld1_f32(r10);
float32x2_t _r11 = vld1_f32(r11);
float32x4_t _r10r1 = vcombine_f32(_r10, _r11);
_s0s1 = vmlaq_f32(_s0s1, _r10r1, _k1);

// horizontal sum of the 4 lanes, result in lane 0
float32x2_t _s = vadd_f32(vget_low_f32(_s0s1), vget_high_f32(_s0s1));
_s = vpadd_f32(_s, _s);
*outptr += vget_lane_f32(_s, 0);
#else
float sum = 0.f;

sum += r00[0] * kernel0[0];
sum += r00[1] * kernel0[1];
sum += r01[0] * kernel0[2];
sum += r01[1] * kernel0[3];

sum += r10[0] * kernel1[0];
sum += r10[1] * kernel1[1];
sum += r11[0] * kernel1[2];
sum += r11[1] * kernel1[3];

*outptr += sum;
#endif // __ARM_NEON

r00 += 1;
r01 += 1;
r10 += 1;
r11 += 1;
outptr++;
}

// step over the one remaining input column to reach the start of the next row
r00 += 1;
r01 += 1;
r10 += 1;
r11 += 1;
}
}

// ---- leftover input channel (taken when inch is odd) ----
for (; q<inch; q++)
{
float* outptr = out;

const float* img0 = bottom_blob.channel(q);

const float* kernel0 = kernel + p*inch*4 + q*4;

// upper/lower input row
const float* r0 = img0;
const float* r1 = img0 + w;

#if __ARM_NEON
// each tap broadcast to all 4 lanes
float32x4_t _k0 = vdupq_n_f32(kernel0[0]);
float32x4_t _k1 = vdupq_n_f32(kernel0[1]);
float32x4_t _k2 = vdupq_n_f32(kernel0[2]);
float32x4_t _k3 = vdupq_n_f32(kernel0[3]);
#endif // __ARM_NEON

for (int i = 0; i < outh; i++)
{
#if __ARM_NEON
int nn = outw >> 2;
int remain = outw & 3;
#else
int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
for (; nn>0; nn--)
{
float32x4_t _r00 = vld1q_f32(r0);
float32x4_t _r10 = vld1q_f32(r1);
float32x4_t _r01 = vld1q_f32(r0 + 1);
float32x4_t _r11 = vld1q_f32(r1 + 1);

float32x4_t _sum = vld1q_f32(outptr);
float32x4_t _sum2;

// two accumulators (_sum, _sum2) that are added together before the store
_sum = vmlaq_f32(_sum, _r00, _k0);
_sum2 = vmulq_f32(_r01, _k1);
_sum = vmlaq_f32(_sum, _r10, _k2);
_sum2 = vmlaq_f32(_sum2, _r11, _k3);

_sum = vaddq_f32(_sum, _sum2);

vst1q_f32(outptr, _sum);

r0 += 4;
r1 += 4;
outptr += 4;
}
#else
if (nn > 0)
{
// armv7 version of the loop above; %q8..%q11 are the broadcast taps _k0.._k3
asm volatile(
// preload: q0 = 4 floats of r0, q2 = 4 floats of r1
"pld [%1, #128] \n"
"vld1.f32 {d0-d1}, [%1]! \n"
"pld [%2, #128] \n"
"vld1.f32 {d4-d5}, [%2]! \n"

"0: \n"
"pld [%3, #128] \n"
"vld1.f32 {d18-d19}, [%3] \n"// q9 = sum

"vmul.f32 q8, q0, %q8 \n"
"vmla.f32 q9, q2, %q10 \n"

// look ahead on r0, then q10 = r0 shifted by one column
"pld [%1, #128] \n"
"vld1.f32 {d2-d3}, [%1]! \n"
"vext.f32 q10, q0, q1, #1 \n"

"vmla.f32 q8, q10, %q9 \n"

// look ahead on r1, then q11 = r1 shifted by one column
"pld [%2, #128] \n"
"vld1.f32 {d6-d7}, [%2]! \n"
"vext.f32 q11, q2, q3, #1 \n"

"vmla.f32 q9, q11, %q11 \n"

"vorr q0, q1, q1 \n"
"vadd.f32 q8, q8, q9 \n"
"vorr q2, q3, q3 \n"

"subs %0, #1 \n"
"vst1.f32 {d16-d17}, [%3]! \n"
"bne 0b \n"
// undo the last look-ahead load on both input pointers
"sub %1, #16 \n"
"sub %2, #16 \n"
: "=r"(nn), // %0
"=r"(r0), // %1
"=r"(r1), // %2
"=r"(outptr) // %3
: "0"(nn),
"1"(r0),
"2"(r1),
"3"(outptr),
"w"(_k0), // %8
"w"(_k1), // %9
"w"(_k2), // %10
"w"(_k3) // %11
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
);
}
#endif // __aarch64__
#endif // __ARM_NEON

#if __ARM_NEON
// all 4 taps in one vector for the tail loop (reloaded for every output row)
float32x4_t _k0123 = vld1q_f32(kernel0);
#endif

// tail: one output at a time
for (; remain>0; remain--)
{
#if __ARM_NEON
// [r0[0], r0[1], r1[0], r1[1]] * [k0, k1, k2, k3], then horizontal sum
float32x2_t _r0 = vld1_f32(r0);
float32x2_t _r1 = vld1_f32(r1);
float32x4_t _r0r1 = vcombine_f32(_r0, _r1);
float32x4_t _s0s1 = vmulq_f32(_r0r1, _k0123);
float32x2_t _s = vadd_f32(vget_low_f32(_s0s1), vget_high_f32(_s0s1));
_s = vpadd_f32(_s, _s);
*outptr += vget_lane_f32(_s, 0);
#else
float sum = 0.f;
sum += r0[0] * kernel0[0];
sum += r0[1] * kernel0[1];
sum += r1[0] * kernel0[2];
sum += r1[1] * kernel0[3];
*outptr += sum;
#endif

r0 += 1;
r1 += 1;
outptr++;
}

// step over the one remaining input column to reach the start of the next row
r0 += 1;
r1 += 1;

}

}
}

}

+ 753
- 0
src/layer/arm/convolution_3x3.h View File

@@ -0,0 +1,753 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

// Accumulating 3x3 convolution with stride 1 over float data.
//
// bottom_blob : input; each of its `inch` channels is read as rows of `w` floats.
// top_blob    : output; every channel is first filled with its bias value and the
//               per-input-channel results are then added to it in place.
// _kernel     : weights read as 9 consecutive floats per (output channel, input channel)
//               pair, starting at kernel + p*inch*9 and advancing by 9 per input channel;
//               floats 0..2 / 3..5 / 6..8 are the taps for the 1st / 2nd / 3rd input row.
// _bias       : read as one float per output channel; 0 is used when the pointer is null.
//
// Output rows are produced two at a time (sharing the two middle input rows), with a
// second loop for a leftover row.
// NOTE(review): the end-of-row pointer steps of +2 assume w == outw + 2 — confirm
// that callers only pass blobs with that relationship.
static void conv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
{
int w = bottom_blob.w;
// h is not referenced anywhere else in this function
int h = bottom_blob.h;
int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const float* kernel = _kernel;
const float* bias = _bias;

// each output channel is independent, so the outer loop is split across OpenMP threads
#pragma omp parallel for
for (int p=0; p<outch; p++)
{
Mat out = top_blob.channel(p);

const float bias0 = bias ? bias[p] : 0.f;

// start from the bias; everything below accumulates on top of it
out.fill(bias0);

// first 3x3 kernel of this output channel; advanced by 9 at the end of each q iteration
const float* kernel0 = kernel + p*inch*9;

for (int q=0; q<inch; q++)
{
// outptr / outptr2 address two vertically adjacent output rows
float* outptr = out;
float* outptr2 = outptr + outw;

const float* img0 = bottom_blob.channel(q);

// four consecutive input rows: r0..r2 feed outptr, r1..r3 feed outptr2
const float* r0 = img0;
const float* r1 = img0 + w;
const float* r2 = img0 + w*2;
const float* r3 = img0 + w*3;

// the three kernel rows (used by the non-NEON tail code)
const float* k0 = kernel0;
const float* k1 = kernel0 + 3;
const float* k2 = kernel0 + 6;

#if __ARM_NEON
// Each vector holds one kernel row in lanes 0..2; lane 3 is the float that follows
// that row in memory and is never used as a tap below.
// NOTE(review): _k6789 reads kernel0[9], one float past this 3x3 kernel — for the
// last kernel that is past the p*inch*9 weights indexed here; confirm the buffer
// has at least one float of slack.
float32x4_t _k0123 = vld1q_f32(kernel0);
float32x4_t _k3456 = vld1q_f32(kernel0+3);
float32x4_t _k6789 = vld1q_f32(kernel0+6);
#endif // __ARM_NEON

int i = 0;

// ---- two output rows per pass ----
for (; i+1 < outh; i+=2)
{

#if __ARM_NEON
// nn = number of 4-wide output groups, remain = outputs left for the tail loop
int nn = outw >> 2;
int remain = outw & 3;
#else
int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
for (; nn>0; nn--)
{
// _sum1/_sum2 accumulate the upper output row, _sum3/_sum4 the lower one;
// each pair is added together before the store
float32x4_t _sum1 = vld1q_f32(outptr);
float32x4_t _sum2 = vdupq_n_f32(0.f);
float32x4_t _sum3 = vld1q_f32(outptr2);
float32x4_t _sum4 = vdupq_n_f32(0.f);

// for every input row: columns x..x+3, and the same shifted by 1 and by 2
float32x4_t _r00 = vld1q_f32(r0);
float32x4_t _r00n = vld1q_f32(r0 + 4);
float32x4_t _r01 = vextq_f32(_r00, _r00n, 1);
float32x4_t _r02 = vextq_f32(_r00, _r00n, 2);

float32x4_t _r10 = vld1q_f32(r1);
float32x4_t _r10n = vld1q_f32(r1 + 4);
float32x4_t _r11 = vextq_f32(_r10, _r10n, 1);
float32x4_t _r12 = vextq_f32(_r10, _r10n, 2);

float32x4_t _r20 = vld1q_f32(r2);
float32x4_t _r20n = vld1q_f32(r2 + 4);
float32x4_t _r21 = vextq_f32(_r20, _r20n, 1);
float32x4_t _r22 = vextq_f32(_r20, _r20n, 2);

float32x4_t _r30 = vld1q_f32(r3);
float32x4_t _r30n = vld1q_f32(r3 + 4);
float32x4_t _r31 = vextq_f32(_r30, _r30n, 1);
float32x4_t _r32 = vextq_f32(_r30, _r30n, 2);

// upper output row: input rows r0, r1, r2
_sum1 = vfmaq_laneq_f32(_sum1, _r00, _k0123, 0);
_sum2 = vfmaq_laneq_f32(_sum2, _r01, _k0123, 1);
_sum1 = vfmaq_laneq_f32(_sum1, _r02, _k0123, 2);
_sum2 = vfmaq_laneq_f32(_sum2, _r10, _k3456, 0);
_sum1 = vfmaq_laneq_f32(_sum1, _r11, _k3456, 1);
_sum2 = vfmaq_laneq_f32(_sum2, _r12, _k3456, 2);
_sum1 = vfmaq_laneq_f32(_sum1, _r20, _k6789, 0);
_sum2 = vfmaq_laneq_f32(_sum2, _r21, _k6789, 1);
_sum1 = vfmaq_laneq_f32(_sum1, _r22, _k6789, 2);

// lower output row: input rows r1, r2, r3
_sum3 = vfmaq_laneq_f32(_sum3, _r10, _k0123, 0);
_sum4 = vfmaq_laneq_f32(_sum4, _r11, _k0123, 1);
_sum3 = vfmaq_laneq_f32(_sum3, _r12, _k0123, 2);
_sum4 = vfmaq_laneq_f32(_sum4, _r20, _k3456, 0);
_sum3 = vfmaq_laneq_f32(_sum3, _r21, _k3456, 1);
_sum4 = vfmaq_laneq_f32(_sum4, _r22, _k3456, 2);
_sum3 = vfmaq_laneq_f32(_sum3, _r30, _k6789, 0);
_sum4 = vfmaq_laneq_f32(_sum4, _r31, _k6789, 1);
_sum3 = vfmaq_laneq_f32(_sum3, _r32, _k6789, 2);

_sum1 = vaddq_f32(_sum1, _sum2);
_sum3 = vaddq_f32(_sum3, _sum4);

vst1q_f32(outptr, _sum1);
vst1q_f32(outptr2, _sum3);

r0 += 4;
r1 += 4;
r2 += 4;
r3 += 4;
outptr += 4;
outptr2 += 4;
}
#else
if (nn > 0)
{
// armv7 version of the loop above.
// Register use: q9 + low half of q10 = 6 floats of the current input row,
// q11/q12 = that row shifted by 1/2 columns,
// q7 + q6 + q13 = accumulators of the upper output row,
// q8 + q14 + q15 = accumulators of the lower output row.
// %e14/%f14, %e15/%f15, %e16/%f16 are the low/high halves of _k0123, _k3456, _k6789.
asm volatile(
"veor q6, q6 \n"
"veor q15, q15 \n"

// preload r0; only 16 bytes are consumed, the extra 2 floats feed the vext shifts
"pld [%3, #192] \n"
"vld1.f32 {d18-d20}, [%3 :64] \n"// r0
"add %3, #16 \n"

"veor q13, q13 \n"
"veor q14, q14 \n"

"vext.32 q11, q9, q10, #1 \n"
"vext.32 q12, q9, q10, #2 \n"

"0: \n"

"pld [%1, #128] \n"
"vld1.f32 {d14-d15}, [%1 :64] \n"// _sum

// input row r0 * kernel row 0 -> upper output row
"vmla.f32 q7, q9, %e14[0] \n"
"vmla.f32 q6, q11, %e14[1] \n"
"vmla.f32 q13, q12, %f14[0] \n"

"pld [%4, #192] \n"
"vld1.f32 {d18-d20}, [%4] \n"// r1
"add %4, #16 \n"

// input row r1 * kernel row 1 -> upper output row
"vmla.f32 q7, q9, %e15[0] \n"

"vext.32 q11, q9, q10, #1 \n"
"vext.32 q12, q9, q10, #2 \n"

"vmla.f32 q6, q11, %e15[1] \n"
"vmla.f32 q13, q12, %f15[0] \n"

"pld [%2, #128] \n"
"vld1.f32 {d16-d17}, [%2] \n"// _sum2

// input row r1 * kernel row 0 -> lower output row
"vmla.f32 q8, q9, %e14[0] \n"
"vmla.f32 q14, q11, %e14[1] \n"
"vmla.f32 q15, q12, %f14[0] \n"

"pld [%5, #192] \n"
"vld1.f32 {d18-d20}, [%5 :64] \n"// r2
"add %5, #16 \n"

// input row r2 * kernel row 2 -> upper output row
"vmla.f32 q7, q9, %e16[0] \n"

"vext.32 q11, q9, q10, #1 \n"
"vext.32 q12, q9, q10, #2 \n"

"vmla.f32 q6, q11, %e16[1] \n"
"vmla.f32 q13, q12, %f16[0] \n"

// input row r2 * kernel row 1 -> lower output row
"vmla.f32 q8, q9, %e15[0] \n"
"vmla.f32 q14, q11, %e15[1] \n"
"vmla.f32 q15, q12, %f15[0] \n"

"pld [%6, #192] \n"
"vld1.f32 {d18-d20}, [%6] \n"// r3
"add %6, #16 \n"

// input row r3 * kernel row 2 -> lower output row
"vmla.f32 q8, q9, %e16[0] \n"

"vext.32 q11, q9, q10, #1 \n"
"vext.32 q12, q9, q10, #2 \n"

"vmla.f32 q14, q11, %e16[1] \n"
"vmla.f32 q15, q12, %f16[0] \n"

// fold the partial accumulators and clear them for the next iteration,
// interleaved with the look-ahead load of r0
"vadd.f32 q7, q7, q6 \n"
"veor q6, q6 \n"

"pld [%3, #192] \n"
"vld1.f32 {d18-d20}, [%3 :64] \n"// r0

"vadd.f32 q8, q8, q14 \n"
"veor q14, q14 \n"
"vadd.f32 q7, q7, q13 \n"
"veor q13, q13 \n"
"vadd.f32 q8, q8, q15 \n"
"veor q15, q15 \n"

"vext.32 q11, q9, q10, #1 \n"
"vext.32 q12, q9, q10, #2 \n"

"add %3, #16 \n"

"vst1.f32 {d14-d15}, [%1]! \n"
"vst1.f32 {d16-d17}, [%2]! \n"

"subs %0, #1 \n"
"bne 0b \n"

// undo the last look-ahead advance of r0
"sub %3, #16 \n"
: "=r"(nn), // %0
"=r"(outptr), // %1
"=r"(outptr2), // %2
"=r"(r0), // %3
"=r"(r1), // %4
"=r"(r2), // %5
"=r"(r3) // %6
: "0"(nn),
"1"(outptr),
"2"(outptr2),
"3"(r0),
"4"(r1),
"5"(r2),
"6"(r3),
"w"(_k0123), // %14
"w"(_k3456), // %15
"w"(_k6789) // %16
: "cc", "memory", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // __aarch64__
#endif // __ARM_NEON
// tail: one output column (two outputs, one per row) at a time
for (; remain>0; remain--)
{
#if __ARM_NEON
// each load brings in 4 floats although only columns 0..2 are taps;
// lane 3 of the products is discarded below
float32x4_t _r00 = vld1q_f32(r0);
float32x4_t _r10 = vld1q_f32(r1);
float32x4_t _r20 = vld1q_f32(r2);
float32x4_t _r30 = vld1q_f32(r3);

float32x4_t _sum = vmulq_f32(_r00, _k0123);
_sum = vmlaq_f32(_sum, _r10, _k3456);
_sum = vmlaq_f32(_sum, _r20, _k6789);

float32x4_t _sum2 = vmulq_f32(_r10, _k0123);
_sum2 = vmlaq_f32(_sum2, _r20, _k3456);
_sum2 = vmlaq_f32(_sum2, _r30, _k6789);

// replace the unused lane 3 with the current output value, so the horizontal
// add below yields "old output + 9 products"
_sum = vsetq_lane_f32(*outptr, _sum, 3);
_sum2 = vsetq_lane_f32(*outptr2, _sum2, 3);

#if __aarch64__
*outptr = vaddvq_f32(_sum);
*outptr2 = vaddvq_f32(_sum2);
#else
float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
float32x2_t _ss2 = vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2));

// lane 0 = total of _sum, lane 1 = total of _sum2
float32x2_t _sss2 = vpadd_f32(_ss, _ss2);

*outptr = vget_lane_f32(_sss2, 0);
*outptr2 = vget_lane_f32(_sss2, 1);
#endif // __aarch64__
#else
float sum = 0;
float sum2 = 0;

sum += r0[0] * k0[0];
sum += r0[1] * k0[1];
sum += r0[2] * k0[2];
sum += r1[0] * k1[0];
sum += r1[1] * k1[1];
sum += r1[2] * k1[2];
sum += r2[0] * k2[0];
sum += r2[1] * k2[1];
sum += r2[2] * k2[2];

sum2 += r1[0] * k0[0];
sum2 += r1[1] * k0[1];
sum2 += r1[2] * k0[2];
sum2 += r2[0] * k1[0];
sum2 += r2[1] * k1[1];
sum2 += r2[2] * k1[2];
sum2 += r3[0] * k2[0];
sum2 += r3[1] * k2[1];
sum2 += r3[2] * k2[2];

*outptr += sum;
*outptr2 += sum2;
#endif
r0++;
r1++;
r2++;
r3++;
outptr++;
outptr2++;
}

// skip the 2 remaining columns of this row plus one whole row (two rows were produced)
r0 += 2 + w;
r1 += 2 + w;
r2 += 2 + w;
r3 += 2 + w;

// each output pointer skips the row the other pointer has just written
outptr += outw;
outptr2 += outw;
}

// ---- leftover output row (taken when outh is odd) ----
for (; i < outh; i++)
{

#if __ARM_NEON
int nn = outw >> 2;
int remain = outw & 3;
#else
int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
for (; nn>0; nn--)
{
float32x4_t _sum1 = vld1q_f32(outptr);
float32x4_t _sum2 = vdupq_n_f32(0.f);

float32x4_t _r00 = vld1q_f32(r0);
float32x4_t _r00n = vld1q_f32(r0 + 4);
float32x4_t _r01 = vextq_f32(_r00, _r00n, 1);
float32x4_t _r02 = vextq_f32(_r00, _r00n, 2);

float32x4_t _r10 = vld1q_f32(r1);
float32x4_t _r10n = vld1q_f32(r1 + 4);
float32x4_t _r11 = vextq_f32(_r10, _r10n, 1);
float32x4_t _r12 = vextq_f32(_r10, _r10n, 2);

float32x4_t _r20 = vld1q_f32(r2);
float32x4_t _r20n = vld1q_f32(r2 + 4);
float32x4_t _r21 = vextq_f32(_r20, _r20n, 1);
float32x4_t _r22 = vextq_f32(_r20, _r20n, 2);

_sum1 = vfmaq_laneq_f32(_sum1, _r00, _k0123, 0);
_sum2 = vfmaq_laneq_f32(_sum2, _r01, _k0123, 1);
_sum1 = vfmaq_laneq_f32(_sum1, _r02, _k0123, 2);
_sum2 = vfmaq_laneq_f32(_sum2, _r10, _k3456, 0);
_sum1 = vfmaq_laneq_f32(_sum1, _r11, _k3456, 1);
_sum2 = vfmaq_laneq_f32(_sum2, _r12, _k3456, 2);
_sum1 = vfmaq_laneq_f32(_sum1, _r20, _k6789, 0);
_sum2 = vfmaq_laneq_f32(_sum2, _r21, _k6789, 1);
_sum1 = vfmaq_laneq_f32(_sum1, _r22, _k6789, 2);

_sum1 = vaddq_f32(_sum1, _sum2);

vst1q_f32(outptr, _sum1);

r0 += 4;
r1 += 4;
r2 += 4;
outptr += 4;
}
#else
if (nn > 0)
{
// armv7 version of the loop above.
// Register use: q8 + low half of q9 = 6 floats of the current input row,
// q10/q11 = that row shifted by 1/2 columns,
// q7 + q13 + q14 = accumulators of the output row.
// %e10/%f10, %e11/%f11, %e12/%f12 are the low/high halves of _k0123, _k3456, _k6789.
asm volatile(
"pld [%2, #192] \n"
"vld1.f32 {d16-d18}, [%2] \n"// r0
"add %2, #16 \n"

"veor q13, q13 \n"
"veor q14, q14 \n"

"vext.32 q10, q8, q9, #1 \n"
"vext.32 q11, q8, q9, #2 \n"

"0: \n"

"pld [%1, #128] \n"
"vld1.f32 {d14-d15}, [%1] \n"// _sum

"vmla.f32 q7, q8, %e10[0] \n"
"vmla.f32 q13, q10, %e10[1] \n"
"vmla.f32 q14, q11, %f10[0] \n"

"pld [%3, #192] \n"
"vld1.f32 {d16-d18}, [%3] \n"// r1
"add %3, #16 \n"

"vmla.f32 q7, q8, %e11[0] \n"

"vext.32 q10, q8, q9, #1 \n"
"vext.32 q11, q8, q9, #2 \n"

"vmla.f32 q13, q10, %e11[1] \n"
"vmla.f32 q14, q11, %f11[0] \n"

"pld [%4, #192] \n"
"vld1.f32 {d16-d18}, [%4] \n"// r2
"add %4, #16 \n"

"vmla.f32 q7, q8, %e12[0] \n"

"vext.32 q10, q8, q9, #1 \n"
"vext.32 q11, q8, q9, #2 \n"

"vmla.f32 q13, q10, %e12[1] \n"
"vmla.f32 q14, q11, %f12[0] \n"

// look-ahead load of r0 for the next iteration
"pld [%2, #192] \n"
"vld1.f32 {d16-d18}, [%2] \n"// r0
"add %2, #16 \n"

// fold the partial accumulators and clear them for the next iteration
"vadd.f32 q7, q7, q13 \n"
"veor q13, q13 \n"
"vadd.f32 q7, q7, q14 \n"
"veor q14, q14 \n"

"vext.32 q10, q8, q9, #1 \n"
"vext.32 q11, q8, q9, #2 \n"

"vst1.f32 {d14-d15}, [%1]! \n"

"subs %0, #1 \n"
"bne 0b \n"

// undo the last look-ahead advance of r0
"sub %2, #16 \n"
: "=r"(nn), // %0
"=r"(outptr), // %1
"=r"(r0), // %2
"=r"(r1), // %3
"=r"(r2) // %4
: "0"(nn),
"1"(outptr),
"2"(r0),
"3"(r1),
"4"(r2),
"w"(_k0123), // %10
"w"(_k3456), // %11
"w"(_k6789) // %12
: "cc", "memory", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // __aarch64__
#endif // __ARM_NEON
// tail: one output at a time
for (; remain>0; remain--)
{
#if __ARM_NEON
// 4-float loads; lane 3 of the products is discarded below
float32x4_t _r00 = vld1q_f32(r0);
float32x4_t _r10 = vld1q_f32(r1);
float32x4_t _r20 = vld1q_f32(r2);

float32x4_t _sum = vmulq_f32(_r00, _k0123);
_sum = vmlaq_f32(_sum, _r10, _k3456);
_sum = vmlaq_f32(_sum, _r20, _k6789);

// put the current output value into the unused lane 3 before the horizontal add
_sum = vsetq_lane_f32(*outptr, _sum, 3);

#if __aarch64__
*outptr = vaddvq_f32(_sum);
#else
float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
_ss = vpadd_f32(_ss, _ss);

*outptr = vget_lane_f32(_ss, 0);
#endif // __aarch64__
#else
float sum = 0;

sum += r0[0] * k0[0];
sum += r0[1] * k0[1];
sum += r0[2] * k0[2];
sum += r1[0] * k1[0];
sum += r1[1] * k1[1];
sum += r1[2] * k1[2];
sum += r2[0] * k2[0];
sum += r2[1] * k2[1];
sum += r2[2] * k2[2];

*outptr += sum;
#endif
r0++;
r1++;
r2++;
outptr++;
}

// skip the 2 remaining columns of this row
r0 += 2;
r1 += 2;
r2 += 2;
}

// next input channel uses the next 3x3 kernel
kernel0 += 9;
}
}

}

// Accumulating 3x3 convolution with stride 2 over float data.
//
// bottom_blob : input; each of its `inch` channels is read as rows of `w` floats.
// top_blob    : output; every channel is first filled with its bias value and the
//               per-input-channel results are then added to it in place.
// _kernel     : weights read as 9 consecutive floats per (output channel, input channel)
//               pair, starting at kernel + p*inch*9 and advancing by 9 per input channel;
//               floats 0..2 / 3..5 / 6..8 are the taps for the 1st / 2nd / 3rd input row.
// _bias       : read as one float per output channel; 0 is used when the pointer is null.
static void conv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
{
int w = bottom_blob.w;
// h is not referenced anywhere else in this function
int h = bottom_blob.h;
int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

// After one output row the input pointers have advanced 2*outw floats; adding
// tailstep (= 2*w - 2*outw) places them at the start of the row two below.
const int tailstep = w - 2*outw + w;

const float* kernel = _kernel;
const float* bias = _bias;

// each output channel is independent, so the outer loop is split across OpenMP threads
#pragma omp parallel for
for (int p=0; p<outch; p++)
{
Mat out = top_blob.channel(p);

const float bias0 = bias ? bias[p] : 0.f;

// start from the bias; everything below accumulates on top of it
out.fill(bias0);

// first 3x3 kernel of this output channel; advanced by 9 at the end of each q iteration
const float* kernel0 = kernel + p*inch*9;

for (int q=0; q<inch; q++)
{
float* outptr = out;
// outptr2 is not referenced anywhere else in this function
float* outptr2 = outptr + outw;

const float* img0 = bottom_blob.channel(q);

// three consecutive input rows
const float* r0 = img0;
const float* r1 = img0 + w;
const float* r2 = img0 + w*2;

// the three kernel rows
const float* k0 = kernel0;
const float* k1 = kernel0 + 3;
const float* k2 = kernel0 + 6;

#if __ARM_NEON
// Each vector holds one kernel row in lanes 0..2; lane 3 is the float that follows
// that row in memory and is never used as a tap below.
// NOTE(review): _k6789 reads k2[3], one float past this 3x3 kernel — confirm the
// kernel buffer has at least one float of slack after the last kernel.
float32x4_t _k0123 = vld1q_f32(k0);
float32x4_t _k3456 = vld1q_f32(k1);
float32x4_t _k6789 = vld1q_f32(k2);
#endif // __ARM_NEON

int i = 0;

for (; i < outh; i++)
{
#if __ARM_NEON
// nn = number of 4-wide output groups, remain = outputs left for the tail loop
int nn = outw >> 2;
int remain = outw & 3;
#else
int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
for (; nn>0; nn--)
{
float32x4_t _outp = vld1q_f32(outptr);

// vld2q de-interleaves: val[0] = even columns, val[1] = odd columns.
// The second load (r0+8) is only used for its first even element (column 8).
float32x4x2_t _r0 = vld2q_f32(r0);
float32x4x2_t _r0n = vld2q_f32(r0+8);

float32x4_t _r00 = _r0.val[0];// 0 2 4 6
float32x4_t _r01 = _r0.val[1];// 1 3 5 7
float32x4_t _r02 = vextq_f32(_r00, _r0n.val[0], 1);// 2 4 6 8

_outp = vfmaq_laneq_f32(_outp, _r00, _k0123, 0);
_outp = vfmaq_laneq_f32(_outp, _r01, _k0123, 1);
_outp = vfmaq_laneq_f32(_outp, _r02, _k0123, 2);

// same pattern for the second input row / kernel row
float32x4x2_t _r1 = vld2q_f32(r1);
float32x4x2_t _r1n = vld2q_f32(r1+8);

float32x4_t _r10 = _r1.val[0];
float32x4_t _r11 = _r1.val[1];
float32x4_t _r12 = vextq_f32(_r10, _r1n.val[0], 1);

_outp = vfmaq_laneq_f32(_outp, _r10, _k3456, 0);
_outp = vfmaq_laneq_f32(_outp, _r11, _k3456, 1);
_outp = vfmaq_laneq_f32(_outp, _r12, _k3456, 2);

// same pattern for the third input row / kernel row
float32x4x2_t _r2 = vld2q_f32(r2);
float32x4x2_t _r2n = vld2q_f32(r2+8);

float32x4_t _r20 = _r2.val[0];
float32x4_t _r21 = _r2.val[1];
float32x4_t _r22 = vextq_f32(_r20, _r2n.val[0], 1);

_outp = vfmaq_laneq_f32(_outp, _r20, _k6789, 0);
_outp = vfmaq_laneq_f32(_outp, _r21, _k6789, 1);
_outp = vfmaq_laneq_f32(_outp, _r22, _k6789, 2);

vst1q_f32(outptr, _outp);

// 4 outputs consume 8 input columns
r0 += 8;
r1 += 8;
r2 += 8;
outptr += 4;
}
#else
if (nn > 0)
{
// armv7 version of the loop above.
// Register use: q2/q3 = even/odd columns of the current 8 floats,
// q8 = even columns of the following 8 floats (loaded without
// advancing the pointer), q1 = even columns shifted by one,
// q0 + q10 + q11 = accumulators of the output.
// %e10/%f10, %e11/%f11, %e12/%f12 are the low/high halves of _k0123, _k3456, _k6789.
asm volatile(
// preload the first 8 floats of r0
"pld [%2, #256] \n"
"vld2.f32 {d4-d7}, [%2]! \n"

"veor q10, q10 \n"
"veor q11, q11 \n"

"0: \n"
"pld [%1, #128] \n"
"vld1.f32 {d0-d1}, [%1] \n"

"vmla.f32 q0, q2, %e10[0] \n"
"vmla.f32 q10, q3, %e10[1] \n"

"pld [%2, #256] \n"
"vld2.f32 {d16-d19}, [%2] \n"
"vext.32 q1, q2, q8, #1 \n"

"vmla.f32 q11, q1, %f10[0] \n"

"pld [%3, #256] \n"
"vld2.f32 {d4-d7}, [%3]! \n"

"vmla.f32 q0, q2, %e11[0] \n"
"vmla.f32 q10, q3, %e11[1] \n"

"pld [%3, #256] \n"
"vld2.f32 {d16-d19}, [%3] \n"
"vext.32 q1, q2, q8, #1 \n"

"vmla.f32 q11, q1, %f11[0] \n"

"pld [%4, #256] \n"
"vld2.f32 {d4-d7}, [%4]! \n"

"vmla.f32 q0, q2, %e12[0] \n"
"vmla.f32 q10, q3, %e12[1] \n"

"pld [%4, #256] \n"
"vld2.f32 {d16-d19}, [%4] \n"
"vext.32 q1, q2, q8, #1 \n"

"vmla.f32 q11, q1, %f12[0] \n"

// look-ahead load of r0 for the next iteration
"pld [%2, #256] \n"
"vld2.f32 {d4-d7}, [%2]! \n"

// fold the partial accumulators and clear them for the next iteration
"vadd.f32 q0, q0, q10 \n"
"veor q10, q10 \n"
"vadd.f32 q0, q0, q11 \n"
"veor q11, q11 \n"

"subs %0, #1 \n"
"vst1.f32 {d0-d1}, [%1]! \n"
"bne 0b \n"
// undo the last look-ahead load of r0 (32 bytes = 8 floats)
"sub %2, #32 \n"
: "=r"(nn), // %0
"=r"(outptr), // %1
"=r"(r0), // %2
"=r"(r1), // %3
"=r"(r2) // %4
: "0"(nn),
"1"(outptr),
"2"(r0),
"3"(r1),
"4"(r2),
"w"(_k0123), // %10
"w"(_k3456), // %11
"w"(_k6789) // %12
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // __aarch64__
#endif // __ARM_NEON
// tail: one output at a time
for (; remain>0; remain--)
{
#if __ARM_NEON
// 4-float loads although only columns 0..2 are taps; lane 3 of the products is
// discarded below
float32x4_t _r00 = vld1q_f32(r0);
float32x4_t _r10 = vld1q_f32(r1);
float32x4_t _r20 = vld1q_f32(r2);

float32x4_t _sum = vmulq_f32(_r00, _k0123);
_sum = vmlaq_f32(_sum, _r10, _k3456);
_sum = vmlaq_f32(_sum, _r20, _k6789);

// put the current output value into the unused lane 3, so the horizontal add
// below yields "old output + 9 products"
_sum = vsetq_lane_f32(*outptr, _sum, 3);

#if __aarch64__
*outptr = vaddvq_f32(_sum);
#else
float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
_ss = vpadd_f32(_ss, _ss);

*outptr = vget_lane_f32(_ss, 0);
#endif // __aarch64__
#else
float sum = 0;

sum += r0[0] * k0[0];
sum += r0[1] * k0[1];
sum += r0[2] * k0[2];
sum += r1[0] * k1[0];
sum += r1[1] * k1[1];
sum += r1[2] * k1[2];
sum += r2[0] * k2[0];
sum += r2[1] * k2[1];
sum += r2[2] * k2[2];

*outptr += sum;
#endif // __ARM_NEON

// stride 2: one output consumes two input columns
r0 += 2;
r1 += 2;
r2 += 2;
outptr++;
}

// jump to the start of the input row two below
r0 += tailstep;
r1 += tailstep;
r2 += tailstep;
}

// next input channel uses the next 3x3 kernel
kernel0 += 9;
}
}
}

+ 340
- 0
src/layer/arm/convolution_4x4.h View File

@@ -0,0 +1,340 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

// Accumulating 4x4 convolution with stride 4 over float data (non-overlapping 4x4 tiles).
//
// bottom_blob : input; each of its `inch` channels is read as rows of `w` floats.
// top_blob    : output; every channel is first filled with its bias value and the
//               per-input-channel results are then added to it in place.
// _kernel     : weights read as kernel[(p*inch + q)*16 + k] for output channel p,
//               input channel q and tap k; taps 0..3 / 4..7 / 8..11 / 12..15 multiply
//               the 1st / 2nd / 3rd / 4th row of the tile.
// _bias       : read as one float per output channel; 0 is used when the pointer is null.
//
// Fixes relative to the previous revision:
//  * the aarch64 vector loop stored the fresh products instead of "output + products",
//    which dropped the bias and the contribution of all earlier input channels;
//  * the armv7 tail loop advanced the input pointers twice per output (once through
//    post-increment loads, once through the shared "r += 4");
//  * the end-of-row pointer step was only correct when w == 4*outw.
static void conv4x4s4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    // After one output row every input pointer has advanced 4*outw floats; the next
    // output row starts 4 input rows (4*w floats) after the start of the current one.
    // This equals w*3 when w == 4*outw and additionally skips the unused right-hand
    // columns when w is not a multiple of 4.
    const int tailstep = w * 4 - outw * 4;

    const float* kernel = _kernel;
    const float* bias = _bias;

    // each output channel is independent, so the outer loop is split across OpenMP threads
    #pragma omp parallel for
    for (int p=0; p<outch; p++)
    {
        Mat out = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;

        // start from the bias; everything below accumulates on top of it
        out.fill(bias0);

        for (int q=0; q<inch; q++)
        {
            float* outptr = out;

            const float* img0 = bottom_blob.channel(q);

            const float* kernel0 = kernel + p*inch*16 + q*16;

            // the four input rows of the current tile row
            const float* r0 = img0;
            const float* r1 = img0 + w;
            const float* r2 = img0 + w*2;
            const float* r3 = img0 + w*3;

#if __ARM_NEON
            // one vector per kernel row
            float32x4_t _k0123 = vld1q_f32(kernel0);
            float32x4_t _k4567 = vld1q_f32(kernel0+4);
            float32x4_t _k891011 = vld1q_f32(kernel0+8);
            float32x4_t _k12131415 = vld1q_f32(kernel0+12);
#else
            const float* k0 = kernel0;
            const float* k1 = kernel0 + 4;
            const float* k2 = kernel0 + 8;
            const float* k3 = kernel0 + 12;
#endif // __ARM_NEON

            for (int i = 0; i < outh; i++)
            {
#if __ARM_NEON
                // nn = number of 4-wide output groups, remain = outputs left for the tail loop
                int nn = outw >> 2;
                int remain = outw - (nn << 2);
#else
                int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
                for (; nn>0; nn--)
                {
                    // _rRT = row R of tile T (4 tiles -> 4 outputs per iteration)
                    float32x4_t _r00 = vld1q_f32(r0);
                    float32x4_t _r10 = vld1q_f32(r1);
                    float32x4_t _r20 = vld1q_f32(r2);
                    float32x4_t _r30 = vld1q_f32(r3);

                    float32x4_t _r01 = vld1q_f32(r0 + 4);
                    float32x4_t _r11 = vld1q_f32(r1 + 4);
                    float32x4_t _r21 = vld1q_f32(r2 + 4);
                    float32x4_t _r31 = vld1q_f32(r3 + 4);

                    float32x4_t _r02 = vld1q_f32(r0 + 8);
                    float32x4_t _r12 = vld1q_f32(r1 + 8);
                    float32x4_t _r22 = vld1q_f32(r2 + 8);
                    float32x4_t _r32 = vld1q_f32(r3 + 8);

                    float32x4_t _r03 = vld1q_f32(r0 + 12);
                    float32x4_t _r13 = vld1q_f32(r1 + 12);
                    float32x4_t _r23 = vld1q_f32(r2 + 12);
                    float32x4_t _r33 = vld1q_f32(r3 + 12);

                    // per tile: column-wise partial sums over the four rows
                    float32x4_t _sum0 = vmulq_f32(_r00, _k0123);
                    float32x4_t _sum1 = vmulq_f32(_r01, _k0123);
                    float32x4_t _sum2 = vmulq_f32(_r02, _k0123);
                    float32x4_t _sum3 = vmulq_f32(_r03, _k0123);

                    _sum0 = vfmaq_f32(_sum0, _r10, _k4567);
                    _sum1 = vfmaq_f32(_sum1, _r11, _k4567);
                    _sum2 = vfmaq_f32(_sum2, _r12, _k4567);
                    _sum3 = vfmaq_f32(_sum3, _r13, _k4567);

                    _sum0 = vfmaq_f32(_sum0, _r20, _k891011);
                    _sum1 = vfmaq_f32(_sum1, _r21, _k891011);
                    _sum2 = vfmaq_f32(_sum2, _r22, _k891011);
                    _sum3 = vfmaq_f32(_sum3, _r23, _k891011);

                    _sum0 = vfmaq_f32(_sum0, _r30, _k12131415);
                    _sum1 = vfmaq_f32(_sum1, _r31, _k12131415);
                    _sum2 = vfmaq_f32(_sum2, _r32, _k12131415);
                    _sum3 = vfmaq_f32(_sum3, _r33, _k12131415);

                    // two rounds of pairwise adds reduce each tile to one lane:
                    // _sum = [total(tile0), total(tile1), total(tile2), total(tile3)]
                    float32x4_t _s01 = vpaddq_f32(_sum0, _sum1);
                    float32x4_t _s23 = vpaddq_f32(_sum2, _sum3);
                    float32x4_t _sum = vpaddq_f32(_s01, _s23);

                    float32x4_t _outp = vld1q_f32(outptr);

                    _outp = vaddq_f32(_outp, _sum);

                    // store the accumulated value, not the bare products
                    vst1q_f32(outptr, _outp);

                    r0 += 16;
                    r1 += 16;
                    r2 += 16;
                    r3 += 16;
                    outptr += 4;
                }
#else
                if (nn > 0)
                {
                    // armv7 version of the loop above: four tiles per iteration, the
                    // per-tile sums are collected in q5, q6, q14, q15 and reduced to
                    // one 4-lane vector in q5 before being added to the output (q7).
                    asm volatile(

                        "pld        [%1, #128]          \n"

                        "0:                             \n"

                        "pld        [%2, #512]          \n"
                        "pld        [%3, #512]          \n"

                        "vld1.f32   {d14-d15}, [%1]     \n"// q7 = outptr

                        "vld1.f32   {d16-d17}, [%2]!    \n"// q8 = r0
                        "vld1.f32   {d18-d19}, [%3]!    \n"// q9 = r1

                        "pld        [%4, #512]          \n"
                        "pld        [%5, #512]          \n"

                        "vmul.f32   q12, q8, %q12       \n"
                        "vmul.f32   q13, q9, %q13       \n"

                        "vld1.f32   {d20-d21}, [%4]!    \n"// q10 = r2
                        "vld1.f32   {d22-d23}, [%5]!    \n"// q11 = r3

                        "vmla.f32   q12, q10, %q14      \n"
                        "vmla.f32   q13, q11, %q15      \n"

                        "vadd.f32   q5, q12, q13        \n"// q5 = tile 0

                        "vld1.f32   {d16-d17}, [%2]!    \n"// q8 = r0
                        "vld1.f32   {d18-d19}, [%3]!    \n"// q9 = r1

                        "vmul.f32   q12, q8, %q12       \n"
                        "vmul.f32   q13, q9, %q13       \n"

                        "vld1.f32   {d20-d21}, [%4]!    \n"// q10 = r2
                        "vld1.f32   {d22-d23}, [%5]!    \n"// q11 = r3

                        "vmla.f32   q12, q10, %q14      \n"
                        "vmla.f32   q13, q11, %q15      \n"

                        "vadd.f32   q6, q12, q13        \n"// q6 = tile 1

                        "vld1.f32   {d16-d17}, [%2]!    \n"// q8 = r0
                        "vld1.f32   {d18-d19}, [%3]!    \n"// q9 = r1

                        "vmul.f32   q12, q8, %q12       \n"
                        "vmul.f32   q13, q9, %q13       \n"

                        "vld1.f32   {d20-d21}, [%4]!    \n"// q10 = r2
                        "vld1.f32   {d22-d23}, [%5]!    \n"// q11 = r3

                        "vmla.f32   q12, q10, %q14      \n"
                        "vmla.f32   q13, q11, %q15      \n"

                        "vadd.f32   q14, q12, q13       \n"// q14 = tile 2

                        "vld1.f32   {d16-d17}, [%2]!    \n"// q8 = r0
                        "vld1.f32   {d18-d19}, [%3]!    \n"// q9 = r1

                        "vmul.f32   q12, q8, %q12       \n"
                        "vmul.f32   q13, q9, %q13       \n"

                        "vld1.f32   {d20-d21}, [%4]!    \n"// q10 = r2
                        "vld1.f32   {d22-d23}, [%5]!    \n"// q11 = r3

                        "vmla.f32   q12, q10, %q14      \n"
                        "vmla.f32   q13, q11, %q15      \n"

                        "vadd.f32   q15, q12, q13       \n"// q15 = tile 3

                        // horizontal reduction: q5 = [tile0, tile1, tile2, tile3]
                        "vadd.f32   d10, d10, d11       \n"
                        "vadd.f32   d28, d28, d29       \n"
                        "vadd.f32   d11, d12, d13       \n"
                        "vadd.f32   d29, d30, d31       \n"

                        "vpadd.f32  d10, d10, d11       \n"
                        "vpadd.f32  d11, d28, d29       \n"

                        "vadd.f32   q7, q7, q5          \n"

                        "vst1.f32   {d14-d15}, [%1]!    \n"

                        "pld        [%1, #128]          \n"

                        "subs       %0, #1              \n"
                        "bne        0b                  \n"
                        : "=r"(nn),     // %0
                          "=r"(outptr), // %1
                          "=r"(r0),     // %2
                          "=r"(r1),     // %3
                          "=r"(r2),     // %4
                          "=r"(r3)      // %5
                        : "0"(nn),
                          "1"(outptr),
                          "2"(r0),
                          "3"(r1),
                          "4"(r2),
                          "5"(r3),
                          "w"(_k0123),      // %12
                          "w"(_k4567),      // %13
                          "w"(_k891011),    // %14
                          "w"(_k12131415)   // %15
                        : "cc", "memory", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
                    );
                }
#endif // __aarch64__
#endif // __ARM_NEON
                // tail: one output (one 4x4 tile) at a time
                for (; remain>0; remain--)
                {
#if __ARM_NEON
#if __aarch64__
                    float32x4_t _r0 = vld1q_f32(r0);
                    float32x4_t _r1 = vld1q_f32(r1);
                    float32x4_t _r2 = vld1q_f32(r2);
                    float32x4_t _r3 = vld1q_f32(r3);

                    float32x4_t _sum = vmulq_f32(_r0, _k0123);
                    _sum = vmlaq_f32(_sum, _r1, _k4567);
                    _sum = vmlaq_f32(_sum, _r2, _k891011);
                    _sum = vmlaq_f32(_sum, _r3, _k12131415);

                    *outptr += vaddvq_f32(_sum);
#else
                    float sum = 0.f;

                    // The loads deliberately use no post-increment: the pointers are
                    // advanced once, by the "r += 4" shared with the other code paths below.
                    asm volatile(
                        "vld1.f32   {d16-d17}, [%0]     \n"// q8 = r0
                        "vld1.f32   {d18-d19}, [%1]     \n"// q9 = r1

                        "vmul.f32   q12, q8, %q9        \n"
                        "vmul.f32   q13, q9, %q10       \n"

                        "vld1.f32   {d20-d21}, [%2]     \n"// q10 = r2
                        "vld1.f32   {d22-d23}, [%3]     \n"// q11 = r3

                        "vmla.f32   q12, q10, %q11      \n"
                        "vmla.f32   q13, q11, %q12      \n"

                        // horizontal sum of the tile into lane 0 of d10
                        "vadd.f32   q5, q12, q13        \n"
                        "vadd.f32   d10, d10, d11       \n"
                        "vpadd.f32  d10, d10, d10       \n"
                        "vmov.f32   %4, d10[0]          \n"
                        : "=r"(r0),     // %0
                          "=r"(r1),     // %1
                          "=r"(r2),     // %2
                          "=r"(r3),     // %3
                          "=r"(sum)     // %4
                        : "0"(r0),
                          "1"(r1),
                          "2"(r2),
                          "3"(r3),
                          "w"(_k0123),      // %9
                          "w"(_k4567),      // %10
                          "w"(_k891011),    // %11
                          "w"(_k12131415)   // %12
                        : "cc", "memory", "q5", "q6", "q8", "q9", "q10", "q11", "q12", "q13"
                    );

                    *outptr += sum;
#endif // __aarch64__
#else
                    float sum = 0;

                    sum += r0[0] * k0[0];
                    sum += r0[1] * k0[1];
                    sum += r0[2] * k0[2];
                    sum += r0[3] * k0[3];

                    sum += r1[0] * k1[0];
                    sum += r1[1] * k1[1];
                    sum += r1[2] * k1[2];
                    sum += r1[3] * k1[3];

                    sum += r2[0] * k2[0];
                    sum += r2[1] * k2[1];
                    sum += r2[2] * k2[2];
                    sum += r2[3] * k2[3];

                    sum += r3[0] * k3[0];
                    sum += r3[1] * k3[1];
                    sum += r3[2] * k3[2];
                    sum += r3[3] * k3[3];

                    *outptr += sum;
#endif // __ARM_NEON
                    // stride 4: one output consumes four input columns
                    r0 += 4;
                    r1 += 4;
                    r2 += 4;
                    r3 += 4;
                    outptr++;
                }

                // jump to the start of the tile row four input rows below
                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
                r3 += tailstep;
            }

        }
    }

}


+ 1251
- 0
src/layer/arm/convolution_5x5.h
File diff suppressed because it is too large
View File


+ 1073
- 0
src/layer/arm/convolution_7x7.h
File diff suppressed because it is too large
View File


+ 120
- 0
src/layer/arm/convolution_arm.cpp View File

@@ -0,0 +1,120 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "convolution_arm.h"

namespace ncnn {

#include "convolution_1x1.h"
#include "convolution_2x2.h"
#include "convolution_3x3.h"
#include "convolution_4x4.h"
#include "convolution_5x5.h"
#include "convolution_7x7.h"

DEFINE_LAYER_CREATOR(Convolution_arm)

// Run a 2-D convolution, dispatching to a hand-written NEON kernel when one
// exists for the layer's (kernel_size, stride) pair and falling back to the
// generic Convolution::forward otherwise.
//
// bottom_blob  input feature maps
// top_blob     output; (re)created here as outw x outh x num_output
//
// Returns 0 on success, -100 when an intermediate or output blob could not
// be allocated, or whatever Convolution::forward returns on the fallback path.
int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
{
    // convolv with NxN kernel
    // value = value + bias

    // The dispatch table below is indexed by [kernel_size-1][stride-1], so
    // both values must lie inside 1..7 / 1..4. Non-positive values would
    // otherwise index before the start of the table.
    if (kernel_size < 1 || kernel_size > 7 || stride < 1 || stride > 4 || dilation != 1)
    {
        return Convolution::forward(bottom_blob, top_blob);
    }

    typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&);

    // kernel_size x stride
    // A null entry means no specialised kernel is available.
    static const conv_func conv_func_table[7][4] =
    {
        {
            conv1x1s1_neon,
            conv1x1s2_neon,
            0,
            0
        }, // kernel_size = 1
        {
            conv2x2s1_neon,
            0,
            0,
            0
        }, // kernel_size = 2
        {
            conv3x3s1_neon,
            conv3x3s2_neon,
            0,
            0
        }, // kernel_size = 3
        {
            0,
            0,
            0,
            conv4x4s4_neon
        }, // kernel_size = 4
        {
            conv5x5s1_neon,
            conv5x5s2_neon,
            0,
            0
        }, // kernel_size = 5
        {
            0,
            0,
            0,
            0
        }, // kernel_size = 6
        {
            conv7x7s1_neon,
            conv7x7s2_neon,
            0,
            0
        }  // kernel_size = 7
    };

    conv_func conv = conv_func_table[kernel_size-1][stride-1];
    if (!conv)
    {
        return Convolution::forward(bottom_blob, top_blob);
    }

    int w = bottom_blob.w;
    int h = bottom_blob.h;

    // Zero-pad the input on all four sides when the layer asks for padding;
    // the NEON kernels themselves do no border handling here.
    Mat bottom_blob_bordered = bottom_blob;
    if (pad > 0)
    {
        copy_make_border(bottom_blob, bottom_blob_bordered, pad, pad, pad, pad, BORDER_CONSTANT, 0.f);
        if (bottom_blob_bordered.empty())
            return -100;

        w = bottom_blob_bordered.w;
        h = bottom_blob_bordered.h;
    }

    int outw = (w - kernel_size) / stride + 1;
    int outh = (h - kernel_size) / stride + 1;

    top_blob.create(outw, outh, num_output);
    if (top_blob.empty())
        return -100;

    conv(bottom_blob_bordered, top_blob, weight_data, bias_data);

    return 0;
}

} // namespace ncnn

+ 30
- 0
src/layer/arm/convolution_arm.h View File

@@ -0,0 +1,30 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_CONVOLUTION_ARM_H
#define LAYER_CONVOLUTION_ARM_H

#include "convolution.h"

namespace ncnn {

// ARM-specific convolution layer. Derives from the generic Convolution layer
// and overrides forward(); see convolution_arm.cpp for the implementation.
class Convolution_arm : public Convolution
{
public:
// Computes the convolution of the input blob into the output blob.
// Returns an int status code.
virtual int forward(const Mat& bottom_blobs, Mat& top_blobs) const;
};

} // namespace ncnn

#endif // LAYER_CONVOLUTION_ARM_H

+ 574
- 0
src/layer/arm/eltwise_arm.cpp View File

@@ -0,0 +1,574 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "eltwise_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

namespace ncnn {

DEFINE_LAYER_CREATOR(Eltwise_arm)

// Element-wise reduction of all bottom blobs into top_blobs[0].
//
// Depending on op_type the blobs are combined with a product, a plain sum,
// a coefficient-weighted sum (when num_coeff != 0) or a maximum. The first
// two blobs are combined into the output, then every further blob is folded
// into the output in place. Each channel is processed four floats at a time
// with NEON (intrinsics on aarch64, inline assembly on 32-bit ARM) and the
// leftover elements with scalar code.
//
// The output has the shape of bottom_blobs[0]. Returns 0 on success and
// -100 if the output blob could not be allocated.
// NOTE(review): bottom_blobs[1] is read unconditionally, so callers are
// presumably expected to pass at least two blobs - confirm.
int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int size = w * h;

    Mat& top_blob = top_blobs[0];
    top_blob.create(w, h, channels);
    if (top_blob.empty())
        return -100;

    if (op_type == Operation_PROD)
    {
        // first blob
        const Mat& bottom_blob1 = bottom_blobs[1];
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            const float* ptr1 = bottom_blob1.channel(q);
            float* outptr = top_blob.channel(q);

#if __ARM_NEON
            int nn = size >> 2;
            int remain = size - (nn << 2);
#else
            int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
            for (; nn>0; nn--)
            {
                float32x4_t _ptr = vld1q_f32(ptr);
                float32x4_t _ptr1 = vld1q_f32(ptr1);
                float32x4_t _p = vmulq_f32(_ptr, _ptr1);
                vst1q_f32(outptr, _p);

                ptr += 4;
                ptr1 += 4;
                outptr += 4;
            }
#else
            if (nn > 0)
            {
            asm volatile(
                "0: \n"
                "pld [%1, #128] \n"
                "pld [%2, #128] \n"
                "vld1.f32 {d0-d1}, [%1 :128]! \n"
                "vld1.f32 {d2-d3}, [%2 :128]! \n"
                "vmul.f32 q0, q0, q1 \n"
                "subs %0, #1 \n"
                "vst1.f32 {d0-d1}, [%3 :128]! \n"
                "bne 0b \n"
                : "=r"(nn), // %0
                  "=r"(ptr), // %1
                  "=r"(ptr1), // %2
                  "=r"(outptr) // %3
                : "0"(nn),
                  "1"(ptr),
                  "2"(ptr1),
                  "3"(outptr)
                : "cc", "memory", "q0", "q1"
            );
            }
#endif // __aarch64__
#endif // __ARM_NEON
            for (; remain>0; remain--)
            {
                *outptr = *ptr * *ptr1;

                ptr++;
                ptr1++;
                outptr++;
            }
        }

        // remaining blobs are multiplied into the output in place
        for (size_t b=2; b<bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob1 = bottom_blobs[b];
            #pragma omp parallel for
            for (int q=0; q<channels; q++)
            {
                const float* ptr = bottom_blob1.channel(q);
                float* outptr = top_blob.channel(q);

#if __ARM_NEON
                int nn = size >> 2;
                int remain = size - (nn << 2);
#else
                int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
                for (; nn>0; nn--)
                {
                    float32x4_t _ptr = vld1q_f32(ptr);
                    float32x4_t _p = vld1q_f32(outptr);
                    _p = vmulq_f32(_ptr, _p);
                    vst1q_f32(outptr, _p);

                    ptr += 4;
                    outptr += 4;
                }
#else
                if (nn > 0)
                {
                asm volatile(
                    "0: \n"
                    "pld [%1, #128] \n"
                    "pld [%2, #128] \n"
                    "vld1.f32 {d0-d1}, [%1 :128]! \n"
                    "vld1.f32 {d2-d3}, [%2 :128] \n"
                    "vmul.f32 q0, q0, q1 \n"
                    "subs %0, #1 \n"
                    "vst1.f32 {d0-d1}, [%2 :128]! \n"
                    "bne 0b \n"
                    : "=r"(nn), // %0
                      "=r"(ptr), // %1
                      "=r"(outptr) // %2
                    : "0"(nn),
                      "1"(ptr),
                      "2"(outptr)
                    : "cc", "memory", "q0", "q1"
                );
                }
#endif // __aarch64__
#endif // __ARM_NEON
                for (; remain>0; remain--)
                {
                    *outptr *= *ptr;

                    ptr++;
                    outptr++;
                }
            }
        }
    }
    else if (op_type == Operation_SUM)
    {
        if (num_coeff == 0)
        {
            // first blob
            const Mat& bottom_blob1 = bottom_blobs[1];
            #pragma omp parallel for
            for (int q=0; q<channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
                const float* ptr1 = bottom_blob1.channel(q);
                float* outptr = top_blob.channel(q);

#if __ARM_NEON
                int nn = size >> 2;
                int remain = size - (nn << 2);
#else
                int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
                for (; nn>0; nn--)
                {
                    float32x4_t _ptr = vld1q_f32(ptr);
                    float32x4_t _ptr1 = vld1q_f32(ptr1);
                    float32x4_t _p = vaddq_f32(_ptr, _ptr1);
                    vst1q_f32(outptr, _p);

                    ptr += 4;
                    ptr1 += 4;
                    outptr += 4;
                }
#else
                if (nn > 0)
                {
                asm volatile(
                    "0: \n"
                    "pld [%1, #128] \n"
                    "pld [%2, #128] \n"
                    "vld1.f32 {d0-d1}, [%1 :128]! \n"
                    "vld1.f32 {d2-d3}, [%2 :128]! \n"
                    "vadd.f32 q0, q0, q1 \n"
                    "subs %0, #1 \n"
                    "vst1.f32 {d0-d1}, [%3 :128]! \n"
                    "bne 0b \n"
                    : "=r"(nn), // %0
                      "=r"(ptr), // %1
                      "=r"(ptr1), // %2
                      "=r"(outptr) // %3
                    : "0"(nn),
                      "1"(ptr),
                      "2"(ptr1),
                      "3"(outptr)
                    : "cc", "memory", "q0", "q1"
                );
                }
#endif // __aarch64__
#endif // __ARM_NEON
                for (; remain>0; remain--)
                {
                    *outptr = *ptr + *ptr1;

                    ptr++;
                    ptr1++;
                    outptr++;
                }
            }

            // remaining blobs are added into the output in place
            for (size_t b=2; b<bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob1 = bottom_blobs[b];
                #pragma omp parallel for
                for (int q=0; q<channels; q++)
                {
                    const float* ptr = bottom_blob1.channel(q);
                    float* outptr = top_blob.channel(q);

#if __ARM_NEON
                    int nn = size >> 2;
                    int remain = size - (nn << 2);
#else
                    int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
                    for (; nn>0; nn--)
                    {
                        float32x4_t _ptr = vld1q_f32(ptr);
                        float32x4_t _p = vld1q_f32(outptr);
                        _p = vaddq_f32(_ptr, _p);
                        vst1q_f32(outptr, _p);

                        ptr += 4;
                        outptr += 4;
                    }
#else
                    if (nn > 0)
                    {
                    asm volatile(
                        "0: \n"
                        "pld [%1, #128] \n"
                        "pld [%2, #128] \n"
                        "vld1.f32 {d0-d1}, [%1 :128]! \n"
                        "vld1.f32 {d2-d3}, [%2 :128] \n"
                        "vadd.f32 q0, q0, q1 \n"
                        "subs %0, #1 \n"
                        "vst1.f32 {d0-d1}, [%2 :128]! \n"
                        "bne 0b \n"
                        : "=r"(nn), // %0
                          "=r"(ptr), // %1
                          "=r"(outptr) // %2
                        : "0"(nn),
                          "1"(ptr),
                          "2"(outptr)
                        : "cc", "memory", "q0", "q1"
                    );
                    }
#endif // __aarch64__
#endif // __ARM_NEON
                    for (; remain>0; remain--)
                    {
                        *outptr += *ptr;

                        ptr++;
                        outptr++;
                    }
                }
            }
        }
        else
        {
            const float* coeffs_ptr = coeffs;

            // first blob
            const Mat& bottom_blob1 = bottom_blobs[1];
            float coeff0 = coeffs_ptr[0];
            float coeff1 = coeffs_ptr[1];
            #pragma omp parallel for
            for (int q=0; q<channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
                const float* ptr1 = bottom_blob1.channel(q);
                float* outptr = top_blob.channel(q);

#if __ARM_NEON
                int nn = size >> 2;
                int remain = size - (nn << 2);
#else
                int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
                float32x4_t _coeff0 = vdupq_n_f32(coeff0);
                float32x4_t _coeff1 = vdupq_n_f32(coeff1);
#if __aarch64__
                for (; nn>0; nn--)
                {
                    float32x4_t _ptr = vld1q_f32(ptr);
                    float32x4_t _ptr1 = vld1q_f32(ptr1);
                    float32x4_t _p = vmulq_f32(_ptr, _coeff0);
                    _p = vmlaq_f32(_p, _ptr1, _coeff1);
                    vst1q_f32(outptr, _p);

                    ptr += 4;
                    ptr1 += 4;
                    outptr += 4;
                }
#else
                if (nn > 0)
                {
                asm volatile(
                    "0: \n"
                    "pld [%1, #128] \n"
                    "pld [%2, #128] \n"
                    "vld1.f32 {d0-d1}, [%1 :128]! \n"
                    "vld1.f32 {d2-d3}, [%2 :128]! \n"
                    "vmul.f32 q0, q0, %q8 \n"
                    "vmla.f32 q0, q1, %q9 \n"
                    "subs %0, #1 \n"
                    "vst1.f32 {d0-d1}, [%3 :128]! \n"
                    "bne 0b \n"
                    : "=r"(nn), // %0
                      "=r"(ptr), // %1
                      "=r"(ptr1), // %2
                      "=r"(outptr) // %3
                    : "0"(nn),
                      "1"(ptr),
                      "2"(ptr1),
                      "3"(outptr),
                      "w"(_coeff0), // %8
                      "w"(_coeff1) // %9
                    : "cc", "memory", "q0", "q1"
                );
                }
#endif // __aarch64__
#endif // __ARM_NEON
                for (; remain>0; remain--)
                {
                    *outptr = *ptr * coeff0 + *ptr1 * coeff1;

                    ptr++;
                    ptr1++;
                    outptr++;
                }
            }

            // remaining blobs: out += blob * coeff
            for (size_t b=2; b<bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob1 = bottom_blobs[b];
                float coeff = coeffs_ptr[b];
                #pragma omp parallel for
                for (int q=0; q<channels; q++)
                {
                    const float* ptr = bottom_blob1.channel(q);
                    float* outptr = top_blob.channel(q);

#if __ARM_NEON
                    int nn = size >> 2;
                    int remain = size - (nn << 2);
#else
                    int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
                    float32x4_t _coeff = vdupq_n_f32(coeff);
#if __aarch64__
                    for (; nn>0; nn--)
                    {
                        float32x4_t _ptr = vld1q_f32(ptr);
                        float32x4_t _p = vld1q_f32(outptr);
                        _p = vmlaq_f32(_p, _ptr, _coeff);
                        vst1q_f32(outptr, _p);

                        ptr += 4;
                        outptr += 4;
                    }
#else
                    if (nn > 0)
                    {
                    asm volatile(
                        "0: \n"
                        "pld [%1, #128] \n"
                        "pld [%2, #128] \n"
                        "vld1.f32 {d0-d1}, [%1 :128]! \n"
                        "vld1.f32 {d2-d3}, [%2 :128] \n"
                        "vmla.f32 q1, q0, %q6 \n"
                        "subs %0, #1 \n"
                        // The accumulator is q1 (d2-d3): it holds out + in * coeff.
                        // Storing q0 here would write the raw input back instead.
                        "vst1.f32 {d2-d3}, [%2 :128]! \n"
                        "bne 0b \n"
                        : "=r"(nn), // %0
                          "=r"(ptr), // %1
                          "=r"(outptr) // %2
                        : "0"(nn),
                          "1"(ptr),
                          "2"(outptr),
                          "w"(_coeff) // %6
                        : "cc", "memory", "q0", "q1"
                    );
                    }
#endif // __aarch64__
#endif // __ARM_NEON
                    for (; remain>0; remain--)
                    {
                        *outptr += *ptr * coeff;

                        ptr++;
                        outptr++;
                    }
                }
            }
        }
    }
    else if (op_type == Operation_MAX)
    {
        // first blob
        const Mat& bottom_blob1 = bottom_blobs[1];
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            const float* ptr1 = bottom_blob1.channel(q);
            float* outptr = top_blob.channel(q);

#if __ARM_NEON
            int nn = size >> 2;
            int remain = size - (nn << 2);
#else
            int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
            for (; nn>0; nn--)
            {
                float32x4_t _ptr = vld1q_f32(ptr);
                float32x4_t _ptr1 = vld1q_f32(ptr1);
                float32x4_t _p = vmaxq_f32(_ptr, _ptr1);
                vst1q_f32(outptr, _p);

                ptr += 4;
                ptr1 += 4;
                outptr += 4;
            }
#else
            if (nn > 0)
            {
            asm volatile(
                "0: \n"
                "pld [%1, #128] \n"
                "pld [%2, #128] \n"
                "vld1.f32 {d0-d1}, [%1 :128]! \n"
                "vld1.f32 {d2-d3}, [%2 :128]! \n"
                "vmax.f32 q0, q0, q1 \n"
                "subs %0, #1 \n"
                "vst1.f32 {d0-d1}, [%3 :128]! \n"
                "bne 0b \n"
                : "=r"(nn), // %0
                  "=r"(ptr), // %1
                  "=r"(ptr1), // %2
                  "=r"(outptr) // %3
                : "0"(nn),
                  "1"(ptr),
                  "2"(ptr1),
                  "3"(outptr)
                : "cc", "memory", "q0", "q1"
            );
            }
#endif // __aarch64__
#endif // __ARM_NEON
            for (; remain>0; remain--)
            {
                *outptr = std::max(*ptr, *ptr1);

                ptr++;
                ptr1++;
                outptr++;
            }
        }

        // remaining blobs: out = max(out, blob)
        for (size_t b=2; b<bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob1 = bottom_blobs[b];
            #pragma omp parallel for
            for (int q=0; q<channels; q++)
            {
                const float* ptr = bottom_blob1.channel(q);
                float* outptr = top_blob.channel(q);

#if __ARM_NEON
                int nn = size >> 2;
                int remain = size - (nn << 2);
#else
                int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
                for (; nn>0; nn--)
                {
                    float32x4_t _ptr = vld1q_f32(ptr);
                    float32x4_t _p = vld1q_f32(outptr);
                    _p = vmaxq_f32(_ptr, _p);
                    vst1q_f32(outptr, _p);

                    ptr += 4;
                    outptr += 4;
                }
#else
                if (nn > 0)
                {
                asm volatile(
                    "0: \n"
                    "pld [%1, #128] \n"
                    "pld [%2, #128] \n"
                    "vld1.f32 {d0-d1}, [%1 :128]! \n"
                    "vld1.f32 {d2-d3}, [%2 :128] \n"
                    "vmax.f32 q0, q0, q1 \n"
                    "subs %0, #1 \n"
                    "vst1.f32 {d0-d1}, [%2 :128]! \n"
                    "bne 0b \n"
                    : "=r"(nn), // %0
                      "=r"(ptr), // %1
                      "=r"(outptr) // %2
                    : "0"(nn),
                      "1"(ptr),
                      "2"(outptr)
                    : "cc", "memory", "q0", "q1"
                );
                }
#endif // __aarch64__
#endif // __ARM_NEON
                for (; remain>0; remain--)
                {
                    *outptr = std::max(*ptr, *outptr);

                    ptr++;
                    outptr++;
                }
            }
        }
    }

    return 0;
}

} // namespace ncnn

+ 30
- 0
src/layer/arm/eltwise_arm.h View File

@@ -0,0 +1,30 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_ELTWISE_ARM_H
#define LAYER_ELTWISE_ARM_H

#include "eltwise.h"

namespace ncnn {

// ARM-specific element-wise layer. Derives from the generic Eltwise layer
// and overrides forward(); see eltwise_arm.cpp for the implementation.
class Eltwise_arm : public Eltwise
{
public:
// Combines all input blobs element-wise into top_blobs[0].
// Returns an int status code.
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
};

} // namespace ncnn

#endif // LAYER_ELTWISE_ARM_H

+ 136
- 0
src/layer/arm/innerproduct_arm.cpp View File

@@ -0,0 +1,136 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "innerproduct_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

namespace ncnn {

DEFINE_LAYER_CREATOR(InnerProduct_arm)

// Fully connected layer: every output is the dot product of the whole input
// blob with one row of weight_data, plus an optional bias.
//
// bottom_blob  input; all w*h*c values take part in every dot product
// top_blob     output; (re)created here as 1 x 1 x num_output
//
// Returns 0 on success and -100 if the output blob could not be allocated.
int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int size = w * h;

    top_blob.create(1, 1, num_output);
    if (top_blob.empty())
        return -100;

    // num_output
    const float* weight_data_ptr = weight_data;
    #pragma omp parallel for
    for (int p=0; p<num_output; p++)
    {
        float* outptr = top_blob.channel(p);
        float sum = 0.f;

        if (bias_term)
            sum = bias_data.data[p];

        // weights of output p; advanced continuously across all input channels
        const float* wptr = weight_data_ptr + size * channels * p;

#if __ARM_NEON
        // two independent accumulators, each fed by half of every 8-float step
        float32x4_t _sum = vdupq_n_f32(0.f);
        float32x4_t _sum2 = vdupq_n_f32(0.f);
#endif // __ARM_NEON

        // channels
        for (int q=0; q<channels; q++)
        {
            const float* m = bottom_blob.channel(q);

#if __ARM_NEON
            int nn = size >> 3;
            int remain = size & 7;
#else
            int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
            for (; nn>0; nn--)
            {
                float32x4_t _m = vld1q_f32(m);
                float32x4_t _w = vld1q_f32(wptr);
                _sum = vfmaq_f32(_sum, _m, _w);

                _m = vld1q_f32(m + 4);
                _w = vld1q_f32(wptr + 4);
                _sum2 = vfmaq_f32(_sum2, _m, _w);

                m += 8;
                wptr += 8;
            }
#else
            if (nn > 0)
            {
            asm volatile(
                "0: \n"
                "pld [%1, #256] \n"
                "vld1.f32 {d0-d3}, [%1 :128]! \n"
                "pld [%2, #256] \n"
                "vld1.f32 {d4-d7}, [%2]! \n"
                "vmla.f32 %q3, q0, q2 \n"
                "subs %0, #1 \n"
                "vmla.f32 %q4, q1, q3 \n"
                "bne 0b \n"
                : "=r"(nn), // %0
                  "=r"(m), // %1
                  "=r"(wptr), // %2
                  "=w"(_sum), // %3
                  "=w"(_sum2) // %4
                : "0"(nn),
                  "1"(m),
                  "2"(wptr),
                  "3"(_sum),
                  "4"(_sum2)
                : "cc", "memory", "q0", "q1", "q2", "q3"
            );
            }
#endif // __aarch64__
#endif // __ARM_NEON
            // scalar tail (or the whole channel without NEON)
            for (; remain>0; remain--)
            {
                sum += *m * *wptr;

                m++;
                wptr++;
            }
        }

#if __ARM_NEON
        // horizontal reduction of the vector accumulators into the scalar sum
        _sum = vaddq_f32(_sum, _sum2);
#if __aarch64__
        sum += vaddvq_f32(_sum);
#else
        float32x2_t _sumss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
        _sumss = vpadd_f32(_sumss, _sumss);
        sum += vget_lane_f32(_sumss, 0);
#endif // __aarch64__
#endif // __ARM_NEON

        outptr[0] = sum;
    }

    return 0;
}

} // namespace ncnn

+ 30
- 0
src/layer/arm/innerproduct_arm.h View File

@@ -0,0 +1,30 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_INNERPRODUCT_ARM_H
#define LAYER_INNERPRODUCT_ARM_H

#include "innerproduct.h"

namespace ncnn {

// ARM-specific inner-product layer. Derives from the generic InnerProduct
// layer and overrides forward(); see innerproduct_arm.cpp for the implementation.
class InnerProduct_arm : public InnerProduct
{
public:
// Computes the layer output for bottom_blob into top_blob.
// Returns an int status code.
virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
};

} // namespace ncnn

#endif // LAYER_INNERPRODUCT_ARM_H

+ 227
- 0
src/layer/arm/lrn_arm.cpp View File

@@ -0,0 +1,227 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "lrn_arm.h"
#include <math.h>

#if __ARM_NEON
#include <arm_neon.h>
#include "neon_mathfun.h"
#endif // __ARM_NEON

namespace ncnn {

DEFINE_LAYER_CREATOR(LRN_arm)

// Local response normalisation, applied in place:
//
//     x = x * (1 + alpha / n * sum(x_i^2)) ^ -beta
//
// where the sum runs over a local region of n values around x.
//   NormRegion_ACROSS_CHANNELS: the region is the local_size channels centred
//                               on the current channel at the same position.
//   NormRegion_WITHIN_CHANNEL:  the region is the local_size x local_size
//                               spatial window in the same channel.
// Positions outside the blob contribute zero.
//
// Returns 0 on success and -100 if a temporary blob could not be allocated.
int LRN_arm::forward_inplace(Mat& bottom_top_blob) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    // squared values with local_size padding
    Mat square_blob;
    square_blob.create(w, h, channels);
    if (square_blob.empty())
        return -100;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        const float* ptr = bottom_top_blob.channel(q);
        float* outptr = square_blob.channel(q);

#if __ARM_NEON
        int nn = size >> 2;
        int remain = size - (nn << 2);
#else
        int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
        for (; nn>0; nn--)
        {
            float32x4_t _p = vld1q_f32(ptr);
            float32x4_t _outp = vmulq_f32(_p, _p);
            vst1q_f32(outptr, _outp);

            ptr += 4;
            outptr += 4;
        }
#endif // __ARM_NEON
        for (; remain>0; remain--)
        {
            *outptr = *ptr * *ptr;

            ptr++;
            outptr++;
        }
    }

    if (region_type == NormRegion_ACROSS_CHANNELS)
    {
        // the region holds local_size channels
        float alpha_div_size = alpha / local_size;

        Mat square_sum;
        square_sum.create(w, h, channels);
        if (square_sum.empty())
            return -100;
        square_sum.fill(0.f);

        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            // square sum over the window centred on q; the upper bound is
            // q + local_size/2 inclusive so that the window matches the
            // local_size divisor used in alpha_div_size
            for (int p=q - local_size / 2; p<=q + local_size / 2; p++)
            {
                if (p < 0 || p >= channels)
                    continue;

                const float* sptr = square_blob.channel(p);
                float* ssptr = square_sum.channel(q);

#if __ARM_NEON
                int nn = size >> 2;
                int remain = size - (nn << 2);
#else
                int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
                for (; nn>0; nn--)
                {
                    float32x4_t _sp = vld1q_f32(sptr);
                    float32x4_t _ssp = vld1q_f32(ssptr);
                    _ssp = vaddq_f32(_ssp, _sp);
                    vst1q_f32(ssptr, _ssp);

                    sptr += 4;
                    ssptr += 4;
                }
#endif // __ARM_NEON
                for (; remain>0; remain--)
                {
                    *ssptr += *sptr;
                    sptr++;
                    ssptr++;
                }
            }

            float* ptr = bottom_top_blob.channel(q);
            float* ssptr = square_sum.channel(q);

#if __ARM_NEON
            int nn = size >> 2;
            int remain = size - (nn << 2);
#else
            int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
            float32x4_t _v1 = vdupq_n_f32(1.f);
            float32x4_t _ads = vdupq_n_f32(alpha_div_size);
            float32x4_t _mb = vdupq_n_f32(-beta);
            for (; nn>0; nn--)
            {
                float32x4_t _p = vld1q_f32(ptr);
                float32x4_t _ssp = vld1q_f32(ssptr);
                _ssp = vmulq_f32(_ssp, _ads);
                _ssp = vaddq_f32(_ssp, _v1);
                _ssp = pow_ps(_ssp, _mb);
                _p = vmulq_f32(_p, _ssp);
                vst1q_f32(ptr, _p);

                ssptr += 4;
                ptr += 4;
            }
#endif // __ARM_NEON
            for (; remain>0; remain--)
            {
                *ptr = *ptr * pow(1.f + alpha_div_size * *ssptr, -beta);

                ssptr++;
                ptr++;
            }
        }
    }
    else if (region_type == NormRegion_WITHIN_CHANNEL)
    {
        int outw = w;
        int outh = h;

        // zero-pad the squared values so every output has a full window
        Mat square_blob_bordered = square_blob;
        int pad = local_size / 2;
        if (pad > 0)
        {
            copy_make_border(square_blob, square_blob_bordered, pad, local_size - pad - 1, pad, local_size - pad - 1, BORDER_CONSTANT, 0.f);
            if (square_blob_bordered.empty())
                return -100;

            w = square_blob_bordered.w;
            h = square_blob_bordered.h;
        }

        const int maxk = local_size * local_size;

        // the region holds local_size x local_size values
        const float alpha_div_size = alpha / maxk;

        // norm window offsets, relative to the window's top-left element
        std::vector<int> _space_ofs(maxk);
        int* space_ofs = &_space_ofs[0];
        {
            int p1 = 0;
            int p2 = 0;
            int gap = w - local_size;
            for (int i = 0; i < local_size; i++)
            {
                for (int j = 0; j < local_size; j++)
                {
                    space_ofs[p1] = p2;
                    p1++;
                    p2++;
                }
                p2 += gap;
            }
        }

        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);
            const float* sptr = square_blob_bordered.channel(q);

            for (int i = 0; i < outh; i++)
            {
                for (int j = 0; j < outw; j++)
                {
                    // top-left corner of the window belonging to column j
                    const float* wptr = sptr + j;

                    float ss = 0.f;

                    for (int k = 0; k < maxk; k++)
                    {
                        float val = wptr[ space_ofs[k] ];
                        ss += val;
                    }

                    ptr[j] = ptr[j] * pow(1.f + alpha_div_size * ss, -beta);
                }

                ptr += outw;
                sptr += w;
            }
        }
    }

    return 0;
}

} // namespace ncnn

+ 30
- 0
src/layer/arm/lrn_arm.h View File

@@ -0,0 +1,30 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_LRN_ARM_H
#define LAYER_LRN_ARM_H

#include "lrn.h"

namespace ncnn {

// ARM-specific local response normalisation layer. Derives from the generic
// LRN layer and overrides forward_inplace(); see lrn_arm.cpp for the implementation.
class LRN_arm : public LRN
{
public:
// Normalises bottom_top_blob in place. Returns an int status code.
virtual int forward_inplace(Mat& bottom_top_blob) const;
};

} // namespace ncnn

#endif // LAYER_LRN_ARM_H

+ 316
- 0
src/layer/arm/neon_mathfun.h View File

@@ -0,0 +1,316 @@
/* NEON implementation of sin, cos, exp and log
*
* Inspired by Intel Approximate Math library, and based on the
* corresponding algorithms of the cephes math library
*/

/* Copyright (C) 2011 Julien Pommier
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*
* (this is the zlib license)
*/

#include <arm_neon.h>

#define c_inv_mant_mask ~0x7f800000u
#define c_cephes_SQRTHF 0.707106781186547524
#define c_cephes_log_p0 7.0376836292E-2
#define c_cephes_log_p1 - 1.1514610310E-1
#define c_cephes_log_p2 1.1676998740E-1
#define c_cephes_log_p3 - 1.2420140846E-1
#define c_cephes_log_p4 + 1.4249322787E-1
#define c_cephes_log_p5 - 1.6668057665E-1
#define c_cephes_log_p6 + 2.0000714765E-1
#define c_cephes_log_p7 - 2.4999993993E-1
#define c_cephes_log_p8 + 3.3333331174E-1
#define c_cephes_log_q1 -2.12194440e-4
#define c_cephes_log_q2 0.693359375

/* natural logarithm computed for 4 simultaneous float
* return NaN for x <= 0
*
* The input is split into exponent and mantissa, the mantissa is moved into
* a range around 1 and log(1 + x) is evaluated with the cephes polynomial
* (c_cephes_log_p0..p8); the exponent is added back as e * log(2), with
* log(2) split into c_cephes_log_q1 + c_cephes_log_q2.
*/
static inline float32x4_t log_ps(float32x4_t x)
{
float32x4_t one = vdupq_n_f32(1);

/* clamps negative lanes to 0 before the bit manipulation below */
x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */
/* lanes with x <= 0 are OR-ed with all-ones at the end */
uint32x4_t invalid_mask = vcleq_f32(x, vdupq_n_f32(0));

int32x4_t ux = vreinterpretq_s32_f32(x);

/* raw (biased) exponent field */
int32x4_t emm0 = vshrq_n_s32(ux, 23);

/* keep only the fractional part */
ux = vandq_s32(ux, vdupq_n_s32(c_inv_mant_mask));
/* OR in the exponent bits of 0.5f, so x now carries the exponent of 0.5 */
ux = vorrq_s32(ux, vreinterpretq_s32_f32(vdupq_n_f32(0.5f)));
x = vreinterpretq_f32_s32(ux);

/* remove the exponent bias */
emm0 = vsubq_s32(emm0, vdupq_n_s32(0x7f));
float32x4_t e = vcvtq_f32_s32(emm0);

e = vaddq_f32(e, one);

/* part2:
* if( x < SQRTHF ) {
* e -= 1;
* x = x + x - 1.0;
* } else { x = x - 1.0; }
*/
/* branch-free form of the above: mask selects the lanes with x < SQRTHF */
uint32x4_t mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF));
float32x4_t tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask));
x = vsubq_f32(x, one);
e = vsubq_f32(e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one), mask)));
x = vaddq_f32(x, tmp);

float32x4_t z = vmulq_f32(x,x);

/* Horner evaluation of the polynomial in x */
float32x4_t y = vdupq_n_f32(c_cephes_log_p0);
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p8));
y = vmulq_f32(y, x);

y = vmulq_f32(y, z);


/* y += e * q1 */
tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1));
y = vaddq_f32(y, tmp);


/* y -= 0.5 * x^2 */
tmp = vmulq_f32(z, vdupq_n_f32(0.5f));
y = vsubq_f32(y, tmp);

/* result = x + y + e * q2 */
tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2));
x = vaddq_f32(x, y);
x = vaddq_f32(x, tmp);
x = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN
return x;
}

#define c_exp_hi 88.3762626647949f
#define c_exp_lo -88.3762626647949f

#define c_cephes_LOG2EF 1.44269504088896341
#define c_cephes_exp_C1 0.693359375
#define c_cephes_exp_C2 -2.12194440e-4

#define c_cephes_exp_p0 1.9875691500E-4
#define c_cephes_exp_p1 1.3981999507E-3
#define c_cephes_exp_p2 8.3334519073E-3
#define c_cephes_exp_p3 4.1665795894E-2
#define c_cephes_exp_p4 1.6666665459E-1
#define c_cephes_exp_p5 5.0000001201E-1

/* exp() computed for 4 float at once
*
* The argument is clamped to [c_exp_lo, c_exp_hi], written as g + n*log(2),
* exp(g) is evaluated with the cephes polynomial (c_cephes_exp_p0..p5) and
* the result is scaled by 2^n, which is built directly in the exponent field.
*/
static inline float32x4_t exp_ps(float32x4_t x)
{
float32x4_t tmp, fx;

float32x4_t one = vdupq_n_f32(1);
/* clamp the argument to the supported range */
x = vminq_f32(x, vdupq_n_f32(c_exp_hi));
x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo));

/* express exp(x) as exp(g + n*log(2)) */
fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF));

/* perform a floorf */
/* (float -> int -> float round trip; the mask below corrects lanes where
* the round trip produced a value greater than fx) */
tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));

/* if greater, subtract 1 */
uint32x4_t mask = vcgtq_f32(tmp, fx);
/* turn the all-ones lanes into the bit pattern of 1.0f */
mask = vandq_u32(mask, vreinterpretq_u32_f32(one));


fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));

/* x -= fx * C1; x -= fx * C2, i.e. g = x - n*log(2) with log(2) split in two */
tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1));
float32x4_t z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2));
x = vsubq_f32(x, tmp);
x = vsubq_f32(x, z);

/* polynomial coefficients, each broadcast to all four lanes */
static const float cephes_exp_p[6] = { c_cephes_exp_p0, c_cephes_exp_p1, c_cephes_exp_p2, c_cephes_exp_p3, c_cephes_exp_p4, c_cephes_exp_p5 };
float32x4_t y = vld1q_dup_f32(cephes_exp_p+0);
float32x4_t c1 = vld1q_dup_f32(cephes_exp_p+1);
float32x4_t c2 = vld1q_dup_f32(cephes_exp_p+2);
float32x4_t c3 = vld1q_dup_f32(cephes_exp_p+3);
float32x4_t c4 = vld1q_dup_f32(cephes_exp_p+4);
float32x4_t c5 = vld1q_dup_f32(cephes_exp_p+5);

/* Horner evaluation of the polynomial in x */
y = vmulq_f32(y, x);
z = vmulq_f32(x, x);

y = vaddq_f32(y, c1);
y = vmulq_f32(y, x);
y = vaddq_f32(y, c2);
y = vmulq_f32(y, x);
y = vaddq_f32(y, c3);
y = vmulq_f32(y, x);
y = vaddq_f32(y, c4);
y = vmulq_f32(y, x);
y = vaddq_f32(y, c5);

/* y = poly * x^2 + x + 1 */
y = vmulq_f32(y, z);
y = vaddq_f32(y, x);
y = vaddq_f32(y, one);

/* build 2^n */
/* (n + 0x7f) shifted left by 23 lands in the float exponent field */
int32x4_t mm;
mm = vcvtq_s32_f32(fx);
mm = vaddq_s32(mm, vdupq_n_s32(0x7f));
mm = vshlq_n_s32(mm, 23);
float32x4_t pow2n = vreinterpretq_f32_s32(mm);

y = vmulq_f32(y, pow2n);
return y;
}

#define c_minus_cephes_DP1 -0.78515625
#define c_minus_cephes_DP2 -2.4187564849853515625e-4
#define c_minus_cephes_DP3 -3.77489497744594108e-8
#define c_sincof_p0 -1.9515295891E-4
#define c_sincof_p1 8.3321608736E-3
#define c_sincof_p2 -1.6666654611E-1
#define c_coscof_p0 2.443315711809948E-005
#define c_coscof_p1 -1.388731625493765E-003
#define c_coscof_p2 4.166664568298827E-002
#define c_cephes_FOPI 1.27323954473516 // 4 / M_PI

/* evaluation of 4 sines & cosines at once.
 *
 * The code is the exact rewriting of the cephes sinf function.
 * Precision is excellent as long as x < 8192 (I did not bother to
 * take into account the special handling they have for greater values
 * -- it does not return garbage for arguments over 8192, though, but
 * the extra precision is missing).
 *
 * Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
 * surprising but correct result.
 *
 * Note also that when you compute sin(x), cos(x) is available at
 * almost no extra price so both sin_ps and cos_ps make use of
 * sincos_ps..
 *
 * x    : four packed input angles
 * ysin : output, receives the four sines
 * ycos : output, receives the four cosines
 */
static inline void sincos_ps(float32x4_t x, float32x4_t *ysin, float32x4_t *ycos)
{
// any x
float32x4_t xmm1, xmm2, xmm3, y;

uint32x4_t emm2;

uint32x4_t sign_mask_sin, sign_mask_cos;
// remember which lanes were negative, then continue with |x|
sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0));
x = vabsq_f32(x);

/* scale by 4/Pi */
y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI));

/* store the integer part of y in emm2 */
emm2 = vcvtq_u32_f32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
y = vcvtq_f32_u32(emm2);

/* get the polynom selection mask
 * there is one polynom for 0 <= x <= Pi/4
 * and another one for Pi/4<x<=Pi/2
 *
 * Both branches will be computed.
 */
// lanes where bit 1 (value 2) of j is set
uint32x4_t poly_mask = vtstq_u32(emm2, vdupq_n_u32(2));

/* The magic pass: "Extended precision modular arithmetic"
 * x = ((x - y * DP1) - y * DP2) - y * DP3; */
// the constants are already negated, hence the additions below
xmm1 = vmulq_n_f32(y, c_minus_cephes_DP1);
xmm2 = vmulq_n_f32(y, c_minus_cephes_DP2);
xmm3 = vmulq_n_f32(y, c_minus_cephes_DP3);
x = vaddq_f32(x, xmm1);
x = vaddq_f32(x, xmm2);
x = vaddq_f32(x, xmm3);

// sine sign: input sign XOR bit 2 (value 4) of j
sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, vdupq_n_u32(4)));
// cosine sign selector: bit 2 (value 4) of (j - 2)
sign_mask_cos = vtstq_u32(vsubq_u32(emm2, vdupq_n_u32(2)), vdupq_n_u32(4));

/* Evaluate both polynomials on the reduced argument x:
 * y1 uses the coscof coefficients (cosine polynomial),
 * y2 uses the sincof coefficients (sine polynomial) */
float32x4_t z = vmulq_f32(x,x);
float32x4_t y1, y2;

y1 = vmulq_n_f32(z, c_coscof_p0);
y2 = vmulq_n_f32(z, c_sincof_p0);
y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p1));
y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p1));
y1 = vmulq_f32(y1, z);
y2 = vmulq_f32(y2, z);
y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p2));
y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p2));
y1 = vmulq_f32(y1, z);
y2 = vmulq_f32(y2, z);
y1 = vmulq_f32(y1, z);
y2 = vmulq_f32(y2, x);
// y1 = 1 - z/2 + z*z*P(z), y2 = x + x*z*Q(z)
y1 = vsubq_f32(y1, vmulq_f32(z, vdupq_n_f32(0.5f)));
y2 = vaddq_f32(y2, x);
y1 = vaddq_f32(y1, vdupq_n_f32(1));

/* select the correct result from the two polynoms */
float32x4_t ys = vbslq_f32(poly_mask, y1, y2);
float32x4_t yc = vbslq_f32(poly_mask, y2, y1);
// apply the signs: sine is negated where sign_mask_sin is set,
// cosine is negated where sign_mask_cos is clear
*ysin = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
*ycos = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));
}

// Sine of four packed floats. Delegates to sincos_ps and discards the
// cosine that is computed alongside.
static inline float32x4_t sin_ps(float32x4_t x)
{
    float32x4_t sine;
    float32x4_t cosine_unused;

    sincos_ps(x, &sine, &cosine_unused);

    return sine;
}

// Cosine of four packed floats. Delegates to sincos_ps and discards the
// sine that is computed alongside.
static inline float32x4_t cos_ps(float32x4_t x)
{
    float32x4_t sine_unused;
    float32x4_t cosine;

    sincos_ps(x, &sine_unused, &cosine);

    return cosine;
}

// Approximate lane-wise a / b for four packed floats.
//
// The quotient is formed as a * (1/b), where 1/b starts from the hardware
// reciprocal estimate and is refined by a single Newton-Raphson step
// (a second refinement step is left disabled, as in the original code).
static inline float32x4_t div_ps(float32x4_t a, float32x4_t b)
{
    float32x4_t inv_b = vrecpeq_f32(b);

    // vrecpsq_f32(b, inv_b) yields the correction factor (2 - b * inv_b)
    const float32x4_t correction = vrecpsq_f32(b, inv_b);
    inv_b = vmulq_f32(correction, inv_b);

    return vmulq_f32(a, inv_b);
}

// Lane-wise power a^b for four packed floats, evaluated through the
// identity pow(x, m) = exp(m * log(x)).
static inline float32x4_t pow_ps(float32x4_t a, float32x4_t b)
{
    const float32x4_t log_a = log_ps(a);
    const float32x4_t exponent = vmulq_f32(b, log_a);

    return exp_ps(exponent);
}

+ 112
- 0
src/layer/arm/pooling_2x2.h View File

@@ -0,0 +1,112 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

// Max pooling with a 2x2 window and stride 2, channel by channel.
//
// bottom_blob : input blob; two adjacent rows are consumed per output row
// top_blob    : output blob; its w/h/c are read here, so it must already be
//               created by the caller
//
// The row advance at the end of each output row (r0 += w after consuming
// 2*outw floats) lands on the next row pair only when w == 2*outw.
// NOTE(review): the caller is presumably responsible for padding the input so
// that this holds -- confirm against Pooling_arm::forward.
static void pooling2x2s2_max_neon(const Mat& bottom_blob, Mat& top_blob)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

#pragma omp parallel for
for (int q=0; q<inch; q++)
{
const float* img0 = bottom_blob.channel(q);
float* outptr = top_blob.channel(q);

// r0 / r1: the two input rows that feed the current output row
const float* r0 = img0;
const float* r1 = img0 + w;

for (int i = 0; i < outh; i++)
{
#if __ARM_NEON
// nn: number of 4-output vector iterations, remain: scalar leftovers
int nn = outw >> 2;
int remain = outw - (nn << 2);
#else
int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
for (; nn>0; nn--)
{
// 8 input floats per row produce 4 outputs
float32x4_t _r00 = vld1q_f32(r0);
float32x4_t _r10 = vld1q_f32(r1);
float32x4_t _r01 = vld1q_f32(r0 + 4);
float32x4_t _r11 = vld1q_f32(r1 + 4);

// vertical max of the two rows
float32x4_t _max0 = vmaxq_f32(_r00, _r10);
float32x4_t _max1 = vmaxq_f32(_r01, _r11);

// pairwise (horizontal) max of adjacent lanes
float32x4_t _max = vpmaxq_f32(_max0, _max1);

vst1q_f32(outptr, _max);

r0 += 8;
r1 += 8;
outptr += 4;
}
#else
if (nn > 0)
{
// Same computation as the aarch64 loop: q0,q1 hold 8 floats of r0 and
// q2,q3 hold 8 floats of r1; vmax takes the vertical max, vpmax the
// pairwise max, and the 4 results are stored through %3.
asm volatile(
"0: \n"
"pld [%1, #256] \n"
"pld [%2, #256] \n"
"vld1.f32 {d0-d3}, [%1]! \n"
"vld1.f32 {d4-d7}, [%2]! \n"
"vmax.f32 q0, q0, q2 \n"
"vmax.f32 q1, q1, q3 \n"
"vpmax.f32 d4, d0, d1 \n"
"vpmax.f32 d5, d2, d3 \n"
"subs %0, #1 \n"
"vst1.f32 {d4-d5}, [%3]! \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(r0), // %1
"=r"(r1), // %2
"=r"(outptr) // %3
: "0"(nn),
"1"(r0),
"2"(r1),
"3"(outptr)
: "cc", "memory", "q0", "q1", "q2", "q3"
);
}
#endif // __aarch64__
#endif // __ARM_NEON
// scalar path: one output per iteration
for (; remain>0; remain--)
{
float max0 = std::max(r0[0], r0[1]);
float max1 = std::max(r1[0], r1[1]);

*outptr = std::max(max0, max1);

r0 += 2;
r1 += 2;
outptr++;
}

// skip the row that was already consumed as the second row of the window
r0 += w;
r1 += w;
}
}
}

+ 170
- 0
src/layer/arm/pooling_3x3.h View File

@@ -0,0 +1,170 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

// Max pooling with a 3x3 window and stride 2, channel by channel.
//
// bottom_blob : input blob; three adjacent rows are consumed per output row
// top_blob    : output blob; its w/h/c are read here, so it must already be
//               created by the caller
//
// NOTE(review): both NEON paths load 8 floats ahead of the ones they are
// currently reducing (and the aarch64 path performs its first loads even
// when nn == 0), so they read past the data needed for the current row --
// presumably covered by following rows / allocation padding; confirm.
static void pooling3x3s2_max_neon(const Mat& bottom_blob, Mat& top_blob)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

// After one output row the row pointers have advanced by 2*outw floats;
// adding tailstep moves them a total of 2*w, i.e. two input rows down.
const int tailstep = w - 2*outw + w;

#pragma omp parallel for
for (int q=0; q<inch; q++)
{
const float* img0 = bottom_blob.channel(q);
float* outptr = top_blob.channel(q);

// r0 / r1 / r2: the three input rows that feed the current output row
const float* r0 = img0;
const float* r1 = img0 + w;
const float* r2 = img0 + w*2;

for (int i = 0; i < outh; i++)
{
#if __ARM_NEON
// nn: number of 4-output vector iterations, remain: scalar leftovers
int nn = outw >> 2;
int remain = outw - (nn << 2);
#else
int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
// de-interleaving loads: val[0] = even columns, val[1] = odd columns
float32x4x2_t _r0 = vld2q_f32(r0);
float32x4x2_t _r1 = vld2q_f32(r1);
float32x4x2_t _r2 = vld2q_f32(r2);
for (; nn>0; nn--)
{
// next 8 columns of each row, needed for the third window column
float32x4x2_t _r0n = vld2q_f32(r0+8);
float32x4x2_t _r1n = vld2q_f32(r1+8);
float32x4x2_t _r2n = vld2q_f32(r2+8);

// max of window columns 0 and 1 for each of the 4 outputs
float32x4_t _max0 = vmaxq_f32(_r0.val[0], _r0.val[1]);
float32x4_t _max1 = vmaxq_f32(_r1.val[0], _r1.val[1]);
float32x4_t _max2 = vmaxq_f32(_r2.val[0], _r2.val[1]);

// even columns shifted by one lane = window column 2 (columns 2 4 6 8)
float32x4_t _r02 = vextq_f32(_r0.val[0], _r0n.val[0], 1);
float32x4_t _r12 = vextq_f32(_r1.val[0], _r1n.val[0], 1);
float32x4_t _r22 = vextq_f32(_r2.val[0], _r2n.val[0], 1);

_max0 = vmaxq_f32(_max0, _r02);
_max1 = vmaxq_f32(_max1, _r12);
_max2 = vmaxq_f32(_max2, _r22);

// combine the three rows
float32x4_t _max = vmaxq_f32(vmaxq_f32(_max0, _max1), _max2);

vst1q_f32(outptr, _max);

// reuse the look-ahead loads in the next iteration
_r0 = _r0n;
_r1 = _r1n;
_r2 = _r2n;

r0 += 8;
r1 += 8;
r2 += 8;
outptr += 4;
}
#else
if (nn > 0)
{
// Same scheme as the aarch64 loop. q0-q5 hold the current de-interleaved
// columns of the three rows, q6-q11 the look-ahead loads, q12-q14 the
// per-row maxima. Each load post-increments its pointer, so after the
// loop the pointers are 32 bytes (the look-ahead) too far and are
// rewound by the trailing "sub" instructions.
asm volatile(
"pld [%1, #256] \n"
"vld2.f32 {d0-d3}, [%1]! \n"// q0 = 0 2 4 6 q1 = 1 3 5 7
"pld [%2, #256] \n"
"vld2.f32 {d4-d7}, [%2]! \n"
"pld [%3, #256] \n"
"vld2.f32 {d8-d11}, [%3]! \n"
"0: \n"
"pld [%1, #256] \n"
"vld2.f32 {d12-d15}, [%1]! \n"// q6 = 8 10 12 14 q7 = 9 11 13 15

"vmax.f32 q12, q0, q1 \n"
"vmax.f32 q13, q2, q3 \n"

"pld [%2, #256] \n"
"vld2.f32 {d16-d19}, [%2]! \n"

"vmax.f32 q14, q4, q5 \n"
"vext.32 q0, q0, q6, #1 \n"

"pld [%3, #256] \n"
"vld2.f32 {d20-d23}, [%3]! \n"

"vext.32 q2, q2, q8, #1 \n"

"vmax.f32 q12, q12, q0 \n"
"vext.32 q4, q4, q10, #1 \n"

"vmax.f32 q13, q13, q2 \n"
"vmax.f32 q14, q14, q4 \n"
"vmax.f32 q12, q12, q13 \n"

"vorr q0, q6, q6 \n"
"vorr q1, q7, q7 \n"
"vmax.f32 q12, q12, q14 \n"

"vorr q2, q8, q8 \n"
"vorr q3, q9, q9 \n"
"vorr q4, q10, q10 \n"
"vorr q5, q11, q11 \n"

"subs %0, #1 \n"
"vst1.f32 {d24-d25}, [%4]! \n"
"bne 0b \n"
"sub %1, #32 \n"
"sub %2, #32 \n"
"sub %3, #32 \n"
: "=r"(nn), // %0
"=r"(r0), // %1
"=r"(r1), // %2
"=r"(r2), // %3
"=r"(outptr) // %4
: "0"(nn),
"1"(r0),
"2"(r1),
"3"(r2),
"4"(outptr)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14"
);
}
#endif // __aarch64__
#endif // __ARM_NEON
// scalar path: one output per iteration
for (; remain>0; remain--)
{
float max0 = std::max(std::max(r0[0], r0[1]), r0[2]);
float max1 = std::max(std::max(r1[0], r1[1]), r1[2]);
float max2 = std::max(std::max(r2[0], r2[1]), r2[2]);

*outptr = std::max(std::max(max0, max1), max2);

r0 += 2;
r1 += 2;
r2 += 2;
outptr++;
}

// move all three row pointers two input rows down
r0 += tailstep;//1 + w;
r1 += tailstep;//1 + w;
r2 += tailstep;//1 + w;
}
}
}

+ 96
- 0
src/layer/arm/pooling_arm.cpp View File

@@ -0,0 +1,96 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "pooling_arm.h"

namespace ncnn {

#include "pooling_2x2.h"
#include "pooling_3x3.h"

DEFINE_LAYER_CREATOR(Pooling_arm)

int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
{
// max value in NxN window
// avg value in NxN window

if (pooling_type != PoolMethod_MAX || stride != 2 || global_pooling == 1)
{
return Pooling::forward(bottom_blob, top_blob);
}

if (kernel_size != 2 && kernel_size != 3)
{
return Pooling::forward(bottom_blob, top_blob);
}

int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;

Mat bottom_blob_bordered = bottom_blob;
if (pad > 0)
{
copy_make_border(bottom_blob, bottom_blob_bordered, pad, pad, pad, pad, BORDER_CONSTANT, 0.f);
if (bottom_blob_bordered.empty())
return -100;

w = bottom_blob_bordered.w;
h = bottom_blob_bordered.h;
}

int outw = (w - kernel_size) / stride + 1;
int outh = (h - kernel_size) / stride + 1;

int wtail = (w - kernel_size) % stride;
int htail = (h - kernel_size) % stride;
if (wtail != 0 || htail != 0)
{
int wtailpad = 0;
int htailpad = 0;
if (wtail != 0)
wtailpad = kernel_size - wtail;
if (htail != 0)
htailpad = kernel_size - htail;

Mat bottom_blob_bordered2;
copy_make_border(bottom_blob_bordered, bottom_blob_bordered2, 0, htailpad, 0, wtailpad, BORDER_REPLICATE, 0.f);
if (bottom_blob_bordered2.empty())
return -100;

bottom_blob_bordered = bottom_blob_bordered2;

w = bottom_blob_bordered.w;
h = bottom_blob_bordered.h;

if (wtail != 0)
outw += 1;
if (htail != 0)
outh += 1;
}

top_blob.create(outw, outh, channels);
if (top_blob.empty())
return -100;

if (kernel_size == 2)
pooling2x2s2_max_neon(bottom_blob_bordered, top_blob);
if (kernel_size == 3)
pooling3x3s2_max_neon(bottom_blob_bordered, top_blob);

return 0;
}

} // namespace ncnn

+ 30
- 0
src/layer/arm/pooling_arm.h View File

@@ -0,0 +1,30 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_POOLING_ARM_H
#define LAYER_POOLING_ARM_H

#include "pooling.h"

namespace ncnn {

// ARM-specific pooling layer: derives from Pooling and overrides forward().
class Pooling_arm : public Pooling
{
public:
// Pools bottom_blob into top_blob. The implementation returns 0 on success,
// -100 when an intermediate or output blob is empty, or the result of
// Pooling::forward for configurations it hands back to the base class.
virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
};

} // namespace ncnn

#endif // LAYER_POOLING_ARM_H

+ 182
- 0
src/layer/arm/prelu_arm.cpp View File

@@ -0,0 +1,182 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "prelu_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

namespace ncnn {

DEFINE_LAYER_CREATOR(PReLU_arm)

// Out-of-place PReLU: top_blob[i] = bottom_blob[i] * slope for negative
// inputs and bottom_blob[i] otherwise. A per-channel slope is used when
// num_slope > 1, otherwise slope_data[0] is shared by every channel.
//
// Returns 0 on success and -100 when top_blob is empty after create().
int PReLU_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int size = w * h;

    top_blob.create(w, h, channels);
    if (top_blob.empty())
        return -100;

    const float* slope_data_ptr = slope_data;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        const float* ptr = bottom_blob.channel(q);
        float* outptr = top_blob.channel(q);
        float slope = num_slope > 1 ? slope_data_ptr[q] : slope_data_ptr[0];

#if __ARM_NEON
        // nn: number of 4-float vector iterations, remain: scalar leftovers
        int nn = size >> 2;
        int remain = size - (nn << 2);
#else
        int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        float32x4_t _zero = vdupq_n_f32(0.f);
        float32x4_t _slope = vdupq_n_f32(slope);
        for (; nn>0; nn--)
        {
            float32x4_t _p = vld1q_f32(ptr);
            // select p * slope in the lanes where p <= 0
            uint32x4_t _lemask = vcleq_f32(_p, _zero);
            float32x4_t _ps = vmulq_f32(_p, _slope);
            float32x4_t _outp = vbslq_f32(_lemask, _ps, _p);
            vst1q_f32(outptr, _outp);

            ptr += 4;
            outptr += 4;
        }
#else
        if (nn > 0)
        {
            // q1 = 0, q2 = slope broadcast, q3 = (p <= 0) mask, q4 = p * slope.
            // The load must post-increment %1 ("!"): input and output use
            // different pointers here, so without the writeback the input
            // pointer would never advance and every iteration (and the scalar
            // tail below) would read from the wrong address.
            asm volatile(
                "veor q1, q0, q0 \n"
                "vdup.f32 q2, %6 \n"
                "0: \n"
                "pld [%1, #128] \n"
                "vld1.f32 {d0-d1}, [%1 :128]! \n"
                "vcle.f32 q3, q0, q1 \n"
                "vmul.f32 q4, q0, q2 \n"
                "vbit.32 q0, q4, q3 \n"
                "subs %0, #1 \n"
                "vst1.f32 {d0-d1}, [%2 :128]! \n"
                "bne 0b \n"
                : "=r"(nn), // %0
                  "=r"(ptr), // %1
                  "=r"(outptr) // %2
                : "0"(nn),
                  "1"(ptr),
                  "2"(outptr),
                  "r"(slope) // %6
                : "cc", "memory", "q0", "q1", "q2", "q3", "q4"
            );
        }
#endif // __aarch64__
#endif // __ARM_NEON
        // scalar tail (or the whole channel without NEON)
        for (; remain>0; remain--)
        {
            if (*ptr < 0)
                *outptr = *ptr * slope;
            else
                *outptr = *ptr;

            ptr++;
            outptr++;
        }
    }

    return 0;
}

// In-place PReLU: negative values of bottom_top_blob are multiplied by the
// slope, other values are left unchanged. A per-channel slope is used when
// num_slope > 1, otherwise slope_data[0] is shared by every channel.
// Always returns 0.
int PReLU_arm::forward_inplace(Mat& bottom_top_blob) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;

const float* slope_data_ptr = slope_data;

#pragma omp parallel for
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
float slope = num_slope > 1 ? slope_data_ptr[q] : slope_data_ptr[0];

#if __ARM_NEON
// nn: number of 4-float vector iterations, remain: scalar leftovers
int nn = size >> 2;
int remain = size - (nn << 2);
#else
int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
float32x4_t _zero = vdupq_n_f32(0.f);
float32x4_t _slope = vdupq_n_f32(slope);
for (; nn>0; nn--)
{
float32x4_t _p = vld1q_f32(ptr);
// select p * slope in the lanes where p <= 0
uint32x4_t _lemask = vcleq_f32(_p, _zero);
float32x4_t _ps = vmulq_f32(_p, _slope);
_p = vbslq_f32(_lemask, _ps, _p);
vst1q_f32(ptr, _p);

ptr += 4;
}
#else
if (nn > 0)
{
// q1 = 0, q2 = slope broadcast, q3 = (p <= 0) mask, q4 = p * slope.
// The load deliberately has no writeback: the store goes through the same
// register %1 and performs the post-increment.
asm volatile(
"veor q1, q0, q0 \n"
"vdup.f32 q2, %4 \n"
"0: \n"
"pld [%1, #128] \n"
"vld1.f32 {d0-d1}, [%1 :128] \n"
"vcle.f32 q3, q0, q1 \n"
"vmul.f32 q4, q0, q2 \n"
"vbit.32 q0, q4, q3 \n"
"subs %0, #1 \n"
"vst1.f32 {d0-d1}, [%1 :128]! \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(ptr) // %1
: "0"(nn),
"1"(ptr),
"r"(slope) // %4
: "cc", "memory", "q0", "q1", "q2", "q3", "q4"
);
}
#endif // __aarch64__
#endif // __ARM_NEON
// scalar tail (or the whole channel without NEON)
for (; remain>0; remain--)
{
if (*ptr < 0)
*ptr *= slope;

ptr++;
}
}

return 0;
}

} // namespace ncnn

+ 32
- 0
src/layer/arm/prelu_arm.h View File

@@ -0,0 +1,32 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_PRELU_ARM_H
#define LAYER_PRELU_ARM_H

#include "prelu.h"

namespace ncnn {

// ARM-specific PReLU layer: derives from PReLU and overrides both forward
// variants.
class PReLU_arm : public PReLU
{
public:
// Out-of-place variant: writes the activation of bottom_blob into top_blob.
// The implementation returns 0 on success and -100 when top_blob is empty
// after creation.
virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

// In-place variant: overwrites bottom_top_blob with its activation.
virtual int forward_inplace(Mat& bottom_top_blob) const;
};

} // namespace ncnn

#endif // LAYER_PRELU_ARM_H

+ 295
- 0
src/layer/arm/relu_arm.cpp View File

@@ -0,0 +1,295 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "relu_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

namespace ncnn {

DEFINE_LAYER_CREATOR(ReLU_arm)

// Out-of-place ReLU / leaky ReLU.
//
// With slope == 0 the output is max(x, 0); otherwise negative inputs are
// multiplied by slope and other inputs are copied unchanged.
//
// Returns 0 on success and -100 when top_blob is empty after create().
int ReLU_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int size = w * h;

    top_blob.create(w, h, channels);
    if (top_blob.empty())
        return -100;

    if (slope == 0.f)
    {
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);

#if __ARM_NEON
            // nn: number of 4-float vector iterations, remain: scalar leftovers
            int nn = size >> 2;
            int remain = size - (nn << 2);
#else
            int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
            float32x4_t _zero = vdupq_n_f32(0.f);
            for (; nn>0; nn--)
            {
                float32x4_t _p = vld1q_f32(ptr);
                float32x4_t _outp = vmaxq_f32(_p, _zero);
                vst1q_f32(outptr, _outp);

                ptr += 4;
                outptr += 4;
            }
#else
            if (nn > 0)
            {
                // q1 = 0; each iteration stores max(q0, q1)
                asm volatile(
                    "veor q1, q0, q0 \n"
                    "0: \n"
                    "pld [%1, #128] \n"
                    "vld1.f32 {d0-d1}, [%1 :128]! \n"
                    "vmax.f32 q0, q0, q1 \n"
                    "subs %0, #1 \n"
                    "vst1.f32 {d0-d1}, [%2 :128]! \n"
                    "bne 0b \n"
                    : "=r"(nn), // %0
                      "=r"(ptr), // %1
                      "=r"(outptr) // %2
                    : "0"(nn),
                      "1"(ptr),
                      "2"(outptr)
                    : "cc", "memory", "q0", "q1"
                );
            }
#endif // __aarch64__
#endif // __ARM_NEON
            // scalar tail (or the whole channel without NEON)
            for (; remain>0; remain--)
            {
                *outptr = std::max(*ptr, 0.f);

                ptr++;
                outptr++;
            }
        }
    }
    else
    {
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);

#if __ARM_NEON
            int nn = size >> 2;
            int remain = size - (nn << 2);
#else
            int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
            float32x4_t _zero = vdupq_n_f32(0.f);
            float32x4_t _slope = vdupq_n_f32(slope);
            for (; nn>0; nn--)
            {
                float32x4_t _p = vld1q_f32(ptr);
                // select p * slope in the lanes where p <= 0
                uint32x4_t _lemask = vcleq_f32(_p, _zero);
                float32x4_t _ps = vmulq_f32(_p, _slope);
                float32x4_t _outp = vbslq_f32(_lemask, _ps, _p);
                vst1q_f32(outptr, _outp);

                ptr += 4;
                outptr += 4;
            }
#else
            if (nn > 0)
            {
                // q1 = 0, q2 = slope broadcast, q3 = (p <= 0) mask,
                // q4 = p * slope. The load must post-increment %1 ("!"):
                // input and output use different pointers here, so without
                // the writeback the input pointer would never advance and
                // every iteration (and the scalar tail below) would read
                // from the wrong address.
                asm volatile(
                    "veor q1, q0, q0 \n"
                    "vdup.f32 q2, %6 \n"
                    "0: \n"
                    "pld [%1, #128] \n"
                    "vld1.f32 {d0-d1}, [%1 :128]! \n"
                    "vcle.f32 q3, q0, q1 \n"
                    "vmul.f32 q4, q0, q2 \n"
                    "vbit.32 q0, q4, q3 \n"
                    "subs %0, #1 \n"
                    "vst1.f32 {d0-d1}, [%2 :128]! \n"
                    "bne 0b \n"
                    : "=r"(nn), // %0
                      "=r"(ptr), // %1
                      "=r"(outptr) // %2
                    : "0"(nn),
                      "1"(ptr),
                      "2"(outptr),
                      "r"(slope) // %6
                    : "cc", "memory", "q0", "q1", "q2", "q3", "q4"
                );
            }
#endif // __aarch64__
#endif // __ARM_NEON
            // scalar tail (or the whole channel without NEON)
            for (; remain>0; remain--)
            {
                if (*ptr < 0)
                    *outptr = *ptr * slope;
                else
                    *outptr = *ptr;

                ptr++;
                outptr++;
            }
        }
    }

    return 0;
}

// In-place ReLU / leaky ReLU.
//
// With slope == 0 every value becomes max(x, 0); otherwise negative values
// are multiplied by slope and other values are left unchanged.
// Always returns 0.
int ReLU_arm::forward_inplace(Mat& bottom_top_blob) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;

if (slope == 0.f)
{
#pragma omp parallel for
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);

#if __ARM_NEON
// nn: number of 4-float vector iterations, remain: scalar leftovers
int nn = size >> 2;
int remain = size - (nn << 2);
#else
int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
float32x4_t _zero = vdupq_n_f32(0.f);
for (; nn>0; nn--)
{
float32x4_t _p = vld1q_f32(ptr);
_p = vmaxq_f32(_p, _zero);
vst1q_f32(ptr, _p);

ptr += 4;
}
#else
if (nn > 0)
{
// q1 = 0; each iteration stores max(q0, q1). The load has no writeback:
// the store goes through the same register %1 and post-increments it.
asm volatile(
"veor q1, q0, q0 \n"
"0: \n"
"pld [%1, #128] \n"
"vld1.f32 {d0-d1}, [%1 :128] \n"
"vmax.f32 q0, q0, q1 \n"
"subs %0, #1 \n"
"vst1.f32 {d0-d1}, [%1 :128]! \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(ptr) // %1
: "0"(nn),
"1"(ptr)
: "cc", "memory", "q0", "q1"
);
}
#endif // __aarch64__
#endif // __ARM_NEON
// scalar tail (or the whole channel without NEON)
for (; remain>0; remain--)
{
*ptr = std::max(*ptr, 0.f);

ptr++;
}
}
}
else
{
#pragma omp parallel for
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);

#if __ARM_NEON
int nn = size >> 2;
int remain = size - (nn << 2);
#else
int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
float32x4_t _zero = vdupq_n_f32(0.f);
float32x4_t _slope = vdupq_n_f32(slope);
for (; nn>0; nn--)
{
float32x4_t _p = vld1q_f32(ptr);
// select p * slope in the lanes where p <= 0
uint32x4_t _lemask = vcleq_f32(_p, _zero);
float32x4_t _ps = vmulq_f32(_p, _slope);
_p = vbslq_f32(_lemask, _ps, _p);
vst1q_f32(ptr, _p);

ptr += 4;
}
#else
if (nn > 0)
{
// q1 = 0, q2 = slope broadcast, q3 = (p <= 0) mask, q4 = p * slope.
// As above, only the store post-increments the shared pointer %1.
asm volatile(
"veor q1, q0, q0 \n"
"vdup.f32 q2, %4 \n"
"0: \n"
"pld [%1, #128] \n"
"vld1.f32 {d0-d1}, [%1 :128] \n"
"vcle.f32 q3, q0, q1 \n"
"vmul.f32 q4, q0, q2 \n"
"vbit.32 q0, q4, q3 \n"
"subs %0, #1 \n"
"vst1.f32 {d0-d1}, [%1 :128]! \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(ptr) // %1
: "0"(nn),
"1"(ptr),
"r"(slope) // %4
: "cc", "memory", "q0", "q1", "q2", "q3", "q4"
);
}
#endif // __aarch64__
#endif // __ARM_NEON
// scalar tail (or the whole channel without NEON)
for (; remain>0; remain--)
{
if (*ptr < 0)
*ptr *= slope;

ptr++;
}
}
}

return 0;
}

} // namespace ncnn

+ 32
- 0
src/layer/arm/relu_arm.h View File

@@ -0,0 +1,32 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_RELU_ARM_H
#define LAYER_RELU_ARM_H

#include "relu.h"

namespace ncnn {

// ARM-specific ReLU layer: derives from ReLU and overrides both forward
// variants.
class ReLU_arm : public ReLU
{
public:
// Out-of-place variant: writes the activation of bottom_blob into top_blob.
// The implementation returns 0 on success and -100 when top_blob is empty
// after creation.
virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

// In-place variant: overwrites bottom_top_blob with its activation.
virtual int forward_inplace(Mat& bottom_top_blob) const;
};

} // namespace ncnn

#endif // LAYER_RELU_ARM_H

+ 211
- 0
src/layer/arm/scale_arm.cpp View File

@@ -0,0 +1,211 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "scale_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

namespace ncnn {

DEFINE_LAYER_CREATOR(Scale_arm)

int Scale_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
int size = w * h;

top_blob.create(w, h, channels);
if (top_blob.empty())
return -100;

if (bias_term)
{
const float* scale_ptr = scale_data;
const float* bias_ptr = bias_data;
#pragma omp parallel for
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);
float* outptr = top_blob.channel(q);

float s = scale_ptr[q];
float bias = bias_ptr[q];

#if __ARM_NEON
int nn = size >> 2;
int remain = size - (nn << 2);
#else
int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
float32x4_t _s = vdupq_n_f32(s);
float32x4_t _bias = vdupq_n_f32(bias);
for (; nn>0; nn--)
{
float32x4_t _p = vld1q_f32(ptr);
_p = vmlaq_f32(_bias, _p, _s);
vst1q_f32(outptr, _p);

ptr += 4;
outptr += 4;
}
#endif // __ARM_NEON

for (; remain>0; remain--)
{
*outptr = *ptr * s + bias;

ptr++;
outptr++;
}
}
}
else
{
const float* scale_ptr = scale_data;
#pragma omp parallel for
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);
float* outptr = top_blob.channel(q);

float s = scale_ptr[q];

#if __ARM_NEON
int nn = size >> 2;
int remain = size - (nn << 2);
#else
int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
float32x4_t _s = vdupq_n_f32(s);
for (; nn>0; nn--)
{
float32x4_t _p = vld1q_f32(ptr);
_p = vmulq_f32(_p, _s);
vst1q_f32(outptr, _p);

ptr += 4;
outptr += 4;
}
#endif // __ARM_NEON

for (; remain>0; remain--)
{
*outptr = *ptr * s;

ptr++;
outptr++;
}
}
}

return 0;
}

// In-place per-channel scaling: x = x * scale[q], plus bias[q] when
// bias_term is set. Always returns 0.
int Scale_arm::forward_inplace(Mat& bottom_top_blob) const
{
    const int channels = bottom_top_blob.c;
    const int size = bottom_top_blob.w * bottom_top_blob.h;

    const float* scales = scale_data;

    if (!bias_term)
    {
        // scale only
        #pragma omp parallel for
        for (int q = 0; q < channels; q++)
        {
            float* data = bottom_top_blob.channel(q);
            const float s = scales[q];

            int i = 0;
#if __ARM_NEON
            // four floats per iteration
            const float32x4_t _s = vdupq_n_f32(s);
            for (; i <= size - 4; i += 4)
            {
                const float32x4_t _v = vld1q_f32(data + i);
                vst1q_f32(data + i, vmulq_f32(_v, _s));
            }
#endif // __ARM_NEON

            // leftovers (or everything without NEON)
            for (; i < size; i++)
            {
                data[i] *= s;
            }
        }

        return 0;
    }

    // scale and bias
    const float* biases = bias_data;

    #pragma omp parallel for
    for (int q = 0; q < channels; q++)
    {
        float* data = bottom_top_blob.channel(q);
        const float s = scales[q];
        const float bias = biases[q];

        int i = 0;
#if __ARM_NEON
        // four floats per iteration: bias + x * s
        const float32x4_t _s = vdupq_n_f32(s);
        const float32x4_t _bias = vdupq_n_f32(bias);
        for (; i <= size - 4; i += 4)
        {
            const float32x4_t _v = vld1q_f32(data + i);
            vst1q_f32(data + i, vmlaq_f32(_bias, _v, _s));
        }
#endif // __ARM_NEON

        // leftovers (or everything without NEON)
        for (; i < size; i++)
        {
            data[i] = data[i] * s + bias;
        }
    }

    return 0;
}

} // namespace ncnn

+ 32
- 0
src/layer/arm/scale_arm.h View File

@@ -0,0 +1,32 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_SCALE_ARM_H
#define LAYER_SCALE_ARM_H

#include "scale.h"

namespace ncnn {

// ARM-specific Scale layer: derives from Scale and overrides both forward
// variants.
class Scale_arm : public Scale
{
public:
// Out-of-place variant: writes the scaled bottom_blob into top_blob.
// The implementation returns 0 on success and -100 when top_blob is empty
// after creation.
virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

// In-place variant: scales bottom_top_blob where it is.
virtual int forward_inplace(Mat& bottom_top_blob) const;
};

} // namespace ncnn

#endif // LAYER_SCALE_ARM_H

+ 127
- 0
src/layer/arm/sigmoid_arm.cpp View File

@@ -0,0 +1,127 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "sigmoid_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#include "neon_mathfun.h"
#endif // __ARM_NEON

#include <math.h>

namespace ncnn {

DEFINE_LAYER_CREATOR(Sigmoid_arm)

// Out-of-place sigmoid: top = 1 / (1 + exp(-bottom)).
//
// The NEON path uses exp_ps and an approximate reciprocal (hardware estimate
// plus one Newton-Raphson step); the scalar path uses exp() and a division.
//
// Returns 0 on success and -100 when top_blob is empty after create().
int Sigmoid_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
{
    const int channels = bottom_blob.c;
    const int size = bottom_blob.w * bottom_blob.h;

    top_blob.create(bottom_blob.w, bottom_blob.h, channels);
    if (top_blob.empty())
        return -100;

    #pragma omp parallel for
    for (int q = 0; q < channels; q++)
    {
        const float* in = bottom_blob.channel(q);
        float* out = top_blob.channel(q);

        int i = 0;
#if __ARM_NEON
        const float32x4_t _one = vdupq_n_f32(1.f);
        for (; i <= size - 4; i += 4)
        {
            // denominator: 1 + exp(-x)
            const float32x4_t _x = vld1q_f32(in + i);
            const float32x4_t _denom = vaddq_f32(exp_ps(vnegq_f32(_x)), _one);

            // 1 / denominator: estimate, then one refinement step
            float32x4_t _inv = vrecpeq_f32(_denom);
            _inv = vmulq_f32(vrecpsq_f32(_denom, _inv), _inv);

            vst1q_f32(out + i, _inv);
        }
#endif // __ARM_NEON

        // leftovers (or everything without NEON)
        for (; i < size; i++)
        {
            out[i] = 1.f / (1.f + exp(-in[i]));
        }
    }

    return 0;
}

// In-place sigmoid: x = 1 / (1 + exp(-x)). Always returns 0.
//
// The NEON path uses exp_ps and an approximate reciprocal (hardware estimate
// plus one Newton-Raphson step); the scalar path uses exp() and a division.
int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

#if __ARM_NEON
        // nn: number of 4-float vector iterations, remain: scalar leftovers
        int nn = size >> 2;
        int remain = size - (nn << 2);
#else
        int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
        float32x4_t _one = vdupq_n_f32(1.f);
        for (; nn>0; nn--)
        {
            // denominator: 1 + exp(-x)
            float32x4_t _p = vld1q_f32(ptr);
            _p = vnegq_f32(_p);
            _p = exp_ps(_p);
            _p = vaddq_f32(_p, _one);

            // The reciprocal estimate must live in its own register: the
            // refinement step vrecpsq_f32(d, est) = 2 - d * est needs the
            // original denominator d, so _p must not be overwritten before it.
            float32x4_t _outp = vrecpeq_f32(_p);
            _outp = vmulq_f32(vrecpsq_f32(_p, _outp), _outp);
            vst1q_f32(ptr, _outp);

            ptr += 4;
        }
#endif // __ARM_NEON
        // scalar tail (or the whole channel without NEON)
        for (; remain>0; remain--)
        {
            *ptr = 1.f / (1.f + exp(-*ptr));

            ptr++;
        }
    }

    return 0;
}

} // namespace ncnn

+ 32
- 0
src/layer/arm/sigmoid_arm.h View File

@@ -0,0 +1,32 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_SIGMOID_ARM_H
#define LAYER_SIGMOID_ARM_H

#include "sigmoid.h"

namespace ncnn {

// ARM-specific subclass of Sigmoid that overrides both forward paths.
class Sigmoid_arm : public Sigmoid
{
public:
// Out-of-place forward: reads bottom_blob, writes the result into top_blob.
virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

// In-place forward: overwrites bottom_top_blob with the result.
virtual int forward_inplace(Mat& bottom_top_blob) const;
};

} // namespace ncnn

#endif // LAYER_SIGMOID_ARM_H

+ 102
- 0
src/layer/arm/slice_arm.cpp View File

@@ -0,0 +1,102 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "slice_arm.h"
#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

namespace ncnn {

DEFINE_LAYER_CREATOR(Slice_arm)

// Split bottom_blobs[0] along the channel axis into top_blobs.size() outputs.
//
// The channel count of output i comes from slices[i]; the sentinel -233 means
// "divide the channels not yet consumed evenly among the remaining outputs".
// Each output is created as (w, h, slice) and filled by copying
// cstep * slice floats starting at input channel q.
//
// Returns 0 on success, -100 if an output blob is empty after create().
int Slice_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;

    // q is the index of the first input channel not yet handed to an output
    int q = 0;
    const int* slices_ptr = (const int*)slices.data;
    for (size_t i=0; i<top_blobs.size(); i++)
    {
        int slice = slices_ptr[i];
        if (slice == -233)
        {
            slice = (channels - q) / (top_blobs.size() - i);
        }

        Mat& top_blob = top_blobs[i];
        top_blob.create(w, h, slice);
        if (top_blob.empty())
            return -100;

        // number of floats to copy, including any per-channel padding in cstep
        int size = bottom_blob.cstep * slice;

        const float* ptr = bottom_blob.channel(q);
        float* outptr = top_blob.data;

#if __ARM_NEON
        // eight floats per vector iteration, leftovers go to the scalar loop
        int nn = size >> 3;
        int remain = size - (nn << 3);
#else
        int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        for (; nn>0; nn--)
        {
            float32x4_t _p = vld1q_f32(ptr);
            float32x4_t _p2 = vld1q_f32(ptr+4);
            vst1q_f32(outptr, _p);
            vst1q_f32(outptr+4, _p2);

            ptr += 8;
            outptr += 8;
        }
#else
        if (nn > 0)
        {
        // Copies 32 bytes (d0-d3) per iteration. The ":128" qualifiers tell
        // the CPU both addresses are 16-byte aligned.
        // d0-d3 alias q0 AND q1, so both must be listed as clobbered;
        // listing only q0 lets the compiler keep a live value in q1 across
        // the asm statement and have it silently overwritten.
        asm volatile(
            "0:                             \n"
            "pld        [%1, #256]          \n"
            "vld1.f32   {d0-d3}, [%1 :128]! \n"
            "subs       %0, #1              \n"
            "vst1.f32   {d0-d3}, [%2 :128]! \n"
            "bne        0b                  \n"
            : "=r"(nn),     // %0
              "=r"(ptr),    // %1
              "=r"(outptr)  // %2
            : "0"(nn),
              "1"(ptr),
              "2"(outptr)
            : "cc", "memory", "q0", "q1"
        );
        }
#endif // __aarch64__
#endif // __ARM_NEON
        for (; remain>0; remain--)
        {
            *outptr++ = *ptr++;
        }

        q += slice;
    }

    return 0;
}

} // namespace ncnn

+ 30
- 0
src/layer/arm/slice_arm.h View File

@@ -0,0 +1,30 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_SLICE_ARM_H
#define LAYER_SLICE_ARM_H

#include "slice.h"

namespace ncnn {

// ARM-specific subclass of Slice that overrides the multi-blob forward path.
class Slice_arm : public Slice
{
public:
// Reads bottom_blobs and fills top_blobs; see slice_arm.cpp for the implementation.
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
};

} // namespace ncnn

#endif // LAYER_SLICE_ARM_H

+ 302
- 0
src/layer/arm/softmax_arm.cpp View File

@@ -0,0 +1,302 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "softmax_arm.h"
#include <float.h>
#include <math.h>

#if __ARM_NEON
#include <arm_neon.h>
#include "neon_mathfun.h"
#endif // __ARM_NEON

namespace ncnn {

DEFINE_LAYER_CREATOR(Softmax_arm)

// Softmax across channels, out of place.
// For every spatial position i in [0, w*h) the values of all channels at i are
// normalised: out = exp(in - max_over_channels) / sum_over_channels(exp(...)).
// Returns 0 on success, -100 if top_blob or a temporary (max, sum) is empty
// after create().
int Softmax_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
{
// value = exp( value - global max value )
// sum all value
// value = value / sum

int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
int size = w * h;

top_blob.create(w, h, channels);
if (top_blob.empty())
return -100;

// Stage 1: per-position maximum over all channels (serial, w x h temporary).
Mat max;
max.create(w, h);
if (max.empty())
return -100;
max.fill(-FLT_MAX);
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);
float* maxptr = max;

for (int i=0; i<size; i++)
{
maxptr[i] = std::max(maxptr[i], ptr[i]);
}
}

// Stage 2: top = exp(bottom - max), one channel per OpenMP iteration.
#pragma omp parallel for
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);
float* outptr = top_blob.channel(q);
float* maxptr = max;

#if __ARM_NEON
// nn = number of 4-float vector iterations, remain = scalar leftovers
int nn = size >> 2;
int remain = size - (nn << 2);
#else
int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
for (; nn>0; nn--)
{
float32x4_t _p = vld1q_f32(ptr);
float32x4_t _max = vld1q_f32(maxptr);

_p = exp_ps(vsubq_f32(_p, _max));

vst1q_f32(outptr, _p);

ptr += 4;
maxptr += 4;
outptr += 4;
}
#endif // __ARM_NEON

for (; remain>0; remain--)
{
*outptr = exp(*ptr - *maxptr);

ptr++;
maxptr++;
outptr++;
}
}

// Stage 3: per-position sum of the exponentials over all channels (serial,
// because every channel accumulates into the same w x h buffer).
Mat sum;
sum.create(w, h);
if (sum.empty())
return -100;
sum.fill(0.f);
for (int q=0; q<channels; q++)
{
const float* outptr = top_blob.channel(q);
float* sumptr = sum;

for (int i=0; i<size; i++)
{
sumptr[i] += outptr[i];
}
}

// Stage 4: divide every exponential by the sum at its position.
#pragma omp parallel for
for (int q=0; q<channels; q++)
{
float* outptr = top_blob.channel(q);
float* sumptr = sum;

#if __ARM_NEON
int nn = size >> 2;
int remain = size - (nn << 2);
#else
int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
for (; nn>0; nn--)
{
float32x4_t _p = vld1q_f32(outptr);
float32x4_t _sum = vld1q_f32(sumptr);
#if __aarch64__
_p = vdivq_f32(_p, _sum);
#else
// no vector divide intrinsic is used on 32-bit ARM; div_ps is the helper used instead
_p = div_ps(_p, _sum);
#endif // __aarch64__
vst1q_f32(outptr, _p);

outptr += 4;
sumptr += 4;
}
#endif // __ARM_NEON

for (; remain>0; remain--)
{
*outptr /= *sumptr;

outptr++;
sumptr++;
}
}

return 0;
}

// Softmax across channels, in place.
// For every spatial position i in [0, w*h) the values of all channels at i are
// replaced by exp(v - max_over_channels) / sum_over_channels(exp(...)).
// Returns 0 on success, -100 if a temporary (max, sum) is empty after create().
int Softmax_arm::forward_inplace(Mat& bottom_top_blob) const
{
// value = exp( value - global max value )
// sum all value
// value = value / sum

int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;

// Stage 1: per-position maximum over all channels (serial, w x h temporary).
Mat max;
max.create(w, h);
if (max.empty())
return -100;
max.fill(-FLT_MAX);
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
float* maxptr = max;

for (int i=0; i<size; i++)
{
maxptr[i] = std::max(maxptr[i], ptr[i]);
}
}

// Stage 2: overwrite each value with exp(value - max), one channel per OpenMP iteration.
#pragma omp parallel for
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
float* maxptr = max;

#if __ARM_NEON
// nn = number of 4-float vector iterations, remain = scalar leftovers
int nn = size >> 2;
int remain = size - (nn << 2);
#else
int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
for (; nn>0; nn--)
{
float32x4_t _p = vld1q_f32(ptr);
float32x4_t _max = vld1q_f32(maxptr);

_p = exp_ps(vsubq_f32(_p, _max));

vst1q_f32(ptr, _p);

ptr += 4;
maxptr += 4;
}
#endif // __ARM_NEON

for (; remain>0; remain--)
{
*ptr = exp(*ptr - *maxptr);

ptr++;
maxptr++;
}
}

// Stage 3: per-position sum of the exponentials over all channels (serial,
// because every channel accumulates into the same w x h buffer).
Mat sum;
sum.create(w, h);
if (sum.empty())
return -100;
sum.fill(0.f);
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
float* sumptr = sum;

#if __ARM_NEON
int nn = size >> 2;
int remain = size - (nn << 2);
#else
int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
for (; nn>0; nn--)
{
float32x4_t _p = vld1q_f32(ptr);
float32x4_t _sum = vld1q_f32(sumptr);
_sum = vaddq_f32(_sum, _p);
vst1q_f32(sumptr, _sum);

ptr += 4;
sumptr += 4;
}
#endif // __ARM_NEON

for (; remain>0; remain--)
{
*sumptr += *ptr;

ptr++;
sumptr++;
}
}

// Stage 4: divide every exponential by the sum at its position.
#pragma omp parallel for
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
float* sumptr = sum;

#if __ARM_NEON
int nn = size >> 2;
int remain = size - (nn << 2);
#else
int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
for (; nn>0; nn--)
{
float32x4_t _p = vld1q_f32(ptr);
float32x4_t _sum = vld1q_f32(sumptr);
#if __aarch64__
_p = vdivq_f32(_p, _sum);
#else
// no vector divide intrinsic is used on 32-bit ARM; div_ps is the helper used instead
_p = div_ps(_p, _sum);
#endif // __aarch64__
vst1q_f32(ptr, _p);

ptr += 4;
sumptr += 4;
}
#endif // __ARM_NEON

for (; remain>0; remain--)
{
*ptr /= *sumptr;

ptr++;
sumptr++;
}
}

return 0;
}

} // namespace ncnn

+ 32
- 0
src/layer/arm/softmax_arm.h View File

@@ -0,0 +1,32 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_SOFTMAX_ARM_H
#define LAYER_SOFTMAX_ARM_H

#include "softmax.h"

namespace ncnn {

// ARM-specific subclass of Softmax that overrides both forward paths.
class Softmax_arm : public Softmax
{
public:
// Out-of-place forward: reads bottom_blob, writes the result into top_blob.
virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

// In-place forward: overwrites bottom_top_blob with the result.
virtual int forward_inplace(Mat& bottom_top_blob) const;
};

} // namespace ncnn

#endif // LAYER_SOFTMAX_ARM_H

+ 227
- 0
src/layer/batchnorm.cpp View File

@@ -0,0 +1,227 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "batchnorm.h"
#include <math.h>

namespace ncnn {

DEFINE_LAYER_CREATOR(BatchNorm)

// Marks the layer as single-input/single-output and as supporting in-place forward.
BatchNorm::BatchNorm()
{
one_blob_only = true;
support_inplace = true;
}

// Nothing to release explicitly here; the body is intentionally empty.
BatchNorm::~BatchNorm()
{
}

#if NCNN_STDIO
#if NCNN_STRING
// Parse the text-format parameters: a single integer, the channel count.
// Returns 0 on success; logs to stderr and returns -1 if the integer cannot
// be scanned.
int BatchNorm::load_param(FILE* paramfp)
{
    const int fields_read = fscanf(paramfp, "%d", &channels);
    if (fields_read == 1)
        return 0;

    fprintf(stderr, "BatchNorm load_param failed %d\n", fields_read);
    return -1;
}
#endif // NCNN_STRING
// Read the binary-format parameters: one int, the channel count.
// Returns 0 on success; logs to stderr and returns -1 on a short read, so a
// truncated param file is reported instead of leaving channels unset.
int BatchNorm::load_param_bin(FILE* paramfp)
{
    size_t nread = fread(&channels, sizeof(int), 1, paramfp);
    if (nread != 1)
    {
        fprintf(stderr, "BatchNorm load_param_bin failed %d\n", (int)nread);
        return -1;
    }

    return 0;
}

// Load the model weights from a binary stream and precompute the affine form.
// Reads, in this order, `channels` floats each of slope, mean, variance and
// bias, then derives a_data / b_data so that forward is value * b + a.
// Returns 0 on success, -100 if a Mat is empty after create(), -1 (after
// logging to stderr) if a read does not complete.
int BatchNorm::load_model(FILE* binfp)
{
int nread;

slope_data.create(channels);
if (slope_data.empty())
return -100;
// each array is read as a single item of channels * sizeof(float) bytes,
// so fread returns 1 on a complete read
nread = fread(slope_data, channels * sizeof(float), 1, binfp);
if (nread != 1)
{
fprintf(stderr, "BatchNorm read slope_data failed %d\n", nread);
return -1;
}

mean_data.create(channels);
if (mean_data.empty())
return -100;
nread = fread(mean_data, channels * sizeof(float), 1, binfp);
if (nread != 1)
{
fprintf(stderr, "BatchNorm read mean_data failed %d\n", nread);
return -1;
}

var_data.create(channels);
if (var_data.empty())
return -100;
nread = fread(var_data, channels * sizeof(float), 1, binfp);
if (nread != 1)
{
fprintf(stderr, "BatchNorm read var_data failed %d\n", nread);
return -1;
}

bias_data.create(channels);
if (bias_data.empty())
return -100;
nread = fread(bias_data, channels * sizeof(float), 1, binfp);
if (nread != 1)
{
fprintf(stderr, "BatchNorm read bias_data failed %d\n", nread);
return -1;
}

a_data.create(channels);
if (a_data.empty())
return -100;
b_data.create(channels);
if (b_data.empty())
return -100;
const float* slope_data_ptr = slope_data;
const float* mean_data_ptr = mean_data;
const float* var_data_ptr = var_data;
const float* bias_data_ptr = bias_data;
float* a_data_ptr = a_data;
float* b_data_ptr = b_data;
// a = bias - slope * mean / sqrt(var), b = slope / sqrt(var)
// NOTE(review): no epsilon is added, so a zero variance divides by zero here.
for (int i=0; i<channels; i++)
{
float sqrt_var = sqrt(var_data_ptr[i]);
a_data_ptr[i] = bias_data_ptr[i] - slope_data_ptr[i] * mean_data_ptr[i] / sqrt_var;
b_data_ptr[i] = slope_data_ptr[i] / sqrt_var;
}

return 0;
}
#endif // NCNN_STDIO

// Read the channel count from an in-memory parameter buffer and advance the
// cursor past the 4 bytes consumed. Always returns 0.
int BatchNorm::load_param(const unsigned char*& mem)
{
    const int* params = (const int*)mem;

    channels = params[0];
    mem += 4;

    return 0;
}

// Load the model weights from an in-memory buffer and precompute the affine form.
// Consumes, in this order, `channels` floats each of slope, mean, variance and
// bias, advancing `mem` past each array, then derives a_data / b_data so that
// forward is value * b + a.
// Returns 0 on success, -100 if a_data or b_data is empty after create().
int BatchNorm::load_model(const unsigned char*& mem)
{
// The four input arrays are wrapped with the Mat(size, float*) constructor.
// NOTE(review): presumably this references `mem` without copying, so the
// buffer would have to outlive the layer — confirm against Mat.
slope_data = Mat(channels, (float*)mem);
mem += channels * sizeof(float);

mean_data = Mat(channels, (float*)mem);
mem += channels * sizeof(float);

var_data = Mat(channels, (float*)mem);
mem += channels * sizeof(float);

bias_data = Mat(channels, (float*)mem);
mem += channels * sizeof(float);

a_data.create(channels);
if (a_data.empty())
return -100;
b_data.create(channels);
if (b_data.empty())
return -100;
const float* slope_data_ptr = slope_data;
const float* mean_data_ptr = mean_data;
const float* var_data_ptr = var_data;
const float* bias_data_ptr = bias_data;
float* a_data_ptr = a_data;
float* b_data_ptr = b_data;
// a = bias - slope * mean / sqrt(var), b = slope / sqrt(var)
// NOTE(review): no epsilon is added, so a zero variance divides by zero here.
for (int i=0; i<channels; i++)
{
float sqrt_var = sqrt(var_data_ptr[i]);
a_data_ptr[i] = bias_data_ptr[i] - slope_data_ptr[i] * mean_data_ptr[i] / sqrt_var;
b_data_ptr[i] = slope_data_ptr[i] / sqrt_var;
}

return 0;
}

int BatchNorm::forward(const Mat& bottom_blob, Mat& top_blob) const
{
// a = bias - slope * mean / sqrt(var)
// b = slope / sqrt(var)
// value = b * value + a

int w = bottom_blob.w;
int h = bottom_blob.h;
int size = w * h;

top_blob.create(w, h, channels);
if (top_blob.empty())
return -100;

const float* a_data_ptr = a_data;
const float* b_data_ptr = b_data;
#pragma omp parallel for
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);
float* outptr = top_blob.channel(q);

float a = a_data_ptr[q];
float b = b_data_ptr[q];

for (int i=0; i<size; i++)
{
outptr[i] = b * ptr[i] + a;
}
}

return 0;
}

// In-place batch normalisation using the precomputed per-channel
// coefficients: value = b_data[q] * value + a_data[q].
// Iterates over `channels` (the layer parameter) channels of the blob.
// Always returns 0.
int BatchNorm::forward_inplace(Mat& bottom_top_blob) const
{
    const int w = bottom_top_blob.w;
    const int h = bottom_top_blob.h;
    const int size = w * h;

    const float* offsets = a_data;
    const float* scales = b_data;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* cursor = bottom_top_blob.channel(q);

        const float offset = offsets[q];
        const float scale = scales[q];

        for (int left = size; left > 0; left--)
        {
            *cursor = scale * *cursor + offset;

            cursor++;
        }
    }

    return 0;
}

} // namespace ncnn

+ 58
- 0
src/layer/batchnorm.h View File

@@ -0,0 +1,58 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_BATCHNORM_H
#define LAYER_BATCHNORM_H

#include "layer.h"

namespace ncnn {

// Batch-normalisation layer: applies a per-channel affine transform whose
// coefficients are derived from slope, mean, variance and bias at load time.
class BatchNorm : public Layer
{
public:
BatchNorm();
virtual ~BatchNorm();

#if NCNN_STDIO
#if NCNN_STRING
// text-format parameter loader
virtual int load_param(FILE* paramfp);
#endif // NCNN_STRING
// binary-format parameter loader
virtual int load_param_bin(FILE* paramfp);
// weight loader reading from a stream
virtual int load_model(FILE* binfp);
#endif // NCNN_STDIO
// in-memory loaders; `mem` is advanced past the bytes consumed
virtual int load_param(const unsigned char*& mem);
virtual int load_model(const unsigned char*& mem);

virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

virtual int forward_inplace(Mat& bottom_top_blob) const;

public:
// param
// number of channels; also the length of every model array below
int channels;

// model
Mat slope_data;
Mat mean_data;
Mat var_data;
Mat bias_data;

// precomputed by load_model:
// a = bias - slope * mean / sqrt(var), b = slope / sqrt(var)
Mat a_data;
Mat b_data;
};

} // namespace ncnn

#endif // LAYER_BATCHNORM_H

+ 139
- 0
src/layer/bias.cpp View File

@@ -0,0 +1,139 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "bias.h"

namespace ncnn {

DEFINE_LAYER_CREATOR(Bias)

// Marks the layer as single-input/single-output and as supporting in-place forward.
Bias::Bias()
{
one_blob_only = true;
support_inplace = true;
}

// Nothing to release explicitly here; the body is intentionally empty.
Bias::~Bias()
{
}

#if NCNN_STDIO
#if NCNN_STRING
// Parse the text-format parameters: a single integer, the bias element count.
// Returns 0 on success; logs to stderr and returns -1 if the integer cannot
// be scanned.
int Bias::load_param(FILE* paramfp)
{
    const int fields_read = fscanf(paramfp, "%d", &bias_data_size);
    if (fields_read == 1)
        return 0;

    fprintf(stderr, "Bias load_param failed %d\n", fields_read);
    return -1;
}
#endif // NCNN_STRING
// Read the binary-format parameters: one int, the bias element count.
// Returns 0 on success; logs to stderr and returns -1 on a short read, so a
// truncated param file is reported instead of leaving bias_data_size unset.
int Bias::load_param_bin(FILE* paramfp)
{
    size_t nread = fread(&bias_data_size, sizeof(int), 1, paramfp);
    if (nread != 1)
    {
        fprintf(stderr, "Bias load_param_bin failed %d\n", (int)nread);
        return -1;
    }

    return 0;
}

// Load bias_data_size floats of bias from a binary stream.
// Returns 0 on success, -100 if bias_data is empty after create(), -1 (after
// logging to stderr) if the read does not complete.
int Bias::load_model(FILE* binfp)
{
    bias_data.create(bias_data_size);
    if (bias_data.empty())
        return -100;

    // the whole array is requested as one item, so a complete read returns 1
    const int nread = fread(bias_data, bias_data_size * sizeof(float), 1, binfp);
    if (nread == 1)
        return 0;

    fprintf(stderr, "Bias read bias_data failed %d\n", nread);
    return -1;
}
#endif // NCNN_STDIO

// Read the bias element count from an in-memory parameter buffer and advance
// the cursor past the 4 bytes consumed. Always returns 0.
int Bias::load_param(const unsigned char*& mem)
{
    const int* params = (const int*)mem;

    bias_data_size = params[0];
    mem += 4;

    return 0;
}

// Wrap bias_data_size floats at the cursor as bias_data and advance the
// cursor past them. Always returns 0.
int Bias::load_model(const unsigned char*& mem)
{
    float* bias_values = (float*)mem;

    bias_data = Mat(bias_data_size, bias_values);
    mem += bias_data_size * sizeof(float);

    return 0;
}

// Out-of-place bias add: every element of channel q gets bias_data[q] added.
// top_blob is created with the same w, h and c as bottom_blob.
// Returns 0 on success, -100 if top_blob is empty after create().
int Bias::forward(const Mat& bottom_blob, Mat& top_blob) const
{
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int channels = bottom_blob.c;
    const int size = w * h;

    top_blob.create(w, h, channels);
    if (top_blob.empty())
        return -100;

    const float* bias_ptr = bias_data;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        const float* src = bottom_blob.channel(q);
        float* dst = top_blob.channel(q);

        const float offset = bias_ptr[q];

        for (int left = size; left > 0; left--)
        {
            *dst = *src + offset;

            src++;
            dst++;
        }
    }

    return 0;
}

// In-place bias add: every element of channel q gets bias_data[q] added.
// Always returns 0.
int Bias::forward_inplace(Mat& bottom_top_blob) const
{
    const int w = bottom_top_blob.w;
    const int h = bottom_top_blob.h;
    const int channels = bottom_top_blob.c;
    const int size = w * h;

    const float* bias_ptr = bias_data;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* cursor = bottom_top_blob.channel(q);

        const float offset = bias_ptr[q];

        for (int left = size; left > 0; left--)
        {
            *cursor += offset;

            cursor++;
        }
    }

    return 0;
}

} // namespace ncnn

+ 52
- 0
src/layer/bias.h View File

@@ -0,0 +1,52 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_BIAS_H
#define LAYER_BIAS_H

#include "layer.h"

namespace ncnn {

// Bias layer: adds one constant per channel to every element of that channel.
class Bias : public Layer
{
public:
Bias();
virtual ~Bias();

#if NCNN_STDIO
#if NCNN_STRING
// text-format parameter loader
virtual int load_param(FILE* paramfp);
#endif // NCNN_STRING
// binary-format parameter loader
virtual int load_param_bin(FILE* paramfp);
// weight loader reading from a stream
virtual int load_model(FILE* binfp);
#endif // NCNN_STDIO
// in-memory loaders; `mem` is advanced past the bytes consumed
virtual int load_param(const unsigned char*& mem);
virtual int load_model(const unsigned char*& mem);

virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

virtual int forward_inplace(Mat& bottom_top_blob) const;

public:
// param
// number of floats in bias_data
int bias_data_size;

// model
// per-channel bias, indexed by channel in forward
Mat bias_data;
};

} // namespace ncnn

#endif // LAYER_BIAS_H

+ 81
- 0
src/layer/bnll.cpp View File

@@ -0,0 +1,81 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "bnll.h"
#include <math.h>

namespace ncnn {

DEFINE_LAYER_CREATOR(BNLL)

// Marks the layer as single-input/single-output and as supporting in-place forward.
BNLL::BNLL()
{
one_blob_only = true;
support_inplace = true;
}

// Out-of-place BNLL activation, log(1 + exp(x)), evaluated as
//   x > 0 : x + log(1 + exp(-x))
//   else  : log(1 + exp(x))
// so that exp() is never called with a positive argument.
// Returns 0 on success, -100 if top_blob is empty after create().
int BNLL::forward(const Mat& bottom_blob, Mat& top_blob) const
{
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int channels = bottom_blob.c;
    const int size = w * h;

    top_blob.create(w, h, channels);
    if (top_blob.empty())
        return -100;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        const float* src = bottom_blob.channel(q);
        float* dst = top_blob.channel(q);

        for (int left = size; left > 0; left--)
        {
            const float x = *src;

            if (x > 0)
                *dst = x + log(1.f + exp(-x));
            else
                *dst = log(1.f + exp(x));

            src++;
            dst++;
        }
    }

    return 0;
}

// In-place BNLL activation, log(1 + exp(x)), evaluated as
//   x > 0 : x + log(1 + exp(-x))
//   else  : log(1 + exp(x))
// so that exp() is never called with a positive argument.
// Always returns 0.
int BNLL::forward_inplace(Mat& bottom_top_blob) const
{
    const int w = bottom_top_blob.w;
    const int h = bottom_top_blob.h;
    const int channels = bottom_top_blob.c;
    const int size = w * h;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* cursor = bottom_top_blob.channel(q);

        for (int left = size; left > 0; left--)
        {
            const float x = *cursor;

            if (x > 0)
                *cursor = x + log(1.f + exp(-x));
            else
                *cursor = log(1.f + exp(x));

            cursor++;
        }
    }

    return 0;
}

} // namespace ncnn

+ 36
- 0
src/layer/bnll.h View File

@@ -0,0 +1,36 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_BNLL_H
#define LAYER_BNLL_H

#include "layer.h"

namespace ncnn {

// BNLL activation layer; it declares no parameters or weights of its own.
class BNLL : public Layer
{
public:
BNLL();

// Out-of-place forward: reads bottom_blob, writes the result into top_blob.
virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

// In-place forward: overwrites bottom_top_blob with the result.
virtual int forward_inplace(Mat& bottom_top_blob) const;

public:
};

} // namespace ncnn

#endif // LAYER_BNLL_H

+ 64
- 0
src/layer/concat.cpp View File

@@ -0,0 +1,64 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "concat.h"

namespace ncnn {

DEFINE_LAYER_CREATOR(Concat)

// Sets no flags here; the constructor body is intentionally empty.
Concat::Concat()
{
}

int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
{
int w = bottom_blobs[0].w;
int h = bottom_blobs[0].h;

// total channels
int top_channels = 0;
for (size_t b=0; b<bottom_blobs.size(); b++)
{
const Mat& bottom_blob = bottom_blobs[b];
top_channels += bottom_blob.c;
}

Mat& top_blob = top_blobs[0];
top_blob.create(w, h, top_channels);
if (top_blob.empty())
return -100;

int q = 0;
for (size_t b=0; b<bottom_blobs.size(); b++)
{
const Mat& bottom_blob = bottom_blobs[b];

int channels = bottom_blob.c;
int size = bottom_blob.cstep * channels;

const float* ptr = bottom_blob;
float* outptr = top_blob.channel(q);
for (int i=0; i<size; i++)
{
outptr[i] = ptr[i];
}

q += channels;
}

return 0;
}

} // namespace ncnn

+ 34
- 0
src/layer/concat.h View File

@@ -0,0 +1,34 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_CONCAT_H
#define LAYER_CONCAT_H

#include "layer.h"

namespace ncnn {

// Concatenation layer; it declares no parameters or weights of its own.
class Concat : public Layer
{
public:
Concat();

// Multi-blob forward: reads every blob in bottom_blobs, writes top_blobs[0].
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;

public:
};

} // namespace ncnn

#endif // LAYER_CONCAT_H

+ 350
- 0
src/layer/convolution.cpp View File

@@ -0,0 +1,350 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "convolution.h"

namespace ncnn {

DEFINE_LAYER_CREATOR(Convolution)

// Marks the layer as single-input/single-output; in-place forward is not supported.
Convolution::Convolution()
{
one_blob_only = true;
support_inplace = false;
}

// Nothing to release explicitly here; the body is intentionally empty.
Convolution::~Convolution()
{
}

#if NCNN_STDIO
#if NCNN_STRING
// Parse the text-format parameters: seven integers in the order
// num_output, kernel_size, dilation, stride, pad, bias_term, weight_data_size.
// Returns 0 on success; logs to stderr and returns -1 if fewer than seven
// integers can be scanned.
int Convolution::load_param(FILE* paramfp)
{
    const int fields_read = fscanf(paramfp, "%d %d %d %d %d %d %d",
                                   &num_output, &kernel_size, &dilation, &stride, &pad, &bias_term,
                                   &weight_data_size);
    if (fields_read == 7)
        return 0;

    fprintf(stderr, "Convolution load_param failed %d\n", fields_read);
    return -1;
}
#endif // NCNN_STRING
// Read the binary-format parameters: seven ints in the order
// num_output, kernel_size, dilation, stride, pad, bias_term, weight_data_size.
// Returns 0 on success; logs to stderr and returns -1 if any of the seven
// reads comes up short, so a truncated param file is reported instead of
// leaving some fields unset.
int Convolution::load_param_bin(FILE* paramfp)
{
    // each fread asks for exactly one item, so the total is 7 on success
    size_t nread = 0;

    nread += fread(&num_output, sizeof(int), 1, paramfp);

    nread += fread(&kernel_size, sizeof(int), 1, paramfp);

    nread += fread(&dilation, sizeof(int), 1, paramfp);

    nread += fread(&stride, sizeof(int), 1, paramfp);

    nread += fread(&pad, sizeof(int), 1, paramfp);

    nread += fread(&bias_term, sizeof(int), 1, paramfp);

    nread += fread(&weight_data_size, sizeof(int), 1, paramfp);

    if (nread != 7)
    {
        fprintf(stderr, "Convolution load_param_bin failed %d\n", (int)nread);
        return -1;
    }

    return 0;
}

// Load weights (and the optional bias) from a binary stream.
//
// The stream starts with a 4-byte tag that selects the weight encoding:
//   tag == 0x01306B47        half-precision weights, padded to 4 bytes
//   any non-zero tag byte    256-entry float table followed by one index
//                            byte per weight, padded to 4 bytes
//   all tag bytes zero       raw float weights
// If bias_term is set, num_output floats of bias follow.
//
// Returns 0 on success, -100 if a Mat is empty after allocation, -1 (after
// logging to stderr) if a read does not complete.
int Convolution::load_model(FILE* binfp)
{
    int nread;

    union
    {
        struct
        {
            unsigned char f0;
            unsigned char f1;
            unsigned char f2;
            unsigned char f3;
        };
        unsigned int tag;
    } flag_struct;

    nread = fread(&flag_struct, sizeof(flag_struct), 1, binfp);
    if (nread != 1)
    {
        fprintf(stderr, "Convolution read flag_struct failed %d\n", nread);
        return -1;
    }

    // non-zero if any of the four tag bytes is non-zero
    unsigned int flag = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3;

    weight_data.create(weight_data_size);
    if (weight_data.empty())
        return -100;

    if (flag_struct.tag == 0x01306B47)
    {
        // half-precision weight data
        int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned short), 4);
        std::vector<unsigned short> float16_weights;
        // align_weight_data_size is a BYTE count. Size the staging buffer in
        // elements (rounded up) so it holds exactly the bytes fread delivers;
        // resizing by the byte count allocated twice the memory needed.
        float16_weights.resize((align_weight_data_size + sizeof(unsigned short) - 1) / sizeof(unsigned short));
        nread = fread(float16_weights.data(), align_weight_data_size, 1, binfp);
        if (nread != 1)
        {
            fprintf(stderr, "Convolution read float16_weights failed %d\n", nread);
            return -1;
        }

        weight_data = Mat::from_float16(float16_weights.data(), weight_data_size);
        if (weight_data.empty())
            return -100;
    }
    else if (flag != 0)
    {
        // quantized weight data
        float quantization_value[256];
        nread = fread(quantization_value, 256 * sizeof(float), 1, binfp);
        if (nread != 1)
        {
            fprintf(stderr, "Convolution read quantization_value failed %d\n", nread);
            return -1;
        }

        int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned char), 4);
        std::vector<unsigned char> index_array;
        index_array.resize(align_weight_data_size);
        nread = fread(index_array.data(), align_weight_data_size, 1, binfp);
        if (nread != 1)
        {
            fprintf(stderr, "Convolution read index_array failed %d\n", nread);
            return -1;
        }

        // expand each index byte through the lookup table
        float* weight_data_ptr = weight_data;
        for (int i = 0; i < weight_data_size; i++)
        {
            weight_data_ptr[i] = quantization_value[ index_array[i] ];
        }
    }
    else if (flag_struct.f0 == 0)
    {
        // raw weight data
        nread = fread(weight_data, weight_data_size * sizeof(float), 1, binfp);
        if (nread != 1)
        {
            fprintf(stderr, "Convolution read weight_data failed %d\n", nread);
            return -1;
        }
    }

    if (bias_term)
    {
        bias_data.create(num_output);
        if (bias_data.empty())
            return -100;
        nread = fread(bias_data, num_output * sizeof(float), 1, binfp);
        if (nread != 1)
        {
            fprintf(stderr, "Convolution read bias_data failed %d\n", nread);
            return -1;
        }
    }

    return 0;
}
#endif // NCNN_STDIO

// Read the seven int parameters from an in-memory buffer, in the order
// num_output, kernel_size, dilation, stride, pad, bias_term, weight_data_size,
// and advance the cursor past the 28 bytes consumed. Always returns 0.
int Convolution::load_param(const unsigned char*& mem)
{
    const int* params = (const int*)mem;

    num_output = params[0];
    kernel_size = params[1];
    dilation = params[2];
    stride = params[3];
    pad = params[4];
    bias_term = params[5];
    weight_data_size = params[6];

    // seven fields of 4 bytes each
    mem += 7 * 4;

    return 0;
}

// Load weights (and the optional bias) from an in-memory buffer, advancing
// `mem` past everything consumed.
// The buffer starts with a 4-byte tag that selects the weight encoding
// (half precision, 256-entry quantisation table, or raw floats); if bias_term
// is set, num_output floats of bias follow.
// Returns 0 on success, -100 if weight_data is empty after allocation.
int Convolution::load_model(const unsigned char*& mem)
{
union
{
struct
{
unsigned char f0;
unsigned char f1;
unsigned char f2;
unsigned char f3;
};
unsigned int tag;
} flag_struct;

// memcpy rather than a pointer cast to pull the 4 tag bytes out of the buffer
memcpy(&flag_struct, mem, sizeof(flag_struct));
mem += sizeof(flag_struct);

// non-zero if any of the four tag bytes is non-zero
unsigned int flag = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3;

if (flag_struct.tag == 0x01306B47)
{
// half-precision weight data
weight_data = Mat::from_float16((unsigned short*)mem, weight_data_size);
// the stored half-precision block is padded to a multiple of 4 bytes
mem += alignSize(weight_data_size * sizeof(unsigned short), 4);
if (weight_data.empty())
return -100;
}
else if (flag != 0)
{
// quantized weight data
// 256-entry float lookup table, then one index byte per weight
const float* quantization_value = (const float*)mem;
mem += 256 * sizeof(float);

const unsigned char* index_array = (const unsigned char*)mem;
mem += alignSize(weight_data_size * sizeof(unsigned char), 4);

weight_data.create(weight_data_size);
if (weight_data.empty())
return -100;
float* weight_data_ptr = weight_data;
for (int i = 0; i < weight_data_size; i++)
{
weight_data_ptr[i] = quantization_value[ index_array[i] ];
}
}
else if (flag_struct.f0 == 0)
{
// raw weight data
// NOTE(review): Mat(size, float*) presumably references `mem` without
// copying, so the buffer would have to outlive the layer — confirm against Mat.
weight_data = Mat(weight_data_size, (float*)mem);
mem += weight_data_size * sizeof(float);
}

if (bias_term)
{
bias_data = Mat(num_output, (float*)mem);
mem += num_output * sizeof(float);
}

return 0;
}

// Convolve bottom_blob with a kernel_size x kernel_size kernel per
// (output channel, input channel) pair and add the optional bias.
//
// When pad > 0 the input is first surrounded by a constant 0.f border of
// `pad` elements on every side. top_blob is created with
// ((size - kernel_extent) / stride + 1) elements per spatial dimension and
// num_output channels.
//
// Returns 0 on success, -100 when the padded input or the output is empty.
int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;

    Mat bottom_blob_bordered = bottom_blob;
    if (pad > 0)
    {
        copy_make_border(bottom_blob, bottom_blob_bordered, pad, pad, pad, pad, BORDER_CONSTANT, 0.f);
        if (bottom_blob_bordered.empty())
            return -100;

        w = bottom_blob_bordered.w;
        h = bottom_blob_bordered.h;
    }

    // footprint of the dilated kernel along one axis
    const int kernel_extent = dilation * (kernel_size - 1) + 1;

    int outw = (w - kernel_extent) / stride + 1;
    int outh = (h - kernel_extent) / stride + 1;

    top_blob.create(outw, outh, num_output);
    if (top_blob.empty())
        return -100;

    const int maxk = kernel_size * kernel_size;

    // kernel offsets: element offset of each kernel tap relative to the
    // top-left input sample of the current window
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        // The inner loop advances p2 by kernel_size * dilation per kernel row,
        // and consecutive kernel rows must be exactly w * dilation apart.
        // (Using kernel_extent here overshoots by dilation - 1 per row.)
        int gap = w * dilation - kernel_size * dilation;
        for (int i = 0; i < kernel_size; i++)
        {
            for (int j = 0; j < kernel_size; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation;
            }
            p2 += gap;
        }
    }

    // num_output
    const float* weight_data_ptr = weight_data;
    #pragma omp parallel for
    for (int p=0; p<num_output; p++)
    {
        float* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                float sum = 0.f;

                if (bias_term)
                    sum = bias_data.data[p];

                // weights for output channel p: channels blocks of maxk taps
                const float* kptr = weight_data_ptr + maxk * channels * p;

                // channels
                for (int q=0; q<channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    const float* sptr = m.data + m.w * i*stride + j*stride;

                    for (int k = 0; k < maxk; k++)
                    {
                        float val = sptr[ space_ofs[k] ];
                        float wt = kptr[k];
                        sum += val * wt;
                    }

                    kptr += maxk;
                }

                outptr[j] = sum;
            }

            outptr += outw;
        }
    }

    return 0;
}

} // namespace ncnn

+ 58
- 0
src/layer/convolution.h View File

@@ -0,0 +1,58 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_CONVOLUTION_H
#define LAYER_CONVOLUTION_H

#include "layer.h"

namespace ncnn {

// Convolution layer: stores the layer parameters and weights and applies a
// square kernel_size x kernel_size convolution in forward().
class Convolution : public Layer
{
public:
Convolution();
virtual ~Convolution();

#if NCNN_STDIO
#if NCNN_STRING
// FILE*-based loaders, compiled only when the corresponding macros are enabled
virtual int load_param(FILE* paramfp);
#endif // NCNN_STRING
virtual int load_param_bin(FILE* paramfp);
// reads the weights, then the bias when bias_term is non-zero
virtual int load_model(FILE* binfp);
#endif // NCNN_STDIO
// in-memory loaders; both advance mem past the bytes they consume
virtual int load_param(const unsigned char*& mem);
virtual int load_model(const unsigned char*& mem);

// single-input forward pass; returns 0 on success, -100 when a Mat is empty
virtual int forward(const Mat& bottom_blobs, Mat& top_blobs) const;

public:
// param
// number of channels created in the output blob
int num_output;
// side length of the square kernel (kernel_size * kernel_size taps)
int kernel_size;
// step between neighbouring kernel taps in the input
int dilation;
// step between neighbouring output samples in the input
int stride;
// border of 0.f values added on every side of the input when > 0
int pad;
// non-zero when bias_data is loaded and added to every output value
int bias_term;

// number of weight values held in weight_data
int weight_data_size;

// model
Mat weight_data;
// num_output values, only loaded when bias_term is non-zero
Mat bias_data;
};

} // namespace ncnn

#endif // LAYER_CONVOLUTION_H

+ 85
- 0
src/layer/crop.cpp View File

@@ -0,0 +1,85 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "crop.h"

namespace ncnn {

DEFINE_LAYER_CREATOR(Crop)

// Default constructor. The body is empty: woffset and hoffset are not set here
// and only receive values from the load_param / load_param_bin overloads.
Crop::Crop()
{
}

#if NCNN_STDIO
#if NCNN_STRING
// Parse the two crop offsets, written as "woffset hoffset", from a text
// param stream.
// Returns 0 on success, -1 (after logging to stderr) when fewer than two
// integers could be scanned.
int Crop::load_param(FILE* paramfp)
{
    const int expected_fields = 2;

    int nscan = fscanf(paramfp, "%d %d", &woffset, &hoffset);
    if (nscan == expected_fields)
        return 0;

    fprintf(stderr, "Crop load_param failed %d\n", nscan);
    return -1;
}
#endif // NCNN_STRING
// Read woffset and hoffset, in that order, as raw ints from a binary param
// stream.
// Returns 0 on success, -1 (after logging to stderr) when either read comes up
// short, so a truncated file is no longer reported as a successful load.
int Crop::load_param_bin(FILE* paramfp)
{
    // short-circuit keeps the original read order and stops at the first failure
    if (fread(&woffset, sizeof(int), 1, paramfp) != 1
        || fread(&hoffset, sizeof(int), 1, paramfp) != 1)
    {
        fprintf(stderr, "Crop load_param_bin failed\n");
        return -1;
    }

    return 0;
}
#endif // NCNN_STDIO

// Read woffset then hoffset as consecutive 4-byte ints from an in-memory
// param blob and advance mem by the 8 bytes consumed. Always returns 0.
int Crop::load_param(const unsigned char*& mem)
{
    const unsigned char* cursor = mem;

    woffset = *(const int*)(cursor + 0);
    hoffset = *(const int*)(cursor + 4);

    mem = cursor + 8;

    return 0;
}

int Crop::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
{
const Mat& bottom_blob = bottom_blobs[0];
const Mat& reference_blob = bottom_blobs[1];

int w = bottom_blob.w;
int h = bottom_blob.h;

int outw = reference_blob.w;
int outh = reference_blob.h;

int top = hoffset;
int bottom = h - outh - hoffset;
int left = woffset;
int right = w - outw - woffset;

Mat& top_blob = top_blobs[0];

copy_cut_border(bottom_blob, top_blob, top, bottom, left, right);
if (top_blob.empty())
return -100;

return 0;
}

} // namespace ncnn

+ 44
- 0
src/layer/crop.h View File

@@ -0,0 +1,44 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_CROP_H
#define LAYER_CROP_H

#include "layer.h"

namespace ncnn {

// Crop layer: cuts the first input blob down to the width and height of the
// second input blob, starting at (woffset, hoffset).
class Crop : public Layer
{
public:
Crop();

#if NCNN_STDIO
#if NCNN_STRING
// parses "woffset hoffset" from a text param stream
virtual int load_param(FILE* paramfp);
#endif // NCNN_STRING
// reads woffset then hoffset as raw ints
virtual int load_param_bin(FILE* paramfp);
#endif // NCNN_STDIO
// in-memory variant; advances mem past the two ints it reads
virtual int load_param(const unsigned char*& mem);

// bottom_blobs[0] is the blob to crop, bottom_blobs[1] supplies the target size
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;

public:
// amount cut from the left edge
int woffset;
// amount cut from the top edge
int hoffset;
};

} // namespace ncnn

#endif // LAYER_CROP_H

+ 348
- 0
src/layer/deconvolution.cpp View File

@@ -0,0 +1,348 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "deconvolution.h"

namespace ncnn {

DEFINE_LAYER_CREATOR(Deconvolution)

// Set the two layer flags for this layer type.
Deconvolution::Deconvolution()
{
// NOTE(review): presumably selects the single-Mat forward() overload - confirm in layer.h
one_blob_only = true;
// NOTE(review): presumably signals that no in-place forward is provided - confirm in layer.h
support_inplace = false;
}

// Empty destructor body: nothing is released explicitly here.
Deconvolution::~Deconvolution()
{
}

#if NCNN_STDIO
#if NCNN_STRING
// Parse the seven whitespace-separated integer parameters from a text param
// stream, in this order: num_output, kernel_size, dilation, stride, pad,
// bias_term, weight_data_size.
// Returns 0 on success, -1 (after logging to stderr) when fewer than seven
// values could be scanned.
int Deconvolution::load_param(FILE* paramfp)
{
    const int expected_fields = 7;

    int nscan = fscanf(paramfp, "%d %d %d %d %d %d %d",
                       &num_output, &kernel_size, &dilation, &stride, &pad, &bias_term,
                       &weight_data_size);
    if (nscan == expected_fields)
        return 0;

    fprintf(stderr, "Deconvolution load_param failed %d\n", nscan);
    return -1;
}
#endif // NCNN_STRING
// Read the seven parameters as raw ints from a binary param stream, in this
// order: num_output, kernel_size, dilation, stride, pad, bias_term,
// weight_data_size.
// Returns 0 on success, -1 (after logging to stderr) as soon as one read comes
// up short, so a truncated file is no longer reported as a successful load.
int Deconvolution::load_param_bin(FILE* paramfp)
{
    // && short-circuits, so the reads happen in file order and stop at the first failure
    const bool ok =
        fread(&num_output, sizeof(int), 1, paramfp) == 1
        && fread(&kernel_size, sizeof(int), 1, paramfp) == 1
        && fread(&dilation, sizeof(int), 1, paramfp) == 1
        && fread(&stride, sizeof(int), 1, paramfp) == 1
        && fread(&pad, sizeof(int), 1, paramfp) == 1
        && fread(&bias_term, sizeof(int), 1, paramfp) == 1
        && fread(&weight_data_size, sizeof(int), 1, paramfp) == 1;

    if (!ok)
    {
        fprintf(stderr, "Deconvolution load_param_bin failed\n");
        return -1;
    }

    return 0;
}

int Deconvolution::load_model(FILE* binfp)
{
int nread;

union
{
struct
{
unsigned char f0;
unsigned char f1;
unsigned char f2;
unsigned char f3;
};
unsigned int tag;
} flag_struct;

nread = fread(&flag_struct, sizeof(flag_struct), 1, binfp);
if (nread != 1)
{
fprintf(stderr, "Deconvolution read flag_struct failed %d\n", nread);
return -1;
}

unsigned int flag = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3;

weight_data.create(weight_data_size);
if (weight_data.empty())
return -100;

if (flag_struct.tag == 0x01306B47)
{
// half-precision weight data
int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned short), 4);
std::vector<unsigned short> float16_weights;
float16_weights.resize(align_weight_data_size);
nread = fread(float16_weights.data(), align_weight_data_size, 1, binfp);
if (nread != 1)
{
fprintf(stderr, "Deconvolution read float16_weights failed %d\n", nread);
return -1;
}

weight_data = Mat::from_float16(float16_weights.data(), weight_data_size);
if (weight_data.empty())
return -100;
}
else if (flag != 0)
{
// quantized weight data
float quantization_value[256];
nread = fread(quantization_value, 256 * sizeof(float), 1, binfp);
if (nread != 1)
{
fprintf(stderr, "Deconvolution read quantization_value failed %d\n", nread);
return -1;
}

int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned char), 4);
std::vector<unsigned char> index_array;
index_array.resize(align_weight_data_size);
nread = fread(index_array.data(), align_weight_data_size, 1, binfp);
if (nread != 1)
{
fprintf(stderr, "Deconvolution read index_array failed %d\n", nread);
return -1;
}

float* weight_data_ptr = weight_data;
for (int i = 0; i < weight_data_size; i++)
{
weight_data_ptr[i] = quantization_value[ index_array[i] ];
}
}
else if (flag_struct.f0 == 0)
{
// raw weight data
nread = fread(weight_data, weight_data_size * sizeof(float), 1, binfp);
if (nread != 1)
{
fprintf(stderr, "Deconvolution read weight_data failed %d\n", nread);
return -1;
}
}

if (bias_term)
{
bias_data.create(num_output);
if (bias_data.empty())
return -100;
nread = fread(bias_data, num_output * sizeof(float), 1, binfp);
if (nread != 1)
{
fprintf(stderr, "Deconvolution read bias_data failed %d\n", nread);
return -1;
}
}

return 0;
}
#endif // NCNN_STDIO

// Read the seven deconvolution parameters from an in-memory param blob.
//
// The values are read as consecutive 4-byte ints in this order:
// num_output, kernel_size, dilation, stride, pad, bias_term, weight_data_size.
// mem is advanced by the 28 bytes consumed. Always returns 0.
int Deconvolution::load_param(const unsigned char*& mem)
{
    const unsigned char* cursor = mem;

    num_output       = *(const int*)(cursor + 0);
    kernel_size      = *(const int*)(cursor + 4);
    dilation         = *(const int*)(cursor + 8);
    stride           = *(const int*)(cursor + 12);
    pad              = *(const int*)(cursor + 16);
    bias_term        = *(const int*)(cursor + 20);
    weight_data_size = *(const int*)(cursor + 24);

    // step over the seven fields just read
    mem = cursor + 28;

    return 0;
}

// Load the weights (and the bias when bias_term is non-zero) from an
// in-memory model blob, advancing mem past everything consumed.
//
// The blob starts with a 4-byte header that selects how the weights are stored:
//   - header word == 0x01306B47 : half-precision values, converted with Mat::from_float16
//   - any header byte non-zero  : a 256-entry float table followed by one byte index per weight
//   - all header bytes zero     : plain floats, handed to the Mat(int, float*) constructor
//
// Returns 0 on success and -100 when a weight Mat turns out empty.
int Deconvolution::load_model(const unsigned char*& mem)
{
    union
    {
        struct
        {
            unsigned char f0;
            unsigned char f1;
            unsigned char f2;
            unsigned char f3;
        };
        unsigned int tag;
    } header;

    memcpy(&header, mem, sizeof(header));
    mem += sizeof(header);

    const unsigned int byte_sum = header.f0 + header.f1 + header.f2 + header.f3;
    const bool is_half = (header.tag == 0x01306B47);

    if (is_half)
    {
        // half-precision weight data
        weight_data = Mat::from_float16((unsigned short*)mem, weight_data_size);
        mem += alignSize(weight_data_size * sizeof(unsigned short), 4);
        if (weight_data.empty())
            return -100;
    }
    else if (byte_sum != 0)
    {
        // quantized weight data: lookup table first, then the padded index array
        const float* table = (const float*)mem;
        mem += 256 * sizeof(float);

        const unsigned char* indices = (const unsigned char*)mem;
        mem += alignSize(weight_data_size * sizeof(unsigned char), 4);

        weight_data.create(weight_data_size);
        if (weight_data.empty())
            return -100;

        float* dst = weight_data;
        for (int n = 0; n < weight_data_size; n++)
        {
            dst[n] = table[ indices[n] ];
        }
    }
    else if (header.f0 == 0)
    {
        // raw weight data
        weight_data = Mat(weight_data_size, (float*)mem);
        mem += weight_data_size * sizeof(float);
    }

    if (!bias_term)
        return 0;

    // one float of bias per output channel follows the weights
    bias_data = Mat(num_output, (float*)mem);
    mem += num_output * sizeof(float);

    return 0;
}

// Transposed ("backward strided") convolution with a kernel_size x kernel_size
// kernel, plus the optional bias.
//
// Each input sample is scattered into the output through the kernel: the output
// channel is first filled with its bias (0.f without bias_term) and every
// (input sample, kernel tap) product is accumulated on top. The intermediate
// result has ((size - 1) * stride + kernel_extent) elements per spatial
// dimension; when pad > 0, `pad` elements are cut from every side afterwards.
//
// Returns 0 on success, -100 when an output Mat is empty.
int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;

    // footprint of the dilated kernel along one axis
    const int kernel_extent = dilation * (kernel_size - 1) + 1;

    int outw = (w - 1) * stride + kernel_extent;
    int outh = (h - 1) * stride + kernel_extent;

    Mat top_blob_bordered;
    top_blob_bordered.create(outw, outh, num_output);
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_size * kernel_size;

    // kernel offsets: element offset of each kernel tap relative to the
    // top-left output sample touched by the current input sample
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        // The inner loop advances p2 by kernel_size * dilation per kernel row,
        // and consecutive kernel rows must be exactly outw * dilation apart.
        // (Using kernel_extent here overshoots by dilation - 1 per row.)
        int gap = outw * dilation - kernel_size * dilation;
        for (int i = 0; i < kernel_size; i++)
        {
            for (int j = 0; j < kernel_size; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation;
            }
            p2 += gap;
        }
    }

    // num_output
    const float* weight_data_ptr = weight_data;
    #pragma omp parallel for
    for (int p=0; p<num_output; p++)
    {
        Mat out = top_blob_bordered.channel(p);

        const float bias = bias_term ? bias_data.data[p] : 0.f;

        // start from the bias so the scatter below can simply accumulate
        out.fill(bias);

        for (int i = 0; i < h; i++)
        {
            for (int j = 0; j < w; j++)
            {
                float* outptr = out.data + out.w * i*stride + j*stride;

                // weights for output channel p: channels blocks of maxk taps
                const float* kptr = weight_data_ptr + maxk * channels * p;

                // channels
                for (int q=0; q<channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    float val = *(m.data + m.w * i + j);

                    for (int k = 0; k < maxk; k++)
                    {
                        float wt = kptr[k];
                        outptr[ space_ofs[k] ] += val * wt;
                    }

                    kptr += maxk;
                }
            }
        }
    }

    top_blob = top_blob_bordered;

    if (pad > 0)
    {
        // trim the padding from all four sides
        copy_cut_border(top_blob_bordered, top_blob, pad, pad, pad, pad);
        if (top_blob.empty())
            return -100;
    }

    return 0;
}

} // namespace ncnn

+ 58
- 0
src/layer/deconvolution.h View File

@@ -0,0 +1,58 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_DECONVOLUTION_H
#define LAYER_DECONVOLUTION_H

#include "layer.h"

namespace ncnn {

// Deconvolution (transposed convolution) layer: stores the layer parameters
// and weights and scatters every input sample through a square kernel in
// forward().
class Deconvolution : public Layer
{
public:
Deconvolution();
virtual ~Deconvolution();

#if NCNN_STDIO
#if NCNN_STRING
// parses the seven integer parameters from a text param stream
virtual int load_param(FILE* paramfp);
#endif // NCNN_STRING
// reads the seven parameters as raw ints
virtual int load_param_bin(FILE* paramfp);
// reads the weights, then the bias when bias_term is non-zero
virtual int load_model(FILE* binfp);
#endif // NCNN_STDIO
// in-memory loaders; both advance mem past the bytes they consume
virtual int load_param(const unsigned char*& mem);
virtual int load_model(const unsigned char*& mem);

// single-input forward pass; returns 0 on success, -100 when a Mat is empty
virtual int forward(const Mat& bottom_blobs, Mat& top_blobs) const;

public:
// param
// number of channels created in the output blob
int num_output;
// side length of the square kernel (kernel_size * kernel_size taps)
int kernel_size;
// step between neighbouring kernel taps in the output
int dilation;
// step in the output between the windows of neighbouring input samples
int stride;
// amount cut from every side of the computed output when > 0
int pad;
// non-zero when bias_data is loaded and used to pre-fill each output channel
int bias_term;

// number of weight values held in weight_data
int weight_data_size;

// model
Mat weight_data;
// num_output values, only loaded when bias_term is non-zero
Mat bias_data;
};

} // namespace ncnn

#endif // LAYER_DECONVOLUTION_H

+ 38
- 0
src/layer/dropout.cpp View File

@@ -0,0 +1,38 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "dropout.h"

namespace ncnn {

DEFINE_LAYER_CREATOR(Dropout)

// Set the two layer flags for this layer type.
Dropout::Dropout()
{
// NOTE(review): presumably selects the single-Mat forward() overload - confirm in layer.h
one_blob_only = true;
// NOTE(review): presumably advertises forward_inplace() to the framework - confirm in layer.h
support_inplace = true;
}

// Pass-through: assigns the input Mat to the output Mat without touching any
// values. Always returns 0.
int Dropout::forward(const Mat& bottom_blob, Mat& top_blob) const
{
top_blob = bottom_blob;
return 0;
}

// No-op: the blob is left exactly as it is (the parameter is unnamed because
// it is never used). Always returns 0.
int Dropout::forward_inplace(Mat& /*bottom_top_blob*/) const
{
return 0;
}

} // namespace ncnn

+ 35
- 0
src/layer/dropout.h View File

@@ -0,0 +1,35 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_DROPOUT_H
#define LAYER_DROPOUT_H

#include "layer.h"

namespace ncnn {

// Dropout layer: has no parameters or weights and passes its input through
// unchanged.
class Dropout : public Layer
{
public:
Dropout();

// assigns bottom_blob to top_blob and returns 0
virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

// leaves the blob untouched and returns 0
virtual int forward_inplace(Mat& bottom_top_blob) const;

};

} // namespace ncnn

#endif // LAYER_DROPOUT_H

+ 246
- 0
src/layer/eltwise.cpp View File

@@ -0,0 +1,246 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "eltwise.h"

namespace ncnn {

DEFINE_LAYER_CREATOR(Eltwise)

// Default constructor. The body is empty: op_type, num_coeff and coeffs are not
// set here and only receive values from the load_param / load_param_bin
// overloads.
Eltwise::Eltwise()
{
}

#if NCNN_STDIO
#if NCNN_STRING
// Parse "op_type num_coeff" from a text param stream and, when num_coeff is
// positive, that many float coefficients into coeffs.
// Returns 0 on success, -1 (after logging to stderr) when a value cannot be
// scanned, -100 when the coefficient Mat is empty after creation.
int Eltwise::load_param(FILE* paramfp)
{
    int header_fields = fscanf(paramfp, "%d %d", &op_type, &num_coeff);
    if (header_fields != 2)
    {
        fprintf(stderr, "Eltwise load_param failed %d\n", header_fields);
        return -1;
    }

    // nothing more to read without coefficients
    if (num_coeff <= 0)
        return 0;

    coeffs.create(num_coeff);
    if (coeffs.empty())
        return -100;

    float* dst = coeffs;
    for (int idx = 0; idx < num_coeff; idx++)
    {
        int got = fscanf(paramfp, "%f", dst + idx);
        if (got != 1)
        {
            fprintf(stderr, "Eltwise load_param failed %d\n", got);
            return -1;
        }
    }

    return 0;
}
#endif // NCNN_STRING
// Read op_type and num_coeff as raw ints from a binary param stream and, when
// num_coeff is positive, that many raw floats into coeffs.
// Returns 0 on success, -1 (after logging to stderr) on a short read, -100 when
// the coefficient Mat is empty after creation. Short reads used to be ignored,
// which let a truncated file load "successfully" with unset values.
int Eltwise::load_param_bin(FILE* paramfp)
{
    // short-circuit keeps the original read order and stops at the first failure
    if (fread(&op_type, sizeof(int), 1, paramfp) != 1
        || fread(&num_coeff, sizeof(int), 1, paramfp) != 1)
    {
        fprintf(stderr, "Eltwise load_param_bin failed\n");
        return -1;
    }

    if (num_coeff > 0)
    {
        coeffs.create(num_coeff);
        if (coeffs.empty())
            return -100;

        float* coeffs_ptr = coeffs;
        // fread reports the number of complete floats it delivered
        if (fread(coeffs_ptr, sizeof(float), num_coeff, paramfp) != (size_t)num_coeff)
        {
            fprintf(stderr, "Eltwise load_param_bin read coeffs failed\n");
            return -1;
        }
    }

    return 0;
}
#endif // NCNN_STDIO

// Read op_type and num_coeff as consecutive 4-byte ints from an in-memory
// param blob, then hand the num_coeff floats that follow to the
// Mat(int, float*) constructor. mem is advanced past everything consumed.
// Always returns 0.
int Eltwise::load_param(const unsigned char*& mem)
{
    const unsigned char* cursor = mem;

    op_type = *(const int*)(cursor + 0);
    num_coeff = *(const int*)(cursor + 4);
    cursor += 8;

    // the coefficient array starts right after the two ints
    coeffs = Mat(num_coeff, (float*)cursor);
    cursor += num_coeff * sizeof(float);

    mem = cursor;

    return 0;
}

int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
{
const Mat& bottom_blob = bottom_blobs[0];
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
int size = w * h;

Mat& top_blob = top_blobs[0];
top_blob.create(w, h, channels);
if (top_blob.empty())
return -100;

if (op_type == Operation_PROD)
{
// first blob
const Mat& bottom_blob1 = bottom_blobs[1];
#pragma omp parallel for
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);
const float* ptr1 = bottom_blob1.channel(q);
float* outptr = top_blob.channel(q);

for (int i=0; i<size; i++)
{
outptr[i] = ptr[i] * ptr1[i];
}
}

for (size_t b=2; b<bottom_blobs.size(); b++)
{
const Mat& bottom_blob1 = bottom_blobs[b];
#pragma omp parallel for
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob1.channel(q);
float* outptr = top_blob.channel(q);

for (int i=0; i<size; i++)
{
outptr[i] *= ptr[i];
}
}
}
}
else if (op_type == Operation_SUM)
{
if (num_coeff == 0)
{
// first blob
const Mat& bottom_blob1 = bottom_blobs[1];
#pragma omp parallel for
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);
const float* ptr1 = bottom_blob1.channel(q);
float* outptr = top_blob.channel(q);

for (int i=0; i<size; i++)
{
outptr[i] = ptr[i] + ptr1[i];
}
}

for (size_t b=2; b<bottom_blobs.size(); b++)
{
const Mat& bottom_blob1 = bottom_blobs[b];
#pragma omp parallel for
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob1.channel(q);
float* outptr = top_blob.channel(q);

for (int i=0; i<size; i++)
{
outptr[i] += ptr[i];
}
}
}
}
else
{
const float* coeffs_ptr = coeffs;

// first blob
const Mat& bottom_blob1 = bottom_blobs[1];
float coeff0 = coeffs_ptr[0];
float coeff1 = coeffs_ptr[1];
#pragma omp parallel for
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);
const float* ptr1 = bottom_blob1.channel(q);
float* outptr = top_blob.channel(q);

for (int i=0; i<size; i++)
{
outptr[i] = ptr[i] * coeff0 + ptr1[i] * coeff1;
}
}

for (size_t b=2; b<bottom_blobs.size(); b++)
{
const Mat& bottom_blob1 = bottom_blobs[b];
float coeff = coeffs_ptr[b];
#pragma omp parallel for
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob1.channel(q);
float* outptr = top_blob.channel(q);

for (int i=0; i<size; i++)
{
outptr[i] += ptr[i] * coeff;
}
}
}
}
}
else if (op_type == Operation_MAX)
{
// first blob
const Mat& bottom_blob1 = bottom_blobs[1];
#pragma omp parallel for
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);
const float* ptr1 = bottom_blob1.channel(q);
float* outptr = top_blob.channel(q);

for (int i=0; i<size; i++)
{
outptr[i] = std::max(ptr[i], ptr1[i]);
}
}

for (size_t b=2; b<bottom_blobs.size(); b++)
{
const Mat& bottom_blob1 = bottom_blobs[b];
#pragma omp parallel for
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob1.channel(q);
float* outptr = top_blob.channel(q);

for (int i=0; i<size; i++)
{
outptr[i] = std::max(outptr[i], ptr[i]);
}
}
}
}

return 0;
}

} // namespace ncnn

+ 48
- 0
src/layer/eltwise.h View File

@@ -0,0 +1,48 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_ELTWISE_H
#define LAYER_ELTWISE_H

#include "layer.h"

namespace ncnn {

// Eltwise layer: combines two or more equally sized input blobs element by
// element (product, sum with optional per-input coefficients, or max).
class Eltwise : public Layer
{
public:
Eltwise();

#if NCNN_STDIO
#if NCNN_STRING
// parses "op_type num_coeff" followed by num_coeff floats from a text stream
virtual int load_param(FILE* paramfp);
#endif // NCNN_STRING
// reads op_type, num_coeff and the coefficients as raw values
virtual int load_param_bin(FILE* paramfp);
#endif // NCNN_STDIO
// in-memory variant; advances mem past the bytes it consumes
virtual int load_param(const unsigned char*& mem);

// writes the combined result to top_blobs[0]; returns 0 or -100 (empty output)
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;

// values accepted for op_type
enum { Operation_PROD = 0, Operation_SUM = 1, Operation_MAX = 2 };

public:
// param
// one of the Operation_* values above
int op_type;
// number of entries in coeffs; 0 selects the unweighted sum
int num_coeff;
// per-input scale factors, indexed by input position, used by Operation_SUM
Mat coeffs;
};

} // namespace ncnn

#endif // LAYER_ELTWISE_H

Some files were not shown because too many files changed in this diff

Loading…
Cancel
Save