| @@ -0,0 +1,62 @@ | |||
| # Establish the minimum CMake version (and its policy defaults) first, | |||
| # before any other command and in particular before project(). | |||
| cmake_minimum_required(VERSION 2.8.10) | |||
| if(CMAKE_TOOLCHAIN_FILE) | |||
|     set(LIBRARY_OUTPUT_PATH_ROOT ${CMAKE_BINARY_DIR} CACHE PATH "root for library output, set this to change where android libs are compiled to") | |||
|     # Turn the toolchain file into an absolute path. get_filename_component(... ABSOLUTE) | |||
|     # only resolves against the source dir, so the file is looked up by name with find_file(). | |||
|     get_filename_component(CMAKE_TOOLCHAIN_FILE_NAME ${CMAKE_TOOLCHAIN_FILE} NAME) | |||
|     find_file(CMAKE_TOOLCHAIN_FILE ${CMAKE_TOOLCHAIN_FILE_NAME} PATHS ${CMAKE_SOURCE_DIR} NO_DEFAULT_PATH) | |||
|     message(STATUS "CMAKE_TOOLCHAIN_FILE = ${CMAKE_TOOLCHAIN_FILE}") | |||
| endif() | |||
| # Install into <build dir>/install unless the caller already chose a prefix. | |||
| # This check is intentionally kept ahead of project(), as in the original layout. | |||
| if(NOT DEFINED CMAKE_INSTALL_PREFIX) | |||
|     set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/install" CACHE PATH "Installation Directory") | |||
| endif() | |||
| message(STATUS "CMAKE_INSTALL_PREFIX = ${CMAKE_INSTALL_PREFIX}") | |||
| project(ncnn) | |||
| # Default to a release build, but respect a build type passed by the user | |||
| # (e.g. -DCMAKE_BUILD_TYPE=debug or -DCMAKE_BUILD_TYPE=relwithdebinfo). | |||
| if(NOT CMAKE_BUILD_TYPE) | |||
|     set(CMAKE_BUILD_TYPE release) | |||
| endif() | |||
| # User-configurable feature switches. | |||
| option(NCNN_OPENMP "openmp support" ON) | |||
| option(NCNN_STDIO "load model from external file" ON) | |||
| option(NCNN_STRING "plain and verbose string" ON) | |||
| option(NCNN_OPENCV "minimal opencv structure emulation" OFF) | |||
| if(NCNN_OPENMP) | |||
|     # OpenMP is optional: when it is not found the flag variables are empty. | |||
|     find_package(OpenMP) | |||
|     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") | |||
|     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") | |||
| endif() | |||
| # Compiler flags applied to this directory and every subdirectory added below. | |||
| add_definitions(-Wall -Wextra) | |||
| add_definitions(-fPIC) | |||
| add_definitions(-Ofast) | |||
| add_definitions(-ffast-math) | |||
| # add_definitions(-march=native) | |||
| # add_definitions(-flto) | |||
| add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden) | |||
| if(ANDROID OR IOS) | |||
|     # disable shared library on android and on xcode ios | |||
|     set_property(GLOBAL PROPERTY TARGET_SUPPORTS_SHARED_LIBS FALSE) | |||
|     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti -fno-exceptions") | |||
| endif() | |||
| ############################################## | |||
| # add_subdirectory(examples) | |||
| add_subdirectory(src) | |||
| # The tools directory is only added for non-mobile builds. | |||
| if(NOT ANDROID AND NOT IOS) | |||
|     add_subdirectory(tools) | |||
| endif() | |||
| @@ -0,0 +1,18 @@ | |||
| <?xml version="1.0" encoding="UTF-8"?> | |||
| <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd"> | |||
| <!-- Bundle metadata for ncnn: name, identifier, version strings, signature and package type. --> | |||
| <plist version="1.0"> | |||
| <dict> | |||
| <key>CFBundleName</key> | |||
| <string>ncnn</string> | |||
| <key>CFBundleIdentifier</key> | |||
| <string>com.tencent.ncnn</string> | |||
| <key>CFBundleVersion</key> | |||
| <string>1.0</string> | |||
| <key>CFBundleShortVersionString</key> | |||
| <string>1.0</string> | |||
| <key>CFBundleSignature</key> | |||
| <string>????</string> | |||
| <!-- NOTE(review): FMWK presumably marks this bundle as a framework; confirm against the iOS packaging step. --> | |||
| <key>CFBundlePackageType</key> | |||
| <string>FMWK</string> | |||
| </dict> | |||
| </plist> | |||
| @@ -0,0 +1,86 @@ | |||
| Tencent is pleased to support the open source community by making ncnn available. | |||
| Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| If you have downloaded a copy of the ncnn binary from Tencent, please note that the ncnn binary is licensed under the BSD 3-Clause License. | |||
| If you have downloaded a copy of the ncnn source code from Tencent, please note that ncnn source code is licensed under the BSD 3-Clause License, except for the third-party components listed below which are subject to different license terms. Your integration of ncnn into your own projects may require compliance with the BSD 3-Clause License, as well as the other licenses applicable to the third-party components included within ncnn. | |||
| A copy of the BSD 3-Clause License is included in this file. | |||
| Other dependencies and licenses: | |||
| Open Source Software Licensed Under the zlib License: | |||
| The below software in this distribution may have been modified by THL A29 Limited (“Tencent Modifications”). All Tencent Modifications are Copyright (C) 2017 THL A29 Limited. | |||
| ---------------------------------------------------------------------------------------- | |||
| 1. neon_mathfun.h | |||
| Copyright (C) 2011 Julien Pommier | |||
| 2. sse_mathfun.h | |||
| Copyright (C) 2007 Julien Pommier | |||
| 3. avx_mathfun.h | |||
| Copyright (C) 2012 Giovanni Garberoglio | |||
| Interdisciplinary Laboratory for Computational Science (LISC) | |||
| Fondazione Bruno Kessler and University of Trento | |||
| via Sommarive, 18 | |||
| I-38123 Trento (Italy) | |||
| Terms of the zlib License: | |||
| --------------------------------------------------- | |||
| Copyright (c) <year> <copyright holders> | |||
| This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software. | |||
| Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: | |||
| 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. | |||
| 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. | |||
| 3. This notice may not be removed or altered from any source distribution. | |||
| Open Source Software Licensed Under the BSD 2-Clause License: | |||
| The below software in this distribution may have been modified by THL A29 Limited (“Tencent Modifications”). All Tencent Modifications are Copyright (C) 2017 THL A29 Limited. | |||
| ---------------------------------------------------------------------------------------- | |||
| 1. squeezenet 1.1 | |||
| Copyright (c) 2016 Forrest N. Iandola and Matthew W. Moskewicz and Khalid Ashraf and Song Han and William J. Dally and Kurt Keutzer | |||
| All rights reserved. | |||
| 2. caffe.proto master | |||
| All contributions by the University of California: | |||
| Copyright (c) 2014-2017 The Regents of the University of California (Regents) | |||
| All rights reserved. | |||
| All other contributions: | |||
| Copyright (c) 2014-2017, the respective contributors | |||
| All rights reserved. | |||
| Terms of the BSD 2-Clause License: | |||
| -------------------------------------------------------------------- | |||
| Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: | |||
| Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. | |||
| Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| Open Source Software Licensed Under the BSD 3-Clause License: | |||
| The below software in this distribution may have been modified by THL A29 Limited (“Tencent Modifications”). All Tencent Modifications are Copyright (C) 2017 THL A29 Limited. | |||
| ---------------------------------------------------------------------------------------- | |||
| 1. android.toolchain.cmake master | |||
| Copyright (c) 2010-2011, Ethan Rublee | |||
| Copyright (c) 2011-2014, Andrey Kamaev | |||
| All rights reserved. | |||
| Terms of the BSD 3-Clause License: | |||
| -------------------------------------------------------------------- | |||
| Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: | |||
| Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. | |||
| Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. | |||
| Neither the name of [copyright holder] nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| @@ -0,0 +1,44 @@ | |||
| # ncnn | |||
| --- | |||
| ncnn 是一个为手机端极致优化的高性能神经网络前向计算框架。ncnn 从设计之初深刻考虑手机端的部署和使用。无第三方依赖,跨平台,手机端 cpu 的速度快于目前所有已知的开源框架。基于 ncnn,开发者能够将深度学习算法轻松移植到手机端高效执行,开发出人工智能 APP,将 AI 带到你的指尖。ncnn 目前已在腾讯多款应用中使用,如 QQ,Qzone,微信,天天P图等。 | |||
| ncnn is a high-performance neural network inference framework optimized for mobile platforms. From the very beginning, ncnn has been designed with deployment and use on mobile phones in mind. It has no third-party dependencies, is cross-platform, and runs faster than all known open-source frameworks on mobile phone CPUs. With ncnn's efficient implementation, developers can easily deploy deep learning models to mobile platforms, create intelligent apps, and bring artificial intelligence to your fingertips. ncnn is currently used in many Tencent applications, such as QQ, Qzone, WeChat and Pitu. | |||
| --- | |||
| ### 功能概述 | |||
| * 支持卷积神经网络,支持多输入和多分支结构,可计算部分分支 | |||
| * 无任何第三方库依赖,不依赖 BLAS/NNPACK 等计算框架 | |||
| * 纯 C++ 实现,跨平台,支持 android ios 等 | |||
| * ARM NEON 汇编级良心优化,计算速度极快 | |||
| * 精细的内存管理和数据结构设计,内存占用极低 | |||
| * 支持多核并行计算加速,ARM big.LITTLE cpu 调度优化 | |||
| * 整体库体积小于 500K,并可轻松精简到小于 300K | |||
| * 可扩展的模型设计,支持 8bit 量化和半精度浮点存储,可导入 caffe 模型 | |||
| * 支持直接内存零拷贝引用加载网络模型 | |||
| * 可注册自定义层实现并扩展 | |||
| * 恩,很强就是了,不怕被塞卷 QvQ | |||
| ### Features | |||
| * Supports convolutional neural networks, including multi-input and multi-branch structures, and can compute only part of the branches | |||
| * No third-party library dependencies; does not rely on BLAS/NNPACK or any other computing framework | |||
| * Pure C++ implementation, cross-platform, supports Android, iOS and so on | |||
| * Carefully optimized at the ARM NEON assembly level, so computation is extremely fast | |||
| * Sophisticated memory management and data structure design, very low memory footprint | |||
| * Supports multi-core parallel computing acceleration and ARM big.LITTLE CPU scheduling optimization | |||
| * The overall library size is less than 500K, and can easily be reduced to less than 300K | |||
| * Extensible model design, supports 8-bit quantization and half-precision floating-point storage, can import caffe models | |||
| * Supports loading network models by direct zero-copy memory reference | |||
| * Custom layer implementations can be registered to extend the framework | |||
| * Well, it is strong, not afraid of being stuffed with 卷 QvQ | |||
| --- | |||
| ### License | |||
| BSD 3 Clause | |||
| @@ -0,0 +1,33 @@ | |||
| #!/usr/bin/env bash | |||
| # Configures, builds and installs ncnn for each mobile target, one build | |||
| # directory per target. Paths are relative, so run it from the directory | |||
| # that contains the toolchain files. | |||
| # Exit status: 0 when every target succeeded, 1 when at least one failed. | |||
| failed=0 | |||
| # build_target <build-dir> [cmake arguments...] | |||
| # Runs cmake, make and make install inside <build-dir>. The steps are chained | |||
| # so that a failing step skips the remaining steps of that target only; | |||
| # the other targets are still attempted. | |||
| build_target() | |||
| { | |||
|     local build_dir="$1" | |||
|     shift | |||
|     # The subshell keeps the directory change local to this target. | |||
|     ( | |||
|         mkdir -p "$build_dir" && | |||
|         cd "$build_dir" && | |||
|         cmake "$@" .. && | |||
|         make && | |||
|         make install | |||
|     ) || { | |||
|         echo "build failed: $build_dir" >&2 | |||
|         failed=1 | |||
|     } | |||
| } | |||
| ##### android armv7 | |||
| build_target build-android-armv7 -DCMAKE_TOOLCHAIN_FILE=../android.toolchain.cmake -DANDROID_ABI="armeabi-v7a with NEON" -DANDROID_NATIVE_API_LEVEL=android-9 -DANDROID_FORCE_ARM_BUILD=OFF -DANDROID_STL_FORCE_FEATURES=OFF | |||
| ##### android aarch64 | |||
| build_target build-android-aarch64 -DCMAKE_TOOLCHAIN_FILE=../android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_NATIVE_API_LEVEL=android-21 -DANDROID_FORCE_ARM_BUILD=OFF -DANDROID_STL_FORCE_FEATURES=OFF | |||
| ##### ios armv7 arm64 | |||
| build_target build-ios -DCMAKE_TOOLCHAIN_FILE=../iosxc.toolchain.cmake | |||
| ##### ios simulator i386 x86_64 | |||
| build_target build-ios-sim -DCMAKE_TOOLCHAIN_FILE=../iossimxc.toolchain.cmake | |||
| exit "$failed" | |||
| @@ -0,0 +1,9 @@ | |||
| # Builds the squeezenet example executable from squeezenet.cpp and links it | |||
| # against the ncnn target and the OpenCV core, highgui and imgproc components. | |||
| find_package(OpenCV REQUIRED core highgui imgproc) | |||
| # Headers come from the sibling src directory, in both the source and the build tree. | |||
| include_directories( | |||
|     ${CMAKE_CURRENT_SOURCE_DIR}/../src | |||
|     ${CMAKE_CURRENT_BINARY_DIR}/../src | |||
| ) | |||
| add_executable(squeezenet squeezenet.cpp) | |||
| target_link_libraries(squeezenet ncnn ${OpenCV_LIBS}) | |||
| @@ -0,0 +1,15 @@ | |||
| <?xml version="1.0" encoding="utf-8"?> | |||
| <!-- Manifest of the com.tencent.squeezencnn example app: one application with a | |||
|      single activity, MainActivity, registered as the MAIN/LAUNCHER entry point. --> | |||
| <manifest xmlns:android="http://schemas.android.com/apk/res/android" | |||
| package="com.tencent.squeezencnn" | |||
| android:versionCode="1" | |||
| android:versionName="1.1"> | |||
| <application android:label="@string/app_name" > | |||
| <activity android:name="MainActivity" | |||
| android:label="@string/app_name"> | |||
| <intent-filter> | |||
| <action android:name="android.intent.action.MAIN" /> | |||
| <category android:name="android.intent.category.LAUNCHER" /> | |||
| </intent-filter> | |||
| </activity> | |||
| </application> | |||
| </manifest> | |||
| @@ -0,0 +1,21 @@ | |||
| # This file is used to override default values used by the Ant build system. | |||
| # | |||
| # This file must be checked into Version Control Systems, as it is | |||
| # integral to the build system of your project. | |||
| # This file is only used by the Ant script. | |||
| # You can use this to override default values such as | |||
| # 'source.dir' for the location of your java source folder and | |||
| # 'out.dir' for the location of your output folder. | |||
| # You can also use it to define how the release builds are signed by declaring | |||
| # the following properties: | |||
| # 'key.store' for the location of your keystore and | |||
| # 'key.alias' for the name of the key to use. | |||
| # The password will be asked during the build when you use the 'release' target. | |||
| # NOTE: key.store is a machine-specific absolute path; point it at your own keystore. | |||
| key.store=/home/nihui/osd/nihuini-release-key.keystore | |||
| key.alias=nihuini | |||
| # key.store.password and key.alias.password are deliberately not set in this | |||
| # version-controlled file, so signing credentials are never committed in plain text. | |||
| @@ -0,0 +1 @@ | |||
| ../../squeezenet_v1.1.bin | |||
| @@ -0,0 +1 @@ | |||
| ../../synset_words.txt | |||
| @@ -0,0 +1,92 @@ | |||
| <?xml version="1.0" encoding="UTF-8"?> | |||
| <project name="squeezencnn" default="help"> | |||
| <!-- The local.properties file is created and updated by the 'android' tool. | |||
| It contains the path to the SDK. It should *NOT* be checked into | |||
| Version Control Systems. --> | |||
| <property file="local.properties" /> | |||
| <!-- The ant.properties file can be created by you. It is only edited by the | |||
| 'android' tool to add properties to it. | |||
| This is the place to change some Ant specific build properties. | |||
| Here are some properties you may want to change/update: | |||
| source.dir | |||
| The name of the source directory. Default is 'src'. | |||
| out.dir | |||
| The name of the output directory. Default is 'bin'. | |||
| For other overridable properties, look at the beginning of the rules | |||
| files in the SDK, at tools/ant/build.xml | |||
| Properties related to the SDK location or the project target should | |||
| be updated using the 'android' tool with the 'update' action. | |||
| This file is an integral part of the build system for your | |||
| application and should be checked into Version Control Systems. | |||
| --> | |||
| <property file="ant.properties" /> | |||
| <!-- if sdk.dir was not set from one of the property files, then | |||
| get it from the ANDROID_HOME env var. | |||
| This must be done before we load project.properties since | |||
| the proguard config can use sdk.dir --> | |||
| <property environment="env" /> | |||
| <condition property="sdk.dir" value="${env.ANDROID_HOME}"> | |||
| <isset property="env.ANDROID_HOME" /> | |||
| </condition> | |||
| <!-- The project.properties file is created and updated by the 'android' | |||
| tool, as well as ADT. | |||
| This contains project specific properties such as project target, and library | |||
| dependencies. Lower level build properties are stored in ant.properties | |||
| (or in .classpath for Eclipse projects). | |||
| This file is an integral part of the build system for your | |||
| application and should be checked into Version Control Systems. --> | |||
| <loadproperties srcFile="project.properties" /> | |||
| <!-- quick check on sdk.dir --> | |||
| <fail | |||
| message="sdk.dir is missing. Make sure to generate local.properties using 'android update project' or to inject it through the ANDROID_HOME environment variable." | |||
| unless="sdk.dir" | |||
| /> | |||
| <!-- | |||
| Import per project custom build rules if present at the root of the project. | |||
| This is the place to put custom intermediary targets such as: | |||
| -pre-build | |||
| -pre-compile | |||
| -post-compile (This is typically used for code obfuscation. | |||
| Compiled code location: ${out.classes.absolute.dir} | |||
| If this is not done in place, override ${out.dex.input.absolute.dir}) | |||
| -post-package | |||
| -post-build | |||
| -pre-clean | |||
| --> | |||
| <import file="custom_rules.xml" optional="true" /> | |||
| <!-- Import the actual build file. | |||
| To customize existing targets, there are two options: | |||
| - Customize only one target: | |||
| - copy/paste the target into this file, *before* the | |||
| <import> task. | |||
| - customize it to your needs. | |||
| - Customize the whole content of build.xml | |||
| - copy/paste the content of the rules files (minus the top node) | |||
| into this file, replacing the <import> task. | |||
| - customize to your needs. | |||
| *********************** | |||
| ****** IMPORTANT ****** | |||
| *********************** | |||
| In all cases you must update the value of version-tag below to read 'custom' instead of an integer, | |||
| in order to avoid having your file be overridden by tools such as "android update project" | |||
| --> | |||
| <!-- version-tag: 1 --> | |||
| <import file="${sdk.dir}/tools/ant/build.xml" /> | |||
| </project> | |||
| @@ -0,0 +1,30 @@ | |||
| LOCAL_PATH := $(call my-dir) | |||
| # Root of the ncnn install tree (must contain lib/libncnn.a and include/). | |||
| # Assigned with ?= so it can be supplied from the environment or the ndk-build | |||
| # command line; the default below is a developer-specific path, change it to yours. | |||
| NCNN_INSTALL_PATH ?= /home/nihui/dev/qqfacecnn/ncnn/build-android-armv7/install | |||
| # Module 1: the prebuilt static ncnn library taken from the install tree. | |||
| include $(CLEAR_VARS) | |||
| LOCAL_MODULE := ncnn | |||
| LOCAL_SRC_FILES := $(NCNN_INSTALL_PATH)/lib/libncnn.a | |||
| include $(PREBUILT_STATIC_LIBRARY) | |||
| # Module 2: the JNI shared library, built from squeezencnn_jni.cpp and linked | |||
| # statically against the ncnn module above. | |||
| include $(CLEAR_VARS) | |||
| LOCAL_MODULE := squeezencnn | |||
| LOCAL_SRC_FILES := squeezencnn_jni.cpp | |||
| LOCAL_C_INCLUDES := $(NCNN_INSTALL_PATH)/include | |||
| LOCAL_STATIC_LIBRARIES := ncnn | |||
| LOCAL_CFLAGS := -O2 -fvisibility=hidden -fomit-frame-pointer -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math | |||
| LOCAL_CPPFLAGS := -O2 -fvisibility=hidden -fvisibility-inlines-hidden -fomit-frame-pointer -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math | |||
| # Pairs with -ffunction-sections/-fdata-sections above. | |||
| LOCAL_LDFLAGS += -Wl,--gc-sections | |||
| # OpenMP is enabled for compiling and for linking. | |||
| LOCAL_CFLAGS += -fopenmp | |||
| LOCAL_CPPFLAGS += -fopenmp | |||
| LOCAL_LDFLAGS += -fopenmp | |||
| LOCAL_LDLIBS := -lz -llog -ljnigraphics | |||
| include $(BUILD_SHARED_LIBRARY) | |||
| @@ -0,0 +1,7 @@ | |||
| # ndk-build application-level settings for this example. | |||
| # C++ runtime: gnustl_static is selected; stlport_static is kept as a commented-out alternative. | |||
| # APP_STL := stlport_static | |||
| APP_STL := gnustl_static | |||
| # Target ABI: only armeabi-v7a is built; the commented line would add armeabi as well. | |||
| # APP_ABI := armeabi armeabi-v7a | |||
| APP_ABI := armeabi-v7a | |||
| # Platform level and toolchain version used for the build. | |||
| APP_PLATFORM := android-9 | |||
| NDK_TOOLCHAIN_VERSION := 4.9 | |||
| @@ -0,0 +1,181 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include <android/bitmap.h> | |||
| #include <android/log.h> | |||
| #include <jni.h> | |||
| #include <string> | |||
| #include <vector> | |||
| // ncnn | |||
| #include "net.h" | |||
| #include "squeezenet_v1.1.id.h" | |||
| #include <sys/time.h> | |||
| #include <unistd.h> | |||
| // Timestamps shared by bench_start() and bench_end(). | |||
| static struct timeval tv_begin; | |||
| static struct timeval tv_end; | |||
| // Length of the most recently measured interval, in milliseconds. | |||
| static double elapsed; | |||
| // Records the start of a measured interval. | |||
| static void bench_start() | |||
| { | |||
|     gettimeofday(&tv_begin, NULL); | |||
| } | |||
| // Logs the time since the last bench_start() call, in milliseconds, | |||
| // followed by `comment`, to the Android log under the "SqueezeNcnn" tag. | |||
| static void bench_end(const char* comment) | |||
| { | |||
|     gettimeofday(&tv_end, NULL); | |||
|     // Double constants keep the whole computation in double precision; float | |||
|     // constants would round the microsecond count for longer intervals. | |||
|     elapsed = ((tv_end.tv_sec - tv_begin.tv_sec) * 1000000.0 + (tv_end.tv_usec - tv_begin.tv_usec)) / 1000.0; | |||
|     // fprintf(stderr, "%.2fms %s\n", elapsed, comment); | |||
|     __android_log_print(ANDROID_LOG_DEBUG, "SqueezeNcnn", "%.2fms %s", elapsed, comment); | |||
| } | |||
| // Copy of the param bytes passed to Init(); handed to squeezenet.load_param(). | |||
| // NOTE(review): presumably kept at file scope because the net may keep referencing this memory; confirm. | |||
| static std::vector<unsigned char> squeezenet_param; | |||
| // Copy of the model bytes passed to Init(); handed to squeezenet.load_model(). | |||
| static std::vector<unsigned char> squeezenet_bin; | |||
| // Class labels, one entry per line of the words buffer passed to Init(). | |||
| static std::vector<std::string> squeezenet_words; | |||
| // The network instance shared by Init() and Detect(). | |||
| static ncnn::Net squeezenet; | |||
| // Splits `str` at every occurrence of `delimiter` and returns the pieces in order. | |||
| // The result always has at least one element: when the delimiter does not occur | |||
| // (or is empty) the whole input is returned as the single piece. Adjacent | |||
| // delimiters and a trailing delimiter produce empty pieces. | |||
| static std::vector<std::string> split_string(const std::string& str, const std::string& delimiter) | |||
| { | |||
|     std::vector<std::string> strings; | |||
|     // An empty delimiter matches at every position, which would never make | |||
|     // progress in a meaningful way; treat it as "no delimiter". | |||
|     if (delimiter.empty()) | |||
|     { | |||
|         strings.push_back(str); | |||
|         return strings; | |||
|     } | |||
|     std::string::size_type pos = 0; | |||
|     std::string::size_type prev = 0; | |||
|     while ((pos = str.find(delimiter, prev)) != std::string::npos) | |||
|     { | |||
|         strings.push_back(str.substr(prev, pos - prev)); | |||
|         // Skip the whole delimiter, not just its first character. | |||
|         prev = pos + delimiter.size(); | |||
|     } | |||
|     // To get the last substring (or only, if delimiter is not found) | |||
|     strings.push_back(str.substr(prev)); | |||
|     return strings; | |||
| } | |||
| extern "C" { | |||
| // public native boolean Init(byte[] param, byte[] bin, byte[] words); | |||
| // Copies the three Java byte arrays into the file-level buffers, loads the | |||
| // network parameters and weights from them, and splits the words buffer into | |||
| // one label per line. Returns JNI_FALSE when any array is null, JNI_TRUE otherwise. | |||
| JNIEXPORT jboolean JNICALL Java_com_tencent_squeezencnn_SqueezeNcnn_Init(JNIEnv* env, jobject thiz, jbyteArray param, jbyteArray bin, jbyteArray words) | |||
| { | |||
|     // A null array reference must not be passed on to the JNI array functions. | |||
|     if (param == NULL || bin == NULL || words == NULL) | |||
|     { | |||
|         __android_log_print(ANDROID_LOG_ERROR, "SqueezeNcnn", "Init called with a null array"); | |||
|         return JNI_FALSE; | |||
|     } | |||
|     // init param | |||
|     { | |||
|         int len = env->GetArrayLength(param); | |||
|         squeezenet_param.resize(len); | |||
|         env->GetByteArrayRegion(param, 0, len, (jbyte*)squeezenet_param.data()); | |||
|         int ret = squeezenet.load_param(squeezenet_param.data()); | |||
|         __android_log_print(ANDROID_LOG_DEBUG, "SqueezeNcnn", "load_param %d %d", ret, len); | |||
|     } | |||
|     // init bin | |||
|     { | |||
|         int len = env->GetArrayLength(bin); | |||
|         squeezenet_bin.resize(len); | |||
|         env->GetByteArrayRegion(bin, 0, len, (jbyte*)squeezenet_bin.data()); | |||
|         int ret = squeezenet.load_model(squeezenet_bin.data()); | |||
|         __android_log_print(ANDROID_LOG_DEBUG, "SqueezeNcnn", "load_model %d %d", ret, len); | |||
|     } | |||
|     // init words | |||
|     { | |||
|         int len = env->GetArrayLength(words); | |||
|         std::string words_buffer; | |||
|         words_buffer.resize(len); | |||
|         // Write through &buffer[0] rather than a cast-away-const data() pointer; | |||
|         // indexing an empty string for writing is avoided by the length check. | |||
|         if (len > 0) | |||
|             env->GetByteArrayRegion(words, 0, len, (jbyte*)&words_buffer[0]); | |||
|         squeezenet_words = split_string(words_buffer, "\n"); | |||
|     } | |||
|     return JNI_TRUE; | |||
| } | |||
| // public native String Detect(Bitmap bitmap); | |||
| // Runs the network on a 227x227 RGBA_8888 bitmap and returns a string of the | |||
| // form "<label> = <score>" for the highest-scoring class. Returns NULL when the | |||
| // bitmap cannot be queried or locked, or has a different size or pixel format. | |||
| JNIEXPORT jstring JNICALL Java_com_tencent_squeezencnn_SqueezeNcnn_Detect(JNIEnv* env, jobject thiz, jobject bitmap) | |||
| { | |||
|     bench_start(); | |||
|     // ncnn from bitmap | |||
|     ncnn::Mat in; | |||
|     { | |||
|         AndroidBitmapInfo info; | |||
|         // The bitmap API reports failure with a negative return code; `info` | |||
|         // must not be read in that case. | |||
|         if (AndroidBitmap_getInfo(env, bitmap, &info) < 0) | |||
|             return NULL; | |||
|         int width = info.width; | |||
|         int height = info.height; | |||
|         if (width != 227 || height != 227) | |||
|             return NULL; | |||
|         if (info.format != ANDROID_BITMAP_FORMAT_RGBA_8888) | |||
|             return NULL; | |||
|         void* indata = NULL; | |||
|         // Do not touch the pixel pointer unless locking succeeded. | |||
|         if (AndroidBitmap_lockPixels(env, bitmap, &indata) < 0 || indata == NULL) | |||
|             return NULL; | |||
|         in = ncnn::Mat::from_pixels((const unsigned char*)indata, ncnn::Mat::PIXEL_RGBA2BGR, width, height); | |||
|         AndroidBitmap_unlockPixels(env, bitmap); | |||
|     } | |||
|     // squeezenet | |||
|     std::vector<float> cls_scores; | |||
|     { | |||
|         const float mean_vals[3] = {104.f, 117.f, 123.f}; | |||
|         in.substract_mean_normalize(mean_vals, 0); | |||
|         ncnn::Extractor ex = squeezenet.create_extractor(); | |||
|         ex.set_light_mode(true); | |||
|         ex.set_num_threads(4); | |||
|         ex.input(squeezenet_v1_1_param_id::BLOB_data, in); | |||
|         ncnn::Mat out; | |||
|         ex.extract(squeezenet_v1_1_param_id::BLOB_prob, out); | |||
|         // One score per output channel: the first value of each channel. | |||
|         cls_scores.resize(out.c); | |||
|         for (int j=0; j<out.c; j++) | |||
|         { | |||
|             const float* prob = out.data + out.cstep * j; | |||
|             cls_scores[j] = prob[0]; | |||
|         } | |||
|     } | |||
|     // return top class | |||
|     int top_class = 0; | |||
|     float max_score = 0.f; | |||
|     for (size_t i=0; i<cls_scores.size(); i++) | |||
|     { | |||
|         float s = cls_scores[i]; | |||
|         // __android_log_print(ANDROID_LOG_DEBUG, "SqueezeNcnn", "%d %f", i, s); | |||
|         if (s > max_score) | |||
|         { | |||
|             top_class = i; | |||
|             max_score = s; | |||
|         } | |||
|     } | |||
|     // Look the label up defensively: the label table may be empty (Init not | |||
|     // called) or shorter than the number of classes. | |||
|     std::string label; | |||
|     if ((size_t)top_class < squeezenet_words.size()) | |||
|     { | |||
|         const std::string& word = squeezenet_words[top_class]; | |||
|         // Skip the leading 10 characters ("n03179701 ") only when the line is | |||
|         // long enough; a shorter line is used as it is. | |||
|         label = word.size() >= 10 ? word.substr(10) : word; | |||
|     } | |||
|     char tmp[32]; | |||
|     snprintf(tmp, sizeof(tmp), "%.3f", max_score); | |||
|     std::string result_str = label + " = " + tmp; | |||
|     jstring result = env->NewStringUTF(result_str.c_str()); | |||
|     bench_end("detect"); | |||
|     return result; | |||
| } | |||
| } | |||
| @@ -0,0 +1,163 @@ | |||
| #ifndef NCNN_INCLUDE_GUARD_squeezenet_v1_1_id_h | |||
| #define NCNN_INCLUDE_GUARD_squeezenet_v1_1_id_h | |||
| namespace squeezenet_v1_1_param_id { | |||
| const int LAYER_data = 0; | |||
| const int BLOB_data = 0; | |||
| const int LAYER_conv1 = 1; | |||
| const int BLOB_conv1 = 1; | |||
| const int LAYER_relu_conv1 = 2; | |||
| const int BLOB_conv1_relu_conv1 = 2; | |||
| const int LAYER_pool1 = 3; | |||
| const int BLOB_pool1 = 3; | |||
| const int LAYER_fire2_squeeze1x1 = 4; | |||
| const int BLOB_fire2_squeeze1x1 = 4; | |||
| const int LAYER_fire2_relu_squeeze1x1 = 5; | |||
| const int BLOB_fire2_squeeze1x1_fire2_relu_squeeze1x1 = 5; | |||
| const int LAYER_splitncnn_0 = 6; | |||
| const int BLOB_fire2_squeeze1x1_fire2_relu_squeeze1x1_splitncnn_0 = 6; | |||
| const int BLOB_fire2_squeeze1x1_fire2_relu_squeeze1x1_splitncnn_1 = 7; | |||
| const int LAYER_fire2_expand1x1 = 7; | |||
| const int BLOB_fire2_expand1x1 = 8; | |||
| const int LAYER_fire2_relu_expand1x1 = 8; | |||
| const int BLOB_fire2_expand1x1_fire2_relu_expand1x1 = 9; | |||
| const int LAYER_fire2_expand3x3 = 9; | |||
| const int BLOB_fire2_expand3x3 = 10; | |||
| const int LAYER_fire2_relu_expand3x3 = 10; | |||
| const int BLOB_fire2_expand3x3_fire2_relu_expand3x3 = 11; | |||
| const int LAYER_fire2_concat = 11; | |||
| const int BLOB_fire2_concat = 12; | |||
| const int LAYER_fire3_squeeze1x1 = 12; | |||
| const int BLOB_fire3_squeeze1x1 = 13; | |||
| const int LAYER_fire3_relu_squeeze1x1 = 13; | |||
| const int BLOB_fire3_squeeze1x1_fire3_relu_squeeze1x1 = 14; | |||
| const int LAYER_splitncnn_1 = 14; | |||
| const int BLOB_fire3_squeeze1x1_fire3_relu_squeeze1x1_splitncnn_0 = 15; | |||
| const int BLOB_fire3_squeeze1x1_fire3_relu_squeeze1x1_splitncnn_1 = 16; | |||
| const int LAYER_fire3_expand1x1 = 15; | |||
| const int BLOB_fire3_expand1x1 = 17; | |||
| const int LAYER_fire3_relu_expand1x1 = 16; | |||
| const int BLOB_fire3_expand1x1_fire3_relu_expand1x1 = 18; | |||
| const int LAYER_fire3_expand3x3 = 17; | |||
| const int BLOB_fire3_expand3x3 = 19; | |||
| const int LAYER_fire3_relu_expand3x3 = 18; | |||
| const int BLOB_fire3_expand3x3_fire3_relu_expand3x3 = 20; | |||
| const int LAYER_fire3_concat = 19; | |||
| const int BLOB_fire3_concat = 21; | |||
| const int LAYER_pool3 = 20; | |||
| const int BLOB_pool3 = 22; | |||
| const int LAYER_fire4_squeeze1x1 = 21; | |||
| const int BLOB_fire4_squeeze1x1 = 23; | |||
| const int LAYER_fire4_relu_squeeze1x1 = 22; | |||
| const int BLOB_fire4_squeeze1x1_fire4_relu_squeeze1x1 = 24; | |||
| const int LAYER_splitncnn_2 = 23; | |||
| const int BLOB_fire4_squeeze1x1_fire4_relu_squeeze1x1_splitncnn_0 = 25; | |||
| const int BLOB_fire4_squeeze1x1_fire4_relu_squeeze1x1_splitncnn_1 = 26; | |||
| const int LAYER_fire4_expand1x1 = 24; | |||
| const int BLOB_fire4_expand1x1 = 27; | |||
| const int LAYER_fire4_relu_expand1x1 = 25; | |||
| const int BLOB_fire4_expand1x1_fire4_relu_expand1x1 = 28; | |||
| const int LAYER_fire4_expand3x3 = 26; | |||
| const int BLOB_fire4_expand3x3 = 29; | |||
| const int LAYER_fire4_relu_expand3x3 = 27; | |||
| const int BLOB_fire4_expand3x3_fire4_relu_expand3x3 = 30; | |||
| const int LAYER_fire4_concat = 28; | |||
| const int BLOB_fire4_concat = 31; | |||
| const int LAYER_fire5_squeeze1x1 = 29; | |||
| const int BLOB_fire5_squeeze1x1 = 32; | |||
| const int LAYER_fire5_relu_squeeze1x1 = 30; | |||
| const int BLOB_fire5_squeeze1x1_fire5_relu_squeeze1x1 = 33; | |||
| const int LAYER_splitncnn_3 = 31; | |||
| const int BLOB_fire5_squeeze1x1_fire5_relu_squeeze1x1_splitncnn_0 = 34; | |||
| const int BLOB_fire5_squeeze1x1_fire5_relu_squeeze1x1_splitncnn_1 = 35; | |||
| const int LAYER_fire5_expand1x1 = 32; | |||
| const int BLOB_fire5_expand1x1 = 36; | |||
| const int LAYER_fire5_relu_expand1x1 = 33; | |||
| const int BLOB_fire5_expand1x1_fire5_relu_expand1x1 = 37; | |||
| const int LAYER_fire5_expand3x3 = 34; | |||
| const int BLOB_fire5_expand3x3 = 38; | |||
| const int LAYER_fire5_relu_expand3x3 = 35; | |||
| const int BLOB_fire5_expand3x3_fire5_relu_expand3x3 = 39; | |||
| const int LAYER_fire5_concat = 36; | |||
| const int BLOB_fire5_concat = 40; | |||
| const int LAYER_pool5 = 37; | |||
| const int BLOB_pool5 = 41; | |||
| const int LAYER_fire6_squeeze1x1 = 38; | |||
| const int BLOB_fire6_squeeze1x1 = 42; | |||
| const int LAYER_fire6_relu_squeeze1x1 = 39; | |||
| const int BLOB_fire6_squeeze1x1_fire6_relu_squeeze1x1 = 43; | |||
| const int LAYER_splitncnn_4 = 40; | |||
| const int BLOB_fire6_squeeze1x1_fire6_relu_squeeze1x1_splitncnn_0 = 44; | |||
| const int BLOB_fire6_squeeze1x1_fire6_relu_squeeze1x1_splitncnn_1 = 45; | |||
| const int LAYER_fire6_expand1x1 = 41; | |||
| const int BLOB_fire6_expand1x1 = 46; | |||
| const int LAYER_fire6_relu_expand1x1 = 42; | |||
| const int BLOB_fire6_expand1x1_fire6_relu_expand1x1 = 47; | |||
| const int LAYER_fire6_expand3x3 = 43; | |||
| const int BLOB_fire6_expand3x3 = 48; | |||
| const int LAYER_fire6_relu_expand3x3 = 44; | |||
| const int BLOB_fire6_expand3x3_fire6_relu_expand3x3 = 49; | |||
| const int LAYER_fire6_concat = 45; | |||
| const int BLOB_fire6_concat = 50; | |||
| const int LAYER_fire7_squeeze1x1 = 46; | |||
| const int BLOB_fire7_squeeze1x1 = 51; | |||
| const int LAYER_fire7_relu_squeeze1x1 = 47; | |||
| const int BLOB_fire7_squeeze1x1_fire7_relu_squeeze1x1 = 52; | |||
| const int LAYER_splitncnn_5 = 48; | |||
| const int BLOB_fire7_squeeze1x1_fire7_relu_squeeze1x1_splitncnn_0 = 53; | |||
| const int BLOB_fire7_squeeze1x1_fire7_relu_squeeze1x1_splitncnn_1 = 54; | |||
| const int LAYER_fire7_expand1x1 = 49; | |||
| const int BLOB_fire7_expand1x1 = 55; | |||
| const int LAYER_fire7_relu_expand1x1 = 50; | |||
| const int BLOB_fire7_expand1x1_fire7_relu_expand1x1 = 56; | |||
| const int LAYER_fire7_expand3x3 = 51; | |||
| const int BLOB_fire7_expand3x3 = 57; | |||
| const int LAYER_fire7_relu_expand3x3 = 52; | |||
| const int BLOB_fire7_expand3x3_fire7_relu_expand3x3 = 58; | |||
| const int LAYER_fire7_concat = 53; | |||
| const int BLOB_fire7_concat = 59; | |||
| const int LAYER_fire8_squeeze1x1 = 54; | |||
| const int BLOB_fire8_squeeze1x1 = 60; | |||
| const int LAYER_fire8_relu_squeeze1x1 = 55; | |||
| const int BLOB_fire8_squeeze1x1_fire8_relu_squeeze1x1 = 61; | |||
| const int LAYER_splitncnn_6 = 56; | |||
| const int BLOB_fire8_squeeze1x1_fire8_relu_squeeze1x1_splitncnn_0 = 62; | |||
| const int BLOB_fire8_squeeze1x1_fire8_relu_squeeze1x1_splitncnn_1 = 63; | |||
| const int LAYER_fire8_expand1x1 = 57; | |||
| const int BLOB_fire8_expand1x1 = 64; | |||
| const int LAYER_fire8_relu_expand1x1 = 58; | |||
| const int BLOB_fire8_expand1x1_fire8_relu_expand1x1 = 65; | |||
| const int LAYER_fire8_expand3x3 = 59; | |||
| const int BLOB_fire8_expand3x3 = 66; | |||
| const int LAYER_fire8_relu_expand3x3 = 60; | |||
| const int BLOB_fire8_expand3x3_fire8_relu_expand3x3 = 67; | |||
| const int LAYER_fire8_concat = 61; | |||
| const int BLOB_fire8_concat = 68; | |||
| const int LAYER_fire9_squeeze1x1 = 62; | |||
| const int BLOB_fire9_squeeze1x1 = 69; | |||
| const int LAYER_fire9_relu_squeeze1x1 = 63; | |||
| const int BLOB_fire9_squeeze1x1_fire9_relu_squeeze1x1 = 70; | |||
| const int LAYER_splitncnn_7 = 64; | |||
| const int BLOB_fire9_squeeze1x1_fire9_relu_squeeze1x1_splitncnn_0 = 71; | |||
| const int BLOB_fire9_squeeze1x1_fire9_relu_squeeze1x1_splitncnn_1 = 72; | |||
| const int LAYER_fire9_expand1x1 = 65; | |||
| const int BLOB_fire9_expand1x1 = 73; | |||
| const int LAYER_fire9_relu_expand1x1 = 66; | |||
| const int BLOB_fire9_expand1x1_fire9_relu_expand1x1 = 74; | |||
| const int LAYER_fire9_expand3x3 = 67; | |||
| const int BLOB_fire9_expand3x3 = 75; | |||
| const int LAYER_fire9_relu_expand3x3 = 68; | |||
| const int BLOB_fire9_expand3x3_fire9_relu_expand3x3 = 76; | |||
| const int LAYER_fire9_concat = 69; | |||
| const int BLOB_fire9_concat = 77; | |||
| const int LAYER_drop9 = 70; | |||
| const int BLOB_fire9_concat_drop9 = 78; | |||
| const int LAYER_conv10 = 71; | |||
| const int BLOB_conv10 = 79; | |||
| const int LAYER_relu_conv10 = 72; | |||
| const int BLOB_conv10_relu_conv10 = 80; | |||
| const int LAYER_pool10 = 73; | |||
| const int BLOB_pool10 = 81; | |||
| const int LAYER_prob = 74; | |||
| const int BLOB_prob = 82; | |||
| } // namespace squeezenet_v1_1_param_id | |||
| #endif // NCNN_INCLUDE_GUARD_squeezenet_v1_1_id_h | |||
| @@ -0,0 +1,10 @@ | |||
| # This file is automatically generated by Android Tools. | |||
| # Do not modify this file -- YOUR CHANGES WILL BE ERASED! | |||
| # | |||
| # This file must *NOT* be checked into Version Control Systems, | |||
| # as it contains information specific to your local configuration. | |||
| # location of the SDK. This is only used by Ant | |||
| # For customization when using a Version Control System, please read the | |||
| # header note. | |||
| sdk.dir=/home/nihui/osd/android-sdk-linux | |||
| @@ -0,0 +1,20 @@ | |||
| # To enable ProGuard in your project, edit project.properties | |||
| # to define the proguard.config property as described in that file. | |||
| # | |||
| # Add project specific ProGuard rules here. | |||
| # By default, the flags in this file are appended to flags specified | |||
| # in ${sdk.dir}/tools/proguard/proguard-android.txt | |||
| # You can edit the include path and order by changing the ProGuard | |||
| # include property in project.properties. | |||
| # | |||
| # For more details, see | |||
| # http://developer.android.com/guide/developing/tools/proguard.html | |||
| # Add any project specific keep options here: | |||
| # If your project uses WebView with JS, uncomment the following | |||
| # and specify the fully qualified class name to the JavaScript interface | |||
| # class: | |||
| #-keepclassmembers class fqcn.of.javascript.interface.for.webview { | |||
| # public *; | |||
| #} | |||
| @@ -0,0 +1,14 @@ | |||
| # This file is automatically generated by Android Tools. | |||
| # Do not modify this file -- YOUR CHANGES WILL BE ERASED! | |||
| # | |||
| # This file must be checked in Version Control Systems. | |||
| # | |||
| # To customize properties used by the Ant build system edit | |||
| # "ant.properties", and override values to adapt the script to your | |||
| # project structure. | |||
| # | |||
| # To enable ProGuard to shrink and obfuscate your code, uncomment this (available properties: sdk.dir, user.home): | |||
| #proguard.config=${sdk.dir}/tools/proguard/proguard-android.txt:proguard-project.txt | |||
| # Project target. | |||
| target=android-9 | |||
| @@ -0,0 +1,36 @@ | |||
<?xml version="1.0" encoding="utf-8"?>
<!--
  Main screen layout: a horizontal row with two buttons, a text line for the
  detection result, and an image view that takes the remaining height.
  "match_parent" replaces the deprecated "fill_parent"; both compile to the
  same layout value.
-->
<LinearLayout xmlns:android="http://schemas.android.com/apk/res/android"
    android:orientation="vertical"
    android:layout_width="match_parent"
    android:layout_height="match_parent">

    <!-- Button row -->
    <LinearLayout
        android:orientation="horizontal"
        android:layout_width="match_parent"
        android:layout_height="wrap_content">

        <!-- Opens the image picker (label reads "choose image") -->
        <Button
            android:id="@+id/buttonImage"
            android:layout_width="wrap_content"
            android:layout_height="wrap_content"
            android:text="选图" />

        <!-- Runs detection on the selected image (label reads "recognize") -->
        <Button
            android:id="@+id/buttonDetect"
            android:layout_width="wrap_content"
            android:layout_height="wrap_content"
            android:text="识别" />
    </LinearLayout>

    <!-- Shows the detection result text -->
    <TextView
        android:id="@+id/infoResult"
        android:layout_width="match_parent"
        android:layout_height="wrap_content"
        android:text="" />

    <!-- Preview of the selected image; weighted to fill the remaining space -->
    <ImageView
        android:id="@+id/imageView"
        android:layout_width="match_parent"
        android:layout_height="match_parent"
        android:layout_weight="1" />
</LinearLayout>
| @@ -0,0 +1,4 @@ | |||
| <?xml version="1.0" encoding="utf-8"?> | |||
| <resources> | |||
| <string name="app_name">squeezencnn</string> | |||
| </resources> | |||
| @@ -0,0 +1,189 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| package com.tencent.squeezencnn; | |||
| import android.app.Activity; | |||
| import android.os.Bundle; | |||
| import android.content.Context; | |||
| import android.content.Intent; | |||
| import android.database.Cursor; | |||
| import android.graphics.Bitmap; | |||
| import android.graphics.BitmapFactory; | |||
| import android.net.Uri; | |||
| import android.provider.MediaStore; | |||
| import android.util.Log; | |||
| import android.view.View; | |||
| import android.widget.Button; | |||
| import android.widget.ImageView; | |||
| import android.widget.TextView; | |||
| import java.io.FileNotFoundException; | |||
| import java.io.FileOutputStream; | |||
| import java.io.IOException; | |||
| import java.io.InputStream; | |||
| import com.tencent.squeezencnn.SqueezeNcnn; | |||
| public class MainActivity extends Activity | |||
| { | |||
| private static final int SELECT_IMAGE = 1; | |||
| private TextView infoResult; | |||
| private ImageView imageView; | |||
| private Bitmap yourSelectedImage = null; | |||
| private SqueezeNcnn squeezencnn = new SqueezeNcnn(); | |||
| /** Called when the activity is first created. */ | |||
| @Override | |||
| public void onCreate(Bundle savedInstanceState) | |||
| { | |||
| super.onCreate(savedInstanceState); | |||
| setContentView(R.layout.main); | |||
| try | |||
| { | |||
| initSqueezeNcnn(); | |||
| } | |||
| catch (IOException e) | |||
| { | |||
| Log.e("MainActivity", "initSqueezeNcnn error"); | |||
| } | |||
| infoResult = (TextView) findViewById(R.id.infoResult); | |||
| imageView = (ImageView) findViewById(R.id.imageView); | |||
| Button buttonImage = (Button) findViewById(R.id.buttonImage); | |||
| buttonImage.setOnClickListener(new View.OnClickListener() { | |||
| @Override | |||
| public void onClick(View arg0) { | |||
| Intent i = new Intent(Intent.ACTION_PICK); | |||
| i.setType("image/*"); | |||
| startActivityForResult(i, SELECT_IMAGE); | |||
| } | |||
| }); | |||
| Button buttonDetect = (Button) findViewById(R.id.buttonDetect); | |||
| buttonDetect.setOnClickListener(new View.OnClickListener() { | |||
| @Override | |||
| public void onClick(View arg0) { | |||
| if (yourSelectedImage == null) | |||
| return; | |||
| String result = squeezencnn.Detect(yourSelectedImage); | |||
| if (result == null) | |||
| { | |||
| infoResult.setText("detect failed"); | |||
| } | |||
| else | |||
| { | |||
| infoResult.setText(result); | |||
| } | |||
| } | |||
| }); | |||
| } | |||
| private void initSqueezeNcnn() throws IOException | |||
| { | |||
| byte[] param = null; | |||
| byte[] bin = null; | |||
| byte[] words = null; | |||
| { | |||
| InputStream assetsInputStream = getAssets().open("squeezenet_v1.1.param.bin"); | |||
| int available = assetsInputStream.available(); | |||
| param = new byte[available]; | |||
| int byteCode = assetsInputStream.read(param); | |||
| assetsInputStream.close(); | |||
| } | |||
| { | |||
| InputStream assetsInputStream = getAssets().open("squeezenet_v1.1.bin"); | |||
| int available = assetsInputStream.available(); | |||
| bin = new byte[available]; | |||
| int byteCode = assetsInputStream.read(bin); | |||
| assetsInputStream.close(); | |||
| } | |||
| { | |||
| InputStream assetsInputStream = getAssets().open("synset_words.txt"); | |||
| int available = assetsInputStream.available(); | |||
| words = new byte[available]; | |||
| int byteCode = assetsInputStream.read(words); | |||
| assetsInputStream.close(); | |||
| } | |||
| squeezencnn.Init(param, bin, words); | |||
| } | |||
| @Override | |||
| protected void onActivityResult(int requestCode, int resultCode, Intent data) | |||
| { | |||
| super.onActivityResult(requestCode, resultCode, data); | |||
| if (resultCode == RESULT_OK && null != data) { | |||
| Uri selectedImage = data.getData(); | |||
| try | |||
| { | |||
| if (requestCode == SELECT_IMAGE) { | |||
| Bitmap bitmap = decodeUri(selectedImage); | |||
| Bitmap rgba = bitmap.copy(Bitmap.Config.ARGB_8888, true); | |||
| // resize to 227x227 | |||
| yourSelectedImage = Bitmap.createScaledBitmap(rgba, 227, 227, false); | |||
| imageView.setImageBitmap(yourSelectedImage); | |||
| } | |||
| } | |||
| catch (FileNotFoundException e) | |||
| { | |||
| Log.e("MainActivity", "FileNotFoundException"); | |||
| return; | |||
| } | |||
| } | |||
| } | |||
| private Bitmap decodeUri(Uri selectedImage) throws FileNotFoundException | |||
| { | |||
| // Decode image size | |||
| BitmapFactory.Options o = new BitmapFactory.Options(); | |||
| o.inJustDecodeBounds = true; | |||
| BitmapFactory.decodeStream(getContentResolver().openInputStream(selectedImage), null, o); | |||
| // The new size we want to scale to | |||
| final int REQUIRED_SIZE = 400; | |||
| // Find the correct scale value. It should be the power of 2. | |||
| int width_tmp = o.outWidth, height_tmp = o.outHeight; | |||
| int scale = 1; | |||
| while (true) { | |||
| if (width_tmp / 2 < REQUIRED_SIZE | |||
| || height_tmp / 2 < REQUIRED_SIZE) { | |||
| break; | |||
| } | |||
| width_tmp /= 2; | |||
| height_tmp /= 2; | |||
| scale *= 2; | |||
| } | |||
| // Decode with inSampleSize | |||
| BitmapFactory.Options o2 = new BitmapFactory.Options(); | |||
| o2.inSampleSize = scale; | |||
| return BitmapFactory.decodeStream(getContentResolver().openInputStream(selectedImage), null, o2); | |||
| } | |||
| } | |||
| @@ -0,0 +1,29 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| package com.tencent.squeezencnn; | |||
| import android.graphics.Bitmap; | |||
| import android.content.Context; | |||
| public class SqueezeNcnn | |||
| { | |||
| public native boolean Init(byte[] param, byte[] bin, byte[] words); | |||
| public native String Detect(Bitmap bitmap); | |||
| static { | |||
| System.loadLibrary("squeezencnn"); | |||
| } | |||
| } | |||
| @@ -0,0 +1,95 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include <stdio.h> | |||
| #include <algorithm> | |||
| #include <vector> | |||
| #include <opencv2/core/core.hpp> | |||
| #include <opencv2/highgui/highgui.hpp> | |||
| #include "net.h" | |||
| static int detect_squeezenet(const cv::Mat& bgr, std::vector<float>& cls_scores) | |||
| { | |||
| ncnn::Net squeezenet; | |||
| squeezenet.load_param("squeezenet_v1.1.param"); | |||
| squeezenet.load_model("squeezenet_v1.1.bin"); | |||
| ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, 227, 227); | |||
| const float mean_vals[3] = {104.f, 117.f, 123.f}; | |||
| in.substract_mean_normalize(mean_vals, 0); | |||
| ncnn::Extractor ex = squeezenet.create_extractor(); | |||
| ex.set_light_mode(true); | |||
| ex.input("data", in); | |||
| ncnn::Mat out; | |||
| ex.extract("prob", out); | |||
| cls_scores.resize(out.c); | |||
| for (int j=0; j<out.c; j++) | |||
| { | |||
| const float* prob = out.data + out.cstep * j; | |||
| cls_scores[j] = prob[0]; | |||
| } | |||
| return 0; | |||
| } | |||
// Print the topk highest scores to stderr as "<index> = <score>" lines,
// highest first.
//
//   cls_scores - one score per class
//   topk       - number of entries to print; clamped to [0, cls_scores.size()]
//                so a short or empty score list cannot push the sort range
//                or the print loop out of bounds
//
// Always returns 0.
static int print_topk(const std::vector<float>& cls_scores, int topk)
{
    const int size = static_cast<int>(cls_scores.size());

    if (topk > size)
        topk = size;
    if (topk < 0)
        topk = 0;

    // Pair every score with its class index so the index survives sorting.
    std::vector< std::pair<float, int> > vec;
    vec.resize(size);
    for (int i = 0; i < size; i++)
    {
        vec[i] = std::make_pair(cls_scores[i], i);
    }

    // Only the first topk entries need to be in order.
    std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
                      std::greater< std::pair<float, int> >());

    // print topk and score
    for (int i = 0; i < topk; i++)
    {
        float score = vec[i].first;
        int index = vec[i].second;
        fprintf(stderr, "%d = %f\n", index, score);
    }

    return 0;
}
| int main(int argc, char** argv) | |||
| { | |||
| const char* imagepath = argv[1]; | |||
| cv::Mat m = cv::imread(imagepath, CV_LOAD_IMAGE_COLOR); | |||
| if (m.empty()) | |||
| { | |||
| fprintf(stderr, "cv::imread %s failed\n", imagepath); | |||
| return -1; | |||
| } | |||
| std::vector<float> cls_scores; | |||
| detect_squeezenet(m, cls_scores); | |||
| print_topk(cls_scores, 3); | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,76 @@ | |||
| 75 83 | |||
| Input data 0 1 data 3 227 227 | |||
| Convolution conv1 1 1 data conv1 64 3 1 2 0 1 1728 | |||
| ReLU relu_conv1 1 1 conv1 conv1_relu_conv1 0.000000 | |||
| Pooling pool1 1 1 conv1_relu_conv1 pool1 0 3 2 0 0 | |||
| Convolution fire2/squeeze1x1 1 1 pool1 fire2/squeeze1x1 16 1 1 1 0 1 1024 | |||
| ReLU fire2/relu_squeeze1x1 1 1 fire2/squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1 0.000000 | |||
| Split splitncnn_0 1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 | |||
| Convolution fire2/expand1x1 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1 64 1 1 1 0 1 1024 | |||
| ReLU fire2/relu_expand1x1 1 1 fire2/expand1x1 fire2/expand1x1_fire2/relu_expand1x1 0.000000 | |||
| Convolution fire2/expand3x3 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3 64 3 1 1 1 1 9216 | |||
| ReLU fire2/relu_expand3x3 1 1 fire2/expand3x3 fire2/expand3x3_fire2/relu_expand3x3 0.000000 | |||
| Concat fire2/concat 2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat | |||
| Convolution fire3/squeeze1x1 1 1 fire2/concat fire3/squeeze1x1 16 1 1 1 0 1 2048 | |||
| ReLU fire3/relu_squeeze1x1 1 1 fire3/squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1 0.000000 | |||
| Split splitncnn_1 1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 | |||
| Convolution fire3/expand1x1 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1 64 1 1 1 0 1 1024 | |||
| ReLU fire3/relu_expand1x1 1 1 fire3/expand1x1 fire3/expand1x1_fire3/relu_expand1x1 0.000000 | |||
| Convolution fire3/expand3x3 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3 64 3 1 1 1 1 9216 | |||
| ReLU fire3/relu_expand3x3 1 1 fire3/expand3x3 fire3/expand3x3_fire3/relu_expand3x3 0.000000 | |||
| Concat fire3/concat 2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat | |||
| Pooling pool3 1 1 fire3/concat pool3 0 3 2 0 0 | |||
| Convolution fire4/squeeze1x1 1 1 pool3 fire4/squeeze1x1 32 1 1 1 0 1 4096 | |||
| ReLU fire4/relu_squeeze1x1 1 1 fire4/squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1 0.000000 | |||
| Split splitncnn_2 1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 | |||
| Convolution fire4/expand1x1 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1 128 1 1 1 0 1 4096 | |||
| ReLU fire4/relu_expand1x1 1 1 fire4/expand1x1 fire4/expand1x1_fire4/relu_expand1x1 0.000000 | |||
| Convolution fire4/expand3x3 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3 128 3 1 1 1 1 36864 | |||
| ReLU fire4/relu_expand3x3 1 1 fire4/expand3x3 fire4/expand3x3_fire4/relu_expand3x3 0.000000 | |||
| Concat fire4/concat 2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat | |||
| Convolution fire5/squeeze1x1 1 1 fire4/concat fire5/squeeze1x1 32 1 1 1 0 1 8192 | |||
| ReLU fire5/relu_squeeze1x1 1 1 fire5/squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1 0.000000 | |||
| Split splitncnn_3 1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 | |||
| Convolution fire5/expand1x1 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1 128 1 1 1 0 1 4096 | |||
| ReLU fire5/relu_expand1x1 1 1 fire5/expand1x1 fire5/expand1x1_fire5/relu_expand1x1 0.000000 | |||
| Convolution fire5/expand3x3 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3 128 3 1 1 1 1 36864 | |||
| ReLU fire5/relu_expand3x3 1 1 fire5/expand3x3 fire5/expand3x3_fire5/relu_expand3x3 0.000000 | |||
| Concat fire5/concat 2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat | |||
| Pooling pool5 1 1 fire5/concat pool5 0 3 2 0 0 | |||
| Convolution fire6/squeeze1x1 1 1 pool5 fire6/squeeze1x1 48 1 1 1 0 1 12288 | |||
| ReLU fire6/relu_squeeze1x1 1 1 fire6/squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1 0.000000 | |||
| Split splitncnn_4 1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 | |||
| Convolution fire6/expand1x1 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1 192 1 1 1 0 1 9216 | |||
| ReLU fire6/relu_expand1x1 1 1 fire6/expand1x1 fire6/expand1x1_fire6/relu_expand1x1 0.000000 | |||
| Convolution fire6/expand3x3 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3 192 3 1 1 1 1 82944 | |||
| ReLU fire6/relu_expand3x3 1 1 fire6/expand3x3 fire6/expand3x3_fire6/relu_expand3x3 0.000000 | |||
| Concat fire6/concat 2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat | |||
| Convolution fire7/squeeze1x1 1 1 fire6/concat fire7/squeeze1x1 48 1 1 1 0 1 18432 | |||
| ReLU fire7/relu_squeeze1x1 1 1 fire7/squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1 0.000000 | |||
| Split splitncnn_5 1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 | |||
| Convolution fire7/expand1x1 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1 192 1 1 1 0 1 9216 | |||
| ReLU fire7/relu_expand1x1 1 1 fire7/expand1x1 fire7/expand1x1_fire7/relu_expand1x1 0.000000 | |||
| Convolution fire7/expand3x3 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3 192 3 1 1 1 1 82944 | |||
| ReLU fire7/relu_expand3x3 1 1 fire7/expand3x3 fire7/expand3x3_fire7/relu_expand3x3 0.000000 | |||
| Concat fire7/concat 2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat | |||
| Convolution fire8/squeeze1x1 1 1 fire7/concat fire8/squeeze1x1 64 1 1 1 0 1 24576 | |||
| ReLU fire8/relu_squeeze1x1 1 1 fire8/squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1 0.000000 | |||
| Split splitncnn_6 1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 | |||
| Convolution fire8/expand1x1 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1 256 1 1 1 0 1 16384 | |||
| ReLU fire8/relu_expand1x1 1 1 fire8/expand1x1 fire8/expand1x1_fire8/relu_expand1x1 0.000000 | |||
| Convolution fire8/expand3x3 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3 256 3 1 1 1 1 147456 | |||
| ReLU fire8/relu_expand3x3 1 1 fire8/expand3x3 fire8/expand3x3_fire8/relu_expand3x3 0.000000 | |||
| Concat fire8/concat 2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat | |||
| Convolution fire9/squeeze1x1 1 1 fire8/concat fire9/squeeze1x1 64 1 1 1 0 1 32768 | |||
| ReLU fire9/relu_squeeze1x1 1 1 fire9/squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1 0.000000 | |||
| Split splitncnn_7 1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 | |||
| Convolution fire9/expand1x1 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1 256 1 1 1 0 1 16384 | |||
| ReLU fire9/relu_expand1x1 1 1 fire9/expand1x1 fire9/expand1x1_fire9/relu_expand1x1 0.000000 | |||
| Convolution fire9/expand3x3 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3 256 3 1 1 1 1 147456 | |||
| ReLU fire9/relu_expand3x3 1 1 fire9/expand3x3 fire9/expand3x3_fire9/relu_expand3x3 0.000000 | |||
| Concat fire9/concat 2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat | |||
| Dropout drop9 1 1 fire9/concat fire9/concat_drop9 | |||
| Convolution conv10 1 1 fire9/concat_drop9 conv10 1000 1 1 1 1 1 512000 | |||
| ReLU relu_conv10 1 1 conv10 conv10_relu_conv10 0.000000 | |||
| Pooling pool10 1 1 conv10_relu_conv10 pool10 1 0 1 0 1 | |||
| Softmax prob 1 1 pool10 prob | |||
| @@ -0,0 +1,548 @@ | |||
| name: "squeezenet_v1.1_deploy" | |||
| layer { | |||
| name: "data" | |||
| type: "Input" | |||
| top: "data" | |||
| input_param { shape: { dim: 1 dim: 3 dim: 227 dim: 227 } } | |||
| } | |||
| layer { | |||
| name: "conv1" | |||
| type: "Convolution" | |||
| bottom: "data" | |||
| top: "conv1" | |||
| convolution_param { | |||
| num_output: 64 | |||
| kernel_size: 3 | |||
| stride: 2 | |||
| } | |||
| } | |||
| layer { | |||
| name: "relu_conv1" | |||
| type: "ReLU" | |||
| bottom: "conv1" | |||
| top: "conv1" | |||
| } | |||
| layer { | |||
| name: "pool1" | |||
| type: "Pooling" | |||
| bottom: "conv1" | |||
| top: "pool1" | |||
| pooling_param { | |||
| pool: MAX | |||
| kernel_size: 3 | |||
| stride: 2 | |||
| } | |||
| } | |||
| layer { | |||
| name: "fire2/squeeze1x1" | |||
| type: "Convolution" | |||
| bottom: "pool1" | |||
| top: "fire2/squeeze1x1" | |||
| convolution_param { | |||
| num_output: 16 | |||
| kernel_size: 1 | |||
| } | |||
| } | |||
| layer { | |||
| name: "fire2/relu_squeeze1x1" | |||
| type: "ReLU" | |||
| bottom: "fire2/squeeze1x1" | |||
| top: "fire2/squeeze1x1" | |||
| } | |||
| layer { | |||
| name: "fire2/expand1x1" | |||
| type: "Convolution" | |||
| bottom: "fire2/squeeze1x1" | |||
| top: "fire2/expand1x1" | |||
| convolution_param { | |||
| num_output: 64 | |||
| kernel_size: 1 | |||
| } | |||
| } | |||
| layer { | |||
| name: "fire2/relu_expand1x1" | |||
| type: "ReLU" | |||
| bottom: "fire2/expand1x1" | |||
| top: "fire2/expand1x1" | |||
| } | |||
| layer { | |||
| name: "fire2/expand3x3" | |||
| type: "Convolution" | |||
| bottom: "fire2/squeeze1x1" | |||
| top: "fire2/expand3x3" | |||
| convolution_param { | |||
| num_output: 64 | |||
| pad: 1 | |||
| kernel_size: 3 | |||
| } | |||
| } | |||
| layer { | |||
| name: "fire2/relu_expand3x3" | |||
| type: "ReLU" | |||
| bottom: "fire2/expand3x3" | |||
| top: "fire2/expand3x3" | |||
| } | |||
| layer { | |||
| name: "fire2/concat" | |||
| type: "Concat" | |||
| bottom: "fire2/expand1x1" | |||
| bottom: "fire2/expand3x3" | |||
| top: "fire2/concat" | |||
| } | |||
| layer { | |||
| name: "fire3/squeeze1x1" | |||
| type: "Convolution" | |||
| bottom: "fire2/concat" | |||
| top: "fire3/squeeze1x1" | |||
| convolution_param { | |||
| num_output: 16 | |||
| kernel_size: 1 | |||
| } | |||
| } | |||
| layer { | |||
| name: "fire3/relu_squeeze1x1" | |||
| type: "ReLU" | |||
| bottom: "fire3/squeeze1x1" | |||
| top: "fire3/squeeze1x1" | |||
| } | |||
| layer { | |||
| name: "fire3/expand1x1" | |||
| type: "Convolution" | |||
| bottom: "fire3/squeeze1x1" | |||
| top: "fire3/expand1x1" | |||
| convolution_param { | |||
| num_output: 64 | |||
| kernel_size: 1 | |||
| } | |||
| } | |||
| layer { | |||
| name: "fire3/relu_expand1x1" | |||
| type: "ReLU" | |||
| bottom: "fire3/expand1x1" | |||
| top: "fire3/expand1x1" | |||
| } | |||
| layer { | |||
| name: "fire3/expand3x3" | |||
| type: "Convolution" | |||
| bottom: "fire3/squeeze1x1" | |||
| top: "fire3/expand3x3" | |||
| convolution_param { | |||
| num_output: 64 | |||
| pad: 1 | |||
| kernel_size: 3 | |||
| } | |||
| } | |||
| layer { | |||
| name: "fire3/relu_expand3x3" | |||
| type: "ReLU" | |||
| bottom: "fire3/expand3x3" | |||
| top: "fire3/expand3x3" | |||
| } | |||
| layer { | |||
| name: "fire3/concat" | |||
| type: "Concat" | |||
| bottom: "fire3/expand1x1" | |||
| bottom: "fire3/expand3x3" | |||
| top: "fire3/concat" | |||
| } | |||
| layer { | |||
| name: "pool3" | |||
| type: "Pooling" | |||
| bottom: "fire3/concat" | |||
| top: "pool3" | |||
| pooling_param { | |||
| pool: MAX | |||
| kernel_size: 3 | |||
| stride: 2 | |||
| } | |||
| } | |||
| layer { | |||
| name: "fire4/squeeze1x1" | |||
| type: "Convolution" | |||
| bottom: "pool3" | |||
| top: "fire4/squeeze1x1" | |||
| convolution_param { | |||
| num_output: 32 | |||
| kernel_size: 1 | |||
| } | |||
| } | |||
| layer { | |||
| name: "fire4/relu_squeeze1x1" | |||
| type: "ReLU" | |||
| bottom: "fire4/squeeze1x1" | |||
| top: "fire4/squeeze1x1" | |||
| } | |||
| layer { | |||
| name: "fire4/expand1x1" | |||
| type: "Convolution" | |||
| bottom: "fire4/squeeze1x1" | |||
| top: "fire4/expand1x1" | |||
| convolution_param { | |||
| num_output: 128 | |||
| kernel_size: 1 | |||
| } | |||
| } | |||
| layer { | |||
| name: "fire4/relu_expand1x1" | |||
| type: "ReLU" | |||
| bottom: "fire4/expand1x1" | |||
| top: "fire4/expand1x1" | |||
| } | |||
| layer { | |||
| name: "fire4/expand3x3" | |||
| type: "Convolution" | |||
| bottom: "fire4/squeeze1x1" | |||
| top: "fire4/expand3x3" | |||
| convolution_param { | |||
| num_output: 128 | |||
| pad: 1 | |||
| kernel_size: 3 | |||
| } | |||
| } | |||
| layer { | |||
| name: "fire4/relu_expand3x3" | |||
| type: "ReLU" | |||
| bottom: "fire4/expand3x3" | |||
| top: "fire4/expand3x3" | |||
| } | |||
| layer { | |||
| name: "fire4/concat" | |||
| type: "Concat" | |||
| bottom: "fire4/expand1x1" | |||
| bottom: "fire4/expand3x3" | |||
| top: "fire4/concat" | |||
| } | |||
| layer { | |||
| name: "fire5/squeeze1x1" | |||
| type: "Convolution" | |||
| bottom: "fire4/concat" | |||
| top: "fire5/squeeze1x1" | |||
| convolution_param { | |||
| num_output: 32 | |||
| kernel_size: 1 | |||
| } | |||
| } | |||
| layer { | |||
| name: "fire5/relu_squeeze1x1" | |||
| type: "ReLU" | |||
| bottom: "fire5/squeeze1x1" | |||
| top: "fire5/squeeze1x1" | |||
| } | |||
| layer { | |||
| name: "fire5/expand1x1" | |||
| type: "Convolution" | |||
| bottom: "fire5/squeeze1x1" | |||
| top: "fire5/expand1x1" | |||
| convolution_param { | |||
| num_output: 128 | |||
| kernel_size: 1 | |||
| } | |||
| } | |||
| layer { | |||
| name: "fire5/relu_expand1x1" | |||
| type: "ReLU" | |||
| bottom: "fire5/expand1x1" | |||
| top: "fire5/expand1x1" | |||
| } | |||
| layer { | |||
| name: "fire5/expand3x3" | |||
| type: "Convolution" | |||
| bottom: "fire5/squeeze1x1" | |||
| top: "fire5/expand3x3" | |||
| convolution_param { | |||
| num_output: 128 | |||
| pad: 1 | |||
| kernel_size: 3 | |||
| } | |||
| } | |||
| layer { | |||
| name: "fire5/relu_expand3x3" | |||
| type: "ReLU" | |||
| bottom: "fire5/expand3x3" | |||
| top: "fire5/expand3x3" | |||
| } | |||
| layer { | |||
| name: "fire5/concat" | |||
| type: "Concat" | |||
| bottom: "fire5/expand1x1" | |||
| bottom: "fire5/expand3x3" | |||
| top: "fire5/concat" | |||
| } | |||
| layer { | |||
| name: "pool5" | |||
| type: "Pooling" | |||
| bottom: "fire5/concat" | |||
| top: "pool5" | |||
| pooling_param { | |||
| pool: MAX | |||
| kernel_size: 3 | |||
| stride: 2 | |||
| } | |||
| } | |||
| layer { | |||
| name: "fire6/squeeze1x1" | |||
| type: "Convolution" | |||
| bottom: "pool5" | |||
| top: "fire6/squeeze1x1" | |||
| convolution_param { | |||
| num_output: 48 | |||
| kernel_size: 1 | |||
| } | |||
| } | |||
| layer { | |||
| name: "fire6/relu_squeeze1x1" | |||
| type: "ReLU" | |||
| bottom: "fire6/squeeze1x1" | |||
| top: "fire6/squeeze1x1" | |||
| } | |||
| layer { | |||
| name: "fire6/expand1x1" | |||
| type: "Convolution" | |||
| bottom: "fire6/squeeze1x1" | |||
| top: "fire6/expand1x1" | |||
| convolution_param { | |||
| num_output: 192 | |||
| kernel_size: 1 | |||
| } | |||
| } | |||
| layer { | |||
| name: "fire6/relu_expand1x1" | |||
| type: "ReLU" | |||
| bottom: "fire6/expand1x1" | |||
| top: "fire6/expand1x1" | |||
| } | |||
| layer { | |||
| name: "fire6/expand3x3" | |||
| type: "Convolution" | |||
| bottom: "fire6/squeeze1x1" | |||
| top: "fire6/expand3x3" | |||
| convolution_param { | |||
| num_output: 192 | |||
| pad: 1 | |||
| kernel_size: 3 | |||
| } | |||
| } | |||
| layer { | |||
| name: "fire6/relu_expand3x3" | |||
| type: "ReLU" | |||
| bottom: "fire6/expand3x3" | |||
| top: "fire6/expand3x3" | |||
| } | |||
| layer { | |||
| name: "fire6/concat" | |||
| type: "Concat" | |||
| bottom: "fire6/expand1x1" | |||
| bottom: "fire6/expand3x3" | |||
| top: "fire6/concat" | |||
| } | |||
| layer { | |||
| name: "fire7/squeeze1x1" | |||
| type: "Convolution" | |||
| bottom: "fire6/concat" | |||
| top: "fire7/squeeze1x1" | |||
| convolution_param { | |||
| num_output: 48 | |||
| kernel_size: 1 | |||
| } | |||
| } | |||
| layer { | |||
| name: "fire7/relu_squeeze1x1" | |||
| type: "ReLU" | |||
| bottom: "fire7/squeeze1x1" | |||
| top: "fire7/squeeze1x1" | |||
| } | |||
| layer { | |||
| name: "fire7/expand1x1" | |||
| type: "Convolution" | |||
| bottom: "fire7/squeeze1x1" | |||
| top: "fire7/expand1x1" | |||
| convolution_param { | |||
| num_output: 192 | |||
| kernel_size: 1 | |||
| } | |||
| } | |||
| layer { | |||
| name: "fire7/relu_expand1x1" | |||
| type: "ReLU" | |||
| bottom: "fire7/expand1x1" | |||
| top: "fire7/expand1x1" | |||
| } | |||
| layer { | |||
| name: "fire7/expand3x3" | |||
| type: "Convolution" | |||
| bottom: "fire7/squeeze1x1" | |||
| top: "fire7/expand3x3" | |||
| convolution_param { | |||
| num_output: 192 | |||
| pad: 1 | |||
| kernel_size: 3 | |||
| } | |||
| } | |||
| layer { | |||
| name: "fire7/relu_expand3x3" | |||
| type: "ReLU" | |||
| bottom: "fire7/expand3x3" | |||
| top: "fire7/expand3x3" | |||
| } | |||
| layer { | |||
| name: "fire7/concat" | |||
| type: "Concat" | |||
| bottom: "fire7/expand1x1" | |||
| bottom: "fire7/expand3x3" | |||
| top: "fire7/concat" | |||
| } | |||
| layer { | |||
| name: "fire8/squeeze1x1" | |||
| type: "Convolution" | |||
| bottom: "fire7/concat" | |||
| top: "fire8/squeeze1x1" | |||
| convolution_param { | |||
| num_output: 64 | |||
| kernel_size: 1 | |||
| } | |||
| } | |||
| layer { | |||
| name: "fire8/relu_squeeze1x1" | |||
| type: "ReLU" | |||
| bottom: "fire8/squeeze1x1" | |||
| top: "fire8/squeeze1x1" | |||
| } | |||
| layer { | |||
| name: "fire8/expand1x1" | |||
| type: "Convolution" | |||
| bottom: "fire8/squeeze1x1" | |||
| top: "fire8/expand1x1" | |||
| convolution_param { | |||
| num_output: 256 | |||
| kernel_size: 1 | |||
| } | |||
| } | |||
| layer { | |||
| name: "fire8/relu_expand1x1" | |||
| type: "ReLU" | |||
| bottom: "fire8/expand1x1" | |||
| top: "fire8/expand1x1" | |||
| } | |||
| layer { | |||
| name: "fire8/expand3x3" | |||
| type: "Convolution" | |||
| bottom: "fire8/squeeze1x1" | |||
| top: "fire8/expand3x3" | |||
| convolution_param { | |||
| num_output: 256 | |||
| pad: 1 | |||
| kernel_size: 3 | |||
| } | |||
| } | |||
| layer { | |||
| name: "fire8/relu_expand3x3" | |||
| type: "ReLU" | |||
| bottom: "fire8/expand3x3" | |||
| top: "fire8/expand3x3" | |||
| } | |||
| layer { | |||
| name: "fire8/concat" | |||
| type: "Concat" | |||
| bottom: "fire8/expand1x1" | |||
| bottom: "fire8/expand3x3" | |||
| top: "fire8/concat" | |||
| } | |||
| layer { | |||
| name: "fire9/squeeze1x1" | |||
| type: "Convolution" | |||
| bottom: "fire8/concat" | |||
| top: "fire9/squeeze1x1" | |||
| convolution_param { | |||
| num_output: 64 | |||
| kernel_size: 1 | |||
| } | |||
| } | |||
| layer { | |||
| name: "fire9/relu_squeeze1x1" | |||
| type: "ReLU" | |||
| bottom: "fire9/squeeze1x1" | |||
| top: "fire9/squeeze1x1" | |||
| } | |||
| layer { | |||
| name: "fire9/expand1x1" | |||
| type: "Convolution" | |||
| bottom: "fire9/squeeze1x1" | |||
| top: "fire9/expand1x1" | |||
| convolution_param { | |||
| num_output: 256 | |||
| kernel_size: 1 | |||
| } | |||
| } | |||
| layer { | |||
| name: "fire9/relu_expand1x1" | |||
| type: "ReLU" | |||
| bottom: "fire9/expand1x1" | |||
| top: "fire9/expand1x1" | |||
| } | |||
| layer { | |||
| name: "fire9/expand3x3" | |||
| type: "Convolution" | |||
| bottom: "fire9/squeeze1x1" | |||
| top: "fire9/expand3x3" | |||
| convolution_param { | |||
| num_output: 256 | |||
| pad: 1 | |||
| kernel_size: 3 | |||
| } | |||
| } | |||
| layer { | |||
| name: "fire9/relu_expand3x3" | |||
| type: "ReLU" | |||
| bottom: "fire9/expand3x3" | |||
| top: "fire9/expand3x3" | |||
| } | |||
| layer { | |||
| name: "fire9/concat" | |||
| type: "Concat" | |||
| bottom: "fire9/expand1x1" | |||
| bottom: "fire9/expand3x3" | |||
| top: "fire9/concat" | |||
| } | |||
| layer { | |||
| name: "drop9" | |||
| type: "Dropout" | |||
| bottom: "fire9/concat" | |||
| top: "fire9/concat" | |||
| dropout_param { | |||
| dropout_ratio: 0.5 | |||
| } | |||
| } | |||
| layer { | |||
| name: "conv10" | |||
| type: "Convolution" | |||
| bottom: "fire9/concat" | |||
| top: "conv10" | |||
| convolution_param { | |||
| num_output: 1000 | |||
| pad: 1 | |||
| kernel_size: 1 | |||
| } | |||
| } | |||
| layer { | |||
| name: "relu_conv10" | |||
| type: "ReLU" | |||
| bottom: "conv10" | |||
| top: "conv10" | |||
| } | |||
| layer { | |||
| name: "pool10" | |||
| type: "Pooling" | |||
| bottom: "conv10" | |||
| top: "pool10" | |||
| pooling_param { | |||
| pool: AVE | |||
| global_pooling: true | |||
| } | |||
| } | |||
| layer { | |||
| name: "prob" | |||
| type: "Softmax" | |||
| bottom: "pool10" | |||
| top: "prob" | |||
| } | |||
| @@ -0,0 +1,193 @@ | |||
| # This file is based off of the Platform/Darwin.cmake and Platform/UnixPaths.cmake | |||
| # files which are included with CMake 2.8.4 | |||
| # It has been altered for iOS development | |||
| # Options: | |||
| # | |||
| # IOS_PLATFORM = iPhoneOS (default) or iPhoneSimulator | |||
| # This decides if SDKS will be selected from the iPhoneOS.platform or iPhoneSimulator.platform folders | |||
| # iPhoneOS - the default, used to build for iPhone and iPad physical devices, which have an arm arch. | |||
| # iPhoneSimulator - used to build for the Simulator platforms, which have an x86 arch. | |||
| # | |||
| # CMAKE_IOS_DEVELOPER_ROOT = automatic(default) or /path/to/platform/Developer folder | |||
| # By default this location is automatically chosen based on the IOS_PLATFORM value above. | |||
| # If set manually, it will override the default location and force the use of a particular Developer Platform | |||
| # | |||
| # CMAKE_IOS_SDK_ROOT = automatic(default) or /path/to/platform/Developer/SDKs/SDK folder | |||
| # By default this location is automatically chosen based on the CMAKE_IOS_DEVELOPER_ROOT value. | |||
| # In this case it will always be the most up-to-date SDK found in the CMAKE_IOS_DEVELOPER_ROOT path. | |||
| # If set manually, this will force the use of a specific SDK version | |||
| # Macros: | |||
| # | |||
| # set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE) | |||
| # A convenience macro for setting xcode specific properties on targets | |||
| # example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1") | |||
| # | |||
| # find_host_package (PROGRAM ARGS) | |||
| # A macro used to find executable programs on the host system, not within the iOS environment. | |||
| # Thanks to the android-cmake project for providing the command | |||
# Baseline platform identification: the target is reported as a Darwin system
# and the UNIX / APPLE / IOS convenience flags are switched on.
set(CMAKE_SYSTEM_NAME Darwin)
set(CMAKE_SYSTEM_VERSION 1)
set(UNIX True)
set(APPLE True)
set(IOS True)

# Required as of cmake 2.8.10: the deployment target is forced to an empty
# value in the cache.
set(CMAKE_OSX_DEPLOYMENT_TARGET
    ""
    CACHE STRING "Force unset of the deployment target for iOS" FORCE)
# Determine the cmake host system version so we know where to find the iOS SDKs.
# execute_process() is used instead of the deprecated exec_program() (which
# policy CMP0153 disallows), and it runs the uname binary that find_program()
# located rather than whichever "uname" happens to be first on PATH.
# DARWIN_MAJOR_VERSION receives the leading number of the "major.minor..." string.
find_program(CMAKE_UNAME uname /bin /usr/bin /usr/local/bin)
if(CMAKE_UNAME)
  execute_process(
    COMMAND "${CMAKE_UNAME}" -r
    OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_VERSION
    OUTPUT_STRIP_TRAILING_WHITESPACE)
  string(REGEX REPLACE "^([0-9]+)\\.([0-9]+).*$" "\\1" DARWIN_MAJOR_VERSION "${CMAKE_HOST_SYSTEM_VERSION}")
endif()
# Pin the C and C++ compilers to the system clang / clang++ for iOS builds,
# and force the archiver to plain "ar".
include(CMakeForceCompiler)
cmake_force_c_compiler(/usr/bin/clang Apple)
cmake_force_cxx_compiler(/usr/bin/clang++ Apple)
set(CMAKE_AR ar CACHE FILEPATH "" FORCE)

# Cross compiling: declare both compilers as working so the platform compiler
# checks are skipped.
set(CMAKE_C_COMPILER_WORKS TRUE)
set(CMAKE_CXX_COMPILER_WORKS TRUE)
# iOS/Darwin specific settings (some of them may be redundant).

# Naming of shared libraries and loadable modules.
set(CMAKE_SHARED_LIBRARY_PREFIX "lib")
set(CMAKE_SHARED_LIBRARY_SUFFIX ".dylib")
set(CMAKE_SHARED_MODULE_PREFIX "lib")
set(CMAKE_SHARED_MODULE_SUFFIX ".so")
set(CMAKE_MODULE_EXISTS 1)
set(CMAKE_DL_LIBS "")

# Version flags handed to the linker; the C++ flags mirror the C ones.
set(CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ")
set(CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ")
set(CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}")
set(CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}")

# Initial compiler flags. Hidden visibility is required for C++ on iOS.
# NOTE(review): CMAKE_OSX_SYSROOT is only assigned further down in this file,
# so these values appear to rely on it already being in the cache - confirm.
set(CMAKE_C_FLAGS_INIT "-isysroot ${CMAKE_OSX_SYSROOT} -miphoneos-version-min=6.0")
set(CMAKE_CXX_FLAGS_INIT "-stdlib=libc++ -fvisibility=hidden -fvisibility-inlines-hidden -isysroot ${CMAKE_OSX_SYSROOT} -miphoneos-version-min=6.0")

# Linker flags: "-Wl,-search_paths_first" is prepended to any existing value.
set(CMAKE_C_LINK_FLAGS "-Wl,-search_paths_first ${CMAKE_C_LINK_FLAGS}")
set(CMAKE_CXX_LINK_FLAGS "-Wl,-search_paths_first ${CMAKE_CXX_LINK_FLAGS}")

# How shared libraries and bundles are created and loaded.
set(CMAKE_PLATFORM_HAS_INSTALLNAME 1)
set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -headerpad_max_install_names")
set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -headerpad_max_install_names")
set(CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,")
set(CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,")

# Library suffixes tried by find_library(), in this order.
set(CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" ".so" ".a")
# Compatibility hack (Alex): a newer cmake that uses CMAKE_INSTALL_NAME_TOOL may
# run on an old build tree where install_name_tool was hardcoded and where
# CMAKE_INSTALL_NAME_TOOL is not in the cache, without failing in
# CMakeFindBinUtils.cmake (because that file is not rerun). Look the tool up
# here whenever the variable is still undefined so it behaves as it did before.
if(NOT DEFINED CMAKE_INSTALL_NAME_TOOL)
  find_program(CMAKE_INSTALL_NAME_TOOL install_name_tool)
endif()
# Setup iOS platform unless specified manually with IOS_PLATFORM.
# The value is quoted everywhere it is expanded so that an empty IOS_PLATFORM
# reaches the FATAL_ERROR branch below instead of producing a malformed
# set()/if() argument list.
if(NOT DEFINED IOS_PLATFORM)
  set(IOS_PLATFORM "iPhoneOS")
endif()
set(IOS_PLATFORM "${IOS_PLATFORM}" CACHE STRING "Type of iOS Platform")

# Check the platform selection and setup for developer root:
# IOS_PLATFORM_LOCATION names the Xcode platform directory to use.
if("${IOS_PLATFORM}" STREQUAL "iPhoneOS")
  set(IOS_PLATFORM_LOCATION "iPhoneOS.platform")

  # This causes the installers to properly locate the output libraries
  set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphoneos")
elseif("${IOS_PLATFORM}" STREQUAL "iPhoneSimulator")
  set(IOS_PLATFORM_LOCATION "iPhoneSimulator.platform")

  # This causes the installers to properly locate the output libraries
  set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphonesimulator")
else()
  message(FATAL_ERROR "Unsupported IOS_PLATFORM value selected. Please choose iPhoneOS or iPhoneSimulator")
endif()
# Locate the platform's Developer folder unless CMAKE_IOS_DEVELOPER_ROOT was
# given manually. Xcode 4.3 changed the installation location, so the newer
# layout is probed first and the older one second.
set(XCODE_POST_43_ROOT "/Applications/Xcode.app/Contents/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
set(XCODE_PRE_43_ROOT "/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer")

if(NOT DEFINED CMAKE_IOS_DEVELOPER_ROOT)
  if(EXISTS ${XCODE_POST_43_ROOT})
    set(CMAKE_IOS_DEVELOPER_ROOT ${XCODE_POST_43_ROOT})
  elseif(EXISTS ${XCODE_PRE_43_ROOT})
    set(CMAKE_IOS_DEVELOPER_ROOT ${XCODE_PRE_43_ROOT})
  endif()
endif()

# Publish the result as a cache entry.
set(CMAKE_IOS_DEVELOPER_ROOT
    ${CMAKE_IOS_DEVELOPER_ROOT}
    CACHE PATH "Location of iOS Platform")
# Find and use the most recent iOS sdk unless specified manually with CMAKE_IOS_SDK_ROOT.
#
# The SDK directories are compared by the version number embedded in their
# names (e.g. "iPhoneOS10.0.sdk" -> 10.0) using VERSION_GREATER. A plain
# lexicographic sort would rank "iPhoneOS9.3.sdk" above "iPhoneOS10.0.sdk"
# and select the older SDK.
if(NOT DEFINED CMAKE_IOS_SDK_ROOT)
  file(GLOB _CMAKE_IOS_SDKS "${CMAKE_IOS_DEVELOPER_ROOT}/SDKs/*")
  if(NOT _CMAKE_IOS_SDKS)
    message(FATAL_ERROR "No iOS SDK's found in default search path ${CMAKE_IOS_DEVELOPER_ROOT}. Manually set CMAKE_IOS_SDK_ROOT or install the iOS SDK.")
  endif()

  # Reverse-sorted order is kept as the tie breaker: among entries with equal
  # versions the one that sorts last by name wins, as it did before.
  list(SORT _CMAKE_IOS_SDKS)
  list(REVERSE _CMAKE_IOS_SDKS)

  set(_ios_best_sdk "")
  set(_ios_best_version "0")
  foreach(_ios_sdk ${_CMAKE_IOS_SDKS})
    get_filename_component(_ios_sdk_name "${_ios_sdk}" NAME)
    string(REGEX MATCH "[0-9]+(\\.[0-9]+)*" _ios_sdk_version "${_ios_sdk_name}")
    if(NOT _ios_sdk_version)
      # entries without a number in their name rank lowest
      set(_ios_sdk_version "0")
    endif()
    if(NOT _ios_best_sdk OR "${_ios_sdk_version}" VERSION_GREATER "${_ios_best_version}")
      set(_ios_best_sdk "${_ios_sdk}")
      set(_ios_best_version "${_ios_sdk_version}")
    endif()
  endforeach()
  set(CMAKE_IOS_SDK_ROOT "${_ios_best_sdk}")

  message(STATUS "Toolchain using default iOS SDK: ${CMAKE_IOS_SDK_ROOT}")
endif()
set(CMAKE_IOS_SDK_ROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Location of the selected iOS SDK")

# Set the sysroot default to the selected SDK
set(CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support")
# Set the architecture for iOS: armv7 when building for iPhoneOS, i386 for any
# other platform value. The architecture is set manually here instead of
# relying on ARCHS_STANDARD_32_BIT / ARCHS_UNIVERSAL_IPHONE_OS.
# IOS_PLATFORM is quoted so an empty value cannot break the if() expression,
# and the cache type is spelled STRING (the lowercase form is not a valid type
# name and is only accepted through an implicit conversion).
if("${IOS_PLATFORM}" STREQUAL "iPhoneOS")
  set(IOS_ARCH armv7)
else()
  set(IOS_ARCH i386)
endif()

set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS")
# Set the find root to the iOS developer roots and to user defined paths.
# The cache type is spelled STRING; the lowercase "string" is not a valid cache
# type name and is only accepted through an implicit conversion.
set(CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE STRING "iOS find search path root")

# default to searching for frameworks first
set(CMAKE_FIND_FRAMEWORK FIRST)

# set up the default search directories for frameworks
set(CMAKE_SYSTEM_FRAMEWORK_PATH
  ${CMAKE_IOS_SDK_ROOT}/System/Library/Frameworks
  ${CMAKE_IOS_SDK_ROOT}/System/Library/PrivateFrameworks
  ${CMAKE_IOS_SDK_ROOT}/Developer/Library/Frameworks
)

# only search the iOS sdks, not the remainder of the host filesystem
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
# set_xcode_property(<target> <property> <value>)
# Convenience wrapper that sets the XCODE_ATTRIBUTE_<property> target property
# of <target> to <value>.
macro(set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE)
  set_property(
    TARGET ${TARGET}
    PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY} ${XCODE_VALUE})
endmacro()
# find_host_package(<find_package arguments...>)
# Runs find_package() against the host system: the find-root restrictions are
# lifted and IOS is cleared for the duration of the call, then both are put
# back to the values this toolchain uses (ONLY / TRUE).
macro(find_host_package)
  # lift the restrictions
  set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
  set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER)
  set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER)
  set(IOS FALSE)

  find_package(${ARGN})

  # back to cross-compiling mode
  set(IOS TRUE)
  set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)
  set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
  set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
endmacro()
| @@ -0,0 +1,40 @@ | |||
# Toolchain settings for building the iOS Simulator variant with the
# i386-apple-darwin11-* cross tools.

# Standard settings
set(CMAKE_SYSTEM_NAME Darwin)
set(CMAKE_SYSTEM_VERSION 1)
set(UNIX True)
set(APPLE True)
set(IOS True)

# Cross compilers and the prefix used to locate the matching binutils.
# (CMAKE_SYSTEM_NAME "Generic" was considered to suppress -rdynamic but is not used.)
set(CMAKE_C_COMPILER i386-apple-darwin11-clang)
set(CMAKE_CXX_COMPILER i386-apple-darwin11-clang++)
set(_CMAKE_TOOLCHAIN_PREFIX i386-apple-darwin11-)

# SDK location. A value supplied by the user (for example with
# -DCMAKE_IOS_SDK_ROOT=/path/to/SDK) is respected; the path below is only the
# fallback default and is specific to the original author's machine.
if(NOT DEFINED CMAKE_IOS_SDK_ROOT)
  set(CMAKE_IOS_SDK_ROOT "/home/nihui/osd/cctools-port/usage_examples/ios_toolchain/target-sim/SDK/")
endif()

# Set the sysroot default to the selected SDK
set(CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS Simulator support")

# set the architecture for iOS: both simulator architectures are built
# (a single one, i386 or x86_64, can be listed instead)
set(IOS_ARCH i386 x86_64)

set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS Simulator")

# Set the find root to the iOS developer roots and to user defined paths
set(CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE STRING "iOS Simulator find search path root")

# search for frameworks first
set(CMAKE_FIND_FRAMEWORK FIRST)

# set up the default search directories for frameworks
set(CMAKE_SYSTEM_FRAMEWORK_PATH
  ${CMAKE_IOS_SDK_ROOT}/System/Library/Frameworks
)
| @@ -0,0 +1,39 @@ | |||
# Toolchain settings for building the iOS device variant with the
# arm-apple-darwin11-* cross tools.

# Standard settings
set(CMAKE_SYSTEM_NAME Darwin)
set(CMAKE_SYSTEM_VERSION 1)
set(UNIX True)
set(APPLE True)
set(IOS True)

# Cross compilers and the prefix used to locate the matching binutils.
# (CMAKE_SYSTEM_NAME "Generic" was considered to suppress -rdynamic but is not used.)
set(CMAKE_C_COMPILER arm-apple-darwin11-clang)
set(CMAKE_CXX_COMPILER arm-apple-darwin11-clang++)
set(_CMAKE_TOOLCHAIN_PREFIX arm-apple-darwin11-)

# SDK location. A value supplied by the user (for example with
# -DCMAKE_IOS_SDK_ROOT=/path/to/SDK) is respected; the path below is only the
# fallback default and is specific to the original author's machine.
if(NOT DEFINED CMAKE_IOS_SDK_ROOT)
  set(CMAKE_IOS_SDK_ROOT "/home/nihui/osd/cctools-port/usage_examples/ios_toolchain/target/SDK/")
endif()

# Set the sysroot default to the selected SDK
set(CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support")

# set the architecture for iOS: both device architectures are built
# (arm64 alone can be listed instead)
set(IOS_ARCH armv7 arm64)

set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS")

# Set the find root to the iOS developer roots and to user defined paths
set(CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE STRING "iOS find search path root")

# search for frameworks first
set(CMAKE_FIND_FRAMEWORK FIRST)

# set up the default search directories for frameworks
set(CMAKE_SYSTEM_FRAMEWORK_PATH
  ${CMAKE_IOS_SDK_ROOT}/System/Library/Frameworks
)
| @@ -0,0 +1,35 @@ | |||
#!/usr/bin/env bash
# Packages the prebuilt artifacts found in the build-android-* and build-ios*
# install trees into <name>-android-lib.zip and <name>.framework.zip.
#
# The script stops at the first failing command (missing build output, failed
# copy, ...) so a broken or partial package is never zipped.
set -euo pipefail

NAME=ncnn

##### package android lib
ANDROIDPKGNAME="${NAME}-android-lib"
rm -rf "${ANDROIDPKGNAME}"
mkdir -p "${ANDROIDPKGNAME}"
mkdir -p "${ANDROIDPKGNAME}/armeabi-v7a"
mkdir -p "${ANDROIDPKGNAME}/arm64-v8a"
mkdir -p "${ANDROIDPKGNAME}/include"
cp "build-android-armv7/install/lib/lib${NAME}.a" "${ANDROIDPKGNAME}/armeabi-v7a/"
cp "build-android-aarch64/install/lib/lib${NAME}.a" "${ANDROIDPKGNAME}/arm64-v8a/"
# the headers are taken from the aarch64 install tree
cp build-android-aarch64/install/include/* "${ANDROIDPKGNAME}/include/"
rm -f "${ANDROIDPKGNAME}.zip"
zip -9 -r "${ANDROIDPKGNAME}.zip" "${ANDROIDPKGNAME}"

##### package ios framework
IOSPKGNAME="${NAME}.framework"
rm -rf "${IOSPKGNAME}"
mkdir -p "${IOSPKGNAME}/Versions/A/Headers"
mkdir -p "${IOSPKGNAME}/Versions/A/Resources"
# framework layout: top-level entries are symlinks into Versions/Current
ln -s A "${IOSPKGNAME}/Versions/Current"
ln -s Versions/Current/Headers "${IOSPKGNAME}/Headers"
ln -s Versions/Current/Resources "${IOSPKGNAME}/Resources"
ln -s "Versions/Current/${NAME}" "${IOSPKGNAME}/${NAME}"
# merge the device and simulator static libraries into one binary
lipo -create \
    "build-ios/install/lib/lib${NAME}.a" \
    "build-ios-sim/install/lib/lib${NAME}.a" \
    -o "${IOSPKGNAME}/Versions/A/${NAME}"
cp -r build-ios/install/include/* "${IOSPKGNAME}/Versions/A/Headers/"
cp Info.plist "${IOSPKGNAME}/Versions/A/Resources/"
rm -f "${IOSPKGNAME}.zip"
# -y stores the symlinks as links instead of following them
zip -9 -y -r "${IOSPKGNAME}.zip" "${IOSPKGNAME}"
| @@ -0,0 +1,135 @@ | |||
##############################################

# Generate platform.h from its template into the build directory.
configure_file(platform.h.in ${CMAKE_CURRENT_BINARY_DIR}/platform.h)

# Header search path: this source directory, the build directory (generated
# headers) and the layer sources.
include_directories(
    ${CMAKE_CURRENT_SOURCE_DIR}
    ${CMAKE_CURRENT_BINARY_DIR}
    ${CMAKE_CURRENT_SOURCE_DIR}/layer
)

# Sources that are always built; ncnn_add_layer() appends the layer sources.
set(ncnn_SRCS blob.cpp cpu.cpp layer.cpp mat.cpp mat_pixel.cpp net.cpp opencv.cpp)
# ncnn_add_layer(<Class> [ON|OFF])
#
# Registers one layer type:
#  - declares the cache option WITH_LAYER_<class-lowercase> (default ON, or the
#    value given as second argument);
#  - when enabled, appends layer/<name>.cpp plus, if the file exists, the
#    arm (layer/arm/<name>_arm.cpp) or x86 (layer/x86/<name>_x86.cpp)
#    implementation to ncnn_SRCS;
#  - appends the creator declaration to layer_declaration.h and one entry to
#    layer_registry.h in the build directory (a null entry when disabled), so
#    the call order defines the registry order.
#
# Example: ncnn_add_layer(ArgMax OFF)
macro(ncnn_add_layer class)
    string(TOLOWER ${class} name)

    # WITH_LAYER_xxx option
    if(${ARGC} EQUAL 2)
        option(WITH_LAYER_${name} "build with layer ${name}" ${ARGV1})
    else()
        option(WITH_LAYER_${name} "build with layer ${name}" ON)
    endif()

    message(STATUS "WITH_LAYER_${name} = ${WITH_LAYER_${name}}")

    # Decide whether the optimized armv7 / aarch64 sources apply.
    # On iOS every architecture list that contains ARM architectures only is
    # accepted (e.g. "arm64;armv7" or "armv7s"), not just three exact spellings;
    # lists that also name i386/x86_64 keep using the non-ARM branch.
    set(_ncnn_layer_use_arm FALSE)
    if(ANDROID AND (("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "armv7-a")
        OR ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64")))
        set(_ncnn_layer_use_arm TRUE)
    elseif(IOS AND ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm")
        AND NOT ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "i386|x86_64"))
        set(_ncnn_layer_use_arm TRUE)
    endif()

    if(WITH_LAYER_${name})
        list(APPEND ncnn_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/layer/${name}.cpp")

        # look for arch specific implementation and append source
        if(_ncnn_layer_use_arm)
            if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/layer/arm/${name}_arm.cpp")
                list(APPEND ncnn_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/layer/arm/${name}_arm.cpp")
                set(WITH_LAYER_${name}_arm 1)
            endif()
        else()
            if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/layer/x86/${name}_x86.cpp")
                list(APPEND ncnn_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/layer/x86/${name}_x86.cpp")
                set(WITH_LAYER_${name}_x86 1)
            endif()
        endif()
    endif()

    # generate layer_declaration and layer_registry file
    if(WITH_LAYER_${name})
        if(WITH_LAYER_${name}_arm)
            file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/layer_declaration.h
                "extern Layer* ${class}_arm_layer_creator();\n")
            file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/layer_registry.h
                "#if NCNN_STRING\n{\"${class}\",${class}_arm_layer_creator},\n#else\n{${class}_arm_layer_creator},\n#endif\n")
        elseif(WITH_LAYER_${name}_x86)
            file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/layer_declaration.h
                "extern Layer* ${class}_x86_layer_creator();\n")
            file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/layer_registry.h
                "#if NCNN_STRING\n{\"${class}\",${class}_x86_layer_creator},\n#else\n{${class}_x86_layer_creator},\n#endif\n")
        else()
            file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/layer_declaration.h
                "extern Layer* ${class}_layer_creator();\n")
            file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/layer_registry.h
                "#if NCNN_STRING\n{\"${class}\",${class}_layer_creator},\n#else\n{${class}_layer_creator},\n#endif\n")
        endif()
    else()
        # disabled layers still occupy a slot, with a null creator
        file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/layer_registry.h "#if NCNN_STRING\n{\"${class}\",0},\n#else\n{0},\n#endif\n")
    endif()
endmacro()
# Start from scratch: ncnn_add_layer() only ever appends to these two files.
file(REMOVE
    ${CMAKE_CURRENT_BINARY_DIR}/layer_declaration.h
    ${CMAKE_CURRENT_BINARY_DIR}/layer_registry.h
)

# Layer implementations. The second argument is the default of the matching
# WITH_LAYER_<name> option; the order of the calls is the order of the entries
# written to layer_registry.h, so keep it unchanged.
ncnn_add_layer(AbsVal ON)
ncnn_add_layer(ArgMax OFF)
ncnn_add_layer(BatchNorm ON)
ncnn_add_layer(Bias ON)
ncnn_add_layer(BNLL ON)
ncnn_add_layer(Concat ON)
ncnn_add_layer(Convolution ON)
ncnn_add_layer(Crop ON)
ncnn_add_layer(Deconvolution ON)
ncnn_add_layer(Dropout ON)
ncnn_add_layer(Eltwise ON)
ncnn_add_layer(ELU ON)
ncnn_add_layer(Embed OFF)
ncnn_add_layer(Exp ON)
ncnn_add_layer(Flatten ON)
ncnn_add_layer(InnerProduct ON)
ncnn_add_layer(Input ON)
ncnn_add_layer(Log ON)
ncnn_add_layer(LRN ON)
ncnn_add_layer(MemoryData OFF)
ncnn_add_layer(MVN ON)
ncnn_add_layer(Pooling ON)
ncnn_add_layer(Power ON)
ncnn_add_layer(PReLU ON)
ncnn_add_layer(Proposal OFF)
ncnn_add_layer(Reduction OFF)
ncnn_add_layer(ReLU ON)
ncnn_add_layer(Reshape OFF)
ncnn_add_layer(ROIPooling OFF)
ncnn_add_layer(Scale ON)
ncnn_add_layer(Sigmoid ON)
ncnn_add_layer(Slice ON)
ncnn_add_layer(Softmax ON)
ncnn_add_layer(Split ON)
ncnn_add_layer(SPP OFF)
ncnn_add_layer(TanH ON)
ncnn_add_layer(Threshold ON)
ncnn_add_layer(Tile OFF)
ncnn_add_layer(RNN OFF)
ncnn_add_layer(LSTM OFF)
# The static library is built from the core sources plus every enabled layer.
add_library(ncnn STATIC ${ncnn_SRCS})

# Install the archive into lib/ ...
install(
    TARGETS ncnn
    ARCHIVE DESTINATION lib
)

# ... and the public headers, including the generated platform.h, into include/.
install(
    FILES blob.h cpu.h layer.h mat.h net.h opencv.h ${CMAKE_CURRENT_BINARY_DIR}/platform.h
    DESTINATION include
)
| @@ -0,0 +1,24 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "blob.h" | |||
| namespace ncnn { | |||
| Blob::Blob() | |||
| { | |||
| producer = -1; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,43 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef NCNN_BLOB_H | |||
| #define NCNN_BLOB_H | |||
| #include <string> | |||
| #include <vector> | |||
| #include "platform.h" | |||
| namespace ncnn { | |||
// Records, for one blob, which layer produces it and which layers consume it.
class Blob
{
public:
    // creates an empty blob
    Blob();

#if NCNN_STRING
    // name of the blob (only present when NCNN_STRING is enabled)
    std::string name;
#endif // NCNN_STRING

    // index of the layer that produces this blob as its output
    int producer;

    // indices of the layers that need this blob as an input
    std::vector<int> consumers;
};
| } // namespace ncnn | |||
| #endif // NCNN_BLOB_H | |||
| @@ -0,0 +1,471 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
#include "cpu.h"

#include <stdio.h>
#include <string.h>
#include <vector>
| #ifdef _OPENMP | |||
| #include <omp.h> | |||
| #endif | |||
| #ifdef __ANDROID__ | |||
| #include <sys/syscall.h> | |||
| #include <unistd.h> | |||
| #endif | |||
| #if __APPLE__ | |||
| #include "TargetConditionals.h" | |||
| #if TARGET_OS_IPHONE | |||
| #include <sys/types.h> | |||
| #include <sys/sysctl.h> | |||
| #include <mach/machine.h> | |||
| #define __IOS__ 1 | |||
| #endif | |||
| #endif | |||
| namespace ncnn { | |||
#ifdef __ANDROID__
// Extract the ELF hardware-capability bitmap (the AT_HWCAP entry) from
// /proc/self/auxv.
// Returns 0 when the file cannot be opened or no AT_HWCAP entry is found.
static unsigned int get_elf_hwcap_from_proc_self_auxv()
{
    FILE* fp = fopen("/proc/self/auxv", "rb");
    if (!fp)
    {
        return 0;
    }

#ifndef AT_HWCAP
#define AT_HWCAP 16
#endif
#ifndef AT_HWCAP2
#define AT_HWCAP2 26
#endif

    // Every auxv record is a pair of native machine words: 4+4 bytes in a
    // 32-bit process, 8+8 bytes in a 64-bit one. unsigned long has exactly
    // that width on both Android ABIs; a fixed pair of unsigned ints would
    // split each 64-bit record in two and misread the aarch64 vector.
    struct { unsigned long tag; unsigned long value; } entry;

    unsigned int result = 0;
    while (fread((char*)&entry, sizeof(entry), 1, fp) == 1)
    {
        // an all-zero record terminates the vector
        if (entry.tag == 0 && entry.value == 0)
            break;

        if (entry.tag == AT_HWCAP)
        {
            result = (unsigned int)entry.value;
            break;
        }
    }

    fclose(fp);

    return result;
}

// capability bitmap, read once during static initialization
static unsigned int g_hwcaps = get_elf_hwcap_from_proc_self_auxv();

#if __aarch64__
// from arch/arm64/include/uapi/asm/hwcap.h
#define HWCAP_ASIMD (1 << 1)
#define HWCAP_ASIMDHP (1 << 10)
#else
// from arch/arm/include/uapi/asm/hwcap.h
#define HWCAP_NEON (1 << 12)
#define HWCAP_VFPv4 (1 << 16)
#endif
#endif // __ANDROID__
#if __IOS__
// Reads the "hw.cputype" sysctl; the result stays 0 if the call does not
// fill it in (the return code of sysctlbyname is not checked).
static cpu_type_t get_hw_cputype()
{
    cpu_type_t cputype = 0;
    size_t size = sizeof(cputype);
    sysctlbyname("hw.cputype", &cputype, &size, NULL, 0);
    return cputype;
}

// Reads the "hw.cpusubtype" sysctl; the result stays 0 if the call does not
// fill it in (the return code of sysctlbyname is not checked).
static cpu_subtype_t get_hw_cpusubtype()
{
    cpu_subtype_t cpusubtype = 0;
    size_t size = sizeof(cpusubtype);
    sysctlbyname("hw.cpusubtype", &cpusubtype, &size, NULL, 0);
    return cpusubtype;
}

// both values are queried once during static initialization
static cpu_type_t g_hw_cputype = get_hw_cputype();
static cpu_subtype_t g_hw_cpusubtype = get_hw_cpusubtype();
#endif // __IOS__
// Reports whether the running CPU offers NEON (ASIMD on aarch64).
// Non-zero means supported; builds that target neither Android nor iOS
// always report 0.
// NOTE(review): the 32-bit iOS test uses '>' so subtype CPU_SUBTYPE_ARM_V7
// itself is reported as unsupported - confirm this is intended.
int cpu_support_arm_neon()
{
#if defined(__ANDROID__) && __aarch64__
    return g_hwcaps & HWCAP_ASIMD;
#elif defined(__ANDROID__)
    return g_hwcaps & HWCAP_NEON;
#elif __IOS__ && __aarch64__
    return g_hw_cputype == CPU_TYPE_ARM64;
#elif __IOS__
    return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7;
#else
    return 0;
#endif
}
// Reports whether the running CPU offers VFPv4.
// Non-zero means supported; builds that target neither Android nor iOS
// always report 0.
// NOTE(review): the 32-bit iOS test uses '>' so subtype CPU_SUBTYPE_ARM_V7S
// itself is reported as unsupported - confirm this is intended.
int cpu_support_arm_vfpv4()
{
#if defined(__ANDROID__) && __aarch64__
    // neon always enable fma and fp16
    return g_hwcaps & HWCAP_ASIMD;
#elif defined(__ANDROID__)
    return g_hwcaps & HWCAP_VFPv4;
#elif __IOS__ && __aarch64__
    return g_hw_cputype == CPU_TYPE_ARM64;
#elif __IOS__
    return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7S;
#else
    return 0;
#endif
}
// Returns non-zero when the cpu offers ASIMD half precision.
// Only Android on aarch64 is probed (via the cached hwcap bits); every other
// configuration, iOS included, reports 0.
int cpu_support_arm_asimdhp()
{
#if defined(__ANDROID__) && __aarch64__
    return g_hwcaps & HWCAP_ASIMDHP;
#else
    return 0;
#endif
}
// Counts the cpus of the device; the result is never below 1.
//   android: number of lines in /proc/cpuinfo that start with "processor"
//   ios:     the "hw.ncpu" sysctl
//   other:   always 1
static int get_cpucount()
{
#ifdef __ANDROID__
    FILE* cpuinfo = fopen("/proc/cpuinfo", "rb");
    if (!cpuinfo)
        return 1;

    int processors = 0;
    char text[1024];

    // fgets yields NULL at end of file, which ends the scan
    while (fgets(text, sizeof(text), cpuinfo))
    {
        if (memcmp(text, "processor", 9) == 0)
            processors++;
    }

    fclose(cpuinfo);

    return processors < 1 ? 1 : processors;
#elif __IOS__
    int ncpu = 0;
    size_t ncpu_size = sizeof(ncpu);
    sysctlbyname("hw.ncpu", &ncpu, &ncpu_size, NULL, 0);

    return ncpu < 1 ? 1 : ncpu;
#else
    return 1;
#endif
}

// cpu count, evaluated once during static initialization
static int g_cpucount = get_cpucount();

// Returns the cached cpu count.
int get_cpu_count()
{
    return g_cpucount;
}
| #ifdef __ANDROID__ | |||
// Returns the largest frequency listed in the cpufreq time_in_state table of
// the given cpu, -1 when that table cannot be opened, and 0 when it holds no
// parsable entry.
static int get_max_freq_khz(int cpuid)
{
    char stats_path[256];
    sprintf(stats_path, "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);

    FILE* stats = fopen(stats_path, "rb");
    if (!stats)
        return -1;

    int highest_khz = 0;

    for (;;)
    {
        if (feof(stats))
            break;

        // each record is "<freq> <time>"; only the first number is kept
        int khz = 0;
        if (fscanf(stats, "%d %*d", &khz) != 1)
            break;

        if (highest_khz < khz)
            highest_khz = khz;
    }

    fclose(stats);

    return highest_khz;
}
// Restricts the calling thread (identified through gettid) to the given cpu ids
// using the raw sched_setaffinity syscall.
// Returns 0 on success, -1 when an id does not fit the affinity mask or when
// the syscall reports failure.
static int set_sched_affinity(const std::vector<int>& cpuids)
{
    // cpu_set_t definition
    // ref http://stackoverflow.com/questions/16319725/android-set-thread-affinity
#define CPU_SETSIZE 1024
#define __NCPUBITS  (8 * sizeof (unsigned long))
typedef struct
{
    unsigned long __bits[CPU_SETSIZE / __NCPUBITS];
} cpu_set_t;

#define CPU_SET(cpu, cpusetp) \
    ((cpusetp)->__bits[(cpu)/__NCPUBITS] |= (1UL << ((cpu) % __NCPUBITS)))

#define CPU_ZERO(cpusetp) \
    memset((cpusetp), 0, sizeof(cpu_set_t))

    // set affinity for thread
    pid_t pid = gettid();

    cpu_set_t mask;
    CPU_ZERO(&mask);
    for (int i=0; i<(int)cpuids.size(); i++)
    {
        const int cpu = cpuids[i];

        // CPU_SET indexes __bits without any check, so an id outside
        // [0, CPU_SETSIZE) would write beyond the mask
        if (cpu < 0 || cpu >= CPU_SETSIZE)
        {
            fprintf(stderr, "cpu id %d out of range\n", cpu);
            return -1;
        }

        CPU_SET(cpu, &mask);
    }

    int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask);
    if (syscallret)
    {
        fprintf(stderr, "syscall error %d\n", syscallret);
        return -1;
    }

    return 0;
}
| static int sort_cpuid_by_max_frequency(std::vector<int>& cpuids, int* little_cluster_offset) | |||
| { | |||
| const int cpu_count = cpuids.size(); | |||
| *little_cluster_offset = 0; | |||
| if (cpu_count == 0) | |||
| return 0; | |||
| std::vector<int> cpu_max_freq_khz; | |||
| cpu_max_freq_khz.resize(cpu_count); | |||
| for (int i=0; i<cpu_count; i++) | |||
| { | |||
| int max_freq_khz = get_max_freq_khz(i); | |||
| // printf("%d max freq = %d khz\n", i, max_freq_khz); | |||
| cpuids[i] = i; | |||
| cpu_max_freq_khz[i] = max_freq_khz; | |||
| } | |||
| // sort cpuid as big core first | |||
| // simple bubble sort | |||
| for (int i=0; i<cpu_count; i++) | |||
| { | |||
| for (int j=i+1; j<cpu_count; j++) | |||
| { | |||
| if (cpu_max_freq_khz[i] < cpu_max_freq_khz[j]) | |||
| { | |||
| // swap | |||
| int tmp = cpuids[i]; | |||
| cpuids[i] = cpuids[j]; | |||
| cpuids[j] = tmp; | |||
| tmp = cpu_max_freq_khz[i]; | |||
| cpu_max_freq_khz[i] = cpu_max_freq_khz[j]; | |||
| cpu_max_freq_khz[j] = tmp; | |||
| } | |||
| } | |||
| } | |||
| // SMP | |||
| int mid_max_freq_khz = (cpu_max_freq_khz.front() + cpu_max_freq_khz.back()) / 2; | |||
| if (mid_max_freq_khz == cpu_max_freq_khz.back()) | |||
| return 0; | |||
| for (int i=0; i<cpu_count; i++) | |||
| { | |||
| if (cpu_max_freq_khz[i] < mid_max_freq_khz) | |||
| { | |||
| *little_cluster_offset = i; | |||
| break; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| #endif // __ANDROID__ | |||
// last powersave mode applied successfully through set_cpu_powersave, 0 initially
static int g_powersave = 0;

// Returns the powersave mode currently recorded.
int get_cpu_powersave()
{
    return g_powersave;
}
// Binds the worker threads to a cpu cluster.
//   0 = all cores, 1 = little cluster only, 2 = big cluster only
// Returns 0 on success and records the mode; returns -1 when the mode is
// unknown, when no big/little split was detected, when setting the affinity
// fails, or on platforms other than android.
int set_cpu_powersave(int powersave)
{
#ifdef __ANDROID__
    // built on the first call: cpu ids in descending max frequency order
    static std::vector<int> sorted_cpuids;
    static int little_cluster_offset = 0;

    if (sorted_cpuids.empty())
    {
        // 0 ~ g_cpucount
        sorted_cpuids.reserve(g_cpucount);
        for (int cpu = 0; cpu < g_cpucount; cpu++)
            sorted_cpuids.push_back(cpu);

        sort_cpuid_by_max_frequency(sorted_cpuids, &little_cluster_offset);
    }

    if (little_cluster_offset == 0)
    {
        fprintf(stderr, "SMP cpu powersave not supported\n");
        return -1;
    }

    // pick the cpu ids the threads will be bound to
    std::vector<int> cpuids;
    switch (powersave)
    {
    case 0:
        cpuids = sorted_cpuids;
        break;
    case 1:
        cpuids.assign(sorted_cpuids.begin() + little_cluster_offset, sorted_cpuids.end());
        break;
    case 2:
        cpuids.assign(sorted_cpuids.begin(), sorted_cpuids.begin() + little_cluster_offset);
        break;
    default:
        fprintf(stderr, "powersave %d not supported\n", powersave);
        return -1;
    }

#ifdef _OPENMP
    // one loop iteration per thread, each iteration binds the thread running it
    const int num_threads = cpuids.size();
    omp_set_num_threads(num_threads);

    std::vector<int> ssarets(num_threads, 0);
    #pragma omp parallel for
    for (int t = 0; t < num_threads; t++)
    {
        ssarets[t] = set_sched_affinity(cpuids);
    }

    for (int t = 0; t < num_threads; t++)
    {
        if (ssarets[t] != 0)
            return -1;
    }
#else
    if (set_sched_affinity(cpuids) != 0)
        return -1;
#endif

    g_powersave = powersave;

    return 0;
#else
    // thread affinity is not supported on ios; other platforms are still TODO
    return -1;
#endif
}
// Returns the number of threads an upcoming OpenMP parallel region may use,
// i.e. the value configured through set_omp_num_threads. Returns 1 when built
// without OpenMP.
int get_omp_num_threads()
{
#ifdef _OPENMP
    // omp_get_num_threads() reports the size of the *current* team, which is
    // always 1 outside a parallel region; omp_get_max_threads() is the
    // counterpart of omp_set_num_threads().
    return omp_get_max_threads();
#else
    return 1;
#endif
}
// Forwards the requested thread count to OpenMP; does nothing when built
// without OpenMP.
void set_omp_num_threads(int num_threads)
{
#ifndef _OPENMP
    (void)num_threads;
#else
    omp_set_num_threads(num_threads);
#endif
}
// Returns the OpenMP dynamic-adjustment setting, or 0 when built without
// OpenMP.
int get_omp_dynamic()
{
#ifndef _OPENMP
    return 0;
#else
    return omp_get_dynamic();
#endif
}
// Forwards the dynamic-adjustment flag to OpenMP; does nothing when built
// without OpenMP.
void set_omp_dynamic(int dynamic)
{
#ifndef _OPENMP
    (void)dynamic;
#else
    omp_set_dynamic(dynamic);
#endif
}
| } // namespace ncnn | |||
| @@ -0,0 +1,51 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef NCNN_CPU_H | |||
| #define NCNN_CPU_H | |||
namespace ncnn {

// Probes for optional cpu features.
// neon: armv7 neon or aarch64 asimd
int cpu_support_arm_neon();
// vfpv4: armv7 fp16 + fma
int cpu_support_arm_vfpv4();
// asimdhp: aarch64 asimd half precision
int cpu_support_arm_asimdhp();

// number of cpus
int get_cpu_count();

// Powersave mode: binds all threads to the little clusters when enabled.
// Affects HMP cpus such as ARM big.LITTLE.
// Only implemented on android at the moment.
// Switching the mode is expensive and not thread-safe.
//   0 = all cores enabled (default)
//   1 = only little clusters enabled
//   2 = only big clusters enabled
// The setter returns 0 on success.
int get_cpu_powersave();
int set_cpu_powersave(int powersave);

// thin wrappers around the openmp routines
int get_omp_num_threads();
void set_omp_num_threads(int num_threads);

int get_omp_dynamic();
void set_omp_dynamic(int dynamic);

} // namespace ncnn
| #endif // NCNN_CPU_H | |||
| @@ -0,0 +1,130 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "layer.h" | |||
| #include <stdio.h> | |||
| #include <string.h> | |||
| namespace ncnn { | |||
| Layer::Layer() | |||
| { | |||
| one_blob_only = false; | |||
| support_inplace = false; | |||
| } | |||
| Layer::~Layer() | |||
| { | |||
| } | |||
| #if NCNN_STDIO | |||
| #if NCNN_STRING | |||
| int Layer::load_param(FILE* /*paramfp*/) | |||
| { | |||
| return 0; | |||
| } | |||
| #endif // NCNN_STRING | |||
| int Layer::load_param_bin(FILE* /*paramfp*/) | |||
| { | |||
| return 0; | |||
| } | |||
| int Layer::load_model(FILE* /*binfp*/) | |||
| { | |||
| return 0; | |||
| } | |||
| #endif // NCNN_STDIO | |||
| int Layer::load_param(const unsigned char*& /*mem*/) | |||
| { | |||
| return 0; | |||
| } | |||
| int Layer::load_model(const unsigned char*& /*mem*/) | |||
| { | |||
| return 0; | |||
| } | |||
| int Layer::forward(const std::vector<Mat>& /*bottom_blobs*/, std::vector<Mat>& /*top_blobs*/) const | |||
| { | |||
| return -1; | |||
| } | |||
| int Layer::forward(const Mat& /*bottom_blob*/, Mat& /*top_blob*/) const | |||
| { | |||
| return -1; | |||
| } | |||
| int Layer::forward_inplace(std::vector<Mat>& bottom_top_blobs) const | |||
| { | |||
| std::vector<Mat> top_blobs; | |||
| int ret = forward(bottom_top_blobs, top_blobs); | |||
| bottom_top_blobs = top_blobs; | |||
| return ret; | |||
| } | |||
| int Layer::forward_inplace(Mat& bottom_top_blob) const | |||
| { | |||
| Mat top_blob; | |||
| int ret = forward(bottom_top_blob, top_blob); | |||
| bottom_top_blob = top_blob; | |||
| return ret; | |||
| } | |||
| #include "layer_declaration.h" | |||
| static const layer_registry_entry layer_registry[] = | |||
| { | |||
| #include "layer_registry.h" | |||
| }; | |||
| static const int layer_registry_entry_count = sizeof(layer_registry) / sizeof(layer_registry_entry); | |||
| #if NCNN_STRING | |||
| int layer_to_index(const char* type) | |||
| { | |||
| for (int i=0; i<layer_registry_entry_count; i++) | |||
| { | |||
| if (strcmp(type, layer_registry[i].name) == 0) | |||
| { | |||
| return i; | |||
| } | |||
| } | |||
| fprintf(stderr, "layer %s not exists\n", type); | |||
| return -1; | |||
| } | |||
| #endif // NCNN_STRING | |||
| Layer* create_layer(int index) | |||
| { | |||
| if (index < 0 || index >= layer_registry_entry_count) | |||
| { | |||
| fprintf(stderr, "layer index %d not exists\n", index); | |||
| return 0; | |||
| } | |||
| layer_creator_func layer_creator = layer_registry[index].creator; | |||
| if (!layer_creator) | |||
| { | |||
| fprintf(stderr, "layer index %d not enabled\n", index); | |||
| return 0; | |||
| } | |||
| return layer_creator(); | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,163 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef NCNN_LAYER_H | |||
| #define NCNN_LAYER_H | |||
| #include <stdio.h> | |||
| #include <string> | |||
| #include <vector> | |||
| #include "mat.h" | |||
| #include "platform.h" | |||
| namespace ncnn { | |||
| class Layer | |||
| { | |||
| public: | |||
| // empty | |||
| Layer(); | |||
| // virtual destructor | |||
| virtual ~Layer(); | |||
| #if NCNN_STDIO | |||
| #if NCNN_STRING | |||
| // load layer specific parameter from plain param file | |||
| // return 0 if success | |||
| virtual int load_param(FILE* paramfp); | |||
| #endif // NCNN_STRING | |||
| // load layer specific parameter from binary param file | |||
| // return 0 if success | |||
| virtual int load_param_bin(FILE* paramfp); | |||
| // load layer specific weight data from model file | |||
| // return 0 if success | |||
| virtual int load_model(FILE* binfp); | |||
| #endif // NCNN_STDIO | |||
| // load layer specific parameter from memory | |||
| // memory pointer is 32-bit aligned | |||
| // return 0 if success | |||
| virtual int load_param(const unsigned char*& mem); | |||
| // load layer specific weight data from memory | |||
| // memory pointer is 32-bit aligned | |||
| // return 0 if success | |||
| virtual int load_model(const unsigned char*& mem); | |||
| public: | |||
| // one input and one output blob | |||
| bool one_blob_only; | |||
| // support inplace inference | |||
| bool support_inplace; | |||
| public: | |||
| // implement inference | |||
| // return 0 if success | |||
| virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const; | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
| // implement inplace inference | |||
| // return 0 if success | |||
| virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| public: | |||
| #if NCNN_STRING | |||
| // layer type name | |||
| std::string type; | |||
| // layer name | |||
| std::string name; | |||
| #endif // NCNN_STRING | |||
| // blob index which this layer needs as input | |||
| std::vector<int> bottoms; | |||
| // blob index which this layer produces as output | |||
| std::vector<int> tops; | |||
| }; | |||
// Numeric ids of the built-in layer types. The values are written out
// explicitly so that an id never changes when entries are added or moved.
namespace LayerType {
enum
{
    AbsVal        = 0,
    ArgMax        = 1,
    BatchNorm     = 2,
    Bias          = 3,
    BNLL          = 4,
    Concat        = 5,
    Convolution   = 6,
    Crop          = 7,
    Deconvolution = 8,
    Dropout       = 9,
    ELU           = 10,
    Eltwise       = 11,
    Embed         = 12,
    Exp           = 13,
    Flatten       = 14,
    InnerProduct  = 15,
    Input         = 16,
    Log           = 17,
    LRN           = 18,
    MemoryData    = 19,
    MVN           = 20,
    Pooling       = 21,
    Power         = 22,
    PReLU         = 23,
    Proposal      = 24,
    Reduction     = 25,
    ReLU          = 26,
    Reshape       = 27,
    ROIPooling    = 28,
    Scale         = 29,
    Sigmoid       = 30,
    Slice         = 31,
    Softmax       = 32,
    Split         = 33,
    SPP           = 34,
    TanH          = 35,
    Threshold     = 36,
    Tile          = 37,
    RNN           = 38,
    LSTM          = 39,

    // bit 8 — presumably marks custom layer types; confirm against the users of this flag
    CustomBit     = (1<<8),
};
} // namespace LayerType
| // layer factory function | |||
| typedef Layer* (*layer_creator_func)(); | |||
| struct layer_registry_entry | |||
| { | |||
| #if NCNN_STRING | |||
| // layer type name | |||
| const char* name; | |||
| #endif // NCNN_STRING | |||
| // layer factory entry | |||
| layer_creator_func creator; | |||
| }; | |||
| #if NCNN_STRING | |||
| // get layer type from type name | |||
| int layer_to_index(const char* type); | |||
| #endif // NCNN_STRING | |||
| // create layer from layer type | |||
| Layer* create_layer(int index); | |||
| #define DEFINE_LAYER_CREATOR(name) \ | |||
| Layer* name##_layer_creator() { return new name; } | |||
| } // namespace ncnn | |||
| #endif // NCNN_LAYER_H | |||
| @@ -0,0 +1,76 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "absval.h" | |||
| namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(AbsVal) | |||
| AbsVal::AbsVal() | |||
| { | |||
| one_blob_only = true; | |||
| support_inplace = true; | |||
| } | |||
| int AbsVal::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| int size = w * h; | |||
| top_blob.create(w, h, channels); | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob.channel(q); | |||
| float* outptr = top_blob.channel(q); | |||
| for (int i=0; i<size; i++) | |||
| { | |||
| if (ptr[i] < 0) | |||
| outptr[i] = -ptr[i]; | |||
| else | |||
| outptr[i] = ptr[i]; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| int AbsVal::forward_inplace(Mat& bottom_top_blob) const | |||
| { | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| int channels = bottom_top_blob.c; | |||
| int size = w * h; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| for (int i=0; i<size; i++) | |||
| { | |||
| if (ptr[i] < 0) | |||
| ptr[i] = -ptr[i]; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,36 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_ABSVAL_H | |||
| #define LAYER_ABSVAL_H | |||
| #include "layer.h" | |||
| namespace ncnn { | |||
| class AbsVal : public Layer | |||
| { | |||
| public: | |||
| AbsVal(); | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| public: | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_ABSVAL_H | |||
| @@ -0,0 +1,108 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "argmax.h" | |||
| #include <algorithm> | |||
| #include <functional> | |||
| namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(ArgMax) | |||
| ArgMax::ArgMax() | |||
| { | |||
| } | |||
#if NCNN_STDIO
#if NCNN_STRING
// Reads "out_max_val topk" as two decimal integers from the plain param file.
// Returns 0 on success, -1 (after logging) when fewer than two values parse.
int ArgMax::load_param(FILE* paramfp)
{
    int nscan = fscanf(paramfp, "%d %d", &out_max_val, &topk);
    if (nscan != 2)
    {
        fprintf(stderr, "ArgMax load_param failed %d\n", nscan);
        return -1;
    }

    return 0;
}
#endif // NCNN_STRING

// Reads out_max_val and topk as two raw ints from the binary param file.
// Returns 0 on success, -1 (after logging) when the file ends early or a read
// fails, so a truncated file is no longer accepted silently.
int ArgMax::load_param_bin(FILE* paramfp)
{
    if (fread(&out_max_val, sizeof(int), 1, paramfp) != 1
        || fread(&topk, sizeof(int), 1, paramfp) != 1)
    {
        fprintf(stderr, "ArgMax load_param_bin failed\n");
        return -1;
    }

    return 0;
}
#endif // NCNN_STDIO
| int ArgMax::load_param(const unsigned char*& mem) | |||
| { | |||
| out_max_val = *(int*)(mem); | |||
| mem += 4; | |||
| topk = *(int*)(mem); | |||
| mem += 4; | |||
| return 0; | |||
| } | |||
| int ArgMax::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| { | |||
| int size = bottom_blob.total(); | |||
| if (out_max_val) | |||
| top_blob.create(topk, 2); | |||
| else | |||
| top_blob.create(topk, 1); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| const float* ptr = bottom_blob; | |||
| // partial sort topk with index | |||
| // optional value | |||
| std::vector< std::pair<float, int> > vec; | |||
| vec.resize(size); | |||
| for (int i=0; i<size; i++) | |||
| { | |||
| vec[i] = std::make_pair(ptr[i], i); | |||
| } | |||
| std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), | |||
| std::greater< std::pair<float, int> >()); | |||
| float* outptr = top_blob; | |||
| if (out_max_val) | |||
| { | |||
| float* valptr = outptr + topk; | |||
| for (int i=0; i<topk; i++) | |||
| { | |||
| outptr[i] = vec[i].first; | |||
| valptr[i] = vec[i].second; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for (int i=0; i<topk; i++) | |||
| { | |||
| outptr[i] = vec[i].second; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,44 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_ARGMAX_H | |||
| #define LAYER_ARGMAX_H | |||
| #include "layer.h" | |||
| namespace ncnn { | |||
| class ArgMax : public Layer | |||
| { | |||
| public: | |||
| ArgMax(); | |||
| #if NCNN_STDIO | |||
| #if NCNN_STRING | |||
| virtual int load_param(FILE* paramfp); | |||
| #endif // NCNN_STRING | |||
| virtual int load_param_bin(FILE* paramfp); | |||
| #endif // NCNN_STDIO | |||
| virtual int load_param(const unsigned char*& mem); | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
| public: | |||
| int out_max_val; | |||
| int topk; | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_ARGMAX_H | |||
| @@ -0,0 +1,152 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "absval_arm.h" | |||
| #if __ARM_NEON | |||
| #include <arm_neon.h> | |||
| #endif // __ARM_NEON | |||
| namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(AbsVal_arm) | |||
| int AbsVal_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| int size = w * h; | |||
| top_blob.create(w, h, channels); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob.channel(q); | |||
| float* outptr = top_blob.channel(q); | |||
| #if __ARM_NEON | |||
| int nn = size >> 2; | |||
| int remain = size - (nn << 2); | |||
| #else | |||
| int remain = size; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _p = vld1q_f32(ptr); | |||
| float32x4_t _outp = vabsq_f32(_p); | |||
| vst1q_f32(outptr, _outp); | |||
| ptr += 4; | |||
| outptr += 4; | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| "0: \n" | |||
| "vld1.f32 {d0-d1}, [%1]! \n" | |||
| "vabs.f32 q0, q0 \n" | |||
| "subs %0, #1 \n" | |||
| "vst1.f32 {d0-d1}, [%2]! \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(ptr), // %1 | |||
| "=r"(outptr) // %2 | |||
| : "0"(nn), | |||
| "1"(ptr), | |||
| "2"(outptr) | |||
| : "cc", "memory", "q0" | |||
| ); | |||
| } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| *outptr = *ptr > 0 ? *ptr : -*ptr; | |||
| ptr++; | |||
| outptr++; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| int AbsVal_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| { | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| int channels = bottom_top_blob.c; | |||
| int size = w * h; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| #if __ARM_NEON | |||
| int nn = size >> 2; | |||
| int remain = size - (nn << 2); | |||
| #else | |||
| int remain = size; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _p = vld1q_f32(ptr); | |||
| _p = vabsq_f32(_p); | |||
| vst1q_f32(ptr, _p); | |||
| ptr += 4; | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| "0: \n" | |||
| "vld1.f32 {d0-d1}, [%1] \n" | |||
| "vabs.f32 q0, q0 \n" | |||
| "subs %0, #1 \n" | |||
| "vst1.f32 {d0-d1}, [%1]! \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(ptr) // %1 | |||
| : "0"(nn), | |||
| "1"(ptr) | |||
| : "cc", "memory", "q0" | |||
| ); | |||
| } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| *ptr = *ptr > 0 ? *ptr : -*ptr; | |||
| ptr++; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,34 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_ABSVAL_ARM_H | |||
| #define LAYER_ABSVAL_ARM_H | |||
| #include "absval.h" | |||
| namespace ncnn { | |||
| class AbsVal_arm : public AbsVal | |||
| { | |||
| public: | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| public: | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_ABSVAL_ARM_H | |||
| @@ -0,0 +1,186 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "batchnorm_arm.h" | |||
| #if __ARM_NEON | |||
| #include <arm_neon.h> | |||
| #endif // __ARM_NEON | |||
| namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(BatchNorm_arm) | |||
| int BatchNorm_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| { | |||
| // a = bias - slope * mean / sqrt(var) | |||
| // b = slope / sqrt(var) | |||
| // value = b * value + a | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int size = w * h; | |||
| top_blob.create(w, h, channels); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| const float* a_data_ptr = a_data; | |||
| const float* b_data_ptr = b_data; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob.channel(q); | |||
| float* outptr = top_blob.channel(q); | |||
| float a = a_data_ptr[q]; | |||
| float b = b_data_ptr[q]; | |||
| #if __ARM_NEON | |||
| int nn = size >> 2; | |||
| int remain = size - (nn << 2); | |||
| #else | |||
| int remain = size; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| float32x4_t _a = vdupq_n_f32(a); | |||
| float32x4_t _b = vdupq_n_f32(b); | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _p = vld1q_f32(ptr); | |||
| float32x4_t _outp = _a; | |||
| _outp = vfmaq_f32(_outp, _p, _b); | |||
| vst1q_f32(outptr, _outp); | |||
| ptr += 4; | |||
| outptr += 4; | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| "vdup.f32 q1, %6 \n" | |||
| "vdup.f32 q2, %7 \n" | |||
| "0: \n" | |||
| "pld [%1, #128] \n" | |||
| "vld1.f32 {d0-d1}, [%1 :128]! \n" | |||
| "vorr.32 q3, q1, q1 \n" | |||
| "vmla.f32 q3, q0, q2 \n" | |||
| "subs %0, #1 \n" | |||
| "vst1.f32 {d6-d7}, [%2 :128]! \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(ptr), // %1 | |||
| "=r"(outptr) // %2 | |||
| : "0"(nn), | |||
| "1"(ptr), | |||
| "2"(outptr), | |||
| "r"(a), // %6 | |||
| "r"(b) // %7 | |||
| : "cc", "memory", "q0", "q1", "q2", "q3" | |||
| ); | |||
| } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| *outptr = b * *ptr + a; | |||
| ptr++; | |||
| outptr++; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| int BatchNorm_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| { | |||
| // a = bias - slope * mean / sqrt(var) | |||
| // b = slope / sqrt(var) | |||
| // value = b * value + a | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| int size = w * h; | |||
| const float* a_data_ptr = a_data; | |||
| const float* b_data_ptr = b_data; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| float a = a_data_ptr[q]; | |||
| float b = b_data_ptr[q]; | |||
| #if __ARM_NEON | |||
| int nn = size >> 2; | |||
| int remain = size - (nn << 2); | |||
| #else | |||
| int remain = size; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| float32x4_t _a = vdupq_n_f32(a); | |||
| float32x4_t _b = vdupq_n_f32(b); | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _p = vld1q_f32(ptr); | |||
| float32x4_t _outp = _a; | |||
| _outp = vfmaq_f32(_outp, _p, _b); | |||
| vst1q_f32(ptr, _outp); | |||
| ptr += 4; | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| "vdup.f32 q1, %4 \n" | |||
| "vdup.f32 q2, %5 \n" | |||
| "0: \n" | |||
| "pld [%1, #128] \n" | |||
| "vld1.f32 {d0-d1}, [%1 :128] \n" | |||
| "vorr.32 q3, q1, q1 \n" | |||
| "vmla.f32 q3, q0, q2 \n" | |||
| "subs %0, #1 \n" | |||
| "vst1.f32 {d6-d7}, [%1 :128]! \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(ptr) // %1 | |||
| : "0"(nn), | |||
| "1"(ptr), | |||
| "r"(a), // %4 | |||
| "r"(b) // %5 | |||
| : "cc", "memory", "q0", "q1", "q2", "q3" | |||
| ); | |||
| } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| *ptr = b * *ptr + a; | |||
| ptr++; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,32 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_BATCHNORM_ARM_H | |||
| #define LAYER_BATCHNORM_ARM_H | |||
| #include "batchnorm.h" | |||
| namespace ncnn { | |||
| class BatchNorm_arm : public BatchNorm | |||
| { | |||
| public: | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_BATCHNORM_ARM_H | |||
| @@ -0,0 +1,122 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "bias_arm.h" | |||
| #if __ARM_NEON | |||
| #include <arm_neon.h> | |||
| #endif // __ARM_NEON | |||
| namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(Bias_arm) | |||
| int Bias_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| int size = w * h; | |||
| top_blob.create(w, h, channels); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| const float* bias_ptr = bias_data; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob.channel(q); | |||
| float* outptr = top_blob.channel(q); | |||
| float bias = bias_ptr[q]; | |||
| #if __ARM_NEON | |||
| int nn = size >> 2; | |||
| int remain = size - (nn << 2); | |||
| #else | |||
| int remain = size; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| float32x4_t _bias = vdupq_n_f32(bias); | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _p = vld1q_f32(ptr); | |||
| float32x4_t _outp = vaddq_f32(_p, _bias); | |||
| vst1q_f32(outptr, _outp); | |||
| ptr += 4; | |||
| outptr += 4; | |||
| } | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| *outptr = *ptr + bias; | |||
| ptr++; | |||
| outptr++; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| int Bias_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| { | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| int channels = bottom_top_blob.c; | |||
| int size = w * h; | |||
| const float* bias_ptr = bias_data; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| float bias = bias_ptr[q]; | |||
| #if __ARM_NEON | |||
| int nn = size >> 2; | |||
| int remain = size - (nn << 2); | |||
| #else | |||
| int remain = size; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| float32x4_t _bias = vdupq_n_f32(bias); | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _p = vld1q_f32(ptr); | |||
| float32x4_t _outp = vaddq_f32(_p, _bias); | |||
| vst1q_f32(ptr, _outp); | |||
| ptr += 4; | |||
| } | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| *ptr = *ptr + bias; | |||
| ptr++; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,32 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_BIAS_ARM_H | |||
| #define LAYER_BIAS_ARM_H | |||
| #include "bias.h" | |||
| namespace ncnn { | |||
| class Bias_arm : public Bias | |||
| { | |||
| public: | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_BIAS_ARM_H | |||
| @@ -0,0 +1,543 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #if __ARM_NEON | |||
| #include <arm_neon.h> | |||
| #endif // __ARM_NEON | |||
| static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int inch = bottom_blob.c; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| int outch = top_blob.c; | |||
| const float* kernel = _kernel; | |||
| const float* bias = _bias; | |||
| #pragma omp parallel for | |||
| for (int p=0; p<outch; p++) | |||
| { | |||
| Mat out = top_blob.channel(p); | |||
| const float bias0 = bias ? bias[p] : 0.f; | |||
| out.fill(bias0); | |||
| int q = 0; | |||
| for (; q+3<inch; q+=4) | |||
| { | |||
| float* outptr = out; | |||
| const float* img0 = bottom_blob.channel(q); | |||
| const float* img1 = bottom_blob.channel(q+1); | |||
| const float* img2 = bottom_blob.channel(q+2); | |||
| const float* img3 = bottom_blob.channel(q+3); | |||
| const float* kernel0 = kernel + p*inch + q; | |||
| const float k0 = kernel0[0]; | |||
| const float k1 = kernel0[1]; | |||
| const float k2 = kernel0[2]; | |||
| const float k3 = kernel0[3]; | |||
| const float* r0 = img0; | |||
| const float* r1 = img1; | |||
| const float* r2 = img2; | |||
| const float* r3 = img3; | |||
| int size = outw * outh; | |||
| #if __ARM_NEON | |||
| int nn = size >> 3; | |||
| int remain = size & 7; | |||
| #else | |||
| int remain = size; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| float32x4_t _k0 = vdupq_n_f32(k0); | |||
| float32x4_t _k1 = vdupq_n_f32(k1); | |||
| float32x4_t _k2 = vdupq_n_f32(k2); | |||
| float32x4_t _k3 = vdupq_n_f32(k3); | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _p = vld1q_f32(r0); | |||
| float32x4_t _pn = vld1q_f32(r0+4); | |||
| float32x4_t _outp = vld1q_f32(outptr); | |||
| float32x4_t _outpn = vld1q_f32(outptr+4); | |||
| _outp = vfmaq_f32(_outp, _p, _k0); | |||
| _outpn = vfmaq_f32(_outpn, _pn, _k0); | |||
| float32x4_t _p1 = vld1q_f32(r1); | |||
| float32x4_t _p1n = vld1q_f32(r1+4); | |||
| _outp = vfmaq_f32(_outp, _p1, _k1); | |||
| _outpn = vfmaq_f32(_outpn, _p1n, _k1); | |||
| float32x4_t _p2 = vld1q_f32(r2); | |||
| float32x4_t _p2n = vld1q_f32(r2+4); | |||
| _outp = vfmaq_f32(_outp, _p2, _k2); | |||
| _outpn = vfmaq_f32(_outpn, _p2n, _k2); | |||
| float32x4_t _p3 = vld1q_f32(r3); | |||
| float32x4_t _p3n = vld1q_f32(r3+4); | |||
| _outp = vfmaq_f32(_outp, _p3, _k3); | |||
| _outpn = vfmaq_f32(_outpn, _p3n, _k3); | |||
| vst1q_f32(outptr, _outp); | |||
| vst1q_f32(outptr+4, _outpn); | |||
| r0 += 8; | |||
| r1 += 8; | |||
| r2 += 8; | |||
| r3 += 8; | |||
| outptr += 8; | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| "pld [%2, #256] \n" | |||
| "vld1.f32 {d4-d7}, [%2 :128]! \n" | |||
| "0: \n" | |||
| "pld [%1, #256] \n" | |||
| "vld1.f32 {d0-d3}, [%1 :128] \n" | |||
| "vmla.f32 q0, q2, %q12 \n" | |||
| "vmla.f32 q1, q3, %q12 \n" | |||
| "pld [%3, #256] \n" | |||
| "vld1.f32 {d4-d7}, [%3 :128]! \n" | |||
| "vmla.f32 q0, q2, %q13 \n" | |||
| "vmla.f32 q1, q3, %q13 \n" | |||
| "pld [%4, #256] \n" | |||
| "vld1.f32 {d4-d7}, [%4 :128]! \n" | |||
| "vmla.f32 q0, q2, %q14 \n" | |||
| "vmla.f32 q1, q3, %q14 \n" | |||
| "pld [%5, #256] \n" | |||
| "vld1.f32 {d4-d7}, [%5 :128]! \n" | |||
| "vmla.f32 q0, q2, %q15 \n" | |||
| "vmla.f32 q1, q3, %q15 \n" | |||
| "pld [%2, #256] \n" | |||
| "vld1.f32 {d4-d7}, [%2 :128]! \n" | |||
| "subs %0, #1 \n" | |||
| "vst1.f32 {d0-d3}, [%1 :128]! \n" | |||
| "bne 0b \n" | |||
| "sub %2, #32 \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(outptr), // %1 | |||
| "=r"(r0), // %2 | |||
| "=r"(r1), // %3 | |||
| "=r"(r2), // %4 | |||
| "=r"(r3) // %5 | |||
| : "0"(nn), | |||
| "1"(outptr), | |||
| "2"(r0), | |||
| "3"(r1), | |||
| "4"(r2), | |||
| "5"(r3), | |||
| "w"(_k0), // %12 | |||
| "w"(_k1), // %13 | |||
| "w"(_k2), // %14 | |||
| "w"(_k3) // %15 | |||
| : "cc", "memory", "q0", "q1", "q2", "q3" | |||
| ); | |||
| } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| float sum = *r0 * k0; | |||
| float sum1 = *r1 * k1; | |||
| float sum2 = *r2 * k2; | |||
| float sum3 = *r3 * k3; | |||
| *outptr += sum + sum1 + sum2 + sum3; | |||
| r0++; | |||
| r1++; | |||
| r2++; | |||
| r3++; | |||
| outptr++; | |||
| } | |||
| } | |||
| for (; q<inch; q++) | |||
| { | |||
| float* outptr = out; | |||
| const float* img0 = bottom_blob.channel(q); | |||
| const float* kernel0 = kernel + p*inch + q; | |||
| const float k0 = kernel0[0]; | |||
| const float* r0 = img0; | |||
| int size = outw * outh; | |||
| #if __ARM_NEON | |||
| int nn = size >> 3; | |||
| int remain = size & 7; | |||
| #else | |||
| int remain = size; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| float32x4_t _k0 = vdupq_n_f32(k0); | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _p = vld1q_f32(r0); | |||
| float32x4_t _outp = vld1q_f32(outptr); | |||
| float32x4_t _pn = vld1q_f32(r0+4); | |||
| float32x4_t _outpn = vld1q_f32(outptr+4); | |||
| _outp = vfmaq_f32(_outp, _p, _k0); | |||
| _outpn = vfmaq_f32(_outpn, _pn, _k0); | |||
| vst1q_f32(outptr, _outp); | |||
| vst1q_f32(outptr+4, _outpn); | |||
| r0 += 8; | |||
| outptr += 8; | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| "pld [%2, #256] \n" | |||
| "vld1.f32 {d4-d7}, [%2 :128]! \n" | |||
| "0: \n" | |||
| "pld [%1, #256] \n" | |||
| "vld1.f32 {d0-d3}, [%1 :128] \n" | |||
| "vmla.f32 q0, q2, %q6 \n" | |||
| "vmla.f32 q1, q3, %q6 \n" | |||
| "pld [%2, #256] \n" | |||
| "vld1.f32 {d4-d7}, [%2 :128]! \n" | |||
| "subs %0, #1 \n" | |||
| "vst1.f32 {d0-d3}, [%1 :128]! \n" | |||
| "bne 0b \n" | |||
| "sub %2, #32 \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(outptr), // %1 | |||
| "=r"(r0) // %2 | |||
| : "0"(nn), | |||
| "1"(outptr), | |||
| "2"(r0), | |||
| "w"(_k0) // %6 | |||
| : "cc", "memory", "q0", "q1", "q2", "q3" | |||
| ); | |||
| } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| float sum = *r0 * k0; | |||
| *outptr += sum; | |||
| r0++; | |||
| outptr++; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int inch = bottom_blob.c; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| int outch = top_blob.c; | |||
| const int tailstep = w - 2*outw + w; | |||
| const float* kernel = _kernel; | |||
| const float* bias = _bias; | |||
| #pragma omp parallel for | |||
| for (int p=0; p<outch; p++) | |||
| { | |||
| Mat out = top_blob.channel(p); | |||
| const float bias0 = bias ? bias[p] : 0.f; | |||
| out.fill(bias0); | |||
| int q = 0; | |||
| for (; q+3<inch; q+=4) | |||
| { | |||
| float* outptr = out; | |||
| const float* img0 = bottom_blob.channel(q); | |||
| const float* img1 = bottom_blob.channel(q+1); | |||
| const float* img2 = bottom_blob.channel(q+2); | |||
| const float* img3 = bottom_blob.channel(q+3); | |||
| const float* kernel0 = kernel + p*inch + q; | |||
| const float k0 = kernel0[0]; | |||
| const float k1 = kernel0[1]; | |||
| const float k2 = kernel0[2]; | |||
| const float k3 = kernel0[3]; | |||
| const float* r0 = img0; | |||
| const float* r1 = img1; | |||
| const float* r2 = img2; | |||
| const float* r3 = img3; | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| #if __ARM_NEON | |||
| int nn = outw >> 3; | |||
| int remain = outw & 7; | |||
| #else | |||
| int remain = outw; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| float32x4_t _k0 = vdupq_n_f32(k0); | |||
| float32x4_t _k1 = vdupq_n_f32(k1); | |||
| float32x4_t _k2 = vdupq_n_f32(k2); | |||
| float32x4_t _k3 = vdupq_n_f32(k3); | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4x2_t _px2 = vld2q_f32(r0); | |||
| float32x4_t _p = _px2.val[0]; | |||
| float32x4_t _outp = vld1q_f32(outptr); | |||
| float32x4x2_t _pnx2 = vld2q_f32(r0+8); | |||
| float32x4_t _pn = _pnx2.val[0]; | |||
| float32x4_t _outpn = vld1q_f32(outptr+4); | |||
| _outp = vmlaq_f32(_outp, _p, _k0); | |||
| _outpn = vmlaq_f32(_outpn, _pn, _k0); | |||
| float32x4x2_t _p1x2 = vld2q_f32(r1); | |||
| float32x4_t _p1 = _p1x2.val[0]; | |||
| float32x4x2_t _p1nx2 = vld2q_f32(r1+8); | |||
| float32x4_t _p1n = _p1nx2.val[0]; | |||
| _outp = vmlaq_f32(_outp, _p1, _k1); | |||
| _outpn = vmlaq_f32(_outpn, _p1n, _k1); | |||
| float32x4x2_t _p2x2 = vld2q_f32(r2); | |||
| float32x4_t _p2 = _p2x2.val[0]; | |||
| float32x4x2_t _p2nx2 = vld2q_f32(r2+8); | |||
| float32x4_t _p2n = _p2nx2.val[0]; | |||
| _outp = vmlaq_f32(_outp, _p2, _k2); | |||
| _outpn = vmlaq_f32(_outpn, _p2n, _k2); | |||
| float32x4x2_t _p3x2 = vld2q_f32(r3); | |||
| float32x4_t _p3 = _p3x2.val[0]; | |||
| float32x4x2_t _p3nx2 = vld2q_f32(r3+8); | |||
| float32x4_t _p3n = _p3nx2.val[0]; | |||
| _outp = vmlaq_f32(_outp, _p3, _k3); | |||
| _outpn = vmlaq_f32(_outpn, _p3n, _k3); | |||
| vst1q_f32(outptr, _outp); | |||
| vst1q_f32(outptr+8, _outpn); | |||
| r0 += 16; | |||
| r1 += 16; | |||
| r2 += 16; | |||
| r3 += 16; | |||
| outptr += 8; | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| "pld [%2, #512] \n" | |||
| "vld2.f32 {d4-d7}, [%2]! \n" | |||
| "vld2.f32 {d16-d19}, [%2]! \n" | |||
| "0: \n" | |||
| "pld [%1, #256] \n" | |||
| "vld1.f32 {d0-d3}, [%1] \n" | |||
| "vmla.f32 q0, q2, %q12 \n" | |||
| "vmla.f32 q1, q8, %q12 \n" | |||
| "pld [%3, #512] \n" | |||
| "vld2.f32 {d4-d7}, [%3]! \n" | |||
| "vld2.f32 {d16-d19}, [%3]! \n" | |||
| "vmla.f32 q0, q2, %q13 \n" | |||
| "vmla.f32 q1, q8, %q13 \n" | |||
| "pld [%4, #512] \n" | |||
| "vld2.f32 {d4-d7}, [%4]! \n" | |||
| "vld2.f32 {d16-d19}, [%4]! \n" | |||
| "vmla.f32 q0, q2, %q14 \n" | |||
| "vmla.f32 q1, q8, %q14 \n" | |||
| "pld [%5, #512] \n" | |||
| "vld2.f32 {d4-d7}, [%5]! \n" | |||
| "vld2.f32 {d16-d19}, [%5]! \n" | |||
| "vmla.f32 q0, q2, %q15 \n" | |||
| "vmla.f32 q1, q8, %q15 \n" | |||
| "pld [%2, #512] \n" | |||
| "vld2.f32 {d4-d7}, [%2]! \n" | |||
| "vld2.f32 {d16-d19}, [%2]! \n" | |||
| "subs %0, #1 \n" | |||
| "vst1.f32 {d0-d3}, [%1]! \n" | |||
| "bne 0b \n" | |||
| "sub %2, #64 \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(outptr), // %1 | |||
| "=r"(r0), // %2 | |||
| "=r"(r1), // %3 | |||
| "=r"(r2), // %4 | |||
| "=r"(r3) // %5 | |||
| : "0"(nn), | |||
| "1"(outptr), | |||
| "2"(r0), | |||
| "3"(r1), | |||
| "4"(r2), | |||
| "5"(r3), | |||
| "w"(_k0), // %12 | |||
| "w"(_k1), // %13 | |||
| "w"(_k2), // %14 | |||
| "w"(_k3) // %15 | |||
| : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9" | |||
| ); | |||
| } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| float sum = *r0 * k0; | |||
| float sum1 = *r1 * k1; | |||
| float sum2 = *r2 * k2; | |||
| float sum3 = *r3 * k3; | |||
| *outptr += sum + sum1 + sum2 + sum3; | |||
| r0 += 2; | |||
| r1 += 2; | |||
| r2 += 2; | |||
| r3 += 2; | |||
| outptr++; | |||
| } | |||
| r0 += tailstep; | |||
| r1 += tailstep; | |||
| r2 += tailstep; | |||
| r3 += tailstep; | |||
| } | |||
| } | |||
| for (; q<inch; q++) | |||
| { | |||
| float* outptr = out; | |||
| const float* img0 = bottom_blob.channel(q); | |||
| const float* kernel0 = kernel + p*inch + q; | |||
| const float k0 = kernel0[0]; | |||
| const float* r0 = img0; | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| #if __ARM_NEON | |||
| int nn = outw >> 3; | |||
| int remain = outw & 7; | |||
| #else | |||
| int remain = outw; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| float32x4_t _k0 = vdupq_n_f32(k0); | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4x2_t _px2 = vld2q_f32(r0); | |||
| float32x4_t _p = _px2.val[0]; | |||
| float32x4_t _outp = vld1q_f32(outptr); | |||
| float32x4x2_t _pnx2 = vld2q_f32(r0+8); | |||
| float32x4_t _pn = _pnx2.val[0]; | |||
| float32x4_t _outpn = vld1q_f32(outptr+4); | |||
| _outp = vmlaq_f32(_outp, _p, _k0); | |||
| _outpn = vmlaq_f32(_outpn, _pn, _k0); | |||
| vst1q_f32(outptr, _outp); | |||
| vst1q_f32(outptr+4, _outpn); | |||
| r0 += 16; | |||
| outptr += 8; | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| "pld [%2, #512] \n" | |||
| "vld2.f32 {d4-d7}, [%2]! \n" | |||
| "vld2.f32 {d16-d19}, [%2]! \n" | |||
| "0: \n" | |||
| "pld [%1, #256] \n" | |||
| "vld1.f32 {d0-d3}, [%1] \n" | |||
| "vmla.f32 q0, q2, %q6 \n" | |||
| "vmla.f32 q1, q8, %q6 \n" | |||
| "pld [%2, #512] \n" | |||
| "vld2.f32 {d4-d7}, [%2]! \n" | |||
| "vld2.f32 {d16-d19}, [%2]! \n" | |||
| "subs %0, #1 \n" | |||
| "vst1.f32 {d0-d3}, [%1]! \n" | |||
| "bne 0b \n" | |||
| "sub %2, #64 \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(outptr), // %1 | |||
| "=r"(r0) // %2 | |||
| : "0"(nn), | |||
| "1"(outptr), | |||
| "2"(r0), | |||
| "w"(_k0) // %6 | |||
| : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9" | |||
| ); | |||
| } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| float sum = *r0 * k0; | |||
| *outptr += sum; | |||
| r0 += 2; | |||
| outptr++; | |||
| } | |||
| r0 += tailstep; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| @@ -0,0 +1,381 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #if __ARM_NEON | |||
| #include <arm_neon.h> | |||
| #endif // __ARM_NEON | |||
| static void conv2x2s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int inch = bottom_blob.c; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| int outch = top_blob.c; | |||
| const float* kernel = _kernel; | |||
| const float* bias = _bias; | |||
| #pragma omp parallel for | |||
| for (int p=0; p<outch; p++) | |||
| { | |||
| Mat out = top_blob.channel(p); | |||
| const float bias0 = bias ? bias[p] : 0.f; | |||
| out.fill(bias0); | |||
| int q = 0; | |||
| for (; q+1<inch; q+=2) | |||
| { | |||
| float* outptr = out; | |||
| const float* img0 = bottom_blob.channel(q); | |||
| const float* img1 = bottom_blob.channel(q+1); | |||
| const float* kernel0 = kernel + p*inch*4 + q*4; | |||
| const float* kernel1 = kernel0 + 4; | |||
| const float* r00 = img0; | |||
| const float* r01 = img0 + w; | |||
| const float* r10 = img1; | |||
| const float* r11 = img1 + w; | |||
| #if __ARM_NEON | |||
| float32x4_t _k0 = vld1q_f32(kernel0); | |||
| float32x4_t _k1 = vld1q_f32(kernel1); | |||
| #endif // __ARM_NEON | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| #if __ARM_NEON | |||
| int nn = outw >> 2; | |||
| int remain = outw & 3; | |||
| #else | |||
| int remain = outw; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _r000 = vld1q_f32(r00); | |||
| float32x4_t _r010 = vld1q_f32(r01); | |||
| float32x4_t _r001 = vld1q_f32(r00 + 1); | |||
| float32x4_t _r011 = vld1q_f32(r01 + 1); | |||
| float32x4_t _r100 = vld1q_f32(r10); | |||
| float32x4_t _r110 = vld1q_f32(r11); | |||
| float32x4_t _r101 = vld1q_f32(r10 + 1); | |||
| float32x4_t _r111 = vld1q_f32(r11 + 1); | |||
| float32x4_t _sum = vld1q_f32(outptr); | |||
| _sum = vmlaq_lane_f32(_sum, _r000, vget_low_f32(_k0), 0); | |||
| _sum = vmlaq_lane_f32(_sum, _r001, vget_low_f32(_k0), 1); | |||
| _sum = vmlaq_lane_f32(_sum, _r010, vget_high_f32(_k0), 0); | |||
| _sum = vmlaq_lane_f32(_sum, _r011, vget_high_f32(_k0), 1); | |||
| _sum = vmlaq_lane_f32(_sum, _r100, vget_low_f32(_k1), 0); | |||
| _sum = vmlaq_lane_f32(_sum, _r101, vget_low_f32(_k1), 1); | |||
| _sum = vmlaq_lane_f32(_sum, _r110, vget_high_f32(_k1), 0); | |||
| _sum = vmlaq_lane_f32(_sum, _r111, vget_high_f32(_k1), 1); | |||
| vst1q_f32(outptr, _sum); | |||
| r00 += 4; | |||
| r01 += 4; | |||
| r10 += 4; | |||
| r11 += 4; | |||
| outptr += 4; | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| "pld [%1, #128] \n" | |||
| "vld1.f32 {d0-d1}, [%1]! \n" | |||
| "pld [%2, #128] \n" | |||
| "vld1.f32 {d4-d5}, [%2]! \n" | |||
| "pld [%3, #128] \n" | |||
| "vld1.f32 {d24-d25}, [%3]! \n" | |||
| "pld [%4, #128] \n" | |||
| "vld1.f32 {d28-d29}, [%4]! \n" | |||
| "0: \n" | |||
| "pld [%5, #128] \n" | |||
| "vld1.f32 {d18-d19}, [%5] \n"// q9 = sum | |||
| "vmul.f32 q8, q0, %e12[0] \n" | |||
| "vmla.f32 q9, q2, %f12[0] \n" | |||
| "pld [%1, #128] \n" | |||
| "vld1.f32 {d2-d3}, [%1]! \n" | |||
| "pld [%2, #128] \n" | |||
| "vld1.f32 {d6-d7}, [%2]! \n" | |||
| "vext.f32 q10, q0, q1, #1 \n" | |||
| "vext.f32 q11, q2, q3, #1 \n" | |||
| "vmla.f32 q8, q12, %e13[0] \n" | |||
| "vmla.f32 q9, q14, %f13[0] \n" | |||
| "pld [%3, #128] \n" | |||
| "vld1.f32 {d26-d27}, [%3]! \n" | |||
| "pld [%4, #128] \n" | |||
| "vld1.f32 {d30-d31}, [%4]! \n" | |||
| "vmla.f32 q8, q10, %e12[1] \n" | |||
| "vmla.f32 q9, q11, %f12[1] \n" | |||
| "vext.f32 q10, q12, q13, #1 \n" | |||
| "vext.f32 q11, q14, q15, #1 \n" | |||
| "vmla.f32 q8, q10, %e13[1] \n" | |||
| "vmla.f32 q9, q11, %f13[1] \n" | |||
| "vorr q0, q1, q1 \n" | |||
| "vorr q2, q3, q3 \n" | |||
| "vadd.f32 q8, q8, q9 \n" | |||
| "vorr q12, q13, q13 \n" | |||
| "vorr q14, q15, q15 \n" | |||
| "subs %0, #1 \n" | |||
| "vst1.f32 {d16-d17}, [%5]! \n" | |||
| "bne 0b \n" | |||
| "sub %1, #16 \n" | |||
| "sub %2, #16 \n" | |||
| "sub %3, #16 \n" | |||
| "sub %4, #16 \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(r00), // %1 | |||
| "=r"(r01), // %2 | |||
| "=r"(r10), // %3 | |||
| "=r"(r11), // %4 | |||
| "=r"(outptr) // %5 | |||
| : "0"(nn), | |||
| "1"(r00), | |||
| "2"(r01), | |||
| "3"(r10), | |||
| "4"(r11), | |||
| "5"(outptr), | |||
| "w"(_k0), // %12 | |||
| "w"(_k1) // %13 | |||
| : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" | |||
| ); | |||
| } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| #if __ARM_NEON | |||
| float32x2_t _r00 = vld1_f32(r00); | |||
| float32x2_t _r01 = vld1_f32(r01); | |||
| float32x4_t _r00r1 = vcombine_f32(_r00, _r01); | |||
| float32x4_t _s0s1 = vmulq_f32(_r00r1, _k0); | |||
| float32x2_t _r10 = vld1_f32(r10); | |||
| float32x2_t _r11 = vld1_f32(r11); | |||
| float32x4_t _r10r1 = vcombine_f32(_r10, _r11); | |||
| _s0s1 = vmlaq_f32(_s0s1, _r10r1, _k1); | |||
| float32x2_t _s = vadd_f32(vget_low_f32(_s0s1), vget_high_f32(_s0s1)); | |||
| _s = vpadd_f32(_s, _s); | |||
| *outptr += vget_lane_f32(_s, 0); | |||
| #else | |||
| float sum = 0.f; | |||
| sum += r00[0] * kernel0[0]; | |||
| sum += r00[1] * kernel0[1]; | |||
| sum += r01[0] * kernel0[2]; | |||
| sum += r01[1] * kernel0[3]; | |||
| sum += r10[0] * kernel1[0]; | |||
| sum += r10[1] * kernel1[1]; | |||
| sum += r11[0] * kernel1[2]; | |||
| sum += r11[1] * kernel1[3]; | |||
| *outptr += sum; | |||
| #endif // __ARM_NEON | |||
| r00 += 1; | |||
| r01 += 1; | |||
| r10 += 1; | |||
| r11 += 1; | |||
| outptr++; | |||
| } | |||
| r00 += 1; | |||
| r01 += 1; | |||
| r10 += 1; | |||
| r11 += 1; | |||
| } | |||
| } | |||
| for (; q<inch; q++) | |||
| { | |||
| float* outptr = out; | |||
| const float* img0 = bottom_blob.channel(q); | |||
| const float* kernel0 = kernel + p*inch*4 + q*4; | |||
| const float* r0 = img0; | |||
| const float* r1 = img0 + w; | |||
| #if __ARM_NEON | |||
| float32x4_t _k0 = vdupq_n_f32(kernel0[0]); | |||
| float32x4_t _k1 = vdupq_n_f32(kernel0[1]); | |||
| float32x4_t _k2 = vdupq_n_f32(kernel0[2]); | |||
| float32x4_t _k3 = vdupq_n_f32(kernel0[3]); | |||
| #endif // __ARM_NEON | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| #if __ARM_NEON | |||
| int nn = outw >> 2; | |||
| int remain = outw & 3; | |||
| #else | |||
| int remain = outw; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _r00 = vld1q_f32(r0); | |||
| float32x4_t _r10 = vld1q_f32(r1); | |||
| float32x4_t _r01 = vld1q_f32(r0 + 1); | |||
| float32x4_t _r11 = vld1q_f32(r1 + 1); | |||
| float32x4_t _sum = vld1q_f32(outptr); | |||
| float32x4_t _sum2; | |||
| _sum = vmlaq_f32(_sum, _r00, _k0); | |||
| _sum2 = vmulq_f32(_r01, _k1); | |||
| _sum = vmlaq_f32(_sum, _r10, _k2); | |||
| _sum2 = vmlaq_f32(_sum2, _r11, _k3); | |||
| _sum = vaddq_f32(_sum, _sum2); | |||
| vst1q_f32(outptr, _sum); | |||
| r0 += 4; | |||
| r1 += 4; | |||
| outptr += 4; | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| "pld [%1, #128] \n" | |||
| "vld1.f32 {d0-d1}, [%1]! \n" | |||
| "pld [%2, #128] \n" | |||
| "vld1.f32 {d4-d5}, [%2]! \n" | |||
| "0: \n" | |||
| "pld [%3, #128] \n" | |||
| "vld1.f32 {d18-d19}, [%3] \n"// q9 = sum | |||
| "vmul.f32 q8, q0, %q8 \n" | |||
| "vmla.f32 q9, q2, %q10 \n" | |||
| "pld [%1, #128] \n" | |||
| "vld1.f32 {d2-d3}, [%1]! \n" | |||
| "vext.f32 q10, q0, q1, #1 \n" | |||
| "vmla.f32 q8, q10, %q9 \n" | |||
| "pld [%2, #128] \n" | |||
| "vld1.f32 {d6-d7}, [%2]! \n" | |||
| "vext.f32 q11, q2, q3, #1 \n" | |||
| "vmla.f32 q9, q11, %q11 \n" | |||
| "vorr q0, q1, q1 \n" | |||
| "vadd.f32 q8, q8, q9 \n" | |||
| "vorr q2, q3, q3 \n" | |||
| "subs %0, #1 \n" | |||
| "vst1.f32 {d16-d17}, [%3]! \n" | |||
| "bne 0b \n" | |||
| "sub %1, #16 \n" | |||
| "sub %2, #16 \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(r0), // %1 | |||
| "=r"(r1), // %2 | |||
| "=r"(outptr) // %3 | |||
| : "0"(nn), | |||
| "1"(r0), | |||
| "2"(r1), | |||
| "3"(outptr), | |||
| "w"(_k0), // %8 | |||
| "w"(_k1), // %9 | |||
| "w"(_k2), // %10 | |||
| "w"(_k3) // %11 | |||
| : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" | |||
| ); | |||
| } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| float32x4_t _k0123 = vld1q_f32(kernel0); | |||
| #endif | |||
| for (; remain>0; remain--) | |||
| { | |||
| #if __ARM_NEON | |||
| float32x2_t _r0 = vld1_f32(r0); | |||
| float32x2_t _r1 = vld1_f32(r1); | |||
| float32x4_t _r0r1 = vcombine_f32(_r0, _r1); | |||
| float32x4_t _s0s1 = vmulq_f32(_r0r1, _k0123); | |||
| float32x2_t _s = vadd_f32(vget_low_f32(_s0s1), vget_high_f32(_s0s1)); | |||
| _s = vpadd_f32(_s, _s); | |||
| *outptr += vget_lane_f32(_s, 0); | |||
| #else | |||
| float sum = 0.f; | |||
| sum += r0[0] * kernel0[0]; | |||
| sum += r0[1] * kernel0[1]; | |||
| sum += r1[0] * kernel0[2]; | |||
| sum += r1[1] * kernel0[3]; | |||
| *outptr += sum; | |||
| #endif | |||
| r0 += 1; | |||
| r1 += 1; | |||
| outptr++; | |||
| } | |||
| r0 += 1; | |||
| r1 += 1; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| @@ -0,0 +1,753 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #if __ARM_NEON | |||
| #include <arm_neon.h> | |||
| #endif // __ARM_NEON | |||
| static void conv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int inch = bottom_blob.c; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| int outch = top_blob.c; | |||
| const float* kernel = _kernel; | |||
| const float* bias = _bias; | |||
| #pragma omp parallel for | |||
| for (int p=0; p<outch; p++) | |||
| { | |||
| Mat out = top_blob.channel(p); | |||
| const float bias0 = bias ? bias[p] : 0.f; | |||
| out.fill(bias0); | |||
| const float* kernel0 = kernel + p*inch*9; | |||
| for (int q=0; q<inch; q++) | |||
| { | |||
| float* outptr = out; | |||
| float* outptr2 = outptr + outw; | |||
| const float* img0 = bottom_blob.channel(q); | |||
| const float* r0 = img0; | |||
| const float* r1 = img0 + w; | |||
| const float* r2 = img0 + w*2; | |||
| const float* r3 = img0 + w*3; | |||
| const float* k0 = kernel0; | |||
| const float* k1 = kernel0 + 3; | |||
| const float* k2 = kernel0 + 6; | |||
| #if __ARM_NEON | |||
| float32x4_t _k0123 = vld1q_f32(kernel0); | |||
| float32x4_t _k3456 = vld1q_f32(kernel0+3); | |||
| float32x4_t _k6789 = vld1q_f32(kernel0+6); | |||
| #endif // __ARM_NEON | |||
| int i = 0; | |||
| for (; i+1 < outh; i+=2) | |||
| { | |||
| #if __ARM_NEON | |||
| int nn = outw >> 2; | |||
| int remain = outw & 3; | |||
| #else | |||
| int remain = outw; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _sum1 = vld1q_f32(outptr); | |||
| float32x4_t _sum2 = vdupq_n_f32(0.f); | |||
| float32x4_t _sum3 = vld1q_f32(outptr2); | |||
| float32x4_t _sum4 = vdupq_n_f32(0.f); | |||
| float32x4_t _r00 = vld1q_f32(r0); | |||
| float32x4_t _r00n = vld1q_f32(r0 + 4); | |||
| float32x4_t _r01 = vextq_f32(_r00, _r00n, 1); | |||
| float32x4_t _r02 = vextq_f32(_r00, _r00n, 2); | |||
| float32x4_t _r10 = vld1q_f32(r1); | |||
| float32x4_t _r10n = vld1q_f32(r1 + 4); | |||
| float32x4_t _r11 = vextq_f32(_r10, _r10n, 1); | |||
| float32x4_t _r12 = vextq_f32(_r10, _r10n, 2); | |||
| float32x4_t _r20 = vld1q_f32(r2); | |||
| float32x4_t _r20n = vld1q_f32(r2 + 4); | |||
| float32x4_t _r21 = vextq_f32(_r20, _r20n, 1); | |||
| float32x4_t _r22 = vextq_f32(_r20, _r20n, 2); | |||
| float32x4_t _r30 = vld1q_f32(r3); | |||
| float32x4_t _r30n = vld1q_f32(r3 + 4); | |||
| float32x4_t _r31 = vextq_f32(_r30, _r30n, 1); | |||
| float32x4_t _r32 = vextq_f32(_r30, _r30n, 2); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r00, _k0123, 0); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r01, _k0123, 1); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r02, _k0123, 2); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r10, _k3456, 0); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r11, _k3456, 1); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r12, _k3456, 2); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r20, _k6789, 0); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r21, _k6789, 1); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r22, _k6789, 2); | |||
| _sum3 = vfmaq_laneq_f32(_sum3, _r10, _k0123, 0); | |||
| _sum4 = vfmaq_laneq_f32(_sum4, _r11, _k0123, 1); | |||
| _sum3 = vfmaq_laneq_f32(_sum3, _r12, _k0123, 2); | |||
| _sum4 = vfmaq_laneq_f32(_sum4, _r20, _k3456, 0); | |||
| _sum3 = vfmaq_laneq_f32(_sum3, _r21, _k3456, 1); | |||
| _sum4 = vfmaq_laneq_f32(_sum4, _r22, _k3456, 2); | |||
| _sum3 = vfmaq_laneq_f32(_sum3, _r30, _k6789, 0); | |||
| _sum4 = vfmaq_laneq_f32(_sum4, _r31, _k6789, 1); | |||
| _sum3 = vfmaq_laneq_f32(_sum3, _r32, _k6789, 2); | |||
| _sum1 = vaddq_f32(_sum1, _sum2); | |||
| _sum3 = vaddq_f32(_sum3, _sum4); | |||
| vst1q_f32(outptr, _sum1); | |||
| vst1q_f32(outptr2, _sum3); | |||
| r0 += 4; | |||
| r1 += 4; | |||
| r2 += 4; | |||
| r3 += 4; | |||
| outptr += 4; | |||
| outptr2 += 4; | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| "veor q6, q6 \n" | |||
| "veor q15, q15 \n" | |||
| "pld [%3, #192] \n" | |||
| "vld1.f32 {d18-d20}, [%3 :64] \n"// r0 | |||
| "add %3, #16 \n" | |||
| "veor q13, q13 \n" | |||
| "veor q14, q14 \n" | |||
| "vext.32 q11, q9, q10, #1 \n" | |||
| "vext.32 q12, q9, q10, #2 \n" | |||
| "0: \n" | |||
| "pld [%1, #128] \n" | |||
| "vld1.f32 {d14-d15}, [%1 :64] \n"// _sum | |||
| "vmla.f32 q7, q9, %e14[0] \n" | |||
| "vmla.f32 q6, q11, %e14[1] \n" | |||
| "vmla.f32 q13, q12, %f14[0] \n" | |||
| "pld [%4, #192] \n" | |||
| "vld1.f32 {d18-d20}, [%4] \n"// r1 | |||
| "add %4, #16 \n" | |||
| "vmla.f32 q7, q9, %e15[0] \n" | |||
| "vext.32 q11, q9, q10, #1 \n" | |||
| "vext.32 q12, q9, q10, #2 \n" | |||
| "vmla.f32 q6, q11, %e15[1] \n" | |||
| "vmla.f32 q13, q12, %f15[0] \n" | |||
| "pld [%2, #128] \n" | |||
| "vld1.f32 {d16-d17}, [%2] \n"// _sum2 | |||
| "vmla.f32 q8, q9, %e14[0] \n" | |||
| "vmla.f32 q14, q11, %e14[1] \n" | |||
| "vmla.f32 q15, q12, %f14[0] \n" | |||
| "pld [%5, #192] \n" | |||
| "vld1.f32 {d18-d20}, [%5 :64] \n"// r2 | |||
| "add %5, #16 \n" | |||
| "vmla.f32 q7, q9, %e16[0] \n" | |||
| "vext.32 q11, q9, q10, #1 \n" | |||
| "vext.32 q12, q9, q10, #2 \n" | |||
| "vmla.f32 q6, q11, %e16[1] \n" | |||
| "vmla.f32 q13, q12, %f16[0] \n" | |||
| "vmla.f32 q8, q9, %e15[0] \n" | |||
| "vmla.f32 q14, q11, %e15[1] \n" | |||
| "vmla.f32 q15, q12, %f15[0] \n" | |||
| "pld [%6, #192] \n" | |||
| "vld1.f32 {d18-d20}, [%6] \n"// r3 | |||
| "add %6, #16 \n" | |||
| "vmla.f32 q8, q9, %e16[0] \n" | |||
| "vext.32 q11, q9, q10, #1 \n" | |||
| "vext.32 q12, q9, q10, #2 \n" | |||
| "vmla.f32 q14, q11, %e16[1] \n" | |||
| "vmla.f32 q15, q12, %f16[0] \n" | |||
| "vadd.f32 q7, q7, q6 \n" | |||
| "veor q6, q6 \n" | |||
| "pld [%3, #192] \n" | |||
| "vld1.f32 {d18-d20}, [%3 :64] \n"// r0 | |||
| "vadd.f32 q8, q8, q14 \n" | |||
| "veor q14, q14 \n" | |||
| "vadd.f32 q7, q7, q13 \n" | |||
| "veor q13, q13 \n" | |||
| "vadd.f32 q8, q8, q15 \n" | |||
| "veor q15, q15 \n" | |||
| "vext.32 q11, q9, q10, #1 \n" | |||
| "vext.32 q12, q9, q10, #2 \n" | |||
| "add %3, #16 \n" | |||
| "vst1.f32 {d14-d15}, [%1]! \n" | |||
| "vst1.f32 {d16-d17}, [%2]! \n" | |||
| "subs %0, #1 \n" | |||
| "bne 0b \n" | |||
| "sub %3, #16 \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(outptr), // %1 | |||
| "=r"(outptr2), // %2 | |||
| "=r"(r0), // %3 | |||
| "=r"(r1), // %4 | |||
| "=r"(r2), // %5 | |||
| "=r"(r3) // %6 | |||
| : "0"(nn), | |||
| "1"(outptr), | |||
| "2"(outptr2), | |||
| "3"(r0), | |||
| "4"(r1), | |||
| "5"(r2), | |||
| "6"(r3), | |||
| "w"(_k0123), // %14 | |||
| "w"(_k3456), // %15 | |||
| "w"(_k6789) // %16 | |||
| : "cc", "memory", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" | |||
| ); | |||
| } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| #if __ARM_NEON | |||
| float32x4_t _r00 = vld1q_f32(r0); | |||
| float32x4_t _r10 = vld1q_f32(r1); | |||
| float32x4_t _r20 = vld1q_f32(r2); | |||
| float32x4_t _r30 = vld1q_f32(r3); | |||
| float32x4_t _sum = vmulq_f32(_r00, _k0123); | |||
| _sum = vmlaq_f32(_sum, _r10, _k3456); | |||
| _sum = vmlaq_f32(_sum, _r20, _k6789); | |||
| float32x4_t _sum2 = vmulq_f32(_r10, _k0123); | |||
| _sum2 = vmlaq_f32(_sum2, _r20, _k3456); | |||
| _sum2 = vmlaq_f32(_sum2, _r30, _k6789); | |||
| _sum = vsetq_lane_f32(*outptr, _sum, 3); | |||
| _sum2 = vsetq_lane_f32(*outptr2, _sum2, 3); | |||
| #if __aarch64__ | |||
| *outptr = vaddvq_f32(_sum); | |||
| *outptr2 = vaddvq_f32(_sum2); | |||
| #else | |||
| float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum)); | |||
| float32x2_t _ss2 = vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2)); | |||
| float32x2_t _sss2 = vpadd_f32(_ss, _ss2); | |||
| *outptr = vget_lane_f32(_sss2, 0); | |||
| *outptr2 = vget_lane_f32(_sss2, 1); | |||
| #endif // __aarch64__ | |||
| #else | |||
| float sum = 0; | |||
| float sum2 = 0; | |||
| sum += r0[0] * k0[0]; | |||
| sum += r0[1] * k0[1]; | |||
| sum += r0[2] * k0[2]; | |||
| sum += r1[0] * k1[0]; | |||
| sum += r1[1] * k1[1]; | |||
| sum += r1[2] * k1[2]; | |||
| sum += r2[0] * k2[0]; | |||
| sum += r2[1] * k2[1]; | |||
| sum += r2[2] * k2[2]; | |||
| sum2 += r1[0] * k0[0]; | |||
| sum2 += r1[1] * k0[1]; | |||
| sum2 += r1[2] * k0[2]; | |||
| sum2 += r2[0] * k1[0]; | |||
| sum2 += r2[1] * k1[1]; | |||
| sum2 += r2[2] * k1[2]; | |||
| sum2 += r3[0] * k2[0]; | |||
| sum2 += r3[1] * k2[1]; | |||
| sum2 += r3[2] * k2[2]; | |||
| *outptr += sum; | |||
| *outptr2 += sum2; | |||
| #endif | |||
| r0++; | |||
| r1++; | |||
| r2++; | |||
| r3++; | |||
| outptr++; | |||
| outptr2++; | |||
| } | |||
| r0 += 2 + w; | |||
| r1 += 2 + w; | |||
| r2 += 2 + w; | |||
| r3 += 2 + w; | |||
| outptr += outw; | |||
| outptr2 += outw; | |||
| } | |||
| for (; i < outh; i++) | |||
| { | |||
| #if __ARM_NEON | |||
| int nn = outw >> 2; | |||
| int remain = outw & 3; | |||
| #else | |||
| int remain = outw; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _sum1 = vld1q_f32(outptr); | |||
| float32x4_t _sum2 = vdupq_n_f32(0.f); | |||
| float32x4_t _r00 = vld1q_f32(r0); | |||
| float32x4_t _r00n = vld1q_f32(r0 + 4); | |||
| float32x4_t _r01 = vextq_f32(_r00, _r00n, 1); | |||
| float32x4_t _r02 = vextq_f32(_r00, _r00n, 2); | |||
| float32x4_t _r10 = vld1q_f32(r1); | |||
| float32x4_t _r10n = vld1q_f32(r1 + 4); | |||
| float32x4_t _r11 = vextq_f32(_r10, _r10n, 1); | |||
| float32x4_t _r12 = vextq_f32(_r10, _r10n, 2); | |||
| float32x4_t _r20 = vld1q_f32(r2); | |||
| float32x4_t _r20n = vld1q_f32(r2 + 4); | |||
| float32x4_t _r21 = vextq_f32(_r20, _r20n, 1); | |||
| float32x4_t _r22 = vextq_f32(_r20, _r20n, 2); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r00, _k0123, 0); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r01, _k0123, 1); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r02, _k0123, 2); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r10, _k3456, 0); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r11, _k3456, 1); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r12, _k3456, 2); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r20, _k6789, 0); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r21, _k6789, 1); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r22, _k6789, 2); | |||
| _sum1 = vaddq_f32(_sum1, _sum2); | |||
| vst1q_f32(outptr, _sum1); | |||
| r0 += 4; | |||
| r1 += 4; | |||
| r2 += 4; | |||
| outptr += 4; | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| "pld [%2, #192] \n" | |||
| "vld1.f32 {d16-d18}, [%2] \n"// r0 | |||
| "add %2, #16 \n" | |||
| "veor q13, q13 \n" | |||
| "veor q14, q14 \n" | |||
| "vext.32 q10, q8, q9, #1 \n" | |||
| "vext.32 q11, q8, q9, #2 \n" | |||
| "0: \n" | |||
| "pld [%1, #128] \n" | |||
| "vld1.f32 {d14-d15}, [%1] \n"// _sum | |||
| "vmla.f32 q7, q8, %e10[0] \n" | |||
| "vmla.f32 q13, q10, %e10[1] \n" | |||
| "vmla.f32 q14, q11, %f10[0] \n" | |||
| "pld [%3, #192] \n" | |||
| "vld1.f32 {d16-d18}, [%3] \n"// r1 | |||
| "add %3, #16 \n" | |||
| "vmla.f32 q7, q8, %e11[0] \n" | |||
| "vext.32 q10, q8, q9, #1 \n" | |||
| "vext.32 q11, q8, q9, #2 \n" | |||
| "vmla.f32 q13, q10, %e11[1] \n" | |||
| "vmla.f32 q14, q11, %f11[0] \n" | |||
| "pld [%4, #192] \n" | |||
| "vld1.f32 {d16-d18}, [%4] \n"// r2 | |||
| "add %4, #16 \n" | |||
| "vmla.f32 q7, q8, %e12[0] \n" | |||
| "vext.32 q10, q8, q9, #1 \n" | |||
| "vext.32 q11, q8, q9, #2 \n" | |||
| "vmla.f32 q13, q10, %e12[1] \n" | |||
| "vmla.f32 q14, q11, %f12[0] \n" | |||
| "pld [%2, #192] \n" | |||
| "vld1.f32 {d16-d18}, [%2] \n"// r0 | |||
| "add %2, #16 \n" | |||
| "vadd.f32 q7, q7, q13 \n" | |||
| "veor q13, q13 \n" | |||
| "vadd.f32 q7, q7, q14 \n" | |||
| "veor q14, q14 \n" | |||
| "vext.32 q10, q8, q9, #1 \n" | |||
| "vext.32 q11, q8, q9, #2 \n" | |||
| "vst1.f32 {d14-d15}, [%1]! \n" | |||
| "subs %0, #1 \n" | |||
| "bne 0b \n" | |||
| "sub %2, #16 \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(outptr), // %1 | |||
| "=r"(r0), // %2 | |||
| "=r"(r1), // %3 | |||
| "=r"(r2) // %4 | |||
| : "0"(nn), | |||
| "1"(outptr), | |||
| "2"(r0), | |||
| "3"(r1), | |||
| "4"(r2), | |||
| "w"(_k0123), // %10 | |||
| "w"(_k3456), // %11 | |||
| "w"(_k6789) // %12 | |||
| : "cc", "memory", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" | |||
| ); | |||
| } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| #if __ARM_NEON | |||
| float32x4_t _r00 = vld1q_f32(r0); | |||
| float32x4_t _r10 = vld1q_f32(r1); | |||
| float32x4_t _r20 = vld1q_f32(r2); | |||
| float32x4_t _sum = vmulq_f32(_r00, _k0123); | |||
| _sum = vmlaq_f32(_sum, _r10, _k3456); | |||
| _sum = vmlaq_f32(_sum, _r20, _k6789); | |||
| _sum = vsetq_lane_f32(*outptr, _sum, 3); | |||
| #if __aarch64__ | |||
| *outptr = vaddvq_f32(_sum); | |||
| #else | |||
| float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum)); | |||
| _ss = vpadd_f32(_ss, _ss); | |||
| *outptr = vget_lane_f32(_ss, 0); | |||
| #endif // __aarch64__ | |||
| #else | |||
| float sum = 0; | |||
| sum += r0[0] * k0[0]; | |||
| sum += r0[1] * k0[1]; | |||
| sum += r0[2] * k0[2]; | |||
| sum += r1[0] * k1[0]; | |||
| sum += r1[1] * k1[1]; | |||
| sum += r1[2] * k1[2]; | |||
| sum += r2[0] * k2[0]; | |||
| sum += r2[1] * k2[1]; | |||
| sum += r2[2] * k2[2]; | |||
| *outptr += sum; | |||
| #endif | |||
| r0++; | |||
| r1++; | |||
| r2++; | |||
| outptr++; | |||
| } | |||
| r0 += 2; | |||
| r1 += 2; | |||
| r2 += 2; | |||
| } | |||
| kernel0 += 9; | |||
| } | |||
| } | |||
| } | |||
| static void conv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int inch = bottom_blob.c; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| int outch = top_blob.c; | |||
| const int tailstep = w - 2*outw + w; | |||
| const float* kernel = _kernel; | |||
| const float* bias = _bias; | |||
| #pragma omp parallel for | |||
| for (int p=0; p<outch; p++) | |||
| { | |||
| Mat out = top_blob.channel(p); | |||
| const float bias0 = bias ? bias[p] : 0.f; | |||
| out.fill(bias0); | |||
| const float* kernel0 = kernel + p*inch*9; | |||
| for (int q=0; q<inch; q++) | |||
| { | |||
| float* outptr = out; | |||
| float* outptr2 = outptr + outw; | |||
| const float* img0 = bottom_blob.channel(q); | |||
| const float* r0 = img0; | |||
| const float* r1 = img0 + w; | |||
| const float* r2 = img0 + w*2; | |||
| const float* k0 = kernel0; | |||
| const float* k1 = kernel0 + 3; | |||
| const float* k2 = kernel0 + 6; | |||
| #if __ARM_NEON | |||
| float32x4_t _k0123 = vld1q_f32(k0); | |||
| float32x4_t _k3456 = vld1q_f32(k1); | |||
| float32x4_t _k6789 = vld1q_f32(k2); | |||
| #endif // __ARM_NEON | |||
| int i = 0; | |||
| for (; i < outh; i++) | |||
| { | |||
| #if __ARM_NEON | |||
| int nn = outw >> 2; | |||
| int remain = outw & 3; | |||
| #else | |||
| int remain = outw; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _outp = vld1q_f32(outptr); | |||
| float32x4x2_t _r0 = vld2q_f32(r0); | |||
| float32x4x2_t _r0n = vld2q_f32(r0+8); | |||
| float32x4_t _r00 = _r0.val[0];// 0 2 4 6 | |||
| float32x4_t _r01 = _r0.val[1];// 1 3 5 7 | |||
| float32x4_t _r02 = vextq_f32(_r00, _r0n.val[0], 1);// 2 4 6 8 | |||
| _outp = vfmaq_laneq_f32(_outp, _r00, _k0123, 0); | |||
| _outp = vfmaq_laneq_f32(_outp, _r01, _k0123, 1); | |||
| _outp = vfmaq_laneq_f32(_outp, _r02, _k0123, 2); | |||
| float32x4x2_t _r1 = vld2q_f32(r1); | |||
| float32x4x2_t _r1n = vld2q_f32(r1+8); | |||
| float32x4_t _r10 = _r1.val[0]; | |||
| float32x4_t _r11 = _r1.val[1]; | |||
| float32x4_t _r12 = vextq_f32(_r10, _r1n.val[0], 1); | |||
| _outp = vfmaq_laneq_f32(_outp, _r10, _k3456, 0); | |||
| _outp = vfmaq_laneq_f32(_outp, _r11, _k3456, 1); | |||
| _outp = vfmaq_laneq_f32(_outp, _r12, _k3456, 2); | |||
| float32x4x2_t _r2 = vld2q_f32(r2); | |||
| float32x4x2_t _r2n = vld2q_f32(r2+8); | |||
| float32x4_t _r20 = _r2.val[0]; | |||
| float32x4_t _r21 = _r2.val[1]; | |||
| float32x4_t _r22 = vextq_f32(_r20, _r2n.val[0], 1); | |||
| _outp = vfmaq_laneq_f32(_outp, _r20, _k6789, 0); | |||
| _outp = vfmaq_laneq_f32(_outp, _r21, _k6789, 1); | |||
| _outp = vfmaq_laneq_f32(_outp, _r22, _k6789, 2); | |||
| vst1q_f32(outptr, _outp); | |||
| r0 += 8; | |||
| r1 += 8; | |||
| r2 += 8; | |||
| outptr += 4; | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| "pld [%2, #256] \n" | |||
| "vld2.f32 {d4-d7}, [%2]! \n" | |||
| "veor q10, q10 \n" | |||
| "veor q11, q11 \n" | |||
| "0: \n" | |||
| "pld [%1, #128] \n" | |||
| "vld1.f32 {d0-d1}, [%1] \n" | |||
| "vmla.f32 q0, q2, %e10[0] \n" | |||
| "vmla.f32 q10, q3, %e10[1] \n" | |||
| "pld [%2, #256] \n" | |||
| "vld2.f32 {d16-d19}, [%2] \n" | |||
| "vext.32 q1, q2, q8, #1 \n" | |||
| "vmla.f32 q11, q1, %f10[0] \n" | |||
| "pld [%3, #256] \n" | |||
| "vld2.f32 {d4-d7}, [%3]! \n" | |||
| "vmla.f32 q0, q2, %e11[0] \n" | |||
| "vmla.f32 q10, q3, %e11[1] \n" | |||
| "pld [%3, #256] \n" | |||
| "vld2.f32 {d16-d19}, [%3] \n" | |||
| "vext.32 q1, q2, q8, #1 \n" | |||
| "vmla.f32 q11, q1, %f11[0] \n" | |||
| "pld [%4, #256] \n" | |||
| "vld2.f32 {d4-d7}, [%4]! \n" | |||
| "vmla.f32 q0, q2, %e12[0] \n" | |||
| "vmla.f32 q10, q3, %e12[1] \n" | |||
| "pld [%4, #256] \n" | |||
| "vld2.f32 {d16-d19}, [%4] \n" | |||
| "vext.32 q1, q2, q8, #1 \n" | |||
| "vmla.f32 q11, q1, %f12[0] \n" | |||
| "pld [%2, #256] \n" | |||
| "vld2.f32 {d4-d7}, [%2]! \n" | |||
| "vadd.f32 q0, q0, q10 \n" | |||
| "veor q10, q10 \n" | |||
| "vadd.f32 q0, q0, q11 \n" | |||
| "veor q11, q11 \n" | |||
| "subs %0, #1 \n" | |||
| "vst1.f32 {d0-d1}, [%1]! \n" | |||
| "bne 0b \n" | |||
| "sub %2, #32 \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(outptr), // %1 | |||
| "=r"(r0), // %2 | |||
| "=r"(r1), | |||
| "=r"(r2) | |||
| : "0"(nn), | |||
| "1"(outptr), | |||
| "2"(r0), | |||
| "3"(r1), | |||
| "4"(r2), | |||
| "w"(_k0123), // %10 | |||
| "w"(_k3456), // %11 | |||
| "w"(_k6789) // %12 | |||
| : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" | |||
| ); | |||
| } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| #if __ARM_NEON | |||
| float32x4_t _r00 = vld1q_f32(r0); | |||
| float32x4_t _r10 = vld1q_f32(r1); | |||
| float32x4_t _r20 = vld1q_f32(r2); | |||
| float32x4_t _sum = vmulq_f32(_r00, _k0123); | |||
| _sum = vmlaq_f32(_sum, _r10, _k3456); | |||
| _sum = vmlaq_f32(_sum, _r20, _k6789); | |||
| _sum = vsetq_lane_f32(*outptr, _sum, 3); | |||
| #if __aarch64__ | |||
| *outptr = vaddvq_f32(_sum); | |||
| #else | |||
| float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum)); | |||
| _ss = vpadd_f32(_ss, _ss); | |||
| *outptr = vget_lane_f32(_ss, 0); | |||
| #endif // __aarch64__ | |||
| #else | |||
| float sum = 0; | |||
| sum += r0[0] * k0[0]; | |||
| sum += r0[1] * k0[1]; | |||
| sum += r0[2] * k0[2]; | |||
| sum += r1[0] * k1[0]; | |||
| sum += r1[1] * k1[1]; | |||
| sum += r1[2] * k1[2]; | |||
| sum += r2[0] * k2[0]; | |||
| sum += r2[1] * k2[1]; | |||
| sum += r2[2] * k2[2]; | |||
| *outptr += sum; | |||
| #endif // __ARM_NEON | |||
| r0 += 2; | |||
| r1 += 2; | |||
| r2 += 2; | |||
| outptr++; | |||
| } | |||
| r0 += tailstep; | |||
| r1 += tailstep; | |||
| r2 += tailstep; | |||
| } | |||
| kernel0 += 9; | |||
| } | |||
| } | |||
| } | |||
| @@ -0,0 +1,340 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #if __ARM_NEON | |||
| #include <arm_neon.h> | |||
| #endif // __ARM_NEON | |||
| static void conv4x4s4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int inch = bottom_blob.c; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| int outch = top_blob.c; | |||
| const float* kernel = _kernel; | |||
| const float* bias = _bias; | |||
| #pragma omp parallel for | |||
| for (int p=0; p<outch; p++) | |||
| { | |||
| Mat out = top_blob.channel(p); | |||
| const float bias0 = bias ? bias[p] : 0.f; | |||
| out.fill(bias0); | |||
| for (int q=0; q<inch; q++) | |||
| { | |||
| float* outptr = out; | |||
| const float* img0 = bottom_blob.channel(q); | |||
| const float* kernel0 = kernel + p*inch*16 + q*16; | |||
| const float* r0 = img0; | |||
| const float* r1 = img0 + w; | |||
| const float* r2 = img0 + w*2; | |||
| const float* r3 = img0 + w*3; | |||
| #if __ARM_NEON | |||
| float32x4_t _k0123 = vld1q_f32(kernel0); | |||
| float32x4_t _k4567 = vld1q_f32(kernel0+4); | |||
| float32x4_t _k891011 = vld1q_f32(kernel0+8); | |||
| float32x4_t _k12131415 = vld1q_f32(kernel0+12); | |||
| #else | |||
| const float* k0 = kernel0; | |||
| const float* k1 = kernel0 + 4; | |||
| const float* k2 = kernel0 + 8; | |||
| const float* k3 = kernel0 + 12; | |||
| #endif // __ARM_NEON | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| #if __ARM_NEON | |||
| int nn = outw >> 2; | |||
| int remain = outw - (nn << 2); | |||
| #else | |||
| int remain = outw; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _r00 = vld1q_f32(r0); | |||
| float32x4_t _r10 = vld1q_f32(r1); | |||
| float32x4_t _r20 = vld1q_f32(r2); | |||
| float32x4_t _r30 = vld1q_f32(r3); | |||
| float32x4_t _r01 = vld1q_f32(r0 + 4); | |||
| float32x4_t _r11 = vld1q_f32(r1 + 4); | |||
| float32x4_t _r21 = vld1q_f32(r2 + 4); | |||
| float32x4_t _r31 = vld1q_f32(r3 + 4); | |||
| float32x4_t _r02 = vld1q_f32(r0 + 8); | |||
| float32x4_t _r12 = vld1q_f32(r1 + 8); | |||
| float32x4_t _r22 = vld1q_f32(r2 + 8); | |||
| float32x4_t _r32 = vld1q_f32(r3 + 8); | |||
| float32x4_t _r03 = vld1q_f32(r0 + 12); | |||
| float32x4_t _r13 = vld1q_f32(r1 + 12); | |||
| float32x4_t _r23 = vld1q_f32(r2 + 12); | |||
| float32x4_t _r33 = vld1q_f32(r3 + 12); | |||
| float32x4_t _sum0 = vmulq_f32(_r00, _k0123); | |||
| float32x4_t _sum1 = vmulq_f32(_r01, _k0123); | |||
| float32x4_t _sum2 = vmulq_f32(_r02, _k0123); | |||
| float32x4_t _sum3 = vmulq_f32(_r03, _k0123); | |||
| _sum0 = vfmaq_f32(_sum0, _r10, _k4567); | |||
| _sum1 = vfmaq_f32(_sum1, _r11, _k4567); | |||
| _sum2 = vfmaq_f32(_sum2, _r12, _k4567); | |||
| _sum3 = vfmaq_f32(_sum3, _r13, _k4567); | |||
| _sum0 = vfmaq_f32(_sum0, _r20, _k891011); | |||
| _sum1 = vfmaq_f32(_sum1, _r21, _k891011); | |||
| _sum2 = vfmaq_f32(_sum2, _r22, _k891011); | |||
| _sum3 = vfmaq_f32(_sum3, _r23, _k891011); | |||
| _sum0 = vfmaq_f32(_sum0, _r30, _k12131415); | |||
| _sum1 = vfmaq_f32(_sum1, _r31, _k12131415); | |||
| _sum2 = vfmaq_f32(_sum2, _r32, _k12131415); | |||
| _sum3 = vfmaq_f32(_sum3, _r33, _k12131415); | |||
| float32x4_t _s01 = vpaddq_f32(_sum0, _sum1); | |||
| float32x4_t _s23 = vpaddq_f32(_sum2, _sum3); | |||
| float32x4_t _sum = vpaddq_f32(_s01, _s23); | |||
| float32x4_t _outp = vld1q_f32(outptr); | |||
| _outp = vaddq_f32(_outp, _sum); | |||
| vst1q_f32(outptr, _sum); | |||
| r0 += 16; | |||
| r1 += 16; | |||
| r2 += 16; | |||
| r3 += 16; | |||
| outptr += 4; | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| "pld [%1, #128] \n" | |||
| "0: \n" | |||
| "pld [%2, #512] \n" | |||
| "pld [%3, #512] \n" | |||
| "vld1.f32 {d14-d15}, [%1] \n"// q7 = outptr | |||
| "vld1.f32 {d16-d17}, [%2]! \n"// q8 = r0 | |||
| "vld1.f32 {d18-d19}, [%3]! \n"// q9 = r1 | |||
| "pld [%4, #512] \n" | |||
| "pld [%5, #512] \n" | |||
| "vmul.f32 q12, q8, %q12 \n" | |||
| "vmul.f32 q13, q9, %q13 \n" | |||
| "vld1.f32 {d20-d21}, [%4]! \n"// q10 = r2 | |||
| "vld1.f32 {d22-d23}, [%5]! \n"// q11 = r3 | |||
| "vmla.f32 q12, q10, %q14 \n" | |||
| "vmla.f32 q13, q11, %q15 \n" | |||
| "vadd.f32 q5, q12, q13 \n" | |||
| "vld1.f32 {d16-d17}, [%2]! \n"// q8 = r0 | |||
| "vld1.f32 {d18-d19}, [%3]! \n"// q9 = r1 | |||
| "vmul.f32 q12, q8, %q12 \n" | |||
| "vmul.f32 q13, q9, %q13 \n" | |||
| "vld1.f32 {d20-d21}, [%4]! \n"// q10 = r2 | |||
| "vld1.f32 {d22-d23}, [%5]! \n"// q11 = r3 | |||
| "vmla.f32 q12, q10, %q14 \n" | |||
| "vmla.f32 q13, q11, %q15 \n" | |||
| "vadd.f32 q6, q12, q13 \n" | |||
| "vld1.f32 {d16-d17}, [%2]! \n"// q8 = r0 | |||
| "vld1.f32 {d18-d19}, [%3]! \n"// q9 = r1 | |||
| "vmul.f32 q12, q8, %q12 \n" | |||
| "vmul.f32 q13, q9, %q13 \n" | |||
| "vld1.f32 {d20-d21}, [%4]! \n"// q10 = r2 | |||
| "vld1.f32 {d22-d23}, [%5]! \n"// q11 = r3 | |||
| "vmla.f32 q12, q10, %q14 \n" | |||
| "vmla.f32 q13, q11, %q15 \n" | |||
| "vadd.f32 q14, q12, q13 \n" | |||
| "vld1.f32 {d16-d17}, [%2]! \n"// q8 = r0 | |||
| "vld1.f32 {d18-d19}, [%3]! \n"// q9 = r1 | |||
| "vmul.f32 q12, q8, %q12 \n" | |||
| "vmul.f32 q13, q9, %q13 \n" | |||
| "vld1.f32 {d20-d21}, [%4]! \n"// q10 = r2 | |||
| "vld1.f32 {d22-d23}, [%5]! \n"// q11 = r3 | |||
| "vmla.f32 q12, q10, %q14 \n" | |||
| "vmla.f32 q13, q11, %q15 \n" | |||
| "vadd.f32 q15, q12, q13 \n" | |||
| "vadd.f32 d10, d10, d11 \n" | |||
| "vadd.f32 d28, d28, d29 \n" | |||
| "vadd.f32 d11, d12, d13 \n" | |||
| "vadd.f32 d29, d30, d31 \n" | |||
| "vpadd.f32 d10, d10, d11 \n" | |||
| "vpadd.f32 d11, d28, d29 \n" | |||
| "vadd.f32 q7, q7, q5 \n" | |||
| "vst1.f32 {d14-d15}, [%1]! \n" | |||
| "pld [%1, #128] \n" | |||
| "subs %0, #1 \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(outptr), // %1 | |||
| "=r"(r0), // %2 | |||
| "=r"(r1), // %3 | |||
| "=r"(r2), // %4 | |||
| "=r"(r3) // %5 | |||
| : "0"(nn), | |||
| "1"(outptr), | |||
| "2"(r0), | |||
| "3"(r1), | |||
| "4"(r2), | |||
| "5"(r3), | |||
| "w"(_k0123), // %12 | |||
| "w"(_k4567), // %13 | |||
| "w"(_k891011), // %14 | |||
| "w"(_k12131415) // %15 | |||
| : "cc", "memory", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" | |||
| ); | |||
| } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| float32x4_t _r0 = vld1q_f32(r0); | |||
| float32x4_t _r1 = vld1q_f32(r1); | |||
| float32x4_t _r2 = vld1q_f32(r2); | |||
| float32x4_t _r3 = vld1q_f32(r3); | |||
| float32x4_t _sum = vmulq_f32(_r0, _k0123); | |||
| _sum = vmlaq_f32(_sum, _r1, _k4567); | |||
| _sum = vmlaq_f32(_sum, _r2, _k891011); | |||
| _sum = vmlaq_f32(_sum, _r3, _k12131415); | |||
| *outptr += vaddvq_f32(_sum); | |||
| #else | |||
| float sum = 0.f; | |||
| asm volatile( | |||
| "vld1.f32 {d16-d17}, [%0]! \n"// q8 = r0 | |||
| "vld1.f32 {d18-d19}, [%1]! \n"// q9 = r1 | |||
| "vmul.f32 q12, q8, %q9 \n" | |||
| "vmul.f32 q13, q9, %q10 \n" | |||
| "vld1.f32 {d20-d21}, [%2]! \n"// q10 = r2 | |||
| "vld1.f32 {d22-d23}, [%3]! \n"// q11 = r3 | |||
| "vmla.f32 q12, q10, %q11 \n" | |||
| "vmla.f32 q13, q11, %q12 \n" | |||
| "vadd.f32 q5, q12, q13 \n" | |||
| "vadd.f32 d10, d10, d11 \n" | |||
| "vpadd.f32 d10, d10, d10 \n" | |||
| "vmov.f32 %4, d10[0] \n" | |||
| : "=r"(r0), // %0 | |||
| "=r"(r1), // %1 | |||
| "=r"(r2), // %2 | |||
| "=r"(r3), // %3 | |||
| "=r"(sum) // %4 | |||
| : "0"(r0), | |||
| "1"(r1), | |||
| "2"(r2), | |||
| "3"(r3), | |||
| "w"(_k0123), // %9 | |||
| "w"(_k4567), // %10 | |||
| "w"(_k891011), // %11 | |||
| "w"(_k12131415) // %12 | |||
| : "cc", "memory", "q5", "q6", "q8", "q9", "q10", "q11", "q12", "q13" | |||
| ); | |||
| *outptr += sum; | |||
| #endif // __aarch64__ | |||
| #else | |||
| float sum = 0; | |||
| sum += r0[0] * k0[0]; | |||
| sum += r0[1] * k0[1]; | |||
| sum += r0[2] * k0[2]; | |||
| sum += r0[3] * k0[3]; | |||
| sum += r1[0] * k1[0]; | |||
| sum += r1[1] * k1[1]; | |||
| sum += r1[2] * k1[2]; | |||
| sum += r1[3] * k1[3]; | |||
| sum += r2[0] * k2[0]; | |||
| sum += r2[1] * k2[1]; | |||
| sum += r2[2] * k2[2]; | |||
| sum += r2[3] * k2[3]; | |||
| sum += r3[0] * k3[0]; | |||
| sum += r3[1] * k3[1]; | |||
| sum += r3[2] * k3[2]; | |||
| sum += r3[3] * k3[3]; | |||
| *outptr += sum; | |||
| #endif // __ARM_NEON | |||
| r0 += 4; | |||
| r1 += 4; | |||
| r2 += 4; | |||
| r3 += 4; | |||
| outptr++; | |||
| } | |||
| r0 += w * 3; | |||
| r1 += w * 3; | |||
| r2 += w * 3; | |||
| r3 += w * 3; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| @@ -0,0 +1,120 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "convolution_arm.h" | |||
| namespace ncnn { | |||
| #include "convolution_1x1.h" | |||
| #include "convolution_2x2.h" | |||
| #include "convolution_3x3.h" | |||
| #include "convolution_4x4.h" | |||
| #include "convolution_5x5.h" | |||
| #include "convolution_7x7.h" | |||
| DEFINE_LAYER_CREATOR(Convolution_arm) | |||
| // Convolves bottom_blob into top_blob using a NEON kernel selected by | |||
| // (kernel_size, stride). | |||
| // | |||
| // Combinations that have no NEON kernel, any dilation other than 1, and any | |||
| // kernel_size/stride outside the range covered by the dispatch table are | |||
| // forwarded to the generic Convolution::forward. | |||
| // | |||
| // Returns 0 on success, -100 if the padded input or the output blob is empty | |||
| // after creation, or the result of Convolution::forward on the fallback path. | |||
| int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| { | |||
| // The table below is indexed with kernel_size-1 and stride-1, so values | |||
| // below 1 must be rejected as well as values above the table size; | |||
| // otherwise the lookup reads outside the array. | |||
| if (kernel_size < 1 || kernel_size > 7 || stride < 1 || stride > 4 || dilation != 1) | |||
| { | |||
| return Convolution::forward(bottom_blob, top_blob); | |||
| } | |||
| typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&); | |||
| // kernel_size x stride; a 0 entry means "no NEON kernel, use the generic path". | |||
| // The table never changes, so it is built once instead of on every call. | |||
| static const conv_func conv_func_table[7][4] = | |||
| { | |||
| { | |||
| conv1x1s1_neon, | |||
| conv1x1s2_neon, | |||
| 0, | |||
| 0 | |||
| }, // kernel_size = 1 | |||
| { | |||
| conv2x2s1_neon, | |||
| 0, | |||
| 0, | |||
| 0 | |||
| }, // kernel_size = 2 | |||
| { | |||
| conv3x3s1_neon, | |||
| conv3x3s2_neon, | |||
| 0, | |||
| 0 | |||
| }, // kernel_size = 3 | |||
| { | |||
| 0, | |||
| 0, | |||
| 0, | |||
| conv4x4s4_neon | |||
| }, // kernel_size = 4 | |||
| { | |||
| conv5x5s1_neon, | |||
| conv5x5s2_neon, | |||
| 0, | |||
| 0 | |||
| }, // kernel_size = 5 | |||
| { | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 0 | |||
| }, // kernel_size = 6 | |||
| { | |||
| conv7x7s1_neon, | |||
| conv7x7s2_neon, | |||
| 0, | |||
| 0 | |||
| } // kernel_size = 7 | |||
| }; | |||
| conv_func conv = conv_func_table[kernel_size-1][stride-1]; | |||
| if (!conv) | |||
| { | |||
| return Convolution::forward(bottom_blob, top_blob); | |||
| } | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| // With pad > 0 the kernels run on a zero-bordered copy of the input. | |||
| Mat bottom_blob_bordered = bottom_blob; | |||
| if (pad > 0) | |||
| { | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, pad, pad, pad, pad, BORDER_CONSTANT, 0.f); | |||
| if (bottom_blob_bordered.empty()) | |||
| return -100; | |||
| w = bottom_blob_bordered.w; | |||
| h = bottom_blob_bordered.h; | |||
| } | |||
| // Output extent of a convolution without further padding over the (possibly bordered) input. | |||
| int outw = (w - kernel_size) / stride + 1; | |||
| int outh = (h - kernel_size) / stride + 1; | |||
| top_blob.create(outw, outh, num_output); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| conv(bottom_blob_bordered, top_blob, weight_data, bias_data); | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,30 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_CONVOLUTION_ARM_H | |||
| #define LAYER_CONVOLUTION_ARM_H | |||
| #include "convolution.h" | |||
| namespace ncnn { | |||
| // Convolution layer variant whose forward() is overridden in convolution_arm.cpp. | |||
| class Convolution_arm : public Convolution | |||
| { | |||
| public: | |||
| // Single-blob forward pass; parameter names match the definition. | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_CONVOLUTION_ARM_H | |||
| @@ -0,0 +1,574 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "eltwise_arm.h" | |||
| #if __ARM_NEON | |||
| #include <arm_neon.h> | |||
| #endif // __ARM_NEON | |||
| namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(Eltwise_arm) | |||
| // Element-wise combination of all bottom blobs into top_blobs[0]. | |||
| // | |||
| // op_type selects the operation: Operation_PROD (product), Operation_SUM | |||
| // (plain sum when num_coeff == 0, otherwise a sum weighted by coeffs) or | |||
| // Operation_MAX (maximum). In every branch bottom_blobs[0] and bottom_blobs[1] | |||
| // are combined first and any further blobs are folded into the output in place. | |||
| // The code reads bottom_blobs[1] unconditionally, so at least two inputs are | |||
| // expected. Each channel is processed four floats at a time with NEON | |||
| // (intrinsics on aarch64, inline asm on 32-bit ARM) followed by a scalar tail. | |||
| // | |||
| // Returns 0 on success, -100 if the output blob is empty after creation. | |||
| int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const | |||
| { | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| int size = w * h; | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(w, h, channels); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| if (op_type == Operation_PROD) | |||
| { | |||
| // first blob | |||
| const Mat& bottom_blob1 = bottom_blobs[1]; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob.channel(q); | |||
| const float* ptr1 = bottom_blob1.channel(q); | |||
| float* outptr = top_blob.channel(q); | |||
| #if __ARM_NEON | |||
| int nn = size >> 2; | |||
| int remain = size - (nn << 2); | |||
| #else | |||
| int remain = size; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _ptr = vld1q_f32(ptr); | |||
| float32x4_t _ptr1 = vld1q_f32(ptr1); | |||
| float32x4_t _p = vmulq_f32(_ptr, _ptr1); | |||
| vst1q_f32(outptr, _p); | |||
| ptr += 4; | |||
| ptr1 += 4; | |||
| outptr += 4; | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| "0: \n" | |||
| "pld [%1, #128] \n" | |||
| "pld [%2, #128] \n" | |||
| "vld1.f32 {d0-d1}, [%1 :128]! \n" | |||
| "vld1.f32 {d2-d3}, [%2 :128]! \n" | |||
| "vmul.f32 q0, q0, q1 \n" | |||
| "subs %0, #1 \n" | |||
| "vst1.f32 {d0-d1}, [%3 :128]! \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(ptr), // %1 | |||
| "=r"(ptr1), // %2 | |||
| "=r"(outptr) // %3 | |||
| : "0"(nn), | |||
| "1"(ptr), | |||
| "2"(ptr1), | |||
| "3"(outptr) | |||
| : "cc", "memory", "q0", "q1" | |||
| ); | |||
| } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| *outptr = *ptr * *ptr1; | |||
| ptr++; | |||
| ptr1++; | |||
| outptr++; | |||
| } | |||
| } | |||
| // remaining blobs: multiply into the output in place | |||
| for (size_t b=2; b<bottom_blobs.size(); b++) | |||
| { | |||
| const Mat& bottom_blob1 = bottom_blobs[b]; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob1.channel(q); | |||
| float* outptr = top_blob.channel(q); | |||
| #if __ARM_NEON | |||
| int nn = size >> 2; | |||
| int remain = size - (nn << 2); | |||
| #else | |||
| int remain = size; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _ptr = vld1q_f32(ptr); | |||
| float32x4_t _p = vld1q_f32(outptr); | |||
| _p = vmulq_f32(_ptr, _p); | |||
| vst1q_f32(outptr, _p); | |||
| ptr += 4; | |||
| outptr += 4; | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| "0: \n" | |||
| "pld [%1, #128] \n" | |||
| "pld [%2, #128] \n" | |||
| "vld1.f32 {d0-d1}, [%1 :128]! \n" | |||
| "vld1.f32 {d2-d3}, [%2 :128] \n" | |||
| "vmul.f32 q0, q0, q1 \n" | |||
| "subs %0, #1 \n" | |||
| "vst1.f32 {d0-d1}, [%2 :128]! \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(ptr), // %1 | |||
| "=r"(outptr) // %2 | |||
| : "0"(nn), | |||
| "1"(ptr), | |||
| "2"(outptr) | |||
| : "cc", "memory", "q0", "q1" | |||
| ); | |||
| } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| *outptr *= *ptr; | |||
| ptr++; | |||
| outptr++; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| else if (op_type == Operation_SUM) | |||
| { | |||
| if (num_coeff == 0) | |||
| { | |||
| // first blob | |||
| const Mat& bottom_blob1 = bottom_blobs[1]; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob.channel(q); | |||
| const float* ptr1 = bottom_blob1.channel(q); | |||
| float* outptr = top_blob.channel(q); | |||
| #if __ARM_NEON | |||
| int nn = size >> 2; | |||
| int remain = size - (nn << 2); | |||
| #else | |||
| int remain = size; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _ptr = vld1q_f32(ptr); | |||
| float32x4_t _ptr1 = vld1q_f32(ptr1); | |||
| float32x4_t _p = vaddq_f32(_ptr, _ptr1); | |||
| vst1q_f32(outptr, _p); | |||
| ptr += 4; | |||
| ptr1 += 4; | |||
| outptr += 4; | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| "0: \n" | |||
| "pld [%1, #128] \n" | |||
| "pld [%2, #128] \n" | |||
| "vld1.f32 {d0-d1}, [%1 :128]! \n" | |||
| "vld1.f32 {d2-d3}, [%2 :128]! \n" | |||
| "vadd.f32 q0, q0, q1 \n" | |||
| "subs %0, #1 \n" | |||
| "vst1.f32 {d0-d1}, [%3 :128]! \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(ptr), // %1 | |||
| "=r"(ptr1), // %2 | |||
| "=r"(outptr) // %3 | |||
| : "0"(nn), | |||
| "1"(ptr), | |||
| "2"(ptr1), | |||
| "3"(outptr) | |||
| : "cc", "memory", "q0", "q1" | |||
| ); | |||
| } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| *outptr = *ptr + *ptr1; | |||
| ptr++; | |||
| ptr1++; | |||
| outptr++; | |||
| } | |||
| } | |||
| // remaining blobs: add into the output in place | |||
| for (size_t b=2; b<bottom_blobs.size(); b++) | |||
| { | |||
| const Mat& bottom_blob1 = bottom_blobs[b]; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob1.channel(q); | |||
| float* outptr = top_blob.channel(q); | |||
| #if __ARM_NEON | |||
| int nn = size >> 2; | |||
| int remain = size - (nn << 2); | |||
| #else | |||
| int remain = size; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _ptr = vld1q_f32(ptr); | |||
| float32x4_t _p = vld1q_f32(outptr); | |||
| _p = vaddq_f32(_ptr, _p); | |||
| vst1q_f32(outptr, _p); | |||
| ptr += 4; | |||
| outptr += 4; | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| "0: \n" | |||
| "pld [%1, #128] \n" | |||
| "pld [%2, #128] \n" | |||
| "vld1.f32 {d0-d1}, [%1 :128]! \n" | |||
| "vld1.f32 {d2-d3}, [%2 :128] \n" | |||
| "vadd.f32 q0, q0, q1 \n" | |||
| "subs %0, #1 \n" | |||
| "vst1.f32 {d0-d1}, [%2 :128]! \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(ptr), // %1 | |||
| "=r"(outptr) // %2 | |||
| : "0"(nn), | |||
| "1"(ptr), | |||
| "2"(outptr) | |||
| : "cc", "memory", "q0", "q1" | |||
| ); | |||
| } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| *outptr += *ptr; | |||
| ptr++; | |||
| outptr++; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| else | |||
| { | |||
| const float* coeffs_ptr = coeffs; | |||
| // first blob | |||
| const Mat& bottom_blob1 = bottom_blobs[1]; | |||
| float coeff0 = coeffs_ptr[0]; | |||
| float coeff1 = coeffs_ptr[1]; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob.channel(q); | |||
| const float* ptr1 = bottom_blob1.channel(q); | |||
| float* outptr = top_blob.channel(q); | |||
| #if __ARM_NEON | |||
| int nn = size >> 2; | |||
| int remain = size - (nn << 2); | |||
| #else | |||
| int remain = size; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| float32x4_t _coeff0 = vdupq_n_f32(coeff0); | |||
| float32x4_t _coeff1 = vdupq_n_f32(coeff1); | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _ptr = vld1q_f32(ptr); | |||
| float32x4_t _ptr1 = vld1q_f32(ptr1); | |||
| float32x4_t _p = vmulq_f32(_ptr, _coeff0); | |||
| _p = vmlaq_f32(_p, _ptr1, _coeff1); | |||
| vst1q_f32(outptr, _p); | |||
| ptr += 4; | |||
| ptr1 += 4; | |||
| outptr += 4; | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| "0: \n" | |||
| "pld [%1, #128] \n" | |||
| "pld [%2, #128] \n" | |||
| "vld1.f32 {d0-d1}, [%1 :128]! \n" | |||
| "vld1.f32 {d2-d3}, [%2 :128]! \n" | |||
| "vmul.f32 q0, q0, %q8 \n" | |||
| "vmla.f32 q0, q1, %q9 \n" | |||
| "subs %0, #1 \n" | |||
| "vst1.f32 {d0-d1}, [%3 :128]! \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(ptr), // %1 | |||
| "=r"(ptr1), // %2 | |||
| "=r"(outptr) // %3 | |||
| : "0"(nn), | |||
| "1"(ptr), | |||
| "2"(ptr1), | |||
| "3"(outptr), | |||
| "w"(_coeff0), // %8 | |||
| "w"(_coeff1) // %9 | |||
| : "cc", "memory", "q0", "q1" | |||
| ); | |||
| } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| *outptr = *ptr * coeff0 + *ptr1 * coeff1; | |||
| ptr++; | |||
| ptr1++; | |||
| outptr++; | |||
| } | |||
| } | |||
| // remaining blobs: out += blob * coeff, in place | |||
| for (size_t b=2; b<bottom_blobs.size(); b++) | |||
| { | |||
| const Mat& bottom_blob1 = bottom_blobs[b]; | |||
| float coeff = coeffs_ptr[b]; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob1.channel(q); | |||
| float* outptr = top_blob.channel(q); | |||
| #if __ARM_NEON | |||
| int nn = size >> 2; | |||
| int remain = size - (nn << 2); | |||
| #else | |||
| int remain = size; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| float32x4_t _coeff = vdupq_n_f32(coeff); | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _ptr = vld1q_f32(ptr); | |||
| float32x4_t _p = vld1q_f32(outptr); | |||
| _p = vmlaq_f32(_p, _ptr, _coeff); | |||
| vst1q_f32(outptr, _p); | |||
| ptr += 4; | |||
| outptr += 4; | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| // q0 = input, q1 = current output. vmla accumulates into q1, so q1 | |||
| // (d2-d3) is what must be written back; storing d0-d1 would overwrite | |||
| // the output with the unscaled input. | |||
| asm volatile( | |||
| "0: \n" | |||
| "pld [%1, #128] \n" | |||
| "pld [%2, #128] \n" | |||
| "vld1.f32 {d0-d1}, [%1 :128]! \n" | |||
| "vld1.f32 {d2-d3}, [%2 :128] \n" | |||
| "vmla.f32 q1, q0, %q6 \n" | |||
| "subs %0, #1 \n" | |||
| "vst1.f32 {d2-d3}, [%2 :128]! \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(ptr), // %1 | |||
| "=r"(outptr) // %2 | |||
| : "0"(nn), | |||
| "1"(ptr), | |||
| "2"(outptr), | |||
| "w"(_coeff) // %6 | |||
| : "cc", "memory", "q0", "q1" | |||
| ); | |||
| } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| *outptr += *ptr * coeff; | |||
| ptr++; | |||
| outptr++; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| else if (op_type == Operation_MAX) | |||
| { | |||
| // first blob | |||
| const Mat& bottom_blob1 = bottom_blobs[1]; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob.channel(q); | |||
| const float* ptr1 = bottom_blob1.channel(q); | |||
| float* outptr = top_blob.channel(q); | |||
| #if __ARM_NEON | |||
| int nn = size >> 2; | |||
| int remain = size - (nn << 2); | |||
| #else | |||
| int remain = size; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _ptr = vld1q_f32(ptr); | |||
| float32x4_t _ptr1 = vld1q_f32(ptr1); | |||
| float32x4_t _p = vmaxq_f32(_ptr, _ptr1); | |||
| vst1q_f32(outptr, _p); | |||
| ptr += 4; | |||
| ptr1 += 4; | |||
| outptr += 4; | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| "0: \n" | |||
| "pld [%1, #128] \n" | |||
| "pld [%2, #128] \n" | |||
| "vld1.f32 {d0-d1}, [%1 :128]! \n" | |||
| "vld1.f32 {d2-d3}, [%2 :128]! \n" | |||
| "vmax.f32 q0, q0, q1 \n" | |||
| "subs %0, #1 \n" | |||
| "vst1.f32 {d0-d1}, [%3 :128]! \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(ptr), // %1 | |||
| "=r"(ptr1), // %2 | |||
| "=r"(outptr) // %3 | |||
| : "0"(nn), | |||
| "1"(ptr), | |||
| "2"(ptr1), | |||
| "3"(outptr) | |||
| : "cc", "memory", "q0", "q1" | |||
| ); | |||
| } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| *outptr = std::max(*ptr, *ptr1); | |||
| ptr++; | |||
| ptr1++; | |||
| outptr++; | |||
| } | |||
| } | |||
| // remaining blobs: keep the running maximum in the output | |||
| for (size_t b=2; b<bottom_blobs.size(); b++) | |||
| { | |||
| const Mat& bottom_blob1 = bottom_blobs[b]; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob1.channel(q); | |||
| float* outptr = top_blob.channel(q); | |||
| #if __ARM_NEON | |||
| int nn = size >> 2; | |||
| int remain = size - (nn << 2); | |||
| #else | |||
| int remain = size; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _ptr = vld1q_f32(ptr); | |||
| float32x4_t _p = vld1q_f32(outptr); | |||
| _p = vmaxq_f32(_ptr, _p); | |||
| vst1q_f32(outptr, _p); | |||
| ptr += 4; | |||
| outptr += 4; | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| "0: \n" | |||
| "pld [%1, #128] \n" | |||
| "pld [%2, #128] \n" | |||
| "vld1.f32 {d0-d1}, [%1 :128]! \n" | |||
| "vld1.f32 {d2-d3}, [%2 :128] \n" | |||
| "vmax.f32 q0, q0, q1 \n" | |||
| "subs %0, #1 \n" | |||
| "vst1.f32 {d0-d1}, [%2 :128]! \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(ptr), // %1 | |||
| "=r"(outptr) // %2 | |||
| : "0"(nn), | |||
| "1"(ptr), | |||
| "2"(outptr) | |||
| : "cc", "memory", "q0", "q1" | |||
| ); | |||
| } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| *outptr = std::max(*ptr, *outptr); | |||
| ptr++; | |||
| outptr++; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,30 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_ELTWISE_ARM_H | |||
| #define LAYER_ELTWISE_ARM_H | |||
| #include "eltwise.h" | |||
| namespace ncnn { | |||
| // Eltwise layer variant whose forward() is overridden in eltwise_arm.cpp | |||
| // with NEON-accelerated loops and a scalar fallback. | |||
| class Eltwise_arm : public Eltwise | |||
| { | |||
| public: | |||
| // Multi-blob forward pass: combines all bottom_blobs into top_blobs[0]. | |||
| virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const; | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_ELTWISE_ARM_H | |||
| @@ -0,0 +1,136 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "innerproduct_arm.h" | |||
| #if __ARM_NEON | |||
| #include <arm_neon.h> | |||
| #endif // __ARM_NEON | |||
| namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(InnerProduct_arm) | |||
| // Fully connected layer: for every output p, computes the dot product of the | |||
| // whole input blob with that output's weight row, plus bias_data[p] when | |||
| // bias_term is set, and writes it to top_blob channel p (a 1x1 channel). | |||
| // | |||
| // Weights are read sequentially: size * channels floats per output, where | |||
| // size = w * h of the input. With NEON, eight floats are consumed per step | |||
| // into two vector accumulators; leftovers go through the scalar loop. | |||
| // | |||
| // Returns 0 on success, -100 if the output blob is empty after creation. | |||
| int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| { | |||
| int channels = bottom_blob.c; | |||
| // number of floats per input channel | |||
| int size = bottom_blob.w * bottom_blob.h; | |||
| top_blob.create(1, 1, num_output); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| // num_output | |||
| const float* weight_data_ptr = weight_data; | |||
| #pragma omp parallel for | |||
| for (int p=0; p<num_output; p++) | |||
| { | |||
| float* outptr = top_blob.channel(p); | |||
| float sum = 0.f; | |||
| if (bias_term) | |||
| sum = bias_data.data[p]; | |||
| // start of the weights for output p; advanced across all channels below | |||
| const float* w = weight_data_ptr + size * channels * p; | |||
| #if __ARM_NEON | |||
| float32x4_t _sum = vdupq_n_f32(0.f); | |||
| float32x4_t _sum2 = vdupq_n_f32(0.f); | |||
| #endif // __ARM_NEON | |||
| // channels | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* m = bottom_blob.channel(q); | |||
| #if __ARM_NEON | |||
| int nn = size >> 3; | |||
| int remain = size & 7; | |||
| #else | |||
| int remain = size; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _m = vld1q_f32(m); | |||
| float32x4_t _w = vld1q_f32(w); | |||
| _sum = vfmaq_f32(_sum, _m, _w); | |||
| _m = vld1q_f32(m + 4); | |||
| _w = vld1q_f32(w + 4); | |||
| _sum2 = vfmaq_f32(_sum2, _m, _w); | |||
| m += 8; | |||
| w += 8; | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| "0: \n" | |||
| "pld [%1, #256] \n" | |||
| "vld1.f32 {d0-d3}, [%1 :128]! \n" | |||
| "pld [%2, #256] \n" | |||
| "vld1.f32 {d4-d7}, [%2]! \n" | |||
| "vmla.f32 %q3, q0, q2 \n" | |||
| "subs %0, #1 \n" | |||
| "vmla.f32 %q4, q1, q3 \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(m), // %1 | |||
| "=r"(w), // %2 | |||
| "=w"(_sum), // %3 | |||
| "=w"(_sum2) // %4 | |||
| : "0"(nn), | |||
| "1"(m), | |||
| "2"(w), | |||
| "3"(_sum), | |||
| "4"(_sum2) | |||
| : "cc", "memory", "q0", "q1", "q2", "q3" | |||
| ); | |||
| } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| sum += *m * *w; | |||
| m++; | |||
| w++; | |||
| } | |||
| } | |||
| #if __ARM_NEON | |||
| // fold both vector accumulators into the scalar sum | |||
| _sum = vaddq_f32(_sum, _sum2); | |||
| #if __aarch64__ | |||
| sum += vaddvq_f32(_sum); | |||
| #else | |||
| float32x2_t _sumss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum)); | |||
| _sumss = vpadd_f32(_sumss, _sumss); | |||
| sum += vget_lane_f32(_sumss, 0); | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
| outptr[0] = sum; | |||
| } | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,30 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_INNERPRODUCT_ARM_H | |||
| #define LAYER_INNERPRODUCT_ARM_H | |||
| #include "innerproduct.h" | |||
| namespace ncnn { | |||
| // InnerProduct layer variant whose forward() is overridden in | |||
| // innerproduct_arm.cpp with NEON-accelerated accumulation. | |||
| class InnerProduct_arm : public InnerProduct | |||
| { | |||
| public: | |||
| // Single-blob forward pass producing a 1 x 1 x num_output blob. | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_INNERPRODUCT_ARM_H | |||
| @@ -0,0 +1,227 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "lrn_arm.h" | |||
| #include <math.h> | |||
| #if __ARM_NEON | |||
| #include <arm_neon.h> | |||
| #include "neon_mathfun.h" | |||
| #endif // __ARM_NEON | |||
| namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(LRN_arm) | |||
| // Local response normalization, applied in place. | |||
| // | |||
| // Every value x is replaced by x * (1 + scale * S)^(-beta), where S is the sum | |||
| // of squared inputs over a local window: | |||
| // - NormRegion_ACROSS_CHANNELS: the local_size channels centred on the | |||
| // current channel at the same position, scale = alpha / local_size; | |||
| // - NormRegion_WITHIN_CHANNEL: a local_size x local_size spatial window in | |||
| // the same channel (zero padded at the borders), | |||
| // scale = alpha / (local_size * local_size). | |||
| // | |||
| // Returns 0 on success, -100 if a temporary blob is empty after creation. | |||
| int LRN_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| { | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| int channels = bottom_top_blob.c; | |||
| int size = w * h; | |||
| // squared values with local_size padding | |||
| Mat square_blob; | |||
| square_blob.create(w, h, channels); | |||
| if (square_blob.empty()) | |||
| return -100; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_top_blob.channel(q); | |||
| float* outptr = square_blob.channel(q); | |||
| #if __ARM_NEON | |||
| int nn = size >> 2; | |||
| int remain = size - (nn << 2); | |||
| #else | |||
| int remain = size; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _p = vld1q_f32(ptr); | |||
| float32x4_t _outp = vmulq_f32(_p, _p); | |||
| vst1q_f32(outptr, _outp); | |||
| ptr += 4; | |||
| outptr += 4; | |||
| } | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| *outptr = *ptr * *ptr; | |||
| ptr++; | |||
| outptr++; | |||
| } | |||
| } | |||
| if (region_type == NormRegion_ACROSS_CHANNELS) | |||
| { | |||
| float alpha_div_size = alpha / local_size; | |||
| Mat square_sum; | |||
| square_sum.create(w, h, channels); | |||
| if (square_sum.empty()) | |||
| return -100; | |||
| square_sum.fill(0.f); | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| // square sum over the window centred on channel q: | |||
| // local_size / 2 channels on each side, clipped to the valid range. | |||
| for (int p=q - local_size / 2; p<=q + local_size / 2; p++) | |||
| { | |||
| if (p < 0 || p >= channels) | |||
| continue; | |||
| const float* sptr = square_blob.channel(p); | |||
| float* ssptr = square_sum.channel(q); | |||
| #if __ARM_NEON | |||
| int nn = size >> 2; | |||
| int remain = size - (nn << 2); | |||
| #else | |||
| int remain = size; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _sp = vld1q_f32(sptr); | |||
| float32x4_t _ssp = vld1q_f32(ssptr); | |||
| _ssp = vaddq_f32(_ssp, _sp); | |||
| vst1q_f32(ssptr, _ssp); | |||
| sptr += 4; | |||
| ssptr += 4; | |||
| } | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| *ssptr += *sptr; | |||
| sptr++; | |||
| ssptr++; | |||
| } | |||
| } | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| float* ssptr = square_sum.channel(q); | |||
| #if __ARM_NEON | |||
| int nn = size >> 2; | |||
| int remain = size - (nn << 2); | |||
| #else | |||
| int remain = size; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| float32x4_t _v1 = vdupq_n_f32(1.f); | |||
| float32x4_t _ads = vdupq_n_f32(alpha_div_size); | |||
| float32x4_t _mb = vdupq_n_f32(-beta); | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _p = vld1q_f32(ptr); | |||
| float32x4_t _ssp = vld1q_f32(ssptr); | |||
| _ssp = vmulq_f32(_ssp, _ads); | |||
| _ssp = vaddq_f32(_ssp, _v1); | |||
| _ssp = pow_ps(_ssp, _mb); | |||
| _p = vmulq_f32(_p, _ssp); | |||
| vst1q_f32(ptr, _p); | |||
| ssptr += 4; | |||
| ptr += 4; | |||
| } | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| *ptr = *ptr * pow(1.f + alpha_div_size * *ssptr, -beta); | |||
| ssptr++; | |||
| ptr++; | |||
| } | |||
| } | |||
| } | |||
| else if (region_type == NormRegion_WITHIN_CHANNEL) | |||
| { | |||
| int outw = w; | |||
| int outh = h; | |||
| Mat square_blob_bordered = square_blob; | |||
| int pad = local_size / 2; | |||
| if (pad > 0) | |||
| { | |||
| copy_make_border(square_blob, square_blob_bordered, pad, local_size - pad - 1, pad, local_size - pad - 1, BORDER_CONSTANT, 0.f); | |||
| if (square_blob_bordered.empty()) | |||
| return -100; | |||
| w = square_blob_bordered.w; | |||
| h = square_blob_bordered.h; | |||
| } | |||
| const int maxk = local_size * local_size; | |||
| // the window holds maxk samples, so the sum is scaled by alpha / maxk | |||
| const float alpha_div_area = alpha / maxk; | |||
| // norm window offsets | |||
| std::vector<int> _space_ofs(maxk); | |||
| int* space_ofs = &_space_ofs[0]; | |||
| { | |||
| int p1 = 0; | |||
| int p2 = 0; | |||
| int gap = w - local_size; | |||
| for (int i = 0; i < local_size; i++) | |||
| { | |||
| for (int j = 0; j < local_size; j++) | |||
| { | |||
| space_ofs[p1] = p2; | |||
| p1++; | |||
| p2++; | |||
| } | |||
| p2 += gap; | |||
| } | |||
| } | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| const float* sptr = square_blob_bordered.channel(q); | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| for (int j = 0; j < outw; j++) | |||
| { | |||
| float ss = 0.f; | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| // sptr points at the start of bordered row i; the window for | |||
| // output column j starts j elements further along that row. | |||
| float val = sptr[ j + space_ofs[k] ]; | |||
| ss += val; | |||
| } | |||
| ptr[j] = ptr[j] * pow(1.f + alpha_div_area * ss, -beta); | |||
| } | |||
| ptr += outw; | |||
| sptr += w; | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,30 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_LRN_ARM_H | |||
| #define LAYER_LRN_ARM_H | |||
| #include "lrn.h" | |||
| namespace ncnn { | |||
| // LRN layer variant whose forward_inplace() is overridden in lrn_arm.cpp | |||
| // with NEON-accelerated loops and a scalar fallback. | |||
| class LRN_arm : public LRN | |||
| { | |||
| public: | |||
| // Normalizes bottom_top_blob in place. | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_LRN_ARM_H | |||
| @@ -0,0 +1,316 @@ | |||
| /* NEON implementation of sin, cos, exp and log | |||
| * | |||
| * Inspired by Intel Approximate Math library, and based on the | |||
| * corresponding algorithms of the cephes math library | |||
| */ | |||
| /* Copyright (C) 2011 Julien Pommier | |||
| * | |||
| * This software is provided 'as-is', without any express or implied | |||
| * warranty. In no event will the authors be held liable for any damages | |||
| * arising from the use of this software. | |||
| * | |||
| * Permission is granted to anyone to use this software for any purpose, | |||
| * including commercial applications, and to alter it and redistribute it | |||
| * freely, subject to the following restrictions: | |||
| * | |||
| * 1. The origin of this software must not be misrepresented; you must not | |||
| * claim that you wrote the original software. If you use this software | |||
| * in a product, an acknowledgment in the product documentation would be | |||
| * appreciated but is not required. | |||
| * 2. Altered source versions must be plainly marked as such, and must not be | |||
| * misrepresented as being the original software. | |||
| * 3. This notice may not be removed or altered from any source distribution. | |||
| * | |||
| * (this is the zlib license) | |||
| */ | |||
| #include <arm_neon.h> | |||
| #define c_inv_mant_mask ~0x7f800000u /* every bit except the 8-bit exponent field (bits 23..30) of a 32-bit float; log_ps ANDs it with the raw bits to drop the exponent */ | |||
| #define c_cephes_SQRTHF 0.707106781186547524 /* sqrt(0.5); threshold of the range-reduction step in log_ps */ | |||
| #define c_cephes_log_p0 7.0376836292E-2 /* p0..p8: polynomial coefficients consumed by log_ps in this order (p0 is multiplied by x first) */ | |||
| #define c_cephes_log_p1 - 1.1514610310E-1 | |||
| #define c_cephes_log_p2 1.1676998740E-1 | |||
| #define c_cephes_log_p3 - 1.2420140846E-1 | |||
| #define c_cephes_log_p4 + 1.4249322787E-1 | |||
| #define c_cephes_log_p5 - 1.6668057665E-1 | |||
| #define c_cephes_log_p6 + 2.0000714765E-1 | |||
| #define c_cephes_log_p7 - 2.4999993993E-1 | |||
| #define c_cephes_log_p8 + 3.3333331174E-1 | |||
| #define c_cephes_log_q1 -2.12194440e-4 /* q1 + q2 == ln(2) (0.693147...), kept as a small and a large part; log_ps multiplies the exponent by each part separately */ | |||
| #define c_cephes_log_q2 0.693359375 | |||
| /* natural logarithm computed for 4 simultaneous float | |||
| * return NaN for x <= 0 | |||
| */ | |||
| static inline float32x4_t log_ps(float32x4_t x) | |||
| { | |||
|     // Four-lane natural logarithm. The input is split into exponent and | |||
|     // mantissa, the mantissa is range-reduced around sqrt(0.5), a polynomial | |||
|     // is evaluated on it and exponent * ln(2) is added back. | |||
|     // Lanes whose input is <= 0 are returned with all bits set (NaN). | |||
|     const float32x4_t v_one = vdupq_n_f32(1); | |||
|     const float32x4_t v_zero = vdupq_n_f32(0); | |||
|     // negative lanes are clamped to zero, then every lane that is <= 0 is remembered | |||
|     x = vmaxq_f32(x, v_zero); | |||
|     const uint32x4_t nan_lanes = vcleq_f32(x, v_zero); | |||
|     // raw exponent (bits 23 and up) with the bias 0x7f removed | |||
|     int32x4_t bits = vreinterpretq_s32_f32(x); | |||
|     const int32x4_t expo = vsubq_s32(vshrq_n_s32(bits, 23), vdupq_n_s32(0x7f)); | |||
|     // keep the mantissa bits and overlay the bit pattern of 0.5f | |||
|     bits = vandq_s32(bits, vdupq_n_s32(c_inv_mant_mask)); | |||
|     bits = vorrq_s32(bits, vreinterpretq_s32_f32(vdupq_n_f32(0.5f))); | |||
|     x = vreinterpretq_f32_s32(bits); | |||
|     float32x4_t e = vaddq_f32(vcvtq_f32_s32(expo), v_one); | |||
|     // if (x < SQRTHF) { e -= 1; x = x + x - 1; } else { x = x - 1; } | |||
|     // done branch-free: the lane mask zeroes the extra terms where the test fails | |||
|     const uint32x4_t below = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF)); | |||
|     const float32x4_t x_if_below = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), below)); | |||
|     const float32x4_t one_if_below = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v_one), below)); | |||
|     x = vaddq_f32(vsubq_f32(x, v_one), x_if_below); | |||
|     e = vsubq_f32(e, one_if_below); | |||
|     const float32x4_t xx = vmulq_f32(x, x); | |||
|     // Horner evaluation of p0..p8, followed by one more factor x and a factor x*x | |||
|     static const float log_p[9] = { c_cephes_log_p0, c_cephes_log_p1, c_cephes_log_p2, c_cephes_log_p3, c_cephes_log_p4, c_cephes_log_p5, c_cephes_log_p6, c_cephes_log_p7, c_cephes_log_p8 }; | |||
|     float32x4_t y = vdupq_n_f32(log_p[0]); | |||
|     for (int i = 1; i < 9; i++) | |||
|     { | |||
|         y = vmulq_f32(y, x); | |||
|         y = vaddq_f32(y, vdupq_n_f32(log_p[i])); | |||
|     } | |||
|     y = vmulq_f32(y, x); | |||
|     y = vmulq_f32(y, xx); | |||
|     // y += e * q1;  y -= 0.5 * x^2;  result = x + y + e * q2 | |||
|     y = vaddq_f32(y, vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1))); | |||
|     y = vsubq_f32(y, vmulq_f32(xx, vdupq_n_f32(0.5f))); | |||
|     const float32x4_t e_q2 = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2)); | |||
|     x = vaddq_f32(x, y); | |||
|     x = vaddq_f32(x, e_q2); | |||
|     // OR-ing all-ones into the invalid lanes turns them into NaN | |||
|     return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), nan_lanes)); | |||
| } | |||
| #define c_exp_hi 88.3762626647949f /* exp_ps clamps its argument to [c_exp_lo, c_exp_hi] before doing anything else */ | |||
| #define c_exp_lo -88.3762626647949f | |||
| #define c_cephes_LOG2EF 1.44269504088896341 /* log2(e) */ | |||
| #define c_cephes_exp_C1 0.693359375 /* C1 + C2 == ln(2) (0.693147...), kept as two parts that exp_ps subtracts one after the other */ | |||
| #define c_cephes_exp_C2 -2.12194440e-4 | |||
| #define c_cephes_exp_p0 1.9875691500E-4 /* p0..p5: polynomial coefficients consumed by exp_ps in this order (p0 is multiplied by x first) */ | |||
| #define c_cephes_exp_p1 1.3981999507E-3 | |||
| #define c_cephes_exp_p2 8.3334519073E-3 | |||
| #define c_cephes_exp_p3 4.1665795894E-2 | |||
| #define c_cephes_exp_p4 1.6666665459E-1 | |||
| #define c_cephes_exp_p5 5.0000001201E-1 | |||
| /* exp() computed for 4 float at once */ | |||
| static inline float32x4_t exp_ps(float32x4_t x) | |||
| { | |||
|     // Four-lane exp(): writes exp(x) as exp(g) * 2^n, evaluates a polynomial | |||
|     // for exp(g) and builds 2^n directly in the float exponent field. | |||
|     const float32x4_t v_one = vdupq_n_f32(1); | |||
|     // clamp the argument to [c_exp_lo, c_exp_hi] | |||
|     x = vminq_f32(x, vdupq_n_f32(c_exp_hi)); | |||
|     x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo)); | |||
|     // n = floor(x * log2(e) + 0.5) | |||
|     float32x4_t n = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF)); | |||
|     // float -> int -> float drops the fraction; where that result is larger | |||
|     // than the input, subtract 1 to obtain the floor | |||
|     const float32x4_t truncated = vcvtq_f32_s32(vcvtq_s32_f32(n)); | |||
|     const uint32x4_t too_big = vcgtq_f32(truncated, n); | |||
|     const uint32x4_t one_where_too_big = vandq_u32(too_big, vreinterpretq_u32_f32(v_one)); | |||
|     n = vsubq_f32(truncated, vreinterpretq_f32_u32(one_where_too_big)); | |||
|     // g = x - n * C1 - n * C2 | |||
|     const float32x4_t n_c1 = vmulq_f32(n, vdupq_n_f32(c_cephes_exp_C1)); | |||
|     const float32x4_t n_c2 = vmulq_f32(n, vdupq_n_f32(c_cephes_exp_C2)); | |||
|     x = vsubq_f32(x, n_c1); | |||
|     x = vsubq_f32(x, n_c2); | |||
|     // Horner evaluation of p0..p5 in g | |||
|     static const float exp_p[6] = { c_cephes_exp_p0, c_cephes_exp_p1, c_cephes_exp_p2, c_cephes_exp_p3, c_cephes_exp_p4, c_cephes_exp_p5 }; | |||
|     float32x4_t y = vld1q_dup_f32(exp_p + 0); | |||
|     for (int i = 1; i < 6; i++) | |||
|     { | |||
|         y = vmulq_f32(y, x); | |||
|         y = vaddq_f32(y, vld1q_dup_f32(exp_p + i)); | |||
|     } | |||
|     // exp(g) = poly * g^2 + g + 1 | |||
|     const float32x4_t xx = vmulq_f32(x, x); | |||
|     y = vmulq_f32(y, xx); | |||
|     y = vaddq_f32(y, x); | |||
|     y = vaddq_f32(y, v_one); | |||
|     // build 2^n: add the bias 0x7f to n and shift it into the exponent field | |||
|     const int32x4_t biased = vaddq_s32(vcvtq_s32_f32(n), vdupq_n_s32(0x7f)); | |||
|     const float32x4_t pow2n = vreinterpretq_f32_s32(vshlq_n_s32(biased, 23)); | |||
|     return vmulq_f32(y, pow2n); | |||
| } | |||
| #define c_minus_cephes_DP1 -0.78515625 /* DP1 + DP2 + DP3 == -pi/4 (-0.785398...), kept as three parts that sincos_ps applies one after the other */ | |||
| #define c_minus_cephes_DP2 -2.4187564849853515625e-4 | |||
| #define c_minus_cephes_DP3 -3.77489497744594108e-8 | |||
| #define c_sincof_p0 -1.9515295891E-4 /* c_sincof_p0..p2: coefficients of the sine polynomial evaluated in x*x by sincos_ps */ | |||
| #define c_sincof_p1 8.3321608736E-3 | |||
| #define c_sincof_p2 -1.6666654611E-1 | |||
| #define c_coscof_p0 2.443315711809948E-005 /* c_coscof_p0..p2: coefficients of the cosine polynomial evaluated in x*x by sincos_ps */ | |||
| #define c_coscof_p1 -1.388731625493765E-003 | |||
| #define c_coscof_p2 4.166664568298827E-002 | |||
| #define c_cephes_FOPI 1.27323954473516 // 4 / M_PI | |||
| /* evaluation of 4 sines & cosines at once. | |||
| * | |||
| * The code is the exact rewriting of the cephes sinf function. | |||
| * Precision is excellent as long as x < 8192 (I did not bother to | |||
| * take into account the special handling they have for greater values | |||
| * -- it does not return garbage for arguments over 8192, though, but | |||
| * the extra precision is missing). | |||
| * | |||
| * Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the | |||
| * surprising but correct result. | |||
| * | |||
| * Note also that when you compute sin(x), cos(x) is available at | |||
| * almost no extra price so both sin_ps and cos_ps make use of | |||
| * sincos_ps.. | |||
| */ | |||
| static inline void sincos_ps(float32x4_t x, float32x4_t *ysin, float32x4_t *ycos) | |||
| { | |||
|     // Computes sine and cosine of four floats at once; results are written | |||
|     // through ysin and ycos. Both polynomials are always evaluated and the | |||
|     // right one is picked per lane with bit-select masks. | |||
|     // remember which lanes are negative, then continue with |x| | |||
|     uint32x4_t sin_negate = vcltq_f32(x, vdupq_n_f32(0)); | |||
|     x = vabsq_f32(x); | |||
|     // j = integer part of x * 4/Pi, then j = (j + 1) & ~1 (see the cephes sources) | |||
|     uint32x4_t j = vcvtq_u32_f32(vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI))); | |||
|     j = vaddq_u32(j, vdupq_n_u32(1)); | |||
|     j = vandq_u32(j, vdupq_n_u32(~1)); | |||
|     const float32x4_t jf = vcvtq_f32_u32(j); | |||
|     // bit 1 of j selects which of the two polynomials yields the sine | |||
|     // (the other one yields the cosine) | |||
|     const uint32x4_t swap_polys = vtstq_u32(j, vdupq_n_u32(2)); | |||
|     // extended precision modular arithmetic: | |||
|     // x = ((x - j * DP1) - j * DP2) - j * DP3, the constants are already negated | |||
|     const float32x4_t j_dp1 = vmulq_n_f32(jf, c_minus_cephes_DP1); | |||
|     const float32x4_t j_dp2 = vmulq_n_f32(jf, c_minus_cephes_DP2); | |||
|     const float32x4_t j_dp3 = vmulq_n_f32(jf, c_minus_cephes_DP3); | |||
|     x = vaddq_f32(x, j_dp1); | |||
|     x = vaddq_f32(x, j_dp2); | |||
|     x = vaddq_f32(x, j_dp3); | |||
|     // sign masks derived from bit 2 of j (sine) and of j - 2 (cosine) | |||
|     sin_negate = veorq_u32(sin_negate, vtstq_u32(j, vdupq_n_u32(4))); | |||
|     const uint32x4_t cos_keep = vtstq_u32(vsubq_u32(j, vdupq_n_u32(2)), vdupq_n_u32(4)); | |||
|     const float32x4_t xx = vmulq_f32(x, x); | |||
|     // cosine polynomial: ((p0 * xx + p1) * xx + p2) * xx * xx - 0.5 * xx + 1 | |||
|     float32x4_t pc = vmulq_n_f32(xx, c_coscof_p0); | |||
|     pc = vaddq_f32(pc, vdupq_n_f32(c_coscof_p1)); | |||
|     pc = vmulq_f32(pc, xx); | |||
|     pc = vaddq_f32(pc, vdupq_n_f32(c_coscof_p2)); | |||
|     pc = vmulq_f32(pc, xx); | |||
|     pc = vmulq_f32(pc, xx); | |||
|     pc = vsubq_f32(pc, vmulq_f32(xx, vdupq_n_f32(0.5f))); | |||
|     pc = vaddq_f32(pc, vdupq_n_f32(1)); | |||
|     // sine polynomial: ((p0 * xx + p1) * xx + p2) * xx * x + x | |||
|     float32x4_t ps = vmulq_n_f32(xx, c_sincof_p0); | |||
|     ps = vaddq_f32(ps, vdupq_n_f32(c_sincof_p1)); | |||
|     ps = vmulq_f32(ps, xx); | |||
|     ps = vaddq_f32(ps, vdupq_n_f32(c_sincof_p2)); | |||
|     ps = vmulq_f32(ps, xx); | |||
|     ps = vmulq_f32(ps, x); | |||
|     ps = vaddq_f32(ps, x); | |||
|     // pick the right polynomial per lane, then apply the signs | |||
|     const float32x4_t s = vbslq_f32(swap_polys, pc, ps); | |||
|     const float32x4_t c = vbslq_f32(swap_polys, ps, pc); | |||
|     *ysin = vbslq_f32(sin_negate, vnegq_f32(s), s); | |||
|     *ycos = vbslq_f32(cos_keep, c, vnegq_f32(c)); | |||
| } | |||
| static inline float32x4_t sin_ps(float32x4_t x) | |||
| { | |||
|     // sine of four floats; the cosine computed alongside by sincos_ps is dropped | |||
|     float32x4_t sine; | |||
|     float32x4_t cosine_unused; | |||
|     sincos_ps(x, &sine, &cosine_unused); | |||
|     return sine; | |||
| } | |||
| static inline float32x4_t cos_ps(float32x4_t x) | |||
| { | |||
|     // cosine of four floats; the sine computed alongside by sincos_ps is dropped | |||
|     float32x4_t sine_unused; | |||
|     float32x4_t cosine; | |||
|     sincos_ps(x, &sine_unused, &cosine); | |||
|     return cosine; | |||
| } | |||
| static inline float32x4_t div_ps(float32x4_t a, float32x4_t b) | |||
| { | |||
|     // Approximate a / b as a * (1 / b): take the hardware reciprocal estimate | |||
|     // of b and refine it with exactly one vrecps step. | |||
|     const float32x4_t estimate = vrecpeq_f32(b); | |||
|     const float32x4_t refined = vmulq_f32(vrecpsq_f32(b, estimate), estimate); | |||
|     return vmulq_f32(a, refined); | |||
| } | |||
| static inline float32x4_t pow_ps(float32x4_t a, float32x4_t b) | |||
| { | |||
|     // a^b evaluated per lane as exp(b * log(a)) | |||
|     const float32x4_t log_a = log_ps(a); | |||
|     const float32x4_t exponent = vmulq_f32(b, log_a); | |||
|     return exp_ps(exponent); | |||
| } | |||
| @@ -0,0 +1,112 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #if __ARM_NEON | |||
| #include <arm_neon.h> | |||
| #endif // __ARM_NEON | |||
| // 2x2 max pooling with stride 2. | |||
| // For every channel of bottom_blob, each output element is the maximum of a | |||
| // 2x2 window read from two consecutive input rows. top_blob must already be | |||
| // created with the output size; its w and h drive the loops. | |||
| static void pooling2x2s2_max_neon(const Mat& bottom_blob, Mat& top_blob) | |||
| { | |||
|     int w = bottom_blob.w; | |||
|     int inch = bottom_blob.c; | |||
|     int outw = top_blob.w; | |||
|     int outh = top_blob.h; | |||
|     // One output row consumes 2*outw floats from each input row. To reach the | |||
|     // row pair two rows further down, skip the unread rest of the current row | |||
|     // (w - 2*outw) plus one full row (w). This equals w when w == 2*outw and | |||
|     // stays correct when the input row is wider than 2*outw. | |||
|     const int tailstep = w - 2*outw + w; | |||
| #pragma omp parallel for | |||
|     for (int q=0; q<inch; q++) | |||
|     { | |||
|         const float* img0 = bottom_blob.channel(q); | |||
|         float* outptr = top_blob.channel(q); | |||
|         const float* r0 = img0; | |||
|         const float* r1 = img0 + w; | |||
|         for (int i = 0; i < outh; i++) | |||
|         { | |||
| #if __ARM_NEON | |||
|             // nn: groups of 4 outputs handled by the vector path, remain: scalar leftovers | |||
|             int nn = outw >> 2; | |||
|             int remain = outw - (nn << 2); | |||
| #else | |||
|             int remain = outw; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
|             for (; nn>0; nn--) | |||
|             { | |||
|                 float32x4_t _r00 = vld1q_f32(r0); | |||
|                 float32x4_t _r10 = vld1q_f32(r1); | |||
|                 float32x4_t _r01 = vld1q_f32(r0 + 4); | |||
|                 float32x4_t _r11 = vld1q_f32(r1 + 4); | |||
|                 // vertical maximum first, then pairwise maximum of horizontal neighbours | |||
|                 float32x4_t _max0 = vmaxq_f32(_r00, _r10); | |||
|                 float32x4_t _max1 = vmaxq_f32(_r01, _r11); | |||
|                 float32x4_t _max = vpmaxq_f32(_max0, _max1); | |||
|                 vst1q_f32(outptr, _max); | |||
|                 r0 += 8; | |||
|                 r1 += 8; | |||
|                 outptr += 4; | |||
|             } | |||
| #else | |||
|             if (nn > 0) | |||
|             { | |||
|                 // same computation as the intrinsics path: 8 floats of each row in, 4 maxima out | |||
|                 asm volatile( | |||
|                 "0: \n" | |||
|                 "pld [%1, #256] \n" | |||
|                 "pld [%2, #256] \n" | |||
|                 "vld1.f32 {d0-d3}, [%1]! \n" | |||
|                 "vld1.f32 {d4-d7}, [%2]! \n" | |||
|                 "vmax.f32 q0, q0, q2 \n" | |||
|                 "vmax.f32 q1, q1, q3 \n" | |||
|                 "vpmax.f32 d4, d0, d1 \n" | |||
|                 "vpmax.f32 d5, d2, d3 \n" | |||
|                 "subs %0, #1 \n" | |||
|                 "vst1.f32 {d4-d5}, [%3]! \n" | |||
|                 "bne 0b \n" | |||
|                 : "=r"(nn), // %0 | |||
|                 "=r"(r0), // %1 | |||
|                 "=r"(r1), // %2 | |||
|                 "=r"(outptr) // %3 | |||
|                 : "0"(nn), | |||
|                 "1"(r0), | |||
|                 "2"(r1), | |||
|                 "3"(outptr) | |||
|                 : "cc", "memory", "q0", "q1", "q2", "q3" | |||
|                 ); | |||
|             } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
|             for (; remain>0; remain--) | |||
|             { | |||
|                 float max0 = std::max(r0[0], r0[1]); | |||
|                 float max1 = std::max(r1[0], r1[1]); | |||
|                 *outptr = std::max(max0, max1); | |||
|                 r0 += 2; | |||
|                 r1 += 2; | |||
|                 outptr++; | |||
|             } | |||
|             r0 += tailstep; | |||
|             r1 += tailstep; | |||
|         } | |||
|     } | |||
| } | |||
| @@ -0,0 +1,170 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #if __ARM_NEON | |||
| #include <arm_neon.h> | |||
| #endif // __ARM_NEON | |||
| // 3x3 max pooling with stride 2. | |||
| // For every channel of bottom_blob, each output element is the maximum of a | |||
| // 3x3 window read from three consecutive input rows. top_blob must already be | |||
| // created with the output size; its w and h drive the loops. | |||
| static void pooling3x3s2_max_neon(const Mat& bottom_blob, Mat& top_blob) | |||
| { | |||
|     int w = bottom_blob.w; | |||
|     int inch = bottom_blob.c; | |||
|     int outw = top_blob.w; | |||
|     int outh = top_blob.h; | |||
|     // after one output row the pointers sit 2*outw floats into their input row; | |||
|     // skip the rest of that row plus one full row to land two rows further down | |||
|     const int tailstep = w - 2*outw + w; | |||
| #pragma omp parallel for | |||
|     for (int q=0; q<inch; q++) | |||
|     { | |||
|         const float* img0 = bottom_blob.channel(q); | |||
|         float* outptr = top_blob.channel(q); | |||
|         const float* r0 = img0; | |||
|         const float* r1 = img0 + w; | |||
|         const float* r2 = img0 + w*2; | |||
|         for (int i = 0; i < outh; i++) | |||
|         { | |||
| #if __ARM_NEON | |||
|             // nn: groups of 4 outputs handled by the vector path, remain: scalar leftovers | |||
|             int nn = outw >> 2; | |||
|             int remain = outw - (nn << 2); | |||
| #else | |||
|             int remain = outw; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
|             // Only touch the vector path when there is at least one full group, | |||
|             // like the ARMv7 assembly below: the preload reads 8 floats per row, | |||
|             // which is wasted work (and may run past a short row) when nn == 0. | |||
|             if (nn > 0) | |||
|             { | |||
|                 // de-interleaving load: val[0] = even columns, val[1] = odd columns | |||
|                 float32x4x2_t _r0 = vld2q_f32(r0); | |||
|                 float32x4x2_t _r1 = vld2q_f32(r1); | |||
|                 float32x4x2_t _r2 = vld2q_f32(r2); | |||
|                 for (; nn>0; nn--) | |||
|                 { | |||
|                     float32x4x2_t _r0n = vld2q_f32(r0+8); | |||
|                     float32x4x2_t _r1n = vld2q_f32(r1+8); | |||
|                     float32x4x2_t _r2n = vld2q_f32(r2+8); | |||
|                     // first two columns of each window | |||
|                     float32x4_t _max0 = vmaxq_f32(_r0.val[0], _r0.val[1]); | |||
|                     float32x4_t _max1 = vmaxq_f32(_r1.val[0], _r1.val[1]); | |||
|                     float32x4_t _max2 = vmaxq_f32(_r2.val[0], _r2.val[1]); | |||
|                     // third column: even columns shifted by one, borrowing from the next group | |||
|                     float32x4_t _r02 = vextq_f32(_r0.val[0], _r0n.val[0], 1); | |||
|                     float32x4_t _r12 = vextq_f32(_r1.val[0], _r1n.val[0], 1); | |||
|                     float32x4_t _r22 = vextq_f32(_r2.val[0], _r2n.val[0], 1); | |||
|                     _max0 = vmaxq_f32(_max0, _r02); | |||
|                     _max1 = vmaxq_f32(_max1, _r12); | |||
|                     _max2 = vmaxq_f32(_max2, _r22); | |||
|                     float32x4_t _max = vmaxq_f32(vmaxq_f32(_max0, _max1), _max2); | |||
|                     vst1q_f32(outptr, _max); | |||
|                     _r0 = _r0n; | |||
|                     _r1 = _r1n; | |||
|                     _r2 = _r2n; | |||
|                     r0 += 8; | |||
|                     r1 += 8; | |||
|                     r2 += 8; | |||
|                     outptr += 4; | |||
|                 } | |||
|             } | |||
| #else | |||
|             if (nn > 0) | |||
|             { | |||
|                 // The loads post-increment the row pointers, so after the loop they | |||
|                 // are one preloaded group (32 bytes) too far; the trailing "sub" | |||
|                 // instructions undo that. | |||
|                 asm volatile( | |||
|                 "pld [%1, #256] \n" | |||
|                 "vld2.f32 {d0-d3}, [%1]! \n"// q0 = 0 2 4 6 q1 = 1 3 5 7 | |||
|                 "pld [%2, #256] \n" | |||
|                 "vld2.f32 {d4-d7}, [%2]! \n" | |||
|                 "pld [%3, #256] \n" | |||
|                 "vld2.f32 {d8-d11}, [%3]! \n" | |||
|                 "0: \n" | |||
|                 "pld [%1, #256] \n" | |||
|                 "vld2.f32 {d12-d15}, [%1]! \n"// q6 = 8 10 12 14 q7 = 9 11 13 15 | |||
|                 "vmax.f32 q12, q0, q1 \n" | |||
|                 "vmax.f32 q13, q2, q3 \n" | |||
|                 "pld [%2, #256] \n" | |||
|                 "vld2.f32 {d16-d19}, [%2]! \n" | |||
|                 "vmax.f32 q14, q4, q5 \n" | |||
|                 "vext.32 q0, q0, q6, #1 \n" | |||
|                 "pld [%3, #256] \n" | |||
|                 "vld2.f32 {d20-d23}, [%3]! \n" | |||
|                 "vext.32 q2, q2, q8, #1 \n" | |||
|                 "vmax.f32 q12, q12, q0 \n" | |||
|                 "vext.32 q4, q4, q10, #1 \n" | |||
|                 "vmax.f32 q13, q13, q2 \n" | |||
|                 "vmax.f32 q14, q14, q4 \n" | |||
|                 "vmax.f32 q12, q12, q13 \n" | |||
|                 "vorr q0, q6, q6 \n" | |||
|                 "vorr q1, q7, q7 \n" | |||
|                 "vmax.f32 q12, q12, q14 \n" | |||
|                 "vorr q2, q8, q8 \n" | |||
|                 "vorr q3, q9, q9 \n" | |||
|                 "vorr q4, q10, q10 \n" | |||
|                 "vorr q5, q11, q11 \n" | |||
|                 "subs %0, #1 \n" | |||
|                 "vst1.f32 {d24-d25}, [%4]! \n" | |||
|                 "bne 0b \n" | |||
|                 "sub %1, #32 \n" | |||
|                 "sub %2, #32 \n" | |||
|                 "sub %3, #32 \n" | |||
|                 : "=r"(nn), // %0 | |||
|                 "=r"(r0), // %1 | |||
|                 "=r"(r1), // %2 | |||
|                 "=r"(r2), // %3 | |||
|                 "=r"(outptr) // %4 | |||
|                 : "0"(nn), | |||
|                 "1"(r0), | |||
|                 "2"(r1), | |||
|                 "3"(r2), | |||
|                 "4"(outptr) | |||
|                 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14" | |||
|                 ); | |||
|             } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
|             for (; remain>0; remain--) | |||
|             { | |||
|                 float max0 = std::max(std::max(r0[0], r0[1]), r0[2]); | |||
|                 float max1 = std::max(std::max(r1[0], r1[1]), r1[2]); | |||
|                 float max2 = std::max(std::max(r2[0], r2[1]), r2[2]); | |||
|                 *outptr = std::max(std::max(max0, max1), max2); | |||
|                 r0 += 2; | |||
|                 r1 += 2; | |||
|                 r2 += 2; | |||
|                 outptr++; | |||
|             } | |||
|             r0 += tailstep;//1 + w; | |||
|             r1 += tailstep;//1 + w; | |||
|             r2 += tailstep;//1 + w; | |||
|         } | |||
|     } | |||
| } | |||
| @@ -0,0 +1,96 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "pooling_arm.h" | |||
| namespace ncnn { | |||
| #include "pooling_2x2.h" | |||
| #include "pooling_3x3.h" | |||
| DEFINE_LAYER_CREATOR(Pooling_arm) | |||
| int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| { | |||
|     // Only non-global max pooling with stride 2 and a 2x2 or 3x3 window has a | |||
|     // specialised kernel here; every other configuration is handed to the | |||
|     // base class. Returns 0 on success and -100 when a Mat comes back empty. | |||
|     const bool specialised = pooling_type == PoolMethod_MAX && stride == 2 && global_pooling != 1 | |||
|                              && (kernel_size == 2 || kernel_size == 3); | |||
|     if (!specialised) | |||
|         return Pooling::forward(bottom_blob, top_blob); | |||
|     const int channels = bottom_blob.c; | |||
|     int w = bottom_blob.w; | |||
|     int h = bottom_blob.h; | |||
|     // explicit padding on all four sides | |||
|     // NOTE(review): the border is filled with 0.f although only max pooling | |||
|     // reaches this point -- confirm this matches Pooling::forward and is intended | |||
|     Mat padded = bottom_blob; | |||
|     if (pad > 0) | |||
|     { | |||
|         copy_make_border(bottom_blob, padded, pad, pad, pad, pad, BORDER_CONSTANT, 0.f); | |||
|         if (padded.empty()) | |||
|             return -100; | |||
|         w = padded.w; | |||
|         h = padded.h; | |||
|     } | |||
|     int outw = (w - kernel_size) / stride + 1; | |||
|     int outh = (h - kernel_size) / stride + 1; | |||
|     // columns / rows left over after the last full window | |||
|     const int wtail = (w - kernel_size) % stride; | |||
|     const int htail = (h - kernel_size) % stride; | |||
|     if (wtail != 0 || htail != 0) | |||
|     { | |||
|         // replicate the right / bottom edge so that one more window fits, | |||
|         // and account for that extra output column / row | |||
|         const int wtailpad = (wtail != 0) ? kernel_size - wtail : 0; | |||
|         const int htailpad = (htail != 0) ? kernel_size - htail : 0; | |||
|         Mat extended; | |||
|         copy_make_border(padded, extended, 0, htailpad, 0, wtailpad, BORDER_REPLICATE, 0.f); | |||
|         if (extended.empty()) | |||
|             return -100; | |||
|         padded = extended; | |||
|         outw += (wtail != 0) ? 1 : 0; | |||
|         outh += (htail != 0) ? 1 : 0; | |||
|     } | |||
|     top_blob.create(outw, outh, channels); | |||
|     if (top_blob.empty()) | |||
|         return -100; | |||
|     switch (kernel_size) | |||
|     { | |||
|     case 2: | |||
|         pooling2x2s2_max_neon(padded, top_blob); | |||
|         break; | |||
|     case 3: | |||
|         pooling3x3s2_max_neon(padded, top_blob); | |||
|         break; | |||
|     } | |||
|     return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,30 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_POOLING_ARM_H | |||
| #define LAYER_POOLING_ARM_H | |||
| #include "pooling.h" | |||
| namespace ncnn { | |||
| class Pooling_arm : public Pooling /* Pooling subclass that overrides forward() */ | |||
| { | |||
| public: | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; /* reads bottom_blob and fills top_blob; the definition returns 0 on success, -100 when a Mat it creates comes back empty, or whatever Pooling::forward returns for configurations it delegates */ | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_POOLING_ARM_H | |||
| @@ -0,0 +1,182 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "prelu_arm.h" | |||
| #if __ARM_NEON | |||
| #include <arm_neon.h> | |||
| #endif // __ARM_NEON | |||
| namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(PReLU_arm) | |||
| // Parametric ReLU, out-of-place: negative inputs are multiplied by a slope, | |||
| // other inputs are copied unchanged. With num_slope > 1 each channel uses | |||
| // its own slope, otherwise slope_data[0] is shared by all channels. | |||
| // Returns 0 on success and -100 when top_blob could not be created. | |||
| int PReLU_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| { | |||
|     int w = bottom_blob.w; | |||
|     int h = bottom_blob.h; | |||
|     int channels = bottom_blob.c; | |||
|     int size = w * h; | |||
|     top_blob.create(w, h, channels); | |||
|     if (top_blob.empty()) | |||
|         return -100; | |||
|     const float* slope_data_ptr = slope_data; | |||
| #pragma omp parallel for | |||
|     for (int q=0; q<channels; q++) | |||
|     { | |||
|         const float* ptr = bottom_blob.channel(q); | |||
|         float* outptr = top_blob.channel(q); | |||
|         float slope = num_slope > 1 ? slope_data_ptr[q] : slope_data_ptr[0]; | |||
| #if __ARM_NEON | |||
|         // nn: groups of 4 floats handled by the vector path, remain: scalar leftovers | |||
|         int nn = size >> 2; | |||
|         int remain = size - (nn << 2); | |||
| #else | |||
|         int remain = size; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
|         float32x4_t _zero = vdupq_n_f32(0.f); | |||
|         float32x4_t _slope = vdupq_n_f32(slope); | |||
|         for (; nn>0; nn--) | |||
|         { | |||
|             float32x4_t _p = vld1q_f32(ptr); | |||
|             // lanes that are <= 0 take p * slope, the others keep p | |||
|             uint32x4_t _lemask = vcleq_f32(_p, _zero); | |||
|             float32x4_t _ps = vmulq_f32(_p, _slope); | |||
|             float32x4_t _outp = vbslq_f32(_lemask, _ps, _p); | |||
|             vst1q_f32(outptr, _outp); | |||
|             ptr += 4; | |||
|             outptr += 4; | |||
|         } | |||
| #else | |||
|         if (nn > 0) | |||
|         { | |||
|             // Input and output are different buffers here, so the load must | |||
|             // post-increment %1 ("!") just like the store post-increments %2. | |||
|             // Without the writeback the input pointer never moves: every group | |||
|             // would be computed from the first four inputs and the scalar tail | |||
|             // below would start at the wrong position. | |||
|             asm volatile( | |||
|             "veor q1, q0, q0 \n" | |||
|             "vdup.f32 q2, %6 \n" | |||
|             "0: \n" | |||
|             "pld [%1, #128] \n" | |||
|             "vld1.f32 {d0-d1}, [%1 :128]! \n" | |||
|             "vcle.f32 q3, q0, q1 \n" | |||
|             "vmul.f32 q4, q0, q2 \n" | |||
|             "vbit.32 q0, q4, q3 \n" | |||
|             "subs %0, #1 \n" | |||
|             "vst1.f32 {d0-d1}, [%2 :128]! \n" | |||
|             "bne 0b \n" | |||
|             : "=r"(nn), // %0 | |||
|             "=r"(ptr), // %1 | |||
|             "=r"(outptr) // %2 | |||
|             : "0"(nn), | |||
|             "1"(ptr), | |||
|             "2"(outptr), | |||
|             "r"(slope) // %6 | |||
|             : "cc", "memory", "q0", "q1", "q2", "q3", "q4" | |||
|             ); | |||
|         } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
|         for (; remain>0; remain--) | |||
|         { | |||
|             if (*ptr < 0) | |||
|                 *outptr = *ptr * slope; | |||
|             else | |||
|                 *outptr = *ptr; | |||
|             ptr++; | |||
|             outptr++; | |||
|         } | |||
|     } | |||
|     return 0; | |||
| } | |||
| int PReLU_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| { | |||
|     // Parametric ReLU applied in place: negative values are multiplied by a | |||
|     // slope, other values are left untouched. With num_slope > 1 each | |||
|     // channel has its own slope, otherwise slope_data[0] is used. Always returns 0. | |||
|     const int channels = bottom_top_blob.c; | |||
|     const int size = bottom_top_blob.w * bottom_top_blob.h; | |||
|     const float* slope_data_ptr = slope_data; | |||
| #pragma omp parallel for | |||
|     for (int q=0; q<channels; q++) | |||
|     { | |||
|         float* ptr = bottom_top_blob.channel(q); | |||
|         const float slope = slope_data_ptr[num_slope > 1 ? q : 0]; | |||
| #if __ARM_NEON | |||
|         // vector path: groups of 4 floats; the scalar loop below takes the rest | |||
|         int nn = size >> 2; | |||
|         const int remain = size - (nn << 2); | |||
| #if __aarch64__ | |||
|         const float32x4_t _zero = vdupq_n_f32(0.f); | |||
|         const float32x4_t _slope = vdupq_n_f32(slope); | |||
|         for (int g = 0; g < nn; g++) | |||
|         { | |||
|             const float32x4_t _p = vld1q_f32(ptr); | |||
|             const float32x4_t _scaled = vmulq_f32(_p, _slope); | |||
|             // lanes that are <= 0 take the scaled value, the others keep the input | |||
|             vst1q_f32(ptr, vbslq_f32(vcleq_f32(_p, _zero), _scaled, _p)); | |||
|             ptr += 4; | |||
|         } | |||
| #else | |||
|         if (nn > 0) | |||
|         { | |||
|             // load without writeback, store with writeback: both use %1, | |||
|             // so the pointer advances exactly once per group | |||
|             asm volatile( | |||
|             "veor q1, q0, q0 \n" | |||
|             "vdup.f32 q2, %4 \n" | |||
|             "0: \n" | |||
|             "pld [%1, #128] \n" | |||
|             "vld1.f32 {d0-d1}, [%1 :128] \n" | |||
|             "vcle.f32 q3, q0, q1 \n" | |||
|             "vmul.f32 q4, q0, q2 \n" | |||
|             "vbit.32 q0, q4, q3 \n" | |||
|             "subs %0, #1 \n" | |||
|             "vst1.f32 {d0-d1}, [%1 :128]! \n" | |||
|             "bne 0b \n" | |||
|             : "=r"(nn), // %0 | |||
|             "=r"(ptr) // %1 | |||
|             : "0"(nn), | |||
|             "1"(ptr), | |||
|             "r"(slope) // %4 | |||
|             : "cc", "memory", "q0", "q1", "q2", "q3", "q4" | |||
|             ); | |||
|         } | |||
| #endif // __aarch64__ | |||
| #else | |||
|         const int remain = size; | |||
| #endif // __ARM_NEON | |||
|         for (int k = 0; k < remain; k++) | |||
|         { | |||
|             if (ptr[k] < 0) | |||
|                 ptr[k] *= slope; | |||
|         } | |||
|     } | |||
|     return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,32 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_PRELU_ARM_H | |||
| #define LAYER_PRELU_ARM_H | |||
| #include "prelu.h" | |||
| namespace ncnn { | |||
| class PReLU_arm : public PReLU /* PReLU subclass that overrides both forward variants */ | |||
| { | |||
| public: | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; /* out-of-place variant: creates top_blob; the definition returns 0 on success or -100 when top_blob is empty after create() */ | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; /* in-place variant: rewrites bottom_top_blob; the definition always returns 0 */ | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_PRELU_ARM_H | |||
| @@ -0,0 +1,295 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "relu_arm.h" | |||
| #if __ARM_NEON | |||
| #include <arm_neon.h> | |||
| #endif // __ARM_NEON | |||
| namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(ReLU_arm) | |||
| // Out-of-place ReLU / leaky ReLU over every channel of bottom_blob. | |||
| // | |||
| // top_blob is (re)created with the same w, h and c as bottom_blob. When slope is | |||
| // exactly 0 each output is max(x, 0); otherwise non-positive inputs are | |||
| // multiplied by slope and all other inputs are copied unchanged. | |||
| // | |||
| // Returns 0 on success, -100 if top_blob is empty after create(). | |||
| int ReLU_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| { | |||
|     int w = bottom_blob.w; | |||
|     int h = bottom_blob.h; | |||
|     int channels = bottom_blob.c; | |||
|     int size = w * h; | |||
|     top_blob.create(w, h, channels); | |||
|     if (top_blob.empty()) | |||
|         return -100; | |||
|     if (slope == 0.f) | |||
|     { | |||
|         // plain ReLU: out = max(in, 0) | |||
|         #pragma omp parallel for | |||
|         for (int q=0; q<channels; q++) | |||
|         { | |||
|             const float* ptr = bottom_blob.channel(q); | |||
|             float* outptr = top_blob.channel(q); | |||
| #if __ARM_NEON | |||
|             // nn = number of 4-float vectors, remain = scalar leftovers | |||
|             int nn = size >> 2; | |||
|             int remain = size - (nn << 2); | |||
| #else | |||
|             int remain = size; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
|             float32x4_t _zero = vdupq_n_f32(0.f); | |||
|             for (; nn>0; nn--) | |||
|             { | |||
|                 float32x4_t _p = vld1q_f32(ptr); | |||
|                 float32x4_t _outp = vmaxq_f32(_p, _zero); | |||
|                 vst1q_f32(outptr, _outp); | |||
|                 ptr += 4; | |||
|                 outptr += 4; | |||
|             } | |||
| #else | |||
|             if (nn > 0) | |||
|             { | |||
|                 // q1 = 0; each iteration loads 4 floats, clamps them against q1 | |||
|                 // and stores them, post-incrementing both pointers. | |||
|                 asm volatile( | |||
|                     "veor q1, q0, q0 \n" | |||
|                     "0: \n" | |||
|                     "pld [%1, #128] \n" | |||
|                     "vld1.f32 {d0-d1}, [%1 :128]! \n" | |||
|                     "vmax.f32 q0, q0, q1 \n" | |||
|                     "subs %0, #1 \n" | |||
|                     "vst1.f32 {d0-d1}, [%2 :128]! \n" | |||
|                     "bne 0b \n" | |||
|                     : "=r"(nn), // %0 | |||
|                       "=r"(ptr), // %1 | |||
|                       "=r"(outptr) // %2 | |||
|                     : "0"(nn), | |||
|                       "1"(ptr), | |||
|                       "2"(outptr) | |||
|                     : "cc", "memory", "q0", "q1" | |||
|                 ); | |||
|             } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
|             // scalar tail (or the whole channel when NEON is unavailable) | |||
|             for (; remain>0; remain--) | |||
|             { | |||
|                 *outptr = std::max(*ptr, 0.f); | |||
|                 ptr++; | |||
|                 outptr++; | |||
|             } | |||
|         } | |||
|     } | |||
|     else | |||
|     { | |||
|         // leaky ReLU: out = in * slope where in <= 0, otherwise out = in | |||
|         #pragma omp parallel for | |||
|         for (int q=0; q<channels; q++) | |||
|         { | |||
|             const float* ptr = bottom_blob.channel(q); | |||
|             float* outptr = top_blob.channel(q); | |||
| #if __ARM_NEON | |||
|             int nn = size >> 2; | |||
|             int remain = size - (nn << 2); | |||
| #else | |||
|             int remain = size; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
|             float32x4_t _zero = vdupq_n_f32(0.f); | |||
|             float32x4_t _slope = vdupq_n_f32(slope); | |||
|             for (; nn>0; nn--) | |||
|             { | |||
|                 float32x4_t _p = vld1q_f32(ptr); | |||
|                 // lanes with p <= 0 take p * slope, the others keep p | |||
|                 uint32x4_t _lemask = vcleq_f32(_p, _zero); | |||
|                 float32x4_t _ps = vmulq_f32(_p, _slope); | |||
|                 float32x4_t _outp = vbslq_f32(_lemask, _ps, _p); | |||
|                 vst1q_f32(outptr, _outp); | |||
|                 ptr += 4; | |||
|                 outptr += 4; | |||
|             } | |||
| #else | |||
|             if (nn > 0) | |||
|             { | |||
|                 // q1 = 0, q2 = slope broadcast. The load must post-increment %1 | |||
|                 // ("!"): input and output are distinct buffers here, so without | |||
|                 // the writeback the source pointer would never advance and every | |||
|                 // iteration would re-read the first four floats. | |||
|                 asm volatile( | |||
|                     "veor q1, q0, q0 \n" | |||
|                     "vdup.f32 q2, %6 \n" | |||
|                     "0: \n" | |||
|                     "pld [%1, #128] \n" | |||
|                     "vld1.f32 {d0-d1}, [%1 :128]! \n" | |||
|                     "vcle.f32 q3, q0, q1 \n" | |||
|                     "vmul.f32 q4, q0, q2 \n" | |||
|                     "vbit.32 q0, q4, q3 \n" | |||
|                     "subs %0, #1 \n" | |||
|                     "vst1.f32 {d0-d1}, [%2 :128]! \n" | |||
|                     "bne 0b \n" | |||
|                     : "=r"(nn), // %0 | |||
|                       "=r"(ptr), // %1 | |||
|                       "=r"(outptr) // %2 | |||
|                     : "0"(nn), | |||
|                       "1"(ptr), | |||
|                       "2"(outptr), | |||
|                       "r"(slope) // %6 | |||
|                     : "cc", "memory", "q0", "q1", "q2", "q3", "q4" | |||
|                 ); | |||
|             } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
|             // scalar tail (or the whole channel when NEON is unavailable) | |||
|             for (; remain>0; remain--) | |||
|             { | |||
|                 if (*ptr < 0) | |||
|                     *outptr = *ptr * slope; | |||
|                 else | |||
|                     *outptr = *ptr; | |||
|                 ptr++; | |||
|                 outptr++; | |||
|             } | |||
|         } | |||
|     } | |||
|     return 0; | |||
| } | |||
| // In-place ReLU / leaky ReLU: rewrites every channel of bottom_top_blob. | |||
| // A slope of exactly 0 clamps each value with max(x, 0); any other slope | |||
| // multiplies the non-positive values by slope. Always returns 0. | |||
| int ReLU_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| { | |||
|     const int channels = bottom_top_blob.c; | |||
|     const int size = bottom_top_blob.w * bottom_top_blob.h; | |||
|     if (slope != 0.f) | |||
|     { | |||
|         // leaky variant | |||
|         #pragma omp parallel for | |||
|         for (int q = 0; q < channels; q++) | |||
|         { | |||
|             float* ptr = bottom_top_blob.channel(q); | |||
| #if __ARM_NEON | |||
|             // vectors of 4 floats first, scalar leftovers afterwards | |||
|             int nn = size >> 2; | |||
|             int remain = size - (nn << 2); | |||
| #if __aarch64__ | |||
|             const float32x4_t _zero = vdupq_n_f32(0.f); | |||
|             const float32x4_t _slope = vdupq_n_f32(slope); | |||
|             while (nn > 0) | |||
|             { | |||
|                 float32x4_t _p = vld1q_f32(ptr); | |||
|                 const uint32x4_t _lemask = vcleq_f32(_p, _zero); | |||
|                 const float32x4_t _ps = vmulq_f32(_p, _slope); | |||
|                 // pick the scaled lane wherever p <= 0 | |||
|                 _p = vbslq_f32(_lemask, _ps, _p); | |||
|                 vst1q_f32(ptr, _p); | |||
|                 ptr += 4; | |||
|                 nn--; | |||
|             } | |||
| #else | |||
|             if (nn > 0) | |||
|             { | |||
|                 // load has no writeback; the store to the same register does, | |||
|                 // so %1 advances exactly once per iteration | |||
|                 asm volatile( | |||
|                     "veor q1, q0, q0 \n" | |||
|                     "vdup.f32 q2, %4 \n" | |||
|                     "0: \n" | |||
|                     "pld [%1, #128] \n" | |||
|                     "vld1.f32 {d0-d1}, [%1 :128] \n" | |||
|                     "vcle.f32 q3, q0, q1 \n" | |||
|                     "vmul.f32 q4, q0, q2 \n" | |||
|                     "vbit.32 q0, q4, q3 \n" | |||
|                     "subs %0, #1 \n" | |||
|                     "vst1.f32 {d0-d1}, [%1 :128]! \n" | |||
|                     "bne 0b \n" | |||
|                     : "=r"(nn), // %0 | |||
|                       "=r"(ptr) // %1 | |||
|                     : "0"(nn), | |||
|                       "1"(ptr), | |||
|                       "r"(slope) // %4 | |||
|                     : "cc", "memory", "q0", "q1", "q2", "q3", "q4" | |||
|                 ); | |||
|             } | |||
| #endif // __aarch64__ | |||
| #else | |||
|             int remain = size; | |||
| #endif // __ARM_NEON | |||
|             while (remain > 0) | |||
|             { | |||
|                 if (*ptr < 0) | |||
|                     *ptr *= slope; | |||
|                 ptr++; | |||
|                 remain--; | |||
|             } | |||
|         } | |||
|         return 0; | |||
|     } | |||
|     // plain ReLU variant (slope == 0) | |||
|     #pragma omp parallel for | |||
|     for (int q = 0; q < channels; q++) | |||
|     { | |||
|         float* ptr = bottom_top_blob.channel(q); | |||
| #if __ARM_NEON | |||
|         int nn = size >> 2; | |||
|         int remain = size - (nn << 2); | |||
| #if __aarch64__ | |||
|         const float32x4_t _zero = vdupq_n_f32(0.f); | |||
|         while (nn > 0) | |||
|         { | |||
|             float32x4_t _p = vld1q_f32(ptr); | |||
|             _p = vmaxq_f32(_p, _zero); | |||
|             vst1q_f32(ptr, _p); | |||
|             ptr += 4; | |||
|             nn--; | |||
|         } | |||
| #else | |||
|         if (nn > 0) | |||
|         { | |||
|             asm volatile( | |||
|                 "veor q1, q0, q0 \n" | |||
|                 "0: \n" | |||
|                 "pld [%1, #128] \n" | |||
|                 "vld1.f32 {d0-d1}, [%1 :128] \n" | |||
|                 "vmax.f32 q0, q0, q1 \n" | |||
|                 "subs %0, #1 \n" | |||
|                 "vst1.f32 {d0-d1}, [%1 :128]! \n" | |||
|                 "bne 0b \n" | |||
|                 : "=r"(nn), // %0 | |||
|                   "=r"(ptr) // %1 | |||
|                 : "0"(nn), | |||
|                   "1"(ptr) | |||
|                 : "cc", "memory", "q0", "q1" | |||
|             ); | |||
|         } | |||
| #endif // __aarch64__ | |||
| #else | |||
|         int remain = size; | |||
| #endif // __ARM_NEON | |||
|         while (remain > 0) | |||
|         { | |||
|             *ptr = std::max(*ptr, 0.f); | |||
|             ptr++; | |||
|             remain--; | |||
|         } | |||
|     } | |||
|     return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,32 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_RELU_ARM_H | |||
| #define LAYER_RELU_ARM_H | |||
| #include "relu.h" | |||
| namespace ncnn { | |||
| // ReLU subclass declaring its own forward() and forward_inplace(); the | |||
| // definitions use NEON code when __ARM_NEON is set. | |||
| class ReLU_arm : public ReLU | |||
| { | |||
| public: | |||
|     // Reads bottom_blob and writes the activated values into top_blob. | |||
|     virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
|     // Applies the activation directly on bottom_top_blob. | |||
|     virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_RELU_ARM_H | |||
| @@ -0,0 +1,211 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "scale_arm.h" | |||
| #if __ARM_NEON | |||
| #include <arm_neon.h> | |||
| #endif // __ARM_NEON | |||
| namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(Scale_arm) | |||
| // Out-of-place per-channel scaling. | |||
| // | |||
| // For channel q every element becomes x * scale_data[q], plus bias_data[q] | |||
| // when bias_term is set. top_blob is created with the shape of bottom_blob. | |||
| // | |||
| // Returns 0 on success, -100 if top_blob is empty after create(). | |||
| int Scale_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| { | |||
|     const int channels = bottom_blob.c; | |||
|     const int size = bottom_blob.w * bottom_blob.h; | |||
|     top_blob.create(bottom_blob.w, bottom_blob.h, channels); | |||
|     if (top_blob.empty()) | |||
|         return -100; | |||
|     const float* scale_ptr = scale_data; | |||
|     if (!bias_term) | |||
|     { | |||
|         // multiply only | |||
|         #pragma omp parallel for | |||
|         for (int q = 0; q < channels; q++) | |||
|         { | |||
|             const float* ptr = bottom_blob.channel(q); | |||
|             float* outptr = top_blob.channel(q); | |||
|             const float s = scale_ptr[q]; | |||
| #if __ARM_NEON | |||
|             const int nn = size >> 2; | |||
|             const int remain = size - (nn << 2); | |||
|             const float32x4_t _s = vdupq_n_f32(s); | |||
|             for (int i = 0; i < nn; i++) | |||
|             { | |||
|                 vst1q_f32(outptr, vmulq_f32(vld1q_f32(ptr), _s)); | |||
|                 ptr += 4; | |||
|                 outptr += 4; | |||
|             } | |||
| #else | |||
|             const int remain = size; | |||
| #endif // __ARM_NEON | |||
|             for (int i = 0; i < remain; i++) | |||
|             { | |||
|                 outptr[i] = ptr[i] * s; | |||
|             } | |||
|         } | |||
|         return 0; | |||
|     } | |||
|     // multiply, then add the per-channel bias | |||
|     const float* bias_ptr = bias_data; | |||
|     #pragma omp parallel for | |||
|     for (int q = 0; q < channels; q++) | |||
|     { | |||
|         const float* ptr = bottom_blob.channel(q); | |||
|         float* outptr = top_blob.channel(q); | |||
|         const float s = scale_ptr[q]; | |||
|         const float bias = bias_ptr[q]; | |||
| #if __ARM_NEON | |||
|         const int nn = size >> 2; | |||
|         const int remain = size - (nn << 2); | |||
|         const float32x4_t _s = vdupq_n_f32(s); | |||
|         const float32x4_t _bias = vdupq_n_f32(bias); | |||
|         for (int i = 0; i < nn; i++) | |||
|         { | |||
|             // _bias + _p * _s in one multiply-accumulate | |||
|             const float32x4_t _p = vld1q_f32(ptr); | |||
|             vst1q_f32(outptr, vmlaq_f32(_bias, _p, _s)); | |||
|             ptr += 4; | |||
|             outptr += 4; | |||
|         } | |||
| #else | |||
|         const int remain = size; | |||
| #endif // __ARM_NEON | |||
|         for (int i = 0; i < remain; i++) | |||
|         { | |||
|             outptr[i] = ptr[i] * s + bias; | |||
|         } | |||
|     } | |||
|     return 0; | |||
| } | |||
| // In-place per-channel scaling: every element of channel q is replaced by | |||
| // x * scale_data[q], plus bias_data[q] when bias_term is set. Always returns 0. | |||
| int Scale_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| { | |||
|     const int channels = bottom_top_blob.c; | |||
|     const int size = bottom_top_blob.w * bottom_top_blob.h; | |||
|     const float* scale_ptr = scale_data; | |||
|     if (!bias_term) | |||
|     { | |||
|         // multiply only | |||
|         #pragma omp parallel for | |||
|         for (int q = 0; q < channels; q++) | |||
|         { | |||
|             float* ptr = bottom_top_blob.channel(q); | |||
|             const float s = scale_ptr[q]; | |||
| #if __ARM_NEON | |||
|             const int nn = size >> 2; | |||
|             const int remain = size - (nn << 2); | |||
|             const float32x4_t _s = vdupq_n_f32(s); | |||
|             for (int i = 0; i < nn; i++) | |||
|             { | |||
|                 vst1q_f32(ptr, vmulq_f32(vld1q_f32(ptr), _s)); | |||
|                 ptr += 4; | |||
|             } | |||
| #else | |||
|             const int remain = size; | |||
| #endif // __ARM_NEON | |||
|             for (int i = 0; i < remain; i++) | |||
|             { | |||
|                 ptr[i] *= s; | |||
|             } | |||
|         } | |||
|         return 0; | |||
|     } | |||
|     // multiply, then add the per-channel bias | |||
|     const float* bias_ptr = bias_data; | |||
|     #pragma omp parallel for | |||
|     for (int q = 0; q < channels; q++) | |||
|     { | |||
|         float* ptr = bottom_top_blob.channel(q); | |||
|         const float s = scale_ptr[q]; | |||
|         const float bias = bias_ptr[q]; | |||
| #if __ARM_NEON | |||
|         const int nn = size >> 2; | |||
|         const int remain = size - (nn << 2); | |||
|         const float32x4_t _s = vdupq_n_f32(s); | |||
|         const float32x4_t _bias = vdupq_n_f32(bias); | |||
|         for (int i = 0; i < nn; i++) | |||
|         { | |||
|             const float32x4_t _p = vld1q_f32(ptr); | |||
|             vst1q_f32(ptr, vmlaq_f32(_bias, _p, _s)); | |||
|             ptr += 4; | |||
|         } | |||
| #else | |||
|         const int remain = size; | |||
| #endif // __ARM_NEON | |||
|         for (int i = 0; i < remain; i++) | |||
|         { | |||
|             ptr[i] = ptr[i] * s + bias; | |||
|         } | |||
|     } | |||
|     return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,32 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_SCALE_ARM_H | |||
| #define LAYER_SCALE_ARM_H | |||
| #include "scale.h" | |||
| namespace ncnn { | |||
| // Scale subclass declaring its own forward() and forward_inplace(); the | |||
| // definitions use NEON intrinsics when __ARM_NEON is set. | |||
| class Scale_arm : public Scale | |||
| { | |||
| public: | |||
|     // Reads bottom_blob and writes the scaled values into top_blob. | |||
|     virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
|     // Scales bottom_top_blob directly. | |||
|     virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_SCALE_ARM_H | |||
| @@ -0,0 +1,127 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "sigmoid_arm.h" | |||
| #if __ARM_NEON | |||
| #include <arm_neon.h> | |||
| #include "neon_mathfun.h" | |||
| #endif // __ARM_NEON | |||
| #include <math.h> | |||
| namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(Sigmoid_arm) | |||
| // Out-of-place sigmoid: top = 1 / (1 + exp(-bottom)) for every element. | |||
| // | |||
| // The NEON path computes the reciprocal with vrecpeq_f32 followed by one | |||
| // vrecpsq_f32 refinement step; leftover elements use the scalar formula. | |||
| // | |||
| // Returns 0 on success, -100 if top_blob is empty after create(). | |||
| int Sigmoid_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| { | |||
|     const int channels = bottom_blob.c; | |||
|     const int size = bottom_blob.w * bottom_blob.h; | |||
|     top_blob.create(bottom_blob.w, bottom_blob.h, channels); | |||
|     if (top_blob.empty()) | |||
|         return -100; | |||
|     #pragma omp parallel for | |||
|     for (int q = 0; q < channels; q++) | |||
|     { | |||
|         const float* ptr = bottom_blob.channel(q); | |||
|         float* outptr = top_blob.channel(q); | |||
| #if __ARM_NEON | |||
|         const int nn = size >> 2; | |||
|         const int remain = size - (nn << 2); | |||
|         const float32x4_t _one = vdupq_n_f32(1.f); | |||
|         for (int i = 0; i < nn; i++) | |||
|         { | |||
|             // _denom = 1 + exp(-x) | |||
|             const float32x4_t _x = vld1q_f32(ptr); | |||
|             const float32x4_t _denom = vaddq_f32(exp_ps(vnegq_f32(_x)), _one); | |||
|             // reciprocal estimate, refined once | |||
|             float32x4_t _outp = vrecpeq_f32(_denom); | |||
|             _outp = vmulq_f32(vrecpsq_f32(_denom, _outp), _outp); | |||
|             vst1q_f32(outptr, _outp); | |||
|             ptr += 4; | |||
|             outptr += 4; | |||
|         } | |||
| #else | |||
|         const int remain = size; | |||
| #endif // __ARM_NEON | |||
|         for (int i = 0; i < remain; i++) | |||
|         { | |||
|             outptr[i] = 1.f / (1.f + exp(-ptr[i])); | |||
|         } | |||
|     } | |||
|     return 0; | |||
| } | |||
| // In-place sigmoid: every element x of bottom_top_blob becomes | |||
| // 1 / (1 + exp(-x)). Always returns 0. | |||
| // | |||
| // The NEON path computes the reciprocal with vrecpeq_f32 followed by one | |||
| // vrecpsq_f32 refinement step; leftover elements use the scalar formula. | |||
| int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| { | |||
|     int w = bottom_top_blob.w; | |||
|     int h = bottom_top_blob.h; | |||
|     int channels = bottom_top_blob.c; | |||
|     int size = w * h; | |||
|     #pragma omp parallel for | |||
|     for (int q=0; q<channels; q++) | |||
|     { | |||
|         float* ptr = bottom_top_blob.channel(q); | |||
| #if __ARM_NEON | |||
|         // nn = number of 4-float vectors, remain = scalar leftovers | |||
|         int nn = size >> 2; | |||
|         int remain = size - (nn << 2); | |||
| #else | |||
|         int remain = size; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
|         float32x4_t _one = vdupq_n_f32(1.f); | |||
|         for (; nn>0; nn--) | |||
|         { | |||
|             float32x4_t _p = vld1q_f32(ptr); | |||
|             _p = vnegq_f32(_p); | |||
|             _p = exp_ps(_p); | |||
|             _p = vaddq_f32(_p, _one); | |||
|             // _p is now the denominator d = 1 + exp(-x). The refinement step | |||
|             // needs both d and the current estimate, so the estimate lives in | |||
|             // its own register instead of overwriting _p. | |||
|             float32x4_t _outp = vrecpeq_f32(_p); | |||
|             _outp = vmulq_f32(vrecpsq_f32(_p, _outp), _outp); | |||
|             vst1q_f32(ptr, _outp); | |||
|             ptr += 4; | |||
|         } | |||
| #endif // __ARM_NEON | |||
|         // scalar tail (or the whole channel when NEON is unavailable) | |||
|         for (; remain>0; remain--) | |||
|         { | |||
|             *ptr = 1.f / (1.f + exp(-*ptr)); | |||
|             ptr++; | |||
|         } | |||
|     } | |||
|     return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,32 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_SIGMOID_ARM_H | |||
| #define LAYER_SIGMOID_ARM_H | |||
| #include "sigmoid.h" | |||
| namespace ncnn { | |||
| // Sigmoid subclass declaring its own forward() and forward_inplace(); the | |||
| // definitions use NEON intrinsics when __ARM_NEON is set. | |||
| class Sigmoid_arm : public Sigmoid | |||
| { | |||
| public: | |||
|     // Reads bottom_blob and writes the sigmoid values into top_blob. | |||
|     virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
|     // Applies the sigmoid directly on bottom_top_blob. | |||
|     virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_SIGMOID_ARM_H | |||
| @@ -0,0 +1,102 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "slice_arm.h" | |||
| #if __ARM_NEON | |||
| #include <arm_neon.h> | |||
| #endif // __ARM_NEON | |||
| namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(Slice_arm) | |||
| // Splits bottom_blobs[0] along the channel axis into top_blobs. | |||
| // | |||
| // Output i receives slices[i] consecutive channels, starting where output | |||
| // i-1 stopped. A slice value of -233 means "take an even share of the channels | |||
| // that are still left", i.e. (channels - q) / (number of outputs left). | |||
| // Each output is copied as one contiguous run of cstep * slice floats. | |||
| // | |||
| // Returns 0 on success, -100 if an output blob is empty after create(). | |||
| int Slice_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const | |||
| { | |||
|     const Mat& bottom_blob = bottom_blobs[0]; | |||
|     int w = bottom_blob.w; | |||
|     int h = bottom_blob.h; | |||
|     int channels = bottom_blob.c; | |||
|     // q = index of the first input channel not yet handed out | |||
|     int q = 0; | |||
|     const int* slices_ptr = (const int*)slices.data; | |||
|     for (size_t i=0; i<top_blobs.size(); i++) | |||
|     { | |||
|         int slice = slices_ptr[i]; | |||
|         if (slice == -233) | |||
|         { | |||
|             slice = (channels - q) / (top_blobs.size() - i); | |||
|         } | |||
|         Mat& top_blob = top_blobs[i]; | |||
|         top_blob.create(w, h, slice); | |||
|         if (top_blob.empty()) | |||
|             return -100; | |||
|         int size = bottom_blob.cstep * slice; | |||
|         const float* ptr = bottom_blob.channel(q); | |||
|         float* outptr = top_blob.data; | |||
| #if __ARM_NEON | |||
|         // nn = number of 8-float groups, remain = scalar leftovers | |||
|         int nn = size >> 3; | |||
|         int remain = size - (nn << 3); | |||
| #else | |||
|         int remain = size; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
|         for (; nn>0; nn--) | |||
|         { | |||
|             float32x4_t _p = vld1q_f32(ptr); | |||
|             float32x4_t _p2 = vld1q_f32(ptr+4); | |||
|             vst1q_f32(outptr, _p); | |||
|             vst1q_f32(outptr+4, _p2); | |||
|             ptr += 8; | |||
|             outptr += 8; | |||
|         } | |||
| #else | |||
|         if (nn > 0) | |||
|         { | |||
|             // d0-d3 are the two quad registers q0 and q1, so both are listed | |||
|             // as clobbered. | |||
|             asm volatile( | |||
|                 "0: \n" | |||
|                 "pld [%1, #256] \n" | |||
|                 "vld1.f32 {d0-d3}, [%1 :128]! \n" | |||
|                 "subs %0, #1 \n" | |||
|                 "vst1.f32 {d0-d3}, [%2 :128]! \n" | |||
|                 "bne 0b \n" | |||
|                 : "=r"(nn), // %0 | |||
|                   "=r"(ptr), // %1 | |||
|                   "=r"(outptr) // %2 | |||
|                 : "0"(nn), | |||
|                   "1"(ptr), | |||
|                   "2"(outptr) | |||
|                 : "cc", "memory", "q0", "q1" | |||
|             ); | |||
|         } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
|         // scalar tail (or the whole copy when NEON is unavailable) | |||
|         for (; remain>0; remain--) | |||
|         { | |||
|             *outptr++ = *ptr++; | |||
|         } | |||
|         q += slice; | |||
|     } | |||
|     return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,30 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_SLICE_ARM_H | |||
| #define LAYER_SLICE_ARM_H | |||
| #include "slice.h" | |||
| namespace ncnn { | |||
| // Slice subclass declaring its own multi-blob forward(); the definition uses | |||
| // NEON code when __ARM_NEON is set. | |||
| class Slice_arm : public Slice | |||
| { | |||
| public: | |||
|     // Splits bottom_blobs[0] by channel into the entries of top_blobs. | |||
|     virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const; | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_SLICE_ARM_H | |||
| @@ -0,0 +1,302 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "softmax_arm.h" | |||
| #include <float.h> | |||
| #include <math.h> | |||
| #if __ARM_NEON | |||
| #include <arm_neon.h> | |||
| #include "neon_mathfun.h" | |||
| #endif // __ARM_NEON | |||
| namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(Softmax_arm) | |||
| // Out-of-place softmax across the channel axis, computed independently for | |||
| // each of the w*h positions i: | |||
| //   1. m[i]    = max over channels q of x[q][i] | |||
| //   2. e[q][i] = exp(x[q][i] - m[i])      (written to top_blob) | |||
| //   3. s[i]    = sum over channels q of e[q][i] | |||
| //   4. y[q][i] = e[q][i] / s[i] | |||
| // | |||
| // Returns 0 on success, -100 if top_blob or one of the two w*h scratch blobs | |||
| // is empty after create(). | |||
| int Softmax_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| { | |||
|     const int w = bottom_blob.w; | |||
|     const int h = bottom_blob.h; | |||
|     const int channels = bottom_blob.c; | |||
|     const int size = w * h; | |||
|     top_blob.create(w, h, channels); | |||
|     if (top_blob.empty()) | |||
|         return -100; | |||
|     // step 1: per-position maximum over all channels | |||
|     Mat max_map; | |||
|     max_map.create(w, h); | |||
|     if (max_map.empty()) | |||
|         return -100; | |||
|     max_map.fill(-FLT_MAX); | |||
|     for (int q = 0; q < channels; q++) | |||
|     { | |||
|         const float* ptr = bottom_blob.channel(q); | |||
|         float* maxptr = max_map; | |||
|         for (int i = 0; i < size; i++) | |||
|         { | |||
|             if (maxptr[i] < ptr[i]) | |||
|                 maxptr[i] = ptr[i]; | |||
|         } | |||
|     } | |||
|     // step 2: exp(x - max) into top_blob | |||
|     #pragma omp parallel for | |||
|     for (int q = 0; q < channels; q++) | |||
|     { | |||
|         const float* ptr = bottom_blob.channel(q); | |||
|         float* outptr = top_blob.channel(q); | |||
|         float* maxptr = max_map; | |||
| #if __ARM_NEON | |||
|         const int nn = size >> 2; | |||
|         const int remain = size - (nn << 2); | |||
|         for (int i = 0; i < nn; i++) | |||
|         { | |||
|             const float32x4_t _p = vld1q_f32(ptr); | |||
|             const float32x4_t _max = vld1q_f32(maxptr); | |||
|             vst1q_f32(outptr, exp_ps(vsubq_f32(_p, _max))); | |||
|             ptr += 4; | |||
|             maxptr += 4; | |||
|             outptr += 4; | |||
|         } | |||
| #else | |||
|         const int remain = size; | |||
| #endif // __ARM_NEON | |||
|         for (int i = 0; i < remain; i++) | |||
|         { | |||
|             outptr[i] = exp(ptr[i] - maxptr[i]); | |||
|         } | |||
|     } | |||
|     // step 3: per-position sum of the exponentials | |||
|     Mat sum_map; | |||
|     sum_map.create(w, h); | |||
|     if (sum_map.empty()) | |||
|         return -100; | |||
|     sum_map.fill(0.f); | |||
|     for (int q = 0; q < channels; q++) | |||
|     { | |||
|         const float* outptr = top_blob.channel(q); | |||
|         float* sumptr = sum_map; | |||
|         for (int i = 0; i < size; i++) | |||
|         { | |||
|             sumptr[i] += outptr[i]; | |||
|         } | |||
|     } | |||
|     // step 4: normalise by the sum | |||
|     #pragma omp parallel for | |||
|     for (int q = 0; q < channels; q++) | |||
|     { | |||
|         float* outptr = top_blob.channel(q); | |||
|         float* sumptr = sum_map; | |||
| #if __ARM_NEON | |||
|         const int nn = size >> 2; | |||
|         const int remain = size - (nn << 2); | |||
|         for (int i = 0; i < nn; i++) | |||
|         { | |||
|             float32x4_t _p = vld1q_f32(outptr); | |||
|             const float32x4_t _sum = vld1q_f32(sumptr); | |||
| #if __aarch64__ | |||
|             _p = vdivq_f32(_p, _sum); | |||
| #else | |||
|             _p = div_ps(_p, _sum); | |||
| #endif // __aarch64__ | |||
|             vst1q_f32(outptr, _p); | |||
|             outptr += 4; | |||
|             sumptr += 4; | |||
|         } | |||
| #else | |||
|         const int remain = size; | |||
| #endif // __ARM_NEON | |||
|         for (int i = 0; i < remain; i++) | |||
|         { | |||
|             outptr[i] /= sumptr[i]; | |||
|         } | |||
|     } | |||
|     return 0; | |||
| } | |||
| // In-place softmax across the channel axis, computed independently for each | |||
| // of the w*h positions i: | |||
| //   1. m[i]    = max over channels q of x[q][i] | |||
| //   2. x[q][i] = exp(x[q][i] - m[i]) | |||
| //   3. s[i]    = sum over channels q of x[q][i] | |||
| //   4. x[q][i] = x[q][i] / s[i] | |||
| // | |||
| // Returns 0 on success, -100 if one of the two w*h scratch blobs is empty | |||
| // after create(). | |||
| int Softmax_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| { | |||
|     const int w = bottom_top_blob.w; | |||
|     const int h = bottom_top_blob.h; | |||
|     const int channels = bottom_top_blob.c; | |||
|     const int size = w * h; | |||
|     // step 1: per-position maximum over all channels | |||
|     Mat max_map; | |||
|     max_map.create(w, h); | |||
|     if (max_map.empty()) | |||
|         return -100; | |||
|     max_map.fill(-FLT_MAX); | |||
|     for (int q = 0; q < channels; q++) | |||
|     { | |||
|         float* ptr = bottom_top_blob.channel(q); | |||
|         float* maxptr = max_map; | |||
|         for (int i = 0; i < size; i++) | |||
|         { | |||
|             if (maxptr[i] < ptr[i]) | |||
|                 maxptr[i] = ptr[i]; | |||
|         } | |||
|     } | |||
|     // step 2: replace every value by exp(x - max) | |||
|     #pragma omp parallel for | |||
|     for (int q = 0; q < channels; q++) | |||
|     { | |||
|         float* ptr = bottom_top_blob.channel(q); | |||
|         float* maxptr = max_map; | |||
| #if __ARM_NEON | |||
|         const int nn = size >> 2; | |||
|         const int remain = size - (nn << 2); | |||
|         for (int i = 0; i < nn; i++) | |||
|         { | |||
|             const float32x4_t _p = vld1q_f32(ptr); | |||
|             const float32x4_t _max = vld1q_f32(maxptr); | |||
|             vst1q_f32(ptr, exp_ps(vsubq_f32(_p, _max))); | |||
|             ptr += 4; | |||
|             maxptr += 4; | |||
|         } | |||
| #else | |||
|         const int remain = size; | |||
| #endif // __ARM_NEON | |||
|         for (int i = 0; i < remain; i++) | |||
|         { | |||
|             ptr[i] = exp(ptr[i] - maxptr[i]); | |||
|         } | |||
|     } | |||
|     // step 3: per-position sum of the exponentials (channels visited in order) | |||
|     Mat sum_map; | |||
|     sum_map.create(w, h); | |||
|     if (sum_map.empty()) | |||
|         return -100; | |||
|     sum_map.fill(0.f); | |||
|     for (int q = 0; q < channels; q++) | |||
|     { | |||
|         float* ptr = bottom_top_blob.channel(q); | |||
|         float* sumptr = sum_map; | |||
| #if __ARM_NEON | |||
|         const int nn = size >> 2; | |||
|         const int remain = size - (nn << 2); | |||
|         for (int i = 0; i < nn; i++) | |||
|         { | |||
|             const float32x4_t _p = vld1q_f32(ptr); | |||
|             const float32x4_t _sum = vld1q_f32(sumptr); | |||
|             vst1q_f32(sumptr, vaddq_f32(_sum, _p)); | |||
|             ptr += 4; | |||
|             sumptr += 4; | |||
|         } | |||
| #else | |||
|         const int remain = size; | |||
| #endif // __ARM_NEON | |||
|         for (int i = 0; i < remain; i++) | |||
|         { | |||
|             sumptr[i] += ptr[i]; | |||
|         } | |||
|     } | |||
|     // step 4: normalise by the sum | |||
|     #pragma omp parallel for | |||
|     for (int q = 0; q < channels; q++) | |||
|     { | |||
|         float* ptr = bottom_top_blob.channel(q); | |||
|         float* sumptr = sum_map; | |||
| #if __ARM_NEON | |||
|         const int nn = size >> 2; | |||
|         const int remain = size - (nn << 2); | |||
|         for (int i = 0; i < nn; i++) | |||
|         { | |||
|             float32x4_t _p = vld1q_f32(ptr); | |||
|             const float32x4_t _sum = vld1q_f32(sumptr); | |||
| #if __aarch64__ | |||
|             _p = vdivq_f32(_p, _sum); | |||
| #else | |||
|             _p = div_ps(_p, _sum); | |||
| #endif // __aarch64__ | |||
|             vst1q_f32(ptr, _p); | |||
|             ptr += 4; | |||
|             sumptr += 4; | |||
|         } | |||
| #else | |||
|         const int remain = size; | |||
| #endif // __ARM_NEON | |||
|         for (int i = 0; i < remain; i++) | |||
|         { | |||
|             ptr[i] /= sumptr[i]; | |||
|         } | |||
|     } | |||
|     return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,32 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_SOFTMAX_ARM_H | |||
| #define LAYER_SOFTMAX_ARM_H | |||
| #include "softmax.h" | |||
| namespace ncnn { | |||
| // Softmax subclass declaring its own forward() and forward_inplace(); the | |||
| // definitions use NEON intrinsics when __ARM_NEON is set. | |||
| class Softmax_arm : public Softmax | |||
| { | |||
| public: | |||
|     // Reads bottom_blob and writes the normalised values into top_blob. | |||
|     virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
|     // Normalises bottom_top_blob directly. | |||
|     virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_SOFTMAX_ARM_H | |||
| @@ -0,0 +1,227 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "batchnorm.h" | |||
| #include <math.h> | |||
| namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(BatchNorm) | |||
| // Constructor: switches on the support_inplace and one_blob_only flags. | |||
| BatchNorm::BatchNorm() | |||
| { | |||
|     support_inplace = true; | |||
|     one_blob_only = true; | |||
| } | |||
| // Destructor: intentionally empty, no explicit cleanup is performed here. | |||
| BatchNorm::~BatchNorm() {} | |||
| #if NCNN_STDIO | |||
| #if NCNN_STRING | |||
| // Text-format parameter loader: scans one decimal integer from paramfp into | |||
| // channels. Returns 0 on success; when the scan does not yield exactly one | |||
| // value, logs to stderr and returns -1. | |||
| int BatchNorm::load_param(FILE* paramfp) | |||
| { | |||
|     const int nscan = fscanf(paramfp, "%d", &channels); | |||
|     if (nscan == 1) | |||
|         return 0; | |||
|     fprintf(stderr, "BatchNorm load_param failed %d\n", nscan); | |||
|     return -1; | |||
| } | |||
| #endif // NCNN_STRING | |||
| // Binary-format parameter loader: reads one raw int from paramfp into | |||
| // channels. Returns 0 on success; on a short read logs to stderr and returns | |||
| // -1 instead of leaving channels unset while reporting success. | |||
| int BatchNorm::load_param_bin(FILE* paramfp) | |||
| { | |||
|     int nread = fread(&channels, sizeof(int), 1, paramfp); | |||
|     if (nread != 1) | |||
|     { | |||
|         fprintf(stderr, "BatchNorm load_param_bin failed %d\n", nread); | |||
|         return -1; | |||
|     } | |||
|     return 0; | |||
| } | |||
| // Loads the four per-channel weight arrays from binfp, in file order | |||
| // slope, mean, var, bias (channels floats each), then precomputes | |||
| //   a[i] = bias[i] - slope[i] * mean[i] / sqrt(var[i]) | |||
| //   b[i] = slope[i] / sqrt(var[i]) | |||
| // into a_data and b_data. | |||
| // | |||
| // Returns 0 on success, -100 if a blob is empty after create(), and -1 (after | |||
| // logging to stderr) if a read does not return the full array. | |||
| int BatchNorm::load_model(FILE* binfp) | |||
| { | |||
|     const size_t nbytes = channels * sizeof(float); | |||
|     int nread; | |||
|     // slope | |||
|     slope_data.create(channels); | |||
|     if (slope_data.empty()) | |||
|         return -100; | |||
|     nread = fread(slope_data, nbytes, 1, binfp); | |||
|     if (nread != 1) | |||
|     { | |||
|         fprintf(stderr, "BatchNorm read slope_data failed %d\n", nread); | |||
|         return -1; | |||
|     } | |||
|     // mean | |||
|     mean_data.create(channels); | |||
|     if (mean_data.empty()) | |||
|         return -100; | |||
|     nread = fread(mean_data, nbytes, 1, binfp); | |||
|     if (nread != 1) | |||
|     { | |||
|         fprintf(stderr, "BatchNorm read mean_data failed %d\n", nread); | |||
|         return -1; | |||
|     } | |||
|     // variance | |||
|     var_data.create(channels); | |||
|     if (var_data.empty()) | |||
|         return -100; | |||
|     nread = fread(var_data, nbytes, 1, binfp); | |||
|     if (nread != 1) | |||
|     { | |||
|         fprintf(stderr, "BatchNorm read var_data failed %d\n", nread); | |||
|         return -1; | |||
|     } | |||
|     // bias | |||
|     bias_data.create(channels); | |||
|     if (bias_data.empty()) | |||
|         return -100; | |||
|     nread = fread(bias_data, nbytes, 1, binfp); | |||
|     if (nread != 1) | |||
|     { | |||
|         fprintf(stderr, "BatchNorm read bias_data failed %d\n", nread); | |||
|         return -1; | |||
|     } | |||
|     // derived coefficients | |||
|     a_data.create(channels); | |||
|     if (a_data.empty()) | |||
|         return -100; | |||
|     b_data.create(channels); | |||
|     if (b_data.empty()) | |||
|         return -100; | |||
|     const float* slope_ptr = slope_data; | |||
|     const float* mean_ptr = mean_data; | |||
|     const float* var_ptr = var_data; | |||
|     const float* bias_ptr = bias_data; | |||
|     float* a_ptr = a_data; | |||
|     float* b_ptr = b_data; | |||
|     for (int i = 0; i < channels; i++) | |||
|     { | |||
|         float sqrt_var = sqrt(var_ptr[i]); | |||
|         a_ptr[i] = bias_ptr[i] - slope_ptr[i] * mean_ptr[i] / sqrt_var; | |||
|         b_ptr[i] = slope_ptr[i] / sqrt_var; | |||
|     } | |||
|     return 0; | |||
| } | |||
| #endif // NCNN_STDIO | |||
| int BatchNorm::load_param(const unsigned char*& mem) | |||
| { | |||
| channels = *(int*)(mem); | |||
| mem += 4; | |||
| return 0; | |||
| } | |||
| int BatchNorm::load_model(const unsigned char*& mem) | |||
| { | |||
| slope_data = Mat(channels, (float*)mem); | |||
| mem += channels * sizeof(float); | |||
| mean_data = Mat(channels, (float*)mem); | |||
| mem += channels * sizeof(float); | |||
| var_data = Mat(channels, (float*)mem); | |||
| mem += channels * sizeof(float); | |||
| bias_data = Mat(channels, (float*)mem); | |||
| mem += channels * sizeof(float); | |||
| a_data.create(channels); | |||
| if (a_data.empty()) | |||
| return -100; | |||
| b_data.create(channels); | |||
| if (b_data.empty()) | |||
| return -100; | |||
| const float* slope_data_ptr = slope_data; | |||
| const float* mean_data_ptr = mean_data; | |||
| const float* var_data_ptr = var_data; | |||
| const float* bias_data_ptr = bias_data; | |||
| float* a_data_ptr = a_data; | |||
| float* b_data_ptr = b_data; | |||
| for (int i=0; i<channels; i++) | |||
| { | |||
| float sqrt_var = sqrt(var_data_ptr[i]); | |||
| a_data_ptr[i] = bias_data_ptr[i] - slope_data_ptr[i] * mean_data_ptr[i] / sqrt_var; | |||
| b_data_ptr[i] = slope_data_ptr[i] / sqrt_var; | |||
| } | |||
| return 0; | |||
| } | |||
| int BatchNorm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| { | |||
| // a = bias - slope * mean / sqrt(var) | |||
| // b = slope / sqrt(var) | |||
| // value = b * value + a | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int size = w * h; | |||
| top_blob.create(w, h, channels); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| const float* a_data_ptr = a_data; | |||
| const float* b_data_ptr = b_data; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob.channel(q); | |||
| float* outptr = top_blob.channel(q); | |||
| float a = a_data_ptr[q]; | |||
| float b = b_data_ptr[q]; | |||
| for (int i=0; i<size; i++) | |||
| { | |||
| outptr[i] = b * ptr[i] + a; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| int BatchNorm::forward_inplace(Mat& bottom_top_blob) const | |||
| { | |||
| // a = bias - slope * mean / sqrt(var) | |||
| // b = slope / sqrt(var) | |||
| // value = b * value + a | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| int size = w * h; | |||
| const float* a_data_ptr = a_data; | |||
| const float* b_data_ptr = b_data; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| float a = a_data_ptr[q]; | |||
| float b = b_data_ptr[q]; | |||
| for (int i=0; i<size; i++) | |||
| { | |||
| ptr[i] = b * ptr[i] + a; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,58 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_BATCHNORM_H | |||
| #define LAYER_BATCHNORM_H | |||
| #include "layer.h" | |||
| namespace ncnn { | |||
| class BatchNorm : public Layer | |||
| { | |||
| public: | |||
| BatchNorm(); | |||
| virtual ~BatchNorm(); | |||
| #if NCNN_STDIO | |||
| #if NCNN_STRING | |||
| virtual int load_param(FILE* paramfp); | |||
| #endif // NCNN_STRING | |||
| virtual int load_param_bin(FILE* paramfp); | |||
| virtual int load_model(FILE* binfp); | |||
| #endif // NCNN_STDIO | |||
| virtual int load_param(const unsigned char*& mem); | |||
| virtual int load_model(const unsigned char*& mem); | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| public: | |||
| // param | |||
| int channels; | |||
| // model | |||
| Mat slope_data; | |||
| Mat mean_data; | |||
| Mat var_data; | |||
| Mat bias_data; | |||
| Mat a_data; | |||
| Mat b_data; | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_BATCHNORM_H | |||
| @@ -0,0 +1,139 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "bias.h" | |||
| namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(Bias) | |||
| Bias::Bias() | |||
| { | |||
| one_blob_only = true; | |||
| support_inplace = true; | |||
| } | |||
| Bias::~Bias() | |||
| { | |||
| } | |||
| #if NCNN_STDIO | |||
| #if NCNN_STRING | |||
| int Bias::load_param(FILE* paramfp) | |||
| { | |||
| int nscan = fscanf(paramfp, "%d", &bias_data_size); | |||
| if (nscan != 1) | |||
| { | |||
| fprintf(stderr, "Bias load_param failed %d\n", nscan); | |||
| return -1; | |||
| } | |||
| return 0; | |||
| } | |||
| #endif // NCNN_STRING | |||
| int Bias::load_param_bin(FILE* paramfp) | |||
| { | |||
| fread(&bias_data_size, sizeof(int), 1, paramfp); | |||
| return 0; | |||
| } | |||
| int Bias::load_model(FILE* binfp) | |||
| { | |||
| int nread; | |||
| bias_data.create(bias_data_size); | |||
| if (bias_data.empty()) | |||
| return -100; | |||
| nread = fread(bias_data, bias_data_size * sizeof(float), 1, binfp); | |||
| if (nread != 1) | |||
| { | |||
| fprintf(stderr, "Bias read bias_data failed %d\n", nread); | |||
| return -1; | |||
| } | |||
| return 0; | |||
| } | |||
| #endif // NCNN_STDIO | |||
| int Bias::load_param(const unsigned char*& mem) | |||
| { | |||
| bias_data_size = *(int*)(mem); | |||
| mem += 4; | |||
| return 0; | |||
| } | |||
| int Bias::load_model(const unsigned char*& mem) | |||
| { | |||
| bias_data = Mat(bias_data_size, (float*)mem); | |||
| mem += bias_data_size * sizeof(float); | |||
| return 0; | |||
| } | |||
| int Bias::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| int size = w * h; | |||
| top_blob.create(w, h, channels); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| const float* bias_ptr = bias_data; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob.channel(q); | |||
| float* outptr = top_blob.channel(q); | |||
| float bias = bias_ptr[q]; | |||
| for (int i=0; i<size; i++) | |||
| { | |||
| outptr[i] = ptr[i] + bias; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| int Bias::forward_inplace(Mat& bottom_top_blob) const | |||
| { | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| int channels = bottom_top_blob.c; | |||
| int size = w * h; | |||
| const float* bias_ptr = bias_data; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| float bias = bias_ptr[q]; | |||
| for (int i=0; i<size; i++) | |||
| { | |||
| ptr[i] += bias; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,52 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_BIAS_H | |||
| #define LAYER_BIAS_H | |||
| #include "layer.h" | |||
| namespace ncnn { | |||
| class Bias : public Layer | |||
| { | |||
| public: | |||
| Bias(); | |||
| virtual ~Bias(); | |||
| #if NCNN_STDIO | |||
| #if NCNN_STRING | |||
| virtual int load_param(FILE* paramfp); | |||
| #endif // NCNN_STRING | |||
| virtual int load_param_bin(FILE* paramfp); | |||
| virtual int load_model(FILE* binfp); | |||
| #endif // NCNN_STDIO | |||
| virtual int load_param(const unsigned char*& mem); | |||
| virtual int load_model(const unsigned char*& mem); | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| public: | |||
| // param | |||
| int bias_data_size; | |||
| // model | |||
| Mat bias_data; | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_BIAS_H | |||
| @@ -0,0 +1,81 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "bnll.h" | |||
| #include <math.h> | |||
| namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(BNLL) | |||
| BNLL::BNLL() | |||
| { | |||
| one_blob_only = true; | |||
| support_inplace = true; | |||
| } | |||
| int BNLL::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| int size = w * h; | |||
| top_blob.create(w, h, channels); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob.channel(q); | |||
| float* outptr = top_blob.channel(q); | |||
| for (int i=0; i<size; i++) | |||
| { | |||
| if (ptr[i] > 0) | |||
| outptr[i] = ptr[i] + log(1.f + exp(-ptr[i])); | |||
| else | |||
| outptr[i] = log(1.f + exp(ptr[i])); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| int BNLL::forward_inplace(Mat& bottom_top_blob) const | |||
| { | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| int channels = bottom_top_blob.c; | |||
| int size = w * h; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| for (int i=0; i<size; i++) | |||
| { | |||
| if (ptr[i] > 0) | |||
| ptr[i] = ptr[i] + log(1.f + exp(-ptr[i])); | |||
| else | |||
| ptr[i] = log(1.f + exp(ptr[i])); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,36 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_BNLL_H | |||
| #define LAYER_BNLL_H | |||
| #include "layer.h" | |||
| namespace ncnn { | |||
| class BNLL : public Layer | |||
| { | |||
| public: | |||
| BNLL(); | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| public: | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_BNLL_H | |||
| @@ -0,0 +1,64 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "concat.h" | |||
| namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(Concat) | |||
| Concat::Concat() | |||
| { | |||
| } | |||
| int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const | |||
| { | |||
| int w = bottom_blobs[0].w; | |||
| int h = bottom_blobs[0].h; | |||
| // total channels | |||
| int top_channels = 0; | |||
| for (size_t b=0; b<bottom_blobs.size(); b++) | |||
| { | |||
| const Mat& bottom_blob = bottom_blobs[b]; | |||
| top_channels += bottom_blob.c; | |||
| } | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(w, h, top_channels); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| int q = 0; | |||
| for (size_t b=0; b<bottom_blobs.size(); b++) | |||
| { | |||
| const Mat& bottom_blob = bottom_blobs[b]; | |||
| int channels = bottom_blob.c; | |||
| int size = bottom_blob.cstep * channels; | |||
| const float* ptr = bottom_blob; | |||
| float* outptr = top_blob.channel(q); | |||
| for (int i=0; i<size; i++) | |||
| { | |||
| outptr[i] = ptr[i]; | |||
| } | |||
| q += channels; | |||
| } | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,34 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_CONCAT_H | |||
| #define LAYER_CONCAT_H | |||
| #include "layer.h" | |||
| namespace ncnn { | |||
| class Concat : public Layer | |||
| { | |||
| public: | |||
| Concat(); | |||
| virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const; | |||
| public: | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_CONCAT_H | |||
| @@ -0,0 +1,350 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "convolution.h" | |||
| namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(Convolution) | |||
| Convolution::Convolution() | |||
| { | |||
| one_blob_only = true; | |||
| support_inplace = false; | |||
| } | |||
| Convolution::~Convolution() | |||
| { | |||
| } | |||
| #if NCNN_STDIO | |||
| #if NCNN_STRING | |||
| int Convolution::load_param(FILE* paramfp) | |||
| { | |||
| int nscan = fscanf(paramfp, "%d %d %d %d %d %d %d", | |||
| &num_output, &kernel_size, &dilation, &stride, &pad, &bias_term, | |||
| &weight_data_size); | |||
| if (nscan != 7) | |||
| { | |||
| fprintf(stderr, "Convolution load_param failed %d\n", nscan); | |||
| return -1; | |||
| } | |||
| return 0; | |||
| } | |||
| #endif // NCNN_STRING | |||
| int Convolution::load_param_bin(FILE* paramfp) | |||
| { | |||
| fread(&num_output, sizeof(int), 1, paramfp); | |||
| fread(&kernel_size, sizeof(int), 1, paramfp); | |||
| fread(&dilation, sizeof(int), 1, paramfp); | |||
| fread(&stride, sizeof(int), 1, paramfp); | |||
| fread(&pad, sizeof(int), 1, paramfp); | |||
| fread(&bias_term, sizeof(int), 1, paramfp); | |||
| fread(&weight_data_size, sizeof(int), 1, paramfp); | |||
| return 0; | |||
| } | |||
| int Convolution::load_model(FILE* binfp) | |||
| { | |||
| int nread; | |||
| union | |||
| { | |||
| struct | |||
| { | |||
| unsigned char f0; | |||
| unsigned char f1; | |||
| unsigned char f2; | |||
| unsigned char f3; | |||
| }; | |||
| unsigned int tag; | |||
| } flag_struct; | |||
| nread = fread(&flag_struct, sizeof(flag_struct), 1, binfp); | |||
| if (nread != 1) | |||
| { | |||
| fprintf(stderr, "Convolution read flag_struct failed %d\n", nread); | |||
| return -1; | |||
| } | |||
| unsigned int flag = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3; | |||
| weight_data.create(weight_data_size); | |||
| if (weight_data.empty()) | |||
| return -100; | |||
| if (flag_struct.tag == 0x01306B47) | |||
| { | |||
| // half-precision weight data | |||
| int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned short), 4); | |||
| std::vector<unsigned short> float16_weights; | |||
| float16_weights.resize(align_weight_data_size); | |||
| nread = fread(float16_weights.data(), align_weight_data_size, 1, binfp); | |||
| if (nread != 1) | |||
| { | |||
| fprintf(stderr, "Convolution read float16_weights failed %d\n", nread); | |||
| return -1; | |||
| } | |||
| weight_data = Mat::from_float16(float16_weights.data(), weight_data_size); | |||
| if (weight_data.empty()) | |||
| return -100; | |||
| } | |||
| else if (flag != 0) | |||
| { | |||
| // quantized weight data | |||
| float quantization_value[256]; | |||
| nread = fread(quantization_value, 256 * sizeof(float), 1, binfp); | |||
| if (nread != 1) | |||
| { | |||
| fprintf(stderr, "Convolution read quantization_value failed %d\n", nread); | |||
| return -1; | |||
| } | |||
| int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned char), 4); | |||
| std::vector<unsigned char> index_array; | |||
| index_array.resize(align_weight_data_size); | |||
| nread = fread(index_array.data(), align_weight_data_size, 1, binfp); | |||
| if (nread != 1) | |||
| { | |||
| fprintf(stderr, "Convolution read index_array failed %d\n", nread); | |||
| return -1; | |||
| } | |||
| float* weight_data_ptr = weight_data; | |||
| for (int i = 0; i < weight_data_size; i++) | |||
| { | |||
| weight_data_ptr[i] = quantization_value[ index_array[i] ]; | |||
| } | |||
| } | |||
| else if (flag_struct.f0 == 0) | |||
| { | |||
| // raw weight data | |||
| nread = fread(weight_data, weight_data_size * sizeof(float), 1, binfp); | |||
| if (nread != 1) | |||
| { | |||
| fprintf(stderr, "Convolution read weight_data failed %d\n", nread); | |||
| return -1; | |||
| } | |||
| } | |||
| if (bias_term) | |||
| { | |||
| bias_data.create(num_output); | |||
| if (bias_data.empty()) | |||
| return -100; | |||
| nread = fread(bias_data, num_output * sizeof(float), 1, binfp); | |||
| if (nread != 1) | |||
| { | |||
| fprintf(stderr, "Convolution read bias_data failed %d\n", nread); | |||
| return -1; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| #endif // NCNN_STDIO | |||
| int Convolution::load_param(const unsigned char*& mem) | |||
| { | |||
| num_output = *(int*)(mem); | |||
| mem += 4; | |||
| kernel_size = *(int*)(mem); | |||
| mem += 4; | |||
| dilation = *(int*)(mem); | |||
| mem += 4; | |||
| stride = *(int*)(mem); | |||
| mem += 4; | |||
| pad = *(int*)(mem); | |||
| mem += 4; | |||
| bias_term = *(int*)(mem); | |||
| mem += 4; | |||
| weight_data_size = *(int*)(mem); | |||
| mem += 4; | |||
| return 0; | |||
| } | |||
| int Convolution::load_model(const unsigned char*& mem) | |||
| { | |||
| union | |||
| { | |||
| struct | |||
| { | |||
| unsigned char f0; | |||
| unsigned char f1; | |||
| unsigned char f2; | |||
| unsigned char f3; | |||
| }; | |||
| unsigned int tag; | |||
| } flag_struct; | |||
| memcpy(&flag_struct, mem, sizeof(flag_struct)); | |||
| mem += sizeof(flag_struct); | |||
| unsigned int flag = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3; | |||
| if (flag_struct.tag == 0x01306B47) | |||
| { | |||
| // half-precision weight data | |||
| weight_data = Mat::from_float16((unsigned short*)mem, weight_data_size); | |||
| mem += alignSize(weight_data_size * sizeof(unsigned short), 4); | |||
| if (weight_data.empty()) | |||
| return -100; | |||
| } | |||
| else if (flag != 0) | |||
| { | |||
| // quantized weight data | |||
| const float* quantization_value = (const float*)mem; | |||
| mem += 256 * sizeof(float); | |||
| const unsigned char* index_array = (const unsigned char*)mem; | |||
| mem += alignSize(weight_data_size * sizeof(unsigned char), 4); | |||
| weight_data.create(weight_data_size); | |||
| if (weight_data.empty()) | |||
| return -100; | |||
| float* weight_data_ptr = weight_data; | |||
| for (int i = 0; i < weight_data_size; i++) | |||
| { | |||
| weight_data_ptr[i] = quantization_value[ index_array[i] ]; | |||
| } | |||
| } | |||
| else if (flag_struct.f0 == 0) | |||
| { | |||
| // raw weight data | |||
| weight_data = Mat(weight_data_size, (float*)mem); | |||
| mem += weight_data_size * sizeof(float); | |||
| } | |||
| if (bias_term) | |||
| { | |||
| bias_data = Mat(num_output, (float*)mem); | |||
| mem += num_output * sizeof(float); | |||
| } | |||
| return 0; | |||
| } | |||
| int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| { | |||
| // convolv with NxN kernel | |||
| // value = value + bias | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| // fprintf(stderr, "Convolution input %d x %d pad = %d ksize=%d stride=%d\n", w, h, pad, kernel_size, stride); | |||
| Mat bottom_blob_bordered = bottom_blob; | |||
| if (pad > 0) | |||
| { | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, pad, pad, pad, pad, BORDER_CONSTANT, 0.f); | |||
| if (bottom_blob_bordered.empty()) | |||
| return -100; | |||
| w = bottom_blob_bordered.w; | |||
| h = bottom_blob_bordered.h; | |||
| } | |||
| const int kernel_extent = dilation * (kernel_size - 1) + 1; | |||
| int outw = (w - kernel_extent) / stride + 1; | |||
| int outh = (h - kernel_extent) / stride + 1; | |||
| top_blob.create(outw, outh, num_output); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| const int maxk = kernel_size * kernel_size; | |||
| // kernel offsets | |||
| std::vector<int> _space_ofs(maxk); | |||
| int* space_ofs = &_space_ofs[0]; | |||
| { | |||
| int p1 = 0; | |||
| int p2 = 0; | |||
| int gap = w * dilation - kernel_extent; | |||
| for (int i = 0; i < kernel_size; i++) | |||
| { | |||
| for (int j = 0; j < kernel_size; j++) | |||
| { | |||
| space_ofs[p1] = p2; | |||
| p1++; | |||
| p2 += dilation; | |||
| } | |||
| p2 += gap; | |||
| } | |||
| } | |||
| // num_output | |||
| const float* weight_data_ptr = weight_data; | |||
| #pragma omp parallel for | |||
| for (int p=0; p<num_output; p++) | |||
| { | |||
| float* outptr = top_blob.channel(p); | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| for (int j = 0; j < outw; j++) | |||
| { | |||
| float sum = 0.f; | |||
| if (bias_term) | |||
| sum = bias_data.data[p]; | |||
| const float* kptr = weight_data_ptr + maxk * channels * p; | |||
| // channels | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const Mat m = bottom_blob_bordered.channel(q); | |||
| const float* sptr = m.data + m.w * i*stride + j*stride; | |||
| for (int k = 0; k < maxk; k++) // 29.23 | |||
| { | |||
| float val = sptr[ space_ofs[k] ]; // 20.72 | |||
| float w = kptr[k]; | |||
| sum += val * w; // 41.45 | |||
| } | |||
| kptr += maxk; | |||
| } | |||
| outptr[j] = sum; | |||
| } | |||
| outptr += outw; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,58 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_CONVOLUTION_H | |||
| #define LAYER_CONVOLUTION_H | |||
| #include "layer.h" | |||
| namespace ncnn { | |||
| class Convolution : public Layer | |||
| { | |||
| public: | |||
| Convolution(); | |||
| virtual ~Convolution(); | |||
| #if NCNN_STDIO | |||
| #if NCNN_STRING | |||
| virtual int load_param(FILE* paramfp); | |||
| #endif // NCNN_STRING | |||
| virtual int load_param_bin(FILE* paramfp); | |||
| virtual int load_model(FILE* binfp); | |||
| #endif // NCNN_STDIO | |||
| virtual int load_param(const unsigned char*& mem); | |||
| virtual int load_model(const unsigned char*& mem); | |||
| virtual int forward(const Mat& bottom_blobs, Mat& top_blobs) const; | |||
| public: | |||
| // param | |||
| int num_output; | |||
| int kernel_size; | |||
| int dilation; | |||
| int stride; | |||
| int pad; | |||
| int bias_term; | |||
| int weight_data_size; | |||
| // model | |||
| Mat weight_data; | |||
| Mat bias_data; | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_CONVOLUTION_H | |||
| @@ -0,0 +1,85 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "crop.h" | |||
| namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(Crop) | |||
| Crop::Crop() | |||
| { | |||
| } | |||
| #if NCNN_STDIO | |||
| #if NCNN_STRING | |||
| int Crop::load_param(FILE* paramfp) | |||
| { | |||
| int nscan = fscanf(paramfp, "%d %d", &woffset, &hoffset); | |||
| if (nscan != 2) | |||
| { | |||
| fprintf(stderr, "Crop load_param failed %d\n", nscan); | |||
| return -1; | |||
| } | |||
| return 0; | |||
| } | |||
| #endif // NCNN_STRING | |||
| int Crop::load_param_bin(FILE* paramfp) | |||
| { | |||
| fread(&woffset, sizeof(int), 1, paramfp); | |||
| fread(&hoffset, sizeof(int), 1, paramfp); | |||
| return 0; | |||
| } | |||
| #endif // NCNN_STDIO | |||
| int Crop::load_param(const unsigned char*& mem) | |||
| { | |||
| woffset = *(int*)(mem); | |||
| mem += 4; | |||
| hoffset = *(int*)(mem); | |||
| mem += 4; | |||
| return 0; | |||
| } | |||
| // Crop bottom_blobs[0] to the width/height of bottom_blobs[1], starting at | |||
| // (woffset, hoffset), and store the result in top_blobs[0]. | |||
| // Returns 0 on success, -100 when the output blob ends up empty. | |||
| int Crop::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const | |||
| { | |||
|     const Mat& src = bottom_blobs[0]; | |||
|     const Mat& ref = bottom_blobs[1]; | |||
|     // border widths handed to copy_cut_border for each side | |||
|     const int cut_top = hoffset; | |||
|     const int cut_left = woffset; | |||
|     const int cut_bottom = src.h - ref.h - hoffset; | |||
|     const int cut_right = src.w - ref.w - woffset; | |||
|     Mat& dst = top_blobs[0]; | |||
|     copy_cut_border(src, dst, cut_top, cut_bottom, cut_left, cut_right); | |||
|     return dst.empty() ? -100 : 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,44 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_CROP_H | |||
| #define LAYER_CROP_H | |||
| #include "layer.h" | |||
| namespace ncnn { | |||
| // Layer that crops its first input blob to the width/height of its second | |||
| // input blob, starting at the offsets stored below. | |||
| class Crop : public Layer | |||
| { | |||
| public: | |||
| Crop(); | |||
| #if NCNN_STDIO | |||
| #if NCNN_STRING | |||
| // read "woffset hoffset" from a text param file; 0 on success, -1 on parse failure | |||
| virtual int load_param(FILE* paramfp); | |||
| #endif // NCNN_STRING | |||
| // read woffset and hoffset as raw ints from a binary param file | |||
| virtual int load_param_bin(FILE* paramfp); | |||
| #endif // NCNN_STDIO | |||
| // read woffset and hoffset from a memory buffer and advance mem past them | |||
| virtual int load_param(const unsigned char*& mem); | |||
| // bottom_blobs[0] is the blob to crop, bottom_blobs[1] supplies the target size | |||
| virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const; | |||
| public: | |||
| // offset along the width, passed as the left border to copy_cut_border | |||
| int woffset; | |||
| // offset along the height, passed as the top border to copy_cut_border | |||
| int hoffset; | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_CROP_H | |||
| @@ -0,0 +1,348 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "deconvolution.h" | |||
| namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(Deconvolution) | |||
| // Flag this layer as using the single-blob forward() and as not supporting | |||
| // in-place operation. The param members are left for load_param* to fill in. | |||
| Deconvolution::Deconvolution() | |||
| { | |||
|     support_inplace = false; | |||
|     one_blob_only = true; | |||
| } | |||
| // Destructor with an empty body; nothing is released explicitly here. | |||
| Deconvolution::~Deconvolution() | |||
| { | |||
| } | |||
| #if NCNN_STDIO | |||
| #if NCNN_STRING | |||
| // Parse the seven text parameters, in order: num_output, kernel_size, | |||
| // dilation, stride, pad, bias_term, weight_data_size. | |||
| // Returns 0 when all seven were converted, -1 otherwise (the fscanf result | |||
| // is reported on stderr). | |||
| int Deconvolution::load_param(FILE* paramfp) | |||
| { | |||
|     const int expected_fields = 7; | |||
|     const int fields_read = fscanf(paramfp, "%d %d %d %d %d %d %d", | |||
|                                    &num_output, &kernel_size, &dilation, &stride, &pad, &bias_term, | |||
|                                    &weight_data_size); | |||
|     if (fields_read == expected_fields) | |||
|         return 0; | |||
|     fprintf(stderr, "Deconvolution load_param failed %d\n", fields_read); | |||
|     return -1; | |||
| } | |||
| #endif // NCNN_STRING | |||
| // Read the seven int parameters from the binary param stream, in order: | |||
| // num_output, kernel_size, dilation, stride, pad, bias_term, weight_data_size. | |||
| // Returns 0 on success, -1 as soon as one read comes up short, so a | |||
| // truncated file is reported instead of leaving parameters unset. | |||
| int Deconvolution::load_param_bin(FILE* paramfp) | |||
| { | |||
|     int* const fields[] = { | |||
|         &num_output, &kernel_size, &dilation, &stride, &pad, &bias_term, | |||
|         &weight_data_size | |||
|     }; | |||
|     const int field_count = sizeof(fields) / sizeof(fields[0]); | |||
|     for (int i = 0; i < field_count; i++) | |||
|     { | |||
|         int nread = fread(fields[i], sizeof(int), 1, paramfp); | |||
|         if (nread != 1) | |||
|         { | |||
|             fprintf(stderr, "Deconvolution load_param_bin read field %d failed %d\n", i, nread); | |||
|             return -1; | |||
|         } | |||
|     } | |||
|     return 0; | |||
| } | |||
| // Load weight_data (and bias_data when bias_term is set) from a binary | |||
| // model stream. The weights start with a 4-byte flag that selects the storage: | |||
| //   tag == 0x01306B47 : half-precision values, converted with Mat::from_float16 | |||
| //   byte sum != 0     : 256-entry float table followed by one index byte per weight | |||
| //   otherwise         : raw floats | |||
| // Returns 0 on success, -1 on a short read, -100 when an allocation fails. | |||
| int Deconvolution::load_model(FILE* binfp) | |||
| { | |||
|     int nread; | |||
|     union | |||
|     { | |||
|         struct | |||
|         { | |||
|             unsigned char f0; | |||
|             unsigned char f1; | |||
|             unsigned char f2; | |||
|             unsigned char f3; | |||
|         }; | |||
|         unsigned int tag; | |||
|     } flag_struct; | |||
|     nread = fread(&flag_struct, sizeof(flag_struct), 1, binfp); | |||
|     if (nread != 1) | |||
|     { | |||
|         fprintf(stderr, "Deconvolution read flag_struct failed %d\n", nread); | |||
|         return -1; | |||
|     } | |||
|     unsigned int flag = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3; | |||
|     weight_data.create(weight_data_size); | |||
|     if (weight_data.empty()) | |||
|         return -100; | |||
|     if (flag_struct.tag == 0x01306B47) | |||
|     { | |||
|         // half-precision weight data | |||
|         int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned short), 4); | |||
|         std::vector<unsigned short> float16_weights; | |||
|         // align_weight_data_size is a byte count while the vector is sized in | |||
|         // 16-bit elements: round up to whole elements instead of allocating | |||
|         // one element per byte | |||
|         float16_weights.resize((align_weight_data_size + sizeof(unsigned short) - 1) / sizeof(unsigned short)); | |||
|         nread = fread(float16_weights.data(), align_weight_data_size, 1, binfp); | |||
|         if (nread != 1) | |||
|         { | |||
|             fprintf(stderr, "Deconvolution read float16_weights failed %d\n", nread); | |||
|             return -1; | |||
|         } | |||
|         weight_data = Mat::from_float16(float16_weights.data(), weight_data_size); | |||
|         if (weight_data.empty()) | |||
|             return -100; | |||
|     } | |||
|     else if (flag != 0) | |||
|     { | |||
|         // quantized weight data | |||
|         float quantization_value[256]; | |||
|         nread = fread(quantization_value, 256 * sizeof(float), 1, binfp); | |||
|         if (nread != 1) | |||
|         { | |||
|             fprintf(stderr, "Deconvolution read quantization_value failed %d\n", nread); | |||
|             return -1; | |||
|         } | |||
|         int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned char), 4); | |||
|         std::vector<unsigned char> index_array; | |||
|         index_array.resize(align_weight_data_size); | |||
|         nread = fread(index_array.data(), align_weight_data_size, 1, binfp); | |||
|         if (nread != 1) | |||
|         { | |||
|             fprintf(stderr, "Deconvolution read index_array failed %d\n", nread); | |||
|             return -1; | |||
|         } | |||
|         // expand every index byte through the lookup table | |||
|         float* weight_data_ptr = weight_data; | |||
|         for (int i = 0; i < weight_data_size; i++) | |||
|         { | |||
|             weight_data_ptr[i] = quantization_value[ index_array[i] ]; | |||
|         } | |||
|     } | |||
|     else if (flag_struct.f0 == 0) | |||
|     { | |||
|         // raw weight data | |||
|         nread = fread(weight_data, weight_data_size * sizeof(float), 1, binfp); | |||
|         if (nread != 1) | |||
|         { | |||
|             fprintf(stderr, "Deconvolution read weight_data failed %d\n", nread); | |||
|             return -1; | |||
|         } | |||
|     } | |||
|     if (bias_term) | |||
|     { | |||
|         // one bias value per output channel | |||
|         bias_data.create(num_output); | |||
|         if (bias_data.empty()) | |||
|             return -100; | |||
|         nread = fread(bias_data, num_output * sizeof(float), 1, binfp); | |||
|         if (nread != 1) | |||
|         { | |||
|             fprintf(stderr, "Deconvolution read bias_data failed %d\n", nread); | |||
|             return -1; | |||
|         } | |||
|     } | |||
|     return 0; | |||
| } | |||
| #endif // NCNN_STDIO | |||
| // Load the seven int parameters from an in-memory param buffer. They are | |||
| // stored back to back at 4-byte steps in the order read below; mem is | |||
| // advanced past the 28 bytes consumed. Always returns 0. | |||
| int Deconvolution::load_param(const unsigned char*& mem) | |||
| { | |||
|     num_output = *(int*)(mem); | |||
|     kernel_size = *(int*)(mem + 4); | |||
|     dilation = *(int*)(mem + 8); | |||
|     stride = *(int*)(mem + 12); | |||
|     pad = *(int*)(mem + 16); | |||
|     bias_term = *(int*)(mem + 20); | |||
|     weight_data_size = *(int*)(mem + 24); | |||
|     mem += 28; | |||
|     return 0; | |||
| } | |||
| // Load weight_data (and bias_data when bias_term is set) from an in-memory | |||
| // model buffer, advancing mem past everything consumed. The leading 4-byte | |||
| // flag selects half-precision, table-quantized or raw float weights. | |||
| // Raw weights and the bias are wrapped in place with Mat(size, pointer). | |||
| // Returns 0 on success, -100 when a weight allocation fails. | |||
| int Deconvolution::load_model(const unsigned char*& mem) | |||
| { | |||
|     union | |||
|     { | |||
|         struct | |||
|         { | |||
|             unsigned char f0; | |||
|             unsigned char f1; | |||
|             unsigned char f2; | |||
|             unsigned char f3; | |||
|         }; | |||
|         unsigned int tag; | |||
|     } flag_struct; | |||
|     memcpy(&flag_struct, mem, sizeof(flag_struct)); | |||
|     mem += sizeof(flag_struct); | |||
|     const unsigned int flag_byte_sum = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3; | |||
|     const bool is_float16 = (flag_struct.tag == 0x01306B47); | |||
|     if (is_float16) | |||
|     { | |||
|         // 16-bit floats, stored padded to a 4-byte boundary | |||
|         weight_data = Mat::from_float16((unsigned short*)mem, weight_data_size); | |||
|         mem += alignSize(weight_data_size * sizeof(unsigned short), 4); | |||
|         if (weight_data.empty()) | |||
|             return -100; | |||
|     } | |||
|     else if (flag_byte_sum != 0) | |||
|     { | |||
|         // 256-entry float table, then one table index per weight | |||
|         const float* table = (const float*)mem; | |||
|         mem += 256 * sizeof(float); | |||
|         const unsigned char* indices = (const unsigned char*)mem; | |||
|         mem += alignSize(weight_data_size * sizeof(unsigned char), 4); | |||
|         weight_data.create(weight_data_size); | |||
|         if (weight_data.empty()) | |||
|             return -100; | |||
|         float* dst = weight_data; | |||
|         for (int n = 0; n < weight_data_size; n++) | |||
|             dst[n] = table[ indices[n] ]; | |||
|     } | |||
|     else if (flag_struct.f0 == 0) | |||
|     { | |||
|         // plain floats, referenced directly from the buffer | |||
|         weight_data = Mat(weight_data_size, (float*)mem); | |||
|         mem += weight_data_size * sizeof(float); | |||
|     } | |||
|     if (bias_term) | |||
|     { | |||
|         // one float per output channel, also referenced from the buffer | |||
|         bias_data = Mat(num_output, (float*)mem); | |||
|         mem += num_output * sizeof(float); | |||
|     } | |||
|     return 0; | |||
| } | |||
| // Transposed (backward strided) convolution with a square kernel. | |||
| // Each input pixel scatters val * weight into a kernel-sized, dilated window | |||
| // of the output, on top of the per-channel bias. The result is computed at | |||
| // full size and, when pad > 0, has pad pixels cut from every side. | |||
| // Returns 0 on success, -100 when an output allocation fails. | |||
| int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| { | |||
|     // backward strided convolution with NxN kernel | |||
|     // value = value + bias | |||
|     int w = bottom_blob.w; | |||
|     int h = bottom_blob.h; | |||
|     int channels = bottom_blob.c; | |||
|     const int kernel_extent = dilation * (kernel_size - 1) + 1; | |||
|     int outw = (w - 1) * stride + kernel_extent; | |||
|     int outh = (h - 1) * stride + kernel_extent; | |||
|     Mat top_blob_bordered; | |||
|     top_blob_bordered.create(outw, outh, num_output); | |||
|     if (top_blob_bordered.empty()) | |||
|         return -100; | |||
|     const int maxk = kernel_size * kernel_size; | |||
|     // kernel offsets: position of every kernel tap relative to the window origin | |||
|     std::vector<int> _space_ofs(maxk); | |||
|     int* space_ofs = &_space_ofs[0]; | |||
|     { | |||
|         int p1 = 0; | |||
|         int p2 = 0; | |||
|         // The inner loop moves p2 by kernel_size * dilation per kernel row, | |||
|         // so the gap must make up the rest of outw * dilation. Subtracting | |||
|         // kernel_extent here is only correct for dilation == 1. | |||
|         int gap = outw * dilation - kernel_size * dilation; | |||
|         for (int i = 0; i < kernel_size; i++) | |||
|         { | |||
|             for (int j = 0; j < kernel_size; j++) | |||
|             { | |||
|                 space_ofs[p1] = p2; | |||
|                 p1++; | |||
|                 p2 += dilation; | |||
|             } | |||
|             p2 += gap; | |||
|         } | |||
|     } | |||
|     // num_output | |||
|     const float* weight_data_ptr = weight_data; | |||
|     #pragma omp parallel for | |||
|     for (int p=0; p<num_output; p++) | |||
|     { | |||
|         Mat out = top_blob_bordered.channel(p); | |||
|         const float bias = bias_term ? bias_data.data[p] : 0.f; | |||
|         out.fill(bias); | |||
|         for (int i = 0; i < h; i++) | |||
|         { | |||
|             for (int j = 0; j < w; j++) | |||
|             { | |||
|                 // top-left corner of the output window fed by input pixel (i, j) | |||
|                 float* outptr = out.data + out.w * i*stride + j*stride; | |||
|                 const float* kptr = weight_data_ptr + maxk * channels * p; | |||
|                 // channels | |||
|                 for (int q=0; q<channels; q++) | |||
|                 { | |||
|                     const Mat m = bottom_blob.channel(q); | |||
|                     float val = *(m.data + m.w * i + j); | |||
|                     for (int k = 0; k < maxk; k++) | |||
|                     { | |||
|                         outptr[ space_ofs[k] ] += val * kptr[k]; | |||
|                     } | |||
|                     kptr += maxk; | |||
|                 } | |||
|             } | |||
|         } | |||
|     } | |||
|     top_blob = top_blob_bordered; | |||
|     if (pad > 0) | |||
|     { | |||
|         copy_cut_border(top_blob_bordered, top_blob, pad, pad, pad, pad); | |||
|         if (top_blob.empty()) | |||
|             return -100; | |||
|     } | |||
|     return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,58 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_DECONVOLUTION_H | |||
| #define LAYER_DECONVOLUTION_H | |||
| #include "layer.h" | |||
| namespace ncnn { | |||
| // Deconvolution (backward strided convolution) layer with a square kernel, | |||
| // optional dilation, output border cut (pad) and optional bias. | |||
| class Deconvolution : public Layer | |||
| { | |||
| public: | |||
| Deconvolution(); | |||
| virtual ~Deconvolution(); | |||
| #if NCNN_STDIO | |||
| #if NCNN_STRING | |||
| // read the seven params from a text param file; 0 on success, -1 on parse failure | |||
| virtual int load_param(FILE* paramfp); | |||
| #endif // NCNN_STRING | |||
| // read the seven params as raw ints from a binary param file | |||
| virtual int load_param_bin(FILE* paramfp); | |||
| // read weight_data (and bias_data if bias_term) from a binary model file | |||
| virtual int load_model(FILE* binfp); | |||
| #endif // NCNN_STDIO | |||
| // read the seven params from a memory buffer and advance mem past them | |||
| virtual int load_param(const unsigned char*& mem); | |||
| // read weight_data (and bias_data if bias_term) from a memory buffer and advance mem | |||
| virtual int load_model(const unsigned char*& mem); | |||
| // single-input, single-output forward pass | |||
| virtual int forward(const Mat& bottom_blobs, Mat& top_blobs) const; | |||
| public: | |||
| // param | |||
| // number of output channels | |||
| int num_output; | |||
| // side length of the square kernel | |||
| int kernel_size; | |||
| // spacing between kernel taps | |||
| int dilation; | |||
| // output step per input pixel | |||
| int stride; | |||
| // border cut from every side of the output when > 0 | |||
| int pad; | |||
| // non-zero when bias_data is loaded and added | |||
| int bias_term; | |||
| // number of values in weight_data | |||
| int weight_data_size; | |||
| // model | |||
| Mat weight_data; | |||
| Mat bias_data; | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_DECONVOLUTION_H | |||
| @@ -0,0 +1,38 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "dropout.h" | |||
| namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(Dropout) | |||
| // Flag this layer as supporting in-place operation and as using the | |||
| // single-blob forward(). | |||
| Dropout::Dropout() | |||
| { | |||
|     support_inplace = true; | |||
|     one_blob_only = true; | |||
| } | |||
| // Pass-through: the output is assigned from the input unchanged. | |||
| // Always returns 0. | |||
| int Dropout::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| { | |||
| top_blob = bottom_blob; | |||
| return 0; | |||
| } | |||
| // In-place variant: the blob is left untouched (the parameter is unnamed | |||
| // because it is never used). Always returns 0. | |||
| int Dropout::forward_inplace(Mat& /*bottom_top_blob*/) const | |||
| { | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,35 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_DROPOUT_H | |||
| #define LAYER_DROPOUT_H | |||
| #include "layer.h" | |||
| namespace ncnn { | |||
| // Dropout layer; both forward variants leave the data unchanged. | |||
| class Dropout : public Layer | |||
| { | |||
| public: | |||
| Dropout(); | |||
| // assigns bottom_blob to top_blob | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
| // does nothing to the blob | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_DROPOUT_H | |||
| @@ -0,0 +1,246 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "eltwise.h" | |||
| namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(Eltwise) | |||
| // Default constructor; the body is intentionally empty. | |||
| // NOTE(review): op_type and num_coeff are not assigned here, they only get | |||
| // values from one of the load_param* methods. | |||
| Eltwise::Eltwise() | |||
| { | |||
| } | |||
| #if NCNN_STDIO | |||
| #if NCNN_STRING | |||
| // Parse "op_type num_coeff" followed by num_coeff float coefficients from | |||
| // a text param file. | |||
| // Returns 0 on success, -1 on a parse failure, -100 when the coefficient | |||
| // storage cannot be allocated. | |||
| int Eltwise::load_param(FILE* paramfp) | |||
| { | |||
|     const int header_fields = fscanf(paramfp, "%d %d", &op_type, &num_coeff); | |||
|     if (header_fields != 2) | |||
|     { | |||
|         fprintf(stderr, "Eltwise load_param failed %d\n", header_fields); | |||
|         return -1; | |||
|     } | |||
|     // nothing more to read when no coefficients are announced | |||
|     if (num_coeff <= 0) | |||
|         return 0; | |||
|     coeffs.create(num_coeff); | |||
|     if (coeffs.empty()) | |||
|         return -100; | |||
|     float* dst = coeffs; | |||
|     for (int idx = 0; idx < num_coeff; idx++) | |||
|     { | |||
|         const int got = fscanf(paramfp, "%f", dst + idx); | |||
|         if (got != 1) | |||
|         { | |||
|             fprintf(stderr, "Eltwise load_param failed %d\n", got); | |||
|             return -1; | |||
|         } | |||
|     } | |||
|     return 0; | |||
| } | |||
| #endif // NCNN_STRING | |||
| // Read op_type, num_coeff and then num_coeff raw floats from the binary | |||
| // param stream. | |||
| // Returns 0 on success, -1 when any read comes up short (truncated file or | |||
| // I/O error), -100 when the coefficient storage cannot be allocated. | |||
| int Eltwise::load_param_bin(FILE* paramfp) | |||
| { | |||
|     int nread = fread(&op_type, sizeof(int), 1, paramfp); | |||
|     if (nread != 1) | |||
|     { | |||
|         fprintf(stderr, "Eltwise load_param_bin read op_type failed %d\n", nread); | |||
|         return -1; | |||
|     } | |||
|     nread = fread(&num_coeff, sizeof(int), 1, paramfp); | |||
|     if (nread != 1) | |||
|     { | |||
|         fprintf(stderr, "Eltwise load_param_bin read num_coeff failed %d\n", nread); | |||
|         return -1; | |||
|     } | |||
|     if (num_coeff > 0) | |||
|     { | |||
|         coeffs.create(num_coeff); | |||
|         if (coeffs.empty()) | |||
|             return -100; | |||
|         float* coeffs_ptr = coeffs; | |||
|         // fread reports how many floats it delivered; all of them are required | |||
|         nread = fread(coeffs_ptr, sizeof(float), num_coeff, paramfp); | |||
|         if (nread != num_coeff) | |||
|         { | |||
|             fprintf(stderr, "Eltwise load_param_bin read coeffs failed %d\n", nread); | |||
|             return -1; | |||
|         } | |||
|     } | |||
|     return 0; | |||
| } | |||
| #endif // NCNN_STDIO | |||
| // Load op_type, num_coeff and the coefficients from an in-memory param | |||
| // buffer, advancing mem past everything consumed. The coefficients are | |||
| // wrapped in place with Mat(size, pointer). Always returns 0. | |||
| int Eltwise::load_param(const unsigned char*& mem) | |||
| { | |||
|     op_type = *(int*)(mem); | |||
|     mem += 4; | |||
|     num_coeff = *(int*)(mem); | |||
|     mem += 4; | |||
|     // Same guard as the FILE-based loaders: without it a negative count | |||
|     // would move mem by a wrapped-around offset. | |||
|     if (num_coeff > 0) | |||
|     { | |||
|         coeffs = Mat(num_coeff, (float*)mem); | |||
|         mem += num_coeff * sizeof(float); | |||
|     } | |||
|     return 0; | |||
| } | |||
| // Combine all input blobs element by element into top_blobs[0]. | |||
| // op_type selects the operation: product, sum (plain when num_coeff == 0, | |||
| // otherwise weighted by coeffs) or maximum. The output gets the w/h/c of | |||
| // bottom_blobs[0]. In every branch the first two inputs seed the output | |||
| // and each further input is folded into it. | |||
| // Returns 0 on success, -100 when the output blob cannot be allocated. | |||
| // NOTE(review): every branch reads bottom_blobs[1] without checking the | |||
| // vector size, so at least two inputs are assumed. | |||
| // NOTE(review): an op_type outside the three handled values falls through | |||
| // and returns 0 with the output allocated but never written. | |||
| int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const | |||
| { | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| // number of elements processed per channel | |||
| int size = w * h; | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(w, h, channels); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| if (op_type == Operation_PROD) | |||
| { | |||
| // first blob | |||
| const Mat& bottom_blob1 = bottom_blobs[1]; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob.channel(q); | |||
| const float* ptr1 = bottom_blob1.channel(q); | |||
| float* outptr = top_blob.channel(q); | |||
| for (int i=0; i<size; i++) | |||
| { | |||
| outptr[i] = ptr[i] * ptr1[i]; | |||
| } | |||
| } | |||
| // remaining blobs are multiplied into the output | |||
| for (size_t b=2; b<bottom_blobs.size(); b++) | |||
| { | |||
| const Mat& bottom_blob1 = bottom_blobs[b]; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob1.channel(q); | |||
| float* outptr = top_blob.channel(q); | |||
| for (int i=0; i<size; i++) | |||
| { | |||
| outptr[i] *= ptr[i]; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| else if (op_type == Operation_SUM) | |||
| { | |||
| if (num_coeff == 0) | |||
| { | |||
| // first blob | |||
| const Mat& bottom_blob1 = bottom_blobs[1]; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob.channel(q); | |||
| const float* ptr1 = bottom_blob1.channel(q); | |||
| float* outptr = top_blob.channel(q); | |||
| for (int i=0; i<size; i++) | |||
| { | |||
| outptr[i] = ptr[i] + ptr1[i]; | |||
| } | |||
| } | |||
| // remaining blobs are added to the output | |||
| for (size_t b=2; b<bottom_blobs.size(); b++) | |||
| { | |||
| const Mat& bottom_blob1 = bottom_blobs[b]; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob1.channel(q); | |||
| float* outptr = top_blob.channel(q); | |||
| for (int i=0; i<size; i++) | |||
| { | |||
| outptr[i] += ptr[i]; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| else | |||
| { | |||
| // weighted sum: input b is scaled by coeffs[b] | |||
| // NOTE(review): coeffs is indexed by the input position without a bound | |||
| // check, so num_coeff >= bottom_blobs.size() is assumed. | |||
| const float* coeffs_ptr = coeffs; | |||
| // first blob | |||
| const Mat& bottom_blob1 = bottom_blobs[1]; | |||
| float coeff0 = coeffs_ptr[0]; | |||
| float coeff1 = coeffs_ptr[1]; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob.channel(q); | |||
| const float* ptr1 = bottom_blob1.channel(q); | |||
| float* outptr = top_blob.channel(q); | |||
| for (int i=0; i<size; i++) | |||
| { | |||
| outptr[i] = ptr[i] * coeff0 + ptr1[i] * coeff1; | |||
| } | |||
| } | |||
| for (size_t b=2; b<bottom_blobs.size(); b++) | |||
| { | |||
| const Mat& bottom_blob1 = bottom_blobs[b]; | |||
| float coeff = coeffs_ptr[b]; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob1.channel(q); | |||
| float* outptr = top_blob.channel(q); | |||
| for (int i=0; i<size; i++) | |||
| { | |||
| outptr[i] += ptr[i] * coeff; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| else if (op_type == Operation_MAX) | |||
| { | |||
| // first blob | |||
| const Mat& bottom_blob1 = bottom_blobs[1]; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob.channel(q); | |||
| const float* ptr1 = bottom_blob1.channel(q); | |||
| float* outptr = top_blob.channel(q); | |||
| for (int i=0; i<size; i++) | |||
| { | |||
| outptr[i] = std::max(ptr[i], ptr1[i]); | |||
| } | |||
| } | |||
| // remaining blobs raise the running maximum | |||
| for (size_t b=2; b<bottom_blobs.size(); b++) | |||
| { | |||
| const Mat& bottom_blob1 = bottom_blobs[b]; | |||
| #pragma omp parallel for | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob1.channel(q); | |||
| float* outptr = top_blob.channel(q); | |||
| for (int i=0; i<size; i++) | |||
| { | |||
| outptr[i] = std::max(outptr[i], ptr[i]); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,48 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_ELTWISE_H | |||
| #define LAYER_ELTWISE_H | |||
| #include "layer.h" | |||
| namespace ncnn { | |||
| // Element-wise layer: combines several input blobs into one output blob by | |||
| // product, (optionally weighted) sum or maximum. | |||
| class Eltwise : public Layer | |||
| { | |||
| public: | |||
| Eltwise(); | |||
| #if NCNN_STDIO | |||
| #if NCNN_STRING | |||
| // read op_type, num_coeff and the coefficients from a text param file | |||
| virtual int load_param(FILE* paramfp); | |||
| #endif // NCNN_STRING | |||
| // read op_type, num_coeff and the coefficients from a binary param file | |||
| virtual int load_param_bin(FILE* paramfp); | |||
| #endif // NCNN_STDIO | |||
| // read op_type, num_coeff and the coefficients from a memory buffer and advance mem | |||
| virtual int load_param(const unsigned char*& mem); | |||
| virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const; | |||
| // values accepted for op_type | |||
| enum { Operation_PROD = 0, Operation_SUM = 1, Operation_MAX = 2 }; | |||
| public: | |||
| // param | |||
| // one of the Operation_* values above | |||
| int op_type; | |||
| // number of entries in coeffs; 0 selects the unweighted sum | |||
| int num_coeff; | |||
| // per-input weights used by the weighted sum | |||
| Mat coeffs; | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_ELTWISE_H | |||