From 9706cd1447119cd3f66e2dddc8a5ffc5935f8661 Mon Sep 17 00:00:00 2001 From: nihui Date: Sun, 15 Jul 2018 20:38:36 +0800 Subject: [PATCH] implement ncnn blob/workspace allocator, fine-grained per-layer openmp threads control, fix #469 --- benchmark/benchncnn.cpp | 19 +- src/CMakeLists.txt | 1 + src/allocator.cpp | 237 +++++++++++++++++++ src/allocator.h | 175 ++++++++++++++ src/layer.cpp | 46 +++- src/layer.h | 24 +- src/layer/absval.cpp | 4 +- src/layer/absval.h | 2 +- src/layer/argmax.cpp | 6 +- src/layer/argmax.h | 2 +- src/layer/arm/absval_arm.cpp | 4 +- src/layer/arm/absval_arm.h | 2 +- src/layer/arm/batchnorm_arm.cpp | 6 +- src/layer/arm/batchnorm_arm.h | 2 +- src/layer/arm/bias_arm.cpp | 4 +- src/layer/arm/bias_arm.h | 2 +- src/layer/arm/convolution_1x1.h | 32 +-- src/layer/arm/convolution_2x2.h | 4 +- src/layer/arm/convolution_3x3.h | 55 ++--- src/layer/arm/convolution_4x4.h | 4 +- src/layer/arm/convolution_5x5.h | 8 +- src/layer/arm/convolution_7x7.h | 8 +- src/layer/arm/convolution_arm.cpp | 48 ++-- src/layer/arm/convolution_arm.h | 6 +- src/layer/arm/convolutiondepthwise_3x3.h | 8 +- src/layer/arm/convolutiondepthwise_arm.cpp | 19 +- src/layer/arm/convolutiondepthwise_arm.h | 2 +- src/layer/arm/deconvolution_3x3.h | 8 +- src/layer/arm/deconvolution_4x4.h | 8 +- src/layer/arm/deconvolution_arm.cpp | 39 ++- src/layer/arm/deconvolution_arm.h | 2 +- src/layer/arm/deconvolutiondepthwise_arm.cpp | 35 ++- src/layer/arm/deconvolutiondepthwise_arm.h | 2 +- src/layer/arm/eltwise_arm.cpp | 21 +- src/layer/arm/eltwise_arm.h | 2 +- src/layer/arm/innerproduct_arm.cpp | 9 +- src/layer/arm/innerproduct_arm.h | 2 +- src/layer/arm/lrn_arm.cpp | 15 +- src/layer/arm/lrn_arm.h | 2 +- src/layer/arm/pooling_2x2.h | 4 +- src/layer/arm/pooling_3x3.h | 4 +- src/layer/arm/pooling_arm.cpp | 21 +- src/layer/arm/pooling_arm.h | 2 +- src/layer/arm/prelu_arm.cpp | 6 +- src/layer/arm/prelu_arm.h | 2 +- src/layer/arm/relu_arm.cpp | 6 +- src/layer/arm/relu_arm.h | 2 +- 
src/layer/arm/scale_arm.cpp | 8 +- src/layer/arm/scale_arm.h | 2 +- src/layer/arm/sigmoid_arm.cpp | 4 +- src/layer/arm/sigmoid_arm.h | 2 +- src/layer/arm/softmax_arm.cpp | 13 +- src/layer/arm/softmax_arm.h | 2 +- src/layer/batchnorm.cpp | 8 +- src/layer/batchnorm.h | 2 +- src/layer/bias.cpp | 4 +- src/layer/bias.h | 2 +- src/layer/binaryop.cpp | 81 +++---- src/layer/binaryop.h | 4 +- src/layer/bnll.cpp | 4 +- src/layer/bnll.h | 2 +- src/layer/clip.cpp | 4 +- src/layer/clip.h | 2 +- src/layer/concat.cpp | 20 +- src/layer/concat.h | 2 +- src/layer/convolution.cpp | 13 +- src/layer/convolution.h | 2 +- src/layer/convolutiondepthwise.cpp | 15 +- src/layer/convolutiondepthwise.h | 2 +- src/layer/crop.cpp | 8 +- src/layer/crop.h | 4 +- src/layer/deconvolution.cpp | 31 ++- src/layer/deconvolution.h | 2 +- src/layer/deconvolutiondepthwise.cpp | 33 ++- src/layer/deconvolutiondepthwise.h | 2 +- src/layer/detectionoutput.cpp | 10 +- src/layer/detectionoutput.h | 2 +- src/layer/dropout.cpp | 4 +- src/layer/dropout.h | 2 +- src/layer/eltwise.cpp | 21 +- src/layer/eltwise.h | 2 +- src/layer/elu.cpp | 4 +- src/layer/elu.h | 2 +- src/layer/embed.cpp | 6 +- src/layer/embed.h | 2 +- src/layer/exp.cpp | 6 +- src/layer/exp.h | 2 +- src/layer/expanddims.cpp | 18 +- src/layer/expanddims.h | 2 +- src/layer/flatten.cpp | 7 +- src/layer/flatten.h | 2 +- src/layer/innerproduct.cpp | 7 +- src/layer/innerproduct.h | 2 +- src/layer/input.cpp | 2 +- src/layer/input.h | 2 +- src/layer/instancenorm.cpp | 4 +- src/layer/instancenorm.h | 2 +- src/layer/interp.cpp | 10 +- src/layer/interp.h | 2 +- src/layer/log.cpp | 6 +- src/layer/log.h | 2 +- src/layer/lrn.cpp | 15 +- src/layer/lrn.h | 2 +- src/layer/lstm.cpp | 53 ++--- src/layer/lstm.h | 2 +- src/layer/memorydata.cpp | 4 +- src/layer/memorydata.h | 2 +- src/layer/mvn.cpp | 21 +- src/layer/mvn.h | 2 +- src/layer/normalize.cpp | 25 +- src/layer/normalize.h | 2 +- src/layer/padding.cpp | 4 +- src/layer/padding.h | 2 +- src/layer/permute.cpp | 23 +- 
src/layer/permute.h | 2 +- src/layer/pooling.cpp | 21 +- src/layer/pooling.h | 2 +- src/layer/power.cpp | 4 +- src/layer/power.h | 2 +- src/layer/prelu.cpp | 10 +- src/layer/prelu.h | 2 +- src/layer/priorbox.cpp | 6 +- src/layer/priorbox.h | 2 +- src/layer/proposal.cpp | 6 +- src/layer/proposal.h | 2 +- src/layer/reduction.cpp | 41 ++-- src/layer/reduction.h | 2 +- src/layer/relu.cpp | 6 +- src/layer/relu.h | 2 +- src/layer/reorg.cpp | 7 +- src/layer/reorg.h | 2 +- src/layer/reshape.cpp | 11 +- src/layer/reshape.h | 2 +- src/layer/rnn.cpp | 7 +- src/layer/rnn.h | 2 +- src/layer/roipooling.cpp | 7 +- src/layer/roipooling.h | 2 +- src/layer/scale.cpp | 18 +- src/layer/scale.h | 4 +- src/layer/shufflechannel.cpp | 4 +- src/layer/shufflechannel.h | 2 +- src/layer/sigmoid.cpp | 4 +- src/layer/sigmoid.h | 2 +- src/layer/slice.cpp | 20 +- src/layer/slice.h | 2 +- src/layer/softmax.cpp | 43 ++-- src/layer/softmax.h | 2 +- src/layer/split.cpp | 2 +- src/layer/split.h | 2 +- src/layer/spp.cpp | 12 +- src/layer/spp.h | 2 +- src/layer/squeeze.cpp | 14 +- src/layer/squeeze.h | 2 +- src/layer/tanh.cpp | 4 +- src/layer/tanh.h | 2 +- src/layer/threshold.cpp | 4 +- src/layer/threshold.h | 2 +- src/layer/tile.cpp | 15 +- src/layer/tile.h | 2 +- src/layer/unaryop.cpp | 38 +-- src/layer/unaryop.h | 2 +- src/layer/x86/convolution_1x1.h | 8 +- src/layer/x86/convolution_3x3.h | 4 +- src/layer/x86/convolution_5x5.h | 4 +- src/layer/x86/convolution_x86.cpp | 42 ++-- src/layer/x86/convolution_x86.h | 6 +- src/layer/x86/convolutiondepthwise_3x3.h | 8 +- src/layer/x86/convolutiondepthwise_x86.cpp | 19 +- src/layer/x86/convolutiondepthwise_x86.h | 2 +- src/layer/yolodetectionoutput.cpp | 8 +- src/layer/yolodetectionoutput.h | 2 +- src/mat.cpp | 28 ++- src/mat.h | 189 ++++++--------- src/mat_pixel.cpp | 70 +++--- src/net.cpp | 95 +++----- src/net.h | 11 +- 176 files changed, 1414 insertions(+), 924 deletions(-) create mode 100644 src/allocator.cpp create mode 100644 src/allocator.h diff --git 
a/benchmark/benchncnn.cpp b/benchmark/benchncnn.cpp index 8e6fdd28d..42e1c9b34 100644 --- a/benchmark/benchncnn.cpp +++ b/benchmark/benchncnn.cpp @@ -52,6 +52,9 @@ public: static int g_loop_count = 4; +static ncnn::UnlockedPoolAllocator g_blob_pool_allocator; +static ncnn::PoolAllocator g_workspace_pool_allocator; + void benchmark(const char* comment, void (*init)(ncnn::Net&), void (*run)(const ncnn::Net&)) { ncnn::BenchNet net; @@ -60,6 +63,9 @@ void benchmark(const char* comment, void (*init)(ncnn::Net&), void (*run)(const net.load_model(); + g_blob_pool_allocator.clear(); + g_workspace_pool_allocator.clear(); + // sleep 10 seconds for cooling down SOC :( #ifdef _WIN32 Sleep(10 * 1000); @@ -265,8 +271,6 @@ void mobilenet_yolo_run(const ncnn::Net& net) { ncnn::Extractor ex = net.create_extractor(); - // NOTE original model input is 416x416x3 - // you may change to 300x300x3 for comparison with ssd ncnn::Mat in(416, 416, 3); ex.input("data", in); @@ -295,6 +299,17 @@ int main(int argc, char** argv) g_loop_count = loop_count; + g_blob_pool_allocator.set_size_compare_ratio(0.0f); + g_workspace_pool_allocator.set_size_compare_ratio(0.5f); + + ncnn::Option opt; + opt.lightmode = true; + opt.num_threads = num_threads; + opt.blob_allocator = &g_blob_pool_allocator; + opt.workspace_allocator = &g_workspace_pool_allocator; + + ncnn::set_default_option(opt); + ncnn::set_cpu_powersave(powersave); ncnn::set_omp_dynamic(0); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e72b16606..fa7ec646e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -8,6 +8,7 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR}) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/layer) set(ncnn_SRCS + allocator.cpp blob.cpp cpu.cpp layer.cpp diff --git a/src/allocator.cpp b/src/allocator.cpp new file mode 100644 index 000000000..2f866ee0a --- /dev/null +++ b/src/allocator.cpp @@ -0,0 +1,237 @@ +// Tencent is pleased to support the open source community by making ncnn available. 
+// +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "allocator.h" + +#include + +namespace ncnn { + +PoolAllocator::PoolAllocator() +{ + size_compare_ratio = 192;// 0.75f * 256 +} + +PoolAllocator::~PoolAllocator() +{ + clear(); + + if (!payouts.empty()) + { + fprintf(stderr, "FATAL ERROR! pool allocator destroyed too early\n"); + std::list< std::pair >::iterator it = payouts.begin(); + for (; it != payouts.end(); it++) + { + void* ptr = it->second; + fprintf(stderr, "%p still in use\n", ptr); + } + } +} + +void PoolAllocator::clear() +{ + budgets_lock.lock(); + + std::list< std::pair >::iterator it = budgets.begin(); + for (; it != budgets.end(); it++) + { + void* ptr = it->second; + ncnn::fastFree(ptr); + } + budgets.clear(); + + budgets_lock.unlock(); +} + +void PoolAllocator::set_size_compare_ratio(float scr) +{ + if (scr < 0.f || scr > 1.f) + { + fprintf(stderr, "invalid size compare ratio %f\n", scr); + return; + } + + size_compare_ratio = (unsigned int)(scr * 256); +} + +void* PoolAllocator::fastMalloc(size_t size) +{ + budgets_lock.lock(); + + // find free budget + std::list< std::pair >::iterator it = budgets.begin(); + for (; it != budgets.end(); it++) + { + size_t bs = it->first; + + // size_compare_ratio ~ 100% + if (bs >= size && ((bs * size_compare_ratio) >> 8) <= size) + { + void* ptr = it->second; + + budgets.erase(it); + + budgets_lock.unlock(); + + 
payouts_lock.lock(); + + payouts.push_back(std::make_pair(bs, ptr)); + + payouts_lock.unlock(); + + return ptr; + } + } + + budgets_lock.unlock(); + + // new + void* ptr = ncnn::fastMalloc(size); + + payouts_lock.lock(); + + payouts.push_back(std::make_pair(size, ptr)); + + payouts_lock.unlock(); + + return ptr; +} + +void PoolAllocator::fastFree(void* ptr) +{ + payouts_lock.lock(); + + // return to budgets + std::list< std::pair >::iterator it = payouts.begin(); + for (; it != payouts.end(); it++) + { + if (it->second == ptr) + { + size_t size = it->first; + + payouts.erase(it); + + payouts_lock.unlock(); + + budgets_lock.lock(); + + budgets.push_back(std::make_pair(size, ptr)); + + budgets_lock.unlock(); + + return; + } + } + + payouts_lock.unlock(); + + fprintf(stderr, "FATAL ERROR! pool allocator get wild %p\n", ptr); + ncnn::fastFree(ptr); +} + +UnlockedPoolAllocator::UnlockedPoolAllocator() +{ + size_compare_ratio = 192;// 0.75f * 256 +} + +UnlockedPoolAllocator::~UnlockedPoolAllocator() +{ + clear(); + + if (!payouts.empty()) + { + fprintf(stderr, "FATAL ERROR! 
unlocked pool allocator destroyed too early\n"); + std::list< std::pair >::iterator it = payouts.begin(); + for (; it != payouts.end(); it++) + { + void* ptr = it->second; + fprintf(stderr, "%p still in use\n", ptr); + } + } +} + +void UnlockedPoolAllocator::clear() +{ + std::list< std::pair >::iterator it = budgets.begin(); + for (; it != budgets.end(); it++) + { + void* ptr = it->second; + ncnn::fastFree(ptr); + } + budgets.clear(); +} + +void UnlockedPoolAllocator::set_size_compare_ratio(float scr) +{ + if (scr < 0.f || scr > 1.f) + { + fprintf(stderr, "invalid size compare ratio %f\n", scr); + return; + } + + size_compare_ratio = (unsigned int)(scr * 256); +} + +void* UnlockedPoolAllocator::fastMalloc(size_t size) +{ + // find free budget + std::list< std::pair >::iterator it = budgets.begin(); + for (; it != budgets.end(); it++) + { + size_t bs = it->first; + + // size_compare_ratio ~ 100% + if (bs >= size && ((bs * size_compare_ratio) >> 8) <= size) + { + void* ptr = it->second; + + budgets.erase(it); + + payouts.push_back(std::make_pair(bs, ptr)); + + return ptr; + } + } + + // new + void* ptr = ncnn::fastMalloc(size); + + payouts.push_back(std::make_pair(size, ptr)); + + return ptr; +} + +void UnlockedPoolAllocator::fastFree(void* ptr) +{ + // return to budgets + std::list< std::pair >::iterator it = payouts.begin(); + for (; it != payouts.end(); it++) + { + if (it->second == ptr) + { + size_t size = it->first; + + payouts.erase(it); + + budgets.push_back(std::make_pair(size, ptr)); + + return; + } + } + + fprintf(stderr, "FATAL ERROR! unlocked pool allocator get wild %p\n", ptr); + ncnn::fastFree(ptr); +} + +} // namespace ncnn diff --git a/src/allocator.h b/src/allocator.h new file mode 100644 index 000000000..061250ab0 --- /dev/null +++ b/src/allocator.h @@ -0,0 +1,175 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef NCNN_ALLOCATOR_H +#define NCNN_ALLOCATOR_H + +#ifdef _WIN32 +#define WIN32_LEAN_AND_MEAN +#include +#else +#include +#endif + +#include +#include + +namespace ncnn { + +// the alignment of all the allocated buffers +#define MALLOC_ALIGN 16 + +// Aligns a pointer to the specified number of bytes +// ptr Aligned pointer +// n Alignment size that must be a power of two +template static inline _Tp* alignPtr(_Tp* ptr, int n=(int)sizeof(_Tp)) +{ + return (_Tp*)(((size_t)ptr + n-1) & -n); +} + +// Aligns a buffer size to the specified number of bytes +// The function returns the minimum number that is greater or equal to sz and is divisible by n +// sz Buffer size to align +// n Alignment size that must be a power of two +static inline size_t alignSize(size_t sz, int n) +{ + return (sz + n-1) & -n; +} + +static inline void* fastMalloc(size_t size) +{ + unsigned char* udata = (unsigned char*)malloc(size + sizeof(void*) + MALLOC_ALIGN); + if (!udata) + return 0; + unsigned char** adata = alignPtr((unsigned char**)udata + 1, MALLOC_ALIGN); + adata[-1] = udata; + return adata; +} + +static inline void fastFree(void* ptr) +{ + if (ptr) + { + unsigned char* udata = ((unsigned char**)ptr)[-1]; + free(udata); + } +} + +// exchange-add operation for atomic operations on reference counters +#if defined __INTEL_COMPILER && !(defined WIN32 || defined _WIN32) +// atomic increment on the linux version of the Intel(tm) compiler +# 
define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd(const_cast(reinterpret_cast(addr)), delta) +#elif defined __GNUC__ +# if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && !defined __EMSCRIPTEN__ && !defined(__CUDACC__) +# ifdef __ATOMIC_ACQ_REL +# define NCNN_XADD(addr, delta) __c11_atomic_fetch_add((_Atomic(int)*)(addr), delta, __ATOMIC_ACQ_REL) +# else +# define NCNN_XADD(addr, delta) __atomic_fetch_add((_Atomic(int)*)(addr), delta, 4) +# endif +# else +# if defined __ATOMIC_ACQ_REL && !defined __clang__ +// version for gcc >= 4.7 +# define NCNN_XADD(addr, delta) (int)__atomic_fetch_add((unsigned*)(addr), (unsigned)(delta), __ATOMIC_ACQ_REL) +# else +# define NCNN_XADD(addr, delta) (int)__sync_fetch_and_add((unsigned*)(addr), (unsigned)(delta)) +# endif +# endif +#elif defined _MSC_VER && !defined RC_INVOKED +# include +# define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)addr, delta) +#else +static inline int NCNN_XADD(int* addr, int delta) { int tmp = *addr; *addr += delta; return tmp; }// non-atomic fallback, returns the value held before the add +#endif + +#ifdef _WIN32 +class Mutex +{ +public: + Mutex() { InitializeSRWLock(&srwlock); } + ~Mutex() { } + void lock() { AcquireSRWLockExclusive(&srwlock); } + void unlock() { ReleaseSRWLockExclusive(&srwlock); } +private: + // NOTE SRWLock is available from windows vista + SRWLOCK srwlock;// must not be named lock, that would clash with the lock() member function +}; +#else // _WIN32 +class Mutex +{ +public: + Mutex() { pthread_mutex_init(&mutex, 0); } + ~Mutex() { pthread_mutex_destroy(&mutex); } + void lock() { pthread_mutex_lock(&mutex); } + void unlock() { pthread_mutex_unlock(&mutex); } +private: + pthread_mutex_t mutex; +}; +#endif // _WIN32 + +class Allocator +{ +public: + virtual void* fastMalloc(size_t size) = 0; + virtual void fastFree(void* ptr) = 0; +}; + +class PoolAllocator : public Allocator +{ +public: + PoolAllocator(); + ~PoolAllocator(); + + // ratio range 0 ~ 1 + // default cr = 0.75 + void set_size_compare_ratio(float scr); + + // release all budgets immediately + void clear(); + + 
virtual void* fastMalloc(size_t size); + virtual void fastFree(void* ptr); + +private: + Mutex budgets_lock; + Mutex payouts_lock; + unsigned int size_compare_ratio;// 0~256 + std::list< std::pair > budgets; + std::list< std::pair > payouts; +}; + +class UnlockedPoolAllocator : public Allocator +{ +public: + UnlockedPoolAllocator(); + ~UnlockedPoolAllocator(); + + // ratio range 0 ~ 1 + // default cr = 0.75 + void set_size_compare_ratio(float scr); + + // release all budgets immediately + void clear(); + + virtual void* fastMalloc(size_t size); + virtual void fastFree(void* ptr); + +private: + unsigned int size_compare_ratio;// 0~256 + std::list< std::pair > budgets; + std::list< std::pair > payouts; +}; + +} // namespace ncnn + +#endif // NCNN_ALLOCATOR_H diff --git a/src/layer.cpp b/src/layer.cpp index 01e5e638c..522e33e6c 100644 --- a/src/layer.cpp +++ b/src/layer.cpp @@ -14,10 +14,40 @@ #include "layer.h" +#include #include +#include "cpu.h" namespace ncnn { +Option::Option() +{ + lightmode = true; + num_threads = get_cpu_count(); + blob_allocator = 0; + workspace_allocator = 0; +} + +static Option g_default_option; + +const Option& get_default_option() +{ + return g_default_option; +} + +int set_default_option(const Option& opt) +{ + if (opt.num_threads <= 0) + { + fprintf(stderr, "invalid option num_threads %d\n", opt.num_threads); + return -1; + } + + g_default_option = opt; + + return 0; +} + Layer::Layer() { one_blob_only = false; @@ -38,7 +68,7 @@ int Layer::load_model(const ModelBin& /*mb*/) return 0; } -int Layer::forward(const std::vector& bottom_blobs, std::vector& top_blobs) const +int Layer::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { if (!support_inplace) return -1; @@ -46,32 +76,32 @@ int Layer::forward(const std::vector& bottom_blobs, std::vector& top_b top_blobs = bottom_blobs; for (int i = 0; i < (int)top_blobs.size(); i++) { - top_blobs[i] = bottom_blobs[i].clone(); + top_blobs[i] = 
bottom_blobs[i].clone(opt.blob_allocator); if (top_blobs[i].empty()) return -100; } - return forward_inplace(top_blobs); + return forward_inplace(top_blobs, opt); } -int Layer::forward(const Mat& bottom_blob, Mat& top_blob) const +int Layer::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { if (!support_inplace) return -1; - top_blob = bottom_blob.clone(); + top_blob = bottom_blob.clone(opt.blob_allocator); if (top_blob.empty()) return -100; - return forward_inplace(top_blob); + return forward_inplace(top_blob, opt); } -int Layer::forward_inplace(std::vector& /*bottom_top_blobs*/) const +int Layer::forward_inplace(std::vector& /*bottom_top_blobs*/, const Option& /*opt*/) const { return -1; } -int Layer::forward_inplace(Mat& /*bottom_top_blob*/) const +int Layer::forward_inplace(Mat& /*bottom_top_blob*/, const Option& /*opt*/) const { return -1; } diff --git a/src/layer.h b/src/layer.h index 1eeae3b97..b46bf177d 100644 --- a/src/layer.h +++ b/src/layer.h @@ -25,6 +25,22 @@ namespace ncnn { +class Allocator; +class Option +{ +public: + Option(); + +public: + bool lightmode; + int num_threads; + Allocator* blob_allocator; + Allocator* workspace_allocator; +}; + +const Option& get_default_option(); +int set_default_option(const Option& opt); + class Layer { public: @@ -51,13 +67,13 @@ public: public: // implement inference // return 0 if success - virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs) const; - virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt = get_default_option()) const; + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt = get_default_option()) const; // implement inplace inference // return 0 if success - virtual int forward_inplace(std::vector& bottom_top_blobs) const; - virtual int forward_inplace(Mat& bottom_top_blob) const; + virtual int 
forward_inplace(std::vector& bottom_top_blobs, const Option& opt = get_default_option()) const; + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt = get_default_option()) const; public: #if NCNN_STRING diff --git a/src/layer/absval.cpp b/src/layer/absval.cpp index 5b066ab88..73a8f04c7 100644 --- a/src/layer/absval.cpp +++ b/src/layer/absval.cpp @@ -24,14 +24,14 @@ AbsVal::AbsVal() support_inplace = true; } -int AbsVal::forward_inplace(Mat& bottom_top_blob) const +int AbsVal::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; int h = bottom_top_blob.h; int channels = bottom_top_blob.c; int size = w * h; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q> 3; int remain_size_start = nn_size << 3; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int ii=0; ii> 2; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int ii=0; ii> 3; remain_outch_start = nn_outch << 3; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int pp=0; pp> 2; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int pp=0; pp> 3; remain_outch_start = nn_outch << 3; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int pp=0; pp> 2; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int pp=0; pp> 2; int remain_outch_start = nn_outch << 2; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int pp=0; pp #endif // __ARM_NEON -static void conv2x2s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) +static void conv2x2s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) { int w = bottom_blob.w; int inch = bottom_blob.c; @@ -28,7 +28,7 @@ static void 
conv2x2s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke const float* kernel = _kernel; const float* bias = _bias; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int p=0; p #endif // __ARM_NEON -static void conv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) +static void conv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) { int w = bottom_blob.w; int inch = bottom_blob.c; @@ -31,7 +31,7 @@ static void conv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke int nn_outch = outch >> 1; int remain_outch_start = nn_outch << 1; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int pp=0; pp> 2; int remain_outch_start = nn_outch << 2; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int pp=0; pp> 3; remain_outch_start = nn_outch << 3; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int pp=0; pp> 2; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int pp=0; pp> 1; int remain_outch_start = nn_outch << 1; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int pp=0; pp #endif // __ARM_NEON -static void conv4x4s4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) +static void conv4x4s4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) { int w = bottom_blob.w; int inch = bottom_blob.c; @@ -30,7 +30,7 @@ static void conv4x4s4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke const float* kernel = _kernel; const float* bias = _bias; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int p=0; p #endif // __ARM_NEON -static void conv5x5s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, 
const Mat& _bias) +static void conv5x5s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) { int w = bottom_blob.w; int inch = bottom_blob.c; @@ -28,7 +28,7 @@ static void conv5x5s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke const float* kernel = _kernel; const float* bias = _bias; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int p=0; p #endif // __ARM_NEON -static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) +static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) { int w = bottom_blob.w; int inch = bottom_blob.c; @@ -28,7 +28,7 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke const float* kernel = _kernel; const float* bias = _bias; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int p=0; p 0 || pad_h > 0) { - copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f); + copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); if (bottom_blob_bordered.empty()) return -100; @@ -101,7 +102,7 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv int hpad = kernel_extent + (h - 1) / stride * stride - h; if (wpad > 0 || hpad > 0) { - copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f); + copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); if (bottom_blob_bordered.empty()) return -100; } @@ -113,7 +114,7 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv int outw = (w - kernel_extent) / 
stride + 1; int outh = (h - kernel_extent) / stride + 1; - top_blob.create(outw, outh, num_output); + top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -132,7 +133,7 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv if (inner_bottom_blob.w != inner_w || inner_bottom_blob.h != inner_h) { - inner_bottom_blob.create(inner_w, inner_h, bottom_blob.c); + inner_bottom_blob.create(inner_w, inner_h, bottom_blob.c, elemsize, opt.workspace_allocator); if (inner_bottom_blob.empty()) { @@ -142,7 +143,7 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv if (inner_top_blob.w != inner_outw || inner_top_blob.h != inner_outh) { - inner_top_blob.create(inner_outw, inner_outh, num_output); + inner_top_blob.create(inner_outw, inner_outh, num_output, elemsize, opt.workspace_allocator); if (inner_top_blob.empty()) { @@ -150,7 +151,7 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv } } - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int c = 0; c < bottom_blob.c; c ++) { float *outptr = (float *) inner_bottom_blob.channel(c); @@ -166,9 +167,9 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv } } - conv(inner_bottom_blob, inner_top_blob, weight_data, bias_data); + conv(inner_bottom_blob, inner_top_blob, weight_data, bias_data, opt); - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int c = 0; c < num_output; c ++) { float *outptr = (float *) top_blob.channel(c) + x * outw + y; @@ -188,19 +189,19 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv return 0; } -int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const +int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { // convolv with NxN kernel // value = value + bias if 
(bottom_blob.dims != 3) { - return Convolution::forward(bottom_blob, top_blob); + return Convolution::forward(bottom_blob, top_blob, opt); } if (kernel_w != kernel_h || stride_w != stride_h) { - return Convolution::forward(bottom_blob, top_blob); + return Convolution::forward(bottom_blob, top_blob, opt); } const int kernel_size = kernel_w; @@ -208,10 +209,10 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const if (kernel_size > 7 || stride > 4 || dilation_w != dilation_h) { - return Convolution::forward(bottom_blob, top_blob); + return Convolution::forward(bottom_blob, top_blob, opt); } - typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&); + typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&); // kernel_size x stride conv_func conv_func_table[7][4] = @@ -263,22 +264,23 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const conv_func conv = conv_func_table[kernel_size-1][stride-1]; if (!conv) { - return Convolution::forward(bottom_blob, top_blob); + return Convolution::forward(bottom_blob, top_blob, opt); } if (dilation_w != 1) { - return forwardDilation(bottom_blob, top_blob, conv); + return forwardDilation(bottom_blob, top_blob, conv, opt); } int w = bottom_blob.w; int h = bottom_blob.h; int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; Mat bottom_blob_bordered = bottom_blob; if (pad_w > 0 || pad_h > 0) { - copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f); + copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); if (bottom_blob_bordered.empty()) return -100; @@ -291,7 +293,7 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const int hpad = kernel_size + (h - 1) / stride * stride - h; if (wpad > 0 || hpad > 0) { - copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 
2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f); + copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); if (bottom_blob_bordered.empty()) return -100; } @@ -303,21 +305,21 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const int outw = (w - kernel_size) / stride + 1; int outh = (h - kernel_size) / stride + 1; - top_blob.create(outw, outh, num_output); + top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; if (use_winograd3x3 && w <= 120 && h <= 120) { -// conv3x3s1_winograd64_neon4(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data); - conv3x3s1_winograd64_neon5(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data); +// conv3x3s1_winograd64_neon4(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt); + conv3x3s1_winograd64_neon5(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt); } else if (use_sgemm1x1) { - conv1x1s1_sgemm_neon(bottom_blob_bordered, top_blob, weight_1x1_sgemm_data, bias_data); + conv1x1s1_sgemm_neon(bottom_blob_bordered, top_blob, weight_1x1_sgemm_data, bias_data, opt); } else - conv(bottom_blob_bordered, top_blob, weight_data, bias_data); + conv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); return 0; } diff --git a/src/layer/arm/convolution_arm.h b/src/layer/arm/convolution_arm.h index 6a47fff51..21d0256f2 100644 --- a/src/layer/arm/convolution_arm.h +++ b/src/layer/arm/convolution_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&); +typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&); class Convolution_arm : public Convolution { @@ -28,8 +28,8 @@ public: virtual int load_model(const ModelBin& mb); - virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; 
- virtual int forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv_func conv) const; + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + virtual int forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv_func conv, const Option& opt) const; public: bool use_winograd3x3; diff --git a/src/layer/arm/convolutiondepthwise_3x3.h b/src/layer/arm/convolutiondepthwise_3x3.h index c2f1ae222..6cd12a999 100644 --- a/src/layer/arm/convolutiondepthwise_3x3.h +++ b/src/layer/arm/convolutiondepthwise_3x3.h @@ -16,7 +16,7 @@ #include #endif // __ARM_NEON -static void convdw3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) +static void convdw3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) { int w = bottom_blob.w; @@ -28,7 +28,7 @@ static void convdw3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ const float* kernel = _kernel; const float* bias = _bias; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int g=0; g 0 || pad_h > 0) { - copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f); + copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); if (bottom_blob_bordered.empty()) return -100; @@ -136,7 +137,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h; if (wpad > 0 || hpad > 0) { - copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f); + copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); if (bottom_blob_bordered.empty()) return -100; } @@ -148,7 +149,7 @@ int 
ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con int outw = (w - kernel_extent_w) / stride_w + 1; int outh = (h - kernel_extent_h) / stride_h + 1; - top_blob.create(outw, outh, num_output); + top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -161,12 +162,12 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con { if (stride_w == 1 && stride_h == 1) { - convdw3x3s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data); + convdw3x3s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); return 0; } else if (stride_w == 2 && stride_h == 2) { - convdw3x3s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data); + convdw3x3s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); return 0; } } @@ -176,7 +177,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con omp_set_nested(0); #endif - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int g=0; gload_model(ModelBinFromMatArray(weights)); // forward - op->forward(bottom_blob_bordered_g, top_blob_g); + op->forward(bottom_blob_bordered_g, top_blob_g, opt); delete op; } @@ -235,7 +236,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con const ncnn::Layer* op = group_ops[g]; // forward - op->forward(bottom_blob_bordered_g, top_blob_g); + op->forward(bottom_blob_bordered_g, top_blob_g, opt); } return 0; diff --git a/src/layer/arm/convolutiondepthwise_arm.h b/src/layer/arm/convolutiondepthwise_arm.h index 8181f4181..188ef6bdd 100644 --- a/src/layer/arm/convolutiondepthwise_arm.h +++ b/src/layer/arm/convolutiondepthwise_arm.h @@ -27,7 +27,7 @@ public: virtual int load_model(const ModelBin& mb); - virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; public: std::vector 
group_ops; diff --git a/src/layer/arm/deconvolution_3x3.h b/src/layer/arm/deconvolution_3x3.h index 931eb9404..39082f2e8 100644 --- a/src/layer/arm/deconvolution_3x3.h +++ b/src/layer/arm/deconvolution_3x3.h @@ -16,7 +16,7 @@ #include #endif // __ARM_NEON -static void deconv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) +static void deconv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) { int w = bottom_blob.w; int h = bottom_blob.h; @@ -28,7 +28,7 @@ static void deconv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ const float* kernel = _kernel; const float* bias = _bias; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int p=0; p #endif // __ARM_NEON -static void deconv4x4s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) +static void deconv4x4s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) { int w = bottom_blob.w; int h = bottom_blob.h; @@ -28,7 +28,7 @@ static void deconv4x4s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ const float* kernel = _kernel; const float* bias = _bias; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int p=0; p 2 || dilation_w != 1 || dilation_h != 1) { - return Deconvolution::forward(bottom_blob, top_blob); + return Deconvolution::forward(bottom_blob, top_blob, opt); } - typedef void (*deconv_func)(const Mat&, Mat&, const Mat&, const Mat&); + typedef void (*deconv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&); // kernel_size x stride deconv_func deconv_func_table[2][2] = @@ -57,33 +57,46 @@ int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const deconv_func deconv = deconv_func_table[kernel_size-3][stride-1]; if (!deconv) { - return Deconvolution::forward(bottom_blob, top_blob); + return 
Deconvolution::forward(bottom_blob, top_blob, opt); } int w = bottom_blob.w; int h = bottom_blob.h; + size_t elemsize = bottom_blob.elemsize; int outw = (w - 1) * stride + kernel_size; int outh = (h - 1) * stride + kernel_size; - Mat top_blob_bordered = top_blob; - top_blob_bordered.create(outw, outh, num_output); - if (top_blob_bordered.empty()) - return -100; - - deconv(bottom_blob, top_blob_bordered, weight_data, bias_data); + Mat top_blob_bordered; + if (pad_w > 0 || pad_h > 0) + { + top_blob_bordered.create(outw, outh, num_output, elemsize, opt.workspace_allocator); + if (top_blob_bordered.empty()) + return -100; + } + else + { + top_blob_bordered = top_blob; + top_blob_bordered.create(outw, outh, num_output, elemsize, opt.blob_allocator); + if (top_blob_bordered.empty()) + return -100; + } - top_blob = top_blob_bordered; + deconv(bottom_blob, top_blob_bordered, weight_data, bias_data, opt); if (pad_w > 0 || pad_h > 0) { - copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w); + copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w, opt.blob_allocator, opt.num_threads); if (top_blob.empty()) return -100; outw = top_blob.w; outh = top_blob.h; } + else + { + top_blob = top_blob_bordered; + } return 0; } diff --git a/src/layer/arm/deconvolution_arm.h b/src/layer/arm/deconvolution_arm.h index ce7a83b5b..6b688b09f 100644 --- a/src/layer/arm/deconvolution_arm.h +++ b/src/layer/arm/deconvolution_arm.h @@ -22,7 +22,7 @@ namespace ncnn { class Deconvolution_arm : public Deconvolution { public: - virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; }; } // namespace ncnn diff --git a/src/layer/arm/deconvolutiondepthwise_arm.cpp b/src/layer/arm/deconvolutiondepthwise_arm.cpp index bfdd5ed7b..a18803745 100644 --- a/src/layer/arm/deconvolutiondepthwise_arm.cpp +++ b/src/layer/arm/deconvolutiondepthwise_arm.cpp @@ -24,7 +24,7 @@ namespace 
ncnn { DEFINE_LAYER_CREATOR(DeconvolutionDepthWise_arm) -int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) const +int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { // convolv with NxN kernel // value = value + bias @@ -32,6 +32,7 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) c int w = bottom_blob.w; int h = bottom_blob.h; int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; if (channels % group != 0 || num_output % group != 0) { @@ -45,10 +46,20 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) c int outw = (w - 1) * stride_w + kernel_extent_w; int outh = (h - 1) * stride_h + kernel_extent_h; - Mat top_blob_bordered = top_blob; - top_blob_bordered.create(outw, outh, num_output); - if (top_blob_bordered.empty()) - return -100; + Mat top_blob_bordered; + if (pad_w > 0 || pad_h > 0) + { + top_blob_bordered.create(outw, outh, num_output, elemsize, opt.workspace_allocator); + if (top_blob_bordered.empty()) + return -100; + } + else + { + top_blob_bordered = top_blob; + top_blob_bordered.create(outw, outh, num_output, elemsize, opt.blob_allocator); + if (top_blob_bordered.empty()) + return -100; + } const int maxk = kernel_w * kernel_h; @@ -60,7 +71,7 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) c omp_set_nested(0); #endif - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int g=0; gload_model(ModelBinFromMatArray(weights)); // forward - op->forward(bottom_blob_g, top_blob_bordered_g); + op->forward(bottom_blob_g, top_blob_bordered_g, opt); delete op; } @@ -148,23 +159,25 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) c op->load_model(ModelBinFromMatArray(weights)); // forward - op->forward(bottom_blob_g, top_blob_bordered_g); + op->forward(bottom_blob_g, top_blob_bordered_g, opt); delete op; } 
} - top_blob = top_blob_bordered; - if (pad_w > 0 || pad_h > 0) { - copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w); + copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w, opt.blob_allocator, opt.num_threads); if (top_blob.empty()) return -100; outw = top_blob.w; outh = top_blob.h; } + else + { + top_blob = top_blob_bordered; + } return 0; diff --git a/src/layer/arm/deconvolutiondepthwise_arm.h b/src/layer/arm/deconvolutiondepthwise_arm.h index 472311da7..792478fd9 100644 --- a/src/layer/arm/deconvolutiondepthwise_arm.h +++ b/src/layer/arm/deconvolutiondepthwise_arm.h @@ -22,7 +22,7 @@ namespace ncnn { class DeconvolutionDepthWise_arm : public DeconvolutionDepthWise { public: - virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; }; } // namespace ncnn diff --git a/src/layer/arm/eltwise_arm.cpp b/src/layer/arm/eltwise_arm.cpp index cbe03d61a..5ed62bef0 100644 --- a/src/layer/arm/eltwise_arm.cpp +++ b/src/layer/arm/eltwise_arm.cpp @@ -22,16 +22,17 @@ namespace ncnn { DEFINE_LAYER_CREATOR(Eltwise_arm) -int Eltwise_arm::forward(const std::vector& bottom_blobs, std::vector& top_blobs) const +int Eltwise_arm::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { const Mat& bottom_blob = bottom_blobs[0]; int w = bottom_blob.w; int h = bottom_blob.h; int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; int size = w * h; Mat& top_blob = top_blobs[0]; - top_blob.create(w, h, channels); + top_blob.create(w, h, channels, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -39,7 +40,7 @@ int Eltwise_arm::forward(const std::vector& bottom_blobs, std::vector& { // first blob const Mat& bottom_blob1 = bottom_blobs[1]; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q& bottom_blobs, std::vector& for (size_t b=2; b& 
bottom_blobs, std::vector& { // first blob const Mat& bottom_blob1 = bottom_blobs[1]; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q& bottom_blobs, std::vector& for (size_t b=2; b& bottom_blobs, std::vector& const Mat& bottom_blob1 = bottom_blobs[1]; float coeff0 = coeffs_ptr[0]; float coeff1 = coeffs_ptr[1]; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q& bottom_blobs, std::vector& { const Mat& bottom_blob1 = bottom_blobs[b]; float coeff = coeffs_ptr[b]; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q& bottom_blobs, std::vector& { // first blob const Mat& bottom_blob1 = bottom_blobs[1]; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q& bottom_blobs, std::vector& for (size_t b=2; b& bottom_blobs, std::vector& top_blobs) const; + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; }; } // namespace ncnn diff --git a/src/layer/arm/innerproduct_arm.cpp b/src/layer/arm/innerproduct_arm.cpp index d9947614d..5005ea7da 100644 --- a/src/layer/arm/innerproduct_arm.cpp +++ b/src/layer/arm/innerproduct_arm.cpp @@ -22,14 +22,15 @@ namespace ncnn { DEFINE_LAYER_CREATOR(InnerProduct_arm) -int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob) const +int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { int w = bottom_blob.w; int h = bottom_blob.h; int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; int size = w * h; - top_blob.create(num_output); + top_blob.create(num_output, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -38,7 +39,7 @@ int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob) const int nn_num_output = num_output >> 2; int remain_num_output_start = nn_num_output << 2; - #pragma omp parallel for + #pragma 
omp parallel for num_threads(opt.num_threads) for (int pp=0; pp 0) { - copy_make_border(square_blob, square_blob_bordered, pad, local_size - pad - 1, pad, local_size - pad - 1, BORDER_CONSTANT, 0.f); + copy_make_border(square_blob, square_blob_bordered, pad, local_size - pad - 1, pad, local_size - pad - 1, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); if (square_blob_bordered.empty()) return -100; @@ -196,7 +197,7 @@ int LRN_arm::forward_inplace(Mat& bottom_top_blob) const } } - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q #endif // __ARM_NEON -static void pooling2x2s2_max_neon(const Mat& bottom_blob, Mat& top_blob) +static void pooling2x2s2_max_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt) { int w = bottom_blob.w; int inch = bottom_blob.c; @@ -26,7 +26,7 @@ static void pooling2x2s2_max_neon(const Mat& bottom_blob, Mat& top_blob) const int tailstep = w - 2*outw + w; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q #endif // __ARM_NEON -static void pooling3x3s2_max_neon(const Mat& bottom_blob, Mat& top_blob) +static void pooling3x3s2_max_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt) { int w = bottom_blob.w; int inch = bottom_blob.c; @@ -26,7 +26,7 @@ static void pooling3x3s2_max_neon(const Mat& bottom_blob, Mat& top_blob) const int tailstep = w - 2*outw + w; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q 0 || hpad > 0) { - copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, pad_value); + copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, pad_value, opt.workspace_allocator, opt.num_threads); if (bottom_blob_bordered.empty()) return -100; } @@ -107,14 +108,14 @@ int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) 
const int outw = (w - kernel_w) / stride_w + 1; int outh = (h - kernel_h) / stride_h + 1; - top_blob.create(outw, outh, channels); + top_blob.create(outw, outh, channels, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; if (kernel_size == 2) - pooling2x2s2_max_neon(bottom_blob_bordered, top_blob); + pooling2x2s2_max_neon(bottom_blob_bordered, top_blob, opt); if (kernel_size == 3) - pooling3x3s2_max_neon(bottom_blob_bordered, top_blob); + pooling3x3s2_max_neon(bottom_blob_bordered, top_blob, opt); return 0; } diff --git a/src/layer/arm/pooling_arm.h b/src/layer/arm/pooling_arm.h index b7d774fa2..72f01533c 100644 --- a/src/layer/arm/pooling_arm.h +++ b/src/layer/arm/pooling_arm.h @@ -22,7 +22,7 @@ namespace ncnn { class Pooling_arm : public Pooling { public: - virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; }; } // namespace ncnn diff --git a/src/layer/arm/prelu_arm.cpp b/src/layer/arm/prelu_arm.cpp index 420d08892..845a8c217 100644 --- a/src/layer/arm/prelu_arm.cpp +++ b/src/layer/arm/prelu_arm.cpp @@ -22,11 +22,11 @@ namespace ncnn { DEFINE_LAYER_CREATOR(PReLU_arm) -int PReLU_arm::forward_inplace(Mat& bottom_top_blob) const +int PReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { int dims = bottom_top_blob.dims; if (dims != 3) - return PReLU::forward_inplace(bottom_top_blob); + return PReLU::forward_inplace(bottom_top_blob, opt); int w = bottom_top_blob.w; int h = bottom_top_blob.h; @@ -35,7 +35,7 @@ int PReLU_arm::forward_inplace(Mat& bottom_top_blob) const const float* slope_data_ptr = slope_data; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q -static int binary_op(const Mat& a, const Mat& b, Mat& c) +static int binary_op(const Mat& a, const Mat& b, Mat& c, const Option& opt) { Op op; @@ -51,6 +51,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c) int h = 
a.h; int channels = a.c; int size = w * h; + size_t elemsize = a.elemsize; int w1 = b.w; int h1 = b.h; @@ -59,13 +60,13 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c) if (a.dims == 3) { - c.create(w, h, channels); + c.create(w, h, channels, elemsize, opt.blob_allocator); if (c.empty()) return -100; if (b.dims == 3) { - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q -static int binary_op_scalar_inplace(Mat& a, float b) +static int binary_op_scalar_inplace(Mat& a, float b, const Option& opt) { Op op; @@ -362,7 +363,7 @@ static int binary_op_scalar_inplace(Mat& a, float b) int channels = a.c; int size = w * h; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q { T operator() (const T& x, const T& y) const { return y / x; } }; -int BinaryOp::forward(const std::vector& bottom_blobs, std::vector& top_blobs) const +int BinaryOp::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { const Mat& bottom_blob = bottom_blobs[0]; const Mat& bottom_blob1 = bottom_blobs[1]; @@ -409,63 +410,63 @@ int BinaryOp::forward(const std::vector& bottom_blobs, std::vector& to Mat& top_blob = top_blobs[0]; if (op_type == Operation_ADD) - return binary_op< std::plus >(bottom_blob, bottom_blob1, top_blob); + return binary_op< std::plus >(bottom_blob, bottom_blob1, top_blob, opt); if (op_type == Operation_SUB) - return binary_op< std::minus >(bottom_blob, bottom_blob1, top_blob); + return binary_op< std::minus >(bottom_blob, bottom_blob1, top_blob, opt); if (op_type == Operation_MUL) - return binary_op< std::multiplies >(bottom_blob, bottom_blob1, top_blob); + return binary_op< std::multiplies >(bottom_blob, bottom_blob1, top_blob, opt); if (op_type == Operation_DIV) - return binary_op< std::divides >(bottom_blob, bottom_blob1, top_blob); + return binary_op< std::divides >(bottom_blob, bottom_blob1, top_blob, opt); if (op_type == Operation_MAX) - 
return binary_op< binary_op_max >(bottom_blob, bottom_blob1, top_blob); + return binary_op< binary_op_max >(bottom_blob, bottom_blob1, top_blob, opt); if (op_type == Operation_MIN) - return binary_op< binary_op_min >(bottom_blob, bottom_blob1, top_blob); + return binary_op< binary_op_min >(bottom_blob, bottom_blob1, top_blob, opt); if (op_type == Operation_POW) - return binary_op< binary_op_pow >(bottom_blob, bottom_blob1, top_blob); + return binary_op< binary_op_pow >(bottom_blob, bottom_blob1, top_blob, opt); if (op_type == Operation_RSUB) - return binary_op< binary_op_rsub >(bottom_blob, bottom_blob1, top_blob); + return binary_op< binary_op_rsub >(bottom_blob, bottom_blob1, top_blob, opt); if (op_type == Operation_RDIV) - return binary_op< binary_op_rdiv >(bottom_blob, bottom_blob1, top_blob); + return binary_op< binary_op_rdiv >(bottom_blob, bottom_blob1, top_blob, opt); return 0; } -int BinaryOp::forward_inplace(Mat& bottom_top_blob) const +int BinaryOp::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { if (op_type == Operation_ADD) - return binary_op_scalar_inplace< std::plus >(bottom_top_blob, b); + return binary_op_scalar_inplace< std::plus >(bottom_top_blob, b, opt); if (op_type == Operation_SUB) - return binary_op_scalar_inplace< std::minus >(bottom_top_blob, b); + return binary_op_scalar_inplace< std::minus >(bottom_top_blob, b, opt); if (op_type == Operation_MUL) - return binary_op_scalar_inplace< std::multiplies >(bottom_top_blob, b); + return binary_op_scalar_inplace< std::multiplies >(bottom_top_blob, b, opt); if (op_type == Operation_DIV) - return binary_op_scalar_inplace< std::divides >(bottom_top_blob, b); + return binary_op_scalar_inplace< std::divides >(bottom_top_blob, b, opt); if (op_type == Operation_MAX) - return binary_op_scalar_inplace< binary_op_max >(bottom_top_blob, b); + return binary_op_scalar_inplace< binary_op_max >(bottom_top_blob, b, opt); if (op_type == Operation_MIN) - return binary_op_scalar_inplace< 
binary_op_min >(bottom_top_blob, b); + return binary_op_scalar_inplace< binary_op_min >(bottom_top_blob, b, opt); if (op_type == Operation_POW) - return binary_op_scalar_inplace< binary_op_pow >(bottom_top_blob, b); + return binary_op_scalar_inplace< binary_op_pow >(bottom_top_blob, b, opt); if (op_type == Operation_RSUB) - return binary_op_scalar_inplace< binary_op_rsub >(bottom_top_blob, b); + return binary_op_scalar_inplace< binary_op_rsub >(bottom_top_blob, b, opt); if (op_type == Operation_RDIV) - return binary_op_scalar_inplace< binary_op_rdiv >(bottom_top_blob, b); + return binary_op_scalar_inplace< binary_op_rdiv >(bottom_top_blob, b, opt); return 0; } diff --git a/src/layer/binaryop.h b/src/layer/binaryop.h index 8affa7c35..daf0b8d4a 100644 --- a/src/layer/binaryop.h +++ b/src/layer/binaryop.h @@ -26,9 +26,9 @@ public: virtual int load_param(const ParamDict& pd); - virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs) const; + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; - virtual int forward_inplace(Mat& bottom_top_blob) const; + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; enum { Operation_ADD = 0, diff --git a/src/layer/bnll.cpp b/src/layer/bnll.cpp index f53052bd8..74c0735e8 100644 --- a/src/layer/bnll.cpp +++ b/src/layer/bnll.cpp @@ -25,14 +25,14 @@ BNLL::BNLL() support_inplace = true; } -int BNLL::forward_inplace(Mat& bottom_top_blob) const +int BNLL::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; int h = bottom_top_blob.h; int channels = bottom_top_blob.c; int size = w * h; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q& bottom_blobs, std::vector& top_blobs) const +int Concat::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { int dims = bottom_blobs[0].dims; size_t elemsize = bottom_blobs[0].elemsize; 
@@ -48,7 +48,7 @@ int Concat::forward(const std::vector& bottom_blobs, std::vector& top_ } Mat& top_blob = top_blobs[0]; - top_blob.create(top_w, elemsize); + top_blob.create(top_w, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -82,7 +82,7 @@ int Concat::forward(const std::vector& bottom_blobs, std::vector& top_ } Mat& top_blob = top_blobs[0]; - top_blob.create(w, top_h, elemsize); + top_blob.create(w, top_h, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -116,11 +116,11 @@ int Concat::forward(const std::vector& bottom_blobs, std::vector& top_ } Mat& top_blob = top_blobs[0]; - top_blob.create(top_w, h, elemsize); + top_blob.create(top_w, h, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int i=0; i& bottom_blobs, std::vector& top_ } Mat& top_blob = top_blobs[0]; - top_blob.create(w, h, top_channels, elemsize); + top_blob.create(w, h, top_channels, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -190,11 +190,11 @@ int Concat::forward(const std::vector& bottom_blobs, std::vector& top_ } Mat& top_blob = top_blobs[0]; - top_blob.create(w, top_h, channels, elemsize); + top_blob.create(w, top_h, channels, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q& bottom_blobs, std::vector& top_ } Mat& top_blob = top_blobs[0]; - top_blob.create(top_w, h, channels, elemsize); + top_blob.create(top_w, h, channels, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q& bottom_blobs, std::vector& top_blobs) const; + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; public: int axis; diff --git a/src/layer/convolution.cpp 
b/src/layer/convolution.cpp index ea8073436..d4757f151 100644 --- a/src/layer/convolution.cpp +++ b/src/layer/convolution.cpp @@ -59,7 +59,7 @@ int Convolution::load_model(const ModelBin& mb) return 0; } -int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const +int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { // convolv with NxN kernel // value = value + bias @@ -89,7 +89,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const op->load_model(ModelBinFromMatArray(weights)); // forward - op->forward(bottom_blob, top_blob); + op->forward(bottom_blob, top_blob, opt); delete op; @@ -100,6 +100,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const int w = bottom_blob.w; int h = bottom_blob.h; int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; // fprintf(stderr, "Convolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d\n", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h); @@ -109,7 +110,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const Mat bottom_blob_bordered = bottom_blob; if (pad_w > 0 || pad_h > 0) { - copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f); + copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); if (bottom_blob_bordered.empty()) return -100; @@ -122,7 +123,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h; if (wpad > 0 || hpad > 0) { - copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f); + copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); if (bottom_blob_bordered.empty()) return 
-100; } @@ -134,7 +135,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const int outw = (w - kernel_extent_w) / stride_w + 1; int outh = (h - kernel_extent_h) / stride_h + 1; - top_blob.create(outw, outh, num_output); + top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -160,7 +161,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const } // num_output - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int p=0; p 0 || pad_h > 0) { - copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f); + copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); if (bottom_blob_bordered.empty()) return -100; @@ -100,7 +101,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h; if (wpad > 0 || hpad > 0) { - copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f); + copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); if (bottom_blob_bordered.empty()) return -100; } @@ -112,7 +113,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const int outw = (w - kernel_extent_w) / stride_w + 1; int outh = (h - kernel_extent_h) / stride_h + 1; - top_blob.create(outw, outh, num_output); + top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -140,7 +141,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const // depth-wise if (channels == group && group == num_output) { - #pragma omp parallel for + #pragma omp parallel for 
num_threads(opt.num_threads) for (int g=0; g& bottom_blobs, std::vector& top_blobs) const +int Crop::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { const Mat& bottom_blob = bottom_blobs[0]; const Mat& reference_blob = bottom_blobs[1]; @@ -85,7 +85,7 @@ int Crop::forward(const std::vector& bottom_blobs, std::vector& top_bl Mat& top_blob = top_blobs[0]; - copy_cut_border(bottom_blob_sliced, top_blob, top, bottom, left, right); + copy_cut_border(bottom_blob_sliced, top_blob, top, bottom, left, right, opt.blob_allocator, opt.num_threads); if (top_blob.empty()) return -100; diff --git a/src/layer/crop.h b/src/layer/crop.h index 43b30defe..ee88ee076 100644 --- a/src/layer/crop.h +++ b/src/layer/crop.h @@ -26,9 +26,9 @@ public: virtual int load_param(const ParamDict& pd); - virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; - virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs) const; + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; public: int woffset; diff --git a/src/layer/deconvolution.cpp b/src/layer/deconvolution.cpp index 5ed43a00c..506d8173e 100644 --- a/src/layer/deconvolution.cpp +++ b/src/layer/deconvolution.cpp @@ -57,7 +57,7 @@ int Deconvolution::load_model(const ModelBin& mb) return 0; } -int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const +int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { // backward strided convolv with NxN kernel // value = value + bias @@ -65,6 +65,7 @@ int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const int w = bottom_blob.w; int h = bottom_blob.h; int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; // fprintf(stderr, "Deconvolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d\n", w, h, pad_w, pad_h, 
kernel_w, kernel_h, stride_w, stride_h); @@ -74,10 +75,20 @@ int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const int outw = (w - 1) * stride_w + kernel_extent_w; int outh = (h - 1) * stride_h + kernel_extent_h; - Mat top_blob_bordered = top_blob; - top_blob_bordered.create(outw, outh, num_output); - if (top_blob_bordered.empty()) - return -100; + Mat top_blob_bordered; + if (pad_w > 0 || pad_h > 0) + { + top_blob_bordered.create(outw, outh, num_output, elemsize, opt.workspace_allocator); + if (top_blob_bordered.empty()) + return -100; + } + else + { + top_blob_bordered = top_blob; + top_blob_bordered.create(outw, outh, num_output, elemsize, opt.blob_allocator); + if (top_blob_bordered.empty()) + return -100; + } const int maxk = kernel_w * kernel_h; @@ -101,7 +112,7 @@ int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const } // num_output - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int p=0; p 0 || pad_h > 0) { - copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w); + copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w, opt.blob_allocator, opt.num_threads); if (top_blob.empty()) return -100; outw = top_blob.w; outh = top_blob.h; } + else + { + top_blob = top_blob_bordered; + } return 0; } diff --git a/src/layer/deconvolution.h b/src/layer/deconvolution.h index 1ef614d3e..10bd9e3c9 100644 --- a/src/layer/deconvolution.h +++ b/src/layer/deconvolution.h @@ -28,7 +28,7 @@ public: virtual int load_model(const ModelBin& mb); - virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; public: // param diff --git a/src/layer/deconvolutiondepthwise.cpp b/src/layer/deconvolutiondepthwise.cpp index 536f82b09..fc1b1ce26 100644 --- a/src/layer/deconvolutiondepthwise.cpp +++ b/src/layer/deconvolutiondepthwise.cpp @@ -58,7 +58,7 @@ int 
DeconvolutionDepthWise::load_model(const ModelBin& mb) return 0; } -int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const +int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { // deconvolv with NxN kernel // value = value + bias @@ -66,6 +66,7 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const int w = bottom_blob.w; int h = bottom_blob.h; int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; if (channels % group != 0 || num_output % group != 0) { @@ -79,10 +80,20 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const int outw = (w - 1) * stride_w + kernel_extent_w; int outh = (h - 1) * stride_h + kernel_extent_h; - Mat top_blob_bordered = top_blob; - top_blob_bordered.create(outw, outh, num_output); - if (top_blob_bordered.empty()) - return -100; + Mat top_blob_bordered; + if (pad_w > 0 || pad_h > 0) + { + top_blob_bordered.create(outw, outh, num_output, elemsize, opt.workspace_allocator); + if (top_blob_bordered.empty()) + return -100; + } + else + { + top_blob_bordered = top_blob; + top_blob_bordered.create(outw, outh, num_output, elemsize, opt.blob_allocator); + if (top_blob_bordered.empty()) + return -100; + } const int maxk = kernel_w * kernel_h; @@ -108,7 +119,7 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const // depth-wise if (channels == group && group == num_output) { - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int g=0; g 0 || pad_h > 0) { - copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w); + copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w, opt.blob_allocator, opt.num_threads); if (top_blob.empty()) return -100; outw = top_blob.w; outh = top_blob.h; } + else + { + top_blob = top_blob_bordered; + } return 0; } diff --git a/src/layer/deconvolutiondepthwise.h 
b/src/layer/deconvolutiondepthwise.h index a1a57795f..674cb2c47 100644 --- a/src/layer/deconvolutiondepthwise.h +++ b/src/layer/deconvolutiondepthwise.h @@ -28,7 +28,7 @@ public: virtual int load_model(const ModelBin& mb); - virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; public: // param diff --git a/src/layer/detectionoutput.cpp b/src/layer/detectionoutput.cpp index fc12955a7..af0976ca2 100644 --- a/src/layer/detectionoutput.cpp +++ b/src/layer/detectionoutput.cpp @@ -141,7 +141,7 @@ static void nms_sorted_bboxes(const std::vector& bboxes, std::vector& bottom_blobs, std::vector& top_blobs) const +int DetectionOutput::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { const Mat& location = bottom_blobs[0]; const Mat& confidence = bottom_blobs[1]; @@ -151,7 +151,7 @@ int DetectionOutput::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& top_blobs) const; + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; public: int num_class; diff --git a/src/layer/dropout.cpp b/src/layer/dropout.cpp index 89ca292eb..ccf79243c 100644 --- a/src/layer/dropout.cpp +++ b/src/layer/dropout.cpp @@ -31,7 +31,7 @@ int Dropout::load_param(const ParamDict& pd) return 0; } -int Dropout::forward_inplace(Mat& bottom_top_blob) const +int Dropout::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { if (scale == 1.f) { @@ -43,7 +43,7 @@ int Dropout::forward_inplace(Mat& bottom_top_blob) const int channels = bottom_top_blob.c; int size = w * h; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q& bottom_blobs, std::vector& top_blobs) const +int Eltwise::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& 
opt) const { const Mat& bottom_blob = bottom_blobs[0]; int w = bottom_blob.w; int h = bottom_blob.h; int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; int size = w * h; Mat& top_blob = top_blobs[0]; - top_blob.create(w, h, channels); + top_blob.create(w, h, channels, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -48,7 +49,7 @@ int Eltwise::forward(const std::vector& bottom_blobs, std::vector& top { // first blob const Mat& bottom_blob1 = bottom_blobs[1]; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q& bottom_blobs, std::vector& top for (size_t b=2; b& bottom_blobs, std::vector& top { // first blob const Mat& bottom_blob1 = bottom_blobs[1]; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q& bottom_blobs, std::vector& top for (size_t b=2; b& bottom_blobs, std::vector& top const Mat& bottom_blob1 = bottom_blobs[1]; float coeff0 = coeffs[0]; float coeff1 = coeffs[1]; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q& bottom_blobs, std::vector& top { const Mat& bottom_blob1 = bottom_blobs[b]; float coeff = coeffs[b]; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q& bottom_blobs, std::vector& top { // first blob const Mat& bottom_blob1 = bottom_blobs[1]; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q& bottom_blobs, std::vector& top for (size_t b=2; b& bottom_blobs, std::vector& top_blobs) const; + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; enum { Operation_PROD = 0, Operation_SUM = 1, Operation_MAX = 2 }; diff --git a/src/layer/elu.cpp b/src/layer/elu.cpp index dd5780a2d..bbd1679a5 100644 --- a/src/layer/elu.cpp +++ b/src/layer/elu.cpp @@ -32,14 +32,14 @@ int ELU::load_param(const ParamDict& pd) return 0; } -int 
ELU::forward_inplace(Mat& bottom_top_blob) const +int ELU::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; int h = bottom_top_blob.h; int channels = bottom_top_blob.c; int size = w * h; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q 0) { - copy_make_border(square_blob, square_blob_bordered, pad, local_size - pad - 1, pad, local_size - pad - 1, BORDER_CONSTANT, 0.f); + copy_make_border(square_blob, square_blob_bordered, pad, local_size - pad - 1, pad, local_size - pad - 1, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); if (square_blob_bordered.empty()) return -100; @@ -135,7 +136,7 @@ int LRN::forward_inplace(Mat& bottom_top_blob) const } } - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q& bottom_blobs, std::vector& top_blobs) const +int LSTM::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { - // size x 1 x T + // size x T const Mat& input_blob = bottom_blobs[0]; + size_t elemsize = input_blob.elemsize; // T, 0 or 1 each const Mat& cont_blob = bottom_blobs[1]; - int T = input_blob.c; + int T = input_blob.h; int size = input_blob.w; // initial hidden state - Mat hidden(num_output); + Mat hidden(num_output, 4u, opt.workspace_allocator); if (hidden.empty()) return -100; hidden.fill(0.f); // internal cell state - Mat cell(num_output); + Mat cell(num_output, 4u, opt.workspace_allocator); if (cell.empty()) return -100; // 4 x num_output - Mat gates(4, num_output); + Mat gates(4, num_output, 4u, opt.workspace_allocator); if (gates.empty()) return -100; Mat& top_blob = top_blobs[0]; - top_blob.create(num_output, 1, T); + top_blob.create(num_output, T, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -93,14 +94,12 @@ int LSTM::forward(const std::vector& bottom_blobs, std::vector& top_bl // 0 otherwise // calculate hidden // gate_input_t := W_hc * 
h_conted_{t-1} + W_xc * x_t + b_c - const float cont = cont_blob[t]; - const Mat x = input_blob.channel(t); - float* hidden_data = hidden; + const int cont = ((const int*)cont_blob)[t]; + const float* x = input_blob.row(t); for (int q=0; q& bottom_blobs, std::vector& top_bl float G = bias_c_data_ptr[3]; for (int i=0; i& bottom_blobs, std::vector& top_bl // tanh(G) // c_t := f_t .* c_{t-1} + i_t .* g_t // h_t := o_t .* tanh[c_t] - float* cell_data = cell; - Mat output = top_blob.channel(t); - float* output_data = output; + float* output_data = top_blob.row(t); for (int q=0; q& bottom_blobs, std::vector& top_bl float G = gates_data[3]; I = 1.f / (1.f + exp(-I)); - F = cont ? 0.f : 1.f / (1.f + exp(-F)); + F = cont ? 1.f / (1.f + exp(-F)) : 0.f; O = 1.f / (1.f + exp(-O)); G = tanh(G); - float cell = F * cell_data[q] + I * G; - float H = O * tanh(cell); + float cell2 = F * cell[q] + I * G; + float H = O * tanh(cell2); - cell_data[q] = cell; - hidden_data[q] = H; + cell[q] = cell2; + hidden[q] = H; output_data[q] = H; } diff --git a/src/layer/lstm.h b/src/layer/lstm.h index 1a9ec4d04..d975745ac 100644 --- a/src/layer/lstm.h +++ b/src/layer/lstm.h @@ -28,7 +28,7 @@ public: virtual int load_model(const ModelBin& mb); - virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs) const; + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; public: // param diff --git a/src/layer/memorydata.cpp b/src/layer/memorydata.cpp index d7c2cf214..a3b4c6957 100644 --- a/src/layer/memorydata.cpp +++ b/src/layer/memorydata.cpp @@ -57,11 +57,11 @@ int MemoryData::load_model(const ModelBin& mb) return 0; } -int MemoryData::forward(const std::vector& /*bottom_blobs*/, std::vector& top_blobs) const +int MemoryData::forward(const std::vector& /*bottom_blobs*/, std::vector& top_blobs, const Option& opt) const { Mat& top_blob = top_blobs[0]; - top_blob = data.clone(); + top_blob = data.clone(opt.blob_allocator); if 
(top_blob.empty()) return -100; diff --git a/src/layer/memorydata.h b/src/layer/memorydata.h index e91c49da2..3b8081acb 100644 --- a/src/layer/memorydata.h +++ b/src/layer/memorydata.h @@ -28,7 +28,7 @@ public: virtual int load_model(const ModelBin& mb); - virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs) const; + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; public: int w; diff --git a/src/layer/mvn.cpp b/src/layer/mvn.cpp index 825eab9b3..0cec40301 100644 --- a/src/layer/mvn.cpp +++ b/src/layer/mvn.cpp @@ -34,23 +34,24 @@ int MVN::load_param(const ParamDict& pd) return 0; } -int MVN::forward(const Mat& bottom_blob, Mat& top_blob) const +int MVN::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { int w = bottom_blob.w; int h = bottom_blob.h; int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; int size = w * h; - top_blob.create(w, h, channels); + top_blob.create(w, h, channels, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; // prepare sum per channel - Mat sum(channels); + Mat sum(channels, elemsize, opt.workspace_allocator); if (sum.empty()) return -100; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q 0 || hpad > 0) { - copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, pad_value); + copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, pad_value, opt.workspace_allocator, opt.num_threads); if (bottom_blob_bordered.empty()) return -100; } @@ -156,7 +157,7 @@ int Pooling::forward(const Mat& bottom_blob, Mat& top_blob) const int outw = (w - kernel_w) / stride_w + 1; int outh = (h - kernel_h) / stride_h + 1; - top_blob.create(outw, outh, channels); + top_blob.create(outw, outh, channels, elemsize, opt.blob_allocator); if 
(top_blob.empty()) return -100; @@ -183,7 +184,7 @@ int Pooling::forward(const Mat& bottom_blob, Mat& top_blob) const if (pooling_type == PoolMethod_MAX) { - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q 1) { - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int i=0; i& bottom_blobs, std::vector& top_blobs) const +int PriorBox::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { int w = bottom_blobs[0].w; int h = bottom_blobs[0].h; @@ -74,9 +74,9 @@ int PriorBox::forward(const std::vector& bottom_blobs, std::vector& to num_prior += num_min_size * num_aspect_ratio; Mat& top_blob = top_blobs[0]; - top_blob.create(4 * w * h * num_prior, 2); + top_blob.create(4 * w * h * num_prior, 2, 4u, opt.blob_allocator); - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < h; i++) { float* box = (float*)top_blob + i * w * num_prior * 4; diff --git a/src/layer/priorbox.h b/src/layer/priorbox.h index b7f70d9cc..249a65f25 100644 --- a/src/layer/priorbox.h +++ b/src/layer/priorbox.h @@ -26,7 +26,7 @@ public: virtual int load_param(const ParamDict& pd); - virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs) const; + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; public: Mat min_sizes; diff --git a/src/layer/proposal.cpp b/src/layer/proposal.cpp index 03dc679cc..c4437dad7 100644 --- a/src/layer/proposal.cpp +++ b/src/layer/proposal.cpp @@ -195,7 +195,7 @@ static void nms_sorted_bboxes(const std::vector& bboxes, std::vector& } } -int Proposal::forward(const std::vector& bottom_blobs, std::vector& top_blobs) const +int Proposal::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { const Mat& score_blob = bottom_blobs[0]; const Mat& bbox_blob = bottom_blobs[1]; @@ -210,7 +210,7 @@ int 
Proposal::forward(const std::vector& bottom_blobs, std::vector& to Mat proposals; proposals.create(4, w * h, num_anchors); - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q& bottom_blobs, std::vector& to float im_w = im_info_blob[1]; float im_h = im_info_blob[0]; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q& bottom_blobs, std::vector& top_blobs) const; + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; public: // param diff --git a/src/layer/reduction.cpp b/src/layer/reduction.cpp index a2ac15f2b..3994cf82e 100644 --- a/src/layer/reduction.cpp +++ b/src/layer/reduction.cpp @@ -39,7 +39,7 @@ int Reduction::load_param(const ParamDict& pd) } template -static int reduction_op(const Mat& a, Mat& b, float v0, int dim, float coeff) +static int reduction_op(const Mat& a, Mat& b, float v0, int dim, float coeff, const Option& opt) { Op op; Op2 op2; @@ -47,43 +47,44 @@ static int reduction_op(const Mat& a, Mat& b, float v0, int dim, float coeff) int w = a.w; int h = a.h; int channels = a.c; + size_t elemsize = a.elemsize; int size = w * h; if (dim == 0) { // w h c -> X X X - b.create(1); + b.create(1, elemsize, opt.blob_allocator); } else if (dim == 1) { // w h c -> X X c - b.create(channels); + b.create(channels, elemsize, opt.blob_allocator); } else if (dim == 2) { // w h c -> X h c - b.create(h, channels); + b.create(h, channels, elemsize, opt.blob_allocator); } else if (dim == -1) { // w h c -> w X X - b.create(w); + b.create(w, elemsize, opt.blob_allocator); } else if (dim == -2) { // w h c -> w h X - b.create(w, h); + b.create(w, h, elemsize, opt.blob_allocator); } if (b.empty()) return -100; if (dim == 0) { - Mat sums(channels); + Mat sums(channels, elemsize, opt.workspace_allocator); if (sums.empty()) return -100; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q { 
T operator() (const T& x, const T& y) const { return std::min(x, y); } }; -int Reduction::forward(const Mat& bottom_blob, Mat& top_blob) const +int Reduction::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { if (operation == ReductionOp_SUM) - return reduction_op< std::plus, std::plus >(bottom_blob, top_blob, 0.f, dim, coeff); + return reduction_op< std::plus, std::plus >(bottom_blob, top_blob, 0.f, dim, coeff, opt); if (operation == ReductionOp_ASUM) - return reduction_op< reduction_op_asum, std::plus >(bottom_blob, top_blob, 0.f, dim, coeff); + return reduction_op< reduction_op_asum, std::plus >(bottom_blob, top_blob, 0.f, dim, coeff, opt); if (operation == ReductionOp_SUMSQ) - return reduction_op< reduction_op_sumsq, std::plus >(bottom_blob, top_blob, 0.f, dim, coeff); + return reduction_op< reduction_op_sumsq, std::plus >(bottom_blob, top_blob, 0.f, dim, coeff, opt); if (operation == ReductionOp_MEAN) { - int ret = reduction_op< std::plus, std::plus >(bottom_blob, top_blob, 0.f, dim, coeff); + int ret = reduction_op< std::plus, std::plus >(bottom_blob, top_blob, 0.f, dim, coeff, opt); if (ret != 0) return -100; @@ -289,13 +290,13 @@ int Reduction::forward(const Mat& bottom_blob, Mat& top_blob) const } if (operation == ReductionOp_MAX) - return reduction_op< reduction_op_max, reduction_op_max >(bottom_blob, top_blob, -FLT_MAX, dim, coeff); + return reduction_op< reduction_op_max, reduction_op_max >(bottom_blob, top_blob, -FLT_MAX, dim, coeff, opt); if (operation == ReductionOp_MIN) - return reduction_op< reduction_op_min, reduction_op_min >(bottom_blob, top_blob, FLT_MAX, dim, coeff); + return reduction_op< reduction_op_min, reduction_op_min >(bottom_blob, top_blob, FLT_MAX, dim, coeff, opt); if (operation == ReductionOp_PROD) - return reduction_op< std::multiplies, std::multiplies >(bottom_blob, top_blob, 1.f, dim, coeff); + return reduction_op< std::multiplies, std::multiplies >(bottom_blob, top_blob, 1.f, dim, coeff, opt); return 0; } 
diff --git a/src/layer/reduction.h b/src/layer/reduction.h index b48c6fb95..340ed99c6 100644 --- a/src/layer/reduction.h +++ b/src/layer/reduction.h @@ -26,7 +26,7 @@ public: virtual int load_param(const ParamDict& pd); - virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; enum { ReductionOp_SUM = 0, diff --git a/src/layer/relu.cpp b/src/layer/relu.cpp index 3d2b8e8cd..4013699bc 100644 --- a/src/layer/relu.cpp +++ b/src/layer/relu.cpp @@ -31,7 +31,7 @@ int ReLU::load_param(const ParamDict& pd) return 0; } -int ReLU::forward_inplace(Mat& bottom_top_blob) const +int ReLU::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; int h = bottom_top_blob.h; @@ -40,7 +40,7 @@ int ReLU::forward_inplace(Mat& bottom_top_blob) const if (slope == 0.f) { - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q& bottom_blobs, std::vector& top_blobs) const +int RNN::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { // size x 1 x T const Mat& input_blob = bottom_blobs[0]; + size_t elemsize = input_blob.elemsize; // T, 0 or 1 each const Mat& cont_blob = bottom_blobs[1]; @@ -73,13 +74,13 @@ int RNN::forward(const std::vector& bottom_blobs, std::vector& top_blo int size = input_blob.w; // initial hidden state - Mat hidden(num_output); + Mat hidden(num_output, 4u, opt.workspace_allocator); if (hidden.empty()) return -100; hidden.fill(0.f); Mat& top_blob = top_blobs[0]; - top_blob.create(num_output, 1, T); + top_blob.create(num_output, 1, T, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; diff --git a/src/layer/rnn.h b/src/layer/rnn.h index b3e9c982e..80de65bc5 100644 --- a/src/layer/rnn.h +++ b/src/layer/rnn.h @@ -28,7 +28,7 @@ public: virtual int load_model(const ModelBin& mb); - virtual int forward(const std::vector& bottom_blobs, std::vector& 
top_blobs) const; + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; public: // param diff --git a/src/layer/roipooling.cpp b/src/layer/roipooling.cpp index 21f262872..5efc53142 100644 --- a/src/layer/roipooling.cpp +++ b/src/layer/roipooling.cpp @@ -33,17 +33,18 @@ int ROIPooling::load_param(const ParamDict& pd) return 0; } -int ROIPooling::forward(const std::vector& bottom_blobs, std::vector& top_blobs) const +int ROIPooling::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { const Mat& bottom_blob = bottom_blobs[0]; int w = bottom_blob.w; int h = bottom_blob.h; + size_t elemsize = bottom_blob.elemsize; int channels = bottom_blob.c; const Mat& roi_blob = bottom_blobs[1]; Mat& top_blob = top_blobs[0]; - top_blob.create(pooled_width, pooled_height, channels); + top_blob.create(pooled_width, pooled_height, channels, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -61,7 +62,7 @@ int ROIPooling::forward(const std::vector& bottom_blobs, std::vector& float bin_size_w = (float)roi_w / (float)pooled_width; float bin_size_h = (float)roi_h / (float)pooled_height; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q& bottom_blobs, std::vector& top_blobs) const; + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; public: int pooled_width; diff --git a/src/layer/scale.cpp b/src/layer/scale.cpp index cf8d7df73..0bb09e7c5 100644 --- a/src/layer/scale.cpp +++ b/src/layer/scale.cpp @@ -54,7 +54,7 @@ int Scale::load_model(const ModelBin& mb) return 0; } -int Scale::forward_inplace(std::vector& bottom_top_blobs) const +int Scale::forward_inplace(std::vector& bottom_top_blobs, const Option& opt) const { Mat& bottom_top_blob = bottom_top_blobs[0]; const Mat& scale_blob = bottom_top_blobs[1]; @@ -69,7 +69,7 @@ int Scale::forward_inplace(std::vector& bottom_top_blobs) const 
if (bias_term) { - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int i=0; i& bottom_top_blobs) const } else { - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int i=0; i& bottom_top_blobs) const if (bias_term) { - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int i=0; i& bottom_top_blobs) const } else { - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int i=0; i& bottom_top_blobs) const if (bias_term) { - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q& bottom_top_blobs) const } else { - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q& bottom_top_blobs) const return 0; } -int Scale::forward_inplace(Mat& bottom_top_blob) const +int Scale::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { std::vector bottom_top_blobs(2); bottom_top_blobs[0] = bottom_top_blob; bottom_top_blobs[1] = scale_data; - return forward_inplace(bottom_top_blobs); + return forward_inplace(bottom_top_blobs, opt); } } // namespace ncnn diff --git a/src/layer/scale.h b/src/layer/scale.h index cac25cea5..3ca87950f 100644 --- a/src/layer/scale.h +++ b/src/layer/scale.h @@ -28,8 +28,8 @@ public: virtual int load_model(const ModelBin& mb); - virtual int forward_inplace(std::vector& bottom_top_blobs) const; - virtual int forward_inplace(Mat& bottom_top_blob) const; + virtual int forward_inplace(std::vector& bottom_top_blobs, const Option& opt) const; + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; public: // param diff --git a/src/layer/shufflechannel.cpp b/src/layer/shufflechannel.cpp index 6ba0c08fb..f129e9024 100644 --- a/src/layer/shufflechannel.cpp +++ b/src/layer/shufflechannel.cpp @@ -31,7 +31,7 @@ int ShuffleChannel::load_param(const ParamDict& pd) return 0; } -int ShuffleChannel::forward(const Mat& 
bottom_blob, Mat& top_blob) const +int ShuffleChannel::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { int w = bottom_blob.w; int h = bottom_blob.h; @@ -45,7 +45,7 @@ int ShuffleChannel::forward(const Mat& bottom_blob, Mat& top_blob) const return -100; } - top_blob.create(w, h, c, elemsize); + top_blob.create(w, h, c, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; diff --git a/src/layer/shufflechannel.h b/src/layer/shufflechannel.h index bcc3cee44..d180db625 100644 --- a/src/layer/shufflechannel.h +++ b/src/layer/shufflechannel.h @@ -26,7 +26,7 @@ public: virtual int load_param(const ParamDict& pd); - virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; public: int group; diff --git a/src/layer/sigmoid.cpp b/src/layer/sigmoid.cpp index cc62ac75a..6704cea2c 100644 --- a/src/layer/sigmoid.cpp +++ b/src/layer/sigmoid.cpp @@ -25,14 +25,14 @@ Sigmoid::Sigmoid() support_inplace = true; } -int Sigmoid::forward_inplace(Mat& bottom_top_blob) const +int Sigmoid::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; int h = bottom_top_blob.h; int channels = bottom_top_blob.c; int size = w * h; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q& bottom_blobs, std::vector& top_blobs) const +int Slice::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { const Mat& bottom_blob = bottom_blobs[0]; int dims = bottom_blob.dims; @@ -51,7 +51,7 @@ int Slice::forward(const std::vector& bottom_blobs, std::vector& top_b } Mat& top_blob = top_blobs[i]; - top_blob.create(slice, elemsize); + top_blob.create(slice, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -80,7 +80,7 @@ int Slice::forward(const std::vector& bottom_blobs, std::vector& top_b } Mat& top_blob = top_blobs[i]; - top_blob.create(w, slice, 
elemsize); + top_blob.create(w, slice, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -111,11 +111,11 @@ int Slice::forward(const std::vector& bottom_blobs, std::vector& top_b } Mat& top_blob = top_blobs[i]; - top_blob.create(slice, h, elemsize); + top_blob.create(slice, h, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int j=0; j& bottom_blobs, std::vector& top_b } Mat& top_blob = top_blobs[i]; - top_blob.create(w, h, slice, elemsize); + top_blob.create(w, h, slice, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -177,11 +177,11 @@ int Slice::forward(const std::vector& bottom_blobs, std::vector& top_b } Mat& top_blob = top_blobs[i]; - top_blob.create(w, slice, channels, elemsize); + top_blob.create(w, slice, channels, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int p=0; p& bottom_blobs, std::vector& top_b } Mat& top_blob = top_blobs[i]; - top_blob.create(slice, h, channels, elemsize); + top_blob.create(slice, h, channels, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int p=0; p& bottom_blobs, std::vector& top_blobs) const; + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; public: Mat slices; diff --git a/src/layer/softmax.cpp b/src/layer/softmax.cpp index 44ad8dbfe..3ca2d8154 100644 --- a/src/layer/softmax.cpp +++ b/src/layer/softmax.cpp @@ -34,13 +34,14 @@ int Softmax::load_param(const ParamDict& pd) return 0; } -int Softmax::forward_inplace(Mat& bottom_top_blob) const +int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { // value = exp( value - global max value ) // sum all value // value = value / sum int dims = 
bottom_top_blob.dims; + size_t elemsize = bottom_top_blob.elemsize; if (dims == 1) // axis == 0 { @@ -79,7 +80,7 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const int h = bottom_top_blob.h; Mat max; - max.create(w); + max.create(w, elemsize, opt.workspace_allocator); if (max.empty()) return -100; max.fill(-FLT_MAX); @@ -103,7 +104,7 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const } Mat sum; - sum.create(w); + sum.create(w, elemsize, opt.workspace_allocator); if (sum.empty()) return -100; sum.fill(0.f); @@ -135,7 +136,7 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const int h = bottom_top_blob.h; Mat max; - max.create(h); + max.create(h, elemsize, opt.workspace_allocator); if (max.empty()) return -100; @@ -164,7 +165,7 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const } Mat sum; - sum.create(h); + sum.create(h, elemsize, opt.workspace_allocator); if (sum.empty()) return -100; @@ -203,7 +204,7 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const int size = w * h; Mat max; - max.create(w, h); + max.create(w, h, elemsize, opt.workspace_allocator); if (max.empty()) return -100; max.fill(-FLT_MAX); @@ -217,7 +218,7 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const } } - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q& bottom_blobs, std::vector& top_blobs) const +int Split::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& /*opt*/) const { const Mat& bottom_blob = bottom_blobs[0]; for (size_t i=0; i& bottom_blobs, std::vector& top_blobs) const; + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; public: }; diff --git a/src/layer/spp.cpp b/src/layer/spp.cpp index 09fab007a..38c94f27b 100644 --- a/src/layer/spp.cpp +++ b/src/layer/spp.cpp @@ -34,11 +34,13 @@ int SPP::load_param(const ParamDict& pd) return 0; } -int SPP::forward(const Mat& bottom_blob, Mat& top_blob) const +int 
SPP::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { + size_t elemsize = bottom_blob.elemsize; + // 1 + 4 + 16 + 64 + ... + (2*pyramid_height)^2 int pyramid_num_bins = ((1 << (pyramid_height * 2)) - 1) / 3; - top_blob.create(pyramid_num_bins, 1, 2); + top_blob.create(pyramid_num_bins, 1, 2, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -72,7 +74,7 @@ int SPP::forward(const Mat& bottom_blob, Mat& top_blob) const Mat bottom_blob_bordered = bottom_blob; if (pad_h > 0 || pad_w > 0) { - copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f); + copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); if (bottom_blob_bordered.empty()) return -100; @@ -103,7 +105,7 @@ int SPP::forward(const Mat& bottom_blob, Mat& top_blob) const if (pooling_type == PoolMethod_MAX) { - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q= 2 && h == 1) { if (squeeze_w && w == 1) - top_blob = bottom_blob.reshape(channels); + top_blob = bottom_blob.reshape(channels, opt.blob_allocator); else - top_blob = bottom_blob.reshape(w, channels); + top_blob = bottom_blob.reshape(w, channels, opt.blob_allocator); } else if (squeeze_w && dims >= 1 && w == 1) { if (squeeze_h && h == 1) - top_blob = bottom_blob.reshape(channels); + top_blob = bottom_blob.reshape(channels, opt.blob_allocator); else - top_blob = bottom_blob.reshape(h, channels); + top_blob = bottom_blob.reshape(h, channels, opt.blob_allocator); } if (top_blob.empty()) diff --git a/src/layer/squeeze.h b/src/layer/squeeze.h index 1db596da4..cf8a2a164 100644 --- a/src/layer/squeeze.h +++ b/src/layer/squeeze.h @@ -26,7 +26,7 @@ public: virtual int load_param(const ParamDict& pd); - virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& 
opt) const; public: int squeeze_w; diff --git a/src/layer/tanh.cpp b/src/layer/tanh.cpp index 75a47944d..9cf732a19 100644 --- a/src/layer/tanh.cpp +++ b/src/layer/tanh.cpp @@ -25,14 +25,14 @@ TanH::TanH() support_inplace = true; } -int TanH::forward_inplace(Mat& bottom_top_blob) const +int TanH::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; int h = bottom_top_blob.h; int channels = bottom_top_blob.c; int size = w * h; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q -static int unary_op_inplace(Mat& a) +static int unary_op_inplace(Mat& a, const Option& opt) { Op op; int size = a.total(); - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int i=0; i { T operator() (const T& x) const { return 1.f / x; } }; -int UnaryOp::forward_inplace(Mat& bottom_top_blob) const +int UnaryOp::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { if (op_type == Operation_ABS) - return unary_op_inplace< unary_op_abs >(bottom_top_blob); + return unary_op_inplace< unary_op_abs >(bottom_top_blob, opt); if (op_type == Operation_NEG) - return unary_op_inplace< unary_op_neg >(bottom_top_blob); + return unary_op_inplace< unary_op_neg >(bottom_top_blob, opt); if (op_type == Operation_FLOOR) - return unary_op_inplace< unary_op_floor >(bottom_top_blob); + return unary_op_inplace< unary_op_floor >(bottom_top_blob, opt); if (op_type == Operation_CEIL) - return unary_op_inplace< unary_op_ceil >(bottom_top_blob); + return unary_op_inplace< unary_op_ceil >(bottom_top_blob, opt); if (op_type == Operation_SQUARE) - return unary_op_inplace< unary_op_square >(bottom_top_blob); + return unary_op_inplace< unary_op_square >(bottom_top_blob, opt); if (op_type == Operation_SQRT) - return unary_op_inplace< unary_op_sqrt >(bottom_top_blob); + return unary_op_inplace< unary_op_sqrt >(bottom_top_blob, opt); if (op_type == Operation_RSQRT) - return unary_op_inplace< 
unary_op_rsqrt >(bottom_top_blob); + return unary_op_inplace< unary_op_rsqrt >(bottom_top_blob, opt); if (op_type == Operation_EXP) - return unary_op_inplace< unary_op_exp >(bottom_top_blob); + return unary_op_inplace< unary_op_exp >(bottom_top_blob, opt); if (op_type == Operation_LOG) - return unary_op_inplace< unary_op_log >(bottom_top_blob); + return unary_op_inplace< unary_op_log >(bottom_top_blob, opt); if (op_type == Operation_SIN) - return unary_op_inplace< unary_op_sin >(bottom_top_blob); + return unary_op_inplace< unary_op_sin >(bottom_top_blob, opt); if (op_type == Operation_COS) - return unary_op_inplace< unary_op_cos >(bottom_top_blob); + return unary_op_inplace< unary_op_cos >(bottom_top_blob, opt); if (op_type == Operation_TAN) - return unary_op_inplace< unary_op_tan >(bottom_top_blob); + return unary_op_inplace< unary_op_tan >(bottom_top_blob, opt); if (op_type == Operation_ASIN) - return unary_op_inplace< unary_op_asin >(bottom_top_blob); + return unary_op_inplace< unary_op_asin >(bottom_top_blob, opt); if (op_type == Operation_ACOS) - return unary_op_inplace< unary_op_acos >(bottom_top_blob); + return unary_op_inplace< unary_op_acos >(bottom_top_blob, opt); if (op_type == Operation_ATAN) - return unary_op_inplace< unary_op_atan >(bottom_top_blob); + return unary_op_inplace< unary_op_atan >(bottom_top_blob, opt); if (op_type == Operation_RECIPROCAL) - return unary_op_inplace< unary_op_reciprocal >(bottom_top_blob); + return unary_op_inplace< unary_op_reciprocal >(bottom_top_blob, opt); return 0; } diff --git a/src/layer/unaryop.h b/src/layer/unaryop.h index 827784a70..6084e966c 100644 --- a/src/layer/unaryop.h +++ b/src/layer/unaryop.h @@ -26,7 +26,7 @@ public: virtual int load_param(const ParamDict& pd); - virtual int forward_inplace(Mat& bottom_top_blob) const; + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; enum { Operation_ABS = 0, diff --git a/src/layer/x86/convolution_1x1.h b/src/layer/x86/convolution_1x1.h index 
b324740f3..c5db4b17b 100644 --- a/src/layer/x86/convolution_1x1.h +++ b/src/layer/x86/convolution_1x1.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -static void conv1x1s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) +static void conv1x1s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) { int w = bottom_blob.w; int h = bottom_blob.h; @@ -25,7 +25,7 @@ static void conv1x1s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _ker const float* kernel = _kernel; const float* bias = _bias; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int p=0; p 0 || pad_h > 0) { - copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f); + copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); if (bottom_blob_bordered.empty()) return -100; @@ -48,7 +49,7 @@ int Convolution_x86::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv int hpad = kernel_extent + (h - 1) / stride * stride - h; if (wpad > 0 || hpad > 0) { - copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f); + copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); if (bottom_blob_bordered.empty()) return -100; } @@ -60,7 +61,7 @@ int Convolution_x86::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv int outw = (w - kernel_extent) / stride + 1; int outh = (h - kernel_extent) / stride + 1; - top_blob.create(outw, outh, num_output); + top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator); if (top_blob.empty()) return 
-100; @@ -79,7 +80,7 @@ int Convolution_x86::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv if (inner_bottom_blob.w != inner_w || inner_bottom_blob.h != inner_h) { - inner_bottom_blob.create(inner_w, inner_h, bottom_blob.c); + inner_bottom_blob.create(inner_w, inner_h, bottom_blob.c, elemsize, opt.workspace_allocator); if (inner_bottom_blob.empty()) { @@ -89,7 +90,7 @@ int Convolution_x86::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv if (inner_top_blob.w != inner_outw || inner_top_blob.h != inner_outh) { - inner_top_blob.create(inner_outw, inner_outh, num_output); + inner_top_blob.create(inner_outw, inner_outh, num_output, elemsize, opt.workspace_allocator); if (inner_top_blob.empty()) { @@ -97,7 +98,7 @@ int Convolution_x86::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv } } - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int c = 0; c < bottom_blob.c; c ++) { float *outptr = inner_bottom_blob.channel(c); @@ -113,9 +114,9 @@ int Convolution_x86::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv } } - conv(inner_bottom_blob, inner_top_blob, weight_data, bias_data); + conv(inner_bottom_blob, inner_top_blob, weight_data, bias_data, opt); - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int c = 0; c < num_output; c ++) { float *outptr = (float *)top_blob.channel(c) + x * outw + y; @@ -136,19 +137,19 @@ int Convolution_x86::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv } -int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob) const +int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { // convolv with NxN kernel // value = value + bias if (bottom_blob.dims != 3) { - return Convolution::forward(bottom_blob, top_blob); + return Convolution::forward(bottom_blob, top_blob, opt); } if (kernel_w != kernel_h || stride_w != stride_h) { - return Convolution::forward(bottom_blob, 
top_blob); + return Convolution::forward(bottom_blob, top_blob, opt); } const int kernel_size = kernel_w; @@ -156,10 +157,10 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob) const if (kernel_size > 5 || stride > 5 || dilation_w != dilation_h) { - return Convolution::forward(bottom_blob, top_blob); + return Convolution::forward(bottom_blob, top_blob, opt); } - typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&); + typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&); // kernel_size x stride conv_func conv_func_table[5][5] = @@ -204,20 +205,21 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob) const conv_func conv = conv_func_table[kernel_size-1][stride-1]; if (!conv) { - return Convolution::forward(bottom_blob, top_blob); + return Convolution::forward(bottom_blob, top_blob, opt); } if (dilation_w != 1) { - return forwardDilation(bottom_blob, top_blob, conv); + return forwardDilation(bottom_blob, top_blob, conv, opt); } int w = bottom_blob.w; int h = bottom_blob.h; + size_t elemsize = bottom_blob.elemsize; Mat bottom_blob_bordered = bottom_blob; if (pad_w > 0 || pad_h > 0) { - copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f); + copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); if (bottom_blob_bordered.empty()) return -100; @@ -230,7 +232,7 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob) const int hpad = kernel_size + (h - 1) / stride * stride - h; if (wpad > 0 || hpad > 0) { - copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f); + copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); if (bottom_blob_bordered.empty()) return 
-100; } @@ -242,11 +244,11 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob) const int outw = (w - kernel_size) / stride + 1; int outh = (h - kernel_size) / stride + 1; - top_blob.create(outw, outh, num_output); + top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; - conv(bottom_blob_bordered, top_blob, weight_data, bias_data); + conv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); return 0; } diff --git a/src/layer/x86/convolution_x86.h b/src/layer/x86/convolution_x86.h index 1aad94476..e72c14aca 100644 --- a/src/layer/x86/convolution_x86.h +++ b/src/layer/x86/convolution_x86.h @@ -19,13 +19,13 @@ namespace ncnn { -typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&); +typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&); class Convolution_x86 : public Convolution { public: - virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; - virtual int forwardDilation(const Mat& bottom_blob, Mat &top_blob, conv_func conv) const; + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + virtual int forwardDilation(const Mat& bottom_blob, Mat &top_blob, conv_func conv, const Option& opt) const; }; } // namespace ncnn diff --git a/src/layer/x86/convolutiondepthwise_3x3.h b/src/layer/x86/convolutiondepthwise_3x3.h index d14948de3..aa34ca084 100644 --- a/src/layer/x86/convolutiondepthwise_3x3.h +++ b/src/layer/x86/convolutiondepthwise_3x3.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-static void convdw3x3s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) +static void convdw3x3s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) { int w = bottom_blob.w; @@ -24,7 +24,7 @@ static void convdw3x3s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _k const float* kernel = _kernel; const float* bias = _bias; - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int g=0; g 0 || pad_h > 0) { - copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f); + copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); if (bottom_blob_bordered.empty()) return -100; @@ -60,7 +61,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob) con int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h; if (wpad > 0 || hpad > 0) { - copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f); + copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); if (bottom_blob_bordered.empty()) return -100; } @@ -72,7 +73,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob) con int outw = (w - kernel_extent_w) / stride_w + 1; int outh = (h - kernel_extent_h) / stride_h + 1; - top_blob.create(outw, outh, num_output); + top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -85,12 +86,12 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob) con { if (stride_w == 1 && stride_h == 1) { - convdw3x3s1_sse(bottom_blob_bordered, top_blob, weight_data, bias_data); + convdw3x3s1_sse(bottom_blob_bordered, top_blob, weight_data, 
bias_data, opt); return 0; } else if (stride_w == 2 && stride_h == 2) { - convdw3x3s2_sse(bottom_blob_bordered, top_blob, weight_data, bias_data); + convdw3x3s2_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); return 0; } } @@ -100,7 +101,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob) con omp_set_nested(0); #endif - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int g=0; gload_model(ModelBinFromMatArray(weights)); // forward - op->forward(bottom_blob_bordered_g, top_blob_g); + op->forward(bottom_blob_bordered_g, top_blob_g, opt); delete op; } @@ -187,7 +188,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob) con op->load_model(ModelBinFromMatArray(weights)); // forward - op->forward(bottom_blob_bordered_g, top_blob_g); + op->forward(bottom_blob_bordered_g, top_blob_g, opt); delete op; } diff --git a/src/layer/x86/convolutiondepthwise_x86.h b/src/layer/x86/convolutiondepthwise_x86.h index d67283511..82352312e 100644 --- a/src/layer/x86/convolutiondepthwise_x86.h +++ b/src/layer/x86/convolutiondepthwise_x86.h @@ -22,7 +22,7 @@ namespace ncnn { class ConvolutionDepthWise_x86 : public ConvolutionDepthWise { public: - virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; }; } // namespace ncnn diff --git a/src/layer/yolodetectionoutput.cpp b/src/layer/yolodetectionoutput.cpp index ff39e9979..8a810b020 100644 --- a/src/layer/yolodetectionoutput.cpp +++ b/src/layer/yolodetectionoutput.cpp @@ -160,7 +160,7 @@ static inline float sigmoid(float x) return 1.f / (1.f + exp(-x)); } -int YoloDetectionOutput::forward_inplace(Mat& bottom_top_blob) const +int YoloDetectionOutput::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; int h = bottom_top_blob.h; @@ -177,7 +177,7 @@ int YoloDetectionOutput::forward_inplace(Mat& 
bottom_top_blob) const all_box_bbox_rects.resize(num_box); all_box_bbox_scores.resize(num_box); - #pragma omp parallel for + #pragma omp parallel for num_threads(opt.num_threads) for (int pp = 0; pp < num_box; pp++) { int p = pp * channels_per_box; @@ -194,7 +194,7 @@ int YoloDetectionOutput::forward_inplace(Mat& bottom_top_blob) const // softmax class scores Mat scores(w, h, num_class, (void*)((const float*)bottom_top_blob.channel(p+5))); - softmax->forward_inplace(scores); + softmax->forward_inplace(scores, opt); for (int i = 0; i < h; i++) { @@ -281,7 +281,7 @@ int YoloDetectionOutput::forward_inplace(Mat& bottom_top_blob) const // fill result int num_detected = bbox_rects.size(); - bottom_top_blob.create(6, num_detected); + bottom_top_blob.create(6, num_detected, 4u, opt.blob_allocator); if (bottom_top_blob.empty()) return -100; diff --git a/src/layer/yolodetectionoutput.h b/src/layer/yolodetectionoutput.h index b35b8032f..8513ca30d 100644 --- a/src/layer/yolodetectionoutput.h +++ b/src/layer/yolodetectionoutput.h @@ -27,7 +27,7 @@ public: virtual int load_param(const ParamDict& pd); - virtual int forward_inplace(Mat& bottom_top_blob) const; + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; public: int num_class; diff --git a/src/mat.cpp b/src/mat.cpp index 498768735..92c3f12aa 100644 --- a/src/mat.cpp +++ b/src/mat.cpp @@ -499,10 +499,11 @@ static void copy_make_border_image(const Mat& src, Mat& dst, int top, int left, } } -void copy_make_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int type, float v) +void copy_make_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int type, float v, Allocator* allocator, int num_threads) { int w = src.w + left + right; int h = src.h + top + bottom; + size_t elemsize = src.elemsize; if (w == src.w && h == src.h) { @@ -512,7 +513,7 @@ void copy_make_border(const Mat& src, Mat& dst, int top, int bottom, int left, i if (src.dims == 2) { - 
dst.create(w, h); + dst.create(w, h, elemsize, allocator); if (dst.empty()) return; @@ -522,12 +523,12 @@ void copy_make_border(const Mat& src, Mat& dst, int top, int bottom, int left, i { int channels = src.c; - dst.create(w, h, channels); + dst.create(w, h, channels, elemsize, allocator); if (dst.empty()) return; // unroll image channel - #pragma omp parallel for + #pragma omp parallel for num_threads(num_threads) for (int q=0; q #endif +#include "allocator.h" #include "platform.h" namespace ncnn { @@ -31,19 +32,19 @@ public: // empty Mat(); // vec - Mat(int w, size_t elemsize = 4); + Mat(int w, size_t elemsize = 4u, Allocator* allocator = 0); // image - Mat(int w, int h, size_t elemsize = 4); + Mat(int w, int h, size_t elemsize = 4u, Allocator* allocator = 0); // dim - Mat(int w, int h, int c, size_t elemsize = 4); + Mat(int w, int h, int c, size_t elemsize = 4u, Allocator* allocator = 0); // copy Mat(const Mat& m); // external vec - Mat(int w, void* data, size_t elemsize = 4); + Mat(int w, void* data, size_t elemsize = 4u); // external image - Mat(int w, int h, void* data, size_t elemsize = 4); + Mat(int w, int h, void* data, size_t elemsize = 4u); // external dim - Mat(int w, int h, int c, void* data, size_t elemsize = 4); + Mat(int w, int h, int c, void* data, size_t elemsize = 4u); // release ~Mat(); // assign @@ -52,19 +53,19 @@ public: void fill(float v); template void fill(T v); // deep copy - Mat clone() const; + Mat clone(Allocator* allocator = 0) const; // reshape vec - Mat reshape(int w) const; + Mat reshape(int w, Allocator* allocator = 0) const; // reshape image - Mat reshape(int w, int h) const; + Mat reshape(int w, int h, Allocator* allocator = 0) const; // reshape dim - Mat reshape(int w, int h, int c) const; + Mat reshape(int w, int h, int c, Allocator* allocator = 0) const; // allocate vec - void create(int w, size_t elemsize = 4); + void create(int w, size_t elemsize = 4u, Allocator* allocator = 0); // allocate image - void create(int w, int 
h, size_t elemsize = 4); + void create(int w, int h, size_t elemsize = 4u, Allocator* allocator = 0); // allocate dim - void create(int w, int h, int c, size_t elemsize = 4); + void create(int w, int h, int c, size_t elemsize = 4u, Allocator* allocator = 0); // refcount++ void addref(); // refcount-- @@ -115,9 +116,9 @@ public: PIXEL_RGBA2GRAY = PIXEL_RGBA | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT), }; // convenient construct from pixel data - static Mat from_pixels(const unsigned char* pixels, int type, int w, int h); + static Mat from_pixels(const unsigned char* pixels, int type, int w, int h, Allocator* allocator = 0); // convenient construct from pixel data and resize to specific size - static Mat from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int target_width, int target_height); + static Mat from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int target_width, int target_height, Allocator* allocator = 0); // convenient export to pixel data void to_pixels(unsigned char* pixels, int type) const; @@ -145,6 +146,9 @@ public: // 0 = empty size_t elemsize; + // the allocator + Allocator* allocator; + // the dimensionality int dims; @@ -169,100 +173,35 @@ enum BORDER_CONSTANT = 0, BORDER_REPLICATE = 1, }; -void copy_make_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int type, float v); -void copy_cut_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right); -void resize_bilinear(const Mat& src, Mat& dst, int w, int h); - -// the alignment of all the allocated buffers -#define MALLOC_ALIGN 16 - -// Aligns a pointer to the specified number of bytes -// ptr Aligned pointer -// n Alignment size that must be a power of two -template static inline _Tp* alignPtr(_Tp* ptr, int n=(int)sizeof(_Tp)) -{ - return (_Tp*)(((size_t)ptr + n-1) & -n); -} - -// Aligns a buffer size to the specified number of bytes -// The function returns the minimum number that is greater or equal to sz and is 
divisible by n -// sz Buffer size to align -// n Alignment size that must be a power of two -static inline size_t alignSize(size_t sz, int n) -{ - return (sz + n-1) & -n; -} - -static inline void* fastMalloc(size_t size) -{ - unsigned char* udata = (unsigned char*)malloc(size + sizeof(void*) + MALLOC_ALIGN); - if (!udata) - return 0; - unsigned char** adata = alignPtr((unsigned char**)udata + 1, MALLOC_ALIGN); - adata[-1] = udata; - return adata; -} - -static inline void fastFree(void* ptr) -{ - if (ptr) - { - unsigned char* udata = ((unsigned char**)ptr)[-1]; - free(udata); - } -} - -// exchange-add operation for atomic operations on reference counters -#if defined __INTEL_COMPILER && !(defined WIN32 || defined _WIN32) -// atomic increment on the linux version of the Intel(tm) compiler -# define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd(const_cast(reinterpret_cast(addr)), delta) -#elif defined __GNUC__ -# if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && !defined __EMSCRIPTEN__ && !defined(__CUDACC__) -# ifdef __ATOMIC_ACQ_REL -# define NCNN_XADD(addr, delta) __c11_atomic_fetch_add((_Atomic(int)*)(addr), delta, __ATOMIC_ACQ_REL) -# else -# define NCNN_XADD(addr, delta) __atomic_fetch_add((_Atomic(int)*)(addr), delta, 4) -# endif -# else -# if defined __ATOMIC_ACQ_REL && !defined __clang__ -// version for gcc >= 4.7 -# define NCNN_XADD(addr, delta) (int)__atomic_fetch_add((unsigned*)(addr), (unsigned)(delta), __ATOMIC_ACQ_REL) -# else -# define NCNN_XADD(addr, delta) (int)__sync_fetch_and_add((unsigned*)(addr), (unsigned)(delta)) -# endif -# endif -#elif defined _MSC_VER && !defined RC_INVOKED -# include -# define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)addr, delta) -#else -static inline void NCNN_XADD(int* addr, int delta) { int tmp = *addr; *addr += delta; return tmp; } -#endif +void copy_make_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int type, float v, Allocator* 
allocator = 0, int num_threads = 1); +void copy_cut_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, Allocator* allocator = 0, int num_threads = 1); +void resize_bilinear(const Mat& src, Mat& dst, int w, int h, Allocator* allocator = 0, int num_threads = 1); inline Mat::Mat() - : data(0), refcount(0), elemsize(0), dims(0), w(0), h(0), c(0), cstep(0) + : data(0), refcount(0), elemsize(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) { } -inline Mat::Mat(int _w, size_t _elemsize) +inline Mat::Mat(int _w, size_t _elemsize, Allocator* allocator) : data(0), refcount(0), dims(0) { - create(_w, _elemsize); + create(_w, _elemsize, allocator); } -inline Mat::Mat(int _w, int _h, size_t _elemsize) +inline Mat::Mat(int _w, int _h, size_t _elemsize, Allocator* allocator) : data(0), refcount(0), dims(0) { - create(_w, _h, _elemsize); + create(_w, _h, _elemsize, allocator); } -inline Mat::Mat(int _w, int _h, int _c, size_t _elemsize) +inline Mat::Mat(int _w, int _h, int _c, size_t _elemsize, Allocator* allocator) : data(0), refcount(0), dims(0) { - create(_w, _h, _c, _elemsize); + create(_w, _h, _c, _elemsize, allocator); } inline Mat::Mat(const Mat& m) - : data(m.data), refcount(m.refcount), elemsize(m.elemsize), dims(m.dims) + : data(m.data), refcount(m.refcount), elemsize(m.elemsize), allocator(m.allocator), dims(m.dims) { if (refcount) NCNN_XADD(refcount, 1); @@ -275,7 +214,7 @@ inline Mat::Mat(const Mat& m) } inline Mat::Mat(int _w, void* _data, size_t _elemsize) - : data(_data), refcount(0), elemsize(_elemsize), dims(1) + : data(_data), refcount(0), elemsize(_elemsize), allocator(0), dims(1) { w = _w; h = 1; @@ -285,7 +224,7 @@ inline Mat::Mat(int _w, void* _data, size_t _elemsize) } inline Mat::Mat(int _w, int _h, void* _data, size_t _elemsize) - : data(_data), refcount(0), elemsize(_elemsize), dims(2) + : data(_data), refcount(0), elemsize(_elemsize), allocator(0), dims(2) { w = _w; h = _h; @@ -295,7 +234,7 @@ inline Mat::Mat(int _w, int _h, 
void* _data, size_t _elemsize) } inline Mat::Mat(int _w, int _h, int _c, void* _data, size_t _elemsize) - : data(_data), refcount(0), elemsize(_elemsize), dims(3) + : data(_data), refcount(0), elemsize(_elemsize), allocator(0), dims(3) { w = _w; h = _h; @@ -322,6 +261,7 @@ inline Mat& Mat::operator=(const Mat& m) data = m.data; refcount = m.refcount; elemsize = m.elemsize; + allocator = m.allocator; dims = m.dims; w = m.w; @@ -398,18 +338,18 @@ inline void Mat::fill(T _v) } } -inline Mat Mat::clone() const +inline Mat Mat::clone(Allocator* allocator) const { if (empty()) return Mat(); Mat m; if (dims == 1) - m.create(w, elemsize); + m.create(w, elemsize, allocator); else if (dims == 2) - m.create(w, h, elemsize); + m.create(w, h, elemsize, allocator); else if (dims == 3) - m.create(w, h, c, elemsize); + m.create(w, h, c, elemsize, allocator); if (total() > 0) { @@ -419,7 +359,7 @@ inline Mat Mat::clone() const return m; } -inline Mat Mat::reshape(int _w) const +inline Mat Mat::reshape(int _w, Allocator* allocator) const { if (w * h * c != _w) return Mat(); @@ -427,7 +367,7 @@ inline Mat Mat::reshape(int _w) const if (dims == 3 && cstep != (size_t)w * h) { Mat m; - m.create(_w, elemsize); + m.create(_w, elemsize, allocator); // flatten for (int i=0; i 0) { size_t totalsize = total() * elemsize; - data = fastMalloc(totalsize + (int)sizeof(*refcount)); + if (allocator) + data = allocator->fastMalloc(totalsize + (int)sizeof(*refcount)); + else + data = fastMalloc(totalsize + (int)sizeof(*refcount)); refcount = (int*)(((unsigned char*)data) + totalsize); *refcount = 1; } } -inline void Mat::create(int _w, int _h, size_t _elemsize) +inline void Mat::create(int _w, int _h, size_t _elemsize, Allocator* _allocator) { - if (dims == 2 && w == _w && h == _h && elemsize == _elemsize) + if (dims == 2 && w == _w && h == _h && elemsize == _elemsize && allocator == _allocator) return; release(); elemsize = _elemsize; + allocator = _allocator; dims = 2; w = _w; @@ -571,20 +516,24 @@ 
inline void Mat::create(int _w, int _h, size_t _elemsize) if (total() > 0) { size_t totalsize = total() * elemsize; - data = fastMalloc(totalsize + (int)sizeof(*refcount)); + if (allocator) + data = allocator->fastMalloc(totalsize + (int)sizeof(*refcount)); + else + data = fastMalloc(totalsize + (int)sizeof(*refcount)); refcount = (int*)(((unsigned char*)data) + totalsize); *refcount = 1; } } -inline void Mat::create(int _w, int _h, int _c, size_t _elemsize) +inline void Mat::create(int _w, int _h, int _c, size_t _elemsize, Allocator* _allocator) { - if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize) + if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && allocator == _allocator) return; release(); elemsize = _elemsize; + allocator = _allocator; dims = 3; w = _w; @@ -596,7 +545,10 @@ inline void Mat::create(int _w, int _h, int _c, size_t _elemsize) if (total() > 0) { size_t totalsize = total() * elemsize; - data = fastMalloc(totalsize + (int)sizeof(*refcount)); + if (allocator) + data = allocator->fastMalloc(totalsize + (int)sizeof(*refcount)); + else + data = fastMalloc(totalsize + (int)sizeof(*refcount)); refcount = (int*)(((unsigned char*)data) + totalsize); *refcount = 1; } @@ -611,7 +563,12 @@ inline void Mat::addref() inline void Mat::release() { if (refcount && NCNN_XADD(refcount, -1) == 1) - fastFree(data); + { + if (allocator) + allocator->fastFree(data); + else + fastFree(data); + } data = 0; diff --git a/src/mat_pixel.cpp b/src/mat_pixel.cpp index f07f88647..531788276 100644 --- a/src/mat_pixel.cpp +++ b/src/mat_pixel.cpp @@ -24,9 +24,9 @@ namespace ncnn { #if NCNN_PIXEL -static Mat from_rgb(const unsigned char* rgb, int w, int h) +static Mat from_rgb(const unsigned char* rgb, int w, int h, Allocator* allocator) { - Mat m(w, h, 3); + Mat m(w, h, 3, 4u, allocator); if (m.empty()) return m; @@ -155,9 +155,9 @@ static void to_rgb(const Mat& m, unsigned char* rgb) #undef SATURATE_CAST_UCHAR } -static Mat 
from_gray(const unsigned char* gray, int w, int h) +static Mat from_gray(const unsigned char* gray, int w, int h, Allocator* allocator) { - Mat m(w, h, 1); + Mat m(w, h, 1, 4u, allocator); if (m.empty()) return m; @@ -257,9 +257,9 @@ static void to_gray(const Mat& m, unsigned char* gray) #undef SATURATE_CAST_UCHAR } -static Mat from_rgba(const unsigned char* rgba, int w, int h) +static Mat from_rgba(const unsigned char* rgba, int w, int h, Allocator* allocator) { - Mat m(w, h, 4); + Mat m(w, h, 4, 4u, allocator); if (m.empty()) return m; @@ -408,9 +408,9 @@ static void to_rgba(const Mat& m, unsigned char* rgba) #undef SATURATE_CAST_UCHAR } -static Mat from_rgb2bgr(const unsigned char* rgb, int w, int h) +static Mat from_rgb2bgr(const unsigned char* rgb, int w, int h, Allocator* allocator) { - Mat m(w, h, 3); + Mat m(w, h, 3, 4u, allocator); if (m.empty()) return m; @@ -539,7 +539,7 @@ static void to_bgr2rgb(const Mat& m, unsigned char* rgb) #undef SATURATE_CAST_UCHAR } -static Mat from_rgb2gray(const unsigned char* rgb, int w, int h) +static Mat from_rgb2gray(const unsigned char* rgb, int w, int h, Allocator* allocator) { // coeffs for r g b = 0.299f, 0.587f, 0.114f const unsigned char Y_shift = 8;//14 @@ -547,7 +547,7 @@ static Mat from_rgb2gray(const unsigned char* rgb, int w, int h) const unsigned char G2Y = 150; const unsigned char B2Y = 29; - Mat m(w, h, 1); + Mat m(w, h, 1, 4u, allocator); if (m.empty()) return m; @@ -631,7 +631,7 @@ static Mat from_rgb2gray(const unsigned char* rgb, int w, int h) return m; } -static Mat from_bgr2gray(const unsigned char* bgr, int w, int h) +static Mat from_bgr2gray(const unsigned char* bgr, int w, int h, Allocator* allocator) { // coeffs for r g b = 0.299f, 0.587f, 0.114f const unsigned char Y_shift = 8;//14 @@ -639,7 +639,7 @@ static Mat from_bgr2gray(const unsigned char* bgr, int w, int h) const unsigned char G2Y = 150; const unsigned char B2Y = 29; - Mat m(w, h, 1); + Mat m(w, h, 1, 4u, allocator); if (m.empty()) return 
m; @@ -723,9 +723,9 @@ static Mat from_bgr2gray(const unsigned char* bgr, int w, int h) return m; } -static Mat from_gray2rgb(const unsigned char* gray, int w, int h) +static Mat from_gray2rgb(const unsigned char* gray, int w, int h, Allocator* allocator) { - Mat m(w, h, 3); + Mat m(w, h, 3, 4u, allocator); if (m.empty()) return m; @@ -830,9 +830,9 @@ static Mat from_gray2rgb(const unsigned char* gray, int w, int h) return m; } -static Mat from_rgba2rgb(const unsigned char* rgba, int w, int h) +static Mat from_rgba2rgb(const unsigned char* rgba, int w, int h, Allocator* allocator) { - Mat m(w, h, 3); + Mat m(w, h, 3, 4u, allocator); if (m.empty()) return m; @@ -934,9 +934,9 @@ static Mat from_rgba2rgb(const unsigned char* rgba, int w, int h) return m; } -static Mat from_rgba2bgr(const unsigned char* rgba, int w, int h) +static Mat from_rgba2bgr(const unsigned char* rgba, int w, int h, Allocator* allocator) { - Mat m(w, h, 3); + Mat m(w, h, 3, 4u, allocator); if (m.empty()) return m; @@ -1038,7 +1038,7 @@ static Mat from_rgba2bgr(const unsigned char* rgba, int w, int h) return m; } -static Mat from_rgba2gray(const unsigned char* rgba, int w, int h) +static Mat from_rgba2gray(const unsigned char* rgba, int w, int h, Allocator* allocator) { // coeffs for r g b = 0.299f, 0.587f, 0.114f const unsigned char Y_shift = 8;//14 @@ -1046,7 +1046,7 @@ static Mat from_rgba2gray(const unsigned char* rgba, int w, int h) const unsigned char G2Y = 150; const unsigned char B2Y = 29; - Mat m(w, h, 1); + Mat m(w, h, 1, 4u, allocator); if (m.empty()) return m; @@ -1972,47 +1972,47 @@ void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned c delete[] buf; } -Mat Mat::from_pixels(const unsigned char* pixels, int type, int w, int h) +Mat Mat::from_pixels(const unsigned char* pixels, int type, int w, int h, Allocator* allocator) { if (type & PIXEL_CONVERT_MASK) { if (type == PIXEL_RGB2BGR || type == PIXEL_BGR2RGB) - return from_rgb2bgr(pixels, w, h); + return 
from_rgb2bgr(pixels, w, h, allocator); if (type == PIXEL_RGB2GRAY) - return from_rgb2gray(pixels, w, h); + return from_rgb2gray(pixels, w, h, allocator); if (type == PIXEL_BGR2GRAY) - return from_bgr2gray(pixels, w, h); + return from_bgr2gray(pixels, w, h, allocator); if (type == PIXEL_GRAY2RGB || type == PIXEL_GRAY2BGR) - return from_gray2rgb(pixels, w, h); + return from_gray2rgb(pixels, w, h, allocator); if (type == PIXEL_RGBA2RGB) - return from_rgba2rgb(pixels, w, h); + return from_rgba2rgb(pixels, w, h, allocator); if (type == PIXEL_RGBA2BGR) - return from_rgba2bgr(pixels, w, h); + return from_rgba2bgr(pixels, w, h, allocator); if (type == PIXEL_RGBA2GRAY) - return from_rgba2gray(pixels, w, h); + return from_rgba2gray(pixels, w, h, allocator); } else { if (type == PIXEL_RGB || type == PIXEL_BGR) - return from_rgb(pixels, w, h); + return from_rgb(pixels, w, h, allocator); if (type == PIXEL_GRAY) - return from_gray(pixels, w, h); + return from_gray(pixels, w, h, allocator); if (type == PIXEL_RGBA) - return from_rgba(pixels, w, h); + return from_rgba(pixels, w, h, allocator); } return Mat(); } -Mat Mat::from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int target_width, int target_height) +Mat Mat::from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int target_width, int target_height, Allocator* allocator) { if (w == target_width && h == target_height) return Mat::from_pixels(pixels, type, w, h); @@ -2027,7 +2027,7 @@ Mat Mat::from_pixels_resize(const unsigned char* pixels, int type, int w, int h, resize_bilinear_c3(pixels, w, h, dst, target_width, target_height); - m = Mat::from_pixels(dst, type, target_width, target_height); + m = Mat::from_pixels(dst, type, target_width, target_height, allocator); delete[] dst; } @@ -2037,7 +2037,7 @@ Mat Mat::from_pixels_resize(const unsigned char* pixels, int type, int w, int h, resize_bilinear_c1(pixels, w, h, dst, target_width, target_height); - m = Mat::from_pixels(dst, type, 
target_width, target_height); + m = Mat::from_pixels(dst, type, target_width, target_height, allocator); delete[] dst; } @@ -2047,7 +2047,7 @@ Mat Mat::from_pixels_resize(const unsigned char* pixels, int type, int w, int h, resize_bilinear_c4(pixels, w, h, dst, target_width, target_height); - m = Mat::from_pixels(dst, type, target_width, target_height); + m = Mat::from_pixels(dst, type, target_width, target_height, allocator); delete[] dst; } diff --git a/src/net.cpp b/src/net.cpp index 2c812c612..e60efe04a 100644 --- a/src/net.cpp +++ b/src/net.cpp @@ -622,7 +622,7 @@ Layer* Net::create_custom_layer(int index) return layer_creator(); } -int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, bool lightmode) const +int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, Option& opt) const { const Layer* layer = layers[layer_index]; @@ -636,14 +636,14 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, bool lightm if (blob_mats[bottom_blob_index].dims == 0) { - int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, lightmode); + int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, opt); if (ret != 0) return ret; } Mat bottom_blob = blob_mats[bottom_blob_index]; - if (lightmode) + if (opt.lightmode) { // delete after taken in light mode blob_mats[bottom_blob_index].release(); @@ -655,16 +655,16 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, bool lightm } // forward - if (lightmode && layer->support_inplace) + if (opt.lightmode && layer->support_inplace) { Mat& bottom_top_blob = bottom_blob; #if NCNN_BENCHMARK double start = get_current_time(); - int ret = layer->forward_inplace(bottom_top_blob); + int ret = layer->forward_inplace(bottom_top_blob, opt); double end = get_current_time(); benchmark(layer, bottom_top_blob, bottom_top_blob, start, end); #else - int ret = layer->forward_inplace(bottom_top_blob); + int ret = layer->forward_inplace(bottom_top_blob, opt); #endif //
NCNN_BENCHMARK if (ret != 0) return ret; @@ -677,11 +677,11 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, bool lightm Mat top_blob; #if NCNN_BENCHMARK double start = get_current_time(); - int ret = layer->forward(bottom_blob, top_blob); + int ret = layer->forward(bottom_blob, top_blob, opt); double end = get_current_time(); benchmark(layer, bottom_blob, top_blob, start, end); #else - int ret = layer->forward(bottom_blob, top_blob); + int ret = layer->forward(bottom_blob, top_blob, opt); #endif // NCNN_BENCHMARK if (ret != 0) return ret; @@ -702,14 +702,14 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, bool lightm if (blob_mats[bottom_blob_index].dims == 0) { - int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, lightmode); + int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, opt); if (ret != 0) return ret; } bottom_blobs[i] = blob_mats[bottom_blob_index]; - if (lightmode) + if (opt.lightmode) { // delete after taken in light mode blob_mats[bottom_blob_index].release(); @@ -722,16 +722,16 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, bool lightm } // forward - if (lightmode && layer->support_inplace) + if (opt.lightmode && layer->support_inplace) { std::vector<Mat>& bottom_top_blobs = bottom_blobs; #if NCNN_BENCHMARK double start = get_current_time(); - int ret = layer->forward_inplace(bottom_top_blobs); + int ret = layer->forward_inplace(bottom_top_blobs, opt); double end = get_current_time(); benchmark(layer, start, end); #else - int ret = layer->forward_inplace(bottom_top_blobs); + int ret = layer->forward_inplace(bottom_top_blobs, opt); #endif // NCNN_BENCHMARK if (ret != 0) return ret; @@ -750,11 +750,11 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, bool lightm top_blobs.resize(layer->tops.size()); #if NCNN_BENCHMARK double start = get_current_time(); - int ret = layer->forward(bottom_blobs, top_blobs); + int ret = layer->forward(bottom_blobs,
top_blobs, opt); double end = get_current_time(); benchmark(layer, start, end); #else - int ret = layer->forward(bottom_blobs, top_blobs); + int ret = layer->forward(bottom_blobs, top_blobs, opt); #endif // NCNN_BENCHMARK if (ret != 0) return ret; @@ -779,18 +779,27 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, bool lightm Extractor::Extractor(const Net* _net, int blob_count) : net(_net) { blob_mats.resize(blob_count); - lightmode = true; - num_threads = 0; + opt = get_default_option(); } void Extractor::set_light_mode(bool enable) { - lightmode = enable; + opt.lightmode = enable; } -void Extractor::set_num_threads(int _num_threads) +void Extractor::set_num_threads(int num_threads) { - num_threads = _num_threads; + opt.num_threads = num_threads; +} + +void Extractor::set_blob_allocator(Allocator* allocator) +{ + opt.blob_allocator = allocator; +} + +void Extractor::set_workspace_allocator(Allocator* allocator) +{ + opt.workspace_allocator = allocator; } int Extractor::input(int blob_index, const Mat& in) @@ -813,28 +822,7 @@ int Extractor::extract(int blob_index, Mat& feat) if (blob_mats[blob_index].dims == 0) { int layer_index = net->blobs[blob_index].producer; - -#ifdef _OPENMP - int dynamic_current = 0; - int num_threads_current = 1; - if (num_threads) - { - dynamic_current = omp_get_dynamic(); - num_threads_current = omp_get_num_threads(); - omp_set_dynamic(0); - omp_set_num_threads(num_threads); - } -#endif - - ret = net->forward_layer(layer_index, blob_mats, lightmode); - -#ifdef _OPENMP - if (num_threads) - { - omp_set_dynamic(dynamic_current); - omp_set_num_threads(num_threads_current); - } -#endif + ret = net->forward_layer(layer_index, blob_mats, opt); } feat = blob_mats[blob_index]; @@ -865,28 +853,7 @@ int Extractor::extract(const char* blob_name, Mat& feat) if (blob_mats[blob_index].dims == 0) { int layer_index = net->blobs[blob_index].producer; - -#ifdef _OPENMP - int dynamic_current = 0; - int num_threads_current = 1; - if
(num_threads) - { - dynamic_current = omp_get_dynamic(); - num_threads_current = omp_get_num_threads(); - omp_set_dynamic(0); - omp_set_num_threads(num_threads); - } -#endif - - ret = net->forward_layer(layer_index, blob_mats, lightmode); - -#ifdef _OPENMP - if (num_threads) - { - omp_set_dynamic(dynamic_current); - omp_set_num_threads(num_threads_current); - } -#endif + ret = net->forward_layer(layer_index, blob_mats, opt); } feat = blob_mats[blob_index]; diff --git a/src/net.h b/src/net.h index 2bd7976da..99b8d6fa0 100644 --- a/src/net.h +++ b/src/net.h @@ -87,7 +87,7 @@ protected: Layer* create_custom_layer(const char* type); #endif // NCNN_STRING Layer* create_custom_layer(int index); - int forward_layer(int layer_index, std::vector<Mat>& blob_mats, bool lightmode) const; + int forward_layer(int layer_index, std::vector<Mat>& blob_mats, Option& opt) const; protected: std::vector<Blob> blobs; @@ -109,6 +109,12 @@ public: // default count is system depended void set_num_threads(int num_threads); + // set blob memory allocator + void set_blob_allocator(Allocator* allocator); + + // set workspace memory allocator + void set_workspace_allocator(Allocator* allocator); + #if NCNN_STRING // set input by blob name // return 0 if success @@ -134,8 +140,7 @@ protected: private: const Net* net; std::vector<Mat> blob_mats; - bool lightmode; - int num_threads; + Option opt; }; } // namespace ncnn