| @@ -52,6 +52,9 @@ public: | |||
| static int g_loop_count = 4; | |||
| static ncnn::UnlockedPoolAllocator g_blob_pool_allocator; | |||
| static ncnn::PoolAllocator g_workspace_pool_allocator; | |||
| void benchmark(const char* comment, void (*init)(ncnn::Net&), void (*run)(const ncnn::Net&)) | |||
| { | |||
| ncnn::BenchNet net; | |||
| @@ -60,6 +63,9 @@ void benchmark(const char* comment, void (*init)(ncnn::Net&), void (*run)(const | |||
| net.load_model(); | |||
| g_blob_pool_allocator.clear(); | |||
| g_workspace_pool_allocator.clear(); | |||
| // sleep 10 seconds for cooling down SOC :( | |||
| #ifdef _WIN32 | |||
| Sleep(10 * 1000); | |||
| @@ -265,8 +271,6 @@ void mobilenet_yolo_run(const ncnn::Net& net) | |||
| { | |||
| ncnn::Extractor ex = net.create_extractor(); | |||
| // NOTE original model input is 416x416x3 | |||
| // you may change to 300x300x3 for comparison with ssd | |||
| ncnn::Mat in(416, 416, 3); | |||
| ex.input("data", in); | |||
| @@ -295,6 +299,17 @@ int main(int argc, char** argv) | |||
| g_loop_count = loop_count; | |||
| g_blob_pool_allocator.set_size_compare_ratio(0.0f); | |||
| g_workspace_pool_allocator.set_size_compare_ratio(0.5f); | |||
| ncnn::Option opt; | |||
| opt.lightmode = true; | |||
| opt.num_threads = num_threads; | |||
| opt.blob_allocator = &g_blob_pool_allocator; | |||
| opt.workspace_allocator = &g_workspace_pool_allocator; | |||
| ncnn::set_default_option(opt); | |||
| ncnn::set_cpu_powersave(powersave); | |||
| ncnn::set_omp_dynamic(0); | |||
| @@ -8,6 +8,7 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR}) | |||
| include_directories(${CMAKE_CURRENT_SOURCE_DIR}/layer) | |||
| set(ncnn_SRCS | |||
| allocator.cpp | |||
| blob.cpp | |||
| cpu.cpp | |||
| layer.cpp | |||
| @@ -0,0 +1,237 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "allocator.h" | |||
| #include <stdio.h> | |||
| namespace ncnn { | |||
| PoolAllocator::PoolAllocator() | |||
| { | |||
| size_compare_ratio = 192;// 0.75f * 256 | |||
| } | |||
| PoolAllocator::~PoolAllocator() | |||
| { | |||
| clear(); | |||
| if (!payouts.empty()) | |||
| { | |||
| fprintf(stderr, "FATAL ERROR! pool allocator destroyed too early\n"); | |||
| std::list< std::pair<size_t, void*> >::iterator it = payouts.begin(); | |||
| for (; it != payouts.end(); it++) | |||
| { | |||
| void* ptr = it->second; | |||
| fprintf(stderr, "%p still in use\n", ptr); | |||
| } | |||
| } | |||
| } | |||
| void PoolAllocator::clear() | |||
| { | |||
| budgets_lock.lock(); | |||
| std::list< std::pair<size_t, void*> >::iterator it = budgets.begin(); | |||
| for (; it != budgets.end(); it++) | |||
| { | |||
| void* ptr = it->second; | |||
| ncnn::fastFree(ptr); | |||
| } | |||
| budgets.clear(); | |||
| budgets_lock.unlock(); | |||
| } | |||
| void PoolAllocator::set_size_compare_ratio(float scr) | |||
| { | |||
| if (scr < 0.f || scr > 1.f) | |||
| { | |||
| fprintf(stderr, "invalid size compare ratio %f\n", scr); | |||
| return; | |||
| } | |||
| size_compare_ratio = (unsigned int)(scr * 256); | |||
| } | |||
| void* PoolAllocator::fastMalloc(size_t size) | |||
| { | |||
| budgets_lock.lock(); | |||
| // find free budget | |||
| std::list< std::pair<size_t, void*> >::iterator it = budgets.begin(); | |||
| for (; it != budgets.end(); it++) | |||
| { | |||
| size_t bs = it->first; | |||
| // size_compare_ratio ~ 100% | |||
| if (bs >= size && ((bs * size_compare_ratio) >> 8) <= size) | |||
| { | |||
| void* ptr = it->second; | |||
| budgets.erase(it); | |||
| budgets_lock.unlock(); | |||
| payouts_lock.lock(); | |||
| payouts.push_back(std::make_pair(bs, ptr)); | |||
| payouts_lock.unlock(); | |||
| return ptr; | |||
| } | |||
| } | |||
| budgets_lock.unlock(); | |||
| // new | |||
| void* ptr = ncnn::fastMalloc(size); | |||
| payouts_lock.lock(); | |||
| payouts.push_back(std::make_pair(size, ptr)); | |||
| payouts_lock.unlock(); | |||
| return ptr; | |||
| } | |||
| void PoolAllocator::fastFree(void* ptr) | |||
| { | |||
| payouts_lock.lock(); | |||
| // return to budgets | |||
| std::list< std::pair<size_t, void*> >::iterator it = payouts.begin(); | |||
| for (; it != payouts.end(); it++) | |||
| { | |||
| if (it->second == ptr) | |||
| { | |||
| size_t size = it->first; | |||
| payouts.erase(it); | |||
| payouts_lock.unlock(); | |||
| budgets_lock.lock(); | |||
| budgets.push_back(std::make_pair(size, ptr)); | |||
| budgets_lock.unlock(); | |||
| return; | |||
| } | |||
| } | |||
| payouts_lock.unlock(); | |||
| fprintf(stderr, "FATAL ERROR! pool allocator get wild %p\n", ptr); | |||
| ncnn::fastFree(ptr); | |||
| } | |||
| UnlockedPoolAllocator::UnlockedPoolAllocator() | |||
| { | |||
| size_compare_ratio = 192;// 0.75f * 256 | |||
| } | |||
| UnlockedPoolAllocator::~UnlockedPoolAllocator() | |||
| { | |||
| clear(); | |||
| if (!payouts.empty()) | |||
| { | |||
| fprintf(stderr, "FATAL ERROR! unlocked pool allocator destroyed too early\n"); | |||
| std::list< std::pair<size_t, void*> >::iterator it = payouts.begin(); | |||
| for (; it != payouts.end(); it++) | |||
| { | |||
| void* ptr = it->second; | |||
| fprintf(stderr, "%p still in use\n", ptr); | |||
| } | |||
| } | |||
| } | |||
| void UnlockedPoolAllocator::clear() | |||
| { | |||
| std::list< std::pair<size_t, void*> >::iterator it = budgets.begin(); | |||
| for (; it != budgets.end(); it++) | |||
| { | |||
| void* ptr = it->second; | |||
| ncnn::fastFree(ptr); | |||
| } | |||
| budgets.clear(); | |||
| } | |||
| void UnlockedPoolAllocator::set_size_compare_ratio(float scr) | |||
| { | |||
| if (scr < 0.f || scr > 1.f) | |||
| { | |||
| fprintf(stderr, "invalid size compare ratio %f\n", scr); | |||
| return; | |||
| } | |||
| size_compare_ratio = (unsigned int)(scr * 256); | |||
| } | |||
| void* UnlockedPoolAllocator::fastMalloc(size_t size) | |||
| { | |||
| // find free budget | |||
| std::list< std::pair<size_t, void*> >::iterator it = budgets.begin(); | |||
| for (; it != budgets.end(); it++) | |||
| { | |||
| size_t bs = it->first; | |||
| // size_compare_ratio ~ 100% | |||
| if (bs >= size && ((bs * size_compare_ratio) >> 8) <= size) | |||
| { | |||
| void* ptr = it->second; | |||
| budgets.erase(it); | |||
| payouts.push_back(std::make_pair(bs, ptr)); | |||
| return ptr; | |||
| } | |||
| } | |||
| // new | |||
| void* ptr = ncnn::fastMalloc(size); | |||
| payouts.push_back(std::make_pair(size, ptr)); | |||
| return ptr; | |||
| } | |||
| void UnlockedPoolAllocator::fastFree(void* ptr) | |||
| { | |||
| // return to budgets | |||
| std::list< std::pair<size_t, void*> >::iterator it = payouts.begin(); | |||
| for (; it != payouts.end(); it++) | |||
| { | |||
| if (it->second == ptr) | |||
| { | |||
| size_t size = it->first; | |||
| payouts.erase(it); | |||
| budgets.push_back(std::make_pair(size, ptr)); | |||
| return; | |||
| } | |||
| } | |||
| fprintf(stderr, "FATAL ERROR! unlocked pool allocator get wild %p\n", ptr); | |||
| ncnn::fastFree(ptr); | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,175 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef NCNN_ALLOCATOR_H | |||
| #define NCNN_ALLOCATOR_H | |||
| #ifdef _WIN32 | |||
| #define WIN32_LEAN_AND_MEAN | |||
| #include <windows.h> | |||
| #else | |||
| #include <pthread.h> | |||
| #endif | |||
| #include <stdlib.h> | |||
| #include <list> | |||
| namespace ncnn { | |||
| // the alignment of all the allocated buffers | |||
| #define MALLOC_ALIGN 16 | |||
// Rounds a pointer up to the next address that is a multiple of n bytes
// ptr  Pointer to align
// n    Alignment size that must be a power of two (defaults to sizeof(_Tp))
template<typename _Tp> static inline _Tp* alignPtr(_Tp* ptr, int n=(int)sizeof(_Tp))
{
    const size_t address = (size_t)ptr;
    // for a power-of-two n, -n has all bits set except the low log2(n) bits
    const size_t keep_high_bits = (size_t)-n;
    return (_Tp*)((address + n - 1) & keep_high_bits);
}
// Rounds a buffer size up to a multiple of n
// Returns the smallest value that is >= sz and divisible by n
// sz  Buffer size to align
// n   Alignment size that must be a power of two
static inline size_t alignSize(size_t sz, int n)
{
    // for a power-of-two n, -n has all bits set except the low log2(n) bits
    const size_t keep_high_bits = (size_t)-n;
    return (sz + n - 1) & keep_high_bits;
}
| static inline void* fastMalloc(size_t size) | |||
| { | |||
| unsigned char* udata = (unsigned char*)malloc(size + sizeof(void*) + MALLOC_ALIGN); | |||
| if (!udata) | |||
| return 0; | |||
| unsigned char** adata = alignPtr((unsigned char**)udata + 1, MALLOC_ALIGN); | |||
| adata[-1] = udata; | |||
| return adata; | |||
| } | |||
// Releases a buffer: reads the pointer stored in the slot immediately before
// ptr and passes it to free(). A null ptr is ignored.
static inline void fastFree(void* ptr)
{
    if (!ptr)
        return;

    unsigned char** slots = (unsigned char**)ptr;
    free(slots[-1]);
}
// exchange-add operation for atomic operations on reference counters
// NCNN_XADD(addr, delta) adds delta to *addr and evaluates to the value
// *addr held before the addition
#if defined __INTEL_COMPILER && !(defined WIN32 || defined _WIN32)
// atomic increment on the linux version of the Intel(tm) compiler
#  define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd(const_cast<void*>(reinterpret_cast<volatile void*>(addr)), delta)
#elif defined __GNUC__
#  if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && !defined __EMSCRIPTEN__ && !defined(__CUDACC__)
#    ifdef __ATOMIC_ACQ_REL
#      define NCNN_XADD(addr, delta) __c11_atomic_fetch_add((_Atomic(int)*)(addr), delta, __ATOMIC_ACQ_REL)
#    else
#      define NCNN_XADD(addr, delta) __atomic_fetch_add((_Atomic(int)*)(addr), delta, 4)
#    endif
#  else
#    if defined __ATOMIC_ACQ_REL && !defined __clang__
// version for gcc >= 4.7
#      define NCNN_XADD(addr, delta) (int)__atomic_fetch_add((unsigned*)(addr), (unsigned)(delta), __ATOMIC_ACQ_REL)
#    else
#      define NCNN_XADD(addr, delta) (int)__sync_fetch_and_add((unsigned*)(addr), (unsigned)(delta))
#    endif
#  endif
#elif defined _MSC_VER && !defined RC_INVOKED
#  include <intrin.h>
#  define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)addr, delta)
#else
// fallback for unknown compilers: a plain read-modify-write, NOT atomic
// returns the previous value like the intrinsic versions above
static inline int NCNN_XADD(int* addr, int delta) { int tmp = *addr; *addr += delta; return tmp; }
#endif
// Minimal mutual-exclusion lock with explicit lock() / unlock().
// Backed by an SRWLOCK on Windows and a pthread mutex elsewhere.
#ifdef _WIN32
class Mutex
{
public:
    Mutex() { InitializeSRWLock(&srwlock); }
    ~Mutex() { }
    void lock() { AcquireSRWLockExclusive(&srwlock); }
    void unlock() { ReleaseSRWLockExclusive(&srwlock); }
private:
    // NOTE SRWLock is available from windows vista
    // the member must not be named "lock": a data member and a member
    // function cannot share a name, so that spelling fails to compile
    SRWLOCK srwlock;
};
#else // _WIN32
class Mutex
{
public:
    Mutex() { pthread_mutex_init(&mutex, 0); }
    ~Mutex() { pthread_mutex_destroy(&mutex); }
    void lock() { pthread_mutex_lock(&mutex); }
    void unlock() { pthread_mutex_unlock(&mutex); }
private:
    pthread_mutex_t mutex;
};
#endif // _WIN32
// Abstract interface for buffer allocation.
class Allocator
{
public:
    // virtual so that an allocator destroyed through an Allocator* runs the
    // destructor of its concrete type
    virtual ~Allocator() {}

    // returns a buffer of at least size bytes
    virtual void* fastMalloc(size_t size) = 0;
    // gives back a buffer previously returned by fastMalloc
    virtual void fastFree(void* ptr) = 0;
};
| class PoolAllocator : public Allocator | |||
| { | |||
| public: | |||
| PoolAllocator(); | |||
| ~PoolAllocator(); | |||
| // ratio range 0 ~ 1 | |||
| // default cr = 0.75 | |||
| void set_size_compare_ratio(float scr); | |||
| // release all budgets immediately | |||
| void clear(); | |||
| virtual void* fastMalloc(size_t size); | |||
| virtual void fastFree(void* ptr); | |||
| private: | |||
| Mutex budgets_lock; | |||
| Mutex payouts_lock; | |||
| unsigned int size_compare_ratio;// 0~256 | |||
| std::list< std::pair<size_t, void*> > budgets; | |||
| std::list< std::pair<size_t, void*> > payouts; | |||
| }; | |||
| class UnlockedPoolAllocator : public Allocator | |||
| { | |||
| public: | |||
| UnlockedPoolAllocator(); | |||
| ~UnlockedPoolAllocator(); | |||
| // ratio range 0 ~ 1 | |||
| // default cr = 0.75 | |||
| void set_size_compare_ratio(float scr); | |||
| // release all budgets immediately | |||
| void clear(); | |||
| virtual void* fastMalloc(size_t size); | |||
| virtual void fastFree(void* ptr); | |||
| private: | |||
| unsigned int size_compare_ratio;// 0~256 | |||
| std::list< std::pair<size_t, void*> > budgets; | |||
| std::list< std::pair<size_t, void*> > payouts; | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // NCNN_ALLOCATOR_H | |||
| @@ -14,10 +14,40 @@ | |||
| #include "layer.h" | |||
| #include <stdio.h> | |||
| #include <string.h> | |||
| #include "cpu.h" | |||
| namespace ncnn { | |||
| Option::Option() | |||
| { | |||
| lightmode = true; | |||
| num_threads = get_cpu_count(); | |||
| blob_allocator = 0; | |||
| workspace_allocator = 0; | |||
| } | |||
| static Option g_default_option; | |||
| const Option& get_default_option() | |||
| { | |||
| return g_default_option; | |||
| } | |||
| int set_default_option(const Option& opt) | |||
| { | |||
| if (opt.num_threads <= 0) | |||
| { | |||
| fprintf(stderr, "invalid option num_threads %d\n", opt.num_threads); | |||
| return -1; | |||
| } | |||
| g_default_option = opt; | |||
| return 0; | |||
| } | |||
| Layer::Layer() | |||
| { | |||
| one_blob_only = false; | |||
| @@ -38,7 +68,7 @@ int Layer::load_model(const ModelBin& /*mb*/) | |||
| return 0; | |||
| } | |||
| int Layer::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const | |||
| int Layer::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const | |||
| { | |||
| if (!support_inplace) | |||
| return -1; | |||
| @@ -46,32 +76,32 @@ int Layer::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_b | |||
| top_blobs = bottom_blobs; | |||
| for (int i = 0; i < (int)top_blobs.size(); i++) | |||
| { | |||
| top_blobs[i] = bottom_blobs[i].clone(); | |||
| top_blobs[i] = bottom_blobs[i].clone(opt.blob_allocator); | |||
| if (top_blobs[i].empty()) | |||
| return -100; | |||
| } | |||
| return forward_inplace(top_blobs); | |||
| return forward_inplace(top_blobs, opt); | |||
| } | |||
| int Layer::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int Layer::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| if (!support_inplace) | |||
| return -1; | |||
| top_blob = bottom_blob.clone(); | |||
| top_blob = bottom_blob.clone(opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| return forward_inplace(top_blob); | |||
| return forward_inplace(top_blob, opt); | |||
| } | |||
| int Layer::forward_inplace(std::vector<Mat>& /*bottom_top_blobs*/) const | |||
| int Layer::forward_inplace(std::vector<Mat>& /*bottom_top_blobs*/, const Option& /*opt*/) const | |||
| { | |||
| return -1; | |||
| } | |||
| int Layer::forward_inplace(Mat& /*bottom_top_blob*/) const | |||
| int Layer::forward_inplace(Mat& /*bottom_top_blob*/, const Option& /*opt*/) const | |||
| { | |||
| return -1; | |||
| } | |||
| @@ -25,6 +25,22 @@ | |||
| namespace ncnn { | |||
| class Allocator; | |||
| class Option | |||
| { | |||
| public: | |||
| Option(); | |||
| public: | |||
| bool lightmode; | |||
| int num_threads; | |||
| Allocator* blob_allocator; | |||
| Allocator* workspace_allocator; | |||
| }; | |||
| const Option& get_default_option(); | |||
| int set_default_option(const Option& opt); | |||
| class Layer | |||
| { | |||
| public: | |||
| @@ -51,13 +67,13 @@ public: | |||
| public: | |||
| // implement inference | |||
| // return 0 if success | |||
| virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const; | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
| virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt = get_default_option()) const; | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt = get_default_option()) const; | |||
| // implement inplace inference | |||
| // return 0 if success | |||
| virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt = get_default_option()) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt = get_default_option()) const; | |||
| public: | |||
| #if NCNN_STRING | |||
| @@ -24,14 +24,14 @@ AbsVal::AbsVal() | |||
| support_inplace = true; | |||
| } | |||
| int AbsVal::forward_inplace(Mat& bottom_top_blob) const | |||
| int AbsVal::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| int channels = bottom_top_blob.c; | |||
| int size = w * h; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| @@ -24,7 +24,7 @@ class AbsVal : public Layer | |||
| public: | |||
| AbsVal(); | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; | |||
| public: | |||
| }; | |||
| @@ -33,14 +33,14 @@ int ArgMax::load_param(const ParamDict& pd) | |||
| return 0; | |||
| } | |||
| int ArgMax::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int ArgMax::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| int size = bottom_blob.total(); | |||
| if (out_max_val) | |||
| top_blob.create(topk, 2); | |||
| top_blob.create(topk, 2, 4u, opt.blob_allocator); | |||
| else | |||
| top_blob.create(topk, 1); | |||
| top_blob.create(topk, 1, 4u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -26,7 +26,7 @@ public: | |||
| virtual int load_param(const ParamDict& pd); | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| public: | |||
| int out_max_val; | |||
| @@ -22,14 +22,14 @@ namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(AbsVal_arm) | |||
| int AbsVal_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| int AbsVal_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| int channels = bottom_top_blob.c; | |||
| int size = w * h; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| @@ -22,7 +22,7 @@ namespace ncnn { | |||
| class AbsVal_arm : public AbsVal | |||
| { | |||
| public: | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -22,11 +22,11 @@ namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(BatchNorm_arm) | |||
| int BatchNorm_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| int BatchNorm_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| int dims = bottom_top_blob.dims; | |||
| if (dims != 3) | |||
| return BatchNorm::forward_inplace(bottom_top_blob); | |||
| return BatchNorm::forward_inplace(bottom_top_blob, opt); | |||
| // a = bias - slope * mean / sqrt(var) | |||
| // b = slope / sqrt(var) | |||
| @@ -38,7 +38,7 @@ int BatchNorm_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| const float* a_data_ptr = a_data; | |||
| const float* b_data_ptr = b_data; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| @@ -22,7 +22,7 @@ namespace ncnn { | |||
| class BatchNorm_arm : public BatchNorm | |||
| { | |||
| public: | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -22,7 +22,7 @@ namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(Bias_arm) | |||
| int Bias_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| int Bias_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| @@ -30,7 +30,7 @@ int Bias_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| int size = w * h; | |||
| const float* bias_ptr = bias_data; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| @@ -22,7 +22,7 @@ namespace ncnn { | |||
| class Bias_arm : public Bias | |||
| { | |||
| public: | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -113,7 +113,7 @@ static void conv1x1s1_sgemm_transform_kernel_neon(const Mat& _kernel, Mat& kerne | |||
| } | |||
| } | |||
| static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias) | |||
| static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| @@ -128,12 +128,12 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma | |||
| const float* bias = _bias; | |||
| // interleave | |||
| Mat tmp(8*4, inch/4+inch%4, size/8 + (size%8)/4 + size%4); | |||
| Mat tmp(8*4, inch/4+inch%4, size/8 + (size%8)/4 + size%4, 4u, opt.workspace_allocator); | |||
| { | |||
| int nn_size = size >> 3; | |||
| int remain_size_start = nn_size << 3; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int ii=0; ii<nn_size; ii++) | |||
| { | |||
| int i = ii * 8; | |||
| @@ -184,7 +184,7 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma | |||
| nn_size = (size - remain_size_start) >> 2; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int ii=0; ii<nn_size; ii++) | |||
| { | |||
| int i = remain_size_start + ii * 4; | |||
| @@ -230,7 +230,7 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma | |||
| remain_size_start += nn_size << 2; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int i=remain_size_start; i<size; i++) | |||
| { | |||
| const float* img0 = bottom_blob.channel(0); | |||
| @@ -254,7 +254,7 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma | |||
| nn_outch = outch >> 3; | |||
| remain_outch_start = nn_outch << 3; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int pp=0; pp<nn_outch; pp++) | |||
| { | |||
| int p = pp * 8; | |||
| @@ -733,7 +733,7 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma | |||
| nn_outch = (outch - remain_outch_start) >> 2; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int pp=0; pp<nn_outch; pp++) | |||
| { | |||
| int p = remain_outch_start + pp * 4; | |||
| @@ -1613,7 +1613,7 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma | |||
| remain_outch_start += nn_outch << 2; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=remain_outch_start; p<outch; p++) | |||
| { | |||
| Mat out0 = top_blob.channel(p); | |||
| @@ -2064,7 +2064,7 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma | |||
| // } | |||
| } | |||
| static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) | |||
| static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) | |||
| { | |||
| int inch = bottom_blob.c; | |||
| @@ -2083,7 +2083,7 @@ static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| nn_outch = outch >> 3; | |||
| remain_outch_start = nn_outch << 3; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int pp=0; pp<nn_outch; pp++) | |||
| { | |||
| int p = pp * 8; | |||
| @@ -2710,7 +2710,7 @@ static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| nn_outch = outch / 6; | |||
| remain_outch_start = nn_outch * 6; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int pp=0; pp<nn_outch; pp++) | |||
| { | |||
| int p = pp * 6; | |||
| @@ -3101,7 +3101,7 @@ static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| nn_outch = (outch - remain_outch_start) >> 2; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int pp=0; pp<nn_outch; pp++) | |||
| { | |||
| int p = remain_outch_start + pp * 4; | |||
| @@ -3605,7 +3605,7 @@ static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| remain_outch_start += nn_outch << 2; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=remain_outch_start; p<outch; p++) | |||
| { | |||
| Mat out = top_blob.channel(p); | |||
| @@ -3863,7 +3863,7 @@ static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| } | |||
| static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) | |||
| static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int inch = bottom_blob.c; | |||
| @@ -3880,7 +3880,7 @@ static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| int nn_outch = outch >> 2; | |||
| int remain_outch_start = nn_outch << 2; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int pp=0; pp<nn_outch; pp++) | |||
| { | |||
| int p = pp * 4; | |||
| @@ -4409,7 +4409,7 @@ static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| } | |||
| } | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=remain_outch_start; p<outch; p++) | |||
| { | |||
| Mat out = top_blob.channel(p); | |||
| @@ -16,7 +16,7 @@ | |||
| #include <arm_neon.h> | |||
| #endif // __ARM_NEON | |||
| static void conv2x2s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) | |||
| static void conv2x2s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int inch = bottom_blob.c; | |||
| @@ -28,7 +28,7 @@ static void conv2x2s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| const float* kernel = _kernel; | |||
| const float* bias = _bias; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=0; p<outch; p++) | |||
| { | |||
| Mat out = top_blob.channel(p); | |||
| @@ -16,7 +16,7 @@ | |||
| #include <arm_neon.h> | |||
| #endif // __ARM_NEON | |||
| static void conv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) | |||
| static void conv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int inch = bottom_blob.c; | |||
| @@ -31,7 +31,7 @@ static void conv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| int nn_outch = outch >> 1; | |||
| int remain_outch_start = nn_outch << 1; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int pp=0; pp<nn_outch; pp++) | |||
| { | |||
| int p = pp * 2; | |||
| @@ -654,7 +654,7 @@ static void conv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| } | |||
| } | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=remain_outch_start; p<outch; p++) | |||
| { | |||
| Mat out = top_blob.channel(p); | |||
| @@ -5427,7 +5427,7 @@ static void conv3x3s1_winograd64_neon3(const Mat& bottom_blob, Mat& top_blob, co | |||
| } | |||
| #endif | |||
| static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias) | |||
| static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| @@ -5445,7 +5445,7 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co | |||
| w = outw + 2; | |||
| h = outh + 2; | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f); | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt.workspace_allocator, opt.num_threads); | |||
| const float* bias = _bias; | |||
| @@ -5454,7 +5454,7 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co | |||
| { | |||
| int w_tm = outw / 6 * 8; | |||
| int h_tm = outh / 6 * 8; | |||
| bottom_blob_tm.create(4, 16 * w_tm/8 * h_tm/8, inch); | |||
| bottom_blob_tm.create(4, 16 * w_tm/8 * h_tm/8, inch, 4u, opt.workspace_allocator); | |||
| const int tiles = w_tm/8 * h_tm/8; | |||
| // const float itm[8][8] = { | |||
| @@ -5495,7 +5495,7 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co | |||
| float32x4_t _coeff1 = vld1q_f32(coeff+4); | |||
| #endif // __ARM_NEON | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q<inch; q++) | |||
| { | |||
| const Mat img0 = bottom_blob_bordered.channel(q); | |||
| @@ -6263,14 +6263,14 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co | |||
| { | |||
| int w_tm = outw / 6 * 8; | |||
| int h_tm = outh / 6 * 8; | |||
| top_blob_tm.create(4, 16 * w_tm/8 * h_tm/8, outch); | |||
| top_blob_tm.create(4, 16 * w_tm/8 * h_tm/8, outch, 4u, opt.workspace_allocator); | |||
| const int tiles = h_tm/8 * w_tm/8; | |||
| int nn_outch = outch >> 2; | |||
| int remain_outch_start = nn_outch << 2; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int pp=0; pp<nn_outch; pp++) | |||
| { | |||
| int p = pp * 4; | |||
| @@ -7439,7 +7439,7 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co | |||
| } | |||
| } | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = remain_outch_start; p<outch; p++) | |||
| { | |||
| Mat out0_tm = top_blob_tm.channel(p); | |||
| @@ -7526,7 +7526,7 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co | |||
| // BEGIN transform output | |||
| Mat top_blob_bordered; | |||
| top_blob_bordered.create(outw, outh, outch); | |||
| top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator); | |||
| { | |||
| // const float otm[6][8] = { | |||
| // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 32.0f, 32.0f, 0.0f}, | |||
| @@ -7553,7 +7553,7 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co | |||
| int h_tm = outh / 6 * 8; | |||
| const int tiles = w_tm/8 * h_tm/8; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p<outch; p++) | |||
| { | |||
| const Mat out0_tm = top_blob_tm.channel(p); | |||
| @@ -8157,10 +8157,10 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co | |||
| // END transform output | |||
| // cut result pad | |||
| copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w); | |||
| copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt.blob_allocator, opt.num_threads); | |||
| } | |||
| static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias) | |||
| static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| @@ -8178,7 +8178,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co | |||
| w = outw + 2; | |||
| h = outh + 2; | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f); | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt.workspace_allocator, opt.num_threads); | |||
| const float* bias = _bias; | |||
| @@ -8188,7 +8188,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co | |||
| int w_tm = outw / 6 * 8; | |||
| int h_tm = outh / 6 * 8; | |||
| const int tiles = w_tm/8 * h_tm/8; | |||
| bottom_blob_tm.create(1, 64 * tiles, inch); | |||
| bottom_blob_tm.create(1, 64 * tiles, inch, 4u, opt.workspace_allocator); | |||
| // bottom_blob_tm.create(inch, tiles, 64); | |||
| // const float itm[8][8] = { | |||
| @@ -8229,7 +8229,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co | |||
| float32x4_t _coeff1 = vld1q_f32(coeff+4); | |||
| #endif // __ARM_NEON | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q<inch; q++) | |||
| { | |||
| const Mat img0 = bottom_blob_bordered.channel(q); | |||
| @@ -9054,9 +9054,9 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co | |||
| // permute | |||
| // bottom_blob_tm.create(1, 64 * tiles, inch); | |||
| // Mat bottom_blob_tm2(inch, tiles, 64); | |||
| Mat bottom_blob_tm2(8*inch, tiles/8 + (tiles%8)/4 + tiles%4, 64); | |||
| Mat bottom_blob_tm2(8*inch, tiles/8 + (tiles%8)/4 + tiles%4, 64, 4u, opt.workspace_allocator); | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int r=0; r<64; r++) | |||
| { | |||
| Mat tm2 = bottom_blob_tm2.channel(r); | |||
| @@ -9147,7 +9147,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co | |||
| nn_outch = outch >> 3; | |||
| remain_outch_start = nn_outch << 3; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int pp=0; pp<nn_outch; pp++) | |||
| { | |||
| int p = pp * 8; | |||
| @@ -9592,7 +9592,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co | |||
| nn_outch = (outch - remain_outch_start) >> 2; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int pp=0; pp<nn_outch; pp++) | |||
| { | |||
| int p = remain_outch_start + pp * 4; | |||
| @@ -10332,6 +10332,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co | |||
| remain_outch_start += nn_outch << 2; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=remain_outch_start; p<outch; p++) | |||
| { | |||
| #if __ARM_NEON && __aarch64__ | |||
| @@ -10738,7 +10739,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co | |||
| // BEGIN transform output | |||
| Mat top_blob_bordered; | |||
| top_blob_bordered.create(outw, outh, outch); | |||
| top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator); | |||
| { | |||
| // const float otm[6][8] = { | |||
| // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 32.0f, 32.0f, 0.0f}, | |||
| @@ -10765,7 +10766,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co | |||
| int h_tm = outh / 6 * 8; | |||
| const int tiles = w_tm/8 * h_tm/8; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p<outch; p++) | |||
| { | |||
| const Mat out0_tm = top_blob_tm.channel(p); | |||
| @@ -11514,10 +11515,10 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co | |||
| // END transform output | |||
| // cut result pad | |||
| copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w); | |||
| copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt.blob_allocator, opt.num_threads); | |||
| } | |||
| static void conv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) | |||
| static void conv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int inch = bottom_blob.c; | |||
| @@ -11534,7 +11535,7 @@ static void conv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| int nn_outch = outch >> 1; | |||
| int remain_outch_start = nn_outch << 1; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int pp=0; pp<nn_outch; pp++) | |||
| { | |||
| int p = pp * 2; | |||
| @@ -11858,7 +11859,7 @@ static void conv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| } | |||
| } | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=remain_outch_start; p<outch; p++) | |||
| { | |||
| Mat out = top_blob.channel(p); | |||
| @@ -16,7 +16,7 @@ | |||
| #include <arm_neon.h> | |||
| #endif // __ARM_NEON | |||
| static void conv4x4s4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) | |||
| static void conv4x4s4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int inch = bottom_blob.c; | |||
| @@ -30,7 +30,7 @@ static void conv4x4s4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| const float* kernel = _kernel; | |||
| const float* bias = _bias; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=0; p<outch; p++) | |||
| { | |||
| Mat out = top_blob.channel(p); | |||
| @@ -16,7 +16,7 @@ | |||
| #include <arm_neon.h> | |||
| #endif // __ARM_NEON | |||
| static void conv5x5s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) | |||
| static void conv5x5s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int inch = bottom_blob.c; | |||
| @@ -28,7 +28,7 @@ static void conv5x5s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| const float* kernel = _kernel; | |||
| const float* bias = _bias; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=0; p<outch; p++) | |||
| { | |||
| Mat out = top_blob.channel(p); | |||
| @@ -982,7 +982,7 @@ static void conv5x5s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| } | |||
| static void conv5x5s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) | |||
| static void conv5x5s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int inch = bottom_blob.c; | |||
| @@ -996,7 +996,7 @@ static void conv5x5s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| const float* kernel = _kernel; | |||
| const float* bias = _bias; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=0; p<outch; p++) | |||
| { | |||
| Mat out = top_blob.channel(p); | |||
| @@ -16,7 +16,7 @@ | |||
| #include <arm_neon.h> | |||
| #endif // __ARM_NEON | |||
| static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) | |||
| static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int inch = bottom_blob.c; | |||
| @@ -28,7 +28,7 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| const float* kernel = _kernel; | |||
| const float* bias = _bias; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=0; p<outch; p++) | |||
| { | |||
| Mat out = top_blob.channel(p); | |||
| @@ -706,7 +706,7 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| } | |||
| static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) | |||
| static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int inch = bottom_blob.c; | |||
| @@ -720,7 +720,7 @@ static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| const float* kernel = _kernel; | |||
| const float* bias = _bias; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=0; p<outch; p++) | |||
| { | |||
| Mat out = top_blob.channel(p); | |||
| @@ -75,10 +75,11 @@ int Convolution_arm::load_model(const ModelBin& mb) | |||
| return 0; | |||
| } | |||
| int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv_func conv) const | |||
| int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv_func conv, const Option& opt) const | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| const int kernel_size = kernel_w; | |||
| const int stride = stride_w; | |||
| @@ -88,7 +89,7 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv | |||
| Mat bottom_blob_bordered = bottom_blob; | |||
| if (pad_w > 0 || pad_h > 0) | |||
| { | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f); | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); | |||
| if (bottom_blob_bordered.empty()) | |||
| return -100; | |||
| @@ -101,7 +102,7 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv | |||
| int hpad = kernel_extent + (h - 1) / stride * stride - h; | |||
| if (wpad > 0 || hpad > 0) | |||
| { | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f); | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); | |||
| if (bottom_blob_bordered.empty()) | |||
| return -100; | |||
| } | |||
| @@ -113,7 +114,7 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv | |||
| int outw = (w - kernel_extent) / stride + 1; | |||
| int outh = (h - kernel_extent) / stride + 1; | |||
| top_blob.create(outw, outh, num_output); | |||
| top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -132,7 +133,7 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv | |||
| if (inner_bottom_blob.w != inner_w || inner_bottom_blob.h != inner_h) | |||
| { | |||
| inner_bottom_blob.create(inner_w, inner_h, bottom_blob.c); | |||
| inner_bottom_blob.create(inner_w, inner_h, bottom_blob.c, elemsize, opt.workspace_allocator); | |||
| if (inner_bottom_blob.empty()) | |||
| { | |||
| @@ -142,7 +143,7 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv | |||
| if (inner_top_blob.w != inner_outw || inner_top_blob.h != inner_outh) | |||
| { | |||
| inner_top_blob.create(inner_outw, inner_outh, num_output); | |||
| inner_top_blob.create(inner_outw, inner_outh, num_output, elemsize, opt.workspace_allocator); | |||
| if (inner_top_blob.empty()) | |||
| { | |||
| @@ -150,7 +151,7 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv | |||
| } | |||
| } | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int c = 0; c < bottom_blob.c; c ++) | |||
| { | |||
| float *outptr = (float *) inner_bottom_blob.channel(c); | |||
| @@ -166,9 +167,9 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv | |||
| } | |||
| } | |||
| conv(inner_bottom_blob, inner_top_blob, weight_data, bias_data); | |||
| conv(inner_bottom_blob, inner_top_blob, weight_data, bias_data, opt); | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int c = 0; c < num_output; c ++) | |||
| { | |||
| float *outptr = (float *) top_blob.channel(c) + x * outw + y; | |||
| @@ -188,19 +189,19 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv | |||
| return 0; | |||
| } | |||
| int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| // convolv with NxN kernel | |||
| // value = value + bias | |||
| if (bottom_blob.dims != 3) | |||
| { | |||
| return Convolution::forward(bottom_blob, top_blob); | |||
| return Convolution::forward(bottom_blob, top_blob, opt); | |||
| } | |||
| if (kernel_w != kernel_h || stride_w != stride_h) | |||
| { | |||
| return Convolution::forward(bottom_blob, top_blob); | |||
| return Convolution::forward(bottom_blob, top_blob, opt); | |||
| } | |||
| const int kernel_size = kernel_w; | |||
| @@ -208,10 +209,10 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| if (kernel_size > 7 || stride > 4 || dilation_w != dilation_h) | |||
| { | |||
| return Convolution::forward(bottom_blob, top_blob); | |||
| return Convolution::forward(bottom_blob, top_blob, opt); | |||
| } | |||
| typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&); | |||
| typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&); | |||
| // kernel_size x stride | |||
| conv_func conv_func_table[7][4] = | |||
| @@ -263,22 +264,23 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| conv_func conv = conv_func_table[kernel_size-1][stride-1]; | |||
| if (!conv) | |||
| { | |||
| return Convolution::forward(bottom_blob, top_blob); | |||
| return Convolution::forward(bottom_blob, top_blob, opt); | |||
| } | |||
| if (dilation_w != 1) | |||
| { | |||
| return forwardDilation(bottom_blob, top_blob, conv); | |||
| return forwardDilation(bottom_blob, top_blob, conv, opt); | |||
| } | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| Mat bottom_blob_bordered = bottom_blob; | |||
| if (pad_w > 0 || pad_h > 0) | |||
| { | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f); | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); | |||
| if (bottom_blob_bordered.empty()) | |||
| return -100; | |||
| @@ -291,7 +293,7 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int hpad = kernel_size + (h - 1) / stride * stride - h; | |||
| if (wpad > 0 || hpad > 0) | |||
| { | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f); | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); | |||
| if (bottom_blob_bordered.empty()) | |||
| return -100; | |||
| } | |||
| @@ -303,21 +305,21 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int outw = (w - kernel_size) / stride + 1; | |||
| int outh = (h - kernel_size) / stride + 1; | |||
| top_blob.create(outw, outh, num_output); | |||
| top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| if (use_winograd3x3 && w <= 120 && h <= 120) | |||
| { | |||
| // conv3x3s1_winograd64_neon4(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data); | |||
| conv3x3s1_winograd64_neon5(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data); | |||
| // conv3x3s1_winograd64_neon4(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt); | |||
| conv3x3s1_winograd64_neon5(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt); | |||
| } | |||
| else if (use_sgemm1x1) | |||
| { | |||
| conv1x1s1_sgemm_neon(bottom_blob_bordered, top_blob, weight_1x1_sgemm_data, bias_data); | |||
| conv1x1s1_sgemm_neon(bottom_blob_bordered, top_blob, weight_1x1_sgemm_data, bias_data, opt); | |||
| } | |||
| else | |||
| conv(bottom_blob_bordered, top_blob, weight_data, bias_data); | |||
| conv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); | |||
| return 0; | |||
| } | |||
| @@ -19,7 +19,7 @@ | |||
| namespace ncnn { | |||
| typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&); | |||
| typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&); | |||
| class Convolution_arm : public Convolution | |||
| { | |||
| @@ -28,8 +28,8 @@ public: | |||
| virtual int load_model(const ModelBin& mb); | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
| virtual int forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv_func conv) const; | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| virtual int forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv_func conv, const Option& opt) const; | |||
| public: | |||
| bool use_winograd3x3; | |||
| @@ -16,7 +16,7 @@ | |||
| #include <arm_neon.h> | |||
| #endif // __ARM_NEON | |||
| static void convdw3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) | |||
| static void convdw3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| @@ -28,7 +28,7 @@ static void convdw3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ | |||
| const float* kernel = _kernel; | |||
| const float* bias = _bias; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| Mat out = top_blob.channel(g); | |||
| @@ -577,7 +577,7 @@ static void convdw3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ | |||
| } | |||
| } | |||
| static void convdw3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) | |||
| static void convdw3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| @@ -591,7 +591,7 @@ static void convdw3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ | |||
| const float* kernel = _kernel; | |||
| const float* bias = _bias; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| Mat out = top_blob.channel(g); | |||
| @@ -102,7 +102,7 @@ int ConvolutionDepthWise_arm::load_model(const ModelBin& mb) | |||
| return 0; | |||
| } | |||
| int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| // convolv with NxN kernel | |||
| // value = value + bias | |||
| @@ -110,6 +110,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| if (channels % group != 0 || num_output % group != 0) | |||
| { | |||
| @@ -123,7 +124,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con | |||
| Mat bottom_blob_bordered = bottom_blob; | |||
| if (pad_w > 0 || pad_h > 0) | |||
| { | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f); | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); | |||
| if (bottom_blob_bordered.empty()) | |||
| return -100; | |||
| @@ -136,7 +137,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con | |||
| int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h; | |||
| if (wpad > 0 || hpad > 0) | |||
| { | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f); | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); | |||
| if (bottom_blob_bordered.empty()) | |||
| return -100; | |||
| } | |||
| @@ -148,7 +149,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con | |||
| int outw = (w - kernel_extent_w) / stride_w + 1; | |||
| int outh = (h - kernel_extent_h) / stride_h + 1; | |||
| top_blob.create(outw, outh, num_output); | |||
| top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -161,12 +162,12 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con | |||
| { | |||
| if (stride_w == 1 && stride_h == 1) | |||
| { | |||
| convdw3x3s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data); | |||
| convdw3x3s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); | |||
| return 0; | |||
| } | |||
| else if (stride_w == 2 && stride_h == 2) | |||
| { | |||
| convdw3x3s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data); | |||
| convdw3x3s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); | |||
| return 0; | |||
| } | |||
| } | |||
| @@ -176,7 +177,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con | |||
| omp_set_nested(0); | |||
| #endif | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| Mat bottom_blob_bordered_g(w, h, 1, bottom_blob_bordered.channel(g)); | |||
| @@ -213,7 +214,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con | |||
| op->load_model(ModelBinFromMatArray(weights)); | |||
| // forward | |||
| op->forward(bottom_blob_bordered_g, top_blob_g); | |||
| op->forward(bottom_blob_bordered_g, top_blob_g, opt); | |||
| delete op; | |||
| } | |||
| @@ -235,7 +236,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con | |||
| const ncnn::Layer* op = group_ops[g]; | |||
| // forward | |||
| op->forward(bottom_blob_bordered_g, top_blob_g); | |||
| op->forward(bottom_blob_bordered_g, top_blob_g, opt); | |||
| } | |||
| return 0; | |||
| @@ -27,7 +27,7 @@ public: | |||
| virtual int load_model(const ModelBin& mb); | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| public: | |||
| std::vector<ncnn::Layer*> group_ops; | |||
| @@ -16,7 +16,7 @@ | |||
| #include <arm_neon.h> | |||
| #endif // __ARM_NEON | |||
| static void deconv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) | |||
| static void deconv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| @@ -28,7 +28,7 @@ static void deconv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ | |||
| const float* kernel = _kernel; | |||
| const float* bias = _bias; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=0; p<outch; p++) | |||
| { | |||
| Mat out = top_blob.channel(p); | |||
| @@ -237,7 +237,7 @@ static void deconv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ | |||
| } | |||
| } | |||
| static void deconv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) | |||
| static void deconv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| @@ -249,7 +249,7 @@ static void deconv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ | |||
| const float* kernel = _kernel; | |||
| const float* bias = _bias; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=0; p<outch; p++) | |||
| { | |||
| Mat out = top_blob.channel(p); | |||
| @@ -16,7 +16,7 @@ | |||
| #include <arm_neon.h> | |||
| #endif // __ARM_NEON | |||
| static void deconv4x4s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) | |||
| static void deconv4x4s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| @@ -28,7 +28,7 @@ static void deconv4x4s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ | |||
| const float* kernel = _kernel; | |||
| const float* bias = _bias; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=0; p<outch; p++) | |||
| { | |||
| Mat out = top_blob.channel(p); | |||
| @@ -185,7 +185,7 @@ static void deconv4x4s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ | |||
| } | |||
| } | |||
| static void deconv4x4s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias) | |||
| static void deconv4x4s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| @@ -197,7 +197,7 @@ static void deconv4x4s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ | |||
| const float* kernel = _kernel; | |||
| const float* bias = _bias; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=0; p<outch; p++) | |||
| { | |||
| Mat out = top_blob.channel(p); | |||
| @@ -21,14 +21,14 @@ namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(Deconvolution_arm) | |||
| int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| // deconvolv with NxN kernel | |||
| // value = value + bias | |||
| if (kernel_w != kernel_h || stride_w != stride_h) | |||
| { | |||
| return Deconvolution::forward(bottom_blob, top_blob); | |||
| return Deconvolution::forward(bottom_blob, top_blob, opt); | |||
| } | |||
| const int kernel_size = kernel_w; | |||
| @@ -36,10 +36,10 @@ int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| if ((kernel_size != 3 && kernel_size != 4) || stride > 2 || dilation_w != 1 || dilation_h != 1) | |||
| { | |||
| return Deconvolution::forward(bottom_blob, top_blob); | |||
| return Deconvolution::forward(bottom_blob, top_blob, opt); | |||
| } | |||
| typedef void (*deconv_func)(const Mat&, Mat&, const Mat&, const Mat&); | |||
| typedef void (*deconv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&); | |||
| // kernel_size x stride | |||
| deconv_func deconv_func_table[2][2] = | |||
| @@ -57,33 +57,46 @@ int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| deconv_func deconv = deconv_func_table[kernel_size-3][stride-1]; | |||
| if (!deconv) | |||
| { | |||
| return Deconvolution::forward(bottom_blob, top_blob); | |||
| return Deconvolution::forward(bottom_blob, top_blob, opt); | |||
| } | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int outw = (w - 1) * stride + kernel_size; | |||
| int outh = (h - 1) * stride + kernel_size; | |||
| Mat top_blob_bordered = top_blob; | |||
| top_blob_bordered.create(outw, outh, num_output); | |||
| if (top_blob_bordered.empty()) | |||
| return -100; | |||
| deconv(bottom_blob, top_blob_bordered, weight_data, bias_data); | |||
| Mat top_blob_bordered; | |||
| if (pad_w > 0 || pad_h > 0) | |||
| { | |||
| top_blob_bordered.create(outw, outh, num_output, elemsize, opt.workspace_allocator); | |||
| if (top_blob_bordered.empty()) | |||
| return -100; | |||
| } | |||
| else | |||
| { | |||
| top_blob_bordered = top_blob; | |||
| top_blob_bordered.create(outw, outh, num_output, elemsize, opt.blob_allocator); | |||
| if (top_blob_bordered.empty()) | |||
| return -100; | |||
| } | |||
| top_blob = top_blob_bordered; | |||
| deconv(bottom_blob, top_blob_bordered, weight_data, bias_data, opt); | |||
| if (pad_w > 0 || pad_h > 0) | |||
| { | |||
| copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w); | |||
| copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w, opt.blob_allocator, opt.num_threads); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| outw = top_blob.w; | |||
| outh = top_blob.h; | |||
| } | |||
| else | |||
| { | |||
| top_blob = top_blob_bordered; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -22,7 +22,7 @@ namespace ncnn { | |||
| class Deconvolution_arm : public Deconvolution | |||
| { | |||
| public: | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -24,7 +24,7 @@ namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(DeconvolutionDepthWise_arm) | |||
| int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| // convolv with NxN kernel | |||
| // value = value + bias | |||
| @@ -32,6 +32,7 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) c | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| if (channels % group != 0 || num_output % group != 0) | |||
| { | |||
| @@ -45,10 +46,20 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) c | |||
| int outw = (w - 1) * stride_w + kernel_extent_w; | |||
| int outh = (h - 1) * stride_h + kernel_extent_h; | |||
| Mat top_blob_bordered = top_blob; | |||
| top_blob_bordered.create(outw, outh, num_output); | |||
| if (top_blob_bordered.empty()) | |||
| return -100; | |||
| Mat top_blob_bordered; | |||
| if (pad_w > 0 || pad_h > 0) | |||
| { | |||
| top_blob_bordered.create(outw, outh, num_output, elemsize, opt.workspace_allocator); | |||
| if (top_blob_bordered.empty()) | |||
| return -100; | |||
| } | |||
| else | |||
| { | |||
| top_blob_bordered = top_blob; | |||
| top_blob_bordered.create(outw, outh, num_output, elemsize, opt.blob_allocator); | |||
| if (top_blob_bordered.empty()) | |||
| return -100; | |||
| } | |||
| const int maxk = kernel_w * kernel_h; | |||
| @@ -60,7 +71,7 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) c | |||
| omp_set_nested(0); | |||
| #endif | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| Mat bottom_blob_g(w, h, 1, bottom_blob.channel(g).data); | |||
| @@ -98,7 +109,7 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) c | |||
| op->load_model(ModelBinFromMatArray(weights)); | |||
| // forward | |||
| op->forward(bottom_blob_g, top_blob_bordered_g); | |||
| op->forward(bottom_blob_g, top_blob_bordered_g, opt); | |||
| delete op; | |||
| } | |||
| @@ -148,23 +159,25 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) c | |||
| op->load_model(ModelBinFromMatArray(weights)); | |||
| // forward | |||
| op->forward(bottom_blob_g, top_blob_bordered_g); | |||
| op->forward(bottom_blob_g, top_blob_bordered_g, opt); | |||
| delete op; | |||
| } | |||
| } | |||
| top_blob = top_blob_bordered; | |||
| if (pad_w > 0 || pad_h > 0) | |||
| { | |||
| copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w); | |||
| copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w, opt.blob_allocator, opt.num_threads); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| outw = top_blob.w; | |||
| outh = top_blob.h; | |||
| } | |||
| else | |||
| { | |||
| top_blob = top_blob_bordered; | |||
| } | |||
| return 0; | |||
| @@ -22,7 +22,7 @@ namespace ncnn { | |||
| class DeconvolutionDepthWise_arm : public DeconvolutionDepthWise | |||
| { | |||
| public: | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -22,16 +22,17 @@ namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(Eltwise_arm) | |||
| int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const | |||
| int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const | |||
| { | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int size = w * h; | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(w, h, channels); | |||
| top_blob.create(w, h, channels, elemsize, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -39,7 +40,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& | |||
| { | |||
| // first blob | |||
| const Mat& bottom_blob1 = bottom_blobs[1]; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob.channel(q); | |||
| @@ -117,7 +118,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& | |||
| for (size_t b=2; b<bottom_blobs.size(); b++) | |||
| { | |||
| const Mat& bottom_blob1 = bottom_blobs[b]; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob1.channel(q); | |||
| @@ -193,7 +194,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& | |||
| { | |||
| // first blob | |||
| const Mat& bottom_blob1 = bottom_blobs[1]; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob.channel(q); | |||
| @@ -271,7 +272,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& | |||
| for (size_t b=2; b<bottom_blobs.size(); b++) | |||
| { | |||
| const Mat& bottom_blob1 = bottom_blobs[b]; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob1.channel(q); | |||
| @@ -349,7 +350,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& | |||
| const Mat& bottom_blob1 = bottom_blobs[1]; | |||
| float coeff0 = coeffs_ptr[0]; | |||
| float coeff1 = coeffs_ptr[1]; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob.channel(q); | |||
| @@ -436,7 +437,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& | |||
| { | |||
| const Mat& bottom_blob1 = bottom_blobs[b]; | |||
| float coeff = coeffs_ptr[b]; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob1.channel(q); | |||
| @@ -514,7 +515,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& | |||
| { | |||
| // first blob | |||
| const Mat& bottom_blob1 = bottom_blobs[1]; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob.channel(q); | |||
| @@ -592,7 +593,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& | |||
| for (size_t b=2; b<bottom_blobs.size(); b++) | |||
| { | |||
| const Mat& bottom_blob1 = bottom_blobs[b]; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob1.channel(q); | |||
| @@ -22,7 +22,7 @@ namespace ncnn { | |||
| class Eltwise_arm : public Eltwise | |||
| { | |||
| public: | |||
| virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const; | |||
| virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -22,14 +22,15 @@ namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(InnerProduct_arm) | |||
| int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int size = w * h; | |||
| top_blob.create(num_output); | |||
| top_blob.create(num_output, elemsize, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -38,7 +39,7 @@ int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int nn_num_output = num_output >> 2; | |||
| int remain_num_output_start = nn_num_output << 2; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int pp=0; pp<nn_num_output; pp++) | |||
| { | |||
| int p = pp * 4; | |||
| @@ -143,7 +144,7 @@ int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| } | |||
| // num_output | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=remain_num_output_start; p<num_output; p++) | |||
| { | |||
| float sum = 0.f; | |||
| @@ -22,7 +22,7 @@ namespace ncnn { | |||
| class InnerProduct_arm : public InnerProduct | |||
| { | |||
| public: | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -24,20 +24,21 @@ namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(LRN_arm) | |||
| int LRN_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| int LRN_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| int channels = bottom_top_blob.c; | |||
| size_t elemsize = bottom_top_blob.elemsize; | |||
| int size = w * h; | |||
| // squared values with local_size padding | |||
| Mat square_blob; | |||
| square_blob.create(w, h, channels); | |||
| square_blob.create(w, h, channels, elemsize, opt.workspace_allocator); | |||
| if (square_blob.empty()) | |||
| return -100; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_top_blob.channel(q); | |||
| @@ -73,14 +74,14 @@ int LRN_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| if (region_type == NormRegion_ACROSS_CHANNELS) | |||
| { | |||
| Mat square_sum; | |||
| square_sum.create(w, h, channels); | |||
| square_sum.create(w, h, channels, elemsize, opt.workspace_allocator); | |||
| if (square_sum.empty()) | |||
| return -100; | |||
| square_sum.fill(0.f); | |||
| const float alpha_div_size = alpha / local_size; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| // square sum | |||
| @@ -165,7 +166,7 @@ int LRN_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| int pad = local_size / 2; | |||
| if (pad > 0) | |||
| { | |||
| copy_make_border(square_blob, square_blob_bordered, pad, local_size - pad - 1, pad, local_size - pad - 1, BORDER_CONSTANT, 0.f); | |||
| copy_make_border(square_blob, square_blob_bordered, pad, local_size - pad - 1, pad, local_size - pad - 1, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); | |||
| if (square_blob_bordered.empty()) | |||
| return -100; | |||
| @@ -196,7 +197,7 @@ int LRN_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| } | |||
| } | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| @@ -22,7 +22,7 @@ namespace ncnn { | |||
| class LRN_arm : public LRN | |||
| { | |||
| public: | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -16,7 +16,7 @@ | |||
| #include <arm_neon.h> | |||
| #endif // __ARM_NEON | |||
| static void pooling2x2s2_max_neon(const Mat& bottom_blob, Mat& top_blob) | |||
| static void pooling2x2s2_max_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int inch = bottom_blob.c; | |||
| @@ -26,7 +26,7 @@ static void pooling2x2s2_max_neon(const Mat& bottom_blob, Mat& top_blob) | |||
| const int tailstep = w - 2*outw + w; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<inch; q++) | |||
| { | |||
| const float* img0 = bottom_blob.channel(q); | |||
| @@ -16,7 +16,7 @@ | |||
| #include <arm_neon.h> | |||
| #endif // __ARM_NEON | |||
| static void pooling3x3s2_max_neon(const Mat& bottom_blob, Mat& top_blob) | |||
| static void pooling3x3s2_max_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int inch = bottom_blob.c; | |||
| @@ -26,7 +26,7 @@ static void pooling3x3s2_max_neon(const Mat& bottom_blob, Mat& top_blob) | |||
| const int tailstep = w - 2*outw + w; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<inch; q++) | |||
| { | |||
| const float* img0 = bottom_blob.channel(q); | |||
| @@ -21,14 +21,14 @@ namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(Pooling_arm) | |||
| int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| // max value in NxN window | |||
| // avg value in NxN window | |||
| if (kernel_w != kernel_h || stride_w != stride_h) | |||
| { | |||
| return Pooling::forward(bottom_blob, top_blob); | |||
| return Pooling::forward(bottom_blob, top_blob, opt); | |||
| } | |||
| const int kernel_size = kernel_w; | |||
| @@ -36,17 +36,18 @@ int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| if (pooling_type != PoolMethod_MAX || stride != 2 || global_pooling == 1) | |||
| { | |||
| return Pooling::forward(bottom_blob, top_blob); | |||
| return Pooling::forward(bottom_blob, top_blob, opt); | |||
| } | |||
| if (kernel_size != 2 && kernel_size != 3) | |||
| { | |||
| return Pooling::forward(bottom_blob, top_blob); | |||
| return Pooling::forward(bottom_blob, top_blob, opt); | |||
| } | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| Mat bottom_blob_bordered = bottom_blob; | |||
| @@ -73,7 +74,7 @@ int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| if (htail != 0) | |||
| htailpad = stride_h - htail; | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom + htailpad, pad_left, pad_right + wtailpad, BORDER_CONSTANT, pad_value); | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom + htailpad, pad_left, pad_right + wtailpad, BORDER_CONSTANT, pad_value, opt.workspace_allocator, opt.num_threads); | |||
| if (bottom_blob_bordered.empty()) | |||
| return -100; | |||
| @@ -82,7 +83,7 @@ int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| } | |||
| else if (pad_mode == 1) // valid padding | |||
| { | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom, pad_left, pad_right, BORDER_CONSTANT, pad_value); | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom, pad_left, pad_right, BORDER_CONSTANT, pad_value, opt.workspace_allocator, opt.num_threads); | |||
| if (bottom_blob_bordered.empty()) | |||
| return -100; | |||
| @@ -95,7 +96,7 @@ int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int hpad = kernel_h + (h - 1) / stride_h * stride_h - h; | |||
| if (wpad > 0 || hpad > 0) | |||
| { | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, pad_value); | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, pad_value, opt.workspace_allocator, opt.num_threads); | |||
| if (bottom_blob_bordered.empty()) | |||
| return -100; | |||
| } | |||
| @@ -107,14 +108,14 @@ int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int outw = (w - kernel_w) / stride_w + 1; | |||
| int outh = (h - kernel_h) / stride_h + 1; | |||
| top_blob.create(outw, outh, channels); | |||
| top_blob.create(outw, outh, channels, elemsize, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| if (kernel_size == 2) | |||
| pooling2x2s2_max_neon(bottom_blob_bordered, top_blob); | |||
| pooling2x2s2_max_neon(bottom_blob_bordered, top_blob, opt); | |||
| if (kernel_size == 3) | |||
| pooling3x3s2_max_neon(bottom_blob_bordered, top_blob); | |||
| pooling3x3s2_max_neon(bottom_blob_bordered, top_blob, opt); | |||
| return 0; | |||
| } | |||
| @@ -22,7 +22,7 @@ namespace ncnn { | |||
| class Pooling_arm : public Pooling | |||
| { | |||
| public: | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -22,11 +22,11 @@ namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(PReLU_arm) | |||
| int PReLU_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| int PReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| int dims = bottom_top_blob.dims; | |||
| if (dims != 3) | |||
| return PReLU::forward_inplace(bottom_top_blob); | |||
| return PReLU::forward_inplace(bottom_top_blob, opt); | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| @@ -35,7 +35,7 @@ int PReLU_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| const float* slope_data_ptr = slope_data; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| @@ -22,7 +22,7 @@ namespace ncnn { | |||
| class PReLU_arm : public PReLU | |||
| { | |||
| public: | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -22,7 +22,7 @@ namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(ReLU_arm) | |||
| int ReLU_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| int ReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| @@ -31,7 +31,7 @@ int ReLU_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| if (slope == 0.f) | |||
| { | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| @@ -85,7 +85,7 @@ int ReLU_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| } | |||
| else | |||
| { | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| @@ -22,7 +22,7 @@ namespace ncnn { | |||
| class ReLU_arm : public ReLU | |||
| { | |||
| public: | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -22,11 +22,11 @@ namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(Scale_arm) | |||
| int Scale_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| int Scale_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| int dims = bottom_top_blob.dims; | |||
| if (dims != 3) | |||
| return Scale::forward_inplace(bottom_top_blob); | |||
| return Scale::forward_inplace(bottom_top_blob, opt); | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| @@ -37,7 +37,7 @@ int Scale_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| { | |||
| const float* scale_ptr = scale_data; | |||
| const float* bias_ptr = bias_data; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| @@ -76,7 +76,7 @@ int Scale_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| else | |||
| { | |||
| const float* scale_ptr = scale_data; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| @@ -22,7 +22,7 @@ namespace ncnn { | |||
| class Scale_arm : public Scale | |||
| { | |||
| public: | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -25,14 +25,14 @@ namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(Sigmoid_arm) | |||
| int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| int channels = bottom_top_blob.c; | |||
| int size = w * h; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| @@ -22,7 +22,7 @@ namespace ncnn { | |||
| class Sigmoid_arm : public Sigmoid | |||
| { | |||
| public: | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -25,12 +25,12 @@ namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(Softmax_arm) | |||
| int Softmax_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| int Softmax_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| int dims = bottom_top_blob.dims; | |||
| if (dims != 3 || axis != 0) | |||
| return Softmax::forward_inplace(bottom_top_blob); | |||
| return Softmax::forward_inplace(bottom_top_blob, opt); | |||
| // value = exp( value - global max value ) | |||
| // sum all value | |||
| @@ -39,10 +39,11 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| int channels = bottom_top_blob.c; | |||
| size_t elemsize = bottom_top_blob.elemsize; | |||
| int size = w * h; | |||
| Mat max; | |||
| max.create(w, h); | |||
| max.create(w, h, elemsize, opt.workspace_allocator); | |||
| if (max.empty()) | |||
| return -100; | |||
| max.fill(-FLT_MAX); | |||
| @@ -57,7 +58,7 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| } | |||
| } | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| @@ -95,7 +96,7 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| } | |||
| Mat sum; | |||
| sum.create(w, h); | |||
| sum.create(w, h, elemsize, opt.workspace_allocator); | |||
| if (sum.empty()) | |||
| return -100; | |||
| sum.fill(0.f); | |||
| @@ -133,7 +134,7 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| } | |||
| } | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| @@ -22,7 +22,7 @@ namespace ncnn { | |||
| class Softmax_arm : public Softmax | |||
| { | |||
| public: | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -68,7 +68,7 @@ int BatchNorm::load_model(const ModelBin& mb) | |||
| return 0; | |||
| } | |||
| int BatchNorm::forward_inplace(Mat& bottom_top_blob) const | |||
| int BatchNorm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| // a = bias - slope * mean / sqrt(var) | |||
| // b = slope / sqrt(var) | |||
| @@ -82,7 +82,7 @@ int BatchNorm::forward_inplace(Mat& bottom_top_blob) const | |||
| float* ptr = bottom_top_blob; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int i=0; i<w; i++) | |||
| { | |||
| ptr[i] = b_data[i] * ptr[i] + a_data[i]; | |||
| @@ -94,7 +94,7 @@ int BatchNorm::forward_inplace(Mat& bottom_top_blob) const | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int i=0; i<h; i++) | |||
| { | |||
| float* ptr = bottom_top_blob.row(i); | |||
| @@ -114,7 +114,7 @@ int BatchNorm::forward_inplace(Mat& bottom_top_blob) const | |||
| int h = bottom_top_blob.h; | |||
| int size = w * h; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| @@ -28,7 +28,7 @@ public: | |||
| virtual int load_model(const ModelBin& mb); | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; | |||
| public: | |||
| // param | |||
| @@ -40,14 +40,14 @@ int Bias::load_model(const ModelBin& mb) | |||
| return 0; | |||
| } | |||
| int Bias::forward_inplace(Mat& bottom_top_blob) const | |||
| int Bias::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| int channels = bottom_top_blob.c; | |||
| int size = w * h; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| @@ -28,7 +28,7 @@ public: | |||
| virtual int load_model(const ModelBin& mb); | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; | |||
| public: | |||
| // param | |||
| @@ -43,7 +43,7 @@ int BinaryOp::load_param(const ParamDict& pd) | |||
| } | |||
| template<typename Op> | |||
| static int binary_op(const Mat& a, const Mat& b, Mat& c) | |||
| static int binary_op(const Mat& a, const Mat& b, Mat& c, const Option& opt) | |||
| { | |||
| Op op; | |||
| @@ -51,6 +51,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c) | |||
| int h = a.h; | |||
| int channels = a.c; | |||
| int size = w * h; | |||
| size_t elemsize = a.elemsize; | |||
| int w1 = b.w; | |||
| int h1 = b.h; | |||
| @@ -59,13 +60,13 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c) | |||
| if (a.dims == 3) | |||
| { | |||
| c.create(w, h, channels); | |||
| c.create(w, h, channels, elemsize, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| if (b.dims == 3) | |||
| { | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = a.channel(q); | |||
| @@ -83,7 +84,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c) | |||
| if (b.dims == 2) | |||
| { | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = a.channel(q); | |||
| @@ -111,7 +112,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c) | |||
| if (b.w == 1) | |||
| { | |||
| const float b0 = b[0]; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = a.channel(q); | |||
| @@ -126,7 +127,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c) | |||
| return 0; | |||
| } | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = a.channel(q); | |||
| @@ -146,11 +147,11 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c) | |||
| { | |||
| if (b.dims == 3) | |||
| { | |||
| c.create(w1, h1, channels1); | |||
| c.create(w1, h1, channels1, elemsize, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels1; q++) | |||
| { | |||
| const float* ptr = (const float*)a + h1 * q; | |||
| @@ -173,7 +174,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c) | |||
| return 0; | |||
| } | |||
| c.create(w, h); | |||
| c.create(w, h, elemsize, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| @@ -189,7 +190,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c) | |||
| if (b.dims == 1) | |||
| { | |||
| c.create(w, h); | |||
| c.create(w, h, elemsize, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| @@ -228,12 +229,12 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c) | |||
| { | |||
| if (b.dims == 3) | |||
| { | |||
| c.create(w1, h1, channels1); | |||
| c.create(w1, h1, channels1, elemsize, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| const float a0 = a[0]; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels1; q++) | |||
| { | |||
| const float* ptr1 = b.channel(q); | |||
| @@ -250,7 +251,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c) | |||
| if (b.dims == 2) | |||
| { | |||
| c.create(w1, h1); | |||
| c.create(w1, h1, elemsize, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| @@ -265,7 +266,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c) | |||
| if (b.dims == 1) | |||
| { | |||
| c.create(w1); | |||
| c.create(w1, elemsize, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| @@ -281,11 +282,11 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c) | |||
| if (b.dims == 3) | |||
| { | |||
| c.create(w1, h1, channels1); | |||
| c.create(w1, h1, channels1, elemsize, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels1; q++) | |||
| { | |||
| const float a0 = a[q]; | |||
| @@ -303,7 +304,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c) | |||
| if (b.dims == 2) | |||
| { | |||
| c.create(w1, h1); | |||
| c.create(w1, h1, elemsize, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| @@ -327,7 +328,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c) | |||
| if (b.dims == 1) | |||
| { | |||
| c.create(w); | |||
| c.create(w, elemsize, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| @@ -353,7 +354,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c) | |||
| } | |||
| template<typename Op> | |||
| static int binary_op_scalar_inplace(Mat& a, float b) | |||
| static int binary_op_scalar_inplace(Mat& a, float b, const Option& opt) | |||
| { | |||
| Op op; | |||
| @@ -362,7 +363,7 @@ static int binary_op_scalar_inplace(Mat& a, float b) | |||
| int channels = a.c; | |||
| int size = w * h; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = a.channel(q); | |||
| @@ -401,7 +402,7 @@ struct binary_op_rdiv : std::binary_function<T,T,T> { | |||
| T operator() (const T& x, const T& y) const { return y / x; } | |||
| }; | |||
| int BinaryOp::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const | |||
| int BinaryOp::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const | |||
| { | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| const Mat& bottom_blob1 = bottom_blobs[1]; | |||
| @@ -409,63 +410,63 @@ int BinaryOp::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to | |||
| Mat& top_blob = top_blobs[0]; | |||
| if (op_type == Operation_ADD) | |||
| return binary_op< std::plus<float> >(bottom_blob, bottom_blob1, top_blob); | |||
| return binary_op< std::plus<float> >(bottom_blob, bottom_blob1, top_blob, opt); | |||
| if (op_type == Operation_SUB) | |||
| return binary_op< std::minus<float> >(bottom_blob, bottom_blob1, top_blob); | |||
| return binary_op< std::minus<float> >(bottom_blob, bottom_blob1, top_blob, opt); | |||
| if (op_type == Operation_MUL) | |||
| return binary_op< std::multiplies<float> >(bottom_blob, bottom_blob1, top_blob); | |||
| return binary_op< std::multiplies<float> >(bottom_blob, bottom_blob1, top_blob, opt); | |||
| if (op_type == Operation_DIV) | |||
| return binary_op< std::divides<float> >(bottom_blob, bottom_blob1, top_blob); | |||
| return binary_op< std::divides<float> >(bottom_blob, bottom_blob1, top_blob, opt); | |||
| if (op_type == Operation_MAX) | |||
| return binary_op< binary_op_max<float> >(bottom_blob, bottom_blob1, top_blob); | |||
| return binary_op< binary_op_max<float> >(bottom_blob, bottom_blob1, top_blob, opt); | |||
| if (op_type == Operation_MIN) | |||
| return binary_op< binary_op_min<float> >(bottom_blob, bottom_blob1, top_blob); | |||
| return binary_op< binary_op_min<float> >(bottom_blob, bottom_blob1, top_blob, opt); | |||
| if (op_type == Operation_POW) | |||
| return binary_op< binary_op_pow<float> >(bottom_blob, bottom_blob1, top_blob); | |||
| return binary_op< binary_op_pow<float> >(bottom_blob, bottom_blob1, top_blob, opt); | |||
| if (op_type == Operation_RSUB) | |||
| return binary_op< binary_op_rsub<float> >(bottom_blob, bottom_blob1, top_blob); | |||
| return binary_op< binary_op_rsub<float> >(bottom_blob, bottom_blob1, top_blob, opt); | |||
| if (op_type == Operation_RDIV) | |||
| return binary_op< binary_op_rdiv<float> >(bottom_blob, bottom_blob1, top_blob); | |||
| return binary_op< binary_op_rdiv<float> >(bottom_blob, bottom_blob1, top_blob, opt); | |||
| return 0; | |||
| } | |||
| int BinaryOp::forward_inplace(Mat& bottom_top_blob) const | |||
| int BinaryOp::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| if (op_type == Operation_ADD) | |||
| return binary_op_scalar_inplace< std::plus<float> >(bottom_top_blob, b); | |||
| return binary_op_scalar_inplace< std::plus<float> >(bottom_top_blob, b, opt); | |||
| if (op_type == Operation_SUB) | |||
| return binary_op_scalar_inplace< std::minus<float> >(bottom_top_blob, b); | |||
| return binary_op_scalar_inplace< std::minus<float> >(bottom_top_blob, b, opt); | |||
| if (op_type == Operation_MUL) | |||
| return binary_op_scalar_inplace< std::multiplies<float> >(bottom_top_blob, b); | |||
| return binary_op_scalar_inplace< std::multiplies<float> >(bottom_top_blob, b, opt); | |||
| if (op_type == Operation_DIV) | |||
| return binary_op_scalar_inplace< std::divides<float> >(bottom_top_blob, b); | |||
| return binary_op_scalar_inplace< std::divides<float> >(bottom_top_blob, b, opt); | |||
| if (op_type == Operation_MAX) | |||
| return binary_op_scalar_inplace< binary_op_max<float> >(bottom_top_blob, b); | |||
| return binary_op_scalar_inplace< binary_op_max<float> >(bottom_top_blob, b, opt); | |||
| if (op_type == Operation_MIN) | |||
| return binary_op_scalar_inplace< binary_op_min<float> >(bottom_top_blob, b); | |||
| return binary_op_scalar_inplace< binary_op_min<float> >(bottom_top_blob, b, opt); | |||
| if (op_type == Operation_POW) | |||
| return binary_op_scalar_inplace< binary_op_pow<float> >(bottom_top_blob, b); | |||
| return binary_op_scalar_inplace< binary_op_pow<float> >(bottom_top_blob, b, opt); | |||
| if (op_type == Operation_RSUB) | |||
| return binary_op_scalar_inplace< binary_op_rsub<float> >(bottom_top_blob, b); | |||
| return binary_op_scalar_inplace< binary_op_rsub<float> >(bottom_top_blob, b, opt); | |||
| if (op_type == Operation_RDIV) | |||
| return binary_op_scalar_inplace< binary_op_rdiv<float> >(bottom_top_blob, b); | |||
| return binary_op_scalar_inplace< binary_op_rdiv<float> >(bottom_top_blob, b, opt); | |||
| return 0; | |||
| } | |||
| @@ -26,9 +26,9 @@ public: | |||
| virtual int load_param(const ParamDict& pd); | |||
| virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const; | |||
| virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; | |||
| enum { | |||
| Operation_ADD = 0, | |||
| @@ -25,14 +25,14 @@ BNLL::BNLL() | |||
| support_inplace = true; | |||
| } | |||
| int BNLL::forward_inplace(Mat& bottom_top_blob) const | |||
| int BNLL::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| int channels = bottom_top_blob.c; | |||
| int size = w * h; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| @@ -24,7 +24,7 @@ class BNLL : public Layer | |||
| public: | |||
| BNLL(); | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; | |||
| public: | |||
| }; | |||
| @@ -34,14 +34,14 @@ int Clip::load_param(const ParamDict& pd) | |||
| return 0; | |||
| } | |||
| int Clip::forward_inplace(Mat& bottom_top_blob) const | |||
| int Clip::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| int channels = bottom_top_blob.c; | |||
| int size = w * h; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| @@ -26,7 +26,7 @@ public: | |||
| virtual int load_param(const ParamDict& pd); | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; | |||
| public: | |||
| float min; | |||
| @@ -31,7 +31,7 @@ int Concat::load_param(const ParamDict& pd) | |||
| return 0; | |||
| } | |||
| int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const | |||
| int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const | |||
| { | |||
| int dims = bottom_blobs[0].dims; | |||
| size_t elemsize = bottom_blobs[0].elemsize; | |||
| @@ -48,7 +48,7 @@ int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_ | |||
| } | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(top_w, elemsize); | |||
| top_blob.create(top_w, elemsize, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -82,7 +82,7 @@ int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_ | |||
| } | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(w, top_h, elemsize); | |||
| top_blob.create(w, top_h, elemsize, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -116,11 +116,11 @@ int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_ | |||
| } | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(top_w, h, elemsize); | |||
| top_blob.create(top_w, h, elemsize, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int i=0; i<h; i++) | |||
| { | |||
| float* outptr = top_blob.row(i); | |||
| @@ -153,7 +153,7 @@ int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_ | |||
| } | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(w, h, top_channels, elemsize); | |||
| top_blob.create(w, h, top_channels, elemsize, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -190,11 +190,11 @@ int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_ | |||
| } | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(w, top_h, channels, elemsize); | |||
| top_blob.create(w, top_h, channels, elemsize, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* outptr = top_blob.channel(q); | |||
| @@ -230,11 +230,11 @@ int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_ | |||
| } | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(top_w, h, channels, elemsize); | |||
| top_blob.create(top_w, h, channels, elemsize, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* outptr = top_blob.channel(q); | |||
| @@ -26,7 +26,7 @@ public: | |||
| virtual int load_param(const ParamDict& pd); | |||
| virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const; | |||
| virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; | |||
| public: | |||
| int axis; | |||
| @@ -59,7 +59,7 @@ int Convolution::load_model(const ModelBin& mb) | |||
| return 0; | |||
| } | |||
| int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| // convolv with NxN kernel | |||
| // value = value + bias | |||
| @@ -89,7 +89,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| op->load_model(ModelBinFromMatArray(weights)); | |||
| // forward | |||
| op->forward(bottom_blob, top_blob); | |||
| op->forward(bottom_blob, top_blob, opt); | |||
| delete op; | |||
| @@ -100,6 +100,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| // fprintf(stderr, "Convolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d\n", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h); | |||
| @@ -109,7 +110,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| Mat bottom_blob_bordered = bottom_blob; | |||
| if (pad_w > 0 || pad_h > 0) | |||
| { | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f); | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); | |||
| if (bottom_blob_bordered.empty()) | |||
| return -100; | |||
| @@ -122,7 +123,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h; | |||
| if (wpad > 0 || hpad > 0) | |||
| { | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f); | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); | |||
| if (bottom_blob_bordered.empty()) | |||
| return -100; | |||
| } | |||
| @@ -134,7 +135,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int outw = (w - kernel_extent_w) / stride_w + 1; | |||
| int outh = (h - kernel_extent_h) / stride_h + 1; | |||
| top_blob.create(outw, outh, num_output); | |||
| top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -160,7 +161,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| } | |||
| // num_output | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=0; p<num_output; p++) | |||
| { | |||
| float* outptr = top_blob.channel(p); | |||
| @@ -28,7 +28,7 @@ public: | |||
| virtual int load_model(const ModelBin& mb); | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| public: | |||
| // param | |||
| @@ -64,7 +64,7 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb) | |||
| return 0; | |||
| } | |||
| int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| // convolv with NxN kernel | |||
| // value = value + bias | |||
| @@ -72,6 +72,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| if (channels % group != 0 || num_output % group != 0) | |||
| { | |||
| @@ -87,7 +88,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| Mat bottom_blob_bordered = bottom_blob; | |||
| if (pad_w > 0 || pad_h > 0) | |||
| { | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f); | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); | |||
| if (bottom_blob_bordered.empty()) | |||
| return -100; | |||
| @@ -100,7 +101,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h; | |||
| if (wpad > 0 || hpad > 0) | |||
| { | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f); | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); | |||
| if (bottom_blob_bordered.empty()) | |||
| return -100; | |||
| } | |||
| @@ -112,7 +113,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int outw = (w - kernel_extent_w) / stride_w + 1; | |||
| int outh = (h - kernel_extent_h) / stride_h + 1; | |||
| top_blob.create(outw, outh, num_output); | |||
| top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -140,7 +141,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| // depth-wise | |||
| if (channels == group && group == num_output) | |||
| { | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| float* outptr = top_blob.channel(g); | |||
| @@ -179,9 +180,9 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| const int num_output_g = num_output / group; | |||
| #ifdef _WIN32 | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| #else // _WIN32 | |||
| #pragma omp parallel for collapse(2) | |||
| #pragma omp parallel for collapse(2) num_threads(opt.num_threads) | |||
| #endif // _WIN32 | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| @@ -28,7 +28,7 @@ public: | |||
| virtual int load_model(const ModelBin& mb); | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| public: | |||
| // param | |||
| @@ -39,7 +39,7 @@ int Crop::load_param(const ParamDict& pd) | |||
| return 0; | |||
| } | |||
| int Crop::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int Crop::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| @@ -56,14 +56,14 @@ int Crop::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int left = woffset; | |||
| int right = w - _outw - woffset; | |||
| copy_cut_border(bottom_blob_sliced, top_blob, top, bottom, left, right); | |||
| copy_cut_border(bottom_blob_sliced, top_blob, top, bottom, left, right, opt.blob_allocator, opt.num_threads); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| return 0; | |||
| } | |||
| int Crop::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const | |||
| int Crop::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const | |||
| { | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| const Mat& reference_blob = bottom_blobs[1]; | |||
| @@ -85,7 +85,7 @@ int Crop::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl | |||
| Mat& top_blob = top_blobs[0]; | |||
| copy_cut_border(bottom_blob_sliced, top_blob, top, bottom, left, right); | |||
| copy_cut_border(bottom_blob_sliced, top_blob, top, bottom, left, right, opt.blob_allocator, opt.num_threads); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -26,9 +26,9 @@ public: | |||
| virtual int load_param(const ParamDict& pd); | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const; | |||
| virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; | |||
| public: | |||
| int woffset; | |||
| @@ -57,7 +57,7 @@ int Deconvolution::load_model(const ModelBin& mb) | |||
| return 0; | |||
| } | |||
| int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| // backward strided convolv with NxN kernel | |||
| // value = value + bias | |||
| @@ -65,6 +65,7 @@ int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| // fprintf(stderr, "Deconvolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d\n", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h); | |||
| @@ -74,10 +75,20 @@ int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int outw = (w - 1) * stride_w + kernel_extent_w; | |||
| int outh = (h - 1) * stride_h + kernel_extent_h; | |||
| Mat top_blob_bordered = top_blob; | |||
| top_blob_bordered.create(outw, outh, num_output); | |||
| if (top_blob_bordered.empty()) | |||
| return -100; | |||
| Mat top_blob_bordered; | |||
| if (pad_w > 0 || pad_h > 0) | |||
| { | |||
| top_blob_bordered.create(outw, outh, num_output, elemsize, opt.workspace_allocator); | |||
| if (top_blob_bordered.empty()) | |||
| return -100; | |||
| } | |||
| else | |||
| { | |||
| top_blob_bordered = top_blob; | |||
| top_blob_bordered.create(outw, outh, num_output, elemsize, opt.blob_allocator); | |||
| if (top_blob_bordered.empty()) | |||
| return -100; | |||
| } | |||
| const int maxk = kernel_w * kernel_h; | |||
| @@ -101,7 +112,7 @@ int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| } | |||
| // num_output | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=0; p<num_output; p++) | |||
| { | |||
| Mat out = top_blob_bordered.channel(p); | |||
| @@ -136,17 +147,19 @@ int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| } | |||
| } | |||
| top_blob = top_blob_bordered; | |||
| if (pad_w > 0 || pad_h > 0) | |||
| { | |||
| copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w); | |||
| copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w, opt.blob_allocator, opt.num_threads); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| outw = top_blob.w; | |||
| outh = top_blob.h; | |||
| } | |||
| else | |||
| { | |||
| top_blob = top_blob_bordered; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -28,7 +28,7 @@ public: | |||
| virtual int load_model(const ModelBin& mb); | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| public: | |||
| // param | |||
| @@ -58,7 +58,7 @@ int DeconvolutionDepthWise::load_model(const ModelBin& mb) | |||
| return 0; | |||
| } | |||
| int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| // deconvolv with NxN kernel | |||
| // value = value + bias | |||
| @@ -66,6 +66,7 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| if (channels % group != 0 || num_output % group != 0) | |||
| { | |||
| @@ -79,10 +80,20 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int outw = (w - 1) * stride_w + kernel_extent_w; | |||
| int outh = (h - 1) * stride_h + kernel_extent_h; | |||
| Mat top_blob_bordered = top_blob; | |||
| top_blob_bordered.create(outw, outh, num_output); | |||
| if (top_blob_bordered.empty()) | |||
| return -100; | |||
| Mat top_blob_bordered; | |||
| if (pad_w > 0 || pad_h > 0) | |||
| { | |||
| top_blob_bordered.create(outw, outh, num_output, elemsize, opt.workspace_allocator); | |||
| if (top_blob_bordered.empty()) | |||
| return -100; | |||
| } | |||
| else | |||
| { | |||
| top_blob_bordered = top_blob; | |||
| top_blob_bordered.create(outw, outh, num_output, elemsize, opt.blob_allocator); | |||
| if (top_blob_bordered.empty()) | |||
| return -100; | |||
| } | |||
| const int maxk = kernel_w * kernel_h; | |||
| @@ -108,7 +119,7 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| // depth-wise | |||
| if (channels == group && group == num_output) | |||
| { | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| const float* inptr = bottom_blob.channel(g); | |||
| @@ -141,7 +152,7 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| const int channels_g = channels / group; | |||
| const int num_output_g = num_output / group; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int g = 0; g < group; g++) | |||
| { | |||
| const float* weight_data_ptr = (const float*)weight_data + maxk * channels_g * num_output_g * g; | |||
| @@ -180,17 +191,19 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| } | |||
| } | |||
| top_blob = top_blob_bordered; | |||
| if (pad_w > 0 || pad_h > 0) | |||
| { | |||
| copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w); | |||
| copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w, opt.blob_allocator, opt.num_threads); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| outw = top_blob.w; | |||
| outh = top_blob.h; | |||
| } | |||
| else | |||
| { | |||
| top_blob = top_blob_bordered; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -28,7 +28,7 @@ public: | |||
| virtual int load_model(const ModelBin& mb); | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| public: | |||
| // param | |||
| @@ -141,7 +141,7 @@ static void nms_sorted_bboxes(const std::vector<BBoxRect>& bboxes, std::vector<i | |||
| } | |||
| } | |||
| int DetectionOutput::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const | |||
| int DetectionOutput::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const | |||
| { | |||
| const Mat& location = bottom_blobs[0]; | |||
| const Mat& confidence = bottom_blobs[1]; | |||
| @@ -151,7 +151,7 @@ int DetectionOutput::forward(const std::vector<Mat>& bottom_blobs, std::vector<M | |||
| // apply location with priorbox | |||
| Mat bboxes; | |||
| bboxes.create(4, num_prior); | |||
| bboxes.create(4, num_prior, 4u, opt.workspace_allocator); | |||
| if (bboxes.empty()) | |||
| return -100; | |||
| @@ -159,7 +159,7 @@ int DetectionOutput::forward(const std::vector<Mat>& bottom_blobs, std::vector<M | |||
| const float* priorbox_ptr = priorbox.row(0); | |||
| const float* variance_ptr = priorbox.row(1); | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int i = 0; i < num_prior; i++) | |||
| { | |||
| const float* loc = location_ptr + i * 4; | |||
| @@ -192,7 +192,7 @@ int DetectionOutput::forward(const std::vector<Mat>& bottom_blobs, std::vector<M | |||
| all_class_bbox_scores.resize(num_class); | |||
| // start from 1 to ignore background class | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int i = 1; i < num_class; i++) | |||
| { | |||
| // filter by confidence_threshold | |||
| @@ -262,7 +262,7 @@ int DetectionOutput::forward(const std::vector<Mat>& bottom_blobs, std::vector<M | |||
| int num_detected = bbox_rects.size(); | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(6, num_detected); | |||
| top_blob.create(6, num_detected, 4u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -26,7 +26,7 @@ public: | |||
| virtual int load_param(const ParamDict& pd); | |||
| virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const; | |||
| virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; | |||
| public: | |||
| int num_class; | |||
| @@ -31,7 +31,7 @@ int Dropout::load_param(const ParamDict& pd) | |||
| return 0; | |||
| } | |||
| int Dropout::forward_inplace(Mat& bottom_top_blob) const | |||
| int Dropout::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| if (scale == 1.f) | |||
| { | |||
| @@ -43,7 +43,7 @@ int Dropout::forward_inplace(Mat& bottom_top_blob) const | |||
| int channels = bottom_top_blob.c; | |||
| int size = w * h; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| @@ -26,7 +26,7 @@ public: | |||
| virtual int load_param(const ParamDict& pd); | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; | |||
| public: | |||
| float scale; | |||
| @@ -31,16 +31,17 @@ int Eltwise::load_param(const ParamDict& pd) | |||
| return 0; | |||
| } | |||
| int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const | |||
| int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const | |||
| { | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int size = w * h; | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(w, h, channels); | |||
| top_blob.create(w, h, channels, elemsize, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -48,7 +49,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top | |||
| { | |||
| // first blob | |||
| const Mat& bottom_blob1 = bottom_blobs[1]; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob.channel(q); | |||
| @@ -64,7 +65,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top | |||
| for (size_t b=2; b<bottom_blobs.size(); b++) | |||
| { | |||
| const Mat& bottom_blob1 = bottom_blobs[b]; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob1.channel(q); | |||
| @@ -83,7 +84,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top | |||
| { | |||
| // first blob | |||
| const Mat& bottom_blob1 = bottom_blobs[1]; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob.channel(q); | |||
| @@ -99,7 +100,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top | |||
| for (size_t b=2; b<bottom_blobs.size(); b++) | |||
| { | |||
| const Mat& bottom_blob1 = bottom_blobs[b]; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob1.channel(q); | |||
| @@ -118,7 +119,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top | |||
| const Mat& bottom_blob1 = bottom_blobs[1]; | |||
| float coeff0 = coeffs[0]; | |||
| float coeff1 = coeffs[1]; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob.channel(q); | |||
| @@ -135,7 +136,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top | |||
| { | |||
| const Mat& bottom_blob1 = bottom_blobs[b]; | |||
| float coeff = coeffs[b]; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob1.channel(q); | |||
| @@ -153,7 +154,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top | |||
| { | |||
| // first blob | |||
| const Mat& bottom_blob1 = bottom_blobs[1]; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob.channel(q); | |||
| @@ -169,7 +170,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top | |||
| for (size_t b=2; b<bottom_blobs.size(); b++) | |||
| { | |||
| const Mat& bottom_blob1 = bottom_blobs[b]; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob1.channel(q); | |||
| @@ -26,7 +26,7 @@ public: | |||
| virtual int load_param(const ParamDict& pd); | |||
| virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const; | |||
| virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; | |||
| enum { Operation_PROD = 0, Operation_SUM = 1, Operation_MAX = 2 }; | |||
| @@ -32,14 +32,14 @@ int ELU::load_param(const ParamDict& pd) | |||
| return 0; | |||
| } | |||
| int ELU::forward_inplace(Mat& bottom_top_blob) const | |||
| int ELU::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| int channels = bottom_top_blob.c; | |||
| int size = w * h; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| @@ -26,7 +26,7 @@ public: | |||
| virtual int load_param(const ParamDict& pd); | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; | |||
| public: | |||
| float alpha; | |||
| @@ -51,16 +51,16 @@ int Embed::load_model(const ModelBin& mb) | |||
| return 0; | |||
| } | |||
| int Embed::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| int words = bottom_blob.total(); | |||
| top_blob.create(num_output, words); | |||
| top_blob.create(num_output, words, 4u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| // num_output | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<words; q++) | |||
| { | |||
| float* outptr = top_blob.row(q); | |||
| @@ -28,7 +28,7 @@ public: | |||
| virtual int load_model(const ModelBin& mb); | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| public: | |||
| // param | |||
| @@ -34,7 +34,7 @@ int Exp::load_param(const ParamDict& pd) | |||
| return 0; | |||
| } | |||
| int Exp::forward_inplace(Mat& bottom_top_blob) const | |||
| int Exp::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| @@ -43,7 +43,7 @@ int Exp::forward_inplace(Mat& bottom_top_blob) const | |||
| if (base == -1.f) | |||
| { | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| @@ -56,7 +56,7 @@ int Exp::forward_inplace(Mat& bottom_top_blob) const | |||
| } | |||
| else | |||
| { | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| @@ -26,7 +26,7 @@ public: | |||
| virtual int load_param(const ParamDict& pd); | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; | |||
| public: | |||
| float base; | |||
| @@ -33,7 +33,7 @@ int ExpandDims::load_param(const ParamDict& pd) | |||
| return 0; | |||
| } | |||
| int ExpandDims::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int ExpandDims::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| @@ -46,28 +46,28 @@ int ExpandDims::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| if (expand_w) | |||
| { | |||
| if (expand_h) | |||
| top_blob = bottom_blob.reshape(1, 1, w); | |||
| top_blob = bottom_blob.reshape(1, 1, w, opt.blob_allocator); | |||
| else if (expand_c) | |||
| top_blob = bottom_blob.reshape(1, w, 1); | |||
| top_blob = bottom_blob.reshape(1, w, 1, opt.blob_allocator); | |||
| else | |||
| top_blob = bottom_blob.reshape(1, w); | |||
| top_blob = bottom_blob.reshape(1, w, opt.blob_allocator); | |||
| } | |||
| else if (expand_h) | |||
| { | |||
| if (expand_c) | |||
| top_blob = bottom_blob.reshape(w, 1, 1); | |||
| top_blob = bottom_blob.reshape(w, 1, 1, opt.blob_allocator); | |||
| else | |||
| top_blob = bottom_blob.reshape(w, 1); | |||
| top_blob = bottom_blob.reshape(w, 1, opt.blob_allocator); | |||
| } | |||
| } | |||
| else if (dims == 2) | |||
| { | |||
| if (expand_w) | |||
| top_blob = bottom_blob.reshape(1, w, h); | |||
| top_blob = bottom_blob.reshape(1, w, h, opt.blob_allocator); | |||
| else if (expand_h) | |||
| top_blob = bottom_blob.reshape(w, 1, h); | |||
| top_blob = bottom_blob.reshape(w, 1, h, opt.blob_allocator); | |||
| else if (expand_c) | |||
| top_blob = bottom_blob.reshape(w, h, 1); | |||
| top_blob = bottom_blob.reshape(w, h, 1, opt.blob_allocator); | |||
| } | |||
| if (top_blob.empty()) | |||
| @@ -26,7 +26,7 @@ public: | |||
| virtual int load_param(const ParamDict& pd); | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| public: | |||
| int expand_w; | |||
| @@ -24,18 +24,19 @@ Flatten::Flatten() | |||
| support_inplace = false; | |||
| } | |||
| int Flatten::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int Flatten::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int size = w * h; | |||
| top_blob.create(size * channels); | |||
| top_blob.create(size * channels, elemsize, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const float* ptr = bottom_blob.channel(q); | |||
| @@ -24,7 +24,7 @@ class Flatten : public Layer | |||
| public: | |||
| Flatten(); | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -49,19 +49,20 @@ int InnerProduct::load_model(const ModelBin& mb) | |||
| return 0; | |||
| } | |||
| int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int size = w * h; | |||
| top_blob.create(num_output); | |||
| top_blob.create(num_output, elemsize, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| // num_output | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=0; p<num_output; p++) | |||
| { | |||
| float sum = 0.f; | |||
| @@ -28,7 +28,7 @@ public: | |||
| virtual int load_model(const ModelBin& mb); | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| public: | |||
| // param | |||
| @@ -33,7 +33,7 @@ int Input::load_param(const ParamDict& pd) | |||
| return 0; | |||
| } | |||
| int Input::forward_inplace(Mat& /*bottom_top_blob*/) const | |||
| int Input::forward_inplace(Mat& /*bottom_top_blob*/, const Option& /*opt*/) const | |||
| { | |||
| return 0; | |||
| } | |||
| @@ -26,7 +26,7 @@ public: | |||
| virtual int load_param(const ParamDict& pd); | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; | |||
| public: | |||
| int w; | |||
| @@ -46,7 +46,7 @@ int InstanceNorm::load_model(const ModelBin& mb) | |||
| return 0; | |||
| } | |||
| int InstanceNorm::forward_inplace(Mat& bottom_top_blob) const | |||
| int InstanceNorm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| // x = (x - mean) / (sqrt(var) + eps) * gamma + beta | |||
| @@ -54,7 +54,7 @@ int InstanceNorm::forward_inplace(Mat& bottom_top_blob) const | |||
| int h = bottom_top_blob.h; | |||
| int size = w * h; | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| @@ -28,7 +28,7 @@ public: | |||
| virtual int load_model(const ModelBin& mb); | |||
| virtual int forward_inplace(Mat& bottom_top_blob) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; | |||
| public: | |||
| // param | |||
| @@ -35,11 +35,13 @@ int Interp::load_param(const ParamDict& pd) | |||
| return 0; | |||
| } | |||
| int Interp::forward(const Mat &bottom_blob, Mat &top_blob) const | |||
| int Interp::forward(const Mat &bottom_blob, Mat &top_blob, const Option& opt) const | |||
| { | |||
| int h = bottom_blob.h; | |||
| int w = bottom_blob.w; | |||
| int c = bottom_blob.c; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int oh = output_height; | |||
| int ow = output_width; | |||
| if (bottom_blob.dims == 1) | |||
| @@ -58,13 +60,13 @@ int Interp::forward(const Mat &bottom_blob, Mat &top_blob) const | |||
| top_blob = bottom_blob; | |||
| return 0; | |||
| } | |||
| top_blob.create(ow, oh, c); | |||
| top_blob.create(ow, oh, c, elemsize, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| if (bottom_blob.dims == 1) | |||
| { | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < c; ++q) | |||
| { | |||
| Mat top_blob_c = top_blob.channel(q); | |||
| @@ -76,7 +78,7 @@ int Interp::forward(const Mat &bottom_blob, Mat &top_blob) const | |||
| if (resize_type == 1)//nearest | |||
| { | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < c; ++q) | |||
| { | |||
| const float *ptr = bottom_blob.channel(q); | |||
| @@ -26,7 +26,7 @@ public: | |||
| virtual int load_param(const ParamDict& pd); | |||
| virtual int forward(const Mat &bottom_blob, Mat &top_blob) const; | |||
| virtual int forward(const Mat &bottom_blob, Mat &top_blob, const Option& opt) const; | |||
| public: | |||
| // param | |||
| @@ -34,7 +34,7 @@ int Log::load_param(const ParamDict& pd) | |||
| return 0; | |||
| } | |||
| int Log::forward_inplace(Mat& bottom_top_blob) const | |||
| int Log::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| @@ -43,7 +43,7 @@ int Log::forward_inplace(Mat& bottom_top_blob) const | |||
| if (base == -1.f) | |||
| { | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| @@ -58,7 +58,7 @@ int Log::forward_inplace(Mat& bottom_top_blob) const | |||
| { | |||
| float log_base_inv = 1.f / log(base); | |||
| #pragma omp parallel for | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||