From 9706cd1447119cd3f66e2dddc8a5ffc5935f8661 Mon Sep 17 00:00:00 2001
From: nihui <shuizhuyuanluo@126.com>
Date: Sun, 15 Jul 2018 20:38:36 +0800
Subject: [PATCH] implement ncnn blob/workspace allocator, fine-grained
 per-layer openmp threads control, fix #469

---
 benchmark/benchncnn.cpp                      |  19 +-
 src/CMakeLists.txt                           |   1 +
 src/allocator.cpp                            | 237 +++++++++++++++++++
 src/allocator.h                              | 175 ++++++++++++++
 src/layer.cpp                                |  46 +++-
 src/layer.h                                  |  24 +-
 src/layer/absval.cpp                         |   4 +-
 src/layer/absval.h                           |   2 +-
 src/layer/argmax.cpp                         |   6 +-
 src/layer/argmax.h                           |   2 +-
 src/layer/arm/absval_arm.cpp                 |   4 +-
 src/layer/arm/absval_arm.h                   |   2 +-
 src/layer/arm/batchnorm_arm.cpp              |   6 +-
 src/layer/arm/batchnorm_arm.h                |   2 +-
 src/layer/arm/bias_arm.cpp                   |   4 +-
 src/layer/arm/bias_arm.h                     |   2 +-
 src/layer/arm/convolution_1x1.h              |  32 +--
 src/layer/arm/convolution_2x2.h              |   4 +-
 src/layer/arm/convolution_3x3.h              |  55 ++---
 src/layer/arm/convolution_4x4.h              |   4 +-
 src/layer/arm/convolution_5x5.h              |   8 +-
 src/layer/arm/convolution_7x7.h              |   8 +-
 src/layer/arm/convolution_arm.cpp            |  48 ++--
 src/layer/arm/convolution_arm.h              |   6 +-
 src/layer/arm/convolutiondepthwise_3x3.h     |   8 +-
 src/layer/arm/convolutiondepthwise_arm.cpp   |  19 +-
 src/layer/arm/convolutiondepthwise_arm.h     |   2 +-
 src/layer/arm/deconvolution_3x3.h            |   8 +-
 src/layer/arm/deconvolution_4x4.h            |   8 +-
 src/layer/arm/deconvolution_arm.cpp          |  39 ++-
 src/layer/arm/deconvolution_arm.h            |   2 +-
 src/layer/arm/deconvolutiondepthwise_arm.cpp |  35 ++-
 src/layer/arm/deconvolutiondepthwise_arm.h   |   2 +-
 src/layer/arm/eltwise_arm.cpp                |  21 +-
 src/layer/arm/eltwise_arm.h                  |   2 +-
 src/layer/arm/innerproduct_arm.cpp           |   9 +-
 src/layer/arm/innerproduct_arm.h             |   2 +-
 src/layer/arm/lrn_arm.cpp                    |  15 +-
 src/layer/arm/lrn_arm.h                      |   2 +-
 src/layer/arm/pooling_2x2.h                  |   4 +-
 src/layer/arm/pooling_3x3.h                  |   4 +-
 src/layer/arm/pooling_arm.cpp                |  21 +-
 src/layer/arm/pooling_arm.h                  |   2 +-
 src/layer/arm/prelu_arm.cpp                  |   6 +-
 src/layer/arm/prelu_arm.h                    |   2 +-
 src/layer/arm/relu_arm.cpp                   |   6 +-
 src/layer/arm/relu_arm.h                     |   2 +-
 src/layer/arm/scale_arm.cpp                  |   8 +-
 src/layer/arm/scale_arm.h                    |   2 +-
 src/layer/arm/sigmoid_arm.cpp                |   4 +-
 src/layer/arm/sigmoid_arm.h                  |   2 +-
 src/layer/arm/softmax_arm.cpp                |  13 +-
 src/layer/arm/softmax_arm.h                  |   2 +-
 src/layer/batchnorm.cpp                      |   8 +-
 src/layer/batchnorm.h                        |   2 +-
 src/layer/bias.cpp                           |   4 +-
 src/layer/bias.h                             |   2 +-
 src/layer/binaryop.cpp                       |  81 +++----
 src/layer/binaryop.h                         |   4 +-
 src/layer/bnll.cpp                           |   4 +-
 src/layer/bnll.h                             |   2 +-
 src/layer/clip.cpp                           |   4 +-
 src/layer/clip.h                             |   2 +-
 src/layer/concat.cpp                         |  20 +-
 src/layer/concat.h                           |   2 +-
 src/layer/convolution.cpp                    |  13 +-
 src/layer/convolution.h                      |   2 +-
 src/layer/convolutiondepthwise.cpp           |  15 +-
 src/layer/convolutiondepthwise.h             |   2 +-
 src/layer/crop.cpp                           |   8 +-
 src/layer/crop.h                             |   4 +-
 src/layer/deconvolution.cpp                  |  31 ++-
 src/layer/deconvolution.h                    |   2 +-
 src/layer/deconvolutiondepthwise.cpp         |  33 ++-
 src/layer/deconvolutiondepthwise.h           |   2 +-
 src/layer/detectionoutput.cpp                |  10 +-
 src/layer/detectionoutput.h                  |   2 +-
 src/layer/dropout.cpp                        |   4 +-
 src/layer/dropout.h                          |   2 +-
 src/layer/eltwise.cpp                        |  21 +-
 src/layer/eltwise.h                          |   2 +-
 src/layer/elu.cpp                            |   4 +-
 src/layer/elu.h                              |   2 +-
 src/layer/embed.cpp                          |   6 +-
 src/layer/embed.h                            |   2 +-
 src/layer/exp.cpp                            |   6 +-
 src/layer/exp.h                              |   2 +-
 src/layer/expanddims.cpp                     |  18 +-
 src/layer/expanddims.h                       |   2 +-
 src/layer/flatten.cpp                        |   7 +-
 src/layer/flatten.h                          |   2 +-
 src/layer/innerproduct.cpp                   |   7 +-
 src/layer/innerproduct.h                     |   2 +-
 src/layer/input.cpp                          |   2 +-
 src/layer/input.h                            |   2 +-
 src/layer/instancenorm.cpp                   |   4 +-
 src/layer/instancenorm.h                     |   2 +-
 src/layer/interp.cpp                         |  10 +-
 src/layer/interp.h                           |   2 +-
 src/layer/log.cpp                            |   6 +-
 src/layer/log.h                              |   2 +-
 src/layer/lrn.cpp                            |  15 +-
 src/layer/lrn.h                              |   2 +-
 src/layer/lstm.cpp                           |  53 ++---
 src/layer/lstm.h                             |   2 +-
 src/layer/memorydata.cpp                     |   4 +-
 src/layer/memorydata.h                       |   2 +-
 src/layer/mvn.cpp                            |  21 +-
 src/layer/mvn.h                              |   2 +-
 src/layer/normalize.cpp                      |  25 +-
 src/layer/normalize.h                        |   2 +-
 src/layer/padding.cpp                        |   4 +-
 src/layer/padding.h                          |   2 +-
 src/layer/permute.cpp                        |  23 +-
 src/layer/permute.h                          |   2 +-
 src/layer/pooling.cpp                        |  21 +-
 src/layer/pooling.h                          |   2 +-
 src/layer/power.cpp                          |   4 +-
 src/layer/power.h                            |   2 +-
 src/layer/prelu.cpp                          |  10 +-
 src/layer/prelu.h                            |   2 +-
 src/layer/priorbox.cpp                       |   6 +-
 src/layer/priorbox.h                         |   2 +-
 src/layer/proposal.cpp                       |   6 +-
 src/layer/proposal.h                         |   2 +-
 src/layer/reduction.cpp                      |  41 ++--
 src/layer/reduction.h                        |   2 +-
 src/layer/relu.cpp                           |   6 +-
 src/layer/relu.h                             |   2 +-
 src/layer/reorg.cpp                          |   7 +-
 src/layer/reorg.h                            |   2 +-
 src/layer/reshape.cpp                        |  11 +-
 src/layer/reshape.h                          |   2 +-
 src/layer/rnn.cpp                            |   7 +-
 src/layer/rnn.h                              |   2 +-
 src/layer/roipooling.cpp                     |   7 +-
 src/layer/roipooling.h                       |   2 +-
 src/layer/scale.cpp                          |  18 +-
 src/layer/scale.h                            |   4 +-
 src/layer/shufflechannel.cpp                 |   4 +-
 src/layer/shufflechannel.h                   |   2 +-
 src/layer/sigmoid.cpp                        |   4 +-
 src/layer/sigmoid.h                          |   2 +-
 src/layer/slice.cpp                          |  20 +-
 src/layer/slice.h                            |   2 +-
 src/layer/softmax.cpp                        |  43 ++--
 src/layer/softmax.h                          |   2 +-
 src/layer/split.cpp                          |   2 +-
 src/layer/split.h                            |   2 +-
 src/layer/spp.cpp                            |  12 +-
 src/layer/spp.h                              |   2 +-
 src/layer/squeeze.cpp                        |  14 +-
 src/layer/squeeze.h                          |   2 +-
 src/layer/tanh.cpp                           |   4 +-
 src/layer/tanh.h                             |   2 +-
 src/layer/threshold.cpp                      |   4 +-
 src/layer/threshold.h                        |   2 +-
 src/layer/tile.cpp                           |  15 +-
 src/layer/tile.h                             |   2 +-
 src/layer/unaryop.cpp                        |  38 +--
 src/layer/unaryop.h                          |   2 +-
 src/layer/x86/convolution_1x1.h              |   8 +-
 src/layer/x86/convolution_3x3.h              |   4 +-
 src/layer/x86/convolution_5x5.h              |   4 +-
 src/layer/x86/convolution_x86.cpp            |  42 ++--
 src/layer/x86/convolution_x86.h              |   6 +-
 src/layer/x86/convolutiondepthwise_3x3.h     |   8 +-
 src/layer/x86/convolutiondepthwise_x86.cpp   |  19 +-
 src/layer/x86/convolutiondepthwise_x86.h     |   2 +-
 src/layer/yolodetectionoutput.cpp            |   8 +-
 src/layer/yolodetectionoutput.h              |   2 +-
 src/mat.cpp                                  |  28 ++-
 src/mat.h                                    | 189 ++++++---------
 src/mat_pixel.cpp                            |  70 +++---
 src/net.cpp                                  |  95 +++-----
 src/net.h                                    |  11 +-
 176 files changed, 1414 insertions(+), 924 deletions(-)
 create mode 100644 src/allocator.cpp
 create mode 100644 src/allocator.h

diff --git a/benchmark/benchncnn.cpp b/benchmark/benchncnn.cpp
index 8e6fdd28d..42e1c9b34 100644
--- a/benchmark/benchncnn.cpp
+++ b/benchmark/benchncnn.cpp
@@ -52,6 +52,9 @@ public:
 
 static int g_loop_count = 4;
 
+static ncnn::UnlockedPoolAllocator g_blob_pool_allocator;
+static ncnn::PoolAllocator g_workspace_pool_allocator;
+
 void benchmark(const char* comment, void (*init)(ncnn::Net&), void (*run)(const ncnn::Net&))
 {
     ncnn::BenchNet net;
@@ -60,6 +63,9 @@ void benchmark(const char* comment, void (*init)(ncnn::Net&), void (*run)(const
 
     net.load_model();
 
+    g_blob_pool_allocator.clear();
+    g_workspace_pool_allocator.clear();
+
     // sleep 10 seconds for cooling down SOC  :(
 #ifdef _WIN32
     Sleep(10 * 1000);
@@ -265,8 +271,6 @@ void mobilenet_yolo_run(const ncnn::Net& net)
 {
     ncnn::Extractor ex = net.create_extractor();
 
-    // NOTE original model input is 416x416x3
-    // you may change to 300x300x3 for comparison with ssd
     ncnn::Mat in(416, 416, 3);
     ex.input("data", in);
 
@@ -295,6 +299,17 @@ int main(int argc, char** argv)
 
     g_loop_count = loop_count;
 
+    g_blob_pool_allocator.set_size_compare_ratio(0.0f);
+    g_workspace_pool_allocator.set_size_compare_ratio(0.5f);
+
+    ncnn::Option opt;
+    opt.lightmode = true;
+    opt.num_threads = num_threads;
+    opt.blob_allocator = &g_blob_pool_allocator;
+    opt.workspace_allocator = &g_workspace_pool_allocator;
+
+    ncnn::set_default_option(opt);
+
     ncnn::set_cpu_powersave(powersave);
 
     ncnn::set_omp_dynamic(0);
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index e72b16606..fa7ec646e 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -8,6 +8,7 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR})
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/layer)
 
 set(ncnn_SRCS
+    allocator.cpp
     blob.cpp
     cpu.cpp
     layer.cpp
diff --git a/src/allocator.cpp b/src/allocator.cpp
new file mode 100644
index 000000000..2f866ee0a
--- /dev/null
+++ b/src/allocator.cpp
@@ -0,0 +1,237 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "allocator.h"
+
+#include <stdio.h>
+
+namespace ncnn {
+
+PoolAllocator::PoolAllocator()
+    : size_compare_ratio(192) // fixed-point 0.75f * 256, the default reuse ratio
+{
+}
+
+PoolAllocator::~PoolAllocator()
+{
+    clear(); // release every idle buffer first
+
+    // whatever is still listed in payouts was never handed back by its user
+    if (payouts.empty())
+        return;
+
+    fprintf(stderr, "FATAL ERROR! pool allocator destroyed too early\n");
+    std::list< std::pair<size_t, void*> >::iterator leaked = payouts.begin();
+    for (; leaked != payouts.end(); ++leaked)
+    {
+        fprintf(stderr, "%p still in use\n", leaked->second);
+    }
+}
+
+void PoolAllocator::clear()
+{
+    // budgets is also touched by fastMalloc/fastFree, so drain it under its lock
+    budgets_lock.lock();
+
+    // free idle buffers one by one, front to back, until the list is empty
+    while (!budgets.empty())
+    {
+        ncnn::fastFree(budgets.front().second);
+        budgets.pop_front();
+    }
+
+    budgets_lock.unlock();
+}
+
+void PoolAllocator::set_size_compare_ratio(float scr)
+{
+    if (!(scr >= 0.f && scr <= 1.f)) // positive form: out-of-range values and NaN are both rejected
+    {
+        fprintf(stderr, "invalid size compare ratio %f\n", scr);
+        return;
+    }
+
+    size_compare_ratio = (unsigned int)(scr * 256); // kept as 0~256 fixed point
+}
+
+void* PoolAllocator::fastMalloc(size_t size)
+{
+    budgets_lock.lock();
+
+    // look for an idle buffer that is large enough but not wastefully large
+    std::list< std::pair<size_t, void*> >::iterator it = budgets.begin();
+    for (; it != budgets.end(); it++)
+    {
+        size_t bs = it->first; // capacity recorded for this idle buffer
+
+        // accept when bs*ratio/256 <= size <= bs; 64-bit product so a 32-bit size_t cannot wrap
+        if (bs >= size && (((unsigned long long)bs * size_compare_ratio) >> 8) <= size)
+        {
+            void* ptr = it->second;
+
+            budgets.erase(it);
+
+            budgets_lock.unlock();
+
+            payouts_lock.lock();
+
+            payouts.push_back(std::make_pair(bs, ptr)); // keep the real capacity, not the requested size
+
+            payouts_lock.unlock();
+
+            return ptr;
+        }
+    }
+
+    budgets_lock.unlock();
+
+    // nothing reusable: allocate a fresh buffer and record it as handed out
+    void* ptr = ncnn::fastMalloc(size);
+
+    payouts_lock.lock();
+
+    payouts.push_back(std::make_pair(size, ptr));
+
+    payouts_lock.unlock();
+
+    return ptr;
+}
+
+void PoolAllocator::fastFree(void* ptr)
+{
+    payouts_lock.lock();
+
+    // find the payout record for ptr and move it back to budgets
+    std::list< std::pair<size_t, void*> >::iterator it = payouts.begin();
+    for (; it != payouts.end(); it++)
+    {
+        if (it->second == ptr)
+        {
+            size_t size = it->first; // capacity recorded when the buffer was handed out
+
+            payouts.erase(it);
+
+            payouts_lock.unlock(); // released before budgets_lock is taken; the two locks are never held together
+
+            budgets_lock.lock();
+
+            budgets.push_back(std::make_pair(size, ptr));
+
+            budgets_lock.unlock();
+
+            return;
+        }
+    }
+
+    payouts_lock.unlock();
+
+    fprintf(stderr, "FATAL ERROR! pool allocator get wild %p\n", ptr); // ptr has no payout record
+    ncnn::fastFree(ptr); // freed directly instead of being pooled
+}
+
+UnlockedPoolAllocator::UnlockedPoolAllocator()
+    : size_compare_ratio(192) // fixed-point 0.75f * 256, the default reuse ratio
+{
+}
+
+UnlockedPoolAllocator::~UnlockedPoolAllocator()
+{
+    clear(); // release every idle buffer first
+
+    // whatever is still listed in payouts was never handed back by its user
+    if (payouts.empty())
+        return;
+
+    fprintf(stderr, "FATAL ERROR! unlocked pool allocator destroyed too early\n");
+    std::list< std::pair<size_t, void*> >::iterator leaked = payouts.begin();
+    for (; leaked != payouts.end(); ++leaked)
+    {
+        fprintf(stderr, "%p still in use\n", leaked->second);
+    }
+}
+
+void UnlockedPoolAllocator::clear()
+{
+    // free idle buffers one by one, front to back, until the list is empty
+    // (unlike PoolAllocator::clear, no lock is taken here)
+    while (!budgets.empty())
+    {
+        ncnn::fastFree(budgets.front().second);
+        budgets.pop_front();
+    }
+}
+
+void UnlockedPoolAllocator::set_size_compare_ratio(float scr)
+{
+    if (!(scr >= 0.f && scr <= 1.f)) // positive form: out-of-range values and NaN are both rejected
+    {
+        fprintf(stderr, "invalid size compare ratio %f\n", scr);
+        return;
+    }
+
+    size_compare_ratio = (unsigned int)(scr * 256); // kept as 0~256 fixed point
+}
+
+void* UnlockedPoolAllocator::fastMalloc(size_t size)
+{
+    // look for an idle buffer that is large enough but not wastefully large
+    std::list< std::pair<size_t, void*> >::iterator it = budgets.begin();
+    for (; it != budgets.end(); it++)
+    {
+        size_t bs = it->first; // capacity recorded for this idle buffer
+
+        // accept when bs*ratio/256 <= size <= bs; 64-bit product so a 32-bit size_t cannot wrap
+        if (bs >= size && (((unsigned long long)bs * size_compare_ratio) >> 8) <= size)
+        {
+            void* ptr = it->second;
+
+            budgets.erase(it);
+
+            payouts.push_back(std::make_pair(bs, ptr)); // keep the real capacity, not the requested size
+
+            return ptr;
+        }
+    }
+
+    // nothing reusable: allocate a fresh buffer and record it as handed out
+    void* ptr = ncnn::fastMalloc(size);
+
+    payouts.push_back(std::make_pair(size, ptr));
+
+    return ptr;
+}
+
+void UnlockedPoolAllocator::fastFree(void* ptr)
+{
+    // locate the payout record that owns ptr
+    std::list< std::pair<size_t, void*> >::iterator owner = payouts.begin();
+    while (owner != payouts.end() && owner->second != ptr)
+        ++owner;
+
+    if (owner == payouts.end())
+    {
+        // no record for ptr: report it, then free it directly
+        fprintf(stderr, "FATAL ERROR! unlocked pool allocator get wild %p\n", ptr);
+        ncnn::fastFree(ptr);
+        return;
+    }
+
+    // drop the payout record, then park the buffer in budgets
+    // together with its recorded capacity so it can be reused
+    size_t capacity = owner->first;
+    payouts.erase(owner);
+    budgets.push_back(std::make_pair(capacity, ptr));
+}
+
+} // namespace ncnn
diff --git a/src/allocator.h b/src/allocator.h
new file mode 100644
index 000000000..061250ab0
--- /dev/null
+++ b/src/allocator.h
@@ -0,0 +1,175 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_ALLOCATOR_H
+#define NCNN_ALLOCATOR_H
+
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#else
+#include <pthread.h>
+#endif
+
+#include <stdlib.h>
+#include <list>
+
+namespace ncnn {
+
+// the alignment of all the allocated buffers
+#define MALLOC_ALIGN    16
+
+// Rounds a pointer up to the next multiple of n bytes and returns the result
+// ptr Pointer to align (comes back unchanged when it is already a multiple of n)
+// n Alignment size that must be a power of two; defaults to sizeof(_Tp)
+template<typename _Tp> static inline _Tp* alignPtr(_Tp* ptr, int n=(int)sizeof(_Tp))
+{
+    return (_Tp*)(((size_t)ptr + n-1) & -n);
+}
+
+// Aligns a buffer size to the specified number of bytes
+// The function returns the minimum number that is greater or equal to sz and is divisible by n
+// sz Buffer size to align
+// n Alignment size that must be a power of two
+static inline size_t alignSize(size_t sz, int n)
+{
+    return (sz + n-1) & -n; // for a power-of-two n, -n is a mask that clears the low bits
+}
+
+static inline void* fastMalloc(size_t size) // returns a MALLOC_ALIGN-aligned block, or 0 when malloc fails
+{
+    unsigned char* udata = (unsigned char*)malloc(size + sizeof(void*) + MALLOC_ALIGN); // extra room for one pointer slot plus alignment slack
+    if (!udata)
+        return 0;
+    unsigned char** adata = alignPtr((unsigned char**)udata + 1, MALLOC_ALIGN); // skip one pointer slot, then round up
+    adata[-1] = udata; // stash the raw malloc() pointer where fastFree reads it back
+    return adata;
+}
+
+static inline void fastFree(void* ptr)
+{
+    if (!ptr)
+        return;
+
+    // the slot just before ptr is read back as the pointer to hand to free()
+    free(((unsigned char**)ptr)[-1]);
+}
+
+// exchange-add operation for atomic operations on reference counters
+#if defined __INTEL_COMPILER && !(defined WIN32 || defined _WIN32)
+// atomic increment on the linux version of the Intel(tm) compiler
+#  define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd(const_cast<void*>(reinterpret_cast<volatile void*>(addr)), delta)
+#elif defined __GNUC__
+#  if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && !defined __EMSCRIPTEN__ && !defined(__CUDACC__)
+#    ifdef __ATOMIC_ACQ_REL
+#      define NCNN_XADD(addr, delta) __c11_atomic_fetch_add((_Atomic(int)*)(addr), delta, __ATOMIC_ACQ_REL)
+#    else
+#      define NCNN_XADD(addr, delta) __atomic_fetch_add((_Atomic(int)*)(addr), delta, 4)
+#    endif
+#  else
+#    if defined __ATOMIC_ACQ_REL && !defined __clang__
+// version for gcc >= 4.7
+#      define NCNN_XADD(addr, delta) (int)__atomic_fetch_add((unsigned*)(addr), (unsigned)(delta), __ATOMIC_ACQ_REL)
+#    else
+#      define NCNN_XADD(addr, delta) (int)__sync_fetch_and_add((unsigned*)(addr), (unsigned)(delta))
+#    endif
+#  endif
+#elif defined _MSC_VER && !defined RC_INVOKED
+#  include <intrin.h>
+#  define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)addr, delta)
+#else
+static inline int NCNN_XADD(int* addr, int delta) { int tmp = *addr; *addr += delta; return tmp; } // plain non-atomic fallback; yields the value held before the add
+#endif
+
+#ifdef _WIN32
+class Mutex // wrapper over a Windows SRWLOCK that is always taken in exclusive mode
+{
+public:
+    Mutex() { InitializeSRWLock(&srwlock); }
+    ~Mutex() { } // nothing is released here
+    void lock() { AcquireSRWLockExclusive(&srwlock); }
+    void unlock() { ReleaseSRWLockExclusive(&srwlock); }
+private:
+    // NOTE SRWLock is available from windows vista; named srwlock so it does not clash with lock()
+    SRWLOCK srwlock;
+};
+#else // _WIN32
+class Mutex // wrapper over pthread_mutex_t, used when _WIN32 is not defined
+{
+public:
+    Mutex() { pthread_mutex_init(&mutex, 0); } // 0 selects the default mutex attributes
+    ~Mutex() { pthread_mutex_destroy(&mutex); }
+    void lock() { pthread_mutex_lock(&mutex); } // the pthread return code is not checked
+    void unlock() { pthread_mutex_unlock(&mutex); } // the pthread return code is not checked
+private:
+    pthread_mutex_t mutex;
+};
+#endif // _WIN32
+
+class Allocator // abstract interface; PoolAllocator and UnlockedPoolAllocator implement it
+{
+public: // NOTE(review): no virtual destructor is declared, so avoid deleting an implementation through an Allocator*
+    virtual void* fastMalloc(size_t size) = 0; // obtain a buffer for a request of size bytes
+    virtual void fastFree(void* ptr) = 0; // give back a buffer obtained from fastMalloc
+};
+
+class PoolAllocator : public Allocator // buffer-recycling allocator whose two lists are each guarded by a mutex
+{
+public:
+    PoolAllocator();
+    ~PoolAllocator(); // frees idle buffers and reports any that are still handed out
+
+    // scr must lie in 0 ~ 1, it is stored as scr * 256
+    // default ratio = 0.75; an idle buffer is reused when capacity * ratio <= size <= capacity
+    void set_size_compare_ratio(float scr);
+
+    // release all budgets immediately
+    void clear();
+
+    virtual void* fastMalloc(size_t size); // reuse a fitting idle buffer or allocate a new one
+    virtual void fastFree(void* ptr); // move ptr from payouts back to budgets
+
+private:
+    Mutex budgets_lock; // guards budgets
+    Mutex payouts_lock; // guards payouts
+    unsigned int size_compare_ratio;// 0~256
+    std::list< std::pair<size_t, void*> > budgets; // idle buffers as (capacity, pointer)
+    std::list< std::pair<size_t, void*> > payouts; // handed-out buffers as (capacity, pointer)
+};
+
+class UnlockedPoolAllocator : public Allocator // same recycling scheme as PoolAllocator, but with no mutexes
+{
+public:
+    UnlockedPoolAllocator();
+    ~UnlockedPoolAllocator(); // frees idle buffers and reports any that are still handed out
+
+    // scr must lie in 0 ~ 1, it is stored as scr * 256
+    // default ratio = 0.75; an idle buffer is reused when capacity * ratio <= size <= capacity
+    void set_size_compare_ratio(float scr);
+
+    // release all budgets immediately
+    void clear();
+
+    virtual void* fastMalloc(size_t size); // reuse a fitting idle buffer or allocate a new one
+    virtual void fastFree(void* ptr); // move ptr from payouts back to budgets
+
+private:
+    unsigned int size_compare_ratio;// 0~256
+    std::list< std::pair<size_t, void*> > budgets; // idle buffers as (capacity, pointer)
+    std::list< std::pair<size_t, void*> > payouts; // handed-out buffers as (capacity, pointer)
+};
+
+} // namespace ncnn
+
+#endif // NCNN_ALLOCATOR_H
diff --git a/src/layer.cpp b/src/layer.cpp
index 01e5e638c..522e33e6c 100644
--- a/src/layer.cpp
+++ b/src/layer.cpp
@@ -14,10 +14,40 @@
 
 #include "layer.h"
 
+#include <stdio.h>
 #include <string.h>
+#include "cpu.h"
 
 namespace ncnn {
 
+Option::Option()
+    : lightmode(true)
+    , num_threads(get_cpu_count()) // one thread per CPU reported by cpu.h
+    , blob_allocator(0) // no custom blob allocator by default
+    , workspace_allocator(0) // no custom workspace allocator by default
+{
+}
+
+static Option g_default_option; // file-local default, constructed by Option::Option()
+
+const Option& get_default_option() // read-only view of the current default; layer.h uses it as a default argument
+{
+    return g_default_option;
+}
+
+int set_default_option(const Option& opt)
+{
+    // the new default is accepted only when it asks for at least one thread
+    if (opt.num_threads > 0)
+    {
+        g_default_option = opt;
+        return 0;
+    }
+
+    fprintf(stderr, "invalid option num_threads %d\n", opt.num_threads);
+    return -1;
+}
+
 Layer::Layer()
 {
     one_blob_only = false;
@@ -38,7 +68,7 @@ int Layer::load_model(const ModelBin& /*mb*/)
     return 0;
 }
 
-int Layer::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
+int Layer::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
 {
     if (!support_inplace)
         return -1;
@@ -46,32 +76,32 @@ int Layer::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_b
     top_blobs = bottom_blobs;
     for (int i = 0; i < (int)top_blobs.size(); i++)
     {
-        top_blobs[i] = bottom_blobs[i].clone();
+        top_blobs[i] = bottom_blobs[i].clone(opt.blob_allocator);
         if (top_blobs[i].empty())
             return -100;
     }
 
-    return forward_inplace(top_blobs);
+    return forward_inplace(top_blobs, opt);
 }
 
-int Layer::forward(const Mat& bottom_blob, Mat& top_blob) const
+int Layer::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     if (!support_inplace)
         return -1;
 
-    top_blob = bottom_blob.clone();
+    top_blob = bottom_blob.clone(opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
-    return forward_inplace(top_blob);
+    return forward_inplace(top_blob, opt);
 }
 
-int Layer::forward_inplace(std::vector<Mat>& /*bottom_top_blobs*/) const
+int Layer::forward_inplace(std::vector<Mat>& /*bottom_top_blobs*/, const Option& /*opt*/) const
 {
     return -1;
 }
 
-int Layer::forward_inplace(Mat& /*bottom_top_blob*/) const
+int Layer::forward_inplace(Mat& /*bottom_top_blob*/, const Option& /*opt*/) const
 {
     return -1;
 }
diff --git a/src/layer.h b/src/layer.h
index 1eeae3b97..b46bf177d 100644
--- a/src/layer.h
+++ b/src/layer.h
@@ -25,6 +25,22 @@
 
 namespace ncnn {
 
+class Allocator;
+class Option // settings passed to Layer::forward / forward_inplace as the opt argument
+{
+public:
+    Option(); // lightmode on, num_threads = get_cpu_count(), both allocators 0
+
+public:
+    bool lightmode; // NOTE(review): not read by any layer code visible in this part of the patch
+    int num_threads; // layers pass it to the OpenMP num_threads(...) clause
+    Allocator* blob_allocator; // layers pass it to Mat::create / Mat::clone for output blobs
+    Allocator* workspace_allocator; // layers pass it to Mat constructors for scratch buffers
+};
+
+const Option& get_default_option();
+int set_default_option(const Option& opt);
+
 class Layer
 {
 public:
@@ -51,13 +67,13 @@ public:
 public:
     // implement inference
     // return 0 if success
-    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt = get_default_option()) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt = get_default_option()) const;
 
     // implement inplace inference
     // return 0 if success
-    virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs) const;
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt = get_default_option()) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt = get_default_option()) const;
 
 public:
 #if NCNN_STRING
diff --git a/src/layer/absval.cpp b/src/layer/absval.cpp
index 5b066ab88..73a8f04c7 100644
--- a/src/layer/absval.cpp
+++ b/src/layer/absval.cpp
@@ -24,14 +24,14 @@ AbsVal::AbsVal()
     support_inplace = true;
 }
 
-int AbsVal::forward_inplace(Mat& bottom_top_blob) const
+int AbsVal::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
     int channels = bottom_top_blob.c;
     int size = w * h;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<channels; q++)
     {
         float* ptr = bottom_top_blob.channel(q);
diff --git a/src/layer/absval.h b/src/layer/absval.h
index 5c0dad200..30c13999e 100644
--- a/src/layer/absval.h
+++ b/src/layer/absval.h
@@ -24,7 +24,7 @@ class AbsVal : public Layer
 public:
     AbsVal();
 
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
 public:
 };
diff --git a/src/layer/argmax.cpp b/src/layer/argmax.cpp
index 3ede2a0db..4622da926 100644
--- a/src/layer/argmax.cpp
+++ b/src/layer/argmax.cpp
@@ -33,14 +33,14 @@ int ArgMax::load_param(const ParamDict& pd)
     return 0;
 }
 
-int ArgMax::forward(const Mat& bottom_blob, Mat& top_blob) const
+int ArgMax::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     int size = bottom_blob.total();
 
     if (out_max_val)
-        top_blob.create(topk, 2);
+        top_blob.create(topk, 2, 4u, opt.blob_allocator);
     else
-        top_blob.create(topk, 1);
+        top_blob.create(topk, 1, 4u, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
diff --git a/src/layer/argmax.h b/src/layer/argmax.h
index bd965eae8..05d5ca401 100644
--- a/src/layer/argmax.h
+++ b/src/layer/argmax.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
 public:
     int out_max_val;
diff --git a/src/layer/arm/absval_arm.cpp b/src/layer/arm/absval_arm.cpp
index da5142ad6..77b5a0406 100644
--- a/src/layer/arm/absval_arm.cpp
+++ b/src/layer/arm/absval_arm.cpp
@@ -22,14 +22,14 @@ namespace ncnn {
 
 DEFINE_LAYER_CREATOR(AbsVal_arm)
 
-int AbsVal_arm::forward_inplace(Mat& bottom_top_blob) const
+int AbsVal_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
     int channels = bottom_top_blob.c;
     int size = w * h;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<channels; q++)
     {
         float* ptr = bottom_top_blob.channel(q);
diff --git a/src/layer/arm/absval_arm.h b/src/layer/arm/absval_arm.h
index b9d7a460f..aa1952584 100644
--- a/src/layer/arm/absval_arm.h
+++ b/src/layer/arm/absval_arm.h
@@ -22,7 +22,7 @@ namespace ncnn {
 class AbsVal_arm : public AbsVal
 {
 public:
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 };
 
 } // namespace ncnn
diff --git a/src/layer/arm/batchnorm_arm.cpp b/src/layer/arm/batchnorm_arm.cpp
index 0bbac26f3..6957c3dbb 100644
--- a/src/layer/arm/batchnorm_arm.cpp
+++ b/src/layer/arm/batchnorm_arm.cpp
@@ -22,11 +22,11 @@ namespace ncnn {
 
 DEFINE_LAYER_CREATOR(BatchNorm_arm)
 
-int BatchNorm_arm::forward_inplace(Mat& bottom_top_blob) const
+int BatchNorm_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     int dims = bottom_top_blob.dims;
     if (dims != 3)
-        return BatchNorm::forward_inplace(bottom_top_blob);
+        return BatchNorm::forward_inplace(bottom_top_blob, opt);
 
     // a = bias - slope * mean / sqrt(var)
     // b = slope / sqrt(var)
@@ -38,7 +38,7 @@ int BatchNorm_arm::forward_inplace(Mat& bottom_top_blob) const
 
     const float* a_data_ptr = a_data;
     const float* b_data_ptr = b_data;
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<channels; q++)
     {
         float* ptr = bottom_top_blob.channel(q);
diff --git a/src/layer/arm/batchnorm_arm.h b/src/layer/arm/batchnorm_arm.h
index 5e99ac978..2b460817d 100644
--- a/src/layer/arm/batchnorm_arm.h
+++ b/src/layer/arm/batchnorm_arm.h
@@ -22,7 +22,7 @@ namespace ncnn {
 class BatchNorm_arm : public BatchNorm
 {
 public:
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 };
 
 } // namespace ncnn
diff --git a/src/layer/arm/bias_arm.cpp b/src/layer/arm/bias_arm.cpp
index aec915488..ed11501be 100644
--- a/src/layer/arm/bias_arm.cpp
+++ b/src/layer/arm/bias_arm.cpp
@@ -22,7 +22,7 @@ namespace ncnn {
 
 DEFINE_LAYER_CREATOR(Bias_arm)
 
-int Bias_arm::forward_inplace(Mat& bottom_top_blob) const
+int Bias_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
@@ -30,7 +30,7 @@ int Bias_arm::forward_inplace(Mat& bottom_top_blob) const
     int size = w * h;
 
     const float* bias_ptr = bias_data;
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<channels; q++)
     {
         float* ptr = bottom_top_blob.channel(q);
diff --git a/src/layer/arm/bias_arm.h b/src/layer/arm/bias_arm.h
index ccf6f4de7..5f08facf1 100644
--- a/src/layer/arm/bias_arm.h
+++ b/src/layer/arm/bias_arm.h
@@ -22,7 +22,7 @@ namespace ncnn {
 class Bias_arm : public Bias
 {
 public:
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; // opt.num_threads sizes the OpenMP team for the per-channel loop
 };
 
 } // namespace ncnn
diff --git a/src/layer/arm/convolution_1x1.h b/src/layer/arm/convolution_1x1.h
index 8cb8d387f..c3fe9fd39 100644
--- a/src/layer/arm/convolution_1x1.h
+++ b/src/layer/arm/convolution_1x1.h
@@ -113,7 +113,7 @@ static void conv1x1s1_sgemm_transform_kernel_neon(const Mat& _kernel, Mat& kerne
     }
 }
 
-static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias)
+static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
 {
     int w = bottom_blob.w;
     int h = bottom_blob.h;
@@ -128,12 +128,12 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma
     const float* bias = _bias;
 
     // interleave
-    Mat tmp(8*4, inch/4+inch%4, size/8 + (size%8)/4 + size%4);
+    Mat tmp(8*4, inch/4+inch%4, size/8 + (size%8)/4 + size%4, 4u, opt.workspace_allocator);
     {
         int nn_size = size >> 3;
         int remain_size_start = nn_size << 3;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int ii=0; ii<nn_size; ii++)
         {
             int i = ii * 8;
@@ -184,7 +184,7 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma
 
         nn_size = (size - remain_size_start) >> 2;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int ii=0; ii<nn_size; ii++)
         {
             int i = remain_size_start + ii * 4;
@@ -230,7 +230,7 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma
 
         remain_size_start += nn_size << 2;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int i=remain_size_start; i<size; i++)
         {
             const float* img0 = bottom_blob.channel(0);
@@ -254,7 +254,7 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma
     nn_outch = outch >> 3;
     remain_outch_start = nn_outch << 3;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int pp=0; pp<nn_outch; pp++)
     {
         int p = pp * 8;
@@ -733,7 +733,7 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma
 
     nn_outch = (outch - remain_outch_start) >> 2;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int pp=0; pp<nn_outch; pp++)
     {
         int p = remain_outch_start + pp * 4;
@@ -1613,7 +1613,7 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma
 
     remain_outch_start += nn_outch << 2;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p=remain_outch_start; p<outch; p++)
     {
         Mat out0 = top_blob.channel(p);
@@ -2064,7 +2064,7 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma
 //     }
 }
 
-static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
     int inch = bottom_blob.c;
 
@@ -2083,7 +2083,7 @@ static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
     nn_outch = outch >> 3;
     remain_outch_start = nn_outch << 3;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int pp=0; pp<nn_outch; pp++)
     {
         int p = pp * 8;
@@ -2710,7 +2710,7 @@ static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
     nn_outch = outch / 6;
     remain_outch_start = nn_outch * 6;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int pp=0; pp<nn_outch; pp++)
     {
         int p = pp * 6;
@@ -3101,7 +3101,7 @@ static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
 
     nn_outch = (outch - remain_outch_start) >> 2;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int pp=0; pp<nn_outch; pp++)
     {
         int p = remain_outch_start + pp * 4;
@@ -3605,7 +3605,7 @@ static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
 
     remain_outch_start += nn_outch << 2;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p=remain_outch_start; p<outch; p++)
     {
         Mat out = top_blob.channel(p);
@@ -3863,7 +3863,7 @@ static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
 
 }
 
-static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
     int w = bottom_blob.w;
     int inch = bottom_blob.c;
@@ -3880,7 +3880,7 @@ static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
     int nn_outch = outch >> 2;
     int remain_outch_start = nn_outch << 2;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int pp=0; pp<nn_outch; pp++)
     {
         int p = pp * 4;
@@ -4409,7 +4409,7 @@ static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
         }
     }
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p=remain_outch_start; p<outch; p++)
     {
         Mat out = top_blob.channel(p);
diff --git a/src/layer/arm/convolution_2x2.h b/src/layer/arm/convolution_2x2.h
index 24cc7c4d9..1eb679153 100644
--- a/src/layer/arm/convolution_2x2.h
+++ b/src/layer/arm/convolution_2x2.h
@@ -16,7 +16,7 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON
 
-static void conv2x2s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+static void conv2x2s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
     int w = bottom_blob.w;
     int inch = bottom_blob.c;
@@ -28,7 +28,7 @@ static void conv2x2s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
     const float* kernel = _kernel;
     const float* bias = _bias;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p=0; p<outch; p++)
     {
         Mat out = top_blob.channel(p);
diff --git a/src/layer/arm/convolution_3x3.h b/src/layer/arm/convolution_3x3.h
index 3c500db35..a0582dc6c 100644
--- a/src/layer/arm/convolution_3x3.h
+++ b/src/layer/arm/convolution_3x3.h
@@ -16,7 +16,7 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON
 
-static void conv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+static void conv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
     int w = bottom_blob.w;
     int inch = bottom_blob.c;
@@ -31,7 +31,7 @@ static void conv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
     int nn_outch = outch >> 1;
     int remain_outch_start = nn_outch << 1;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int pp=0; pp<nn_outch; pp++)
     {
         int p = pp * 2;
@@ -654,7 +654,7 @@ static void conv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
         }
     }
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p=remain_outch_start; p<outch; p++)
     {
         Mat out = top_blob.channel(p);
@@ -5427,7 +5427,7 @@ static void conv3x3s1_winograd64_neon3(const Mat& bottom_blob, Mat& top_blob, co
 }
 #endif
 
-static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias)
+static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias, const Option& opt)
 {
     int w = bottom_blob.w;
     int h = bottom_blob.h;
@@ -5445,7 +5445,7 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co
 
     w = outw + 2;
     h = outh + 2;
-    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f);
+    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt.workspace_allocator, opt.num_threads);
 
     const float* bias = _bias;
 
@@ -5454,7 +5454,7 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co
     {
         int w_tm = outw / 6 * 8;
         int h_tm = outh / 6 * 8;
-        bottom_blob_tm.create(4, 16 * w_tm/8 * h_tm/8, inch);
+        bottom_blob_tm.create(4, 16 * w_tm/8 * h_tm/8, inch, 4u, opt.workspace_allocator);
         const int tiles = w_tm/8 * h_tm/8;
 
 //         const float itm[8][8] = {
@@ -5495,7 +5495,7 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co
         float32x4_t _coeff1 = vld1q_f32(coeff+4);
 #endif // __ARM_NEON
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q<inch; q++)
         {
             const Mat img0 = bottom_blob_bordered.channel(q);
@@ -6263,14 +6263,14 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co
     {
         int w_tm = outw / 6 * 8;
         int h_tm = outh / 6 * 8;
-        top_blob_tm.create(4, 16 * w_tm/8 * h_tm/8, outch);
+        top_blob_tm.create(4, 16 * w_tm/8 * h_tm/8, outch, 4u, opt.workspace_allocator);
 
         const int tiles = h_tm/8 * w_tm/8;
 
         int nn_outch = outch >> 2;
         int remain_outch_start = nn_outch << 2;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int pp=0; pp<nn_outch; pp++)
         {
             int p = pp * 4;
@@ -7439,7 +7439,7 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co
             }
         }
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int p = remain_outch_start; p<outch; p++)
         {
             Mat out0_tm = top_blob_tm.channel(p);
@@ -7526,7 +7526,7 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co
 
     // BEGIN transform output
     Mat top_blob_bordered;
-    top_blob_bordered.create(outw, outh, outch);
+    top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator);
     {
 //         const float otm[6][8] = {
 //             {1.0f,  1.0f,   1.0f,   1.0f,   1.0f,  32.0f, 32.0f, 0.0f},
@@ -7553,7 +7553,7 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co
         int h_tm = outh / 6 * 8;
         const int tiles = w_tm/8 * h_tm/8;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int p = 0; p<outch; p++)
         {
             const Mat out0_tm = top_blob_tm.channel(p);
@@ -8157,10 +8157,10 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co
     // END transform output
 
     // cut result pad
-    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w);
+    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt.blob_allocator, opt.num_threads);
 }
 
-static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias)
+static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias, const Option& opt)
 {
     int w = bottom_blob.w;
     int h = bottom_blob.h;
@@ -8178,7 +8178,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co
 
     w = outw + 2;
     h = outh + 2;
-    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f);
+    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt.workspace_allocator, opt.num_threads);
 
     const float* bias = _bias;
 
@@ -8188,7 +8188,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co
         int w_tm = outw / 6 * 8;
         int h_tm = outh / 6 * 8;
         const int tiles = w_tm/8 * h_tm/8;
-        bottom_blob_tm.create(1, 64 * tiles, inch);
+        bottom_blob_tm.create(1, 64 * tiles, inch, 4u, opt.workspace_allocator);
 //         bottom_blob_tm.create(inch, tiles, 64);
 
 //         const float itm[8][8] = {
@@ -8229,7 +8229,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co
         float32x4_t _coeff1 = vld1q_f32(coeff+4);
 #endif // __ARM_NEON
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q<inch; q++)
         {
             const Mat img0 = bottom_blob_bordered.channel(q);
@@ -9054,9 +9054,9 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co
         // permute
         // bottom_blob_tm.create(1, 64 * tiles, inch);
 //         Mat bottom_blob_tm2(inch, tiles, 64);
-        Mat bottom_blob_tm2(8*inch, tiles/8 + (tiles%8)/4 + tiles%4, 64);
+        Mat bottom_blob_tm2(8*inch, tiles/8 + (tiles%8)/4 + tiles%4, 64, 4u, opt.workspace_allocator);
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int r=0; r<64; r++)
         {
             Mat tm2 = bottom_blob_tm2.channel(r);
@@ -9147,7 +9147,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co
         nn_outch = outch >> 3;
         remain_outch_start = nn_outch << 3;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int pp=0; pp<nn_outch; pp++)
         {
             int p = pp * 8;
@@ -9592,7 +9592,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co
 
         nn_outch = (outch - remain_outch_start) >> 2;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int pp=0; pp<nn_outch; pp++)
         {
             int p = remain_outch_start + pp * 4;
@@ -10332,6 +10332,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co
 
         remain_outch_start += nn_outch << 2;
 
+        #pragma omp parallel for num_threads(opt.num_threads) // NOTE(review): loop was serial before this change; confirm iterations over p are independent
         for (int p=remain_outch_start; p<outch; p++)
         {
 #if __ARM_NEON && __aarch64__
@@ -10738,7 +10739,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co
 
     // BEGIN transform output
     Mat top_blob_bordered;
-    top_blob_bordered.create(outw, outh, outch);
+    top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator);
     {
 //         const float otm[6][8] = {
 //             {1.0f,  1.0f,   1.0f,   1.0f,   1.0f,  32.0f, 32.0f, 0.0f},
@@ -10765,7 +10766,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co
         int h_tm = outh / 6 * 8;
         const int tiles = w_tm/8 * h_tm/8;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int p = 0; p<outch; p++)
         {
             const Mat out0_tm = top_blob_tm.channel(p);
@@ -11514,10 +11515,10 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co
     // END transform output
 
     // cut result pad
-    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w);
+    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt.blob_allocator, opt.num_threads);
 }
 
-static void conv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+static void conv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
     int w = bottom_blob.w;
     int inch = bottom_blob.c;
@@ -11534,7 +11535,7 @@ static void conv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
     int nn_outch = outch >> 1;
     int remain_outch_start = nn_outch << 1;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int pp=0; pp<nn_outch; pp++)
     {
         int p = pp * 2;
@@ -11858,7 +11859,7 @@ static void conv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
         }
     }
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p=remain_outch_start; p<outch; p++)
     {
         Mat out = top_blob.channel(p);
diff --git a/src/layer/arm/convolution_4x4.h b/src/layer/arm/convolution_4x4.h
index 3054faffd..3a78d7905 100644
--- a/src/layer/arm/convolution_4x4.h
+++ b/src/layer/arm/convolution_4x4.h
@@ -16,7 +16,7 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON
 
-static void conv4x4s4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+static void conv4x4s4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
     int w = bottom_blob.w;
     int inch = bottom_blob.c;
@@ -30,7 +30,7 @@ static void conv4x4s4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
     const float* kernel = _kernel;
     const float* bias = _bias;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p=0; p<outch; p++)
     {
         Mat out = top_blob.channel(p);
diff --git a/src/layer/arm/convolution_5x5.h b/src/layer/arm/convolution_5x5.h
index 4928d0d43..b1e778a6a 100644
--- a/src/layer/arm/convolution_5x5.h
+++ b/src/layer/arm/convolution_5x5.h
@@ -16,7 +16,7 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON
 
-static void conv5x5s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+static void conv5x5s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
     int w = bottom_blob.w;
     int inch = bottom_blob.c;
@@ -28,7 +28,7 @@ static void conv5x5s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
     const float* kernel = _kernel;
     const float* bias = _bias;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p=0; p<outch; p++)
     {
         Mat out = top_blob.channel(p);
@@ -982,7 +982,7 @@ static void conv5x5s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
 
 }
 
-static void conv5x5s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+static void conv5x5s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
     int w = bottom_blob.w;
     int inch = bottom_blob.c;
@@ -996,7 +996,7 @@ static void conv5x5s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
     const float* kernel = _kernel;
     const float* bias = _bias;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p=0; p<outch; p++)
     {
         Mat out = top_blob.channel(p);
diff --git a/src/layer/arm/convolution_7x7.h b/src/layer/arm/convolution_7x7.h
index 3b2c77e6a..8632c3f19 100644
--- a/src/layer/arm/convolution_7x7.h
+++ b/src/layer/arm/convolution_7x7.h
@@ -16,7 +16,7 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON
 
-static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
     int w = bottom_blob.w;
     int inch = bottom_blob.c;
@@ -28,7 +28,7 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
     const float* kernel = _kernel;
     const float* bias = _bias;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p=0; p<outch; p++)
     {
         Mat out = top_blob.channel(p);
@@ -706,7 +706,7 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
 
 }
 
-static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
     int w = bottom_blob.w;
     int inch = bottom_blob.c;
@@ -720,7 +720,7 @@ static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
     const float* kernel = _kernel;
     const float* bias = _bias;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p=0; p<outch; p++)
     {
         Mat out = top_blob.channel(p);
diff --git a/src/layer/arm/convolution_arm.cpp b/src/layer/arm/convolution_arm.cpp
index ed13c0ed6..7e12044e0 100644
--- a/src/layer/arm/convolution_arm.cpp
+++ b/src/layer/arm/convolution_arm.cpp
@@ -75,10 +75,11 @@ int Convolution_arm::load_model(const ModelBin& mb)
     return 0;
 }
 
-int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv_func conv) const
+int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv_func conv, const Option& opt) const
 {
     int w = bottom_blob.w;
     int h = bottom_blob.h;
+    size_t elemsize = bottom_blob.elemsize;
 
     const int kernel_size = kernel_w;
     const int stride = stride_w;
@@ -88,7 +89,7 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv
     Mat bottom_blob_bordered = bottom_blob;
     if (pad_w > 0 || pad_h > 0)
     {
-        copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f);
+        copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
         if (bottom_blob_bordered.empty())
             return -100;
 
@@ -101,7 +102,7 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv
         int hpad = kernel_extent + (h - 1) / stride * stride - h;
         if (wpad > 0 || hpad > 0)
         {
-            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f);
+            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
             if (bottom_blob_bordered.empty())
                 return -100;
         }
@@ -113,7 +114,7 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv
     int outw = (w - kernel_extent) / stride + 1;
     int outh = (h - kernel_extent) / stride + 1;
 
-    top_blob.create(outw, outh, num_output);
+    top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
@@ -132,7 +133,7 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv
 
             if (inner_bottom_blob.w != inner_w || inner_bottom_blob.h != inner_h)
             {
-                inner_bottom_blob.create(inner_w, inner_h, bottom_blob.c);
+                inner_bottom_blob.create(inner_w, inner_h, bottom_blob.c, elemsize, opt.workspace_allocator);
 
                 if (inner_bottom_blob.empty())
                 {
@@ -142,7 +143,7 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv
 
             if (inner_top_blob.w != inner_outw || inner_top_blob.h != inner_outh)
             {
-                inner_top_blob.create(inner_outw, inner_outh, num_output);
+                inner_top_blob.create(inner_outw, inner_outh, num_output, elemsize, opt.workspace_allocator);
 
                 if (inner_top_blob.empty())
                 {
@@ -150,7 +151,7 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv
                 }
             }
 
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int c = 0; c < bottom_blob.c; c ++)
             {
                 float *outptr = (float *) inner_bottom_blob.channel(c);
@@ -166,9 +167,9 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv
                 }
             }
 
-            conv(inner_bottom_blob, inner_top_blob, weight_data, bias_data);
+            conv(inner_bottom_blob, inner_top_blob, weight_data, bias_data, opt);
 
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int c = 0; c < num_output; c ++)
             {
                 float *outptr = (float *) top_blob.channel(c) + x * outw + y;
@@ -188,19 +189,19 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv
     return 0;
 }
 
-int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
+int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     // convolv with NxN kernel
     // value = value + bias
 
     if (bottom_blob.dims != 3)
     {
-        return Convolution::forward(bottom_blob, top_blob);
+        return Convolution::forward(bottom_blob, top_blob, opt);
     }
 
     if (kernel_w != kernel_h || stride_w != stride_h)
     {
-        return Convolution::forward(bottom_blob, top_blob);
+        return Convolution::forward(bottom_blob, top_blob, opt);
     }
 
     const int kernel_size = kernel_w;
@@ -208,10 +209,10 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
 
     if (kernel_size > 7 || stride > 4 || dilation_w != dilation_h)
     {
-        return Convolution::forward(bottom_blob, top_blob);
+        return Convolution::forward(bottom_blob, top_blob, opt);
     }
 
-    typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&);
+    typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&);
 
     // kernel_size x stride
     conv_func conv_func_table[7][4] =
@@ -263,22 +264,23 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
     conv_func conv = conv_func_table[kernel_size-1][stride-1];
     if (!conv)
     {
-        return Convolution::forward(bottom_blob, top_blob);
+        return Convolution::forward(bottom_blob, top_blob, opt);
     }
 
     if (dilation_w != 1)
     {
-        return forwardDilation(bottom_blob, top_blob, conv);
+        return forwardDilation(bottom_blob, top_blob, conv, opt);
     }
 
     int w = bottom_blob.w;
     int h = bottom_blob.h;
     int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
 
     Mat bottom_blob_bordered = bottom_blob;
     if (pad_w > 0 || pad_h > 0)
     {
-        copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f);
+        copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
         if (bottom_blob_bordered.empty())
             return -100;
 
@@ -291,7 +293,7 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
         int hpad = kernel_size + (h - 1) / stride * stride - h;
         if (wpad > 0 || hpad > 0)
         {
-            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f);
+            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
             if (bottom_blob_bordered.empty())
                 return -100;
         }
@@ -303,21 +305,21 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
     int outw = (w - kernel_size) / stride + 1;
     int outh = (h - kernel_size) / stride + 1;
 
-    top_blob.create(outw, outh, num_output);
+    top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
     if (use_winograd3x3 && w <= 120 && h <= 120)
     {
-//         conv3x3s1_winograd64_neon4(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data);
-        conv3x3s1_winograd64_neon5(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data);
+//         conv3x3s1_winograd64_neon4(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt);
+        conv3x3s1_winograd64_neon5(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt);
     }
     else if (use_sgemm1x1)
     {
-        conv1x1s1_sgemm_neon(bottom_blob_bordered, top_blob, weight_1x1_sgemm_data, bias_data);
+        conv1x1s1_sgemm_neon(bottom_blob_bordered, top_blob, weight_1x1_sgemm_data, bias_data, opt);
     }
     else
-        conv(bottom_blob_bordered, top_blob, weight_data, bias_data);
+        conv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
 
     return 0;
 }
diff --git a/src/layer/arm/convolution_arm.h b/src/layer/arm/convolution_arm.h
index 6a47fff51..21d0256f2 100644
--- a/src/layer/arm/convolution_arm.h
+++ b/src/layer/arm/convolution_arm.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&);
+typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&); // (bottom, top, kernel, bias, opt) -- signature shared by the NEON conv kernels
 
 class Convolution_arm : public Convolution
 {
@@ -28,8 +28,8 @@ public:
 
     virtual int load_model(const ModelBin& mb);
 
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
-    virtual int forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv_func conv) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
+    virtual int forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv_func conv, const Option& opt) const;
 
 public:
     bool use_winograd3x3;
diff --git a/src/layer/arm/convolutiondepthwise_3x3.h b/src/layer/arm/convolutiondepthwise_3x3.h
index c2f1ae222..6cd12a999 100644
--- a/src/layer/arm/convolutiondepthwise_3x3.h
+++ b/src/layer/arm/convolutiondepthwise_3x3.h
@@ -16,7 +16,7 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON
 
-static void convdw3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+static void convdw3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
     int w = bottom_blob.w;
 
@@ -28,7 +28,7 @@ static void convdw3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _
     const float* kernel = _kernel;
     const float* bias = _bias;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int g=0; g<group; g++)
     {
         Mat out = top_blob.channel(g);
@@ -577,7 +577,7 @@ static void convdw3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _
     }
 }
 
-static void convdw3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+static void convdw3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
     int w = bottom_blob.w;
 
@@ -591,7 +591,7 @@ static void convdw3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _
     const float* kernel = _kernel;
     const float* bias = _bias;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int g=0; g<group; g++)
     {
         Mat out = top_blob.channel(g);
diff --git a/src/layer/arm/convolutiondepthwise_arm.cpp b/src/layer/arm/convolutiondepthwise_arm.cpp
index c2f5cba06..29402d0c8 100644
--- a/src/layer/arm/convolutiondepthwise_arm.cpp
+++ b/src/layer/arm/convolutiondepthwise_arm.cpp
@@ -102,7 +102,7 @@ int ConvolutionDepthWise_arm::load_model(const ModelBin& mb)
     return 0;
 }
 
-int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
+int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     // convolv with NxN kernel
     // value = value + bias
@@ -110,6 +110,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
     int w = bottom_blob.w;
     int h = bottom_blob.h;
     int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
 
     if (channels % group != 0 || num_output % group != 0)
     {
@@ -123,7 +124,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
     Mat bottom_blob_bordered = bottom_blob;
     if (pad_w > 0 || pad_h > 0)
     {
-        copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f);
+        copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
         if (bottom_blob_bordered.empty())
             return -100;
 
@@ -136,7 +137,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
         int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
         if (wpad > 0 || hpad > 0)
         {
-            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f);
+            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
             if (bottom_blob_bordered.empty())
                 return -100;
         }
@@ -148,7 +149,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
     int outw = (w - kernel_extent_w) / stride_w + 1;
     int outh = (h - kernel_extent_h) / stride_h + 1;
 
-    top_blob.create(outw, outh, num_output);
+    top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
@@ -161,12 +162,12 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
         {
             if (stride_w == 1 && stride_h == 1)
             {
-                convdw3x3s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data);
+                convdw3x3s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
                 return 0;
             }
             else if (stride_w == 2 && stride_h == 2)
             {
-                convdw3x3s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data);
+                convdw3x3s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
                 return 0;
             }
         }
@@ -176,7 +177,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
         omp_set_nested(0);
 #endif
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int g=0; g<group; g++)
         {
             Mat bottom_blob_bordered_g(w, h, 1, bottom_blob_bordered.channel(g));
@@ -213,7 +214,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
             op->load_model(ModelBinFromMatArray(weights));
 
             // forward
-            op->forward(bottom_blob_bordered_g, top_blob_g);
+            op->forward(bottom_blob_bordered_g, top_blob_g, opt);
 
             delete op;
         }
@@ -235,7 +236,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
         const ncnn::Layer* op = group_ops[g];
 
         // forward
-        op->forward(bottom_blob_bordered_g, top_blob_g);
+        op->forward(bottom_blob_bordered_g, top_blob_g, opt);
     }
 
     return 0;
diff --git a/src/layer/arm/convolutiondepthwise_arm.h b/src/layer/arm/convolutiondepthwise_arm.h
index 8181f4181..188ef6bdd 100644
--- a/src/layer/arm/convolutiondepthwise_arm.h
+++ b/src/layer/arm/convolutiondepthwise_arm.h
@@ -27,7 +27,7 @@ public:
 
     virtual int load_model(const ModelBin& mb);
 
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
 public:
     std::vector<ncnn::Layer*> group_ops;
diff --git a/src/layer/arm/deconvolution_3x3.h b/src/layer/arm/deconvolution_3x3.h
index 931eb9404..39082f2e8 100644
--- a/src/layer/arm/deconvolution_3x3.h
+++ b/src/layer/arm/deconvolution_3x3.h
@@ -16,7 +16,7 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON
 
-static void deconv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+static void deconv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
     int w = bottom_blob.w;
     int h = bottom_blob.h;
@@ -28,7 +28,7 @@ static void deconv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _
     const float* kernel = _kernel;
     const float* bias = _bias;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p=0; p<outch; p++)
     {
         Mat out = top_blob.channel(p);
@@ -237,7 +237,7 @@ static void deconv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _
     }
 }
 
-static void deconv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+static void deconv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
     int w = bottom_blob.w;
     int h = bottom_blob.h;
@@ -249,7 +249,7 @@ static void deconv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _
     const float* kernel = _kernel;
     const float* bias = _bias;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p=0; p<outch; p++)
     {
         Mat out = top_blob.channel(p);
diff --git a/src/layer/arm/deconvolution_4x4.h b/src/layer/arm/deconvolution_4x4.h
index 6e13f4e6f..27cdc02d3 100644
--- a/src/layer/arm/deconvolution_4x4.h
+++ b/src/layer/arm/deconvolution_4x4.h
@@ -16,7 +16,7 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON
 
-static void deconv4x4s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+static void deconv4x4s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
     int w = bottom_blob.w;
     int h = bottom_blob.h;
@@ -28,7 +28,7 @@ static void deconv4x4s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _
     const float* kernel = _kernel;
     const float* bias = _bias;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p=0; p<outch; p++)
     {
         Mat out = top_blob.channel(p);
@@ -185,7 +185,7 @@ static void deconv4x4s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _
     }
 }
 
-static void deconv4x4s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+static void deconv4x4s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
     int w = bottom_blob.w;
     int h = bottom_blob.h;
@@ -197,7 +197,7 @@ static void deconv4x4s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _
     const float* kernel = _kernel;
     const float* bias = _bias;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p=0; p<outch; p++)
     {
         Mat out = top_blob.channel(p);
diff --git a/src/layer/arm/deconvolution_arm.cpp b/src/layer/arm/deconvolution_arm.cpp
index 26aab30a3..90b051456 100644
--- a/src/layer/arm/deconvolution_arm.cpp
+++ b/src/layer/arm/deconvolution_arm.cpp
@@ -21,14 +21,14 @@ namespace ncnn {
 
 DEFINE_LAYER_CREATOR(Deconvolution_arm)
 
-int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
+int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     // deconvolv with NxN kernel
     // value = value + bias
 
     if (kernel_w != kernel_h || stride_w != stride_h)
     {
-        return Deconvolution::forward(bottom_blob, top_blob);
+        return Deconvolution::forward(bottom_blob, top_blob, opt);
     }
 
     const int kernel_size = kernel_w;
@@ -36,10 +36,10 @@ int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
 
     if ((kernel_size != 3 && kernel_size != 4) || stride > 2 || dilation_w != 1 || dilation_h != 1)
     {
-        return Deconvolution::forward(bottom_blob, top_blob);
+        return Deconvolution::forward(bottom_blob, top_blob, opt);
     }
 
-    typedef void (*deconv_func)(const Mat&, Mat&, const Mat&, const Mat&);
+    typedef void (*deconv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&);
 
     // kernel_size x stride
     deconv_func deconv_func_table[2][2] =
@@ -57,33 +57,46 @@ int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
     deconv_func deconv = deconv_func_table[kernel_size-3][stride-1];
     if (!deconv)
     {
-        return Deconvolution::forward(bottom_blob, top_blob);
+        return Deconvolution::forward(bottom_blob, top_blob, opt);
     }
 
     int w = bottom_blob.w;
     int h = bottom_blob.h;
+    size_t elemsize = bottom_blob.elemsize;
 
     int outw = (w - 1) * stride + kernel_size;
     int outh = (h - 1) * stride + kernel_size;
 
-    Mat top_blob_bordered = top_blob;
-    top_blob_bordered.create(outw, outh, num_output);
-    if (top_blob_bordered.empty())
-        return -100;
-
-    deconv(bottom_blob, top_blob_bordered, weight_data, bias_data);
+    Mat top_blob_bordered;
+    if (pad_w > 0 || pad_h > 0)
+    {
+        top_blob_bordered.create(outw, outh, num_output, elemsize, opt.workspace_allocator);
+        if (top_blob_bordered.empty())
+            return -100;
+    }
+    else
+    {
+        top_blob_bordered = top_blob;
+        top_blob_bordered.create(outw, outh, num_output, elemsize, opt.blob_allocator);
+        if (top_blob_bordered.empty())
+            return -100;
+    }
 
-    top_blob = top_blob_bordered;
+    deconv(bottom_blob, top_blob_bordered, weight_data, bias_data, opt);
 
     if (pad_w > 0 || pad_h > 0)
     {
-        copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w);
+        copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w, opt.blob_allocator, opt.num_threads);
         if (top_blob.empty())
             return -100;
 
         outw = top_blob.w;
         outh = top_blob.h;
     }
+    else
+    {
+        top_blob = top_blob_bordered;
+    }
 
     return 0;
 }
diff --git a/src/layer/arm/deconvolution_arm.h b/src/layer/arm/deconvolution_arm.h
index ce7a83b5b..6b688b09f 100644
--- a/src/layer/arm/deconvolution_arm.h
+++ b/src/layer/arm/deconvolution_arm.h
@@ -22,7 +22,7 @@ namespace ncnn {
 class Deconvolution_arm : public Deconvolution
 {
 public:
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 };
 
 } // namespace ncnn
diff --git a/src/layer/arm/deconvolutiondepthwise_arm.cpp b/src/layer/arm/deconvolutiondepthwise_arm.cpp
index bfdd5ed7b..a18803745 100644
--- a/src/layer/arm/deconvolutiondepthwise_arm.cpp
+++ b/src/layer/arm/deconvolutiondepthwise_arm.cpp
@@ -24,7 +24,7 @@ namespace ncnn {
 
 DEFINE_LAYER_CREATOR(DeconvolutionDepthWise_arm)
 
-int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
+int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     // convolv with NxN kernel
     // value = value + bias
@@ -32,6 +32,7 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) c
     int w = bottom_blob.w;
     int h = bottom_blob.h;
     int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
 
     if (channels % group != 0 || num_output % group != 0)
     {
@@ -45,10 +46,20 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) c
     int outw = (w - 1) * stride_w + kernel_extent_w;
     int outh = (h - 1) * stride_h + kernel_extent_h;
 
-    Mat top_blob_bordered = top_blob;
-    top_blob_bordered.create(outw, outh, num_output);
-    if (top_blob_bordered.empty())
-        return -100;
+    Mat top_blob_bordered;
+    if (pad_w > 0 || pad_h > 0)
+    {
+        top_blob_bordered.create(outw, outh, num_output, elemsize, opt.workspace_allocator);
+        if (top_blob_bordered.empty())
+            return -100;
+    }
+    else
+    {
+        top_blob_bordered = top_blob;
+        top_blob_bordered.create(outw, outh, num_output, elemsize, opt.blob_allocator);
+        if (top_blob_bordered.empty())
+            return -100;
+    }
 
     const int maxk = kernel_w * kernel_h;
 
@@ -60,7 +71,7 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) c
         omp_set_nested(0);
 #endif
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int g=0; g<group; g++)
         {
             Mat bottom_blob_g(w, h, 1, bottom_blob.channel(g).data);
@@ -98,7 +109,7 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) c
             op->load_model(ModelBinFromMatArray(weights));
 
             // forward
-            op->forward(bottom_blob_g, top_blob_bordered_g);
+            op->forward(bottom_blob_g, top_blob_bordered_g, opt);
 
             delete op;
         }
@@ -148,23 +159,25 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) c
             op->load_model(ModelBinFromMatArray(weights));
 
             // forward
-            op->forward(bottom_blob_g, top_blob_bordered_g);
+            op->forward(bottom_blob_g, top_blob_bordered_g, opt);
 
             delete op;
         }
     }
 
-    top_blob = top_blob_bordered;
-
     if (pad_w > 0 || pad_h > 0)
     {
-        copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w);
+        copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w, opt.blob_allocator, opt.num_threads);
         if (top_blob.empty())
             return -100;
 
         outw = top_blob.w;
         outh = top_blob.h;
     }
+    else
+    {
+        top_blob = top_blob_bordered;
+    }
 
     return 0;
 
diff --git a/src/layer/arm/deconvolutiondepthwise_arm.h b/src/layer/arm/deconvolutiondepthwise_arm.h
index 472311da7..792478fd9 100644
--- a/src/layer/arm/deconvolutiondepthwise_arm.h
+++ b/src/layer/arm/deconvolutiondepthwise_arm.h
@@ -22,7 +22,7 @@ namespace ncnn {
 class DeconvolutionDepthWise_arm : public DeconvolutionDepthWise
 {
 public:
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 };
 
 } // namespace ncnn
diff --git a/src/layer/arm/eltwise_arm.cpp b/src/layer/arm/eltwise_arm.cpp
index cbe03d61a..5ed62bef0 100644
--- a/src/layer/arm/eltwise_arm.cpp
+++ b/src/layer/arm/eltwise_arm.cpp
@@ -22,16 +22,17 @@ namespace ncnn {
 
 DEFINE_LAYER_CREATOR(Eltwise_arm)
 
-int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
+int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
 {
     const Mat& bottom_blob = bottom_blobs[0];
     int w = bottom_blob.w;
     int h = bottom_blob.h;
     int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
     int size = w * h;
 
     Mat& top_blob = top_blobs[0];
-    top_blob.create(w, h, channels);
+    top_blob.create(w, h, channels, elemsize, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
@@ -39,7 +40,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
     {
         // first blob
         const Mat& bottom_blob1 = bottom_blobs[1];
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             const float* ptr = bottom_blob.channel(q);
@@ -117,7 +118,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
         for (size_t b=2; b<bottom_blobs.size(); b++)
         {
             const Mat& bottom_blob1 = bottom_blobs[b];
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int q=0; q<channels; q++)
             {
                 const float* ptr = bottom_blob1.channel(q);
@@ -193,7 +194,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
         {
             // first blob
             const Mat& bottom_blob1 = bottom_blobs[1];
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int q=0; q<channels; q++)
             {
                 const float* ptr = bottom_blob.channel(q);
@@ -271,7 +272,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
             for (size_t b=2; b<bottom_blobs.size(); b++)
             {
                 const Mat& bottom_blob1 = bottom_blobs[b];
-                #pragma omp parallel for
+                #pragma omp parallel for num_threads(opt.num_threads)
                 for (int q=0; q<channels; q++)
                 {
                     const float* ptr = bottom_blob1.channel(q);
@@ -349,7 +350,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
             const Mat& bottom_blob1 = bottom_blobs[1];
             float coeff0 = coeffs_ptr[0];
             float coeff1 = coeffs_ptr[1];
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int q=0; q<channels; q++)
             {
                 const float* ptr = bottom_blob.channel(q);
@@ -436,7 +437,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
             {
                 const Mat& bottom_blob1 = bottom_blobs[b];
                 float coeff = coeffs_ptr[b];
-                #pragma omp parallel for
+                #pragma omp parallel for num_threads(opt.num_threads)
                 for (int q=0; q<channels; q++)
                 {
                     const float* ptr = bottom_blob1.channel(q);
@@ -514,7 +515,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
     {
         // first blob
         const Mat& bottom_blob1 = bottom_blobs[1];
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             const float* ptr = bottom_blob.channel(q);
@@ -592,7 +593,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
         for (size_t b=2; b<bottom_blobs.size(); b++)
         {
             const Mat& bottom_blob1 = bottom_blobs[b];
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int q=0; q<channels; q++)
             {
                 const float* ptr = bottom_blob1.channel(q);
diff --git a/src/layer/arm/eltwise_arm.h b/src/layer/arm/eltwise_arm.h
index 060fac695..9da561fb4 100644
--- a/src/layer/arm/eltwise_arm.h
+++ b/src/layer/arm/eltwise_arm.h
@@ -22,7 +22,7 @@ namespace ncnn {
 class Eltwise_arm : public Eltwise
 {
 public:
-    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 };
 
 } // namespace ncnn
diff --git a/src/layer/arm/innerproduct_arm.cpp b/src/layer/arm/innerproduct_arm.cpp
index d9947614d..5005ea7da 100644
--- a/src/layer/arm/innerproduct_arm.cpp
+++ b/src/layer/arm/innerproduct_arm.cpp
@@ -22,14 +22,15 @@ namespace ncnn {
 
 DEFINE_LAYER_CREATOR(InnerProduct_arm)
 
-int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
+int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     int w = bottom_blob.w;
     int h = bottom_blob.h;
     int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
     int size = w * h;
 
-    top_blob.create(num_output);
+    top_blob.create(num_output, elemsize, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
@@ -38,7 +39,7 @@ int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
     int nn_num_output = num_output >> 2;
     int remain_num_output_start = nn_num_output << 2;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int pp=0; pp<nn_num_output; pp++)
     {
         int p = pp * 4;
@@ -143,7 +144,7 @@ int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
     }
 
     // num_output
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p=remain_num_output_start; p<num_output; p++)
     {
         float sum = 0.f;
diff --git a/src/layer/arm/innerproduct_arm.h b/src/layer/arm/innerproduct_arm.h
index 5fdf3fe20..fbd3c1586 100644
--- a/src/layer/arm/innerproduct_arm.h
+++ b/src/layer/arm/innerproduct_arm.h
@@ -22,7 +22,7 @@ namespace ncnn {
 class InnerProduct_arm : public InnerProduct
 {
 public:
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 };
 
 } // namespace ncnn
diff --git a/src/layer/arm/lrn_arm.cpp b/src/layer/arm/lrn_arm.cpp
index 3e5dec916..81bd59a6e 100644
--- a/src/layer/arm/lrn_arm.cpp
+++ b/src/layer/arm/lrn_arm.cpp
@@ -24,20 +24,21 @@ namespace ncnn {
 
 DEFINE_LAYER_CREATOR(LRN_arm)
 
-int LRN_arm::forward_inplace(Mat& bottom_top_blob) const
+int LRN_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
     int channels = bottom_top_blob.c;
+    size_t elemsize = bottom_top_blob.elemsize;
     int size = w * h;
 
     // squared values with local_size padding
     Mat square_blob;
-    square_blob.create(w, h, channels);
+    square_blob.create(w, h, channels, elemsize, opt.workspace_allocator);
     if (square_blob.empty())
         return -100;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<channels; q++)
     {
         const float* ptr = bottom_top_blob.channel(q);
@@ -73,14 +74,14 @@ int LRN_arm::forward_inplace(Mat& bottom_top_blob) const
     if (region_type == NormRegion_ACROSS_CHANNELS)
     {
         Mat square_sum;
-        square_sum.create(w, h, channels);
+        square_sum.create(w, h, channels, elemsize, opt.workspace_allocator);
         if (square_sum.empty())
             return -100;
         square_sum.fill(0.f);
 
         const float alpha_div_size = alpha / local_size;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             // square sum
@@ -165,7 +166,7 @@ int LRN_arm::forward_inplace(Mat& bottom_top_blob) const
         int pad = local_size / 2;
         if (pad > 0)
         {
-            copy_make_border(square_blob, square_blob_bordered, pad, local_size - pad - 1, pad, local_size - pad - 1, BORDER_CONSTANT, 0.f);
+            copy_make_border(square_blob, square_blob_bordered, pad, local_size - pad - 1, pad, local_size - pad - 1, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
             if (square_blob_bordered.empty())
                 return -100;
 
@@ -196,7 +197,7 @@ int LRN_arm::forward_inplace(Mat& bottom_top_blob) const
             }
         }
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             float* ptr = bottom_top_blob.channel(q);
diff --git a/src/layer/arm/lrn_arm.h b/src/layer/arm/lrn_arm.h
index 6b052945e..f2c43ba08 100644
--- a/src/layer/arm/lrn_arm.h
+++ b/src/layer/arm/lrn_arm.h
@@ -22,7 +22,7 @@ namespace ncnn {
 class LRN_arm : public LRN
 {
 public:
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 };
 
 } // namespace ncnn
diff --git a/src/layer/arm/pooling_2x2.h b/src/layer/arm/pooling_2x2.h
index 91cb1b98c..3be70300e 100644
--- a/src/layer/arm/pooling_2x2.h
+++ b/src/layer/arm/pooling_2x2.h
@@ -16,7 +16,7 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON
 
-static void pooling2x2s2_max_neon(const Mat& bottom_blob, Mat& top_blob)
+static void pooling2x2s2_max_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
 {
     int w = bottom_blob.w;
     int inch = bottom_blob.c;
@@ -26,7 +26,7 @@ static void pooling2x2s2_max_neon(const Mat& bottom_blob, Mat& top_blob)
     
     const int tailstep = w - 2*outw + w;
     
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<inch; q++)
     {
         const float* img0 = bottom_blob.channel(q);
diff --git a/src/layer/arm/pooling_3x3.h b/src/layer/arm/pooling_3x3.h
index b53cbbd4d..6b23e7a98 100644
--- a/src/layer/arm/pooling_3x3.h
+++ b/src/layer/arm/pooling_3x3.h
@@ -16,7 +16,7 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON
 
-static void pooling3x3s2_max_neon(const Mat& bottom_blob, Mat& top_blob)
+static void pooling3x3s2_max_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
 {
     int w = bottom_blob.w;
     int inch = bottom_blob.c;
@@ -26,7 +26,7 @@ static void pooling3x3s2_max_neon(const Mat& bottom_blob, Mat& top_blob)
 
     const int tailstep = w - 2*outw + w;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<inch; q++)
     {
         const float* img0 = bottom_blob.channel(q);
diff --git a/src/layer/arm/pooling_arm.cpp b/src/layer/arm/pooling_arm.cpp
index a10ed7ef3..53dca5dc0 100644
--- a/src/layer/arm/pooling_arm.cpp
+++ b/src/layer/arm/pooling_arm.cpp
@@ -21,14 +21,14 @@ namespace ncnn {
 
 DEFINE_LAYER_CREATOR(Pooling_arm)
 
-int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
+int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     // max value in NxN window
     // avg value in NxN window
 
     if (kernel_w != kernel_h || stride_w != stride_h)
     {
-        return Pooling::forward(bottom_blob, top_blob);
+        return Pooling::forward(bottom_blob, top_blob, opt);
     }
 
     const int kernel_size = kernel_w;
@@ -36,17 +36,18 @@ int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
 
     if (pooling_type != PoolMethod_MAX || stride != 2 || global_pooling == 1)
     {
-        return Pooling::forward(bottom_blob, top_blob);
+        return Pooling::forward(bottom_blob, top_blob, opt);
     }
 
     if (kernel_size != 2 && kernel_size != 3)
     {
-        return Pooling::forward(bottom_blob, top_blob);
+        return Pooling::forward(bottom_blob, top_blob, opt);
     }
 
     int w = bottom_blob.w;
     int h = bottom_blob.h;
     int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
 
     Mat bottom_blob_bordered = bottom_blob;
 
@@ -73,7 +74,7 @@ int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
         if (htail != 0)
             htailpad = stride_h - htail;
 
-        copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom + htailpad, pad_left, pad_right + wtailpad, BORDER_CONSTANT, pad_value);
+        copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom + htailpad, pad_left, pad_right + wtailpad, BORDER_CONSTANT, pad_value, opt.workspace_allocator, opt.num_threads);
         if (bottom_blob_bordered.empty())
             return -100;
 
@@ -82,7 +83,7 @@ int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
     }
     else if (pad_mode == 1) // valid padding
     {
-        copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom, pad_left, pad_right, BORDER_CONSTANT, pad_value);
+        copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom, pad_left, pad_right, BORDER_CONSTANT, pad_value, opt.workspace_allocator, opt.num_threads);
         if (bottom_blob_bordered.empty())
             return -100;
 
@@ -95,7 +96,7 @@ int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
         int hpad = kernel_h + (h - 1) / stride_h * stride_h - h;
         if (wpad > 0 || hpad > 0)
         {
-            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, pad_value);
+            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, pad_value, opt.workspace_allocator, opt.num_threads);
             if (bottom_blob_bordered.empty())
                 return -100;
         }
@@ -107,14 +108,14 @@ int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
     int outw = (w - kernel_w) / stride_w + 1;
     int outh = (h - kernel_h) / stride_h + 1;
 
-    top_blob.create(outw, outh, channels);
+    top_blob.create(outw, outh, channels, elemsize, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
     if (kernel_size == 2)
-        pooling2x2s2_max_neon(bottom_blob_bordered, top_blob);
+        pooling2x2s2_max_neon(bottom_blob_bordered, top_blob, opt);
     if (kernel_size == 3)
-        pooling3x3s2_max_neon(bottom_blob_bordered, top_blob);
+        pooling3x3s2_max_neon(bottom_blob_bordered, top_blob, opt);
 
     return 0;
 }
diff --git a/src/layer/arm/pooling_arm.h b/src/layer/arm/pooling_arm.h
index b7d774fa2..72f01533c 100644
--- a/src/layer/arm/pooling_arm.h
+++ b/src/layer/arm/pooling_arm.h
@@ -22,7 +22,7 @@ namespace ncnn {
 class Pooling_arm : public Pooling
 {
 public:
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 };
 
 } // namespace ncnn
diff --git a/src/layer/arm/prelu_arm.cpp b/src/layer/arm/prelu_arm.cpp
index 420d08892..845a8c217 100644
--- a/src/layer/arm/prelu_arm.cpp
+++ b/src/layer/arm/prelu_arm.cpp
@@ -22,11 +22,11 @@ namespace ncnn {
 
 DEFINE_LAYER_CREATOR(PReLU_arm)
 
-int PReLU_arm::forward_inplace(Mat& bottom_top_blob) const
+int PReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     int dims = bottom_top_blob.dims;
     if (dims != 3)
-        return PReLU::forward_inplace(bottom_top_blob);
+        return PReLU::forward_inplace(bottom_top_blob, opt);
 
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
@@ -35,7 +35,7 @@ int PReLU_arm::forward_inplace(Mat& bottom_top_blob) const
 
     const float* slope_data_ptr = slope_data;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<channels; q++)
     {
         float* ptr = bottom_top_blob.channel(q);
diff --git a/src/layer/arm/prelu_arm.h b/src/layer/arm/prelu_arm.h
index 13809657b..7077d7141 100644
--- a/src/layer/arm/prelu_arm.h
+++ b/src/layer/arm/prelu_arm.h
@@ -22,7 +22,7 @@ namespace ncnn {
 class PReLU_arm : public PReLU
 {
 public:
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 };
 
 } // namespace ncnn
diff --git a/src/layer/arm/relu_arm.cpp b/src/layer/arm/relu_arm.cpp
index 7068ec0e7..9f3d541bc 100644
--- a/src/layer/arm/relu_arm.cpp
+++ b/src/layer/arm/relu_arm.cpp
@@ -22,7 +22,7 @@ namespace ncnn {
 
 DEFINE_LAYER_CREATOR(ReLU_arm)
 
-int ReLU_arm::forward_inplace(Mat& bottom_top_blob) const
+int ReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
@@ -31,7 +31,7 @@ int ReLU_arm::forward_inplace(Mat& bottom_top_blob) const
 
     if (slope == 0.f)
     {
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             float* ptr = bottom_top_blob.channel(q);
@@ -85,7 +85,7 @@ int ReLU_arm::forward_inplace(Mat& bottom_top_blob) const
     }
     else
     {
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             float* ptr = bottom_top_blob.channel(q);
diff --git a/src/layer/arm/relu_arm.h b/src/layer/arm/relu_arm.h
index 0d88aee72..4403d61f1 100644
--- a/src/layer/arm/relu_arm.h
+++ b/src/layer/arm/relu_arm.h
@@ -22,7 +22,7 @@ namespace ncnn {
 class ReLU_arm : public ReLU
 {
 public:
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 };
 
 } // namespace ncnn
diff --git a/src/layer/arm/scale_arm.cpp b/src/layer/arm/scale_arm.cpp
index 06762126b..9f0a7181e 100644
--- a/src/layer/arm/scale_arm.cpp
+++ b/src/layer/arm/scale_arm.cpp
@@ -22,11 +22,11 @@ namespace ncnn {
 
 DEFINE_LAYER_CREATOR(Scale_arm)
 
-int Scale_arm::forward_inplace(Mat& bottom_top_blob) const
+int Scale_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     int dims = bottom_top_blob.dims;
     if (dims != 3)
-        return Scale::forward_inplace(bottom_top_blob);
+        return Scale::forward_inplace(bottom_top_blob, opt);
 
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
@@ -37,7 +37,7 @@ int Scale_arm::forward_inplace(Mat& bottom_top_blob) const
     {
         const float* scale_ptr = scale_data;
         const float* bias_ptr = bias_data;
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             float* ptr = bottom_top_blob.channel(q);
@@ -76,7 +76,7 @@ int Scale_arm::forward_inplace(Mat& bottom_top_blob) const
     else
     {
         const float* scale_ptr = scale_data;
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             float* ptr = bottom_top_blob.channel(q);
diff --git a/src/layer/arm/scale_arm.h b/src/layer/arm/scale_arm.h
index d996c28ac..2a4a4e659 100644
--- a/src/layer/arm/scale_arm.h
+++ b/src/layer/arm/scale_arm.h
@@ -22,7 +22,7 @@ namespace ncnn {
 class Scale_arm : public Scale
 {
 public:
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 };
 
 } // namespace ncnn
diff --git a/src/layer/arm/sigmoid_arm.cpp b/src/layer/arm/sigmoid_arm.cpp
index 480c1d2f3..4f1450e2e 100644
--- a/src/layer/arm/sigmoid_arm.cpp
+++ b/src/layer/arm/sigmoid_arm.cpp
@@ -25,14 +25,14 @@ namespace ncnn {
 
 DEFINE_LAYER_CREATOR(Sigmoid_arm)
 
-int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob) const
+int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
     int channels = bottom_top_blob.c;
     int size = w * h;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<channels; q++)
     {
         float* ptr = bottom_top_blob.channel(q);
diff --git a/src/layer/arm/sigmoid_arm.h b/src/layer/arm/sigmoid_arm.h
index 082cd216a..c26f4a6d0 100644
--- a/src/layer/arm/sigmoid_arm.h
+++ b/src/layer/arm/sigmoid_arm.h
@@ -22,7 +22,7 @@ namespace ncnn {
 class Sigmoid_arm : public Sigmoid
 {
 public:
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 };
 
 } // namespace ncnn
diff --git a/src/layer/arm/softmax_arm.cpp b/src/layer/arm/softmax_arm.cpp
index de55fd071..4575e1f5f 100644
--- a/src/layer/arm/softmax_arm.cpp
+++ b/src/layer/arm/softmax_arm.cpp
@@ -25,12 +25,12 @@ namespace ncnn {
 
 DEFINE_LAYER_CREATOR(Softmax_arm)
 
-int Softmax_arm::forward_inplace(Mat& bottom_top_blob) const
+int Softmax_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     int dims = bottom_top_blob.dims;
 
     if (dims != 3 || axis != 0)
-        return Softmax::forward_inplace(bottom_top_blob);
+        return Softmax::forward_inplace(bottom_top_blob, opt);
 
     // value = exp( value - global max value )
     // sum all value
@@ -39,10 +39,11 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob) const
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
     int channels = bottom_top_blob.c;
+    size_t elemsize = bottom_top_blob.elemsize;
     int size = w * h;
 
     Mat max;
-    max.create(w, h);
+    max.create(w, h, elemsize, opt.workspace_allocator);
     if (max.empty())
         return -100;
     max.fill(-FLT_MAX);
@@ -57,7 +58,7 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob) const
         }
     }
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<channels; q++)
     {
         float* ptr = bottom_top_blob.channel(q);
@@ -95,7 +96,7 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob) const
     }
 
     Mat sum;
-    sum.create(w, h);
+    sum.create(w, h, elemsize, opt.workspace_allocator);
     if (sum.empty())
         return -100;
     sum.fill(0.f);
@@ -133,7 +134,7 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob) const
         }
     }
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<channels; q++)
     {
         float* ptr = bottom_top_blob.channel(q);
diff --git a/src/layer/arm/softmax_arm.h b/src/layer/arm/softmax_arm.h
index 71e6fea32..9638a286c 100644
--- a/src/layer/arm/softmax_arm.h
+++ b/src/layer/arm/softmax_arm.h
@@ -22,7 +22,7 @@ namespace ncnn {
 class Softmax_arm : public Softmax
 {
 public:
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 };
 
 } // namespace ncnn
diff --git a/src/layer/batchnorm.cpp b/src/layer/batchnorm.cpp
index 5418653d8..387c11242 100644
--- a/src/layer/batchnorm.cpp
+++ b/src/layer/batchnorm.cpp
@@ -68,7 +68,7 @@ int BatchNorm::load_model(const ModelBin& mb)
     return 0;
 }
 
-int BatchNorm::forward_inplace(Mat& bottom_top_blob) const
+int BatchNorm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     // a = bias - slope * mean / sqrt(var)
     // b = slope / sqrt(var)
@@ -82,7 +82,7 @@ int BatchNorm::forward_inplace(Mat& bottom_top_blob) const
 
         float* ptr = bottom_top_blob;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int i=0; i<w; i++)
         {
             ptr[i] = b_data[i] * ptr[i] + a_data[i];
@@ -94,7 +94,7 @@ int BatchNorm::forward_inplace(Mat& bottom_top_blob) const
         int w = bottom_top_blob.w;
         int h = bottom_top_blob.h;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int i=0; i<h; i++)
         {
             float* ptr = bottom_top_blob.row(i);
@@ -114,7 +114,7 @@ int BatchNorm::forward_inplace(Mat& bottom_top_blob) const
         int h = bottom_top_blob.h;
         int size = w * h;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             float* ptr = bottom_top_blob.channel(q);
diff --git a/src/layer/batchnorm.h b/src/layer/batchnorm.h
index ca6cbb689..e2b56cda2 100644
--- a/src/layer/batchnorm.h
+++ b/src/layer/batchnorm.h
@@ -28,7 +28,7 @@ public:
 
     virtual int load_model(const ModelBin& mb);
 
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
 public:
     // param
diff --git a/src/layer/bias.cpp b/src/layer/bias.cpp
index 4794a2b4f..f9f93bf8c 100644
--- a/src/layer/bias.cpp
+++ b/src/layer/bias.cpp
@@ -40,14 +40,14 @@ int Bias::load_model(const ModelBin& mb)
     return 0;
 }
 
-int Bias::forward_inplace(Mat& bottom_top_blob) const
+int Bias::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
     int channels = bottom_top_blob.c;
     int size = w * h;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<channels; q++)
     {
         float* ptr = bottom_top_blob.channel(q);
diff --git a/src/layer/bias.h b/src/layer/bias.h
index f63aee115..f540d34e7 100644
--- a/src/layer/bias.h
+++ b/src/layer/bias.h
@@ -28,7 +28,7 @@ public:
 
     virtual int load_model(const ModelBin& mb);
 
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
 public:
     // param
diff --git a/src/layer/binaryop.cpp b/src/layer/binaryop.cpp
index 72108b81a..886794caf 100644
--- a/src/layer/binaryop.cpp
+++ b/src/layer/binaryop.cpp
@@ -43,7 +43,7 @@ int BinaryOp::load_param(const ParamDict& pd)
 }
 
 template<typename Op>
-static int binary_op(const Mat& a, const Mat& b, Mat& c)
+static int binary_op(const Mat& a, const Mat& b, Mat& c, const Option& opt)
 {
     Op op;
 
@@ -51,6 +51,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
     int h = a.h;
     int channels = a.c;
     int size = w * h;
+    size_t elemsize = a.elemsize;
 
     int w1 = b.w;
     int h1 = b.h;
@@ -59,13 +60,13 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
 
     if (a.dims == 3)
     {
-        c.create(w, h, channels);
+        c.create(w, h, channels, elemsize, opt.blob_allocator);
         if (c.empty())
             return -100;
 
         if (b.dims == 3)
         {
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int q=0; q<channels; q++)
             {
                 const float* ptr = a.channel(q);
@@ -83,7 +84,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
 
         if (b.dims == 2)
         {
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int q=0; q<channels; q++)
             {
                 const float* ptr = a.channel(q);
@@ -111,7 +112,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
             if (b.w == 1)
             {
                 const float b0 = b[0];
-                #pragma omp parallel for
+                #pragma omp parallel for num_threads(opt.num_threads)
                 for (int q=0; q<channels; q++)
                 {
                     const float* ptr = a.channel(q);
@@ -126,7 +127,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
                 return 0;
             }
 
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int q=0; q<channels; q++)
             {
                 const float* ptr = a.channel(q);
@@ -146,11 +147,11 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
     {
         if (b.dims == 3)
         {
-            c.create(w1, h1, channels1);
+            c.create(w1, h1, channels1, elemsize, opt.blob_allocator);
             if (c.empty())
                 return -100;
 
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int q=0; q<channels1; q++)
             {
                 const float* ptr = (const float*)a + h1 * q;
@@ -173,7 +174,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
             return 0;
         }
 
-        c.create(w, h);
+        c.create(w, h, elemsize, opt.blob_allocator);
         if (c.empty())
             return -100;
 
@@ -189,7 +190,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
 
         if (b.dims == 1)
         {
-            c.create(w, h);
+            c.create(w, h, elemsize, opt.blob_allocator);
             if (c.empty())
                 return -100;
 
@@ -228,12 +229,12 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
         {
             if (b.dims == 3)
             {
-                c.create(w1, h1, channels1);
+                c.create(w1, h1, channels1, elemsize, opt.blob_allocator);
                 if (c.empty())
                     return -100;
 
                 const float a0 = a[0];
-                #pragma omp parallel for
+                #pragma omp parallel for num_threads(opt.num_threads)
                 for (int q=0; q<channels1; q++)
                 {
                     const float* ptr1 = b.channel(q);
@@ -250,7 +251,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
 
             if (b.dims == 2)
             {
-                c.create(w1, h1);
+                c.create(w1, h1, elemsize, opt.blob_allocator);
                 if (c.empty())
                     return -100;
 
@@ -265,7 +266,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
 
             if (b.dims == 1)
             {
-                c.create(w1);
+                c.create(w1, elemsize, opt.blob_allocator);
                 if (c.empty())
                     return -100;
 
@@ -281,11 +282,11 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
 
         if (b.dims == 3)
         {
-            c.create(w1, h1, channels1);
+            c.create(w1, h1, channels1, elemsize, opt.blob_allocator);
             if (c.empty())
                 return -100;
 
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int q=0; q<channels1; q++)
             {
                 const float a0 = a[q];
@@ -303,7 +304,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
 
         if (b.dims == 2)
         {
-            c.create(w1, h1);
+            c.create(w1, h1, elemsize, opt.blob_allocator);
             if (c.empty())
                 return -100;
 
@@ -327,7 +328,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
 
         if (b.dims == 1)
         {
-            c.create(w);
+            c.create(w, elemsize, opt.blob_allocator);
             if (c.empty())
                 return -100;
 
@@ -353,7 +354,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
 }
 
 template<typename Op>
-static int binary_op_scalar_inplace(Mat& a, float b)
+static int binary_op_scalar_inplace(Mat& a, float b, const Option& opt)
 {
     Op op;
 
@@ -362,7 +363,7 @@ static int binary_op_scalar_inplace(Mat& a, float b)
     int channels = a.c;
     int size = w * h;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<channels; q++)
     {
         float* ptr = a.channel(q);
@@ -401,7 +402,7 @@ struct binary_op_rdiv : std::binary_function<T,T,T> {
     T operator() (const T& x, const T& y) const { return y / x; }
 };
 
-int BinaryOp::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
+int BinaryOp::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
 {
     const Mat& bottom_blob = bottom_blobs[0];
     const Mat& bottom_blob1 = bottom_blobs[1];
@@ -409,63 +410,63 @@ int BinaryOp::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to
     Mat& top_blob = top_blobs[0];
 
     if (op_type == Operation_ADD)
-        return binary_op< std::plus<float> >(bottom_blob, bottom_blob1, top_blob);
+        return binary_op< std::plus<float> >(bottom_blob, bottom_blob1, top_blob, opt);
 
     if (op_type == Operation_SUB)
-        return binary_op< std::minus<float> >(bottom_blob, bottom_blob1, top_blob);
+        return binary_op< std::minus<float> >(bottom_blob, bottom_blob1, top_blob, opt);
 
     if (op_type == Operation_MUL)
-        return binary_op< std::multiplies<float> >(bottom_blob, bottom_blob1, top_blob);
+        return binary_op< std::multiplies<float> >(bottom_blob, bottom_blob1, top_blob, opt);
 
     if (op_type == Operation_DIV)
-        return binary_op< std::divides<float> >(bottom_blob, bottom_blob1, top_blob);
+        return binary_op< std::divides<float> >(bottom_blob, bottom_blob1, top_blob, opt);
 
     if (op_type == Operation_MAX)
-        return binary_op< binary_op_max<float> >(bottom_blob, bottom_blob1, top_blob);
+        return binary_op< binary_op_max<float> >(bottom_blob, bottom_blob1, top_blob, opt);
 
     if (op_type == Operation_MIN)
-        return binary_op< binary_op_min<float> >(bottom_blob, bottom_blob1, top_blob);
+        return binary_op< binary_op_min<float> >(bottom_blob, bottom_blob1, top_blob, opt);
 
     if (op_type == Operation_POW)
-        return binary_op< binary_op_pow<float> >(bottom_blob, bottom_blob1, top_blob);
+        return binary_op< binary_op_pow<float> >(bottom_blob, bottom_blob1, top_blob, opt);
 
     if (op_type == Operation_RSUB)
-        return binary_op< binary_op_rsub<float> >(bottom_blob, bottom_blob1, top_blob);
+        return binary_op< binary_op_rsub<float> >(bottom_blob, bottom_blob1, top_blob, opt);
 
     if (op_type == Operation_RDIV)
-        return binary_op< binary_op_rdiv<float> >(bottom_blob, bottom_blob1, top_blob);
+        return binary_op< binary_op_rdiv<float> >(bottom_blob, bottom_blob1, top_blob, opt);
 
     return 0;
 }
 
-int BinaryOp::forward_inplace(Mat& bottom_top_blob) const
+int BinaryOp::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     if (op_type == Operation_ADD)
-        return binary_op_scalar_inplace< std::plus<float> >(bottom_top_blob, b);
+        return binary_op_scalar_inplace< std::plus<float> >(bottom_top_blob, b, opt);
 
     if (op_type == Operation_SUB)
-        return binary_op_scalar_inplace< std::minus<float> >(bottom_top_blob, b);
+        return binary_op_scalar_inplace< std::minus<float> >(bottom_top_blob, b, opt);
 
     if (op_type == Operation_MUL)
-        return binary_op_scalar_inplace< std::multiplies<float> >(bottom_top_blob, b);
+        return binary_op_scalar_inplace< std::multiplies<float> >(bottom_top_blob, b, opt);
 
     if (op_type == Operation_DIV)
-        return binary_op_scalar_inplace< std::divides<float> >(bottom_top_blob, b);
+        return binary_op_scalar_inplace< std::divides<float> >(bottom_top_blob, b, opt);
 
     if (op_type == Operation_MAX)
-        return binary_op_scalar_inplace< binary_op_max<float> >(bottom_top_blob, b);
+        return binary_op_scalar_inplace< binary_op_max<float> >(bottom_top_blob, b, opt);
 
     if (op_type == Operation_MIN)
-        return binary_op_scalar_inplace< binary_op_min<float> >(bottom_top_blob, b);
+        return binary_op_scalar_inplace< binary_op_min<float> >(bottom_top_blob, b, opt);
 
     if (op_type == Operation_POW)
-        return binary_op_scalar_inplace< binary_op_pow<float> >(bottom_top_blob, b);
+        return binary_op_scalar_inplace< binary_op_pow<float> >(bottom_top_blob, b, opt);
 
     if (op_type == Operation_RSUB)
-        return binary_op_scalar_inplace< binary_op_rsub<float> >(bottom_top_blob, b);
+        return binary_op_scalar_inplace< binary_op_rsub<float> >(bottom_top_blob, b, opt);
 
     if (op_type == Operation_RDIV)
-        return binary_op_scalar_inplace< binary_op_rdiv<float> >(bottom_top_blob, b);
+        return binary_op_scalar_inplace< binary_op_rdiv<float> >(bottom_top_blob, b, opt);
 
     return 0;
 }
diff --git a/src/layer/binaryop.h b/src/layer/binaryop.h
index 8affa7c35..daf0b8d4a 100644
--- a/src/layer/binaryop.h
+++ b/src/layer/binaryop.h
@@ -26,9 +26,9 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
     enum {
         Operation_ADD   = 0,
diff --git a/src/layer/bnll.cpp b/src/layer/bnll.cpp
index f53052bd8..74c0735e8 100644
--- a/src/layer/bnll.cpp
+++ b/src/layer/bnll.cpp
@@ -25,14 +25,14 @@ BNLL::BNLL()
     support_inplace = true;
 }
 
-int BNLL::forward_inplace(Mat& bottom_top_blob) const
+int BNLL::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
     int channels = bottom_top_blob.c;
     int size = w * h;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<channels; q++)
     {
         float* ptr = bottom_top_blob.channel(q);
diff --git a/src/layer/bnll.h b/src/layer/bnll.h
index a406d9ca1..e1293aa95 100644
--- a/src/layer/bnll.h
+++ b/src/layer/bnll.h
@@ -24,7 +24,7 @@ class BNLL : public Layer
 public:
     BNLL();
 
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
 public:
 };
diff --git a/src/layer/clip.cpp b/src/layer/clip.cpp
index b56ea5880..7729b948d 100644
--- a/src/layer/clip.cpp
+++ b/src/layer/clip.cpp
@@ -34,14 +34,14 @@ int Clip::load_param(const ParamDict& pd)
     return 0;
 }
 
-int Clip::forward_inplace(Mat& bottom_top_blob) const
+int Clip::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
     int channels = bottom_top_blob.c;
     int size = w * h;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<channels; q++)
     {
         float* ptr = bottom_top_blob.channel(q);
diff --git a/src/layer/clip.h b/src/layer/clip.h
index 9be706331..608964c02 100644
--- a/src/layer/clip.h
+++ b/src/layer/clip.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
 public:
     float min;
diff --git a/src/layer/concat.cpp b/src/layer/concat.cpp
index caec01430..52e24fb2f 100644
--- a/src/layer/concat.cpp
+++ b/src/layer/concat.cpp
@@ -31,7 +31,7 @@ int Concat::load_param(const ParamDict& pd)
     return 0;
 }
 
-int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
+int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
 {
     int dims = bottom_blobs[0].dims;
     size_t elemsize = bottom_blobs[0].elemsize;
@@ -48,7 +48,7 @@ int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_
         }
 
         Mat& top_blob = top_blobs[0];
-        top_blob.create(top_w, elemsize);
+        top_blob.create(top_w, elemsize, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
@@ -82,7 +82,7 @@ int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_
         }
 
         Mat& top_blob = top_blobs[0];
-        top_blob.create(w, top_h, elemsize);
+        top_blob.create(w, top_h, elemsize, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
@@ -116,11 +116,11 @@ int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_
         }
 
         Mat& top_blob = top_blobs[0];
-        top_blob.create(top_w, h, elemsize);
+        top_blob.create(top_w, h, elemsize, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int i=0; i<h; i++)
         {
             float* outptr = top_blob.row(i);
@@ -153,7 +153,7 @@ int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_
         }
 
         Mat& top_blob = top_blobs[0];
-        top_blob.create(w, h, top_channels, elemsize);
+        top_blob.create(w, h, top_channels, elemsize, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
@@ -190,11 +190,11 @@ int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_
         }
 
         Mat& top_blob = top_blobs[0];
-        top_blob.create(w, top_h, channels, elemsize);
+        top_blob.create(w, top_h, channels, elemsize, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             float* outptr = top_blob.channel(q);
@@ -230,11 +230,11 @@ int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_
         }
 
         Mat& top_blob = top_blobs[0];
-        top_blob.create(top_w, h, channels, elemsize);
+        top_blob.create(top_w, h, channels, elemsize, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             float* outptr = top_blob.channel(q);
diff --git a/src/layer/concat.h b/src/layer/concat.h
index d50e5eef3..bf2fe77e6 100644
--- a/src/layer/concat.h
+++ b/src/layer/concat.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 
 public:
     int axis;
diff --git a/src/layer/convolution.cpp b/src/layer/convolution.cpp
index ea8073436..d4757f151 100644
--- a/src/layer/convolution.cpp
+++ b/src/layer/convolution.cpp
@@ -59,7 +59,7 @@ int Convolution::load_model(const ModelBin& mb)
     return 0;
 }
 
-int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const
+int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     // convolv with NxN kernel
     // value = value + bias
@@ -89,7 +89,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const
             op->load_model(ModelBinFromMatArray(weights));
 
             // forward
-            op->forward(bottom_blob, top_blob);
+            op->forward(bottom_blob, top_blob, opt);
 
             delete op;
 
@@ -100,6 +100,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const
     int w = bottom_blob.w;
     int h = bottom_blob.h;
     int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
 
 //     fprintf(stderr, "Convolution input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d\n", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);
 
@@ -109,7 +110,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const
     Mat bottom_blob_bordered = bottom_blob;
     if (pad_w > 0 || pad_h > 0)
     {
-        copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f);
+        copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
         if (bottom_blob_bordered.empty())
             return -100;
 
@@ -122,7 +123,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const
         int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
         if (wpad > 0 || hpad > 0)
         {
-            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f);
+            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
             if (bottom_blob_bordered.empty())
                 return -100;
         }
@@ -134,7 +135,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const
     int outw = (w - kernel_extent_w) / stride_w + 1;
     int outh = (h - kernel_extent_h) / stride_h + 1;
 
-    top_blob.create(outw, outh, num_output);
+    top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
@@ -160,7 +161,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const
     }
 
     // num_output
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p=0; p<num_output; p++)
     {
         float* outptr = top_blob.channel(p);
diff --git a/src/layer/convolution.h b/src/layer/convolution.h
index 74d954223..73c97be89 100644
--- a/src/layer/convolution.h
+++ b/src/layer/convolution.h
@@ -28,7 +28,7 @@ public:
 
     virtual int load_model(const ModelBin& mb);
 
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
 public:
     // param
diff --git a/src/layer/convolutiondepthwise.cpp b/src/layer/convolutiondepthwise.cpp
index 2eaf7fb92..88376decf 100644
--- a/src/layer/convolutiondepthwise.cpp
+++ b/src/layer/convolutiondepthwise.cpp
@@ -64,7 +64,7 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb)
     return 0;
 }
 
-int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
+int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     // convolv with NxN kernel
     // value = value + bias
@@ -72,6 +72,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
     int w = bottom_blob.w;
     int h = bottom_blob.h;
     int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
 
     if (channels % group != 0 || num_output % group != 0)
     {
@@ -87,7 +88,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
     Mat bottom_blob_bordered = bottom_blob;
     if (pad_w > 0 || pad_h > 0)
     {
-        copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f);
+        copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
         if (bottom_blob_bordered.empty())
             return -100;
 
@@ -100,7 +101,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
         int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
         if (wpad > 0 || hpad > 0)
         {
-            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f);
+            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
             if (bottom_blob_bordered.empty())
                 return -100;
         }
@@ -112,7 +113,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
     int outw = (w - kernel_extent_w) / stride_w + 1;
     int outh = (h - kernel_extent_h) / stride_h + 1;
 
-    top_blob.create(outw, outh, num_output);
+    top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
@@ -140,7 +141,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
     // depth-wise
     if (channels == group && group == num_output)
     {
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int g=0; g<group; g++)
         {
             float* outptr = top_blob.channel(g);
@@ -179,9 +180,9 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
     const int num_output_g = num_output / group;
 
 #ifdef _WIN32
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
 #else // _WIN32
-    #pragma omp parallel for collapse(2)
+    #pragma omp parallel for collapse(2) num_threads(opt.num_threads)
 #endif // _WIN32
     for (int g=0; g<group; g++)
     {
diff --git a/src/layer/convolutiondepthwise.h b/src/layer/convolutiondepthwise.h
index 7bb94172b..0327eea10 100644
--- a/src/layer/convolutiondepthwise.h
+++ b/src/layer/convolutiondepthwise.h
@@ -28,7 +28,7 @@ public:
 
     virtual int load_model(const ModelBin& mb);
 
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
 public:
     // param
diff --git a/src/layer/crop.cpp b/src/layer/crop.cpp
index 64267d077..05ab8ab78 100644
--- a/src/layer/crop.cpp
+++ b/src/layer/crop.cpp
@@ -39,7 +39,7 @@ int Crop::load_param(const ParamDict& pd)
     return 0;
 }
 
-int Crop::forward(const Mat& bottom_blob, Mat& top_blob) const
+int Crop::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     int w = bottom_blob.w;
     int h = bottom_blob.h;
@@ -56,14 +56,14 @@ int Crop::forward(const Mat& bottom_blob, Mat& top_blob) const
     int left = woffset;
     int right = w - _outw - woffset;
 
-    copy_cut_border(bottom_blob_sliced, top_blob, top, bottom, left, right);
+    copy_cut_border(bottom_blob_sliced, top_blob, top, bottom, left, right, opt.blob_allocator, opt.num_threads);
     if (top_blob.empty())
         return -100;
 
     return 0;
 }
 
-int Crop::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
+int Crop::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
 {
     const Mat& bottom_blob = bottom_blobs[0];
     const Mat& reference_blob = bottom_blobs[1];
@@ -85,7 +85,7 @@ int Crop::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
 
     Mat& top_blob = top_blobs[0];
 
-    copy_cut_border(bottom_blob_sliced, top_blob, top, bottom, left, right);
+    copy_cut_border(bottom_blob_sliced, top_blob, top, bottom, left, right, opt.blob_allocator, opt.num_threads);
     if (top_blob.empty())
         return -100;
 
diff --git a/src/layer/crop.h b/src/layer/crop.h
index 43b30defe..ee88ee076 100644
--- a/src/layer/crop.h
+++ b/src/layer/crop.h
@@ -26,9 +26,9 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
-    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 
 public:
     int woffset;
diff --git a/src/layer/deconvolution.cpp b/src/layer/deconvolution.cpp
index 5ed43a00c..506d8173e 100644
--- a/src/layer/deconvolution.cpp
+++ b/src/layer/deconvolution.cpp
@@ -57,7 +57,7 @@ int Deconvolution::load_model(const ModelBin& mb)
     return 0;
 }
 
-int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const
+int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     // backward strided convolv with NxN kernel
     // value = value + bias
@@ -65,6 +65,7 @@ int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const
     int w = bottom_blob.w;
     int h = bottom_blob.h;
     int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
 
 //     fprintf(stderr, "Deconvolution input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d\n", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);
 
@@ -74,10 +75,20 @@ int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const
     int outw = (w - 1) * stride_w + kernel_extent_w;
     int outh = (h - 1) * stride_h + kernel_extent_h;
 
-    Mat top_blob_bordered = top_blob;
-    top_blob_bordered.create(outw, outh, num_output);
-    if (top_blob_bordered.empty())
-        return -100;
+    Mat top_blob_bordered;
+    if (pad_w > 0 || pad_h > 0)
+    {
+        top_blob_bordered.create(outw, outh, num_output, elemsize, opt.workspace_allocator);
+        if (top_blob_bordered.empty())
+            return -100;
+    }
+    else
+    {
+        top_blob_bordered = top_blob;
+        top_blob_bordered.create(outw, outh, num_output, elemsize, opt.blob_allocator);
+        if (top_blob_bordered.empty())
+            return -100;
+    }
 
     const int maxk = kernel_w * kernel_h;
 
@@ -101,7 +112,7 @@ int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const
     }
 
     // num_output
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p=0; p<num_output; p++)
     {
         Mat out = top_blob_bordered.channel(p);
@@ -136,17 +147,19 @@ int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const
         }
     }
 
-    top_blob = top_blob_bordered;
-
     if (pad_w > 0 || pad_h > 0)
     {
-        copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w);
+        copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w, opt.blob_allocator, opt.num_threads);
         if (top_blob.empty())
             return -100;
 
         outw = top_blob.w;
         outh = top_blob.h;
     }
+    else
+    {
+        top_blob = top_blob_bordered;
+    }
 
     return 0;
 }
diff --git a/src/layer/deconvolution.h b/src/layer/deconvolution.h
index 1ef614d3e..10bd9e3c9 100644
--- a/src/layer/deconvolution.h
+++ b/src/layer/deconvolution.h
@@ -28,7 +28,7 @@ public:
 
     virtual int load_model(const ModelBin& mb);
 
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
 public:
     // param
diff --git a/src/layer/deconvolutiondepthwise.cpp b/src/layer/deconvolutiondepthwise.cpp
index 536f82b09..fc1b1ce26 100644
--- a/src/layer/deconvolutiondepthwise.cpp
+++ b/src/layer/deconvolutiondepthwise.cpp
@@ -58,7 +58,7 @@ int DeconvolutionDepthWise::load_model(const ModelBin& mb)
     return 0;
 }
 
-int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
+int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     // deconvolv with NxN kernel
     // value = value + bias
@@ -66,6 +66,7 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
     int w = bottom_blob.w;
     int h = bottom_blob.h;
     int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
 
     if (channels % group != 0 || num_output % group != 0)
     {
@@ -79,10 +80,20 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
     int outw = (w - 1) * stride_w + kernel_extent_w;
     int outh = (h - 1) * stride_h + kernel_extent_h;
 
-    Mat top_blob_bordered = top_blob;
-    top_blob_bordered.create(outw, outh, num_output);
-    if (top_blob_bordered.empty())
-        return -100;
+    Mat top_blob_bordered;
+    if (pad_w > 0 || pad_h > 0)
+    {
+        top_blob_bordered.create(outw, outh, num_output, elemsize, opt.workspace_allocator);
+        if (top_blob_bordered.empty())
+            return -100;
+    }
+    else
+    {
+        top_blob_bordered = top_blob;
+        top_blob_bordered.create(outw, outh, num_output, elemsize, opt.blob_allocator);
+        if (top_blob_bordered.empty())
+            return -100;
+    }
 
     const int maxk = kernel_w * kernel_h;
 
@@ -108,7 +119,7 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
     // depth-wise
     if (channels == group && group == num_output)
     {
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int g=0; g<group; g++)
         {
             const float* inptr = bottom_blob.channel(g);
@@ -141,7 +152,7 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
         const int channels_g = channels / group;
         const int num_output_g = num_output / group;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int g = 0; g < group; g++)
         {
             const float* weight_data_ptr = (const float*)weight_data + maxk * channels_g * num_output_g * g;
@@ -180,17 +191,19 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
         }
     }
 
-    top_blob = top_blob_bordered;
-
     if (pad_w > 0 || pad_h > 0)
     {
-        copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w);
+        copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w, opt.blob_allocator, opt.num_threads);
         if (top_blob.empty())
             return -100;
 
         outw = top_blob.w;
         outh = top_blob.h;
     }
+    else
+    {
+        top_blob = top_blob_bordered;
+    }
 
     return 0;
 }
diff --git a/src/layer/deconvolutiondepthwise.h b/src/layer/deconvolutiondepthwise.h
index a1a57795f..674cb2c47 100644
--- a/src/layer/deconvolutiondepthwise.h
+++ b/src/layer/deconvolutiondepthwise.h
@@ -28,7 +28,7 @@ public:
 
     virtual int load_model(const ModelBin& mb);
 
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
 public:
     // param
diff --git a/src/layer/detectionoutput.cpp b/src/layer/detectionoutput.cpp
index fc12955a7..af0976ca2 100644
--- a/src/layer/detectionoutput.cpp
+++ b/src/layer/detectionoutput.cpp
@@ -141,7 +141,7 @@ static void nms_sorted_bboxes(const std::vector<BBoxRect>& bboxes, std::vector<i
     }
 }
 
-int DetectionOutput::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
+int DetectionOutput::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
 {
     const Mat& location = bottom_blobs[0];
     const Mat& confidence = bottom_blobs[1];
@@ -151,7 +151,7 @@ int DetectionOutput::forward(const std::vector<Mat>& bottom_blobs, std::vector<M
 
     // apply location with priorbox
     Mat bboxes;
-    bboxes.create(4, num_prior);
+    bboxes.create(4, num_prior, 4u, opt.workspace_allocator);
     if (bboxes.empty())
         return -100;
 
@@ -159,7 +159,7 @@ int DetectionOutput::forward(const std::vector<Mat>& bottom_blobs, std::vector<M
     const float* priorbox_ptr = priorbox.row(0);
     const float* variance_ptr = priorbox.row(1);
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int i = 0; i < num_prior; i++)
     {
         const float* loc = location_ptr + i * 4;
@@ -192,7 +192,7 @@ int DetectionOutput::forward(const std::vector<Mat>& bottom_blobs, std::vector<M
     all_class_bbox_scores.resize(num_class);
 
     // start from 1 to ignore background class
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int i = 1; i < num_class; i++)
     {
         // filter by confidence_threshold
@@ -262,7 +262,7 @@ int DetectionOutput::forward(const std::vector<Mat>& bottom_blobs, std::vector<M
     int num_detected = bbox_rects.size();
 
     Mat& top_blob = top_blobs[0];
-    top_blob.create(6, num_detected);
+    top_blob.create(6, num_detected, 4u, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
diff --git a/src/layer/detectionoutput.h b/src/layer/detectionoutput.h
index 1573a9ede..97486220e 100644
--- a/src/layer/detectionoutput.h
+++ b/src/layer/detectionoutput.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 
 public:
     int num_class;
diff --git a/src/layer/dropout.cpp b/src/layer/dropout.cpp
index 89ca292eb..ccf79243c 100644
--- a/src/layer/dropout.cpp
+++ b/src/layer/dropout.cpp
@@ -31,7 +31,7 @@ int Dropout::load_param(const ParamDict& pd)
     return 0;
 }
 
-int Dropout::forward_inplace(Mat& bottom_top_blob) const
+int Dropout::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     if (scale == 1.f)
     {
@@ -43,7 +43,7 @@ int Dropout::forward_inplace(Mat& bottom_top_blob) const
     int channels = bottom_top_blob.c;
     int size = w * h;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<channels; q++)
     {
         float* ptr = bottom_top_blob.channel(q);
diff --git a/src/layer/dropout.h b/src/layer/dropout.h
index 078a93f48..396740abf 100644
--- a/src/layer/dropout.h
+++ b/src/layer/dropout.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
 public:
     float scale;
diff --git a/src/layer/eltwise.cpp b/src/layer/eltwise.cpp
index 243bc89bd..9126ba498 100644
--- a/src/layer/eltwise.cpp
+++ b/src/layer/eltwise.cpp
@@ -31,16 +31,17 @@ int Eltwise::load_param(const ParamDict& pd)
     return 0;
 }
 
-int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
+int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
 {
     const Mat& bottom_blob = bottom_blobs[0];
     int w = bottom_blob.w;
     int h = bottom_blob.h;
     int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
     int size = w * h;
 
     Mat& top_blob = top_blobs[0];
-    top_blob.create(w, h, channels);
+    top_blob.create(w, h, channels, elemsize, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
@@ -48,7 +49,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
     {
         // first blob
         const Mat& bottom_blob1 = bottom_blobs[1];
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             const float* ptr = bottom_blob.channel(q);
@@ -64,7 +65,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
         for (size_t b=2; b<bottom_blobs.size(); b++)
         {
             const Mat& bottom_blob1 = bottom_blobs[b];
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int q=0; q<channels; q++)
             {
                 const float* ptr = bottom_blob1.channel(q);
@@ -83,7 +84,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
         {
             // first blob
             const Mat& bottom_blob1 = bottom_blobs[1];
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int q=0; q<channels; q++)
             {
                 const float* ptr = bottom_blob.channel(q);
@@ -99,7 +100,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
             for (size_t b=2; b<bottom_blobs.size(); b++)
             {
                 const Mat& bottom_blob1 = bottom_blobs[b];
-                #pragma omp parallel for
+                #pragma omp parallel for num_threads(opt.num_threads)
                 for (int q=0; q<channels; q++)
                 {
                     const float* ptr = bottom_blob1.channel(q);
@@ -118,7 +119,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
             const Mat& bottom_blob1 = bottom_blobs[1];
             float coeff0 = coeffs[0];
             float coeff1 = coeffs[1];
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int q=0; q<channels; q++)
             {
                 const float* ptr = bottom_blob.channel(q);
@@ -135,7 +136,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
             {
                 const Mat& bottom_blob1 = bottom_blobs[b];
                 float coeff = coeffs[b];
-                #pragma omp parallel for
+                #pragma omp parallel for num_threads(opt.num_threads)
                 for (int q=0; q<channels; q++)
                 {
                     const float* ptr = bottom_blob1.channel(q);
@@ -153,7 +154,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
     {
         // first blob
         const Mat& bottom_blob1 = bottom_blobs[1];
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             const float* ptr = bottom_blob.channel(q);
@@ -169,7 +170,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
         for (size_t b=2; b<bottom_blobs.size(); b++)
         {
             const Mat& bottom_blob1 = bottom_blobs[b];
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int q=0; q<channels; q++)
             {
                 const float* ptr = bottom_blob1.channel(q);
diff --git a/src/layer/eltwise.h b/src/layer/eltwise.h
index 4ef4266b9..6b8a8effc 100644
--- a/src/layer/eltwise.h
+++ b/src/layer/eltwise.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 
     enum { Operation_PROD = 0, Operation_SUM = 1, Operation_MAX = 2 };
 
diff --git a/src/layer/elu.cpp b/src/layer/elu.cpp
index dd5780a2d..bbd1679a5 100644
--- a/src/layer/elu.cpp
+++ b/src/layer/elu.cpp
@@ -32,14 +32,14 @@ int ELU::load_param(const ParamDict& pd)
     return 0;
 }
 
-int ELU::forward_inplace(Mat& bottom_top_blob) const
+int ELU::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
     int channels = bottom_top_blob.c;
     int size = w * h;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<channels; q++)
     {
         float* ptr = bottom_top_blob.channel(q);
diff --git a/src/layer/elu.h b/src/layer/elu.h
index 45dc67ca8..8d44124cb 100644
--- a/src/layer/elu.h
+++ b/src/layer/elu.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
 public:
     float alpha;
diff --git a/src/layer/embed.cpp b/src/layer/embed.cpp
index 26095d5c8..08ef0175d 100644
--- a/src/layer/embed.cpp
+++ b/src/layer/embed.cpp
@@ -51,16 +51,16 @@ int Embed::load_model(const ModelBin& mb)
     return 0;
 }
 
-int Embed::forward(const Mat& bottom_blob, Mat& top_blob) const
+int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     int words = bottom_blob.total();
 
-    top_blob.create(num_output, words);
+    top_blob.create(num_output, words, 4u, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
     // num_output
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<words; q++)
     {
         float* outptr = top_blob.row(q);
diff --git a/src/layer/embed.h b/src/layer/embed.h
index 58920294e..8e2366567 100644
--- a/src/layer/embed.h
+++ b/src/layer/embed.h
@@ -28,7 +28,7 @@ public:
 
     virtual int load_model(const ModelBin& mb);
 
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
 public:
     // param
diff --git a/src/layer/exp.cpp b/src/layer/exp.cpp
index 0dd533045..ee5a27752 100644
--- a/src/layer/exp.cpp
+++ b/src/layer/exp.cpp
@@ -34,7 +34,7 @@ int Exp::load_param(const ParamDict& pd)
     return 0;
 }
 
-int Exp::forward_inplace(Mat& bottom_top_blob) const
+int Exp::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
@@ -43,7 +43,7 @@ int Exp::forward_inplace(Mat& bottom_top_blob) const
 
     if (base == -1.f)
     {
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             float* ptr = bottom_top_blob.channel(q);
@@ -56,7 +56,7 @@ int Exp::forward_inplace(Mat& bottom_top_blob) const
     }
     else
     {
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             float* ptr = bottom_top_blob.channel(q);
diff --git a/src/layer/exp.h b/src/layer/exp.h
index eb6c52c68..df1c53f89 100644
--- a/src/layer/exp.h
+++ b/src/layer/exp.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
 public:
     float base;
diff --git a/src/layer/expanddims.cpp b/src/layer/expanddims.cpp
index d65e24ba3..9daecda6a 100644
--- a/src/layer/expanddims.cpp
+++ b/src/layer/expanddims.cpp
@@ -33,7 +33,7 @@ int ExpandDims::load_param(const ParamDict& pd)
     return 0;
 }
 
-int ExpandDims::forward(const Mat& bottom_blob, Mat& top_blob) const
+int ExpandDims::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     int w = bottom_blob.w;
     int h = bottom_blob.h;
@@ -46,28 +46,28 @@ int ExpandDims::forward(const Mat& bottom_blob, Mat& top_blob) const
         if (expand_w)
         {
             if (expand_h)
-                top_blob = bottom_blob.reshape(1, 1, w);
+                top_blob = bottom_blob.reshape(1, 1, w, opt.blob_allocator);
             else if (expand_c)
-                top_blob = bottom_blob.reshape(1, w, 1);
+                top_blob = bottom_blob.reshape(1, w, 1, opt.blob_allocator);
             else
-                top_blob = bottom_blob.reshape(1, w);
+                top_blob = bottom_blob.reshape(1, w, opt.blob_allocator);
         }
         else if (expand_h)
         {
             if (expand_c)
-                top_blob = bottom_blob.reshape(w, 1, 1);
+                top_blob = bottom_blob.reshape(w, 1, 1, opt.blob_allocator);
             else
-                top_blob = bottom_blob.reshape(w, 1);
+                top_blob = bottom_blob.reshape(w, 1, opt.blob_allocator);
         }
     }
     else if (dims == 2)
     {
         if (expand_w)
-            top_blob = bottom_blob.reshape(1, w, h);
+            top_blob = bottom_blob.reshape(1, w, h, opt.blob_allocator);
         else if (expand_h)
-            top_blob = bottom_blob.reshape(w, 1, h);
+            top_blob = bottom_blob.reshape(w, 1, h, opt.blob_allocator);
         else if (expand_c)
-            top_blob = bottom_blob.reshape(w, h, 1);
+            top_blob = bottom_blob.reshape(w, h, 1, opt.blob_allocator);
     }
 
     if (top_blob.empty())
diff --git a/src/layer/expanddims.h b/src/layer/expanddims.h
index aedadba42..c898f2ae7 100644
--- a/src/layer/expanddims.h
+++ b/src/layer/expanddims.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
 public:
     int expand_w;
diff --git a/src/layer/flatten.cpp b/src/layer/flatten.cpp
index 6c7994b09..8c9806cc4 100644
--- a/src/layer/flatten.cpp
+++ b/src/layer/flatten.cpp
@@ -24,18 +24,19 @@ Flatten::Flatten()
     support_inplace = false;
 }
 
-int Flatten::forward(const Mat& bottom_blob, Mat& top_blob) const
+int Flatten::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     int w = bottom_blob.w;
     int h = bottom_blob.h;
     int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
     int size = w * h;
 
-    top_blob.create(size * channels);
+    top_blob.create(size * channels, elemsize, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<channels; q++)
     {
         const float* ptr = bottom_blob.channel(q);
diff --git a/src/layer/flatten.h b/src/layer/flatten.h
index 10c5fe260..22667a45a 100644
--- a/src/layer/flatten.h
+++ b/src/layer/flatten.h
@@ -24,7 +24,7 @@ class Flatten : public Layer
 public:
     Flatten();
 
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 };
 
 } // namespace ncnn
diff --git a/src/layer/innerproduct.cpp b/src/layer/innerproduct.cpp
index 91d8b4049..17adb28e2 100644
--- a/src/layer/innerproduct.cpp
+++ b/src/layer/innerproduct.cpp
@@ -49,19 +49,20 @@ int InnerProduct::load_model(const ModelBin& mb)
     return 0;
 }
 
-int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob) const
+int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     int w = bottom_blob.w;
     int h = bottom_blob.h;
     int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
     int size = w * h;
 
-    top_blob.create(num_output);
+    top_blob.create(num_output, elemsize, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
     // num_output
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p=0; p<num_output; p++)
     {
         float sum = 0.f;
diff --git a/src/layer/innerproduct.h b/src/layer/innerproduct.h
index c9cde317d..e6c1d7221 100644
--- a/src/layer/innerproduct.h
+++ b/src/layer/innerproduct.h
@@ -28,7 +28,7 @@ public:
 
     virtual int load_model(const ModelBin& mb);
 
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
 public:
     // param
diff --git a/src/layer/input.cpp b/src/layer/input.cpp
index a7d6a35e7..ea9acb151 100644
--- a/src/layer/input.cpp
+++ b/src/layer/input.cpp
@@ -33,7 +33,7 @@ int Input::load_param(const ParamDict& pd)
     return 0;
 }
 
-int Input::forward_inplace(Mat& /*bottom_top_blob*/) const
+int Input::forward_inplace(Mat& /*bottom_top_blob*/, const Option& /*opt*/) const
 {
     return 0;
 }
diff --git a/src/layer/input.h b/src/layer/input.h
index 8af67acc9..2d12d54d6 100644
--- a/src/layer/input.h
+++ b/src/layer/input.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
 public:
     int w;
diff --git a/src/layer/instancenorm.cpp b/src/layer/instancenorm.cpp
index eba3a97c1..2bd30b513 100644
--- a/src/layer/instancenorm.cpp
+++ b/src/layer/instancenorm.cpp
@@ -46,7 +46,7 @@ int InstanceNorm::load_model(const ModelBin& mb)
     return 0;
 }
 
-int InstanceNorm::forward_inplace(Mat& bottom_top_blob) const
+int InstanceNorm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     // x = (x - mean) / (sqrt(var) + eps) * gamma + beta
 
@@ -54,7 +54,7 @@ int InstanceNorm::forward_inplace(Mat& bottom_top_blob) const
     int h = bottom_top_blob.h;
     int size = w * h;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<channels; q++)
     {
         float* ptr = bottom_top_blob.channel(q);
diff --git a/src/layer/instancenorm.h b/src/layer/instancenorm.h
index 4866eef68..ce6268ac6 100644
--- a/src/layer/instancenorm.h
+++ b/src/layer/instancenorm.h
@@ -28,7 +28,7 @@ public:
 
     virtual int load_model(const ModelBin& mb);
 
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
 public:
     // param
diff --git a/src/layer/interp.cpp b/src/layer/interp.cpp
index fb2216536..623f6772f 100644
--- a/src/layer/interp.cpp
+++ b/src/layer/interp.cpp
@@ -35,11 +35,13 @@ int Interp::load_param(const ParamDict& pd)
     return 0;
 }
 
-int Interp::forward(const Mat &bottom_blob, Mat &top_blob) const
+int Interp::forward(const Mat &bottom_blob, Mat &top_blob, const Option& opt) const
 {
     int h = bottom_blob.h;
     int w = bottom_blob.w;
     int c = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
+
     int oh = output_height;
     int ow = output_width;
     if (bottom_blob.dims == 1)
@@ -58,13 +60,13 @@ int Interp::forward(const Mat &bottom_blob, Mat &top_blob) const
         top_blob = bottom_blob;
         return 0;
     }
-    top_blob.create(ow, oh, c);
+    top_blob.create(ow, oh, c, elemsize, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
     if (bottom_blob.dims == 1)
     {
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q < c; ++q)
         {
             Mat top_blob_c = top_blob.channel(q);
@@ -76,7 +78,7 @@ int Interp::forward(const Mat &bottom_blob, Mat &top_blob) const
 
     if (resize_type == 1)//nearest
     {
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q < c; ++q)
         {
             const float *ptr = bottom_blob.channel(q);
diff --git a/src/layer/interp.h b/src/layer/interp.h
index 813d5fa0d..f14a7dd99 100644
--- a/src/layer/interp.h
+++ b/src/layer/interp.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward(const Mat &bottom_blob, Mat &top_blob) const;
+    virtual int forward(const Mat &bottom_blob, Mat &top_blob, const Option& opt) const;
 
 public:
     // param
diff --git a/src/layer/log.cpp b/src/layer/log.cpp
index 4ff66ee84..2b47ad1b9 100644
--- a/src/layer/log.cpp
+++ b/src/layer/log.cpp
@@ -34,7 +34,7 @@ int Log::load_param(const ParamDict& pd)
     return 0;
 }
 
-int Log::forward_inplace(Mat& bottom_top_blob) const
+int Log::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
@@ -43,7 +43,7 @@ int Log::forward_inplace(Mat& bottom_top_blob) const
 
     if (base == -1.f)
     {
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             float* ptr = bottom_top_blob.channel(q);
@@ -58,7 +58,7 @@ int Log::forward_inplace(Mat& bottom_top_blob) const
     {
         float log_base_inv = 1.f / log(base);
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             float* ptr = bottom_top_blob.channel(q);
diff --git a/src/layer/log.h b/src/layer/log.h
index c75181286..185a14076 100644
--- a/src/layer/log.h
+++ b/src/layer/log.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
 public:
     float base;
diff --git a/src/layer/lrn.cpp b/src/layer/lrn.cpp
index e8edb40ba..3e8cf0501 100644
--- a/src/layer/lrn.cpp
+++ b/src/layer/lrn.cpp
@@ -36,20 +36,21 @@ int LRN::load_param(const ParamDict& pd)
     return 0;
 }
 
-int LRN::forward_inplace(Mat& bottom_top_blob) const
+int LRN::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
     int channels = bottom_top_blob.c;
+    size_t elemsize = bottom_top_blob.elemsize;
     int size = w * h;
 
     // squared values with local_size padding
     Mat square_blob;
-    square_blob.create(w, h, channels);
+    square_blob.create(w, h, channels, elemsize, opt.workspace_allocator);
     if (square_blob.empty())
         return -100;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<channels; q++)
     {
         const float* ptr = bottom_top_blob.channel(q);
@@ -64,14 +65,14 @@ int LRN::forward_inplace(Mat& bottom_top_blob) const
     if (region_type == NormRegion_ACROSS_CHANNELS)
     {
         Mat square_sum;
-        square_sum.create(w, h, channels);
+        square_sum.create(w, h, channels, elemsize, opt.workspace_allocator);
         if (square_sum.empty())
             return -100;
         square_sum.fill(0.f);
 
         const float alpha_div_size = alpha / local_size;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             // square sum
@@ -104,7 +105,7 @@ int LRN::forward_inplace(Mat& bottom_top_blob) const
         int pad = local_size / 2;
         if (pad > 0)
         {
-            copy_make_border(square_blob, square_blob_bordered, pad, local_size - pad - 1, pad, local_size - pad - 1, BORDER_CONSTANT, 0.f);
+            copy_make_border(square_blob, square_blob_bordered, pad, local_size - pad - 1, pad, local_size - pad - 1, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
             if (square_blob_bordered.empty())
                 return -100;
 
@@ -135,7 +136,7 @@ int LRN::forward_inplace(Mat& bottom_top_blob) const
             }
         }
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             float* ptr = bottom_top_blob.channel(q);
diff --git a/src/layer/lrn.h b/src/layer/lrn.h
index 24f7b23ca..520a24117 100644
--- a/src/layer/lrn.h
+++ b/src/layer/lrn.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
     enum { NormRegion_ACROSS_CHANNELS = 0, NormRegion_WITHIN_CHANNEL = 1 };
 
diff --git a/src/layer/lstm.cpp b/src/layer/lstm.cpp
index 5798e2e0d..934424420 100644
--- a/src/layer/lstm.cpp
+++ b/src/layer/lstm.cpp
@@ -35,52 +35,53 @@ int LSTM::load_param(const ParamDict& pd)
 
 int LSTM::load_model(const ModelBin& mb)
 {
-    int size = weight_data_size / 2 / num_output / 4;
+    int size = weight_data_size / num_output / 4;
 
     // raw weight data
-    weight_hc_data = mb.load(size * 4, num_output, 1);
+    weight_hc_data = mb.load(size, num_output * 4, 0);
     if (weight_hc_data.empty())
         return -100;
 
-    weight_xc_data = mb.load(size * 4, num_output, 1);
+    weight_xc_data = mb.load(size, num_output * 4, 0);
     if (weight_xc_data.empty())
         return -100;
 
-    bias_c_data = mb.load(4, num_output, 1);
+    bias_c_data = mb.load(4, num_output, 0);
     if (bias_c_data.empty())
         return -100;
 
     return 0;
 }
 
-int LSTM::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
+int LSTM::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
 {
-    // size x 1 x T
+    // size x T
     const Mat& input_blob = bottom_blobs[0];
+    size_t elemsize = input_blob.elemsize;
 
     // T, 0 or 1 each
     const Mat& cont_blob = bottom_blobs[1];
 
-    int T = input_blob.c;
+    int T = input_blob.h;
     int size = input_blob.w;
 
     // initial hidden state
-    Mat hidden(num_output);
+    Mat hidden(num_output, 4u, opt.workspace_allocator);
     if (hidden.empty())
         return -100;
     hidden.fill(0.f);
 
     // internal cell state
-    Mat cell(num_output);
+    Mat cell(num_output, 4u, opt.workspace_allocator);
     if (cell.empty())
         return -100;
     // 4 x num_output
-    Mat gates(4, num_output);
+    Mat gates(4, num_output, 4u, opt.workspace_allocator);
     if (gates.empty())
         return -100;
 
     Mat& top_blob = top_blobs[0];
-    top_blob.create(num_output, 1, T);
+    top_blob.create(num_output, T, elemsize, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
@@ -93,14 +94,12 @@ int LSTM::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
         //                0       otherwise
         // calculate hidden
         // gate_input_t := W_hc * h_conted_{t-1} + W_xc * x_t + b_c
-        const float cont = cont_blob[t];
-        const Mat x = input_blob.channel(t);
-        float* hidden_data = hidden;
+        const int cont = ((const int*)cont_blob)[t];
+        const float* x = input_blob.row(t);
         for (int q=0; q<num_output; q++)
         {
-            float h_cont = cont ? hidden_data[q] : 0.f;
+            float h_cont = cont ? hidden[q] : 0.f;
 
-            const float* x_data = x;
             const float* bias_c_data_ptr = (const float*)bias_c_data + 4 * q;
             float* gates_data = (float*)gates + 4 * q;
 
@@ -120,10 +119,10 @@ int LSTM::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
             float G = bias_c_data_ptr[3];
             for (int i=0; i<size; i++)
             {
-                I += weight_hc_data_I[i] * h_cont + weight_xc_data_I[i] * x_data[i];
-                F += weight_hc_data_F[i] * h_cont + weight_xc_data_F[i] * x_data[i];
-                O += weight_hc_data_O[i] * h_cont + weight_xc_data_O[i] * x_data[i];
-                G += weight_hc_data_G[i] * h_cont + weight_xc_data_G[i] * x_data[i];
+                I += weight_hc_data_I[i] * h_cont + weight_xc_data_I[i] * x[i];
+                F += weight_hc_data_F[i] * h_cont + weight_xc_data_F[i] * x[i];
+                O += weight_hc_data_O[i] * h_cont + weight_xc_data_O[i] * x[i];
+                G += weight_hc_data_G[i] * h_cont + weight_xc_data_G[i] * x[i];
             }
 
             gates_data[0] = I;
@@ -139,9 +138,7 @@ int LSTM::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
         // tanh(G)
         // c_t := f_t .* c_{t-1} + i_t .* g_t
         // h_t := o_t .* tanh[c_t]
-        float* cell_data = cell;
-        Mat output = top_blob.channel(t);
-        float* output_data = output;
+        float* output_data = top_blob.row(t);
         for (int q=0; q<num_output; q++)
         {
             float* gates_data = (float*)gates + 4 * q;
@@ -152,15 +149,15 @@ int LSTM::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
             float G = gates_data[3];
 
             I = 1.f / (1.f + exp(-I));
-            F = cont ? 0.f : 1.f / (1.f + exp(-F));
+            F = cont ? 1.f / (1.f + exp(-F)) : 0.f;
             O = 1.f / (1.f + exp(-O));
             G = tanh(G);
 
-            float cell = F * cell_data[q] + I * G;
-            float H = O * tanh(cell);
+            float cell2 = F * cell[q] + I * G;
+            float H = O * tanh(cell2);
 
-            cell_data[q] = cell;
-            hidden_data[q] = H;
+            cell[q] = cell2;
+            hidden[q] = H;
             output_data[q] = H;
         }
 
diff --git a/src/layer/lstm.h b/src/layer/lstm.h
index 1a9ec4d04..d975745ac 100644
--- a/src/layer/lstm.h
+++ b/src/layer/lstm.h
@@ -28,7 +28,7 @@ public:
 
     virtual int load_model(const ModelBin& mb);
 
-    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 
 public:
     // param
diff --git a/src/layer/memorydata.cpp b/src/layer/memorydata.cpp
index d7c2cf214..a3b4c6957 100644
--- a/src/layer/memorydata.cpp
+++ b/src/layer/memorydata.cpp
@@ -57,11 +57,11 @@ int MemoryData::load_model(const ModelBin& mb)
     return 0;
 }
 
-int MemoryData::forward(const std::vector<Mat>& /*bottom_blobs*/, std::vector<Mat>& top_blobs) const
+int MemoryData::forward(const std::vector<Mat>& /*bottom_blobs*/, std::vector<Mat>& top_blobs, const Option& opt) const
 {
     Mat& top_blob = top_blobs[0];
 
-    top_blob = data.clone();
+    top_blob = data.clone(opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
diff --git a/src/layer/memorydata.h b/src/layer/memorydata.h
index e91c49da2..3b8081acb 100644
--- a/src/layer/memorydata.h
+++ b/src/layer/memorydata.h
@@ -28,7 +28,7 @@ public:
 
     virtual int load_model(const ModelBin& mb);
 
-    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 
 public:
     int w;
diff --git a/src/layer/mvn.cpp b/src/layer/mvn.cpp
index 825eab9b3..0cec40301 100644
--- a/src/layer/mvn.cpp
+++ b/src/layer/mvn.cpp
@@ -34,23 +34,24 @@ int MVN::load_param(const ParamDict& pd)
     return 0;
 }
 
-int MVN::forward(const Mat& bottom_blob, Mat& top_blob) const
+int MVN::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     int w = bottom_blob.w;
     int h = bottom_blob.h;
     int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
     int size = w * h;
 
-    top_blob.create(w, h, channels);
+    top_blob.create(w, h, channels, elemsize, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
     // prepare sum per channel
-    Mat sum(channels);
+    Mat sum(channels, elemsize, opt.workspace_allocator);
     if (sum.empty())
         return -100;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<channels; q++)
     {
         const float* ptr = bottom_blob.channel(q);
@@ -75,7 +76,7 @@ int MVN::forward(const Mat& bottom_blob, Mat& top_blob) const
         mean = mean / (channels * size);
 
         // subtract mean
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             const float* ptr = bottom_blob.channel(q);
@@ -90,7 +91,7 @@ int MVN::forward(const Mat& bottom_blob, Mat& top_blob) const
     else
     {
         // subtract mean
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             const float* ptr = bottom_blob.channel(q);
@@ -107,11 +108,11 @@ int MVN::forward(const Mat& bottom_blob, Mat& top_blob) const
     if (normalize_variance)
     {
         // prepare squared sum per channel
-        Mat sqsum(channels);
+        Mat sqsum(channels, elemsize, opt.workspace_allocator);
         if (sqsum.empty())
             return -100;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             const float* ptr = top_blob.channel(q);
@@ -140,7 +141,7 @@ int MVN::forward(const Mat& bottom_blob, Mat& top_blob) const
             float norm_var_inv = 1.f / norm_var;
 
             // apply normalize_variance
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int q=0; q<channels; q++)
             {
                 float* outptr = top_blob.channel(q);
@@ -154,7 +155,7 @@ int MVN::forward(const Mat& bottom_blob, Mat& top_blob) const
         else
         {
             // apply normalize_variance
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int q=0; q<channels; q++)
             {
                 float* outptr = top_blob.channel(q);
diff --git a/src/layer/mvn.h b/src/layer/mvn.h
index eec72012c..90fb24608 100644
--- a/src/layer/mvn.h
+++ b/src/layer/mvn.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
 public:
     int normalize_variance;
diff --git a/src/layer/normalize.cpp b/src/layer/normalize.cpp
index f625a59cf..99a35c39d 100644
--- a/src/layer/normalize.cpp
+++ b/src/layer/normalize.cpp
@@ -45,14 +45,15 @@ int Normalize::load_model(const ModelBin& mb)
     return 0;
 }
 
-int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const
+int Normalize::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     int w = bottom_blob.w;
     int h = bottom_blob.h;
     int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
     int size = w * h;
 
-    top_blob.create(w, h, channels);
+    top_blob.create(w, h, channels, elemsize, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
@@ -60,11 +61,11 @@ int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const
     {
         // square
         Mat square_sum_blob;
-        square_sum_blob.create(channels);
+        square_sum_blob.create(channels, elemsize, opt.workspace_allocator);
         if (square_sum_blob.empty())
             return -100;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             const float* ptr = bottom_blob.channel(q);
@@ -92,7 +93,7 @@ int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const
         {
             float scale = a * scale_data[0];
 
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int q=0; q<channels; q++)
             {
                 const float* ptr = bottom_blob.channel(q);
@@ -106,7 +107,7 @@ int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const
         }
         else
         {
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int q=0; q<channels; q++)
             {
                 const float* ptr = bottom_blob.channel(q);
@@ -125,7 +126,7 @@ int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const
 
     if (across_spatial && !across_channel)
     {
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             const float* ptr = bottom_blob.channel(q);
@@ -153,7 +154,7 @@ int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const
     {
         // square sum, 1 / sqrt(ssum)
         Mat square_sum_blob;
-        square_sum_blob.create(size);
+        square_sum_blob.create(size, elemsize, opt.workspace_allocator);
         if (square_sum_blob.empty())
             return -100;
 
@@ -161,7 +162,7 @@ int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const
         {
             float scale = scale_data[0];
 
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int i=0; i<size; i++)
             {
                 float ssum = eps;
@@ -174,7 +175,7 @@ int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const
                 square_sum_blob[i] = 1.f / sqrt(ssum) * scale;
             }
 
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int q=0; q<channels; q++)
             {
                 const float* ptr = bottom_blob.channel(q);
@@ -188,7 +189,7 @@ int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const
         }
         else
         {
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int i=0; i<size; i++)
             {
                 float ssum = eps;
@@ -201,7 +202,7 @@ int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const
                 square_sum_blob[i] = 1.f / sqrt(ssum);
             }
 
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int q=0; q<channels; q++)
             {
                 const float* ptr = bottom_blob.channel(q);
diff --git a/src/layer/normalize.h b/src/layer/normalize.h
index 42e066ca3..8f0fefedb 100644
--- a/src/layer/normalize.h
+++ b/src/layer/normalize.h
@@ -28,7 +28,7 @@ public:
 
     virtual int load_model(const ModelBin& mb);
 
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
 public:
     // param
diff --git a/src/layer/padding.cpp b/src/layer/padding.cpp
index 5f1b43351..e1e066618 100644
--- a/src/layer/padding.cpp
+++ b/src/layer/padding.cpp
@@ -36,9 +36,9 @@ int Padding::load_param(const ParamDict& pd)
     return 0;
 }
 
-int Padding::forward(const Mat& bottom_blob, Mat& top_blob) const
+int Padding::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
-    copy_make_border(bottom_blob, top_blob, top, bottom, left, right, type, value);
+    copy_make_border(bottom_blob, top_blob, top, bottom, left, right, type, value, opt.blob_allocator, opt.num_threads);
 
     if (top_blob.empty())
         return -100;
diff --git a/src/layer/padding.h b/src/layer/padding.h
index 58b24d53b..44ccbe2bb 100644
--- a/src/layer/padding.h
+++ b/src/layer/padding.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
 public:
     int top;
diff --git a/src/layer/permute.cpp b/src/layer/permute.cpp
index 962db320c..34ddf6edc 100644
--- a/src/layer/permute.cpp
+++ b/src/layer/permute.cpp
@@ -31,11 +31,12 @@ int Permute::load_param(const ParamDict& pd)
     return 0;
 }
 
-int Permute::forward(const Mat& bottom_blob, Mat& top_blob) const
+int Permute::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     int w = bottom_blob.w;
     int h = bottom_blob.h;
     int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
 
     // order_type
     // 0 = w h c
@@ -51,11 +52,11 @@ int Permute::forward(const Mat& bottom_blob, Mat& top_blob) const
     }
     else if (order_type == 1)
     {
-        top_blob.create(h, w, channels);
+        top_blob.create(h, w, channels, elemsize, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             const float* ptr = bottom_blob.channel(q);
@@ -72,11 +73,11 @@ int Permute::forward(const Mat& bottom_blob, Mat& top_blob) const
     }
     else if (order_type == 2)
     {
-        top_blob.create(w, channels, h);
+        top_blob.create(w, channels, h, elemsize, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<h; q++)
         {
             float* outptr = top_blob.channel(q);
@@ -94,11 +95,11 @@ int Permute::forward(const Mat& bottom_blob, Mat& top_blob) const
     }
     else if (order_type == 3)
     {
-        top_blob.create(channels, w, h);
+        top_blob.create(channels, w, h, elemsize, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<h; q++)
         {
             float* outptr = top_blob.channel(q);
@@ -116,11 +117,11 @@ int Permute::forward(const Mat& bottom_blob, Mat& top_blob) const
     }
     else if (order_type == 4)
     {
-        top_blob.create(h, channels, w);
+        top_blob.create(h, channels, w, elemsize, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<w; q++)
         {
             float* outptr = top_blob.channel(q);
@@ -138,11 +139,11 @@ int Permute::forward(const Mat& bottom_blob, Mat& top_blob) const
     }
     else if (order_type == 5)
     {
-        top_blob.create(channels, h, w);
+        top_blob.create(channels, h, w, elemsize, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<w; q++)
         {
             float* outptr = top_blob.channel(q);
diff --git a/src/layer/permute.h b/src/layer/permute.h
index 23a3bf83d..963aa5f20 100644
--- a/src/layer/permute.h
+++ b/src/layer/permute.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
 public:
     int order_type;
diff --git a/src/layer/pooling.cpp b/src/layer/pooling.cpp
index 22702f17d..0b86b19f4 100644
--- a/src/layer/pooling.cpp
+++ b/src/layer/pooling.cpp
@@ -43,7 +43,7 @@ int Pooling::load_param(const ParamDict& pd)
     return 0;
 }
 
-int Pooling::forward(const Mat& bottom_blob, Mat& top_blob) const
+int Pooling::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     // max value in NxN window
     // avg value in NxN window
@@ -51,11 +51,12 @@ int Pooling::forward(const Mat& bottom_blob, Mat& top_blob) const
     int w = bottom_blob.w;
     int h = bottom_blob.h;
     int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
 
 //     fprintf(stderr, "Pooling     input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d\n", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);
     if (global_pooling)
     {
-        top_blob.create(channels);
+        top_blob.create(channels, elemsize, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
@@ -63,7 +64,7 @@ int Pooling::forward(const Mat& bottom_blob, Mat& top_blob) const
 
         if (pooling_type == PoolMethod_MAX)
         {
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int q=0; q<channels; q++)
             {
                 const float* ptr = bottom_blob.channel(q);
@@ -79,7 +80,7 @@ int Pooling::forward(const Mat& bottom_blob, Mat& top_blob) const
         }
         else if (pooling_type == PoolMethod_AVE)
         {
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int q=0; q<channels; q++)
             {
                 const float* ptr = bottom_blob.channel(q);
@@ -122,7 +123,7 @@ int Pooling::forward(const Mat& bottom_blob, Mat& top_blob) const
         if (htail != 0)
             htailpad = stride_h - htail;
 
-        copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom + htailpad, pad_left, pad_right + wtailpad, BORDER_CONSTANT, pad_value);
+        copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom + htailpad, pad_left, pad_right + wtailpad, BORDER_CONSTANT, pad_value, opt.workspace_allocator, opt.num_threads);
         if (bottom_blob_bordered.empty())
             return -100;
 
@@ -131,7 +132,7 @@ int Pooling::forward(const Mat& bottom_blob, Mat& top_blob) const
     }
     else if (pad_mode == 1) // valid padding
     {
-        copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom, pad_left, pad_right, BORDER_CONSTANT, pad_value);
+        copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom, pad_left, pad_right, BORDER_CONSTANT, pad_value, opt.workspace_allocator, opt.num_threads);
         if (bottom_blob_bordered.empty())
             return -100;
 
@@ -144,7 +145,7 @@ int Pooling::forward(const Mat& bottom_blob, Mat& top_blob) const
         int hpad = kernel_h + (h - 1) / stride_h * stride_h - h;
         if (wpad > 0 || hpad > 0)
         {
-            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, pad_value);
+            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, pad_value, opt.workspace_allocator, opt.num_threads);
             if (bottom_blob_bordered.empty())
                 return -100;
         }
@@ -156,7 +157,7 @@ int Pooling::forward(const Mat& bottom_blob, Mat& top_blob) const
     int outw = (w - kernel_w) / stride_w + 1;
     int outh = (h - kernel_h) / stride_h + 1;
 
-    top_blob.create(outw, outh, channels);
+    top_blob.create(outw, outh, channels, elemsize, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
@@ -183,7 +184,7 @@ int Pooling::forward(const Mat& bottom_blob, Mat& top_blob) const
 
     if (pooling_type == PoolMethod_MAX)
     {
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             const Mat m = bottom_blob_bordered.channel(q);
@@ -212,7 +213,7 @@ int Pooling::forward(const Mat& bottom_blob, Mat& top_blob) const
     }
     else if (pooling_type == PoolMethod_AVE)
     {
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             const Mat m = bottom_blob_bordered.channel(q);
diff --git a/src/layer/pooling.h b/src/layer/pooling.h
index ee1ff3d83..01974be60 100644
--- a/src/layer/pooling.h
+++ b/src/layer/pooling.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
     enum { PoolMethod_MAX = 0, PoolMethod_AVE = 1 };
 
diff --git a/src/layer/power.cpp b/src/layer/power.cpp
index d68d3adb5..bc2e04ca3 100644
--- a/src/layer/power.cpp
+++ b/src/layer/power.cpp
@@ -34,14 +34,14 @@ int Power::load_param(const ParamDict& pd)
     return 0;
 }
 
-int Power::forward_inplace(Mat& bottom_top_blob) const
+int Power::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
     int channels = bottom_top_blob.c;
     int size = w * h;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<channels; q++)
     {
         float* ptr = bottom_top_blob.channel(q);
diff --git a/src/layer/power.h b/src/layer/power.h
index 044c77974..8a54cc637 100644
--- a/src/layer/power.h
+++ b/src/layer/power.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
 public:
     float power;
diff --git a/src/layer/prelu.cpp b/src/layer/prelu.cpp
index df2736f37..5a581578c 100644
--- a/src/layer/prelu.cpp
+++ b/src/layer/prelu.cpp
@@ -40,7 +40,7 @@ int PReLU::load_model(const ModelBin& mb)
     return 0;
 }
 
-int PReLU::forward_inplace(Mat& bottom_top_blob) const
+int PReLU::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     int dims = bottom_top_blob.dims;
 
@@ -52,7 +52,7 @@ int PReLU::forward_inplace(Mat& bottom_top_blob) const
 
         if (num_slope > 1)
         {
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int i=0; i<w; i++)
             {
                 if (ptr[i] < 0)
@@ -63,7 +63,7 @@ int PReLU::forward_inplace(Mat& bottom_top_blob) const
         {
             float slope = slope_data[0];
 
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int i=0; i<w; i++)
             {
                 if (ptr[i] < 0)
@@ -77,7 +77,7 @@ int PReLU::forward_inplace(Mat& bottom_top_blob) const
         int w = bottom_top_blob.w;
         int h = bottom_top_blob.h;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int i=0; i<h; i++)
         {
             float* ptr = bottom_top_blob.row(i);
@@ -98,7 +98,7 @@ int PReLU::forward_inplace(Mat& bottom_top_blob) const
         int channels = bottom_top_blob.c;
         int size = w * h;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             float* ptr = bottom_top_blob.channel(q);
diff --git a/src/layer/prelu.h b/src/layer/prelu.h
index 3249beb11..ca155d6e4 100644
--- a/src/layer/prelu.h
+++ b/src/layer/prelu.h
@@ -28,7 +28,7 @@ public:
 
     virtual int load_model(const ModelBin& mb);
 
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
 public:
     int num_slope;
diff --git a/src/layer/priorbox.cpp b/src/layer/priorbox.cpp
index 2a0304814..fa40298ff 100644
--- a/src/layer/priorbox.cpp
+++ b/src/layer/priorbox.cpp
@@ -46,7 +46,7 @@ int PriorBox::load_param(const ParamDict& pd)
     return 0;
 }
 
-int PriorBox::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
+int PriorBox::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
 {
     int w = bottom_blobs[0].w;
     int h = bottom_blobs[0].h;
@@ -74,9 +74,9 @@ int PriorBox::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to
         num_prior += num_min_size * num_aspect_ratio;
 
     Mat& top_blob = top_blobs[0];
-    top_blob.create(4 * w * h * num_prior, 2);
+    top_blob.create(4 * w * h * num_prior, 2, 4u, opt.blob_allocator);
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int i = 0; i < h; i++)
     {
         float* box = (float*)top_blob + i * w * num_prior * 4;
diff --git a/src/layer/priorbox.h b/src/layer/priorbox.h
index b7f70d9cc..249a65f25 100644
--- a/src/layer/priorbox.h
+++ b/src/layer/priorbox.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 
 public:
     Mat min_sizes;
diff --git a/src/layer/proposal.cpp b/src/layer/proposal.cpp
index 03dc679cc..c4437dad7 100644
--- a/src/layer/proposal.cpp
+++ b/src/layer/proposal.cpp
@@ -195,7 +195,7 @@ static void nms_sorted_bboxes(const std::vector<Rect>& bboxes, std::vector<int>&
     }
 }
 
-int Proposal::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
+int Proposal::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
 {
     const Mat& score_blob = bottom_blobs[0];
     const Mat& bbox_blob = bottom_blobs[1];
@@ -210,7 +210,7 @@ int Proposal::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to
     Mat proposals;
     proposals.create(4, w * h, num_anchors);
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<num_anchors; q++)
     {
         const float* bbox_xptr = bbox_blob.channel(q * 4);
@@ -272,7 +272,7 @@ int Proposal::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to
     float im_w = im_info_blob[1];
     float im_h = im_info_blob[0];
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<num_anchors; q++)
     {
         Mat pbs = proposals.channel(q);
diff --git a/src/layer/proposal.h b/src/layer/proposal.h
index 68c7c37d3..ac23ec094 100644
--- a/src/layer/proposal.h
+++ b/src/layer/proposal.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 
 public:
     // param
diff --git a/src/layer/reduction.cpp b/src/layer/reduction.cpp
index a2ac15f2b..3994cf82e 100644
--- a/src/layer/reduction.cpp
+++ b/src/layer/reduction.cpp
@@ -39,7 +39,7 @@ int Reduction::load_param(const ParamDict& pd)
 }
 
 template<typename Op, typename Op2>
-static int reduction_op(const Mat& a, Mat& b, float v0, int dim, float coeff)
+static int reduction_op(const Mat& a, Mat& b, float v0, int dim, float coeff, const Option& opt)
 {
     Op op;
     Op2 op2;
@@ -47,43 +47,44 @@ static int reduction_op(const Mat& a, Mat& b, float v0, int dim, float coeff)
     int w = a.w;
     int h = a.h;
     int channels = a.c;
+    size_t elemsize = a.elemsize;
     int size = w * h;
 
     if (dim == 0)
     {
         // w h c -> X X X
-        b.create(1);
+        b.create(1, elemsize, opt.blob_allocator);
     }
     else if (dim == 1)
     {
         // w h c -> X X c
-        b.create(channels);
+        b.create(channels, elemsize, opt.blob_allocator);
     }
     else if (dim == 2)
     {
         // w h c -> X h c
-        b.create(h, channels);
+        b.create(h, channels, elemsize, opt.blob_allocator);
     }
     else if (dim == -1)
     {
         // w h c -> w X X
-        b.create(w);
+        b.create(w, elemsize, opt.blob_allocator);
     }
     else if (dim == -2)
     {
         // w h c -> w h X
-        b.create(w, h);
+        b.create(w, h, elemsize, opt.blob_allocator);
     }
     if (b.empty())
         return -100;
 
     if (dim == 0)
     {
-        Mat sums(channels);
+        Mat sums(channels, elemsize, opt.workspace_allocator);
         if (sums.empty())
             return -100;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             const float* ptr = a.channel(q);
@@ -107,7 +108,7 @@ static int reduction_op(const Mat& a, Mat& b, float v0, int dim, float coeff)
     }
     else if (dim == 1)
     {
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             const float* ptr = a.channel(q);
@@ -123,7 +124,7 @@ static int reduction_op(const Mat& a, Mat& b, float v0, int dim, float coeff)
     }
     else if (dim == 2)
     {
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             const float* ptr = a.channel(q);
@@ -145,13 +146,13 @@ static int reduction_op(const Mat& a, Mat& b, float v0, int dim, float coeff)
     }
     else if (dim == -1)
     {
-        Mat mins(w, 1, channels);
+        Mat mins(w, 1, channels, elemsize, opt.workspace_allocator);
         if (mins.empty())
             return -100;
 
         mins.fill(v0);
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             const float* ptr = a.channel(q);
@@ -227,20 +228,20 @@ struct reduction_op_min : std::binary_function<T,T,T> {
     T operator() (const T& x, const T& y) const { return std::min(x, y); }
 };
 
-int Reduction::forward(const Mat& bottom_blob, Mat& top_blob) const
+int Reduction::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     if (operation == ReductionOp_SUM)
-        return reduction_op< std::plus<float>, std::plus<float> >(bottom_blob, top_blob, 0.f, dim, coeff);
+        return reduction_op< std::plus<float>, std::plus<float> >(bottom_blob, top_blob, 0.f, dim, coeff, opt);
 
     if (operation == ReductionOp_ASUM)
-        return reduction_op< reduction_op_asum<float>, std::plus<float> >(bottom_blob, top_blob, 0.f, dim, coeff);
+        return reduction_op< reduction_op_asum<float>, std::plus<float> >(bottom_blob, top_blob, 0.f, dim, coeff, opt);
 
     if (operation == ReductionOp_SUMSQ)
-        return reduction_op< reduction_op_sumsq<float>, std::plus<float> >(bottom_blob, top_blob, 0.f, dim, coeff);
+        return reduction_op< reduction_op_sumsq<float>, std::plus<float> >(bottom_blob, top_blob, 0.f, dim, coeff, opt);
 
     if (operation == ReductionOp_MEAN)
     {
-        int ret = reduction_op< std::plus<float>, std::plus<float> >(bottom_blob, top_blob, 0.f, dim, coeff);
+        int ret = reduction_op< std::plus<float>, std::plus<float> >(bottom_blob, top_blob, 0.f, dim, coeff, opt);
         if (ret != 0)
             return -100;
 
@@ -289,13 +290,13 @@ int Reduction::forward(const Mat& bottom_blob, Mat& top_blob) const
     }
 
     if (operation == ReductionOp_MAX)
-        return reduction_op< reduction_op_max<float>, reduction_op_max<float> >(bottom_blob, top_blob, -FLT_MAX, dim, coeff);
+        return reduction_op< reduction_op_max<float>, reduction_op_max<float> >(bottom_blob, top_blob, -FLT_MAX, dim, coeff, opt);
 
     if (operation == ReductionOp_MIN)
-        return reduction_op< reduction_op_min<float>, reduction_op_min<float> >(bottom_blob, top_blob, FLT_MAX, dim, coeff);
+        return reduction_op< reduction_op_min<float>, reduction_op_min<float> >(bottom_blob, top_blob, FLT_MAX, dim, coeff, opt);
 
     if (operation == ReductionOp_PROD)
-        return reduction_op< std::multiplies<float>, std::multiplies<float> >(bottom_blob, top_blob, 1.f, dim, coeff);
+        return reduction_op< std::multiplies<float>, std::multiplies<float> >(bottom_blob, top_blob, 1.f, dim, coeff, opt);
 
     return 0;
 }
diff --git a/src/layer/reduction.h b/src/layer/reduction.h
index b48c6fb95..340ed99c6 100644
--- a/src/layer/reduction.h
+++ b/src/layer/reduction.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
     enum {
         ReductionOp_SUM     = 0,
diff --git a/src/layer/relu.cpp b/src/layer/relu.cpp
index 3d2b8e8cd..4013699bc 100644
--- a/src/layer/relu.cpp
+++ b/src/layer/relu.cpp
@@ -31,7 +31,7 @@ int ReLU::load_param(const ParamDict& pd)
     return 0;
 }
 
-int ReLU::forward_inplace(Mat& bottom_top_blob) const
+int ReLU::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
@@ -40,7 +40,7 @@ int ReLU::forward_inplace(Mat& bottom_top_blob) const
 
     if (slope == 0.f)
     {
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             float* ptr = bottom_top_blob.channel(q);
@@ -54,7 +54,7 @@ int ReLU::forward_inplace(Mat& bottom_top_blob) const
     }
     else
     {
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             float* ptr = bottom_top_blob.channel(q);
diff --git a/src/layer/relu.h b/src/layer/relu.h
index 3535e6fd3..9460bcdcb 100644
--- a/src/layer/relu.h
+++ b/src/layer/relu.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
 public:
     float slope;
diff --git a/src/layer/reorg.cpp b/src/layer/reorg.cpp
index 71a0c27db..e37e1f4c0 100644
--- a/src/layer/reorg.cpp
+++ b/src/layer/reorg.cpp
@@ -31,21 +31,22 @@ int Reorg::load_param(const ParamDict& pd)
     return 0;
 }
 
-int Reorg::forward(const Mat& bottom_blob, Mat& top_blob) const
+int Reorg::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     int w = bottom_blob.w;
     int h = bottom_blob.h;
     int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
 
     int outw = w / stride;
     int outh = h / stride;
     int outc = channels * stride * stride;
 
-    top_blob.create(outw, outh, outc);
+    top_blob.create(outw, outh, outc, elemsize, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<channels; q++)
     {
         const Mat m = bottom_blob.channel(q);
diff --git a/src/layer/reorg.h b/src/layer/reorg.h
index 1fed474ee..8dc7be310 100644
--- a/src/layer/reorg.h
+++ b/src/layer/reorg.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
 private:
     int stride;
diff --git a/src/layer/reshape.cpp b/src/layer/reshape.cpp
index 770e3115f..370e771f5 100644
--- a/src/layer/reshape.cpp
+++ b/src/layer/reshape.cpp
@@ -42,8 +42,9 @@ int Reshape::load_param(const ParamDict& pd)
     return 0;
 }
 
-int Reshape::forward(const Mat& bottom_blob, Mat& top_blob) const
+int Reshape::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
+    size_t elemsize = bottom_blob.elemsize;
     int total = bottom_blob.w * bottom_blob.h * bottom_blob.c;
 
     if (ndim == 1)
@@ -58,7 +59,7 @@ int Reshape::forward(const Mat& bottom_blob, Mat& top_blob) const
 
         if (permute == 1)
         {
-            top_blob.create(_w);
+            top_blob.create(_w, elemsize, opt.blob_allocator);
             if (top_blob.empty())
                 return -100;
 
@@ -78,7 +79,7 @@ int Reshape::forward(const Mat& bottom_blob, Mat& top_blob) const
         }
         else
         {
-            top_blob = bottom_blob.reshape(_w);
+            top_blob = bottom_blob.reshape(_w, opt.blob_allocator);
         }
     }
     else if (ndim == 2)
@@ -96,7 +97,7 @@ int Reshape::forward(const Mat& bottom_blob, Mat& top_blob) const
         if (_h == -1)
             _h = total / _w;
 
-        top_blob = bottom_blob.reshape(_w, _h);
+        top_blob = bottom_blob.reshape(_w, _h, opt.blob_allocator);
     }
     else if (ndim == 3)
     {
@@ -118,7 +119,7 @@ int Reshape::forward(const Mat& bottom_blob, Mat& top_blob) const
         if (_c == -1)
             _c = total / _h / _w;
 
-        top_blob = bottom_blob.reshape(_w, _h, _c);
+        top_blob = bottom_blob.reshape(_w, _h, _c, opt.blob_allocator);
     }
 
     if (top_blob.empty())
diff --git a/src/layer/reshape.h b/src/layer/reshape.h
index e8a847bae..611e9a0a3 100644
--- a/src/layer/reshape.h
+++ b/src/layer/reshape.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
 private:
     // reshape flag
diff --git a/src/layer/rnn.cpp b/src/layer/rnn.cpp
index 5df80432a..0f3633595 100644
--- a/src/layer/rnn.cpp
+++ b/src/layer/rnn.cpp
@@ -61,10 +61,11 @@ int RNN::load_model(const ModelBin& mb)
     return 0;
 }
 
-int RNN::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
+int RNN::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
 {
     // size x 1 x T
     const Mat& input_blob = bottom_blobs[0];
+    size_t elemsize = input_blob.elemsize;
 
     // T, 0 or 1 each
     const Mat& cont_blob = bottom_blobs[1];
@@ -73,13 +74,13 @@ int RNN::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blo
     int size = input_blob.w;
 
     // initial hidden state
-    Mat hidden(num_output);
+    Mat hidden(num_output, 4u, opt.workspace_allocator);
     if (hidden.empty())
         return -100;
     hidden.fill(0.f);
 
     Mat& top_blob = top_blobs[0];
-    top_blob.create(num_output, 1, T);
+    top_blob.create(num_output, 1, T, elemsize, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
diff --git a/src/layer/rnn.h b/src/layer/rnn.h
index b3e9c982e..80de65bc5 100644
--- a/src/layer/rnn.h
+++ b/src/layer/rnn.h
@@ -28,7 +28,7 @@ public:
 
     virtual int load_model(const ModelBin& mb);
 
-    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 
 public:
     // param
diff --git a/src/layer/roipooling.cpp b/src/layer/roipooling.cpp
index 21f262872..5efc53142 100644
--- a/src/layer/roipooling.cpp
+++ b/src/layer/roipooling.cpp
@@ -33,17 +33,18 @@ int ROIPooling::load_param(const ParamDict& pd)
     return 0;
 }
 
-int ROIPooling::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
+int ROIPooling::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
 {
     const Mat& bottom_blob = bottom_blobs[0];
     int w = bottom_blob.w;
     int h = bottom_blob.h;
+    size_t elemsize = bottom_blob.elemsize;
     int channels = bottom_blob.c;
 
     const Mat& roi_blob = bottom_blobs[1];
 
     Mat& top_blob = top_blobs[0];
-    top_blob.create(pooled_width, pooled_height, channels);
+    top_blob.create(pooled_width, pooled_height, channels, elemsize, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
@@ -61,7 +62,7 @@ int ROIPooling::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
     float bin_size_w = (float)roi_w / (float)pooled_width;
     float bin_size_h = (float)roi_h / (float)pooled_height;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<channels; q++)
     {
         const float* ptr = bottom_blob.channel(q);
diff --git a/src/layer/roipooling.h b/src/layer/roipooling.h
index 90cc1a487..3f73bf31f 100644
--- a/src/layer/roipooling.h
+++ b/src/layer/roipooling.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 
 public:
     int pooled_width;
diff --git a/src/layer/scale.cpp b/src/layer/scale.cpp
index cf8d7df73..0bb09e7c5 100644
--- a/src/layer/scale.cpp
+++ b/src/layer/scale.cpp
@@ -54,7 +54,7 @@ int Scale::load_model(const ModelBin& mb)
     return 0;
 }
 
-int Scale::forward_inplace(std::vector<Mat>& bottom_top_blobs) const
+int Scale::forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const
 {
     Mat& bottom_top_blob = bottom_top_blobs[0];
     const Mat& scale_blob = bottom_top_blobs[1];
@@ -69,7 +69,7 @@ int Scale::forward_inplace(std::vector<Mat>& bottom_top_blobs) const
 
         if (bias_term)
         {
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int i=0; i<w; i++)
             {
                 ptr[i] = ptr[i] * scale_blob[i] + bias_data[i];
@@ -77,7 +77,7 @@ int Scale::forward_inplace(std::vector<Mat>& bottom_top_blobs) const
         }
         else
         {
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int i=0; i<w; i++)
             {
                 ptr[i] *= scale_blob[i];
@@ -92,7 +92,7 @@ int Scale::forward_inplace(std::vector<Mat>& bottom_top_blobs) const
 
         if (bias_term)
         {
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int i=0; i<h; i++)
             {
                 float* ptr = bottom_top_blob.row(i);
@@ -107,7 +107,7 @@ int Scale::forward_inplace(std::vector<Mat>& bottom_top_blobs) const
         }
         else
         {
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int i=0; i<h; i++)
             {
                 float* ptr = bottom_top_blob.row(i);
@@ -130,7 +130,7 @@ int Scale::forward_inplace(std::vector<Mat>& bottom_top_blobs) const
 
         if (bias_term)
         {
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int q=0; q<channels; q++)
             {
                 float* ptr = bottom_top_blob.channel(q);
@@ -146,7 +146,7 @@ int Scale::forward_inplace(std::vector<Mat>& bottom_top_blobs) const
         }
         else
         {
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int q=0; q<channels; q++)
             {
                 float* ptr = bottom_top_blob.channel(q);
@@ -164,13 +164,13 @@ int Scale::forward_inplace(std::vector<Mat>& bottom_top_blobs) const
     return 0;
 }
 
-int Scale::forward_inplace(Mat& bottom_top_blob) const
+int Scale::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     std::vector<Mat> bottom_top_blobs(2);
     bottom_top_blobs[0] = bottom_top_blob;
     bottom_top_blobs[1] = scale_data;
 
-    return forward_inplace(bottom_top_blobs);
+    return forward_inplace(bottom_top_blobs, opt);
 }
 
 } // namespace ncnn
diff --git a/src/layer/scale.h b/src/layer/scale.h
index cac25cea5..3ca87950f 100644
--- a/src/layer/scale.h
+++ b/src/layer/scale.h
@@ -28,8 +28,8 @@ public:
 
     virtual int load_model(const ModelBin& mb);
 
-    virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs) const;
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
 public:
     // param
diff --git a/src/layer/shufflechannel.cpp b/src/layer/shufflechannel.cpp
index 6ba0c08fb..f129e9024 100644
--- a/src/layer/shufflechannel.cpp
+++ b/src/layer/shufflechannel.cpp
@@ -31,7 +31,7 @@ int ShuffleChannel::load_param(const ParamDict& pd)
     return 0;
 }
 
-int ShuffleChannel::forward(const Mat& bottom_blob, Mat& top_blob) const
+int ShuffleChannel::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     int w = bottom_blob.w;
     int h = bottom_blob.h;
@@ -45,7 +45,7 @@ int ShuffleChannel::forward(const Mat& bottom_blob, Mat& top_blob) const
         return -100;
     }
 
-    top_blob.create(w, h, c, elemsize);
+    top_blob.create(w, h, c, elemsize, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
diff --git a/src/layer/shufflechannel.h b/src/layer/shufflechannel.h
index bcc3cee44..d180db625 100644
--- a/src/layer/shufflechannel.h
+++ b/src/layer/shufflechannel.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
 public:
     int group;
diff --git a/src/layer/sigmoid.cpp b/src/layer/sigmoid.cpp
index cc62ac75a..6704cea2c 100644
--- a/src/layer/sigmoid.cpp
+++ b/src/layer/sigmoid.cpp
@@ -25,14 +25,14 @@ Sigmoid::Sigmoid()
     support_inplace = true;
 }
 
-int Sigmoid::forward_inplace(Mat& bottom_top_blob) const
+int Sigmoid::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
     int channels = bottom_top_blob.c;
     int size = w * h;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<channels; q++)
     {
         float* ptr = bottom_top_blob.channel(q);
diff --git a/src/layer/sigmoid.h b/src/layer/sigmoid.h
index 8ffe28074..9fda8d336 100644
--- a/src/layer/sigmoid.h
+++ b/src/layer/sigmoid.h
@@ -24,7 +24,7 @@ class Sigmoid : public Layer
 public:
     Sigmoid();
 
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
 public:
 };
diff --git a/src/layer/slice.cpp b/src/layer/slice.cpp
index d906991e6..e16e8a253 100644
--- a/src/layer/slice.cpp
+++ b/src/layer/slice.cpp
@@ -30,7 +30,7 @@ int Slice::load_param(const ParamDict& pd)
     return 0;
 }
 
-int Slice::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
+int Slice::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
 {
     const Mat& bottom_blob = bottom_blobs[0];
     int dims = bottom_blob.dims;
@@ -51,7 +51,7 @@ int Slice::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_b
             }
 
             Mat& top_blob = top_blobs[i];
-            top_blob.create(slice, elemsize);
+            top_blob.create(slice, elemsize, opt.blob_allocator);
             if (top_blob.empty())
                 return -100;
 
@@ -80,7 +80,7 @@ int Slice::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_b
             }
 
             Mat& top_blob = top_blobs[i];
-            top_blob.create(w, slice, elemsize);
+            top_blob.create(w, slice, elemsize, opt.blob_allocator);
             if (top_blob.empty())
                 return -100;
 
@@ -111,11 +111,11 @@ int Slice::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_b
             }
 
             Mat& top_blob = top_blobs[i];
-            top_blob.create(slice, h, elemsize);
+            top_blob.create(slice, h, elemsize, opt.blob_allocator);
             if (top_blob.empty())
                 return -100;
 
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int j=0; j<h; j++)
             {
                 float* outptr = top_blob.row(j);
@@ -145,7 +145,7 @@ int Slice::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_b
             }
 
             Mat& top_blob = top_blobs[i];
-            top_blob.create(w, h, slice, elemsize);
+            top_blob.create(w, h, slice, elemsize, opt.blob_allocator);
             if (top_blob.empty())
                 return -100;
 
@@ -177,11 +177,11 @@ int Slice::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_b
             }
 
             Mat& top_blob = top_blobs[i];
-            top_blob.create(w, slice, channels, elemsize);
+            top_blob.create(w, slice, channels, elemsize, opt.blob_allocator);
             if (top_blob.empty())
                 return -100;
 
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int p=0; p<channels; p++)
             {
                 int size = w * slice;
@@ -213,11 +213,11 @@ int Slice::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_b
             }
 
             Mat& top_blob = top_blobs[i];
-            top_blob.create(slice, h, channels, elemsize);
+            top_blob.create(slice, h, channels, elemsize, opt.blob_allocator);
             if (top_blob.empty())
                 return -100;
 
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int p=0; p<channels; p++)
             {
                 float* outptr = top_blob.channel(p);
diff --git a/src/layer/slice.h b/src/layer/slice.h
index 2feb5b1ae..4da3c8036 100644
--- a/src/layer/slice.h
+++ b/src/layer/slice.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 
 public:
     Mat slices;
diff --git a/src/layer/softmax.cpp b/src/layer/softmax.cpp
index 44ad8dbfe..3ca2d8154 100644
--- a/src/layer/softmax.cpp
+++ b/src/layer/softmax.cpp
@@ -34,13 +34,14 @@ int Softmax::load_param(const ParamDict& pd)
     return 0;
 }
 
-int Softmax::forward_inplace(Mat& bottom_top_blob) const
+int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     // value = exp( value - global max value )
     // sum all value
     // value = value / sum
 
     int dims = bottom_top_blob.dims;
+    size_t elemsize = bottom_top_blob.elemsize;
 
     if (dims == 1) // axis == 0
     {
@@ -79,7 +80,7 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
         int h = bottom_top_blob.h;
 
         Mat max;
-        max.create(w);
+        max.create(w, elemsize, opt.workspace_allocator);
         if (max.empty())
             return -100;
         max.fill(-FLT_MAX);
@@ -103,7 +104,7 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
         }
 
         Mat sum;
-        sum.create(w);
+        sum.create(w, elemsize, opt.workspace_allocator);
         if (sum.empty())
             return -100;
         sum.fill(0.f);
@@ -135,7 +136,7 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
         int h = bottom_top_blob.h;
 
         Mat max;
-        max.create(h);
+        max.create(h, elemsize, opt.workspace_allocator);
         if (max.empty())
             return -100;
 
@@ -164,7 +165,7 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
         }
 
         Mat sum;
-        sum.create(h);
+        sum.create(h, elemsize, opt.workspace_allocator);
         if (sum.empty())
             return -100;
 
@@ -203,7 +204,7 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
         int size = w * h;
 
         Mat max;
-        max.create(w, h);
+        max.create(w, h, elemsize, opt.workspace_allocator);
         if (max.empty())
             return -100;
         max.fill(-FLT_MAX);
@@ -217,7 +218,7 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
             }
         }
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             float* ptr = bottom_top_blob.channel(q);
@@ -229,7 +230,7 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
         }
 
         Mat sum;
-        sum.create(w, h);
+        sum.create(w, h, elemsize, opt.workspace_allocator);
         if (sum.empty())
             return -100;
         sum.fill(0.f);
@@ -243,7 +244,7 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
             }
         }
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             float* ptr = bottom_top_blob.channel(q);
@@ -264,11 +265,11 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
         int channels = bottom_top_blob.c;
 
         Mat max;
-        max.create(h, channels);
+        max.create(h, channels, elemsize, opt.workspace_allocator);
         if (max.empty())
             return -100;
         max.fill(-FLT_MAX);
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             const float* ptr = bottom_top_blob.channel(q);
@@ -287,7 +288,7 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
             }
         }
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             float* ptr = bottom_top_blob.channel(q);
@@ -306,11 +307,11 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
         }
 
         Mat sum;
-        sum.create(h, channels);
+        sum.create(h, channels, elemsize, opt.workspace_allocator);
         if (sum.empty())
             return -100;
         sum.fill(0.f);
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             const float* ptr = bottom_top_blob.channel(q);
@@ -329,7 +330,7 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
             }
         }
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             float* ptr = bottom_top_blob.channel(q);
@@ -357,11 +358,11 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
         int channels = bottom_top_blob.c;
 
         Mat max;
-        max.create(w, channels);
+        max.create(w, channels, elemsize, opt.workspace_allocator);
         if (max.empty())
             return -100;
         max.fill(-FLT_MAX);
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             const float* ptr = bottom_top_blob.channel(q);
@@ -378,7 +379,7 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
             }
         }
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             float* ptr = bottom_top_blob.channel(q);
@@ -396,11 +397,11 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
         }
 
         Mat sum;
-        sum.create(w, channels);
+        sum.create(w, channels, elemsize, opt.workspace_allocator);
         if (sum.empty())
             return -100;
         sum.fill(0.f);
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             const float* ptr = bottom_top_blob.channel(q);
@@ -417,7 +418,7 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
             }
         }
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             float* ptr = bottom_top_blob.channel(q);
diff --git a/src/layer/softmax.h b/src/layer/softmax.h
index 99910321d..a40237cbf 100644
--- a/src/layer/softmax.h
+++ b/src/layer/softmax.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
 public:
     int axis;
diff --git a/src/layer/split.cpp b/src/layer/split.cpp
index fb8ee34d5..e50330a10 100644
--- a/src/layer/split.cpp
+++ b/src/layer/split.cpp
@@ -22,7 +22,7 @@ Split::Split()
 {
 }
 
-int Split::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
+int Split::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& /*opt*/) const
 {
     const Mat& bottom_blob = bottom_blobs[0];
     for (size_t i=0; i<top_blobs.size(); i++)
diff --git a/src/layer/split.h b/src/layer/split.h
index 0239d9d56..67cbec5d3 100644
--- a/src/layer/split.h
+++ b/src/layer/split.h
@@ -24,7 +24,7 @@ class Split : public Layer
 public:
     Split();
 
-    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 
 public:
 };
diff --git a/src/layer/spp.cpp b/src/layer/spp.cpp
index 09fab007a..38c94f27b 100644
--- a/src/layer/spp.cpp
+++ b/src/layer/spp.cpp
@@ -34,11 +34,13 @@ int SPP::load_param(const ParamDict& pd)
     return 0;
 }
 
-int SPP::forward(const Mat& bottom_blob, Mat& top_blob) const
+int SPP::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
+    size_t elemsize = bottom_blob.elemsize;
+
     // 1 + 4 + 16 + 64 + ... + (2*pyramid_height)^2
     int pyramid_num_bins = ((1 << (pyramid_height * 2)) - 1) / 3;
-    top_blob.create(pyramid_num_bins, 1, 2);
+    top_blob.create(pyramid_num_bins, 1, 2, elemsize, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
@@ -72,7 +74,7 @@ int SPP::forward(const Mat& bottom_blob, Mat& top_blob) const
         Mat bottom_blob_bordered = bottom_blob;
         if (pad_h > 0 || pad_w > 0)
         {
-            copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f);
+            copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
             if (bottom_blob_bordered.empty())
                 return -100;
 
@@ -103,7 +105,7 @@ int SPP::forward(const Mat& bottom_blob, Mat& top_blob) const
 
         if (pooling_type == PoolMethod_MAX)
         {
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int q=0; q<channels; q++)
             {
                 const Mat m(w, h, bottom_blob_bordered.channel(q));
@@ -132,7 +134,7 @@ int SPP::forward(const Mat& bottom_blob, Mat& top_blob) const
         }
         else if (pooling_type == PoolMethod_AVE)
         {
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int q=0; q<channels; q++)
             {
                 const Mat m(w, h, bottom_blob_bordered.channel(q));
diff --git a/src/layer/spp.h b/src/layer/spp.h
index c487a6d05..015880a8b 100644
--- a/src/layer/spp.h
+++ b/src/layer/spp.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
     enum { PoolMethod_MAX = 0, PoolMethod_AVE = 1 };
 
diff --git a/src/layer/squeeze.cpp b/src/layer/squeeze.cpp
index a2d7f1581..f4efef5df 100644
--- a/src/layer/squeeze.cpp
+++ b/src/layer/squeeze.cpp
@@ -33,7 +33,7 @@ int Squeeze::load_param(const ParamDict& pd)
     return 0;
 }
 
-int Squeeze::forward(const Mat& bottom_blob, Mat& top_blob) const
+int Squeeze::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     int w = bottom_blob.w;
     int h = bottom_blob.h;
@@ -45,23 +45,23 @@ int Squeeze::forward(const Mat& bottom_blob, Mat& top_blob) const
     if (squeeze_c && dims == 3 && channels == 1)
     {
         if (squeeze_h && h == 1)
-            top_blob = bottom_blob.reshape(w);
+            top_blob = bottom_blob.reshape(w, opt.blob_allocator);
         else
-            top_blob = bottom_blob.reshape(w, h);
+            top_blob = bottom_blob.reshape(w, h, opt.blob_allocator);
     }
     else if (squeeze_h && dims >= 2 && h == 1)
     {
         if (squeeze_w && w == 1)
-            top_blob = bottom_blob.reshape(channels);
+            top_blob = bottom_blob.reshape(channels, opt.blob_allocator);
         else
-            top_blob = bottom_blob.reshape(w, channels);
+            top_blob = bottom_blob.reshape(w, channels, opt.blob_allocator);
     }
     else if (squeeze_w && dims >= 1 && w == 1)
     {
         if (squeeze_h && h == 1)
-            top_blob = bottom_blob.reshape(channels);
+            top_blob = bottom_blob.reshape(channels, opt.blob_allocator);
         else
-            top_blob = bottom_blob.reshape(h, channels);
+            top_blob = bottom_blob.reshape(h, channels, opt.blob_allocator);
     }
 
     if (top_blob.empty())
diff --git a/src/layer/squeeze.h b/src/layer/squeeze.h
index 1db596da4..cf8a2a164 100644
--- a/src/layer/squeeze.h
+++ b/src/layer/squeeze.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
 public:
     int squeeze_w;
diff --git a/src/layer/tanh.cpp b/src/layer/tanh.cpp
index 75a47944d..9cf732a19 100644
--- a/src/layer/tanh.cpp
+++ b/src/layer/tanh.cpp
@@ -25,14 +25,14 @@ TanH::TanH()
     support_inplace = true;
 }
 
-int TanH::forward_inplace(Mat& bottom_top_blob) const
+int TanH::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
     int channels = bottom_top_blob.c;
     int size = w * h;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<channels; q++)
     {
         float* ptr = bottom_top_blob.channel(q);
diff --git a/src/layer/tanh.h b/src/layer/tanh.h
index e5c117a66..ea1c575e5 100644
--- a/src/layer/tanh.h
+++ b/src/layer/tanh.h
@@ -24,7 +24,7 @@ class TanH : public Layer
 public:
     TanH();
 
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
 public:
 };
diff --git a/src/layer/threshold.cpp b/src/layer/threshold.cpp
index 2ca6cd46d..0d1bdb60f 100644
--- a/src/layer/threshold.cpp
+++ b/src/layer/threshold.cpp
@@ -31,14 +31,14 @@ int Threshold::load_param(const ParamDict& pd)
     return 0;
 }
 
-int Threshold::forward_inplace(Mat& bottom_top_blob) const
+int Threshold::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
     int channels = bottom_top_blob.c;
     int size = w * h;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int q=0; q<channels; q++)
     {
         float* ptr = bottom_top_blob.channel(q);
diff --git a/src/layer/threshold.h b/src/layer/threshold.h
index 0ae5344e3..f49f3ec23 100644
--- a/src/layer/threshold.h
+++ b/src/layer/threshold.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
 public:
     float threshold;
diff --git a/src/layer/tile.cpp b/src/layer/tile.cpp
index c0a6a5775..c87189071 100644
--- a/src/layer/tile.cpp
+++ b/src/layer/tile.cpp
@@ -32,22 +32,23 @@ int Tile::load_param(const ParamDict& pd)
     return 0;
 }
 
-int Tile::forward(const Mat& bottom_blob, Mat& top_blob) const
+int Tile::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     int w = bottom_blob.w;
     int h = bottom_blob.h;
     int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
 
     if (dim == 0)
     {
-        top_blob.create(w, h, channels * tiles);
+        top_blob.create(w, h, channels * tiles, elemsize, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
         const float* ptr = bottom_blob;
         int size = bottom_blob.cstep * channels;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int p=0; p<tiles; p++)
         {
             float* outptr = top_blob.channel(p * channels);
@@ -60,13 +61,13 @@ int Tile::forward(const Mat& bottom_blob, Mat& top_blob) const
     }
     else if (dim == 1)
     {
-        top_blob.create(w, h * tiles, channels);
+        top_blob.create(w, h * tiles, channels, elemsize, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
         int size = w * h;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             const float* ptr = bottom_blob.channel(q);
@@ -85,11 +86,11 @@ int Tile::forward(const Mat& bottom_blob, Mat& top_blob) const
     }
     else if (dim == 2)
     {
-        top_blob.create(w * tiles, h, channels);
+        top_blob.create(w * tiles, h, channels, elemsize, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<channels; q++)
         {
             const float* ptr = bottom_blob.channel(q);
diff --git a/src/layer/tile.h b/src/layer/tile.h
index 28c4bd4bd..e6fbd1bf0 100644
--- a/src/layer/tile.h
+++ b/src/layer/tile.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
 public:
     int dim;
diff --git a/src/layer/unaryop.cpp b/src/layer/unaryop.cpp
index 7753a1609..47c94d995 100644
--- a/src/layer/unaryop.cpp
+++ b/src/layer/unaryop.cpp
@@ -34,13 +34,13 @@ int UnaryOp::load_param(const ParamDict& pd)
 }
 
 template<typename Op>
-static int unary_op_inplace(Mat& a)
+static int unary_op_inplace(Mat& a, const Option& opt)
 {
     Op op;
 
     int size = a.total();
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int i=0; i<size; i++)
     {
         a[i] = op(a[i]);
@@ -129,55 +129,55 @@ struct unary_op_reciprocal : std::unary_function<T,T> {
     T operator() (const T& x) const { return 1.f / x; }
 };
 
-int UnaryOp::forward_inplace(Mat& bottom_top_blob) const
+int UnaryOp::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     if (op_type == Operation_ABS)
-        return unary_op_inplace< unary_op_abs<float> >(bottom_top_blob);
+        return unary_op_inplace< unary_op_abs<float> >(bottom_top_blob, opt);
 
     if (op_type == Operation_NEG)
-        return unary_op_inplace< unary_op_neg<float> >(bottom_top_blob);
+        return unary_op_inplace< unary_op_neg<float> >(bottom_top_blob, opt);
 
     if (op_type == Operation_FLOOR)
-        return unary_op_inplace< unary_op_floor<float> >(bottom_top_blob);
+        return unary_op_inplace< unary_op_floor<float> >(bottom_top_blob, opt);
 
     if (op_type == Operation_CEIL)
-        return unary_op_inplace< unary_op_ceil<float> >(bottom_top_blob);
+        return unary_op_inplace< unary_op_ceil<float> >(bottom_top_blob, opt);
 
     if (op_type == Operation_SQUARE)
-        return unary_op_inplace< unary_op_square<float> >(bottom_top_blob);
+        return unary_op_inplace< unary_op_square<float> >(bottom_top_blob, opt);
 
     if (op_type == Operation_SQRT)
-        return unary_op_inplace< unary_op_sqrt<float> >(bottom_top_blob);
+        return unary_op_inplace< unary_op_sqrt<float> >(bottom_top_blob, opt);
 
     if (op_type == Operation_RSQRT)
-        return unary_op_inplace< unary_op_rsqrt<float> >(bottom_top_blob);
+        return unary_op_inplace< unary_op_rsqrt<float> >(bottom_top_blob, opt);
 
     if (op_type == Operation_EXP)
-        return unary_op_inplace< unary_op_exp<float> >(bottom_top_blob);
+        return unary_op_inplace< unary_op_exp<float> >(bottom_top_blob, opt);
 
     if (op_type == Operation_LOG)
-        return unary_op_inplace< unary_op_log<float> >(bottom_top_blob);
+        return unary_op_inplace< unary_op_log<float> >(bottom_top_blob, opt);
 
     if (op_type == Operation_SIN)
-        return unary_op_inplace< unary_op_sin<float> >(bottom_top_blob);
+        return unary_op_inplace< unary_op_sin<float> >(bottom_top_blob, opt);
 
     if (op_type == Operation_COS)
-        return unary_op_inplace< unary_op_cos<float> >(bottom_top_blob);
+        return unary_op_inplace< unary_op_cos<float> >(bottom_top_blob, opt);
 
     if (op_type == Operation_TAN)
-        return unary_op_inplace< unary_op_tan<float> >(bottom_top_blob);
+        return unary_op_inplace< unary_op_tan<float> >(bottom_top_blob, opt);
 
     if (op_type == Operation_ASIN)
-        return unary_op_inplace< unary_op_asin<float> >(bottom_top_blob);
+        return unary_op_inplace< unary_op_asin<float> >(bottom_top_blob, opt);
 
     if (op_type == Operation_ACOS)
-        return unary_op_inplace< unary_op_acos<float> >(bottom_top_blob);
+        return unary_op_inplace< unary_op_acos<float> >(bottom_top_blob, opt);
 
     if (op_type == Operation_ATAN)
-        return unary_op_inplace< unary_op_atan<float> >(bottom_top_blob);
+        return unary_op_inplace< unary_op_atan<float> >(bottom_top_blob, opt);
 
     if (op_type == Operation_RECIPROCAL)
-        return unary_op_inplace< unary_op_reciprocal<float> >(bottom_top_blob);
+        return unary_op_inplace< unary_op_reciprocal<float> >(bottom_top_blob, opt);
 
     return 0;
 }
diff --git a/src/layer/unaryop.h b/src/layer/unaryop.h
index 827784a70..6084e966c 100644
--- a/src/layer/unaryop.h
+++ b/src/layer/unaryop.h
@@ -26,7 +26,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
     enum {
         Operation_ABS   = 0,
diff --git a/src/layer/x86/convolution_1x1.h b/src/layer/x86/convolution_1x1.h
index b324740f3..c5db4b17b 100644
--- a/src/layer/x86/convolution_1x1.h
+++ b/src/layer/x86/convolution_1x1.h
@@ -12,7 +12,7 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-static void conv1x1s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+static void conv1x1s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
     int w = bottom_blob.w;
     int h = bottom_blob.h;
@@ -25,7 +25,7 @@ static void conv1x1s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _ker
     const float* kernel = _kernel;
     const float* bias = _bias;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p=0; p<outch; p++)
     {
         Mat out = top_blob.channel(p);
@@ -108,7 +108,7 @@ static void conv1x1s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _ker
 
 }
 
-static void conv1x1s2_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+static void conv1x1s2_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
     int w = bottom_blob.w;
     int inch = bottom_blob.c;
@@ -122,7 +122,7 @@ static void conv1x1s2_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _ker
     const float* kernel = _kernel;
     const float* bias = _bias;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p=0; p<outch; p++)
     {
         Mat out = top_blob.channel(p);
diff --git a/src/layer/x86/convolution_3x3.h b/src/layer/x86/convolution_3x3.h
index 4ba03bc71..cc7ecd3bf 100644
--- a/src/layer/x86/convolution_3x3.h
+++ b/src/layer/x86/convolution_3x3.h
@@ -12,7 +12,7 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-static void conv3x3s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+static void conv3x3s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
     int w = bottom_blob.w;
     int inch = bottom_blob.c;
@@ -24,7 +24,7 @@ static void conv3x3s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _ker
     const float* kernel = _kernel;
     const float* bias = _bias;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p=0; p<outch; p++)
     {
         Mat out = top_blob.channel(p);
diff --git a/src/layer/x86/convolution_5x5.h b/src/layer/x86/convolution_5x5.h
index 01289531e..c48fb9c45 100644
--- a/src/layer/x86/convolution_5x5.h
+++ b/src/layer/x86/convolution_5x5.h
@@ -12,7 +12,7 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-static void conv5x5s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+static void conv5x5s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
     int w = bottom_blob.w;
     int inch = bottom_blob.c;
@@ -24,7 +24,7 @@ static void conv5x5s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _ker
     const float* kernel = _kernel;
     const float* bias = _bias;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p=0; p<outch; p++)
     {
         Mat out = top_blob.channel(p);
diff --git a/src/layer/x86/convolution_x86.cpp b/src/layer/x86/convolution_x86.cpp
index 630590bcf..520f9a2a1 100644
--- a/src/layer/x86/convolution_x86.cpp
+++ b/src/layer/x86/convolution_x86.cpp
@@ -22,10 +22,11 @@ namespace ncnn {
 
 DEFINE_LAYER_CREATOR(Convolution_x86)
 
-int Convolution_x86::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv_func conv) const
+int Convolution_x86::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv_func conv, const Option& opt) const
 {
     int w = bottom_blob.w;
     int h = bottom_blob.h;
+    size_t elemsize = bottom_blob.elemsize;
 
     const int kernel_size = kernel_w;
     const int stride = stride_w;
@@ -35,7 +36,7 @@ int Convolution_x86::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv
     Mat bottom_blob_bordered = bottom_blob;
     if (pad_w > 0 || pad_h > 0)
     {
-        copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f);
+        copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
         if (bottom_blob_bordered.empty())
             return -100;
 
@@ -48,7 +49,7 @@ int Convolution_x86::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv
         int hpad = kernel_extent + (h - 1) / stride * stride - h;
         if (wpad > 0 || hpad > 0)
         {
-            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f);
+            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
             if (bottom_blob_bordered.empty())
                 return -100;
         }
@@ -60,7 +61,7 @@ int Convolution_x86::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv
     int outw = (w - kernel_extent) / stride + 1;
     int outh = (h - kernel_extent) / stride + 1;
 
-    top_blob.create(outw, outh, num_output);
+    top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
@@ -79,7 +80,7 @@ int Convolution_x86::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv
 
             if (inner_bottom_blob.w != inner_w || inner_bottom_blob.h != inner_h)
             {
-                inner_bottom_blob.create(inner_w, inner_h, bottom_blob.c);
+                inner_bottom_blob.create(inner_w, inner_h, bottom_blob.c, elemsize, opt.workspace_allocator);
 
                 if (inner_bottom_blob.empty())
                 {
@@ -89,7 +90,7 @@ int Convolution_x86::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv
 
             if (inner_top_blob.w != inner_outw || inner_top_blob.h != inner_outh)
             {
-                inner_top_blob.create(inner_outw, inner_outh, num_output);
+                inner_top_blob.create(inner_outw, inner_outh, num_output, elemsize, opt.workspace_allocator);
 
                 if (inner_top_blob.empty())
                 {
@@ -97,7 +98,7 @@ int Convolution_x86::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv
                 }
             }
 
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int c = 0; c < bottom_blob.c; c ++)
             {
                 float *outptr = inner_bottom_blob.channel(c);
@@ -113,9 +114,9 @@ int Convolution_x86::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv
                 }
             }
 
-            conv(inner_bottom_blob, inner_top_blob, weight_data, bias_data);
+            conv(inner_bottom_blob, inner_top_blob, weight_data, bias_data, opt);
 
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int c = 0; c < num_output; c ++)
             {
                 float *outptr = (float *)top_blob.channel(c) + x * outw + y;
@@ -136,19 +137,19 @@ int Convolution_x86::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv
 }
 
 
-int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob) const
+int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     // convolv with NxN kernel
     // value = value + bias
 
     if (bottom_blob.dims != 3)
     {
-        return Convolution::forward(bottom_blob, top_blob);
+        return Convolution::forward(bottom_blob, top_blob, opt);
     }
 
     if (kernel_w != kernel_h || stride_w != stride_h)
     {
-        return Convolution::forward(bottom_blob, top_blob);
+        return Convolution::forward(bottom_blob, top_blob, opt);
     }
 
     const int kernel_size = kernel_w;
@@ -156,10 +157,10 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob) const
 
     if (kernel_size > 5 || stride > 5 || dilation_w != dilation_h)
     {
-        return Convolution::forward(bottom_blob, top_blob);
+        return Convolution::forward(bottom_blob, top_blob, opt);
     }
 
-    typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&);
+    typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&);
 
     // kernel_size x stride
     conv_func conv_func_table[5][5] =
@@ -204,20 +205,21 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob) const
     conv_func conv = conv_func_table[kernel_size-1][stride-1];
     if (!conv)
     {
-        return Convolution::forward(bottom_blob, top_blob);
+        return Convolution::forward(bottom_blob, top_blob, opt);
     }
 
     if (dilation_w != 1) {
-        return forwardDilation(bottom_blob, top_blob, conv);
+        return forwardDilation(bottom_blob, top_blob, conv, opt);
     }
 
     int w = bottom_blob.w;
     int h = bottom_blob.h;
+    size_t elemsize = bottom_blob.elemsize;
 
     Mat bottom_blob_bordered = bottom_blob;
     if (pad_w > 0 || pad_h > 0)
     {
-        copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f);
+        copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
         if (bottom_blob_bordered.empty())
             return -100;
 
@@ -230,7 +232,7 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob) const
         int hpad = kernel_size + (h - 1) / stride * stride - h;
         if (wpad > 0 || hpad > 0)
         {
-            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f);
+            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
             if (bottom_blob_bordered.empty())
                 return -100;
         }
@@ -242,11 +244,11 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob) const
     int outw = (w - kernel_size) / stride + 1;
     int outh = (h - kernel_size) / stride + 1;
 
-    top_blob.create(outw, outh, num_output);
+    top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
-    conv(bottom_blob_bordered, top_blob, weight_data, bias_data);
+    conv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
 
     return 0;
 }
diff --git a/src/layer/x86/convolution_x86.h b/src/layer/x86/convolution_x86.h
index 1aad94476..e72c14aca 100644
--- a/src/layer/x86/convolution_x86.h
+++ b/src/layer/x86/convolution_x86.h
@@ -19,13 +19,13 @@
 
 namespace ncnn {
 
-typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&);
+typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&);
 
 class Convolution_x86 : public Convolution
 {
 public:
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
-    virtual int forwardDilation(const Mat& bottom_blob, Mat &top_blob, conv_func conv) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
+    virtual int forwardDilation(const Mat& bottom_blob, Mat &top_blob, conv_func conv, const Option& opt) const;
 };
 
 } // namespace ncnn
diff --git a/src/layer/x86/convolutiondepthwise_3x3.h b/src/layer/x86/convolutiondepthwise_3x3.h
index d14948de3..aa34ca084 100644
--- a/src/layer/x86/convolutiondepthwise_3x3.h
+++ b/src/layer/x86/convolutiondepthwise_3x3.h
@@ -12,7 +12,7 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-static void convdw3x3s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+static void convdw3x3s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
     int w = bottom_blob.w;
 
@@ -24,7 +24,7 @@ static void convdw3x3s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _k
     const float* kernel = _kernel;
     const float* bias = _bias;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int g=0; g<group; g++)
     {
         Mat out = top_blob.channel(g);
@@ -130,7 +130,7 @@ static void convdw3x3s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _k
     }
 }
 
-static void convdw3x3s2_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+static void convdw3x3s2_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
     int w = bottom_blob.w;
 
@@ -144,7 +144,7 @@ static void convdw3x3s2_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _k
     const float* kernel = _kernel;
     const float* bias = _bias;
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int g=0; g<group; g++)
     {
         Mat out = top_blob.channel(g);
diff --git a/src/layer/x86/convolutiondepthwise_x86.cpp b/src/layer/x86/convolutiondepthwise_x86.cpp
index 3cd48e2e7..2ef69631d 100644
--- a/src/layer/x86/convolutiondepthwise_x86.cpp
+++ b/src/layer/x86/convolutiondepthwise_x86.cpp
@@ -26,7 +26,7 @@ namespace ncnn {
 
 DEFINE_LAYER_CREATOR(ConvolutionDepthWise_x86)
 
-int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob) const
+int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     // convolv with NxN kernel
     // value = value + bias
@@ -34,6 +34,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob) con
     int w = bottom_blob.w;
     int h = bottom_blob.h;
     int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
 
     if (channels % group != 0 || num_output % group != 0)
     {
@@ -47,7 +48,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob) con
     Mat bottom_blob_bordered = bottom_blob;
     if (pad_w > 0 || pad_h > 0)
     {
-        copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f);
+        copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
         if (bottom_blob_bordered.empty())
             return -100;
 
@@ -60,7 +61,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob) con
         int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
         if (wpad > 0 || hpad > 0)
         {
-            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f);
+            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
             if (bottom_blob_bordered.empty())
                 return -100;
         }
@@ -72,7 +73,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob) con
     int outw = (w - kernel_extent_w) / stride_w + 1;
     int outh = (h - kernel_extent_h) / stride_h + 1;
 
-    top_blob.create(outw, outh, num_output);
+    top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
@@ -85,12 +86,12 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob) con
         {
             if (stride_w == 1 && stride_h == 1)
             {
-                convdw3x3s1_sse(bottom_blob_bordered, top_blob, weight_data, bias_data);
+                convdw3x3s1_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
                 return 0;
             }
             else if (stride_w == 2 && stride_h == 2)
             {
-                convdw3x3s2_sse(bottom_blob_bordered, top_blob, weight_data, bias_data);
+                convdw3x3s2_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
                 return 0;
             }
         }
@@ -100,7 +101,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob) con
         omp_set_nested(0);
 #endif
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int g=0; g<group; g++)
         {
             Mat bottom_blob_bordered_g(w, h, 1, bottom_blob_bordered.channel(g));
@@ -137,7 +138,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob) con
             op->load_model(ModelBinFromMatArray(weights));
 
             // forward
-            op->forward(bottom_blob_bordered_g, top_blob_g);
+            op->forward(bottom_blob_bordered_g, top_blob_g, opt);
 
             delete op;
         }
@@ -187,7 +188,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob) con
         op->load_model(ModelBinFromMatArray(weights));
 
         // forward
-        op->forward(bottom_blob_bordered_g, top_blob_g);
+        op->forward(bottom_blob_bordered_g, top_blob_g, opt);
 
         delete op;
     }
diff --git a/src/layer/x86/convolutiondepthwise_x86.h b/src/layer/x86/convolutiondepthwise_x86.h
index d67283511..82352312e 100644
--- a/src/layer/x86/convolutiondepthwise_x86.h
+++ b/src/layer/x86/convolutiondepthwise_x86.h
@@ -22,7 +22,7 @@ namespace ncnn {
 class ConvolutionDepthWise_x86 : public ConvolutionDepthWise
 {
 public:
-    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 };
 
 } // namespace ncnn
diff --git a/src/layer/yolodetectionoutput.cpp b/src/layer/yolodetectionoutput.cpp
index ff39e9979..8a810b020 100644
--- a/src/layer/yolodetectionoutput.cpp
+++ b/src/layer/yolodetectionoutput.cpp
@@ -160,7 +160,7 @@ static inline float sigmoid(float x)
     return 1.f / (1.f + exp(-x));
 }
 
-int YoloDetectionOutput::forward_inplace(Mat& bottom_top_blob) const
+int YoloDetectionOutput::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
@@ -177,7 +177,7 @@ int YoloDetectionOutput::forward_inplace(Mat& bottom_top_blob) const
     all_box_bbox_rects.resize(num_box);
     all_box_bbox_scores.resize(num_box);
 
-    #pragma omp parallel for
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int pp = 0; pp < num_box; pp++)
     {
         int p = pp * channels_per_box;
@@ -194,7 +194,7 @@ int YoloDetectionOutput::forward_inplace(Mat& bottom_top_blob) const
 
         // softmax class scores
         Mat scores(w, h, num_class, (void*)((const float*)bottom_top_blob.channel(p+5)));
-        softmax->forward_inplace(scores);
+        softmax->forward_inplace(scores, opt);
 
         for (int i = 0; i < h; i++)
         {
@@ -281,7 +281,7 @@ int YoloDetectionOutput::forward_inplace(Mat& bottom_top_blob) const
     // fill result
     int num_detected = bbox_rects.size();
 
-    bottom_top_blob.create(6, num_detected);
+    bottom_top_blob.create(6, num_detected, 4u, opt.blob_allocator);
     if (bottom_top_blob.empty())
         return -100;
 
diff --git a/src/layer/yolodetectionoutput.h b/src/layer/yolodetectionoutput.h
index b35b8032f..8513ca30d 100644
--- a/src/layer/yolodetectionoutput.h
+++ b/src/layer/yolodetectionoutput.h
@@ -27,7 +27,7 @@ public:
 
     virtual int load_param(const ParamDict& pd);
 
-    virtual int forward_inplace(Mat& bottom_top_blob) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 
 public:
     int num_class;
diff --git a/src/mat.cpp b/src/mat.cpp
index 498768735..92c3f12aa 100644
--- a/src/mat.cpp
+++ b/src/mat.cpp
@@ -499,10 +499,11 @@ static void copy_make_border_image(const Mat& src, Mat& dst, int top, int left,
     }
 }
 
-void copy_make_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int type, float v)
+void copy_make_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int type, float v, Allocator* allocator, int num_threads)
 {
     int w = src.w + left + right;
     int h = src.h + top + bottom;
+    size_t elemsize = src.elemsize;
 
     if (w == src.w && h == src.h)
     {
@@ -512,7 +513,7 @@ void copy_make_border(const Mat& src, Mat& dst, int top, int bottom, int left, i
 
     if (src.dims == 2)
     {
-        dst.create(w, h);
+        dst.create(w, h, elemsize, allocator);
         if (dst.empty())
             return;
 
@@ -522,12 +523,12 @@ void copy_make_border(const Mat& src, Mat& dst, int top, int bottom, int left, i
     {
         int channels = src.c;
 
-        dst.create(w, h, channels);
+        dst.create(w, h, channels, elemsize, allocator);
         if (dst.empty())
             return;
 
         // unroll image channel
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(num_threads)
         for (int q=0; q<channels; q++)
         {
             const Mat m = src.channel(q);
@@ -564,10 +565,11 @@ static void copy_cut_border_image(const Mat& src, Mat& dst, int top, int left)
     }
 }
 
-void copy_cut_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right)
+void copy_cut_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, Allocator* allocator, int num_threads)
 {
     int w = src.w - left - right;
     int h = src.h - top - bottom;
+    size_t elemsize = src.elemsize;
 
     if (w == src.w && h == src.h)
     {
@@ -577,7 +579,7 @@ void copy_cut_border(const Mat& src, Mat& dst, int top, int bottom, int left, in
 
     if (src.dims == 2)
     {
-        dst.create(w, h);
+        dst.create(w, h, elemsize, allocator);
         if (dst.empty())
             return;
 
@@ -587,12 +589,12 @@ void copy_cut_border(const Mat& src, Mat& dst, int top, int bottom, int left, in
     {
         int channels = src.c;
 
-        dst.create(w, h, channels);
+        dst.create(w, h, channels, elemsize, allocator);
         if (dst.empty())
             return;
 
         // unroll image channel
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(num_threads)
         for (int q=0; q<channels; q++)
         {
             const Mat m = src.channel(q);
@@ -832,7 +834,7 @@ static void resize_bilinear_image(const Mat& src, Mat& dst, int w, int h)
     delete[] buf;
 }
 
-void resize_bilinear(const Mat& src, Mat& dst, int w, int h)
+void resize_bilinear(const Mat& src, Mat& dst, int w, int h, Allocator* allocator, int num_threads)
 {
     if (w == src.w && h == src.h)
     {
@@ -840,9 +842,11 @@ void resize_bilinear(const Mat& src, Mat& dst, int w, int h)
         return;
     }
 
+    size_t elemsize = src.elemsize;
+
     if (src.dims == 2)
     {
-        dst.create(w, h);
+        dst.create(w, h, elemsize, allocator);
         if (dst.empty())
             return;
 
@@ -852,12 +856,12 @@ void resize_bilinear(const Mat& src, Mat& dst, int w, int h)
     {
         int channels = src.c;
 
-        dst.create(w, h, channels);
+        dst.create(w, h, channels, elemsize, allocator);
         if (dst.empty())
             return;
 
         // unroll image channel
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(num_threads)
         for (int q=0; q<channels; q++)
         {
             const Mat m = src.channel(q);
diff --git a/src/mat.h b/src/mat.h
index dab86201f..c25a5b74a 100644
--- a/src/mat.h
+++ b/src/mat.h
@@ -20,6 +20,7 @@
 #if __ARM_NEON
 #include <arm_neon.h>
 #endif
+#include "allocator.h"
 #include "platform.h"
 
 namespace ncnn {
@@ -31,19 +32,19 @@ public:
     // empty
     Mat();
     // vec
-    Mat(int w, size_t elemsize = 4);
+    Mat(int w, size_t elemsize = 4u, Allocator* allocator = 0);
     // image
-    Mat(int w, int h, size_t elemsize = 4);
+    Mat(int w, int h, size_t elemsize = 4u, Allocator* allocator = 0);
     // dim
-    Mat(int w, int h, int c, size_t elemsize = 4);
+    Mat(int w, int h, int c, size_t elemsize = 4u, Allocator* allocator = 0);
     // copy
     Mat(const Mat& m);
     // external vec
-    Mat(int w, void* data, size_t elemsize = 4);
+    Mat(int w, void* data, size_t elemsize = 4u);
     // external image
-    Mat(int w, int h, void* data, size_t elemsize = 4);
+    Mat(int w, int h, void* data, size_t elemsize = 4u);
     // external dim
-    Mat(int w, int h, int c, void* data, size_t elemsize = 4);
+    Mat(int w, int h, int c, void* data, size_t elemsize = 4u);
     // release
     ~Mat();
     // assign
@@ -52,19 +53,19 @@ public:
     void fill(float v);
     template <typename T> void fill(T v);
     // deep copy
-    Mat clone() const;
+    Mat clone(Allocator* allocator = 0) const;
     // reshape vec
-    Mat reshape(int w) const;
+    Mat reshape(int w, Allocator* allocator = 0) const;
     // reshape image
-    Mat reshape(int w, int h) const;
+    Mat reshape(int w, int h, Allocator* allocator = 0) const;
     // reshape dim
-    Mat reshape(int w, int h, int c) const;
+    Mat reshape(int w, int h, int c, Allocator* allocator = 0) const;
     // allocate vec
-    void create(int w, size_t elemsize = 4);
+    void create(int w, size_t elemsize = 4u, Allocator* allocator = 0);
     // allocate image
-    void create(int w, int h, size_t elemsize = 4);
+    void create(int w, int h, size_t elemsize = 4u, Allocator* allocator = 0);
     // allocate dim
-    void create(int w, int h, int c, size_t elemsize = 4);
+    void create(int w, int h, int c, size_t elemsize = 4u, Allocator* allocator = 0);
     // refcount++
     void addref();
     // refcount--
@@ -115,9 +116,9 @@ public:
         PIXEL_RGBA2GRAY = PIXEL_RGBA | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
     };
     // convenient construct from pixel data
-    static Mat from_pixels(const unsigned char* pixels, int type, int w, int h);
+    static Mat from_pixels(const unsigned char* pixels, int type, int w, int h, Allocator* allocator = 0);
     // convenient construct from pixel data and resize to specific size
-    static Mat from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int target_width, int target_height);
+    static Mat from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int target_width, int target_height, Allocator* allocator = 0);
 
     // convenient export to pixel data
     void to_pixels(unsigned char* pixels, int type) const;
@@ -145,6 +146,9 @@ public:
     // 0 = empty
     size_t elemsize;
 
+    // the allocator
+    Allocator* allocator;
+
     // the dimensionality
     int dims;
 
@@ -169,100 +173,35 @@ enum
     BORDER_CONSTANT = 0,
     BORDER_REPLICATE = 1,
 };
-void copy_make_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int type, float v);
-void copy_cut_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right);
-void resize_bilinear(const Mat& src, Mat& dst, int w, int h);
-
-// the alignment of all the allocated buffers
-#define MALLOC_ALIGN    16
-
-// Aligns a pointer to the specified number of bytes
-// ptr Aligned pointer
-// n Alignment size that must be a power of two
-template<typename _Tp> static inline _Tp* alignPtr(_Tp* ptr, int n=(int)sizeof(_Tp))
-{
-    return (_Tp*)(((size_t)ptr + n-1) & -n);
-}
-
-// Aligns a buffer size to the specified number of bytes
-// The function returns the minimum number that is greater or equal to sz and is divisible by n
-// sz Buffer size to align
-// n Alignment size that must be a power of two
-static inline size_t alignSize(size_t sz, int n)
-{
-    return (sz + n-1) & -n;
-}
-
-static inline void* fastMalloc(size_t size)
-{
-    unsigned char* udata = (unsigned char*)malloc(size + sizeof(void*) + MALLOC_ALIGN);
-    if (!udata)
-        return 0;
-    unsigned char** adata = alignPtr((unsigned char**)udata + 1, MALLOC_ALIGN);
-    adata[-1] = udata;
-    return adata;
-}
-
-static inline void fastFree(void* ptr)
-{
-    if (ptr)
-    {
-        unsigned char* udata = ((unsigned char**)ptr)[-1];
-        free(udata);
-    }
-}
-
-// exchange-add operation for atomic operations on reference counters
-#if defined __INTEL_COMPILER && !(defined WIN32 || defined _WIN32)
-// atomic increment on the linux version of the Intel(tm) compiler
-#  define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd(const_cast<void*>(reinterpret_cast<volatile void*>(addr)), delta)
-#elif defined __GNUC__
-#  if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && !defined __EMSCRIPTEN__ && !defined(__CUDACC__)
-#    ifdef __ATOMIC_ACQ_REL
-#      define NCNN_XADD(addr, delta) __c11_atomic_fetch_add((_Atomic(int)*)(addr), delta, __ATOMIC_ACQ_REL)
-#    else
-#      define NCNN_XADD(addr, delta) __atomic_fetch_add((_Atomic(int)*)(addr), delta, 4)
-#    endif
-#  else
-#    if defined __ATOMIC_ACQ_REL && !defined __clang__
-// version for gcc >= 4.7
-#      define NCNN_XADD(addr, delta) (int)__atomic_fetch_add((unsigned*)(addr), (unsigned)(delta), __ATOMIC_ACQ_REL)
-#    else
-#      define NCNN_XADD(addr, delta) (int)__sync_fetch_and_add((unsigned*)(addr), (unsigned)(delta))
-#    endif
-#  endif
-#elif defined _MSC_VER && !defined RC_INVOKED
-#  include <intrin.h>
-#  define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)addr, delta)
-#else
-static inline void NCNN_XADD(int* addr, int delta) { int tmp = *addr; *addr += delta; return tmp; }
-#endif
+void copy_make_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int type, float v, Allocator* allocator = 0, int num_threads = 1);
+void copy_cut_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, Allocator* allocator = 0, int num_threads = 1);
+void resize_bilinear(const Mat& src, Mat& dst, int w, int h, Allocator* allocator = 0, int num_threads = 1);
 
 inline Mat::Mat()
-    : data(0), refcount(0), elemsize(0), dims(0), w(0), h(0), c(0), cstep(0)
+    : data(0), refcount(0), elemsize(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
 {
 }
 
-inline Mat::Mat(int _w, size_t _elemsize)
+inline Mat::Mat(int _w, size_t _elemsize, Allocator* allocator)
     : data(0), refcount(0), dims(0)
 {
-    create(_w, _elemsize);
+    create(_w, _elemsize, allocator);
 }
 
-inline Mat::Mat(int _w, int _h, size_t _elemsize)
+inline Mat::Mat(int _w, int _h, size_t _elemsize, Allocator* allocator)
     : data(0), refcount(0), dims(0)
 {
-    create(_w, _h, _elemsize);
+    create(_w, _h, _elemsize, allocator);
 }
 
-inline Mat::Mat(int _w, int _h, int _c, size_t _elemsize)
+inline Mat::Mat(int _w, int _h, int _c, size_t _elemsize, Allocator* allocator)
     : data(0), refcount(0), dims(0)
 {
-    create(_w, _h, _c, _elemsize);
+    create(_w, _h, _c, _elemsize, allocator);
 }
 
 inline Mat::Mat(const Mat& m)
-    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), dims(m.dims)
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), allocator(m.allocator), dims(m.dims)
 {
     if (refcount)
         NCNN_XADD(refcount, 1);
@@ -275,7 +214,7 @@ inline Mat::Mat(const Mat& m)
 }
 
 inline Mat::Mat(int _w, void* _data, size_t _elemsize)
-    : data(_data), refcount(0), elemsize(_elemsize), dims(1)
+    : data(_data), refcount(0), elemsize(_elemsize), allocator(0), dims(1)
 {
     w = _w;
     h = 1;
@@ -285,7 +224,7 @@ inline Mat::Mat(int _w, void* _data, size_t _elemsize)
 }
 
 inline Mat::Mat(int _w, int _h, void* _data, size_t _elemsize)
-    : data(_data), refcount(0), elemsize(_elemsize), dims(2)
+    : data(_data), refcount(0), elemsize(_elemsize), allocator(0), dims(2)
 {
     w = _w;
     h = _h;
@@ -295,7 +234,7 @@ inline Mat::Mat(int _w, int _h, void* _data, size_t _elemsize)
 }
 
 inline Mat::Mat(int _w, int _h, int _c, void* _data, size_t _elemsize)
-    : data(_data), refcount(0), elemsize(_elemsize), dims(3)
+    : data(_data), refcount(0), elemsize(_elemsize), allocator(0), dims(3)
 {
     w = _w;
     h = _h;
@@ -322,6 +261,7 @@ inline Mat& Mat::operator=(const Mat& m)
     data = m.data;
     refcount = m.refcount;
     elemsize = m.elemsize;
+    allocator = m.allocator;
 
     dims = m.dims;
     w = m.w;
@@ -398,18 +338,18 @@ inline void Mat::fill(T _v)
     }
 }
 
-inline Mat Mat::clone() const
+inline Mat Mat::clone(Allocator* allocator) const
 {
     if (empty())
         return Mat();
 
     Mat m;
     if (dims == 1)
-        m.create(w, elemsize);
+        m.create(w, elemsize, allocator);
     else if (dims == 2)
-        m.create(w, h, elemsize);
+        m.create(w, h, elemsize, allocator);
     else if (dims == 3)
-        m.create(w, h, c, elemsize);
+        m.create(w, h, c, elemsize, allocator);
 
     if (total() > 0)
     {
@@ -419,7 +359,7 @@ inline Mat Mat::clone() const
     return m;
 }
 
-inline Mat Mat::reshape(int _w) const
+inline Mat Mat::reshape(int _w, Allocator* allocator) const
 {
     if (w * h * c != _w)
         return Mat();
@@ -427,7 +367,7 @@ inline Mat Mat::reshape(int _w) const
     if (dims == 3 && cstep != (size_t)w * h)
     {
         Mat m;
-        m.create(_w, elemsize);
+        m.create(_w, elemsize, allocator);
 
         // flatten
         for (int i=0; i<c; i++)
@@ -452,7 +392,7 @@ inline Mat Mat::reshape(int _w) const
     return m;
 }
 
-inline Mat Mat::reshape(int _w, int _h) const
+inline Mat Mat::reshape(int _w, int _h, Allocator* allocator) const
 {
     if (w * h * c != _w * _h)
         return Mat();
@@ -460,7 +400,7 @@ inline Mat Mat::reshape(int _w, int _h) const
     if (dims == 3 && cstep != (size_t)w * h)
     {
         Mat m;
-        m.create(_w, _h, elemsize);
+        m.create(_w, _h, elemsize, allocator);
 
         // flatten
         for (int i=0; i<c; i++)
@@ -485,7 +425,7 @@ inline Mat Mat::reshape(int _w, int _h) const
     return m;
 }
 
-inline Mat Mat::reshape(int _w, int _h, int _c) const
+inline Mat Mat::reshape(int _w, int _h, int _c, Allocator* allocator) const
 {
     if (w * h * c != _w * _h * _c)
         return Mat();
@@ -495,7 +435,7 @@ inline Mat Mat::reshape(int _w, int _h, int _c) const
         if ((size_t)_w * _h != alignSize(_w * _h * elemsize, 16) / elemsize)
         {
             Mat m;
-            m.create(_w, _h, _c, elemsize);
+            m.create(_w, _h, _c, elemsize, allocator);
 
             // align channel
             for (int i=0; i<_c; i++)
@@ -511,8 +451,8 @@ inline Mat Mat::reshape(int _w, int _h, int _c) const
     else if (c != _c)
     {
         // flatten and then align
-        Mat tmp = reshape(_w * _h * _c);
-        return tmp.reshape(_w, _h, _c);
+        Mat tmp = reshape(_w * _h * _c, allocator);
+        return tmp.reshape(_w, _h, _c, allocator);
     }
 
     Mat m = *this;
@@ -527,14 +467,15 @@ inline Mat Mat::reshape(int _w, int _h, int _c) const
     return m;
 }
 
-inline void Mat::create(int _w, size_t _elemsize)
+inline void Mat::create(int _w, size_t _elemsize, Allocator* _allocator)
 {
-    if (dims == 1 && w == _w && elemsize == _elemsize)
+    if (dims == 1 && w == _w && elemsize == _elemsize && allocator == _allocator)
         return;
 
     release();
 
     elemsize = _elemsize;
+    allocator = _allocator;
 
     dims = 1;
     w = _w;
@@ -546,20 +487,24 @@ inline void Mat::create(int _w, size_t _elemsize)
     if (total() > 0)
     {
         size_t totalsize = total() * elemsize;
-        data = fastMalloc(totalsize + (int)sizeof(*refcount));
+        if (allocator)
+            data = allocator->fastMalloc(totalsize + (int)sizeof(*refcount));
+        else
+            data = fastMalloc(totalsize + (int)sizeof(*refcount));
         refcount = (int*)(((unsigned char*)data) + totalsize);
         *refcount = 1;
     }
 }
 
-inline void Mat::create(int _w, int _h, size_t _elemsize)
+inline void Mat::create(int _w, int _h, size_t _elemsize, Allocator* _allocator)
 {
-    if (dims == 2 && w == _w && h == _h && elemsize == _elemsize)
+    if (dims == 2 && w == _w && h == _h && elemsize == _elemsize && allocator == _allocator)
         return;
 
     release();
 
     elemsize = _elemsize;
+    allocator = _allocator;
 
     dims = 2;
     w = _w;
@@ -571,20 +516,24 @@ inline void Mat::create(int _w, int _h, size_t _elemsize)
     if (total() > 0)
     {
         size_t totalsize = total() * elemsize;
-        data = fastMalloc(totalsize + (int)sizeof(*refcount));
+        if (allocator)
+            data = allocator->fastMalloc(totalsize + (int)sizeof(*refcount));
+        else
+            data = fastMalloc(totalsize + (int)sizeof(*refcount));
         refcount = (int*)(((unsigned char*)data) + totalsize);
         *refcount = 1;
     }
 }
 
-inline void Mat::create(int _w, int _h, int _c, size_t _elemsize)
+inline void Mat::create(int _w, int _h, int _c, size_t _elemsize, Allocator* _allocator)
 {
-    if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize)
+    if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && allocator == _allocator)
         return;
 
     release();
 
     elemsize = _elemsize;
+    allocator = _allocator;
 
     dims = 3;
     w = _w;
@@ -596,7 +545,10 @@ inline void Mat::create(int _w, int _h, int _c, size_t _elemsize)
     if (total() > 0)
     {
         size_t totalsize = total() * elemsize;
-        data = fastMalloc(totalsize + (int)sizeof(*refcount));
+        if (allocator)
+            data = allocator->fastMalloc(totalsize + (int)sizeof(*refcount));
+        else
+            data = fastMalloc(totalsize + (int)sizeof(*refcount));
         refcount = (int*)(((unsigned char*)data) + totalsize);
         *refcount = 1;
     }
@@ -611,7 +563,12 @@ inline void Mat::addref()
 inline void Mat::release()
 {
     if (refcount && NCNN_XADD(refcount, -1) == 1)
-        fastFree(data);
+    {
+        if (allocator)
+            allocator->fastFree(data);
+        else
+            fastFree(data);
+    }
 
     data = 0;
 
diff --git a/src/mat_pixel.cpp b/src/mat_pixel.cpp
index f07f88647..531788276 100644
--- a/src/mat_pixel.cpp
+++ b/src/mat_pixel.cpp
@@ -24,9 +24,9 @@
 namespace ncnn {
 
 #if NCNN_PIXEL
-static Mat from_rgb(const unsigned char* rgb, int w, int h)
+static Mat from_rgb(const unsigned char* rgb, int w, int h, Allocator* allocator)
 {
-    Mat m(w, h, 3);
+    Mat m(w, h, 3, 4u, allocator);
     if (m.empty())
         return m;
 
@@ -155,9 +155,9 @@ static void to_rgb(const Mat& m, unsigned char* rgb)
 #undef SATURATE_CAST_UCHAR
 }
 
-static Mat from_gray(const unsigned char* gray, int w, int h)
+static Mat from_gray(const unsigned char* gray, int w, int h, Allocator* allocator)
 {
-    Mat m(w, h, 1);
+    Mat m(w, h, 1, 4u, allocator);
     if (m.empty())
         return m;
 
@@ -257,9 +257,9 @@ static void to_gray(const Mat& m, unsigned char* gray)
 #undef SATURATE_CAST_UCHAR
 }
 
-static Mat from_rgba(const unsigned char* rgba, int w, int h)
+static Mat from_rgba(const unsigned char* rgba, int w, int h, Allocator* allocator)
 {
-    Mat m(w, h, 4);
+    Mat m(w, h, 4, 4u, allocator);
     if (m.empty())
         return m;
 
@@ -408,9 +408,9 @@ static void to_rgba(const Mat& m, unsigned char* rgba)
 #undef SATURATE_CAST_UCHAR
 }
 
-static Mat from_rgb2bgr(const unsigned char* rgb, int w, int h)
+static Mat from_rgb2bgr(const unsigned char* rgb, int w, int h, Allocator* allocator)
 {
-    Mat m(w, h, 3);
+    Mat m(w, h, 3, 4u, allocator);
     if (m.empty())
         return m;
 
@@ -539,7 +539,7 @@ static void to_bgr2rgb(const Mat& m, unsigned char* rgb)
 #undef SATURATE_CAST_UCHAR
 }
 
-static Mat from_rgb2gray(const unsigned char* rgb, int w, int h)
+static Mat from_rgb2gray(const unsigned char* rgb, int w, int h, Allocator* allocator)
 {
     // coeffs for r g b = 0.299f, 0.587f, 0.114f
     const unsigned char Y_shift = 8;//14
@@ -547,7 +547,7 @@ static Mat from_rgb2gray(const unsigned char* rgb, int w, int h)
     const unsigned char G2Y = 150;
     const unsigned char B2Y = 29;
 
-    Mat m(w, h, 1);
+    Mat m(w, h, 1, 4u, allocator);
     if (m.empty())
         return m;
 
@@ -631,7 +631,7 @@ static Mat from_rgb2gray(const unsigned char* rgb, int w, int h)
     return m;
 }
 
-static Mat from_bgr2gray(const unsigned char* bgr, int w, int h)
+static Mat from_bgr2gray(const unsigned char* bgr, int w, int h, Allocator* allocator)
 {
     // coeffs for r g b = 0.299f, 0.587f, 0.114f
     const unsigned char Y_shift = 8;//14
@@ -639,7 +639,7 @@ static Mat from_bgr2gray(const unsigned char* bgr, int w, int h)
     const unsigned char G2Y = 150;
     const unsigned char B2Y = 29;
 
-    Mat m(w, h, 1);
+    Mat m(w, h, 1, 4u, allocator);
     if (m.empty())
         return m;
 
@@ -723,9 +723,9 @@ static Mat from_bgr2gray(const unsigned char* bgr, int w, int h)
     return m;
 }
 
-static Mat from_gray2rgb(const unsigned char* gray, int w, int h)
+static Mat from_gray2rgb(const unsigned char* gray, int w, int h, Allocator* allocator)
 {
-    Mat m(w, h, 3);
+    Mat m(w, h, 3, 4u, allocator);
     if (m.empty())
         return m;
 
@@ -830,9 +830,9 @@ static Mat from_gray2rgb(const unsigned char* gray, int w, int h)
     return m;
 }
 
-static Mat from_rgba2rgb(const unsigned char* rgba, int w, int h)
+static Mat from_rgba2rgb(const unsigned char* rgba, int w, int h, Allocator* allocator)
 {
-    Mat m(w, h, 3);
+    Mat m(w, h, 3, 4u, allocator);
     if (m.empty())
         return m;
 
@@ -934,9 +934,9 @@ static Mat from_rgba2rgb(const unsigned char* rgba, int w, int h)
     return m;
 }
 
-static Mat from_rgba2bgr(const unsigned char* rgba, int w, int h)
+static Mat from_rgba2bgr(const unsigned char* rgba, int w, int h, Allocator* allocator)
 {
-    Mat m(w, h, 3);
+    Mat m(w, h, 3, 4u, allocator);
     if (m.empty())
         return m;
 
@@ -1038,7 +1038,7 @@ static Mat from_rgba2bgr(const unsigned char* rgba, int w, int h)
     return m;
 }
 
-static Mat from_rgba2gray(const unsigned char* rgba, int w, int h)
+static Mat from_rgba2gray(const unsigned char* rgba, int w, int h, Allocator* allocator)
 {
     // coeffs for r g b = 0.299f, 0.587f, 0.114f
     const unsigned char Y_shift = 8;//14
@@ -1046,7 +1046,7 @@ static Mat from_rgba2gray(const unsigned char* rgba, int w, int h)
     const unsigned char G2Y = 150;
     const unsigned char B2Y = 29;
 
-    Mat m(w, h, 1);
+    Mat m(w, h, 1, 4u, allocator);
     if (m.empty())
         return m;
 
@@ -1972,47 +1972,47 @@ void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned c
     delete[] buf;
 }
 
-Mat Mat::from_pixels(const unsigned char* pixels, int type, int w, int h)
+Mat Mat::from_pixels(const unsigned char* pixels, int type, int w, int h, Allocator* allocator)
 {
     if (type & PIXEL_CONVERT_MASK)
     {
         if (type == PIXEL_RGB2BGR || type == PIXEL_BGR2RGB)
-            return from_rgb2bgr(pixels, w, h);
+            return from_rgb2bgr(pixels, w, h, allocator);
 
         if (type == PIXEL_RGB2GRAY)
-            return from_rgb2gray(pixels, w, h);
+            return from_rgb2gray(pixels, w, h, allocator);
 
         if (type == PIXEL_BGR2GRAY)
-            return from_bgr2gray(pixels, w, h);
+            return from_bgr2gray(pixels, w, h, allocator);
 
         if (type == PIXEL_GRAY2RGB || type == PIXEL_GRAY2BGR)
-            return from_gray2rgb(pixels, w, h);
+            return from_gray2rgb(pixels, w, h, allocator);
 
         if (type == PIXEL_RGBA2RGB)
-            return from_rgba2rgb(pixels, w, h);
+            return from_rgba2rgb(pixels, w, h, allocator);
 
         if (type == PIXEL_RGBA2BGR)
-            return from_rgba2bgr(pixels, w, h);
+            return from_rgba2bgr(pixels, w, h, allocator);
 
         if (type == PIXEL_RGBA2GRAY)
-            return from_rgba2gray(pixels, w, h);
+            return from_rgba2gray(pixels, w, h, allocator);
     }
     else
     {
         if (type == PIXEL_RGB || type == PIXEL_BGR)
-            return from_rgb(pixels, w, h);
+            return from_rgb(pixels, w, h, allocator);
 
         if (type == PIXEL_GRAY)
-            return from_gray(pixels, w, h);
+            return from_gray(pixels, w, h, allocator);
 
         if (type == PIXEL_RGBA)
-            return from_rgba(pixels, w, h);
+            return from_rgba(pixels, w, h, allocator);
     }
 
     return Mat();
 }
 
-Mat Mat::from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int target_width, int target_height)
+Mat Mat::from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int target_width, int target_height, Allocator* allocator)
 {
     if (w == target_width && h == target_height)
-        return Mat::from_pixels(pixels, type, w, h);
+        return Mat::from_pixels(pixels, type, w, h, allocator); // no-resize fast path must honour the caller's allocator too
@@ -2027,7 +2027,7 @@ Mat Mat::from_pixels_resize(const unsigned char* pixels, int type, int w, int h,
 
         resize_bilinear_c3(pixels, w, h, dst, target_width, target_height);
 
-        m = Mat::from_pixels(dst, type, target_width, target_height);
+        m = Mat::from_pixels(dst, type, target_width, target_height, allocator);
 
         delete[] dst;
     }
@@ -2037,7 +2037,7 @@ Mat Mat::from_pixels_resize(const unsigned char* pixels, int type, int w, int h,
 
         resize_bilinear_c1(pixels, w, h, dst, target_width, target_height);
 
-        m = Mat::from_pixels(dst, type, target_width, target_height);
+        m = Mat::from_pixels(dst, type, target_width, target_height, allocator);
 
         delete[] dst;
     }
@@ -2047,7 +2047,7 @@ Mat Mat::from_pixels_resize(const unsigned char* pixels, int type, int w, int h,
 
         resize_bilinear_c4(pixels, w, h, dst, target_width, target_height);
 
-        m = Mat::from_pixels(dst, type, target_width, target_height);
+        m = Mat::from_pixels(dst, type, target_width, target_height, allocator);
 
         delete[] dst;
     }
diff --git a/src/net.cpp b/src/net.cpp
index 2c812c612..e60efe04a 100644
--- a/src/net.cpp
+++ b/src/net.cpp
@@ -622,7 +622,7 @@ Layer* Net::create_custom_layer(int index)
     return layer_creator();
 }
 
-int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, bool lightmode) const
+int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, Option& opt) const
 {
     const Layer* layer = layers[layer_index];
 
@@ -636,14 +636,14 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, bool lightm
 
         if (blob_mats[bottom_blob_index].dims == 0)
         {
-            int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, lightmode);
+            int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, opt);
             if (ret != 0)
                 return ret;
         }
 
         Mat bottom_blob = blob_mats[bottom_blob_index];
 
-        if (lightmode)
+        if (opt.lightmode)
         {
             // delete after taken in light mode
             blob_mats[bottom_blob_index].release();
@@ -655,16 +655,16 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, bool lightm
         }
 
         // forward
-        if (lightmode && layer->support_inplace)
+        if (opt.lightmode && layer->support_inplace)
         {
             Mat& bottom_top_blob = bottom_blob;
 #if NCNN_BENCHMARK
             double start = get_current_time();
-            int ret = layer->forward_inplace(bottom_top_blob);
+            int ret = layer->forward_inplace(bottom_top_blob, opt);
             double end = get_current_time();
             benchmark(layer, bottom_top_blob, bottom_top_blob, start, end);
 #else
-            int ret = layer->forward_inplace(bottom_top_blob);
+            int ret = layer->forward_inplace(bottom_top_blob, opt);
 #endif // NCNN_BENCHMARK
             if (ret != 0)
                 return ret;
@@ -677,11 +677,11 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, bool lightm
             Mat top_blob;
 #if NCNN_BENCHMARK
             double start = get_current_time();
-            int ret = layer->forward(bottom_blob, top_blob);
+            int ret = layer->forward(bottom_blob, top_blob, opt);
             double end = get_current_time();
             benchmark(layer, bottom_blob, top_blob, start, end);
 #else
-            int ret = layer->forward(bottom_blob, top_blob);
+            int ret = layer->forward(bottom_blob, top_blob, opt);
 #endif // NCNN_BENCHMARK
             if (ret != 0)
                 return ret;
@@ -702,14 +702,14 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, bool lightm
 
             if (blob_mats[bottom_blob_index].dims == 0)
             {
-                int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, lightmode);
+                int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, opt);
                 if (ret != 0)
                     return ret;
             }
 
             bottom_blobs[i] = blob_mats[bottom_blob_index];
 
-            if (lightmode)
+            if (opt.lightmode)
             {
                 // delete after taken in light mode
                 blob_mats[bottom_blob_index].release();
@@ -722,16 +722,16 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, bool lightm
         }
 
         // forward
-        if (lightmode && layer->support_inplace)
+        if (opt.lightmode && layer->support_inplace)
         {
             std::vector<Mat>& bottom_top_blobs = bottom_blobs;
 #if NCNN_BENCHMARK
             double start = get_current_time();
-            int ret = layer->forward_inplace(bottom_top_blobs);
+            int ret = layer->forward_inplace(bottom_top_blobs, opt);
             double end = get_current_time();
             benchmark(layer, start, end);
 #else
-            int ret = layer->forward_inplace(bottom_top_blobs);
+            int ret = layer->forward_inplace(bottom_top_blobs, opt);
 #endif // NCNN_BENCHMARK
             if (ret != 0)
                 return ret;
@@ -750,11 +750,11 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, bool lightm
             top_blobs.resize(layer->tops.size());
 #if NCNN_BENCHMARK
             double start = get_current_time();
-            int ret = layer->forward(bottom_blobs, top_blobs);
+            int ret = layer->forward(bottom_blobs, top_blobs, opt);
             double end = get_current_time();
             benchmark(layer, start, end);
 #else
-            int ret = layer->forward(bottom_blobs, top_blobs);
+            int ret = layer->forward(bottom_blobs, top_blobs, opt);
 #endif // NCNN_BENCHMARK
             if (ret != 0)
                 return ret;
@@ -779,18 +779,27 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, bool lightm
 Extractor::Extractor(const Net* _net, int blob_count) : net(_net)
 {
     blob_mats.resize(blob_count);
-    lightmode = true;
-    num_threads = 0;
+    opt = get_default_option(); // a single Option replaces the separate lightmode/num_threads members
 }
 
 void Extractor::set_light_mode(bool enable)
 {
-    lightmode = enable;
+    opt.lightmode = enable; // checked by Net::forward_layer through the opt passed from extract()
 }
 
-void Extractor::set_num_threads(int _num_threads)
+void Extractor::set_num_threads(int num_threads)
 {
-    num_threads = _num_threads;
+    opt.num_threads = num_threads; // only recorded here; extract() no longer calls omp_set_num_threads itself
+}
+
+void Extractor::set_blob_allocator(Allocator* allocator)
+{
+    opt.blob_allocator = allocator; // pointer stored as given; reaches layers via the opt passed to forward_layer
+}
+
+void Extractor::set_workspace_allocator(Allocator* allocator)
+{
+    opt.workspace_allocator = allocator; // pointer stored as given; reaches layers via the opt passed to forward_layer
 }
 
 int Extractor::input(int blob_index, const Mat& in)
@@ -813,28 +822,7 @@ int Extractor::extract(int blob_index, Mat& feat)
     if (blob_mats[blob_index].dims == 0)
     {
         int layer_index = net->blobs[blob_index].producer;
-
-#ifdef _OPENMP
-        int dynamic_current = 0;
-        int num_threads_current = 1;
-        if (num_threads)
-        {
-            dynamic_current = omp_get_dynamic();
-            num_threads_current = omp_get_num_threads();
-            omp_set_dynamic(0);
-            omp_set_num_threads(num_threads);
-        }
-#endif
-
-        ret = net->forward_layer(layer_index, blob_mats, lightmode);
-
-#ifdef _OPENMP
-        if (num_threads)
-        {
-            omp_set_dynamic(dynamic_current);
-            omp_set_num_threads(num_threads_current);
-        }
-#endif
+        ret = net->forward_layer(layer_index, blob_mats, opt);
     }
 
     feat = blob_mats[blob_index];
@@ -865,28 +853,7 @@ int Extractor::extract(const char* blob_name, Mat& feat)
     if (blob_mats[blob_index].dims == 0)
     {
         int layer_index = net->blobs[blob_index].producer;
-
-#ifdef _OPENMP
-        int dynamic_current = 0;
-        int num_threads_current = 1;
-        if (num_threads)
-        {
-            dynamic_current = omp_get_dynamic();
-            num_threads_current = omp_get_num_threads();
-            omp_set_dynamic(0);
-            omp_set_num_threads(num_threads);
-        }
-#endif
-
-        ret = net->forward_layer(layer_index, blob_mats, lightmode);
-
-#ifdef _OPENMP
-        if (num_threads)
-        {
-            omp_set_dynamic(dynamic_current);
-            omp_set_num_threads(num_threads_current);
-        }
-#endif
+        ret = net->forward_layer(layer_index, blob_mats, opt);
     }
 
     feat = blob_mats[blob_index];
diff --git a/src/net.h b/src/net.h
index 2bd7976da..99b8d6fa0 100644
--- a/src/net.h
+++ b/src/net.h
@@ -87,7 +87,7 @@ protected:
     Layer* create_custom_layer(const char* type);
 #endif // NCNN_STRING
     Layer* create_custom_layer(int index);
-    int forward_layer(int layer_index, std::vector<Mat>& blob_mats, bool lightmode) const;
+    int forward_layer(int layer_index, std::vector<Mat>& blob_mats, Option& opt) const;
 
 protected:
     std::vector<Blob> blobs;
@@ -109,6 +109,12 @@ public:
     // default count is system depended
     void set_num_threads(int num_threads);
 
+    // set blob memory allocator
+    void set_blob_allocator(Allocator* allocator);
+
+    // set workspace memory allocator
+    void set_workspace_allocator(Allocator* allocator);
+
 #if NCNN_STRING
     // set input by blob name
     // return 0 if success
@@ -134,8 +140,7 @@ protected:
 private:
     const Net* net;
     std::vector<Mat> blob_mats;
-    bool lightmode;
-    int num_threads;
+    Option opt;
 };
 
 } // namespace ncnn