implement ncnn blob/workspace allocator, fine-grained per-layer openmp threads control, fix #469

8 years ago · 9706cd1447
--- a/benchmark/benchncnn.cpp
+++ b/benchmark/benchncnn.cpp
@@ -52,6 +52,9 @@ public:

 static int g_loop_count = 4;

 // Pool allocators shared by all benchmark runs: main() installs them as the default
 // blob / workspace allocators, and benchmark() clears them after each model is loaded.
 static ncnn::UnlockedPoolAllocator g_blob_pool_allocator;
 static ncnn::PoolAllocator g_workspace_pool_allocator;

 void benchmark(const char* comment, void (*init)(ncnn::Net&), void (*run)(const ncnn::Net&))
 {
    ncnn::BenchNet net;
@@ -60,6 +63,9 @@ void benchmark(const char* comment, void (*init)(ncnn::Net&), void (*run)(const

    net.load_model();

    g_blob_pool_allocator.clear();
    g_workspace_pool_allocator.clear();

    // sleep 10 seconds for cooling down SOC  :(
 #ifdef _WIN32
    Sleep(10 * 1000);
@@ -265,8 +271,6 @@ void mobilenet_yolo_run(const ncnn::Net& net)
 {
    ncnn::Extractor ex = net.create_extractor();

    // NOTE original model input is 416x416x3
    // you may change to 300x300x3 for comparison with ssd
    ncnn::Mat in(416, 416, 3);
    ex.input("data", in);

@@ -295,6 +299,17 @@ int main(int argc, char** argv)

    g_loop_count = loop_count;

    g_blob_pool_allocator.set_size_compare_ratio(0.0f);
    g_workspace_pool_allocator.set_size_compare_ratio(0.5f);

    ncnn::Option opt;
    opt.lightmode = true;
    opt.num_threads = num_threads;
    opt.blob_allocator = &g_blob_pool_allocator;
    opt.workspace_allocator = &g_workspace_pool_allocator;

    ncnn::set_default_option(opt);

    ncnn::set_cpu_powersave(powersave);

    ncnn::set_omp_dynamic(0);
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -8,6 +8,7 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR})
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/layer)

 set(ncnn_SRCS
    allocator.cpp
    blob.cpp
    cpu.cpp
    layer.cpp
--- a/src/allocator.cpp
+++ b/src/allocator.cpp
@@ -0,0 +1,237 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "allocator.h"

 #include <stdio.h>

 namespace ncnn {

 // Create an empty pool; the reuse threshold starts at 0.75 (stored as 8-bit fixed point).
 PoolAllocator::PoolAllocator()
    : size_compare_ratio(192) // 0.75f * 256
 {
 }

 // Free every idle buffer, then report (without freeing) any buffer that was
 // handed out by fastMalloc() and never returned through fastFree().
 PoolAllocator::~PoolAllocator()
 {
    clear();

    if (payouts.empty())
        return;

    fprintf(stderr, "FATAL ERROR! pool allocator destroyed too early\n");

    typedef std::list< std::pair<size_t, void*> >::iterator payout_iterator;
    for (payout_iterator entry = payouts.begin(); entry != payouts.end(); ++entry)
    {
        fprintf(stderr, "%p still in use\n", entry->second);
    }
 }

 // Release all idle (budget) buffers; buffers currently handed out are not touched.
 void PoolAllocator::clear()
 {
    budgets_lock.lock();

    while (!budgets.empty())
    {
        // qualified on purpose: this is the namespace-level free function,
        // not PoolAllocator::fastFree
        ncnn::fastFree(budgets.front().second);
        budgets.pop_front();
    }

    budgets_lock.unlock();
 }

 // Set how closely an idle buffer must match a request before it is reused.
 // scr  ratio in the range 0 ~ 1, stored internally as 8-bit fixed point (scr * 256)
 // An out-of-range or NaN value is reported on stderr and the current ratio is kept.
 void PoolAllocator::set_size_compare_ratio(float scr)
 {
    // written as a negated range test so that NaN is rejected as well;
    // converting NaN to unsigned int below would be undefined behaviour
    if (!(scr >= 0.f && scr <= 1.f))
    {
        fprintf(stderr, "invalid size compare ratio %f\n", scr);
        return;
    }

    size_compare_ratio = (unsigned int)(scr * 256);
 }

 // Hand out a buffer of at least size bytes.
 // The first idle buffer of bs bytes with bs >= size and
 // ((bs * size_compare_ratio) >> 8) <= size is reused; otherwise a new buffer
 // is obtained from ncnn::fastMalloc. Every buffer handed out is recorded in
 // payouts together with its capacity.
 // Returns 0 when a new buffer is needed and the underlying allocation fails.
 void* PoolAllocator::fastMalloc(size_t size)
 {
    budgets_lock.lock();

    // find free budget
    std::list< std::pair<size_t, void*> >::iterator it = budgets.begin();
    for (; it != budgets.end(); ++it)
    {
        size_t bs = it->first;

        // size_compare_ratio ~ 100%
        if (bs >= size && ((bs * size_compare_ratio) >> 8) <= size)
        {
            void* ptr = it->second;

            budgets.erase(it);

            budgets_lock.unlock();

            payouts_lock.lock();

            payouts.push_back(std::make_pair(bs, ptr));

            payouts_lock.unlock();

            return ptr;
        }
    }

    budgets_lock.unlock();

    // new
    void* ptr = ncnn::fastMalloc(size);
    if (!ptr)
    {
        // never track a failed allocation: a null entry in payouts could be
        // moved to budgets by fastFree and later be handed out as a valid buffer
        return 0;
    }

    payouts_lock.lock();

    payouts.push_back(std::make_pair(size, ptr));

    payouts_lock.unlock();

    return ptr;
 }

 // Return a buffer obtained from fastMalloc() to the idle list so it can be reused.
 // A pointer this pool did not hand out is reported on stderr and passed to ncnn::fastFree.
 void PoolAllocator::fastFree(void* ptr)
 {
    typedef std::list< std::pair<size_t, void*> >::iterator entry_iterator;

    payouts_lock.lock();

    // locate the bookkeeping entry of this buffer
    entry_iterator found = payouts.begin();
    while (found != payouts.end() && found->second != ptr)
        ++found;

    if (found == payouts.end())
    {
        payouts_lock.unlock();

        fprintf(stderr, "FATAL ERROR! pool allocator get wild %p\n", ptr);
        ncnn::fastFree(ptr);
        return;
    }

    const size_t capacity = found->first;
    payouts.erase(found);

    payouts_lock.unlock();

    // return to budgets
    budgets_lock.lock();
    budgets.push_back(std::make_pair(capacity, ptr));
    budgets_lock.unlock();
 }

 // Create an empty pool; the reuse threshold starts at 0.75 (stored as 8-bit fixed point).
 UnlockedPoolAllocator::UnlockedPoolAllocator()
    : size_compare_ratio(192) // 0.75f * 256
 {
 }

 // Free every idle buffer, then report (without freeing) any buffer that was
 // handed out by fastMalloc() and never returned through fastFree().
 UnlockedPoolAllocator::~UnlockedPoolAllocator()
 {
    clear();

    if (payouts.empty())
        return;

    fprintf(stderr, "FATAL ERROR! unlocked pool allocator destroyed too early\n");

    typedef std::list< std::pair<size_t, void*> >::iterator payout_iterator;
    for (payout_iterator entry = payouts.begin(); entry != payouts.end(); ++entry)
    {
        fprintf(stderr, "%p still in use\n", entry->second);
    }
 }

 // Release all idle (budget) buffers; buffers currently handed out are not touched.
 void UnlockedPoolAllocator::clear()
 {
    while (!budgets.empty())
    {
        // qualified on purpose: this is the namespace-level free function,
        // not UnlockedPoolAllocator::fastFree
        ncnn::fastFree(budgets.front().second);
        budgets.pop_front();
    }
 }

 // Set how closely an idle buffer must match a request before it is reused.
 // scr  ratio in the range 0 ~ 1, stored internally as 8-bit fixed point (scr * 256)
 // An out-of-range or NaN value is reported on stderr and the current ratio is kept.
 void UnlockedPoolAllocator::set_size_compare_ratio(float scr)
 {
    // written as a negated range test so that NaN is rejected as well;
    // converting NaN to unsigned int below would be undefined behaviour
    if (!(scr >= 0.f && scr <= 1.f))
    {
        fprintf(stderr, "invalid size compare ratio %f\n", scr);
        return;
    }

    size_compare_ratio = (unsigned int)(scr * 256);
 }

 // Hand out a buffer of at least size bytes. No locking is performed.
 // The first idle buffer of bs bytes with bs >= size and
 // ((bs * size_compare_ratio) >> 8) <= size is reused; otherwise a new buffer
 // is obtained from ncnn::fastMalloc. Every buffer handed out is recorded in
 // payouts together with its capacity.
 // Returns 0 when a new buffer is needed and the underlying allocation fails.
 void* UnlockedPoolAllocator::fastMalloc(size_t size)
 {
    // find free budget
    std::list< std::pair<size_t, void*> >::iterator it = budgets.begin();
    for (; it != budgets.end(); ++it)
    {
        size_t bs = it->first;

        // size_compare_ratio ~ 100%
        if (bs >= size && ((bs * size_compare_ratio) >> 8) <= size)
        {
            void* ptr = it->second;

            budgets.erase(it);

            payouts.push_back(std::make_pair(bs, ptr));

            return ptr;
        }
    }

    // new
    void* ptr = ncnn::fastMalloc(size);
    if (!ptr)
    {
        // never track a failed allocation: a null entry in payouts could be
        // moved to budgets by fastFree and later be handed out as a valid buffer
        return 0;
    }

    payouts.push_back(std::make_pair(size, ptr));

    return ptr;
 }

 // Return a buffer obtained from fastMalloc() to the idle list so it can be reused.
 // A pointer this pool did not hand out is reported on stderr and passed to ncnn::fastFree.
 void UnlockedPoolAllocator::fastFree(void* ptr)
 {
    typedef std::list< std::pair<size_t, void*> >::iterator entry_iterator;

    // locate the bookkeeping entry of this buffer
    entry_iterator found = payouts.begin();
    while (found != payouts.end() && found->second != ptr)
        ++found;

    if (found == payouts.end())
    {
        fprintf(stderr, "FATAL ERROR! unlocked pool allocator get wild %p\n", ptr);
        ncnn::fastFree(ptr);
        return;
    }

    // return to budgets
    const size_t capacity = found->first;
    payouts.erase(found);
    budgets.push_back(std::make_pair(capacity, ptr));
 }

 } // namespace ncnn
--- a/src/allocator.h
+++ b/src/allocator.h
@@ -0,0 +1,175 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef NCNN_ALLOCATOR_H
 #define NCNN_ALLOCATOR_H

 #ifdef _WIN32
 #define WIN32_LEAN_AND_MEAN
 #include <windows.h>
 #else
 #include <pthread.h>
 #endif

 #include <stdlib.h>
 #include <list>

 namespace ncnn {

 // the alignment of all the allocated buffers
 #define MALLOC_ALIGN    16

 // Round a pointer up to the next multiple of n bytes.
 // ptr  pointer to align
 // n    alignment in bytes, must be a power of two (defaults to sizeof(_Tp))
 // Returns ptr itself when it is already aligned.
 template<typename _Tp> static inline _Tp* alignPtr(_Tp* ptr, int n=(int)sizeof(_Tp))
 {
    const size_t mask = (size_t)n - 1;
    const size_t address = (size_t)ptr;
    return (_Tp*)((address + mask) & ~mask);
 }

 // Round a buffer size up to the next multiple of n.
 // sz  buffer size to align
 // n   alignment, must be a power of two
 // Returns the smallest value that is >= sz and divisible by n.
 static inline size_t alignSize(size_t sz, int n)
 {
    const size_t step = (size_t)n;
    return (sz + step - 1) & ~(step - 1);
 }

 // Allocate size usable bytes whose start address is aligned to MALLOC_ALIGN.
 // The pointer returned by malloc() is stored in the pointer-sized slot directly
 // below the returned address, which is where fastFree() reads it back.
 // Returns 0 when malloc() fails.
 static inline void* fastMalloc(size_t size)
 {
    // extra room: one pointer slot for the saved malloc() result plus alignment padding
    void* raw = malloc(size + sizeof(void*) + MALLOC_ALIGN);
    if (raw == 0)
        return 0;

    unsigned char** aligned = alignPtr((unsigned char**)raw + 1, MALLOC_ALIGN);
    *(aligned - 1) = (unsigned char*)raw;
    return aligned;
 }

 // Release a buffer returned by fastMalloc(); a null pointer is ignored.
 // The pointer to hand to free() is read from the slot directly below ptr.
 static inline void fastFree(void* ptr)
 {
    if (!ptr)
        return;

    unsigned char** slot = (unsigned char**)ptr - 1;
    free(*slot);
 }

 // exchange-add operation for atomic operations on reference counters
 #if defined __INTEL_COMPILER && !(defined WIN32 || defined _WIN32)
 // atomic increment on the linux version of the Intel(tm) compiler
 #  define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd(const_cast<void*>(reinterpret_cast<volatile void*>(addr)), delta)
 #elif defined __GNUC__
 #  if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && !defined __EMSCRIPTEN__ && !defined(__CUDACC__)
 #    ifdef __ATOMIC_ACQ_REL
 #      define NCNN_XADD(addr, delta) __c11_atomic_fetch_add((_Atomic(int)*)(addr), delta, __ATOMIC_ACQ_REL)
 #    else
 #      define NCNN_XADD(addr, delta) __atomic_fetch_add((_Atomic(int)*)(addr), delta, 4)
 #    endif
 #  else
 #    if defined __ATOMIC_ACQ_REL && !defined __clang__
 // version for gcc >= 4.7
 #      define NCNN_XADD(addr, delta) (int)__atomic_fetch_add((unsigned*)(addr), (unsigned)(delta), __ATOMIC_ACQ_REL)
 #    else
 #      define NCNN_XADD(addr, delta) (int)__sync_fetch_and_add((unsigned*)(addr), (unsigned)(delta))
 #    endif
 #  endif
 #elif defined _MSC_VER && !defined RC_INVOKED
 #  include <intrin.h>
 #  define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)addr, delta)
 #else
 // fallback when no compiler intrinsic is selected above: a plain, NON-atomic
 // exchange-add that adds delta to *addr and returns the previous value
 static inline int NCNN_XADD(int* addr, int delta) { int tmp = *addr; *addr += delta; return tmp; }
 #endif

 #ifdef _WIN32
 class Mutex
 {
 public:
    Mutex() { InitializeSRWLock(&lock); }
    ~Mutex() { }
    void lock() { AcquireSRWLockExclusive(&lock); }
    void unlock() { ReleaseSRWLockExclusive(&lock); }
 private:
    // NOTE SRWLock is available from windows vista
    SRWLOCK lock;
 };
 #else // _WIN32
 // Exclusive lock wrapping a pthread mutex initialised with a null attribute pointer.
 // The mutex is created in the constructor and destroyed in the destructor.
 class Mutex
 {
 public:
    Mutex()
    {
        pthread_mutex_init(&handle, 0);
    }

    ~Mutex()
    {
        pthread_mutex_destroy(&handle);
    }

    void lock()
    {
        pthread_mutex_lock(&handle);
    }

    void unlock()
    {
        pthread_mutex_unlock(&handle);
    }

 private:
    pthread_mutex_t handle;
 };
 #endif // _WIN32

 // Abstract allocator interface.
 // fastMalloc returns a buffer of at least size bytes; fastFree gives a buffer
 // obtained from the same allocator back to it.
 class Allocator
 {
 public:
    // virtual so that destroying a concrete allocator through an Allocator*
    // runs the derived class destructor
    virtual ~Allocator() {}

    virtual void* fastMalloc(size_t size) = 0;
    virtual void fastFree(void* ptr) = 0;
 };

 // Allocator that caches released buffers and reuses them for later requests.
 // budgets holds idle buffers, payouts holds buffers currently handed out;
 // each list is guarded by its own mutex.
 class PoolAllocator : public Allocator
 {
 public:
    PoolAllocator();
    ~PoolAllocator();

    // ratio range 0 ~ 1
    // default cr = 0.75
    void set_size_compare_ratio(float scr);

    // release all budgets immediately
    void clear();

    virtual void* fastMalloc(size_t size);
    virtual void fastFree(void* ptr);

 private:
    // copying is disabled (declared, intentionally not defined): a copy would
    // hold the same buffer pointers and free them a second time
    PoolAllocator(const PoolAllocator&);
    PoolAllocator& operator=(const PoolAllocator&);

 private:
    Mutex budgets_lock;
    Mutex payouts_lock;
    unsigned int size_compare_ratio;// 0~256
    std::list< std::pair<size_t, void*> > budgets;// (capacity, buffer) idle
    std::list< std::pair<size_t, void*> > payouts;// (capacity, buffer) handed out
 };

 // Allocator that caches released buffers and reuses them for later requests.
 // budgets holds idle buffers, payouts holds buffers currently handed out.
 // Unlike PoolAllocator this class contains no mutex and performs no locking.
 class UnlockedPoolAllocator : public Allocator
 {
 public:
    UnlockedPoolAllocator();
    ~UnlockedPoolAllocator();

    // ratio range 0 ~ 1
    // default cr = 0.75
    void set_size_compare_ratio(float scr);

    // release all budgets immediately
    void clear();

    virtual void* fastMalloc(size_t size);
    virtual void fastFree(void* ptr);

 private:
    // copying is disabled (declared, intentionally not defined): a copy would
    // hold the same buffer pointers and free them a second time
    UnlockedPoolAllocator(const UnlockedPoolAllocator&);
    UnlockedPoolAllocator& operator=(const UnlockedPoolAllocator&);

 private:
    unsigned int size_compare_ratio;// 0~256
    std::list< std::pair<size_t, void*> > budgets;// (capacity, buffer) idle
    std::list< std::pair<size_t, void*> > payouts;// (capacity, buffer) handed out
 };

 } // namespace ncnn

 #endif // NCNN_ALLOCATOR_H
--- a/src/layer.cpp
+++ b/src/layer.cpp
@@ -14,10 +14,40 @@

 #include "layer.h"

 #include <stdio.h>
 #include <string.h>
 #include "cpu.h"

 namespace ncnn {

 // Defaults: lightmode on, num_threads taken from get_cpu_count(),
 // and no custom blob / workspace allocator.
 Option::Option()
    : lightmode(true)
    , num_threads(get_cpu_count())
    , blob_allocator(0)
    , workspace_allocator(0)
 {
 }

 // File-level default options: returned by get_default_option() and
 // overwritten by set_default_option().
 static Option g_default_option;

 // Returns a reference to the current default options.
 const Option& get_default_option()
 {
    return g_default_option;
 }

 // Replace the default options with a copy of opt.
 // Returns 0 on success; returns -1 and leaves the defaults unchanged when
 // opt.num_threads is not positive.
 int set_default_option(const Option& opt)
 {
    if (opt.num_threads > 0)
    {
        g_default_option = opt;
        return 0;
    }

    fprintf(stderr, "invalid option num_threads %d\n", opt.num_threads);
    return -1;
 }

 Layer::Layer()
 {
    one_blob_only = false;
@@ -38,7 +68,7 @@ int Layer::load_model(const ModelBin& /*mb*/)
    return 0;
 }

 int Layer::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
 int Layer::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
 {
    if (!support_inplace)
        return -1;
@@ -46,32 +76,32 @@ int Layer::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_b
    top_blobs = bottom_blobs;
    for (int i = 0; i < (int)top_blobs.size(); i++)
    {
        top_blobs[i] = bottom_blobs[i].clone();
        top_blobs[i] = bottom_blobs[i].clone(opt.blob_allocator);
        if (top_blobs[i].empty())
            return -100;
    }

    return forward_inplace(top_blobs);
    return forward_inplace(top_blobs, opt);
 }

 int Layer::forward(const Mat& bottom_blob, Mat& top_blob) const
 int Layer::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    if (!support_inplace)
        return -1;

    top_blob = bottom_blob.clone();
    top_blob = bottom_blob.clone(opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    return forward_inplace(top_blob);
    return forward_inplace(top_blob, opt);
 }

 int Layer::forward_inplace(std::vector<Mat>& /*bottom_top_blobs*/) const
 int Layer::forward_inplace(std::vector<Mat>& /*bottom_top_blobs*/, const Option& /*opt*/) const
 {
    return -1;
 }

 int Layer::forward_inplace(Mat& /*bottom_top_blob*/) const
 int Layer::forward_inplace(Mat& /*bottom_top_blob*/, const Option& /*opt*/) const
 {
    return -1;
 }
--- a/src/layer.h
+++ b/src/layer.h
@@ -25,6 +25,22 @@

 namespace ncnn {

 class Allocator;

 // Per-call settings handed to Layer::forward / Layer::forward_inplace.
 class Option
 {
 public:
    Option();

    // NOTE(review): the effect of lightmode is not visible in this header - confirm against its users
    bool lightmode;

    // thread count given to the num_threads clause of the layers' OpenMP loops
    int num_threads;

    // allocator passed when layers create or clone their output blobs
    Allocator* blob_allocator;

    // allocator passed when layers create temporary buffers
    Allocator* workspace_allocator;
 };

 // Returns the default options; used as the default opt argument of the Layer forward methods.
 const Option& get_default_option();
 // Replaces the default options. Returns 0 on success, or -1 (defaults unchanged) when opt.num_threads <= 0.
 int set_default_option(const Option& opt);

 class Layer
 {
 public:
@@ -51,13 +67,13 @@ public:
 public:
    // implement inference
    // return 0 if success
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt = get_default_option()) const;
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt = get_default_option()) const;

    // implement inplace inference
    // return 0 if success
    virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs) const;
    virtual int forward_inplace(Mat& bottom_top_blob) const;
    virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt = get_default_option()) const;
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt = get_default_option()) const;

 public:
 #if NCNN_STRING
--- a/src/layer/absval.cpp
+++ b/src/layer/absval.cpp
@@ -24,14 +24,14 @@ AbsVal::AbsVal()
    support_inplace = true;
 }

 int AbsVal::forward_inplace(Mat& bottom_top_blob) const
 int AbsVal::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
--- a/src/layer/absval.h
+++ b/src/layer/absval.h
@@ -24,7 +24,7 @@ class AbsVal : public Layer
 public:
    AbsVal();

    virtual int forward_inplace(Mat& bottom_top_blob) const;
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

 public:
 };
--- a/src/layer/argmax.cpp
+++ b/src/layer/argmax.cpp
@@ -33,14 +33,14 @@ int ArgMax::load_param(const ParamDict& pd)
    return 0;
 }

 int ArgMax::forward(const Mat& bottom_blob, Mat& top_blob) const
 int ArgMax::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    int size = bottom_blob.total();

    if (out_max_val)
        top_blob.create(topk, 2);
        top_blob.create(topk, 2, 4u, opt.blob_allocator);
    else
        top_blob.create(topk, 1);
        top_blob.create(topk, 1, 4u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

--- a/src/layer/argmax.h
+++ b/src/layer/argmax.h
@@ -26,7 +26,7 @@ public:

    virtual int load_param(const ParamDict& pd);

    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

 public:
    int out_max_val;
--- a/src/layer/arm/absval_arm.cpp
+++ b/src/layer/arm/absval_arm.cpp
@@ -22,14 +22,14 @@ namespace ncnn {

 DEFINE_LAYER_CREATOR(AbsVal_arm)

 int AbsVal_arm::forward_inplace(Mat& bottom_top_blob) const
 int AbsVal_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
--- a/src/layer/arm/absval_arm.h
+++ b/src/layer/arm/absval_arm.h
@@ -22,7 +22,7 @@ namespace ncnn {
 class AbsVal_arm : public AbsVal
 {
 public:
    virtual int forward_inplace(Mat& bottom_top_blob) const;
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 };

 } // namespace ncnn
--- a/src/layer/arm/batchnorm_arm.cpp
+++ b/src/layer/arm/batchnorm_arm.cpp
@@ -22,11 +22,11 @@ namespace ncnn {

 DEFINE_LAYER_CREATOR(BatchNorm_arm)

 int BatchNorm_arm::forward_inplace(Mat& bottom_top_blob) const
 int BatchNorm_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
    int dims = bottom_top_blob.dims;
    if (dims != 3)
        return BatchNorm::forward_inplace(bottom_top_blob);
        return BatchNorm::forward_inplace(bottom_top_blob, opt);

    // a = bias - slope * mean / sqrt(var)
    // b = slope / sqrt(var)
@@ -38,7 +38,7 @@ int BatchNorm_arm::forward_inplace(Mat& bottom_top_blob) const

    const float* a_data_ptr = a_data;
    const float* b_data_ptr = b_data;
    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
--- a/src/layer/arm/batchnorm_arm.h
+++ b/src/layer/arm/batchnorm_arm.h
@@ -22,7 +22,7 @@ namespace ncnn {
 class BatchNorm_arm : public BatchNorm
 {
 public:
    virtual int forward_inplace(Mat& bottom_top_blob) const;
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 };

 } // namespace ncnn
--- a/src/layer/arm/bias_arm.cpp
+++ b/src/layer/arm/bias_arm.cpp
@@ -22,7 +22,7 @@ namespace ncnn {

 DEFINE_LAYER_CREATOR(Bias_arm)

 int Bias_arm::forward_inplace(Mat& bottom_top_blob) const
 int Bias_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
@@ -30,7 +30,7 @@ int Bias_arm::forward_inplace(Mat& bottom_top_blob) const
    int size = w * h;

    const float* bias_ptr = bias_data;
    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
--- a/src/layer/arm/bias_arm.h
+++ b/src/layer/arm/bias_arm.h
@@ -22,7 +22,7 @@ namespace ncnn {
 class Bias_arm : public Bias
 {
 public:
    virtual int forward_inplace(Mat& bottom_top_blob) const;
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 };

 } // namespace ncnn
--- a/src/layer/arm/convolution_1x1.h
+++ b/src/layer/arm/convolution_1x1.h
@@ -113,7 +113,7 @@ static void conv1x1s1_sgemm_transform_kernel_neon(const Mat& _kernel, Mat& kerne
    }
 }

 static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias)
 static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
@@ -128,12 +128,12 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma
    const float* bias = _bias;

    // interleave
    Mat tmp(8*4, inch/4+inch%4, size/8 + (size%8)/4 + size%4);
    Mat tmp(8*4, inch/4+inch%4, size/8 + (size%8)/4 + size%4, 4u, opt.workspace_allocator);
    {
        int nn_size = size >> 3;
        int remain_size_start = nn_size << 3;

        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii=0; ii<nn_size; ii++)
        {
            int i = ii * 8;
@@ -184,7 +184,7 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma

        nn_size = (size - remain_size_start) >> 2;

        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii=0; ii<nn_size; ii++)
        {
            int i = remain_size_start + ii * 4;
@@ -230,7 +230,7 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma

        remain_size_start += nn_size << 2;

        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i=remain_size_start; i<size; i++)
        {
            const float* img0 = bottom_blob.channel(0);
@@ -254,7 +254,7 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma
    nn_outch = outch >> 3;
    remain_outch_start = nn_outch << 3;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp=0; pp<nn_outch; pp++)
    {
        int p = pp * 8;
@@ -733,7 +733,7 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma

    nn_outch = (outch - remain_outch_start) >> 2;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp=0; pp<nn_outch; pp++)
    {
        int p = remain_outch_start + pp * 4;
@@ -1613,7 +1613,7 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma

    remain_outch_start += nn_outch << 2;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p=remain_outch_start; p<outch; p++)
    {
        Mat out0 = top_blob.channel(p);
@@ -2064,7 +2064,7 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma
 //     }
 }

 static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
 static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
    int inch = bottom_blob.c;

@@ -2083,7 +2083,7 @@ static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
    nn_outch = outch >> 3;
    remain_outch_start = nn_outch << 3;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp=0; pp<nn_outch; pp++)
    {
        int p = pp * 8;
@@ -2710,7 +2710,7 @@ static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
    nn_outch = outch / 6;
    remain_outch_start = nn_outch * 6;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp=0; pp<nn_outch; pp++)
    {
        int p = pp * 6;
@@ -3101,7 +3101,7 @@ static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke

    nn_outch = (outch - remain_outch_start) >> 2;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp=0; pp<nn_outch; pp++)
    {
        int p = remain_outch_start + pp * 4;
@@ -3605,7 +3605,7 @@ static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke

    remain_outch_start += nn_outch << 2;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p=remain_outch_start; p<outch; p++)
    {
        Mat out = top_blob.channel(p);
@@ -3863,7 +3863,7 @@ static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke

 }

 static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
 static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
    int w = bottom_blob.w;
    int inch = bottom_blob.c;
@@ -3880,7 +3880,7 @@ static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
    int nn_outch = outch >> 2;
    int remain_outch_start = nn_outch << 2;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp=0; pp<nn_outch; pp++)
    {
        int p = pp * 4;
@@ -4409,7 +4409,7 @@ static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
        }
    }

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p=remain_outch_start; p<outch; p++)
    {
        Mat out = top_blob.channel(p);
--- a/src/layer/arm/convolution_2x2.h
+++ b/src/layer/arm/convolution_2x2.h
@@ -16,7 +16,7 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON

 static void conv2x2s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
 static void conv2x2s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
    int w = bottom_blob.w;
    int inch = bottom_blob.c;
@@ -28,7 +28,7 @@ static void conv2x2s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
    const float* kernel = _kernel;
    const float* bias = _bias;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p=0; p<outch; p++)
    {
        Mat out = top_blob.channel(p);
--- a/src/layer/arm/convolution_3x3.h
+++ b/src/layer/arm/convolution_3x3.h
@@ -16,7 +16,7 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON

 static void conv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
 static void conv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
    int w = bottom_blob.w;
    int inch = bottom_blob.c;
@@ -31,7 +31,7 @@ static void conv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
    int nn_outch = outch >> 1;
    int remain_outch_start = nn_outch << 1;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp=0; pp<nn_outch; pp++)
    {
        int p = pp * 2;
@@ -654,7 +654,7 @@ static void conv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
        }
    }

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p=remain_outch_start; p<outch; p++)
    {
        Mat out = top_blob.channel(p);
@@ -5427,7 +5427,7 @@ static void conv3x3s1_winograd64_neon3(const Mat& bottom_blob, Mat& top_blob, co
 }
 #endif

 static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias)
 static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
@@ -5445,7 +5445,7 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co

    w = outw + 2;
    h = outh + 2;
    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f);
    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt.workspace_allocator, opt.num_threads);

    const float* bias = _bias;

@@ -5454,7 +5454,7 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co
    {
        int w_tm = outw / 6 * 8;
        int h_tm = outh / 6 * 8;
        bottom_blob_tm.create(4, 16 * w_tm/8 * h_tm/8, inch);
        bottom_blob_tm.create(4, 16 * w_tm/8 * h_tm/8, inch, 4u, opt.workspace_allocator);
        const int tiles = w_tm/8 * h_tm/8;

 //         const float itm[8][8] = {
@@ -5495,7 +5495,7 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co
        float32x4_t _coeff1 = vld1q_f32(coeff+4);
 #endif // __ARM_NEON

        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q<inch; q++)
        {
            const Mat img0 = bottom_blob_bordered.channel(q);
@@ -6263,14 +6263,14 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co
    {
        int w_tm = outw / 6 * 8;
        int h_tm = outh / 6 * 8;
        top_blob_tm.create(4, 16 * w_tm/8 * h_tm/8, outch);
        top_blob_tm.create(4, 16 * w_tm/8 * h_tm/8, outch, 4u, opt.workspace_allocator);

        const int tiles = h_tm/8 * w_tm/8;

        int nn_outch = outch >> 2;
        int remain_outch_start = nn_outch << 2;

        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int pp=0; pp<nn_outch; pp++)
        {
            int p = pp * 4;
@@ -7439,7 +7439,7 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co
            }
        }

        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = remain_outch_start; p<outch; p++)
        {
            Mat out0_tm = top_blob_tm.channel(p);
@@ -7526,7 +7526,7 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co

    // BEGIN transform output
    Mat top_blob_bordered;
    top_blob_bordered.create(outw, outh, outch);
    top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator);
    {
 //         const float otm[6][8] = {
 //             {1.0f,  1.0f,   1.0f,   1.0f,   1.0f,  32.0f, 32.0f, 0.0f},
@@ -7553,7 +7553,7 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co
        int h_tm = outh / 6 * 8;
        const int tiles = w_tm/8 * h_tm/8;

        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p<outch; p++)
        {
            const Mat out0_tm = top_blob_tm.channel(p);
@@ -8157,10 +8157,10 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co
    // END transform output

    // cut result pad
    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w);
    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt.blob_allocator, opt.num_threads);
 }

 static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias)
 static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
@@ -8178,7 +8178,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co

    w = outw + 2;
    h = outh + 2;
    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f);
    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt.workspace_allocator, opt.num_threads);

    const float* bias = _bias;

@@ -8188,7 +8188,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co
        int w_tm = outw / 6 * 8;
        int h_tm = outh / 6 * 8;
        const int tiles = w_tm/8 * h_tm/8;
        bottom_blob_tm.create(1, 64 * tiles, inch);
        bottom_blob_tm.create(1, 64 * tiles, inch, 4u, opt.workspace_allocator);
 //         bottom_blob_tm.create(inch, tiles, 64);

 //         const float itm[8][8] = {
@@ -8229,7 +8229,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co
        float32x4_t _coeff1 = vld1q_f32(coeff+4);
 #endif // __ARM_NEON

        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q<inch; q++)
        {
            const Mat img0 = bottom_blob_bordered.channel(q);
@@ -9054,9 +9054,9 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co
        // permute
        // bottom_blob_tm.create(1, 64 * tiles, inch);
 //         Mat bottom_blob_tm2(inch, tiles, 64);
        Mat bottom_blob_tm2(8*inch, tiles/8 + (tiles%8)/4 + tiles%4, 64);
        Mat bottom_blob_tm2(8*inch, tiles/8 + (tiles%8)/4 + tiles%4, 64, 4u, opt.workspace_allocator);

        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int r=0; r<64; r++)
        {
            Mat tm2 = bottom_blob_tm2.channel(r);
@@ -9147,7 +9147,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co
        nn_outch = outch >> 3;
        remain_outch_start = nn_outch << 3;

        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int pp=0; pp<nn_outch; pp++)
        {
            int p = pp * 8;
@@ -9592,7 +9592,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co

        nn_outch = (outch - remain_outch_start) >> 2;

        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int pp=0; pp<nn_outch; pp++)
        {
            int p = remain_outch_start + pp * 4;
@@ -10332,6 +10332,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co

        remain_outch_start += nn_outch << 2;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p=remain_outch_start; p<outch; p++)
        {
 #if __ARM_NEON && __aarch64__
@@ -10738,7 +10739,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co

    // BEGIN transform output
    Mat top_blob_bordered;
    top_blob_bordered.create(outw, outh, outch);
    top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator);
    {
 //         const float otm[6][8] = {
 //             {1.0f,  1.0f,   1.0f,   1.0f,   1.0f,  32.0f, 32.0f, 0.0f},
@@ -10765,7 +10766,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co
        int h_tm = outh / 6 * 8;
        const int tiles = w_tm/8 * h_tm/8;

        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p<outch; p++)
        {
            const Mat out0_tm = top_blob_tm.channel(p);
@@ -11514,10 +11515,10 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co
    // END transform output

    // cut result pad
    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w);
    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt.blob_allocator, opt.num_threads);
 }

 static void conv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
 static void conv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
    int w = bottom_blob.w;
    int inch = bottom_blob.c;
@@ -11534,7 +11535,7 @@ static void conv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
    int nn_outch = outch >> 1;
    int remain_outch_start = nn_outch << 1;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp=0; pp<nn_outch; pp++)
    {
        int p = pp * 2;
@@ -11858,7 +11859,7 @@ static void conv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
        }
    }

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p=remain_outch_start; p<outch; p++)
    {
        Mat out = top_blob.channel(p);
--- a/src/layer/arm/convolution_4x4.h
+++ b/src/layer/arm/convolution_4x4.h
@@ -16,7 +16,7 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON

 static void conv4x4s4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
 static void conv4x4s4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
    int w = bottom_blob.w;
    int inch = bottom_blob.c;
@@ -30,7 +30,7 @@ static void conv4x4s4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
    const float* kernel = _kernel;
    const float* bias = _bias;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p=0; p<outch; p++)
    {
        Mat out = top_blob.channel(p);
--- a/src/layer/arm/convolution_5x5.h
+++ b/src/layer/arm/convolution_5x5.h
@@ -16,7 +16,7 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON

 static void conv5x5s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
 static void conv5x5s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
    int w = bottom_blob.w;
    int inch = bottom_blob.c;
@@ -28,7 +28,7 @@ static void conv5x5s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
    const float* kernel = _kernel;
    const float* bias = _bias;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p=0; p<outch; p++)
    {
        Mat out = top_blob.channel(p);
@@ -982,7 +982,7 @@ static void conv5x5s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke

 }

 static void conv5x5s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
 static void conv5x5s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
    int w = bottom_blob.w;
    int inch = bottom_blob.c;
@@ -996,7 +996,7 @@ static void conv5x5s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
    const float* kernel = _kernel;
    const float* bias = _bias;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p=0; p<outch; p++)
    {
        Mat out = top_blob.channel(p);
--- a/src/layer/arm/convolution_7x7.h
+++ b/src/layer/arm/convolution_7x7.h
@@ -16,7 +16,7 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON

 static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
 static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
    int w = bottom_blob.w;
    int inch = bottom_blob.c;
@@ -28,7 +28,7 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
    const float* kernel = _kernel;
    const float* bias = _bias;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p=0; p<outch; p++)
    {
        Mat out = top_blob.channel(p);
@@ -706,7 +706,7 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke

 }

 static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
 static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
    int w = bottom_blob.w;
    int inch = bottom_blob.c;
@@ -720,7 +720,7 @@ static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
    const float* kernel = _kernel;
    const float* bias = _bias;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p=0; p<outch; p++)
    {
        Mat out = top_blob.channel(p);
--- a/src/layer/arm/convolution_arm.cpp
+++ b/src/layer/arm/convolution_arm.cpp
@@ -75,10 +75,11 @@ int Convolution_arm::load_model(const ModelBin& mb)
    return 0;
 }

 int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv_func conv) const
 int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv_func conv, const Option& opt) const
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    size_t elemsize = bottom_blob.elemsize;

    const int kernel_size = kernel_w;
    const int stride = stride_w;
@@ -88,7 +89,7 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv
    Mat bottom_blob_bordered = bottom_blob;
    if (pad_w > 0 || pad_h > 0)
    {
        copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f);
        copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
        if (bottom_blob_bordered.empty())
            return -100;

@@ -101,7 +102,7 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv
        int hpad = kernel_extent + (h - 1) / stride * stride - h;
        if (wpad > 0 || hpad > 0)
        {
            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f);
            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
            if (bottom_blob_bordered.empty())
                return -100;
        }
@@ -113,7 +114,7 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv
    int outw = (w - kernel_extent) / stride + 1;
    int outh = (h - kernel_extent) / stride + 1;

    top_blob.create(outw, outh, num_output);
    top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

@@ -132,7 +133,7 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv

            if (inner_bottom_blob.w != inner_w || inner_bottom_blob.h != inner_h)
            {
                inner_bottom_blob.create(inner_w, inner_h, bottom_blob.c);
                inner_bottom_blob.create(inner_w, inner_h, bottom_blob.c, elemsize, opt.workspace_allocator);

                if (inner_bottom_blob.empty())
                {
@@ -142,7 +143,7 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv

            if (inner_top_blob.w != inner_outw || inner_top_blob.h != inner_outh)
            {
                inner_top_blob.create(inner_outw, inner_outh, num_output);
                inner_top_blob.create(inner_outw, inner_outh, num_output, elemsize, opt.workspace_allocator);

                if (inner_top_blob.empty())
                {
@@ -150,7 +151,7 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv
                }
            }

            #pragma omp parallel for
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int c = 0; c < bottom_blob.c; c ++)
            {
                float *outptr = (float *) inner_bottom_blob.channel(c);
@@ -166,9 +167,9 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv
                }
            }

            conv(inner_bottom_blob, inner_top_blob, weight_data, bias_data);
            conv(inner_bottom_blob, inner_top_blob, weight_data, bias_data, opt);

            #pragma omp parallel for
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int c = 0; c < num_output; c ++)
            {
                float *outptr = (float *) top_blob.channel(c) + x * outw + y;
@@ -188,19 +189,19 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv
    return 0;
 }

 int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
 int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    // convolv with NxN kernel
    // value = value + bias

    if (bottom_blob.dims != 3)
    {
        return Convolution::forward(bottom_blob, top_blob);
        return Convolution::forward(bottom_blob, top_blob, opt);
    }

    if (kernel_w != kernel_h || stride_w != stride_h)
    {
        return Convolution::forward(bottom_blob, top_blob);
        return Convolution::forward(bottom_blob, top_blob, opt);
    }

    const int kernel_size = kernel_w;
@@ -208,10 +209,10 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const

    if (kernel_size > 7 || stride > 4 || dilation_w != dilation_h)
    {
        return Convolution::forward(bottom_blob, top_blob);
        return Convolution::forward(bottom_blob, top_blob, opt);
    }

    typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&);
    typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&);

    // kernel_size x stride
    conv_func conv_func_table[7][4] =
@@ -263,22 +264,23 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
    conv_func conv = conv_func_table[kernel_size-1][stride-1];
    if (!conv)
    {
        return Convolution::forward(bottom_blob, top_blob);
        return Convolution::forward(bottom_blob, top_blob, opt);
    }

    if (dilation_w != 1)
    {
        return forwardDilation(bottom_blob, top_blob, conv);
        return forwardDilation(bottom_blob, top_blob, conv, opt);
    }

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;

    Mat bottom_blob_bordered = bottom_blob;
    if (pad_w > 0 || pad_h > 0)
    {
        copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f);
        copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
        if (bottom_blob_bordered.empty())
            return -100;

@@ -291,7 +293,7 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
        int hpad = kernel_size + (h - 1) / stride * stride - h;
        if (wpad > 0 || hpad > 0)
        {
            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f);
            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
            if (bottom_blob_bordered.empty())
                return -100;
        }
@@ -303,21 +305,21 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
    int outw = (w - kernel_size) / stride + 1;
    int outh = (h - kernel_size) / stride + 1;

    top_blob.create(outw, outh, num_output);
    top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (use_winograd3x3 && w <= 120 && h <= 120)
    {
 //         conv3x3s1_winograd64_neon4(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data);
        conv3x3s1_winograd64_neon5(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data);
 //         conv3x3s1_winograd64_neon4(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt);
        conv3x3s1_winograd64_neon5(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt);
    }
    else if (use_sgemm1x1)
    {
        conv1x1s1_sgemm_neon(bottom_blob_bordered, top_blob, weight_1x1_sgemm_data, bias_data);
        conv1x1s1_sgemm_neon(bottom_blob_bordered, top_blob, weight_1x1_sgemm_data, bias_data, opt);
    }
    else
        conv(bottom_blob_bordered, top_blob, weight_data, bias_data);
        conv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);

    return 0;
 }
--- a/src/layer/arm/convolution_arm.h
+++ b/src/layer/arm/convolution_arm.h
@@ -19,7 +19,7 @@

 namespace ncnn {

 typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&);
 typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&);

 class Convolution_arm : public Convolution
 {
@@ -28,8 +28,8 @@ public:

    virtual int load_model(const ModelBin& mb);

    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
    virtual int forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv_func conv) const;
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    virtual int forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv_func conv, const Option& opt) const;

 public:
    bool use_winograd3x3;
--- a/src/layer/arm/convolutiondepthwise_3x3.h
+++ b/src/layer/arm/convolutiondepthwise_3x3.h
@@ -16,7 +16,7 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON

 static void convdw3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
 static void convdw3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
    int w = bottom_blob.w;

@@ -28,7 +28,7 @@ static void convdw3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _
    const float* kernel = _kernel;
    const float* bias = _bias;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g=0; g<group; g++)
    {
        Mat out = top_blob.channel(g);
@@ -577,7 +577,7 @@ static void convdw3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _
    }
 }

 static void convdw3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
 static void convdw3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
    int w = bottom_blob.w;

@@ -591,7 +591,7 @@ static void convdw3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _
    const float* kernel = _kernel;
    const float* bias = _bias;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g=0; g<group; g++)
    {
        Mat out = top_blob.channel(g);
--- a/src/layer/arm/convolutiondepthwise_arm.cpp
+++ b/src/layer/arm/convolutiondepthwise_arm.cpp
@@ -102,7 +102,7 @@ int ConvolutionDepthWise_arm::load_model(const ModelBin& mb)
    return 0;
 }

 int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
 int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    // convolv with NxN kernel
    // value = value + bias
@@ -110,6 +110,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;

    if (channels % group != 0 || num_output % group != 0)
    {
@@ -123,7 +124,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
    Mat bottom_blob_bordered = bottom_blob;
    if (pad_w > 0 || pad_h > 0)
    {
        copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f);
        copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
        if (bottom_blob_bordered.empty())
            return -100;

@@ -136,7 +137,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
        if (wpad > 0 || hpad > 0)
        {
            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f);
            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
            if (bottom_blob_bordered.empty())
                return -100;
        }
@@ -148,7 +149,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;

    top_blob.create(outw, outh, num_output);
    top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

@@ -161,12 +162,12 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
        {
            if (stride_w == 1 && stride_h == 1)
            {
                convdw3x3s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data);
                convdw3x3s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
                return 0;
            }
            else if (stride_w == 2 && stride_h == 2)
            {
                convdw3x3s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data);
                convdw3x3s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
                return 0;
            }
        }
@@ -176,7 +177,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
        omp_set_nested(0);
 #endif

        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g=0; g<group; g++)
        {
            Mat bottom_blob_bordered_g(w, h, 1, bottom_blob_bordered.channel(g));
@@ -213,7 +214,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
            op->load_model(ModelBinFromMatArray(weights));

            // forward
            op->forward(bottom_blob_bordered_g, top_blob_g);
            op->forward(bottom_blob_bordered_g, top_blob_g, opt);

            delete op;
        }
@@ -235,7 +236,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
        const ncnn::Layer* op = group_ops[g];

        // forward
        op->forward(bottom_blob_bordered_g, top_blob_g);
        op->forward(bottom_blob_bordered_g, top_blob_g, opt);
    }

    return 0;
--- a/src/layer/arm/convolutiondepthwise_arm.h
+++ b/src/layer/arm/convolutiondepthwise_arm.h
@@ -27,7 +27,7 @@ public:

    virtual int load_model(const ModelBin& mb);

    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

 public:
    std::vector<ncnn::Layer*> group_ops;
--- a/src/layer/arm/deconvolution_3x3.h
+++ b/src/layer/arm/deconvolution_3x3.h
@@ -16,7 +16,7 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON

 static void deconv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
 static void deconv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
@@ -28,7 +28,7 @@ static void deconv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _
    const float* kernel = _kernel;
    const float* bias = _bias;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p=0; p<outch; p++)
    {
        Mat out = top_blob.channel(p);
@@ -237,7 +237,7 @@ static void deconv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _
    }
 }

 static void deconv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
 static void deconv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
@@ -249,7 +249,7 @@ static void deconv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _
    const float* kernel = _kernel;
    const float* bias = _bias;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p=0; p<outch; p++)
    {
        Mat out = top_blob.channel(p);
--- a/src/layer/arm/deconvolution_4x4.h
+++ b/src/layer/arm/deconvolution_4x4.h
@@ -16,7 +16,7 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON

 static void deconv4x4s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
 static void deconv4x4s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
@@ -28,7 +28,7 @@ static void deconv4x4s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _
    const float* kernel = _kernel;
    const float* bias = _bias;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p=0; p<outch; p++)
    {
        Mat out = top_blob.channel(p);
@@ -185,7 +185,7 @@ static void deconv4x4s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _
    }
 }

 static void deconv4x4s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
 static void deconv4x4s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
@@ -197,7 +197,7 @@ static void deconv4x4s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _
    const float* kernel = _kernel;
    const float* bias = _bias;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p=0; p<outch; p++)
    {
        Mat out = top_blob.channel(p);
--- a/src/layer/arm/deconvolution_arm.cpp
+++ b/src/layer/arm/deconvolution_arm.cpp
@@ -21,14 +21,14 @@ namespace ncnn {

 DEFINE_LAYER_CREATOR(Deconvolution_arm)

 int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
 int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    // deconvolv with NxN kernel
    // value = value + bias

    if (kernel_w != kernel_h || stride_w != stride_h)
    {
        return Deconvolution::forward(bottom_blob, top_blob);
        return Deconvolution::forward(bottom_blob, top_blob, opt);
    }

    const int kernel_size = kernel_w;
@@ -36,10 +36,10 @@ int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const

    if ((kernel_size != 3 && kernel_size != 4) || stride > 2 || dilation_w != 1 || dilation_h != 1)
    {
        return Deconvolution::forward(bottom_blob, top_blob);
        return Deconvolution::forward(bottom_blob, top_blob, opt);
    }

    typedef void (*deconv_func)(const Mat&, Mat&, const Mat&, const Mat&);
    typedef void (*deconv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&);

    // kernel_size x stride
    deconv_func deconv_func_table[2][2] =
@@ -57,33 +57,46 @@ int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
    deconv_func deconv = deconv_func_table[kernel_size-3][stride-1];
    if (!deconv)
    {
        return Deconvolution::forward(bottom_blob, top_blob);
        return Deconvolution::forward(bottom_blob, top_blob, opt);
    }

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    size_t elemsize = bottom_blob.elemsize;

    int outw = (w - 1) * stride + kernel_size;
    int outh = (h - 1) * stride + kernel_size;

    Mat top_blob_bordered = top_blob;
    top_blob_bordered.create(outw, outh, num_output);
    if (top_blob_bordered.empty())
        return -100;

    deconv(bottom_blob, top_blob_bordered, weight_data, bias_data);
    Mat top_blob_bordered;
    if (pad_w > 0 || pad_h > 0)
    {
        top_blob_bordered.create(outw, outh, num_output, elemsize, opt.workspace_allocator);
        if (top_blob_bordered.empty())
            return -100;
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output, elemsize, opt.blob_allocator);
        if (top_blob_bordered.empty())
            return -100;
    }

    top_blob = top_blob_bordered;
    deconv(bottom_blob, top_blob_bordered, weight_data, bias_data, opt);

    if (pad_w > 0 || pad_h > 0)
    {
        copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w);
        copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w, opt.blob_allocator, opt.num_threads);
        if (top_blob.empty())
            return -100;

        outw = top_blob.w;
        outh = top_blob.h;
    }
    else
    {
        top_blob = top_blob_bordered;
    }

    return 0;
 }
--- a/src/layer/arm/deconvolution_arm.h
+++ b/src/layer/arm/deconvolution_arm.h
@@ -22,7 +22,7 @@ namespace ncnn {
 class Deconvolution_arm : public Deconvolution
 {
 public:
    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 };

 } // namespace ncnn
--- a/src/layer/arm/deconvolutiondepthwise_arm.cpp
+++ b/src/layer/arm/deconvolutiondepthwise_arm.cpp
@@ -24,7 +24,7 @@ namespace ncnn {

 DEFINE_LAYER_CREATOR(DeconvolutionDepthWise_arm)

 int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
 int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    // convolv with NxN kernel
    // value = value + bias
@@ -32,6 +32,7 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) c
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;

    if (channels % group != 0 || num_output % group != 0)
    {
@@ -45,10 +46,20 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) c
    int outw = (w - 1) * stride_w + kernel_extent_w;
    int outh = (h - 1) * stride_h + kernel_extent_h;

    Mat top_blob_bordered = top_blob;
    top_blob_bordered.create(outw, outh, num_output);
    if (top_blob_bordered.empty())
        return -100;
    Mat top_blob_bordered;
    if (pad_w > 0 || pad_h > 0)
    {
        top_blob_bordered.create(outw, outh, num_output, elemsize, opt.workspace_allocator);
        if (top_blob_bordered.empty())
            return -100;
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output, elemsize, opt.blob_allocator);
        if (top_blob_bordered.empty())
            return -100;
    }

    const int maxk = kernel_w * kernel_h;

@@ -60,7 +71,7 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) c
        omp_set_nested(0);
 #endif

        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g=0; g<group; g++)
        {
            Mat bottom_blob_g(w, h, 1, bottom_blob.channel(g).data);
@@ -98,7 +109,7 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) c
            op->load_model(ModelBinFromMatArray(weights));

            // forward
            op->forward(bottom_blob_g, top_blob_bordered_g);
            op->forward(bottom_blob_g, top_blob_bordered_g, opt);

            delete op;
        }
@@ -148,23 +159,25 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) c
            op->load_model(ModelBinFromMatArray(weights));

            // forward
            op->forward(bottom_blob_g, top_blob_bordered_g);
            op->forward(bottom_blob_g, top_blob_bordered_g, opt);

            delete op;
        }
    }

    top_blob = top_blob_bordered;

    if (pad_w > 0 || pad_h > 0)
    {
        copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w);
        copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w, opt.blob_allocator, opt.num_threads);
        if (top_blob.empty())
            return -100;

        outw = top_blob.w;
        outh = top_blob.h;
    }
    else
    {
        top_blob = top_blob_bordered;
    }

    return 0;

--- a/src/layer/arm/deconvolutiondepthwise_arm.h
+++ b/src/layer/arm/deconvolutiondepthwise_arm.h
@@ -22,7 +22,7 @@ namespace ncnn {
 class DeconvolutionDepthWise_arm : public DeconvolutionDepthWise
 {
 public:
    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 };

 } // namespace ncnn
--- a/src/layer/arm/eltwise_arm.cpp
+++ b/src/layer/arm/eltwise_arm.cpp
@@ -22,16 +22,17 @@ namespace ncnn {

 DEFINE_LAYER_CREATOR(Eltwise_arm)

 int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
 int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
 {
    const Mat& bottom_blob = bottom_blobs[0];
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int size = w * h;

    Mat& top_blob = top_blobs[0];
    top_blob.create(w, h, channels);
    top_blob.create(w, h, channels, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

@@ -39,7 +40,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
    {
        // first blob
        const Mat& bottom_blob1 = bottom_blobs[1];
        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
@@ -117,7 +118,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
        for (size_t b=2; b<bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob1 = bottom_blobs[b];
            #pragma omp parallel for
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q=0; q<channels; q++)
            {
                const float* ptr = bottom_blob1.channel(q);
@@ -193,7 +194,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
        {
            // first blob
            const Mat& bottom_blob1 = bottom_blobs[1];
            #pragma omp parallel for
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q=0; q<channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
@@ -271,7 +272,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
            for (size_t b=2; b<bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob1 = bottom_blobs[b];
                #pragma omp parallel for
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q=0; q<channels; q++)
                {
                    const float* ptr = bottom_blob1.channel(q);
@@ -349,7 +350,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
            const Mat& bottom_blob1 = bottom_blobs[1];
            float coeff0 = coeffs_ptr[0];
            float coeff1 = coeffs_ptr[1];
            #pragma omp parallel for
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q=0; q<channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
@@ -436,7 +437,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
            {
                const Mat& bottom_blob1 = bottom_blobs[b];
                float coeff = coeffs_ptr[b];
                #pragma omp parallel for
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q=0; q<channels; q++)
                {
                    const float* ptr = bottom_blob1.channel(q);
@@ -514,7 +515,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
    {
        // first blob
        const Mat& bottom_blob1 = bottom_blobs[1];
        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
@@ -592,7 +593,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
        for (size_t b=2; b<bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob1 = bottom_blobs[b];
            #pragma omp parallel for
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q=0; q<channels; q++)
            {
                const float* ptr = bottom_blob1.channel(q);
--- a/src/layer/arm/eltwise_arm.h
+++ b/src/layer/arm/eltwise_arm.h
@@ -22,7 +22,7 @@ namespace ncnn {
 class Eltwise_arm : public Eltwise
 {
 public:
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 };

 } // namespace ncnn
--- a/src/layer/arm/innerproduct_arm.cpp
+++ b/src/layer/arm/innerproduct_arm.cpp
@@ -22,14 +22,15 @@ namespace ncnn {

 DEFINE_LAYER_CREATOR(InnerProduct_arm)

 int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
 int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int size = w * h;

    top_blob.create(num_output);
    top_blob.create(num_output, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

@@ -38,7 +39,7 @@ int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
    int nn_num_output = num_output >> 2;
    int remain_num_output_start = nn_num_output << 2;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp=0; pp<nn_num_output; pp++)
    {
        int p = pp * 4;
@@ -143,7 +144,7 @@ int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
    }

    // num_output
    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p=remain_num_output_start; p<num_output; p++)
    {
        float sum = 0.f;
--- a/src/layer/arm/innerproduct_arm.h
+++ b/src/layer/arm/innerproduct_arm.h
@@ -22,7 +22,7 @@ namespace ncnn {
 class InnerProduct_arm : public InnerProduct
 {
 public:
    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 };

 } // namespace ncnn
--- a/src/layer/arm/lrn_arm.cpp
+++ b/src/layer/arm/lrn_arm.cpp
@@ -24,20 +24,21 @@ namespace ncnn {

 DEFINE_LAYER_CREATOR(LRN_arm)

 int LRN_arm::forward_inplace(Mat& bottom_top_blob) const
 int LRN_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    size_t elemsize = bottom_top_blob.elemsize;
    int size = w * h;

    // squared values with local_size padding
    Mat square_blob;
    square_blob.create(w, h, channels);
    square_blob.create(w, h, channels, elemsize, opt.workspace_allocator);
    if (square_blob.empty())
        return -100;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        const float* ptr = bottom_top_blob.channel(q);
@@ -73,14 +74,14 @@ int LRN_arm::forward_inplace(Mat& bottom_top_blob) const
    if (region_type == NormRegion_ACROSS_CHANNELS)
    {
        Mat square_sum;
        square_sum.create(w, h, channels);
        square_sum.create(w, h, channels, elemsize, opt.workspace_allocator);
        if (square_sum.empty())
            return -100;
        square_sum.fill(0.f);

        const float alpha_div_size = alpha / local_size;

        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            // square sum
@@ -165,7 +166,7 @@ int LRN_arm::forward_inplace(Mat& bottom_top_blob) const
        int pad = local_size / 2;
        if (pad > 0)
        {
            copy_make_border(square_blob, square_blob_bordered, pad, local_size - pad - 1, pad, local_size - pad - 1, BORDER_CONSTANT, 0.f);
            copy_make_border(square_blob, square_blob_bordered, pad, local_size - pad - 1, pad, local_size - pad - 1, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
            if (square_blob_bordered.empty())
                return -100;

@@ -196,7 +197,7 @@ int LRN_arm::forward_inplace(Mat& bottom_top_blob) const
            }
        }

        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);
--- a/src/layer/arm/lrn_arm.h
+++ b/src/layer/arm/lrn_arm.h
@@ -22,7 +22,7 @@ namespace ncnn {
 class LRN_arm : public LRN
 {
 public:
    virtual int forward_inplace(Mat& bottom_top_blob) const;
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 };

 } // namespace ncnn
--- a/src/layer/arm/pooling_2x2.h
+++ b/src/layer/arm/pooling_2x2.h
@@ -16,7 +16,7 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON

 static void pooling2x2s2_max_neon(const Mat& bottom_blob, Mat& top_blob)
 static void pooling2x2s2_max_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
 {
    int w = bottom_blob.w;
    int inch = bottom_blob.c;
@@ -26,7 +26,7 @@ static void pooling2x2s2_max_neon(const Mat& bottom_blob, Mat& top_blob)
    
    const int tailstep = w - 2*outw + w;
    
    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<inch; q++)
    {
        const float* img0 = bottom_blob.channel(q);
--- a/src/layer/arm/pooling_3x3.h
+++ b/src/layer/arm/pooling_3x3.h
@@ -16,7 +16,7 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON

 static void pooling3x3s2_max_neon(const Mat& bottom_blob, Mat& top_blob)
 static void pooling3x3s2_max_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
 {
    int w = bottom_blob.w;
    int inch = bottom_blob.c;
@@ -26,7 +26,7 @@ static void pooling3x3s2_max_neon(const Mat& bottom_blob, Mat& top_blob)

    const int tailstep = w - 2*outw + w;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<inch; q++)
    {
        const float* img0 = bottom_blob.channel(q);
--- a/src/layer/arm/pooling_arm.cpp
+++ b/src/layer/arm/pooling_arm.cpp
@@ -21,14 +21,14 @@ namespace ncnn {

 DEFINE_LAYER_CREATOR(Pooling_arm)

 int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
 int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    // max value in NxN window
    // avg value in NxN window

    if (kernel_w != kernel_h || stride_w != stride_h)
    {
        return Pooling::forward(bottom_blob, top_blob);
        return Pooling::forward(bottom_blob, top_blob, opt);
    }

    const int kernel_size = kernel_w;
@@ -36,17 +36,18 @@ int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) const

    if (pooling_type != PoolMethod_MAX || stride != 2 || global_pooling == 1)
    {
        return Pooling::forward(bottom_blob, top_blob);
        return Pooling::forward(bottom_blob, top_blob, opt);
    }

    if (kernel_size != 2 && kernel_size != 3)
    {
        return Pooling::forward(bottom_blob, top_blob);
        return Pooling::forward(bottom_blob, top_blob, opt);
    }

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;

    Mat bottom_blob_bordered = bottom_blob;

@@ -73,7 +74,7 @@ int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
        if (htail != 0)
            htailpad = stride_h - htail;

        copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom + htailpad, pad_left, pad_right + wtailpad, BORDER_CONSTANT, pad_value);
        copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom + htailpad, pad_left, pad_right + wtailpad, BORDER_CONSTANT, pad_value, opt.workspace_allocator, opt.num_threads);
        if (bottom_blob_bordered.empty())
            return -100;

@@ -82,7 +83,7 @@ int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
    }
    else if (pad_mode == 1) // valid padding
    {
        copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom, pad_left, pad_right, BORDER_CONSTANT, pad_value);
        copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom, pad_left, pad_right, BORDER_CONSTANT, pad_value, opt.workspace_allocator, opt.num_threads);
        if (bottom_blob_bordered.empty())
            return -100;

@@ -95,7 +96,7 @@ int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
        int hpad = kernel_h + (h - 1) / stride_h * stride_h - h;
        if (wpad > 0 || hpad > 0)
        {
            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, pad_value);
            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, pad_value, opt.workspace_allocator, opt.num_threads);
            if (bottom_blob_bordered.empty())
                return -100;
        }
@@ -107,14 +108,14 @@ int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
    int outw = (w - kernel_w) / stride_w + 1;
    int outh = (h - kernel_h) / stride_h + 1;

    top_blob.create(outw, outh, channels);
    top_blob.create(outw, outh, channels, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (kernel_size == 2)
        pooling2x2s2_max_neon(bottom_blob_bordered, top_blob);
        pooling2x2s2_max_neon(bottom_blob_bordered, top_blob, opt);
    if (kernel_size == 3)
        pooling3x3s2_max_neon(bottom_blob_bordered, top_blob);
        pooling3x3s2_max_neon(bottom_blob_bordered, top_blob, opt);

    return 0;
 }
--- a/src/layer/arm/pooling_arm.h
+++ b/src/layer/arm/pooling_arm.h
@@ -22,7 +22,7 @@ namespace ncnn {
 class Pooling_arm : public Pooling
 {
 public:
    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 };

 } // namespace ncnn
--- a/src/layer/arm/prelu_arm.cpp
+++ b/src/layer/arm/prelu_arm.cpp
@@ -22,11 +22,11 @@ namespace ncnn {

 DEFINE_LAYER_CREATOR(PReLU_arm)

 int PReLU_arm::forward_inplace(Mat& bottom_top_blob) const
 int PReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
    int dims = bottom_top_blob.dims;
    if (dims != 3)
        return PReLU::forward_inplace(bottom_top_blob);
        return PReLU::forward_inplace(bottom_top_blob, opt);

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
@@ -35,7 +35,7 @@ int PReLU_arm::forward_inplace(Mat& bottom_top_blob) const

    const float* slope_data_ptr = slope_data;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
--- a/src/layer/arm/prelu_arm.h
+++ b/src/layer/arm/prelu_arm.h
@@ -22,7 +22,7 @@ namespace ncnn {
 class PReLU_arm : public PReLU
 {
 public:
    virtual int forward_inplace(Mat& bottom_top_blob) const;
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 };

 } // namespace ncnn
--- a/src/layer/arm/relu_arm.cpp
+++ b/src/layer/arm/relu_arm.cpp
@@ -22,7 +22,7 @@ namespace ncnn {

 DEFINE_LAYER_CREATOR(ReLU_arm)

 int ReLU_arm::forward_inplace(Mat& bottom_top_blob) const
 int ReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
@@ -31,7 +31,7 @@ int ReLU_arm::forward_inplace(Mat& bottom_top_blob) const

    if (slope == 0.f)
    {
        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);
@@ -85,7 +85,7 @@ int ReLU_arm::forward_inplace(Mat& bottom_top_blob) const
    }
    else
    {
        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);
--- a/src/layer/arm/relu_arm.h
+++ b/src/layer/arm/relu_arm.h
@@ -22,7 +22,7 @@ namespace ncnn {
 class ReLU_arm : public ReLU
 {
 public:
    virtual int forward_inplace(Mat& bottom_top_blob) const;
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 };

 } // namespace ncnn
--- a/src/layer/arm/scale_arm.cpp
+++ b/src/layer/arm/scale_arm.cpp
@@ -22,11 +22,11 @@ namespace ncnn {

 DEFINE_LAYER_CREATOR(Scale_arm)

 int Scale_arm::forward_inplace(Mat& bottom_top_blob) const
 int Scale_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
    int dims = bottom_top_blob.dims;
    if (dims != 3)
        return Scale::forward_inplace(bottom_top_blob);
        return Scale::forward_inplace(bottom_top_blob, opt);

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
@@ -37,7 +37,7 @@ int Scale_arm::forward_inplace(Mat& bottom_top_blob) const
    {
        const float* scale_ptr = scale_data;
        const float* bias_ptr = bias_data;
        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);
@@ -76,7 +76,7 @@ int Scale_arm::forward_inplace(Mat& bottom_top_blob) const
    else
    {
        const float* scale_ptr = scale_data;
        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);
--- a/src/layer/arm/scale_arm.h
+++ b/src/layer/arm/scale_arm.h
@@ -22,7 +22,7 @@ namespace ncnn {
 class Scale_arm : public Scale
 {
 public:
    virtual int forward_inplace(Mat& bottom_top_blob) const;
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 };

 } // namespace ncnn
--- a/src/layer/arm/sigmoid_arm.cpp
+++ b/src/layer/arm/sigmoid_arm.cpp
@@ -25,14 +25,14 @@ namespace ncnn {

 DEFINE_LAYER_CREATOR(Sigmoid_arm)

 int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob) const
 int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
--- a/src/layer/arm/sigmoid_arm.h
+++ b/src/layer/arm/sigmoid_arm.h
@@ -22,7 +22,7 @@ namespace ncnn {
 class Sigmoid_arm : public Sigmoid
 {
 public:
    virtual int forward_inplace(Mat& bottom_top_blob) const;
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 };

 } // namespace ncnn
--- a/src/layer/arm/softmax_arm.cpp
+++ b/src/layer/arm/softmax_arm.cpp
@@ -25,12 +25,12 @@ namespace ncnn {

 DEFINE_LAYER_CREATOR(Softmax_arm)

 int Softmax_arm::forward_inplace(Mat& bottom_top_blob) const
 int Softmax_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
    int dims = bottom_top_blob.dims;

    if (dims != 3 || axis != 0)
        return Softmax::forward_inplace(bottom_top_blob);
        return Softmax::forward_inplace(bottom_top_blob, opt);

    // value = exp( value - global max value )
    // sum all values
@@ -39,10 +39,11 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob) const
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    size_t elemsize = bottom_top_blob.elemsize;
    int size = w * h;

    Mat max;
    max.create(w, h);
    max.create(w, h, elemsize, opt.workspace_allocator);
    if (max.empty())
        return -100;
    max.fill(-FLT_MAX);
@@ -57,7 +58,7 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob) const
        }
    }

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
@@ -95,7 +96,7 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob) const
    }

    Mat sum;
    sum.create(w, h);
    sum.create(w, h, elemsize, opt.workspace_allocator);
    if (sum.empty())
        return -100;
    sum.fill(0.f);
@@ -133,7 +134,7 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob) const
        }
    }

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
--- a/src/layer/arm/softmax_arm.h
+++ b/src/layer/arm/softmax_arm.h
@@ -22,7 +22,7 @@ namespace ncnn {
 class Softmax_arm : public Softmax
 {
 public:
    virtual int forward_inplace(Mat& bottom_top_blob) const;
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 };

 } // namespace ncnn
--- a/src/layer/batchnorm.cpp
+++ b/src/layer/batchnorm.cpp
@@ -68,7 +68,7 @@ int BatchNorm::load_model(const ModelBin& mb)
    return 0;
 }

 int BatchNorm::forward_inplace(Mat& bottom_top_blob) const
 int BatchNorm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
    // a = bias - slope * mean / sqrt(var)
    // b = slope / sqrt(var)
@@ -82,7 +82,7 @@ int BatchNorm::forward_inplace(Mat& bottom_top_blob) const

        float* ptr = bottom_top_blob;

        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i=0; i<w; i++)
        {
            ptr[i] = b_data[i] * ptr[i] + a_data[i];
@@ -94,7 +94,7 @@ int BatchNorm::forward_inplace(Mat& bottom_top_blob) const
        int w = bottom_top_blob.w;
        int h = bottom_top_blob.h;

        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i=0; i<h; i++)
        {
            float* ptr = bottom_top_blob.row(i);
@@ -114,7 +114,7 @@ int BatchNorm::forward_inplace(Mat& bottom_top_blob) const
        int h = bottom_top_blob.h;
        int size = w * h;

        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);
--- a/src/layer/batchnorm.h
+++ b/src/layer/batchnorm.h
@@ -28,7 +28,7 @@ public:

    virtual int load_model(const ModelBin& mb);

    virtual int forward_inplace(Mat& bottom_top_blob) const;
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

 public:
    // param
--- a/src/layer/bias.cpp
+++ b/src/layer/bias.cpp
@@ -40,14 +40,14 @@ int Bias::load_model(const ModelBin& mb)
    return 0;
 }

 int Bias::forward_inplace(Mat& bottom_top_blob) const
 int Bias::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
--- a/src/layer/bias.h
+++ b/src/layer/bias.h
@@ -28,7 +28,7 @@ public:

    virtual int load_model(const ModelBin& mb);

    virtual int forward_inplace(Mat& bottom_top_blob) const;
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

 public:
    // param
--- a/src/layer/binaryop.cpp
+++ b/src/layer/binaryop.cpp
@@ -43,7 +43,7 @@ int BinaryOp::load_param(const ParamDict& pd)
 }

 template<typename Op>
 static int binary_op(const Mat& a, const Mat& b, Mat& c)
 static int binary_op(const Mat& a, const Mat& b, Mat& c, const Option& opt)
 {
    Op op;

@@ -51,6 +51,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
    int h = a.h;
    int channels = a.c;
    int size = w * h;
    size_t elemsize = a.elemsize;

    int w1 = b.w;
    int h1 = b.h;
@@ -59,13 +60,13 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)

    if (a.dims == 3)
    {
        c.create(w, h, channels);
        c.create(w, h, channels, elemsize, opt.blob_allocator);
        if (c.empty())
            return -100;

        if (b.dims == 3)
        {
            #pragma omp parallel for
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q=0; q<channels; q++)
            {
                const float* ptr = a.channel(q);
@@ -83,7 +84,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)

        if (b.dims == 2)
        {
            #pragma omp parallel for
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q=0; q<channels; q++)
            {
                const float* ptr = a.channel(q);
@@ -111,7 +112,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
            if (b.w == 1)
            {
                const float b0 = b[0];
                #pragma omp parallel for
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q=0; q<channels; q++)
                {
                    const float* ptr = a.channel(q);
@@ -126,7 +127,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
                return 0;
            }

            #pragma omp parallel for
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q=0; q<channels; q++)
            {
                const float* ptr = a.channel(q);
@@ -146,11 +147,11 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
    {
        if (b.dims == 3)
        {
            c.create(w1, h1, channels1);
            c.create(w1, h1, channels1, elemsize, opt.blob_allocator);
            if (c.empty())
                return -100;

            #pragma omp parallel for
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q=0; q<channels1; q++)
            {
                const float* ptr = (const float*)a + h1 * q;
@@ -173,7 +174,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
            return 0;
        }

        c.create(w, h);
        c.create(w, h, elemsize, opt.blob_allocator);
        if (c.empty())
            return -100;

@@ -189,7 +190,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)

        if (b.dims == 1)
        {
            c.create(w, h);
            c.create(w, h, elemsize, opt.blob_allocator);
            if (c.empty())
                return -100;

@@ -228,12 +229,12 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
        {
            if (b.dims == 3)
            {
                c.create(w1, h1, channels1);
                c.create(w1, h1, channels1, elemsize, opt.blob_allocator);
                if (c.empty())
                    return -100;

                const float a0 = a[0];
                #pragma omp parallel for
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q=0; q<channels1; q++)
                {
                    const float* ptr1 = b.channel(q);
@@ -250,7 +251,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)

            if (b.dims == 2)
            {
                c.create(w1, h1);
                c.create(w1, h1, elemsize, opt.blob_allocator);
                if (c.empty())
                    return -100;

@@ -265,7 +266,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)

            if (b.dims == 1)
            {
                c.create(w1);
                c.create(w1, elemsize, opt.blob_allocator);
                if (c.empty())
                    return -100;

@@ -281,11 +282,11 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)

        if (b.dims == 3)
        {
            c.create(w1, h1, channels1);
            c.create(w1, h1, channels1, elemsize, opt.blob_allocator);
            if (c.empty())
                return -100;

            #pragma omp parallel for
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q=0; q<channels1; q++)
            {
                const float a0 = a[q];
@@ -303,7 +304,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)

        if (b.dims == 2)
        {
            c.create(w1, h1);
            c.create(w1, h1, elemsize, opt.blob_allocator);
            if (c.empty())
                return -100;

@@ -327,7 +328,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)

        if (b.dims == 1)
        {
            c.create(w);
            c.create(w, elemsize, opt.blob_allocator);
            if (c.empty())
                return -100;

@@ -353,7 +354,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
 }

 template<typename Op>
 static int binary_op_scalar_inplace(Mat& a, float b)
 static int binary_op_scalar_inplace(Mat& a, float b, const Option& opt)
 {
    Op op;

@@ -362,7 +363,7 @@ static int binary_op_scalar_inplace(Mat& a, float b)
    int channels = a.c;
    int size = w * h;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        float* ptr = a.channel(q);
@@ -401,7 +402,7 @@ struct binary_op_rdiv : std::binary_function<T,T,T> {
    T operator() (const T& x, const T& y) const { return y / x; }
 };

 // Two-input forward pass: reads bottom_blobs[0] and bottom_blobs[1], writes
 // top_blobs[0], and dispatches on op_type to the binary_op<> instantiation for
 // the matching functor. Returns whatever binary_op returns, or 0 when op_type
 // matches none of the Operation_* values tested below.
 // NOTE(review): this span is a rendered diff hunk with the +/- markers stripped;
 // the signature and every return appear twice, presumably the pre-change line
 // followed by its replacement that forwards `opt` -- confirm against the commit.
 int BinaryOp::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
 int BinaryOp::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
 {
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& bottom_blob1 = bottom_blobs[1];
@@ -409,63 +410,63 @@ int BinaryOp::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to
    Mat& top_blob = top_blobs[0];

    // One guarded return per supported operation; the first match wins.
    if (op_type == Operation_ADD)
        return binary_op< std::plus<float> >(bottom_blob, bottom_blob1, top_blob);
        return binary_op< std::plus<float> >(bottom_blob, bottom_blob1, top_blob, opt);

    if (op_type == Operation_SUB)
        return binary_op< std::minus<float> >(bottom_blob, bottom_blob1, top_blob);
        return binary_op< std::minus<float> >(bottom_blob, bottom_blob1, top_blob, opt);

    if (op_type == Operation_MUL)
        return binary_op< std::multiplies<float> >(bottom_blob, bottom_blob1, top_blob);
        return binary_op< std::multiplies<float> >(bottom_blob, bottom_blob1, top_blob, opt);

    if (op_type == Operation_DIV)
        return binary_op< std::divides<float> >(bottom_blob, bottom_blob1, top_blob);
        return binary_op< std::divides<float> >(bottom_blob, bottom_blob1, top_blob, opt);

    if (op_type == Operation_MAX)
        return binary_op< binary_op_max<float> >(bottom_blob, bottom_blob1, top_blob);
        return binary_op< binary_op_max<float> >(bottom_blob, bottom_blob1, top_blob, opt);

    if (op_type == Operation_MIN)
        return binary_op< binary_op_min<float> >(bottom_blob, bottom_blob1, top_blob);
        return binary_op< binary_op_min<float> >(bottom_blob, bottom_blob1, top_blob, opt);

    if (op_type == Operation_POW)
        return binary_op< binary_op_pow<float> >(bottom_blob, bottom_blob1, top_blob);
        return binary_op< binary_op_pow<float> >(bottom_blob, bottom_blob1, top_blob, opt);

    if (op_type == Operation_RSUB)
        return binary_op< binary_op_rsub<float> >(bottom_blob, bottom_blob1, top_blob);
        return binary_op< binary_op_rsub<float> >(bottom_blob, bottom_blob1, top_blob, opt);

    if (op_type == Operation_RDIV)
        return binary_op< binary_op_rdiv<float> >(bottom_blob, bottom_blob1, top_blob);
        return binary_op< binary_op_rdiv<float> >(bottom_blob, bottom_blob1, top_blob, opt);

    // Unrecognized op_type: nothing is computed and success (0) is reported.
    return 0;
 }

 // In-place scalar variant: dispatches on op_type to the
 // binary_op_scalar_inplace<> instantiation for the matching functor, passing
 // bottom_top_blob and the scalar `b` (not declared in this function; presumably
 // a BinaryOp member -- confirm in binaryop.h). Returns the callee's result, or 0
 // when op_type matches none of the Operation_* values tested below.
 // NOTE(review): this span is a rendered diff hunk with the +/- markers stripped;
 // the signature and every return appear twice, presumably the pre-change line
 // followed by its replacement that forwards `opt` -- confirm against the commit.
 int BinaryOp::forward_inplace(Mat& bottom_top_blob) const
 int BinaryOp::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
    // One guarded return per supported operation; the first match wins.
    if (op_type == Operation_ADD)
        return binary_op_scalar_inplace< std::plus<float> >(bottom_top_blob, b);
        return binary_op_scalar_inplace< std::plus<float> >(bottom_top_blob, b, opt);

    if (op_type == Operation_SUB)
        return binary_op_scalar_inplace< std::minus<float> >(bottom_top_blob, b);
        return binary_op_scalar_inplace< std::minus<float> >(bottom_top_blob, b, opt);

    if (op_type == Operation_MUL)
        return binary_op_scalar_inplace< std::multiplies<float> >(bottom_top_blob, b);
        return binary_op_scalar_inplace< std::multiplies<float> >(bottom_top_blob, b, opt);

    if (op_type == Operation_DIV)
        return binary_op_scalar_inplace< std::divides<float> >(bottom_top_blob, b);
        return binary_op_scalar_inplace< std::divides<float> >(bottom_top_blob, b, opt);

    if (op_type == Operation_MAX)
        return binary_op_scalar_inplace< binary_op_max<float> >(bottom_top_blob, b);
        return binary_op_scalar_inplace< binary_op_max<float> >(bottom_top_blob, b, opt);

    if (op_type == Operation_MIN)
        return binary_op_scalar_inplace< binary_op_min<float> >(bottom_top_blob, b);
        return binary_op_scalar_inplace< binary_op_min<float> >(bottom_top_blob, b, opt);

    if (op_type == Operation_POW)
        return binary_op_scalar_inplace< binary_op_pow<float> >(bottom_top_blob, b);
        return binary_op_scalar_inplace< binary_op_pow<float> >(bottom_top_blob, b, opt);

    if (op_type == Operation_RSUB)
        return binary_op_scalar_inplace< binary_op_rsub<float> >(bottom_top_blob, b);
        return binary_op_scalar_inplace< binary_op_rsub<float> >(bottom_top_blob, b, opt);

    if (op_type == Operation_RDIV)
        return binary_op_scalar_inplace< binary_op_rdiv<float> >(bottom_top_blob, b);
        return binary_op_scalar_inplace< binary_op_rdiv<float> >(bottom_top_blob, b, opt);

    // Unrecognized op_type: the blob is left untouched and success (0) is reported.
    return 0;
 }
--- a/src/layer/binaryop.h
+++ b/src/layer/binaryop.h
@@ -26,9 +26,9 @@ public:

    virtual int load_param(const ParamDict& pd);

    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

    virtual int forward_inplace(Mat& bottom_top_blob) const;
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

    enum {
        Operation_ADD   = 0,
--- a/src/layer/bnll.cpp
+++ b/src/layer/bnll.cpp
@@ -25,14 +25,14 @@ BNLL::BNLL()
    support_inplace = true;
 }

 int BNLL::forward_inplace(Mat& bottom_top_blob) const
 int BNLL::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
--- a/src/layer/bnll.h
+++ b/src/layer/bnll.h
@@ -24,7 +24,7 @@ class BNLL : public Layer
 public:
    BNLL();

    virtual int forward_inplace(Mat& bottom_top_blob) const;
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

 public:
 };
--- a/src/layer/clip.cpp
+++ b/src/layer/clip.cpp
@@ -34,14 +34,14 @@ int Clip::load_param(const ParamDict& pd)
    return 0;
 }

 int Clip::forward_inplace(Mat& bottom_top_blob) const
 int Clip::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
--- a/src/layer/clip.h
+++ b/src/layer/clip.h
@@ -26,7 +26,7 @@ public:

    virtual int load_param(const ParamDict& pd);

    virtual int forward_inplace(Mat& bottom_top_blob) const;
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

 public:
    float min;
--- a/src/layer/concat.cpp
+++ b/src/layer/concat.cpp
@@ -31,7 +31,7 @@ int Concat::load_param(const ParamDict& pd)
    return 0;
 }

 int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
 int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
 {
    int dims = bottom_blobs[0].dims;
    size_t elemsize = bottom_blobs[0].elemsize;
@@ -48,7 +48,7 @@ int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(top_w, elemsize);
        top_blob.create(top_w, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

@@ -82,7 +82,7 @@ int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(w, top_h, elemsize);
        top_blob.create(w, top_h, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

@@ -116,11 +116,11 @@ int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(top_w, h, elemsize);
        top_blob.create(top_w, h, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i=0; i<h; i++)
        {
            float* outptr = top_blob.row(i);
@@ -153,7 +153,7 @@ int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(w, h, top_channels, elemsize);
        top_blob.create(w, h, top_channels, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

@@ -190,11 +190,11 @@ int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(w, top_h, channels, elemsize);
        top_blob.create(w, top_h, channels, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            float* outptr = top_blob.channel(q);
@@ -230,11 +230,11 @@ int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(top_w, h, channels, elemsize);
        top_blob.create(top_w, h, channels, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            float* outptr = top_blob.channel(q);
--- a/src/layer/concat.h
+++ b/src/layer/concat.h
@@ -26,7 +26,7 @@ public:

    virtual int load_param(const ParamDict& pd);

    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

 public:
    int axis;
--- a/src/layer/convolution.cpp
+++ b/src/layer/convolution.cpp
@@ -59,7 +59,7 @@ int Convolution::load_model(const ModelBin& mb)
    return 0;
 }

 int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const
 int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    // convolv with NxN kernel
    // value = value + bias
@@ -89,7 +89,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const
            op->load_model(ModelBinFromMatArray(weights));

            // forward
            op->forward(bottom_blob, top_blob);
            op->forward(bottom_blob, top_blob, opt);

            delete op;

@@ -100,6 +100,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;

 //     fprintf(stderr, "Convolution input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d\n", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);

@@ -109,7 +110,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const
    Mat bottom_blob_bordered = bottom_blob;
    if (pad_w > 0 || pad_h > 0)
    {
        copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f);
        copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
        if (bottom_blob_bordered.empty())
            return -100;

@@ -122,7 +123,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const
        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
        if (wpad > 0 || hpad > 0)
        {
            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f);
            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
            if (bottom_blob_bordered.empty())
                return -100;
        }
@@ -134,7 +135,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const
    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;

    top_blob.create(outw, outh, num_output);
    top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

@@ -160,7 +161,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const
    }

    // num_output
    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p=0; p<num_output; p++)
    {
        float* outptr = top_blob.channel(p);
--- a/src/layer/convolution.h
+++ b/src/layer/convolution.h
@@ -28,7 +28,7 @@ public:

    virtual int load_model(const ModelBin& mb);

    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

 public:
    // param
--- a/src/layer/convolutiondepthwise.cpp
+++ b/src/layer/convolutiondepthwise.cpp
@@ -64,7 +64,7 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb)
    return 0;
 }

 int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
 int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    // convolv with NxN kernel
    // value = value + bias
@@ -72,6 +72,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;

    if (channels % group != 0 || num_output % group != 0)
    {
@@ -87,7 +88,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
    Mat bottom_blob_bordered = bottom_blob;
    if (pad_w > 0 || pad_h > 0)
    {
        copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f);
        copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
        if (bottom_blob_bordered.empty())
            return -100;

@@ -100,7 +101,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
        if (wpad > 0 || hpad > 0)
        {
            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f);
            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
            if (bottom_blob_bordered.empty())
                return -100;
        }
@@ -112,7 +113,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;

    top_blob.create(outw, outh, num_output);
    top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

@@ -140,7 +141,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
    // depth-wise
    if (channels == group && group == num_output)
    {
        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g=0; g<group; g++)
        {
            float* outptr = top_blob.channel(g);
@@ -179,9 +180,9 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
    const int num_output_g = num_output / group;

 #ifdef _WIN32
    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
 #else // _WIN32
    #pragma omp parallel for collapse(2)
    #pragma omp parallel for collapse(2) num_threads(opt.num_threads)
 #endif // _WIN32
    for (int g=0; g<group; g++)
    {
--- a/src/layer/convolutiondepthwise.h
+++ b/src/layer/convolutiondepthwise.h
@@ -28,7 +28,7 @@ public:

    virtual int load_model(const ModelBin& mb);

    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

 public:
    // param
--- a/src/layer/crop.cpp
+++ b/src/layer/crop.cpp
@@ -39,7 +39,7 @@ int Crop::load_param(const ParamDict& pd)
    return 0;
 }

 int Crop::forward(const Mat& bottom_blob, Mat& top_blob) const
 int Crop::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
@@ -56,14 +56,14 @@ int Crop::forward(const Mat& bottom_blob, Mat& top_blob) const
    int left = woffset;
    int right = w - _outw - woffset;

    copy_cut_border(bottom_blob_sliced, top_blob, top, bottom, left, right);
    copy_cut_border(bottom_blob_sliced, top_blob, top, bottom, left, right, opt.blob_allocator, opt.num_threads);
    if (top_blob.empty())
        return -100;

    return 0;
 }

 int Crop::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
 int Crop::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
 {
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& reference_blob = bottom_blobs[1];
@@ -85,7 +85,7 @@ int Crop::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl

    Mat& top_blob = top_blobs[0];

    copy_cut_border(bottom_blob_sliced, top_blob, top, bottom, left, right);
    copy_cut_border(bottom_blob_sliced, top_blob, top, bottom, left, right, opt.blob_allocator, opt.num_threads);
    if (top_blob.empty())
        return -100;

--- a/src/layer/crop.h
+++ b/src/layer/crop.h
@@ -26,9 +26,9 @@ public:

    virtual int load_param(const ParamDict& pd);

    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

 public:
    int woffset;
--- a/src/layer/deconvolution.cpp
+++ b/src/layer/deconvolution.cpp
@@ -57,7 +57,7 @@ int Deconvolution::load_model(const ModelBin& mb)
    return 0;
 }

 int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const
 int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    // backward strided convolv with NxN kernel
    // value = value + bias
@@ -65,6 +65,7 @@ int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;

 //     fprintf(stderr, "Deconvolution input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d\n", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);

@@ -74,10 +75,20 @@ int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const
    int outw = (w - 1) * stride_w + kernel_extent_w;
    int outh = (h - 1) * stride_h + kernel_extent_h;

    Mat top_blob_bordered = top_blob;
    top_blob_bordered.create(outw, outh, num_output);
    if (top_blob_bordered.empty())
        return -100;
    Mat top_blob_bordered;
    if (pad_w > 0 || pad_h > 0)
    {
        top_blob_bordered.create(outw, outh, num_output, elemsize, opt.workspace_allocator);
        if (top_blob_bordered.empty())
            return -100;
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output, elemsize, opt.blob_allocator);
        if (top_blob_bordered.empty())
            return -100;
    }

    const int maxk = kernel_w * kernel_h;

@@ -101,7 +112,7 @@ int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const
    }

    // num_output
    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p=0; p<num_output; p++)
    {
        Mat out = top_blob_bordered.channel(p);
@@ -136,17 +147,19 @@ int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const
        }
    }

    top_blob = top_blob_bordered;

    if (pad_w > 0 || pad_h > 0)
    {
        copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w);
        copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w, opt.blob_allocator, opt.num_threads);
        if (top_blob.empty())
            return -100;

        outw = top_blob.w;
        outh = top_blob.h;
    }
    else
    {
        top_blob = top_blob_bordered;
    }

    return 0;
 }
--- a/src/layer/deconvolution.h
+++ b/src/layer/deconvolution.h
@@ -28,7 +28,7 @@ public:

    virtual int load_model(const ModelBin& mb);

    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

 public:
    // param
--- a/src/layer/deconvolutiondepthwise.cpp
+++ b/src/layer/deconvolutiondepthwise.cpp
@@ -58,7 +58,7 @@ int DeconvolutionDepthWise::load_model(const ModelBin& mb)
    return 0;
 }

 int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
 int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    // deconvolv with NxN kernel
    // value = value + bias
@@ -66,6 +66,7 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;

    if (channels % group != 0 || num_output % group != 0)
    {
@@ -79,10 +80,20 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
    int outw = (w - 1) * stride_w + kernel_extent_w;
    int outh = (h - 1) * stride_h + kernel_extent_h;

    Mat top_blob_bordered = top_blob;
    top_blob_bordered.create(outw, outh, num_output);
    if (top_blob_bordered.empty())
        return -100;
    Mat top_blob_bordered;
    if (pad_w > 0 || pad_h > 0)
    {
        top_blob_bordered.create(outw, outh, num_output, elemsize, opt.workspace_allocator);
        if (top_blob_bordered.empty())
            return -100;
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output, elemsize, opt.blob_allocator);
        if (top_blob_bordered.empty())
            return -100;
    }

    const int maxk = kernel_w * kernel_h;

@@ -108,7 +119,7 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
    // depth-wise
    if (channels == group && group == num_output)
    {
        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g=0; g<group; g++)
        {
            const float* inptr = bottom_blob.channel(g);
@@ -141,7 +152,7 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
        const int channels_g = channels / group;
        const int num_output_g = num_output / group;

        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g = 0; g < group; g++)
        {
            const float* weight_data_ptr = (const float*)weight_data + maxk * channels_g * num_output_g * g;
@@ -180,17 +191,19 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
        }
    }

    top_blob = top_blob_bordered;

    if (pad_w > 0 || pad_h > 0)
    {
        copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w);
        copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w, opt.blob_allocator, opt.num_threads);
        if (top_blob.empty())
            return -100;

        outw = top_blob.w;
        outh = top_blob.h;
    }
    else
    {
        top_blob = top_blob_bordered;
    }

    return 0;
 }
--- a/src/layer/deconvolutiondepthwise.h
+++ b/src/layer/deconvolutiondepthwise.h
@@ -28,7 +28,7 @@ public:

    virtual int load_model(const ModelBin& mb);

    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

 public:
    // param
--- a/src/layer/detectionoutput.cpp
+++ b/src/layer/detectionoutput.cpp
@@ -141,7 +141,7 @@ static void nms_sorted_bboxes(const std::vector<BBoxRect>& bboxes, std::vector<i
    }
 }

 int DetectionOutput::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
 int DetectionOutput::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
 {
    const Mat& location = bottom_blobs[0];
    const Mat& confidence = bottom_blobs[1];
@@ -151,7 +151,7 @@ int DetectionOutput::forward(const std::vector<Mat>& bottom_blobs, std::vector<M

    // apply location with priorbox
    Mat bboxes;
    bboxes.create(4, num_prior);
    bboxes.create(4, num_prior, 4u, opt.workspace_allocator);
    if (bboxes.empty())
        return -100;

@@ -159,7 +159,7 @@ int DetectionOutput::forward(const std::vector<Mat>& bottom_blobs, std::vector<M
    const float* priorbox_ptr = priorbox.row(0);
    const float* variance_ptr = priorbox.row(1);

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int i = 0; i < num_prior; i++)
    {
        const float* loc = location_ptr + i * 4;
@@ -192,7 +192,7 @@ int DetectionOutput::forward(const std::vector<Mat>& bottom_blobs, std::vector<M
    all_class_bbox_scores.resize(num_class);

    // start from 1 to ignore background class
    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int i = 1; i < num_class; i++)
    {
        // filter by confidence_threshold
@@ -262,7 +262,7 @@ int DetectionOutput::forward(const std::vector<Mat>& bottom_blobs, std::vector<M
    int num_detected = bbox_rects.size();

    Mat& top_blob = top_blobs[0];
    top_blob.create(6, num_detected);
    top_blob.create(6, num_detected, 4u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

--- a/src/layer/detectionoutput.h
+++ b/src/layer/detectionoutput.h
@@ -26,7 +26,7 @@ public:

    virtual int load_param(const ParamDict& pd);

    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

 public:
    int num_class;
--- a/src/layer/dropout.cpp
+++ b/src/layer/dropout.cpp
@@ -31,7 +31,7 @@ int Dropout::load_param(const ParamDict& pd)
    return 0;
 }

 int Dropout::forward_inplace(Mat& bottom_top_blob) const
 int Dropout::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
    if (scale == 1.f)
    {
@@ -43,7 +43,7 @@ int Dropout::forward_inplace(Mat& bottom_top_blob) const
    int channels = bottom_top_blob.c;
    int size = w * h;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
--- a/src/layer/dropout.h
+++ b/src/layer/dropout.h
@@ -26,7 +26,7 @@ public:

    virtual int load_param(const ParamDict& pd);

    virtual int forward_inplace(Mat& bottom_top_blob) const;
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

 public:
    float scale;
--- a/src/layer/eltwise.cpp
+++ b/src/layer/eltwise.cpp
@@ -31,16 +31,17 @@ int Eltwise::load_param(const ParamDict& pd)
    return 0;
 }

 int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
 int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
 {
    const Mat& bottom_blob = bottom_blobs[0];
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int size = w * h;

    Mat& top_blob = top_blobs[0];
    top_blob.create(w, h, channels);
    top_blob.create(w, h, channels, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

@@ -48,7 +49,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
    {
        // first blob
        const Mat& bottom_blob1 = bottom_blobs[1];
        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
@@ -64,7 +65,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
        for (size_t b=2; b<bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob1 = bottom_blobs[b];
            #pragma omp parallel for
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q=0; q<channels; q++)
            {
                const float* ptr = bottom_blob1.channel(q);
@@ -83,7 +84,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
        {
            // first blob
            const Mat& bottom_blob1 = bottom_blobs[1];
            #pragma omp parallel for
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q=0; q<channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
@@ -99,7 +100,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
            for (size_t b=2; b<bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob1 = bottom_blobs[b];
                #pragma omp parallel for
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q=0; q<channels; q++)
                {
                    const float* ptr = bottom_blob1.channel(q);
@@ -118,7 +119,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
            const Mat& bottom_blob1 = bottom_blobs[1];
            float coeff0 = coeffs[0];
            float coeff1 = coeffs[1];
            #pragma omp parallel for
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q=0; q<channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
@@ -135,7 +136,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
            {
                const Mat& bottom_blob1 = bottom_blobs[b];
                float coeff = coeffs[b];
                #pragma omp parallel for
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q=0; q<channels; q++)
                {
                    const float* ptr = bottom_blob1.channel(q);
@@ -153,7 +154,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
    {
        // first blob
        const Mat& bottom_blob1 = bottom_blobs[1];
        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
@@ -169,7 +170,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
        for (size_t b=2; b<bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob1 = bottom_blobs[b];
            #pragma omp parallel for
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q=0; q<channels; q++)
            {
                const float* ptr = bottom_blob1.channel(q);
--- a/src/layer/eltwise.h
+++ b/src/layer/eltwise.h
@@ -26,7 +26,7 @@ public:

    virtual int load_param(const ParamDict& pd);

    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

    enum { Operation_PROD = 0, Operation_SUM = 1, Operation_MAX = 2 };

--- a/src/layer/elu.cpp
+++ b/src/layer/elu.cpp
@@ -32,14 +32,14 @@ int ELU::load_param(const ParamDict& pd)
    return 0;
 }

 int ELU::forward_inplace(Mat& bottom_top_blob) const
 int ELU::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
--- a/src/layer/elu.h
+++ b/src/layer/elu.h
@@ -26,7 +26,7 @@ public:

    virtual int load_param(const ParamDict& pd);

    virtual int forward_inplace(Mat& bottom_top_blob) const;
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

 public:
    float alpha;
--- a/src/layer/embed.cpp
+++ b/src/layer/embed.cpp
@@ -51,16 +51,16 @@ int Embed::load_model(const ModelBin& mb)
    return 0;
 }

 int Embed::forward(const Mat& bottom_blob, Mat& top_blob) const
 int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    int words = bottom_blob.total();

    top_blob.create(num_output, words);
    top_blob.create(num_output, words, 4u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // num_output
    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<words; q++)
    {
        float* outptr = top_blob.row(q);
--- a/src/layer/embed.h
+++ b/src/layer/embed.h
@@ -28,7 +28,7 @@ public:

    virtual int load_model(const ModelBin& mb);

    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

 public:
    // param
--- a/src/layer/exp.cpp
+++ b/src/layer/exp.cpp
@@ -34,7 +34,7 @@ int Exp::load_param(const ParamDict& pd)
    return 0;
 }

 int Exp::forward_inplace(Mat& bottom_top_blob) const
 int Exp::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
@@ -43,7 +43,7 @@ int Exp::forward_inplace(Mat& bottom_top_blob) const

    if (base == -1.f)
    {
        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);
@@ -56,7 +56,7 @@ int Exp::forward_inplace(Mat& bottom_top_blob) const
    }
    else
    {
        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);
--- a/src/layer/exp.h
+++ b/src/layer/exp.h
@@ -26,7 +26,7 @@ public:

    virtual int load_param(const ParamDict& pd);

    virtual int forward_inplace(Mat& bottom_top_blob) const;
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

 public:
    float base;
--- a/src/layer/expanddims.cpp
+++ b/src/layer/expanddims.cpp
@@ -33,7 +33,7 @@ int ExpandDims::load_param(const ParamDict& pd)
    return 0;
 }

 int ExpandDims::forward(const Mat& bottom_blob, Mat& top_blob) const
 int ExpandDims::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
@@ -46,28 +46,28 @@ int ExpandDims::forward(const Mat& bottom_blob, Mat& top_blob) const
        if (expand_w)
        {
            if (expand_h)
                top_blob = bottom_blob.reshape(1, 1, w);
                top_blob = bottom_blob.reshape(1, 1, w, opt.blob_allocator);
            else if (expand_c)
                top_blob = bottom_blob.reshape(1, w, 1);
                top_blob = bottom_blob.reshape(1, w, 1, opt.blob_allocator);
            else
                top_blob = bottom_blob.reshape(1, w);
                top_blob = bottom_blob.reshape(1, w, opt.blob_allocator);
        }
        else if (expand_h)
        {
            if (expand_c)
                top_blob = bottom_blob.reshape(w, 1, 1);
                top_blob = bottom_blob.reshape(w, 1, 1, opt.blob_allocator);
            else
                top_blob = bottom_blob.reshape(w, 1);
                top_blob = bottom_blob.reshape(w, 1, opt.blob_allocator);
        }
    }
    else if (dims == 2)
    {
        if (expand_w)
            top_blob = bottom_blob.reshape(1, w, h);
            top_blob = bottom_blob.reshape(1, w, h, opt.blob_allocator);
        else if (expand_h)
            top_blob = bottom_blob.reshape(w, 1, h);
            top_blob = bottom_blob.reshape(w, 1, h, opt.blob_allocator);
        else if (expand_c)
            top_blob = bottom_blob.reshape(w, h, 1);
            top_blob = bottom_blob.reshape(w, h, 1, opt.blob_allocator);
    }

    if (top_blob.empty())
--- a/src/layer/expanddims.h
+++ b/src/layer/expanddims.h
@@ -26,7 +26,7 @@ public:

    virtual int load_param(const ParamDict& pd);

    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

 public:
    int expand_w;
--- a/src/layer/flatten.cpp
+++ b/src/layer/flatten.cpp
@@ -24,18 +24,19 @@ Flatten::Flatten()
    support_inplace = false;
 }

 int Flatten::forward(const Mat& bottom_blob, Mat& top_blob) const
 int Flatten::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int size = w * h;

    top_blob.create(size * channels);
    top_blob.create(size * channels, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        const float* ptr = bottom_blob.channel(q);
--- a/src/layer/flatten.h
+++ b/src/layer/flatten.h
@@ -24,7 +24,7 @@ class Flatten : public Layer
 public:
    Flatten();

    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 };

 } // namespace ncnn
--- a/src/layer/innerproduct.cpp
+++ b/src/layer/innerproduct.cpp
@@ -49,19 +49,20 @@ int InnerProduct::load_model(const ModelBin& mb)
    return 0;
 }

 int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob) const
 int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int size = w * h;

    top_blob.create(num_output);
    top_blob.create(num_output, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // num_output
    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p=0; p<num_output; p++)
    {
        float sum = 0.f;
--- a/src/layer/innerproduct.h
+++ b/src/layer/innerproduct.h
@@ -28,7 +28,7 @@ public:

    virtual int load_model(const ModelBin& mb);

    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

 public:
    // param
--- a/src/layer/input.cpp
+++ b/src/layer/input.cpp
@@ -33,7 +33,7 @@ int Input::load_param(const ParamDict& pd)
    return 0;
 }

 int Input::forward_inplace(Mat& /*bottom_top_blob*/) const
 int Input::forward_inplace(Mat& /*bottom_top_blob*/, const Option& /*opt*/) const
 {
    return 0;
 }
--- a/src/layer/input.h
+++ b/src/layer/input.h
@@ -26,7 +26,7 @@ public:

    virtual int load_param(const ParamDict& pd);

    virtual int forward_inplace(Mat& bottom_top_blob) const;
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

 public:
    int w;
--- a/src/layer/instancenorm.cpp
+++ b/src/layer/instancenorm.cpp
@@ -46,7 +46,7 @@ int InstanceNorm::load_model(const ModelBin& mb)
    return 0;
 }

 int InstanceNorm::forward_inplace(Mat& bottom_top_blob) const
 int InstanceNorm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
    // x = (x - mean) / (sqrt(var) + eps) * gamma + beta

@@ -54,7 +54,7 @@ int InstanceNorm::forward_inplace(Mat& bottom_top_blob) const
    int h = bottom_top_blob.h;
    int size = w * h;

    #pragma omp parallel for
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
--- a/src/layer/instancenorm.h
+++ b/src/layer/instancenorm.h
@@ -28,7 +28,7 @@ public:

    virtual int load_model(const ModelBin& mb);

    virtual int forward_inplace(Mat& bottom_top_blob) const;
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

 public:
    // param
--- a/src/layer/interp.cpp
+++ b/src/layer/interp.cpp
@@ -35,11 +35,13 @@ int Interp::load_param(const ParamDict& pd)
    return 0;
 }

 int Interp::forward(const Mat &bottom_blob, Mat &top_blob) const
 int Interp::forward(const Mat &bottom_blob, Mat &top_blob, const Option& opt) const
 {
    int h = bottom_blob.h;
    int w = bottom_blob.w;
    int c = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;

    int oh = output_height;
    int ow = output_width;
    if (bottom_blob.dims == 1)
@@ -58,13 +60,13 @@ int Interp::forward(const Mat &bottom_blob, Mat &top_blob) const
        top_blob = bottom_blob;
        return 0;
    }
    top_blob.create(ow, oh, c);
    top_blob.create(ow, oh, c, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (bottom_blob.dims == 1)
    {
        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < c; ++q)
        {
            Mat top_blob_c = top_blob.channel(q);
@@ -76,7 +78,7 @@ int Interp::forward(const Mat &bottom_blob, Mat &top_blob) const

    if (resize_type == 1)//nearest
    {
        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < c; ++q)
        {
            const float *ptr = bottom_blob.channel(q);
--- a/src/layer/interp.h
+++ b/src/layer/interp.h
@@ -26,7 +26,7 @@ public:

    virtual int load_param(const ParamDict& pd);

    virtual int forward(const Mat &bottom_blob, Mat &top_blob) const;
    virtual int forward(const Mat &bottom_blob, Mat &top_blob, const Option& opt) const;

 public:
    // param
--- a/src/layer/log.cpp
+++ b/src/layer/log.cpp
@@ -34,7 +34,7 @@ int Log::load_param(const ParamDict& pd)
    return 0;
 }

 int Log::forward_inplace(Mat& bottom_top_blob) const
 int Log::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
@@ -43,7 +43,7 @@ int Log::forward_inplace(Mat& bottom_top_blob) const

    if (base == -1.f)
    {
        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);
@@ -58,7 +58,7 @@ int Log::forward_inplace(Mat& bottom_top_blob) const
    {
        float log_base_inv = 1.f / log(base);

        #pragma omp parallel for
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);