Browse Source

implement ncnn blob/workspace allocator, fine-grained per-layer openmp threads control, fix #469

tags/20180830
nihui 7 years ago
parent
commit
9706cd1447
100 changed files with 936 additions and 393 deletions
  1. +17
    -2
      benchmark/benchncnn.cpp
  2. +1
    -0
      src/CMakeLists.txt
  3. +237
    -0
      src/allocator.cpp
  4. +175
    -0
      src/allocator.h
  5. +38
    -8
      src/layer.cpp
  6. +20
    -4
      src/layer.h
  7. +2
    -2
      src/layer/absval.cpp
  8. +1
    -1
      src/layer/absval.h
  9. +3
    -3
      src/layer/argmax.cpp
  10. +1
    -1
      src/layer/argmax.h
  11. +2
    -2
      src/layer/arm/absval_arm.cpp
  12. +1
    -1
      src/layer/arm/absval_arm.h
  13. +3
    -3
      src/layer/arm/batchnorm_arm.cpp
  14. +1
    -1
      src/layer/arm/batchnorm_arm.h
  15. +2
    -2
      src/layer/arm/bias_arm.cpp
  16. +1
    -1
      src/layer/arm/bias_arm.h
  17. +16
    -16
      src/layer/arm/convolution_1x1.h
  18. +2
    -2
      src/layer/arm/convolution_2x2.h
  19. +28
    -27
      src/layer/arm/convolution_3x3.h
  20. +2
    -2
      src/layer/arm/convolution_4x4.h
  21. +4
    -4
      src/layer/arm/convolution_5x5.h
  22. +4
    -4
      src/layer/arm/convolution_7x7.h
  23. +25
    -23
      src/layer/arm/convolution_arm.cpp
  24. +3
    -3
      src/layer/arm/convolution_arm.h
  25. +4
    -4
      src/layer/arm/convolutiondepthwise_3x3.h
  26. +10
    -9
      src/layer/arm/convolutiondepthwise_arm.cpp
  27. +1
    -1
      src/layer/arm/convolutiondepthwise_arm.h
  28. +4
    -4
      src/layer/arm/deconvolution_3x3.h
  29. +4
    -4
      src/layer/arm/deconvolution_4x4.h
  30. +26
    -13
      src/layer/arm/deconvolution_arm.cpp
  31. +1
    -1
      src/layer/arm/deconvolution_arm.h
  32. +24
    -11
      src/layer/arm/deconvolutiondepthwise_arm.cpp
  33. +1
    -1
      src/layer/arm/deconvolutiondepthwise_arm.h
  34. +11
    -10
      src/layer/arm/eltwise_arm.cpp
  35. +1
    -1
      src/layer/arm/eltwise_arm.h
  36. +5
    -4
      src/layer/arm/innerproduct_arm.cpp
  37. +1
    -1
      src/layer/arm/innerproduct_arm.h
  38. +8
    -7
      src/layer/arm/lrn_arm.cpp
  39. +1
    -1
      src/layer/arm/lrn_arm.h
  40. +2
    -2
      src/layer/arm/pooling_2x2.h
  41. +2
    -2
      src/layer/arm/pooling_3x3.h
  42. +11
    -10
      src/layer/arm/pooling_arm.cpp
  43. +1
    -1
      src/layer/arm/pooling_arm.h
  44. +3
    -3
      src/layer/arm/prelu_arm.cpp
  45. +1
    -1
      src/layer/arm/prelu_arm.h
  46. +3
    -3
      src/layer/arm/relu_arm.cpp
  47. +1
    -1
      src/layer/arm/relu_arm.h
  48. +4
    -4
      src/layer/arm/scale_arm.cpp
  49. +1
    -1
      src/layer/arm/scale_arm.h
  50. +2
    -2
      src/layer/arm/sigmoid_arm.cpp
  51. +1
    -1
      src/layer/arm/sigmoid_arm.h
  52. +7
    -6
      src/layer/arm/softmax_arm.cpp
  53. +1
    -1
      src/layer/arm/softmax_arm.h
  54. +4
    -4
      src/layer/batchnorm.cpp
  55. +1
    -1
      src/layer/batchnorm.h
  56. +2
    -2
      src/layer/bias.cpp
  57. +1
    -1
      src/layer/bias.h
  58. +41
    -40
      src/layer/binaryop.cpp
  59. +2
    -2
      src/layer/binaryop.h
  60. +2
    -2
      src/layer/bnll.cpp
  61. +1
    -1
      src/layer/bnll.h
  62. +2
    -2
      src/layer/clip.cpp
  63. +1
    -1
      src/layer/clip.h
  64. +10
    -10
      src/layer/concat.cpp
  65. +1
    -1
      src/layer/concat.h
  66. +7
    -6
      src/layer/convolution.cpp
  67. +1
    -1
      src/layer/convolution.h
  68. +8
    -7
      src/layer/convolutiondepthwise.cpp
  69. +1
    -1
      src/layer/convolutiondepthwise.h
  70. +4
    -4
      src/layer/crop.cpp
  71. +2
    -2
      src/layer/crop.h
  72. +22
    -9
      src/layer/deconvolution.cpp
  73. +1
    -1
      src/layer/deconvolution.h
  74. +23
    -10
      src/layer/deconvolutiondepthwise.cpp
  75. +1
    -1
      src/layer/deconvolutiondepthwise.h
  76. +5
    -5
      src/layer/detectionoutput.cpp
  77. +1
    -1
      src/layer/detectionoutput.h
  78. +2
    -2
      src/layer/dropout.cpp
  79. +1
    -1
      src/layer/dropout.h
  80. +11
    -10
      src/layer/eltwise.cpp
  81. +1
    -1
      src/layer/eltwise.h
  82. +2
    -2
      src/layer/elu.cpp
  83. +1
    -1
      src/layer/elu.h
  84. +3
    -3
      src/layer/embed.cpp
  85. +1
    -1
      src/layer/embed.h
  86. +3
    -3
      src/layer/exp.cpp
  87. +1
    -1
      src/layer/exp.h
  88. +9
    -9
      src/layer/expanddims.cpp
  89. +1
    -1
      src/layer/expanddims.h
  90. +4
    -3
      src/layer/flatten.cpp
  91. +1
    -1
      src/layer/flatten.h
  92. +4
    -3
      src/layer/innerproduct.cpp
  93. +1
    -1
      src/layer/innerproduct.h
  94. +1
    -1
      src/layer/input.cpp
  95. +1
    -1
      src/layer/input.h
  96. +2
    -2
      src/layer/instancenorm.cpp
  97. +1
    -1
      src/layer/instancenorm.h
  98. +6
    -4
      src/layer/interp.cpp
  99. +1
    -1
      src/layer/interp.h
  100. +3
    -3
      src/layer/log.cpp

+ 17
- 2
benchmark/benchncnn.cpp View File

@@ -52,6 +52,9 @@ public:

static int g_loop_count = 4;

static ncnn::UnlockedPoolAllocator g_blob_pool_allocator;
static ncnn::PoolAllocator g_workspace_pool_allocator;

void benchmark(const char* comment, void (*init)(ncnn::Net&), void (*run)(const ncnn::Net&))
{
ncnn::BenchNet net;
@@ -60,6 +63,9 @@ void benchmark(const char* comment, void (*init)(ncnn::Net&), void (*run)(const

net.load_model();

g_blob_pool_allocator.clear();
g_workspace_pool_allocator.clear();

// sleep 10 seconds for cooling down SOC :(
#ifdef _WIN32
Sleep(10 * 1000);
@@ -265,8 +271,6 @@ void mobilenet_yolo_run(const ncnn::Net& net)
{
ncnn::Extractor ex = net.create_extractor();

// NOTE original model input is 416x416x3
// you may change to 300x300x3 for comparison with ssd
ncnn::Mat in(416, 416, 3);
ex.input("data", in);

@@ -295,6 +299,17 @@ int main(int argc, char** argv)

g_loop_count = loop_count;

g_blob_pool_allocator.set_size_compare_ratio(0.0f);
g_workspace_pool_allocator.set_size_compare_ratio(0.5f);

ncnn::Option opt;
opt.lightmode = true;
opt.num_threads = num_threads;
opt.blob_allocator = &g_blob_pool_allocator;
opt.workspace_allocator = &g_workspace_pool_allocator;

ncnn::set_default_option(opt);

ncnn::set_cpu_powersave(powersave);

ncnn::set_omp_dynamic(0);


+ 1
- 0
src/CMakeLists.txt View File

@@ -8,6 +8,7 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR})
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/layer)

set(ncnn_SRCS
allocator.cpp
blob.cpp
cpu.cpp
layer.cpp


+ 237
- 0
src/allocator.cpp View File

@@ -0,0 +1,237 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "allocator.h"

#include <stdio.h>

namespace ncnn {

// Builds an empty pool. The reuse threshold starts at 192/256, i.e. 0.75.
PoolAllocator::PoolAllocator()
    : size_compare_ratio(192) // 0.75f * 256
{
}

// Frees every cached buffer, then reports (on stderr) each buffer that was
// handed out and never given back. Outstanding buffers are only listed here,
// they are not freed.
PoolAllocator::~PoolAllocator()
{
    clear();

    if (payouts.empty())
        return;

    fprintf(stderr, "FATAL ERROR! pool allocator destroyed too early\n");

    typedef std::list< std::pair<size_t, void*> >::iterator payout_iterator;
    for (payout_iterator p = payouts.begin(); p != payouts.end(); ++p)
    {
        fprintf(stderr, "%p still in use\n", p->second);
    }
}

// Releases all cached (currently unused) buffers back to the system.
// Buffers that are still handed out are not touched.
void PoolAllocator::clear()
{
    budgets_lock.lock();

    // pop-and-free until the cache is empty
    while (!budgets.empty())
    {
        ncnn::fastFree(budgets.front().second);
        budgets.pop_front();
    }

    budgets_lock.unlock();
}

// Sets the reuse threshold used by fastMalloc.
// scr  ratio in [0, 1]; stored internally as scr * 256 truncated to an integer.
// Values outside the range (including NaN) are rejected with a message on
// stderr and leave the current ratio unchanged.
void PoolAllocator::set_size_compare_ratio(float scr)
{
    // Written as a negated in-range test so that NaN, which compares false
    // against everything, is rejected instead of reaching the cast below
    // (converting NaN to unsigned int is undefined).
    if (!(scr >= 0.f && scr <= 1.f))
    {
        fprintf(stderr, "invalid size compare ratio %f\n", scr);
        return;
    }

    size_compare_ratio = (unsigned int)(scr * 256);
}

// Returns a buffer of at least `size` bytes.
// A cached buffer of capacity bs is reused when
//     bs >= size  and  floor(bs * size_compare_ratio / 256) <= size,
// i.e. it is big enough but not wastefully big. Otherwise a fresh buffer is
// obtained from ncnn::fastMalloc. Returns 0 when that allocation fails.
void* PoolAllocator::fastMalloc(size_t size)
{
    budgets_lock.lock();

    // find free budget
    std::list< std::pair<size_t, void*> >::iterator it = budgets.begin();
    for (; it != budgets.end(); ++it)
    {
        const size_t bs = it->first;

        // floor(bs * size_compare_ratio / 256), evaluated in two parts.
        // The direct product bs * size_compare_ratio wraps around for large
        // buffers when size_t is 32 bits wide; this form never exceeds bs.
        const size_t min_size = (bs >> 8) * size_compare_ratio
                                + (((bs & 255) * size_compare_ratio) >> 8);

        // size_compare_ratio ~ 100%
        if (bs >= size && min_size <= size)
        {
            void* ptr = it->second;

            budgets.erase(it);

            budgets_lock.unlock();

            payouts_lock.lock();

            payouts.push_back(std::make_pair(bs, ptr));

            payouts_lock.unlock();

            return ptr;
        }
    }

    budgets_lock.unlock();

    // new
    void* ptr = ncnn::fastMalloc(size);
    if (!ptr)
    {
        // A failed allocation must not be tracked: a null entry would later
        // migrate into budgets and be handed out again as if it were a buffer.
        return 0;
    }

    payouts_lock.lock();

    payouts.push_back(std::make_pair(size, ptr));

    payouts_lock.unlock();

    return ptr;
}

// Gives a buffer back to the pool. A pointer that was handed out by this pool
// moves from payouts to budgets (keeping its recorded capacity). Any other
// pointer is reported on stderr and released with ncnn::fastFree.
void PoolAllocator::fastFree(void* ptr)
{
    bool known = false;
    size_t capacity = 0;

    payouts_lock.lock();

    std::list< std::pair<size_t, void*> >::iterator p = payouts.begin();
    for (; p != payouts.end(); p++)
    {
        if (p->second != ptr)
            continue;

        capacity = p->first;
        payouts.erase(p);
        known = true;
        break;
    }

    payouts_lock.unlock();

    if (!known)
    {
        fprintf(stderr, "FATAL ERROR! pool allocator get wild %p\n", ptr);
        ncnn::fastFree(ptr);
        return;
    }

    // return to budgets
    budgets_lock.lock();
    budgets.push_back(std::make_pair(capacity, ptr));
    budgets_lock.unlock();
}

// Builds an empty pool. The reuse threshold starts at 192/256, i.e. 0.75.
UnlockedPoolAllocator::UnlockedPoolAllocator()
    : size_compare_ratio(192) // 0.75f * 256
{
}

// Frees every cached buffer, then reports (on stderr) each buffer that was
// handed out and never given back. Outstanding buffers are only listed here,
// they are not freed.
UnlockedPoolAllocator::~UnlockedPoolAllocator()
{
    clear();

    if (payouts.empty())
        return;

    fprintf(stderr, "FATAL ERROR! unlocked pool allocator destroyed too early\n");

    typedef std::list< std::pair<size_t, void*> >::iterator payout_iterator;
    for (payout_iterator p = payouts.begin(); p != payouts.end(); ++p)
    {
        fprintf(stderr, "%p still in use\n", p->second);
    }
}

// Releases all cached (currently unused) buffers back to the system.
// Buffers that are still handed out are not touched.
void UnlockedPoolAllocator::clear()
{
    // pop-and-free until the cache is empty
    while (!budgets.empty())
    {
        ncnn::fastFree(budgets.front().second);
        budgets.pop_front();
    }
}

// Sets the reuse threshold used by fastMalloc.
// scr  ratio in [0, 1]; stored internally as scr * 256 truncated to an integer.
// Values outside the range (including NaN) are rejected with a message on
// stderr and leave the current ratio unchanged.
void UnlockedPoolAllocator::set_size_compare_ratio(float scr)
{
    // Written as a negated in-range test so that NaN, which compares false
    // against everything, is rejected instead of reaching the cast below
    // (converting NaN to unsigned int is undefined).
    if (!(scr >= 0.f && scr <= 1.f))
    {
        fprintf(stderr, "invalid size compare ratio %f\n", scr);
        return;
    }

    size_compare_ratio = (unsigned int)(scr * 256);
}

// Returns a buffer of at least `size` bytes. No locking is performed.
// A cached buffer of capacity bs is reused when
//     bs >= size  and  floor(bs * size_compare_ratio / 256) <= size,
// i.e. it is big enough but not wastefully big. Otherwise a fresh buffer is
// obtained from ncnn::fastMalloc. Returns 0 when that allocation fails.
void* UnlockedPoolAllocator::fastMalloc(size_t size)
{
    // find free budget
    std::list< std::pair<size_t, void*> >::iterator it = budgets.begin();
    for (; it != budgets.end(); ++it)
    {
        const size_t bs = it->first;

        // floor(bs * size_compare_ratio / 256), evaluated in two parts.
        // The direct product bs * size_compare_ratio wraps around for large
        // buffers when size_t is 32 bits wide; this form never exceeds bs.
        const size_t min_size = (bs >> 8) * size_compare_ratio
                                + (((bs & 255) * size_compare_ratio) >> 8);

        // size_compare_ratio ~ 100%
        if (bs >= size && min_size <= size)
        {
            void* ptr = it->second;

            budgets.erase(it);

            payouts.push_back(std::make_pair(bs, ptr));

            return ptr;
        }
    }

    // new
    void* ptr = ncnn::fastMalloc(size);
    if (!ptr)
    {
        // A failed allocation must not be tracked: a null entry would later
        // migrate into budgets and be handed out again as if it were a buffer.
        return 0;
    }

    payouts.push_back(std::make_pair(size, ptr));

    return ptr;
}

// Gives a buffer back to the pool. A pointer that was handed out by this pool
// moves from payouts to budgets (keeping its recorded capacity). Any other
// pointer is reported on stderr and released with ncnn::fastFree.
void UnlockedPoolAllocator::fastFree(void* ptr)
{
    // locate the payout record for this pointer
    std::list< std::pair<size_t, void*> >::iterator p = payouts.begin();
    while (p != payouts.end() && p->second != ptr)
        p++;

    if (p == payouts.end())
    {
        fprintf(stderr, "FATAL ERROR! unlocked pool allocator get wild %p\n", ptr);
        ncnn::fastFree(ptr);
        return;
    }

    // return to budgets
    const size_t capacity = p->first;
    payouts.erase(p);
    budgets.push_back(std::make_pair(capacity, ptr));
}

} // namespace ncnn

+ 175
- 0
src/allocator.h View File

@@ -0,0 +1,175 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef NCNN_ALLOCATOR_H
#define NCNN_ALLOCATOR_H

#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#else
#include <pthread.h>
#endif

#include <stdlib.h>
#include <list>

namespace ncnn {

// the alignment of all the allocated buffers
#define MALLOC_ALIGN 16

// Rounds a pointer up to the next multiple of n bytes.
// ptr  pointer to align
// n    alignment in bytes, must be a power of two (defaults to sizeof(_Tp))
// Returns the smallest address >= ptr that is a multiple of n.
template<typename _Tp> static inline _Tp* alignPtr(_Tp* ptr, int n=(int)sizeof(_Tp))
{
    // for a power-of-two n, -n is a mask with the low log2(n) bits cleared
    const size_t mask = (size_t)(-n);
    const size_t bumped = (size_t)ptr + n - 1;
    return (_Tp*)(bumped & mask);
}

// Rounds a byte count up to the next multiple of n.
// sz  size to round
// n   alignment in bytes, must be a power of two
// Returns the smallest value >= sz that is divisible by n.
static inline size_t alignSize(size_t sz, int n)
{
    // for a power-of-two n, -n is a mask with the low log2(n) bits cleared
    const size_t bumped = sz + n - 1;
    return bumped & (size_t)(-n);
}

// Allocates `size` bytes aligned to MALLOC_ALIGN.
// The block obtained from malloc is over-allocated by sizeof(void*) +
// MALLOC_ALIGN bytes; the raw malloc pointer is stored in the slot just before
// the returned address, where fastFree reads it back.
// Returns 0 when the request cannot be satisfied.
static inline void* fastMalloc(size_t size)
{
    // reject requests whose padded size would wrap around size_t and make
    // malloc hand back a block smaller than the caller asked for
    const size_t overhead = sizeof(void*) + MALLOC_ALIGN;
    if (size > ((size_t)-1) - overhead)
        return 0;

    unsigned char* udata = (unsigned char*)malloc(size + overhead);
    if (!udata)
        return 0;
    // skip one pointer-sized slot, then round up to the alignment boundary
    unsigned char** adata = alignPtr((unsigned char**)udata + 1, MALLOC_ALIGN);
    adata[-1] = udata;
    return adata;
}

// Releases a buffer whose raw malloc pointer is stored in the pointer-sized
// slot immediately before `ptr`. A null `ptr` is ignored.
static inline void fastFree(void* ptr)
{
    if (!ptr)
        return;

    unsigned char** slots = (unsigned char**)ptr;
    free(slots[-1]);
}

// exchange-add operation for atomic operations on reference counters
#if defined __INTEL_COMPILER && !(defined WIN32 || defined _WIN32)
// atomic increment on the linux version of the Intel(tm) compiler
# define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd(const_cast<void*>(reinterpret_cast<volatile void*>(addr)), delta)
#elif defined __GNUC__
# if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && !defined __EMSCRIPTEN__ && !defined(__CUDACC__)
# ifdef __ATOMIC_ACQ_REL
# define NCNN_XADD(addr, delta) __c11_atomic_fetch_add((_Atomic(int)*)(addr), delta, __ATOMIC_ACQ_REL)
# else
# define NCNN_XADD(addr, delta) __atomic_fetch_add((_Atomic(int)*)(addr), delta, 4)
# endif
# else
# if defined __ATOMIC_ACQ_REL && !defined __clang__
// version for gcc >= 4.7
# define NCNN_XADD(addr, delta) (int)__atomic_fetch_add((unsigned*)(addr), (unsigned)(delta), __ATOMIC_ACQ_REL)
# else
# define NCNN_XADD(addr, delta) (int)__sync_fetch_and_add((unsigned*)(addr), (unsigned)(delta))
# endif
# endif
#elif defined _MSC_VER && !defined RC_INVOKED
# include <intrin.h>
# define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)addr, delta)
#else
// Plain (non-atomic) fallback for compilers matched by none of the branches above.
// Adds delta to *addr and returns the value *addr held before the addition,
// matching the fetch-and-add result of the macro variants.
static inline int NCNN_XADD(int* addr, int delta) { int tmp = *addr; *addr += delta; return tmp; }
#endif

#ifdef _WIN32
// Exclusive lock built on a Windows slim reader/writer (SRW) lock.
// lock() blocks until the lock is acquired exclusively; unlock() releases it.
class Mutex
{
public:
    Mutex() { InitializeSRWLock(&srwlock); }
    ~Mutex() { }
    void lock() { AcquireSRWLockExclusive(&srwlock); }
    void unlock() { ReleaseSRWLockExclusive(&srwlock); }
private:
    // NOTE SRWLock is available from windows vista
    // Named srwlock rather than lock: a data member cannot share its name
    // with the lock() member function of the same class.
    SRWLOCK srwlock;
};
#else // _WIN32
// Thin wrapper around a pthread mutex.
// The mutex is initialised with default attributes (null attribute pointer)
// in the constructor and destroyed in the destructor. The return codes of the
// pthread calls are not checked.
class Mutex
{
public:
Mutex() { pthread_mutex_init(&mutex, 0); }
~Mutex() { pthread_mutex_destroy(&mutex); }
// acquire the mutex
void lock() { pthread_mutex_lock(&mutex); }
// release the mutex
void unlock() { pthread_mutex_unlock(&mutex); }
private:
pthread_mutex_t mutex;
};
#endif // _WIN32

// Abstract memory allocator interface.
// Implementations hand out buffers through fastMalloc and take them back
// through fastFree.
class Allocator
{
public:
    // Virtual so that destroying a concrete allocator through an Allocator*
    // runs the derived destructor.
    virtual ~Allocator() {}

    // Returns a buffer of at least `size` bytes.
    virtual void* fastMalloc(size_t size) = 0;
    // Takes back a buffer previously returned by fastMalloc.
    virtual void fastFree(void* ptr) = 0;
};

// Allocator that recycles buffers instead of releasing them immediately.
// Buffers returned through fastFree are cached and may be handed out again by
// a later fastMalloc. Each of the two bookkeeping lists is guarded by its own
// Mutex.
// NOTE(review): the class holds raw buffer pointers and Mutex members and
// declares no copy operations; copying an instance looks unsafe - confirm no
// caller does so.
class PoolAllocator : public Allocator
{
public:
PoolAllocator();
// frees the cached buffers and reports buffers that are still handed out
~PoolAllocator();

// scr must be within 0 ~ 1; other values are rejected
// a cached buffer of size bs serves a request of `size` bytes only when
// bs >= size and bs * ratio <= size (ratio kept internally as scr * 256)
// default ratio = 0.75
void set_size_compare_ratio(float scr);

// release all budgets immediately
void clear();

virtual void* fastMalloc(size_t size);
virtual void fastFree(void* ptr);

private:
// guards budgets
Mutex budgets_lock;
// guards payouts
Mutex payouts_lock;
unsigned int size_compare_ratio;// 0~256
// (capacity, pointer) of cached buffers available for reuse
std::list< std::pair<size_t, void*> > budgets;
// (capacity, pointer) of buffers currently handed out
std::list< std::pair<size_t, void*> > payouts;
};

// Same recycling scheme as PoolAllocator but without any Mutex: none of the
// member functions perform locking.
// NOTE(review): presumably intended for use from a single thread at a time -
// confirm against callers.
class UnlockedPoolAllocator : public Allocator
{
public:
UnlockedPoolAllocator();
// frees the cached buffers and reports buffers that are still handed out
~UnlockedPoolAllocator();

// scr must be within 0 ~ 1; other values are rejected
// a cached buffer of size bs serves a request of `size` bytes only when
// bs >= size and bs * ratio <= size (ratio kept internally as scr * 256)
// default ratio = 0.75
void set_size_compare_ratio(float scr);

// release all budgets immediately
void clear();

virtual void* fastMalloc(size_t size);
virtual void fastFree(void* ptr);

private:
unsigned int size_compare_ratio;// 0~256
// (capacity, pointer) of cached buffers available for reuse
std::list< std::pair<size_t, void*> > budgets;
// (capacity, pointer) of buffers currently handed out
std::list< std::pair<size_t, void*> > payouts;
};

} // namespace ncnn

#endif // NCNN_ALLOCATOR_H

+ 38
- 8
src/layer.cpp View File

@@ -14,10 +14,40 @@

#include "layer.h"

#include <stdio.h>
#include <string.h>
#include "cpu.h"

namespace ncnn {

// Default options: light mode on, one thread per CPU reported by
// get_cpu_count(), and no custom allocators (both pointers null).
Option::Option()
    : lightmode(true),
      num_threads(get_cpu_count()),
      blob_allocator(0),
      workspace_allocator(0)
{
}

static Option g_default_option;

// Returns a read-only reference to the file-level default Option
// (g_default_option): the value stored by the last successful
// set_default_option() call, or a default-constructed Option if none was made.
const Option& get_default_option()
{
return g_default_option;
}

// Replaces the file-level default Option with a copy of `opt`.
// Returns 0 on success. Returns -1 (after a message on stderr, leaving the
// current default untouched) when opt.num_threads is not positive.
int set_default_option(const Option& opt)
{
    const bool threads_ok = opt.num_threads > 0;
    if (threads_ok)
    {
        g_default_option = opt;
        return 0;
    }

    fprintf(stderr, "invalid option num_threads %d\n", opt.num_threads);
    return -1;
}

Layer::Layer()
{
one_blob_only = false;
@@ -38,7 +68,7 @@ int Layer::load_model(const ModelBin& /*mb*/)
return 0;
}

int Layer::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
int Layer::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
if (!support_inplace)
return -1;
@@ -46,32 +76,32 @@ int Layer::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_b
top_blobs = bottom_blobs;
for (int i = 0; i < (int)top_blobs.size(); i++)
{
top_blobs[i] = bottom_blobs[i].clone();
top_blobs[i] = bottom_blobs[i].clone(opt.blob_allocator);
if (top_blobs[i].empty())
return -100;
}

return forward_inplace(top_blobs);
return forward_inplace(top_blobs, opt);
}

int Layer::forward(const Mat& bottom_blob, Mat& top_blob) const
int Layer::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
if (!support_inplace)
return -1;

top_blob = bottom_blob.clone();
top_blob = bottom_blob.clone(opt.blob_allocator);
if (top_blob.empty())
return -100;

return forward_inplace(top_blob);
return forward_inplace(top_blob, opt);
}

int Layer::forward_inplace(std::vector<Mat>& /*bottom_top_blobs*/) const
int Layer::forward_inplace(std::vector<Mat>& /*bottom_top_blobs*/, const Option& /*opt*/) const
{
return -1;
}

int Layer::forward_inplace(Mat& /*bottom_top_blob*/) const
int Layer::forward_inplace(Mat& /*bottom_top_blob*/, const Option& /*opt*/) const
{
return -1;
}


+ 20
- 4
src/layer.h View File

@@ -25,6 +25,22 @@

namespace ncnn {

class Allocator;
// Per-call inference options passed to Layer::forward / forward_inplace.
class Option
{
public:
// initialises lightmode = true, num_threads = get_cpu_count(),
// and both allocator pointers to null
Option();

public:
// NOTE(review): meaning not visible in this file - confirm against the code that consumes it
bool lightmode;
// thread count given to the layers' "omp parallel for num_threads(...)" loops;
// set_default_option() rejects values <= 0
int num_threads;
// allocator passed when layers create or clone output (top) blobs; may be null
Allocator* blob_allocator;
// allocator passed when layers create temporary buffers; may be null
Allocator* workspace_allocator;
};

const Option& get_default_option();
int set_default_option(const Option& opt);

class Layer
{
public:
@@ -51,13 +67,13 @@ public:
public:
// implement inference
// return 0 if success
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt = get_default_option()) const;
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt = get_default_option()) const;

// implement inplace inference
// return 0 if success
virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs) const;
virtual int forward_inplace(Mat& bottom_top_blob) const;
virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt = get_default_option()) const;
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt = get_default_option()) const;

public:
#if NCNN_STRING


+ 2
- 2
src/layer/absval.cpp View File

@@ -24,14 +24,14 @@ AbsVal::AbsVal()
support_inplace = true;
}

int AbsVal::forward_inplace(Mat& bottom_top_blob) const
int AbsVal::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);


+ 1
- 1
src/layer/absval.h View File

@@ -24,7 +24,7 @@ class AbsVal : public Layer
public:
AbsVal();

virtual int forward_inplace(Mat& bottom_top_blob) const;
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

public:
};


+ 3
- 3
src/layer/argmax.cpp View File

@@ -33,14 +33,14 @@ int ArgMax::load_param(const ParamDict& pd)
return 0;
}

int ArgMax::forward(const Mat& bottom_blob, Mat& top_blob) const
int ArgMax::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
int size = bottom_blob.total();

if (out_max_val)
top_blob.create(topk, 2);
top_blob.create(topk, 2, 4u, opt.blob_allocator);
else
top_blob.create(topk, 1);
top_blob.create(topk, 1, 4u, opt.blob_allocator);
if (top_blob.empty())
return -100;



+ 1
- 1
src/layer/argmax.h View File

@@ -26,7 +26,7 @@ public:

virtual int load_param(const ParamDict& pd);

virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

public:
int out_max_val;


+ 2
- 2
src/layer/arm/absval_arm.cpp View File

@@ -22,14 +22,14 @@ namespace ncnn {

DEFINE_LAYER_CREATOR(AbsVal_arm)

int AbsVal_arm::forward_inplace(Mat& bottom_top_blob) const
int AbsVal_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);


+ 1
- 1
src/layer/arm/absval_arm.h View File

@@ -22,7 +22,7 @@ namespace ncnn {
class AbsVal_arm : public AbsVal
{
public:
virtual int forward_inplace(Mat& bottom_top_blob) const;
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn


+ 3
- 3
src/layer/arm/batchnorm_arm.cpp View File

@@ -22,11 +22,11 @@ namespace ncnn {

DEFINE_LAYER_CREATOR(BatchNorm_arm)

int BatchNorm_arm::forward_inplace(Mat& bottom_top_blob) const
int BatchNorm_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int dims = bottom_top_blob.dims;
if (dims != 3)
return BatchNorm::forward_inplace(bottom_top_blob);
return BatchNorm::forward_inplace(bottom_top_blob, opt);

// a = bias - slope * mean / sqrt(var)
// b = slope / sqrt(var)
@@ -38,7 +38,7 @@ int BatchNorm_arm::forward_inplace(Mat& bottom_top_blob) const

const float* a_data_ptr = a_data;
const float* b_data_ptr = b_data;
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);


+ 1
- 1
src/layer/arm/batchnorm_arm.h View File

@@ -22,7 +22,7 @@ namespace ncnn {
class BatchNorm_arm : public BatchNorm
{
public:
virtual int forward_inplace(Mat& bottom_top_blob) const;
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn


+ 2
- 2
src/layer/arm/bias_arm.cpp View File

@@ -22,7 +22,7 @@ namespace ncnn {

DEFINE_LAYER_CREATOR(Bias_arm)

int Bias_arm::forward_inplace(Mat& bottom_top_blob) const
int Bias_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
@@ -30,7 +30,7 @@ int Bias_arm::forward_inplace(Mat& bottom_top_blob) const
int size = w * h;

const float* bias_ptr = bias_data;
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);


+ 1
- 1
src/layer/arm/bias_arm.h View File

@@ -22,7 +22,7 @@ namespace ncnn {
class Bias_arm : public Bias
{
public:
virtual int forward_inplace(Mat& bottom_top_blob) const;
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn


+ 16
- 16
src/layer/arm/convolution_1x1.h View File

@@ -113,7 +113,7 @@ static void conv1x1s1_sgemm_transform_kernel_neon(const Mat& _kernel, Mat& kerne
}
}

static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias)
static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
@@ -128,12 +128,12 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma
const float* bias = _bias;

// interleave
Mat tmp(8*4, inch/4+inch%4, size/8 + (size%8)/4 + size%4);
Mat tmp(8*4, inch/4+inch%4, size/8 + (size%8)/4 + size%4, 4u, opt.workspace_allocator);
{
int nn_size = size >> 3;
int remain_size_start = nn_size << 3;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int ii=0; ii<nn_size; ii++)
{
int i = ii * 8;
@@ -184,7 +184,7 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma

nn_size = (size - remain_size_start) >> 2;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int ii=0; ii<nn_size; ii++)
{
int i = remain_size_start + ii * 4;
@@ -230,7 +230,7 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma

remain_size_start += nn_size << 2;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int i=remain_size_start; i<size; i++)
{
const float* img0 = bottom_blob.channel(0);
@@ -254,7 +254,7 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma
nn_outch = outch >> 3;
remain_outch_start = nn_outch << 3;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int pp=0; pp<nn_outch; pp++)
{
int p = pp * 8;
@@ -733,7 +733,7 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma

nn_outch = (outch - remain_outch_start) >> 2;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int pp=0; pp<nn_outch; pp++)
{
int p = remain_outch_start + pp * 4;
@@ -1613,7 +1613,7 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma

remain_outch_start += nn_outch << 2;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=remain_outch_start; p<outch; p++)
{
Mat out0 = top_blob.channel(p);
@@ -2064,7 +2064,7 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma
// }
}

static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
int inch = bottom_blob.c;

@@ -2083,7 +2083,7 @@ static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
nn_outch = outch >> 3;
remain_outch_start = nn_outch << 3;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int pp=0; pp<nn_outch; pp++)
{
int p = pp * 8;
@@ -2710,7 +2710,7 @@ static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
nn_outch = outch / 6;
remain_outch_start = nn_outch * 6;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int pp=0; pp<nn_outch; pp++)
{
int p = pp * 6;
@@ -3101,7 +3101,7 @@ static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke

nn_outch = (outch - remain_outch_start) >> 2;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int pp=0; pp<nn_outch; pp++)
{
int p = remain_outch_start + pp * 4;
@@ -3605,7 +3605,7 @@ static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke

remain_outch_start += nn_outch << 2;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=remain_outch_start; p<outch; p++)
{
Mat out = top_blob.channel(p);
@@ -3863,7 +3863,7 @@ static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke

}

static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
int w = bottom_blob.w;
int inch = bottom_blob.c;
@@ -3880,7 +3880,7 @@ static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
int nn_outch = outch >> 2;
int remain_outch_start = nn_outch << 2;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int pp=0; pp<nn_outch; pp++)
{
int p = pp * 4;
@@ -4409,7 +4409,7 @@ static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
}
}

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=remain_outch_start; p<outch; p++)
{
Mat out = top_blob.channel(p);


+ 2
- 2
src/layer/arm/convolution_2x2.h View File

@@ -16,7 +16,7 @@
#include <arm_neon.h>
#endif // __ARM_NEON

static void conv2x2s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
static void conv2x2s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
int w = bottom_blob.w;
int inch = bottom_blob.c;
@@ -28,7 +28,7 @@ static void conv2x2s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
const float* kernel = _kernel;
const float* bias = _bias;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<outch; p++)
{
Mat out = top_blob.channel(p);


+ 28
- 27
src/layer/arm/convolution_3x3.h View File

@@ -16,7 +16,7 @@
#include <arm_neon.h>
#endif // __ARM_NEON

static void conv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
static void conv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
int w = bottom_blob.w;
int inch = bottom_blob.c;
@@ -31,7 +31,7 @@ static void conv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
int nn_outch = outch >> 1;
int remain_outch_start = nn_outch << 1;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int pp=0; pp<nn_outch; pp++)
{
int p = pp * 2;
@@ -654,7 +654,7 @@ static void conv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
}
}

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=remain_outch_start; p<outch; p++)
{
Mat out = top_blob.channel(p);
@@ -5427,7 +5427,7 @@ static void conv3x3s1_winograd64_neon3(const Mat& bottom_blob, Mat& top_blob, co
}
#endif

static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias)
static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
@@ -5445,7 +5445,7 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co

w = outw + 2;
h = outh + 2;
copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f);
copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt.workspace_allocator, opt.num_threads);

const float* bias = _bias;

@@ -5454,7 +5454,7 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co
{
int w_tm = outw / 6 * 8;
int h_tm = outh / 6 * 8;
bottom_blob_tm.create(4, 16 * w_tm/8 * h_tm/8, inch);
bottom_blob_tm.create(4, 16 * w_tm/8 * h_tm/8, inch, 4u, opt.workspace_allocator);
const int tiles = w_tm/8 * h_tm/8;

// const float itm[8][8] = {
@@ -5495,7 +5495,7 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co
float32x4_t _coeff1 = vld1q_f32(coeff+4);
#endif // __ARM_NEON

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q<inch; q++)
{
const Mat img0 = bottom_blob_bordered.channel(q);
@@ -6263,14 +6263,14 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co
{
int w_tm = outw / 6 * 8;
int h_tm = outh / 6 * 8;
top_blob_tm.create(4, 16 * w_tm/8 * h_tm/8, outch);
top_blob_tm.create(4, 16 * w_tm/8 * h_tm/8, outch, 4u, opt.workspace_allocator);

const int tiles = h_tm/8 * w_tm/8;

int nn_outch = outch >> 2;
int remain_outch_start = nn_outch << 2;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int pp=0; pp<nn_outch; pp++)
{
int p = pp * 4;
@@ -7439,7 +7439,7 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co
}
}

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int p = remain_outch_start; p<outch; p++)
{
Mat out0_tm = top_blob_tm.channel(p);
@@ -7526,7 +7526,7 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co

// BEGIN transform output
Mat top_blob_bordered;
top_blob_bordered.create(outw, outh, outch);
top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator);
{
// const float otm[6][8] = {
// {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 32.0f, 32.0f, 0.0f},
@@ -7553,7 +7553,7 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co
int h_tm = outh / 6 * 8;
const int tiles = w_tm/8 * h_tm/8;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p<outch; p++)
{
const Mat out0_tm = top_blob_tm.channel(p);
@@ -8157,10 +8157,10 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co
// END transform output

// cut result pad
copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w);
copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt.blob_allocator, opt.num_threads);
}

static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias)
static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
@@ -8178,7 +8178,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co

w = outw + 2;
h = outh + 2;
copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f);
copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt.workspace_allocator, opt.num_threads);

const float* bias = _bias;

@@ -8188,7 +8188,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co
int w_tm = outw / 6 * 8;
int h_tm = outh / 6 * 8;
const int tiles = w_tm/8 * h_tm/8;
bottom_blob_tm.create(1, 64 * tiles, inch);
bottom_blob_tm.create(1, 64 * tiles, inch, 4u, opt.workspace_allocator);
// bottom_blob_tm.create(inch, tiles, 64);

// const float itm[8][8] = {
@@ -8229,7 +8229,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co
float32x4_t _coeff1 = vld1q_f32(coeff+4);
#endif // __ARM_NEON

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q<inch; q++)
{
const Mat img0 = bottom_blob_bordered.channel(q);
@@ -9054,9 +9054,9 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co
// permute
// bottom_blob_tm.create(1, 64 * tiles, inch);
// Mat bottom_blob_tm2(inch, tiles, 64);
Mat bottom_blob_tm2(8*inch, tiles/8 + (tiles%8)/4 + tiles%4, 64);
Mat bottom_blob_tm2(8*inch, tiles/8 + (tiles%8)/4 + tiles%4, 64, 4u, opt.workspace_allocator);

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int r=0; r<64; r++)
{
Mat tm2 = bottom_blob_tm2.channel(r);
@@ -9147,7 +9147,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co
nn_outch = outch >> 3;
remain_outch_start = nn_outch << 3;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int pp=0; pp<nn_outch; pp++)
{
int p = pp * 8;
@@ -9592,7 +9592,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co

nn_outch = (outch - remain_outch_start) >> 2;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int pp=0; pp<nn_outch; pp++)
{
int p = remain_outch_start + pp * 4;
@@ -10332,6 +10332,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co

remain_outch_start += nn_outch << 2;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p=remain_outch_start; p<outch; p++)
{
#if __ARM_NEON && __aarch64__
@@ -10738,7 +10739,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co

// BEGIN transform output
Mat top_blob_bordered;
top_blob_bordered.create(outw, outh, outch);
top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator);
{
// const float otm[6][8] = {
// {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 32.0f, 32.0f, 0.0f},
@@ -10765,7 +10766,7 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co
int h_tm = outh / 6 * 8;
const int tiles = w_tm/8 * h_tm/8;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p<outch; p++)
{
const Mat out0_tm = top_blob_tm.channel(p);
@@ -11514,10 +11515,10 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co
// END transform output

// cut result pad
copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w);
copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt.blob_allocator, opt.num_threads);
}

static void conv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
static void conv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
int w = bottom_blob.w;
int inch = bottom_blob.c;
@@ -11534,7 +11535,7 @@ static void conv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
int nn_outch = outch >> 1;
int remain_outch_start = nn_outch << 1;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int pp=0; pp<nn_outch; pp++)
{
int p = pp * 2;
@@ -11858,7 +11859,7 @@ static void conv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
}
}

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=remain_outch_start; p<outch; p++)
{
Mat out = top_blob.channel(p);


+ 2
- 2
src/layer/arm/convolution_4x4.h View File

@@ -16,7 +16,7 @@
#include <arm_neon.h>
#endif // __ARM_NEON

static void conv4x4s4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
static void conv4x4s4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
int w = bottom_blob.w;
int inch = bottom_blob.c;
@@ -30,7 +30,7 @@ static void conv4x4s4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
const float* kernel = _kernel;
const float* bias = _bias;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<outch; p++)
{
Mat out = top_blob.channel(p);


+ 4
- 4
src/layer/arm/convolution_5x5.h View File

@@ -16,7 +16,7 @@
#include <arm_neon.h>
#endif // __ARM_NEON

static void conv5x5s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
static void conv5x5s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
int w = bottom_blob.w;
int inch = bottom_blob.c;
@@ -28,7 +28,7 @@ static void conv5x5s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
const float* kernel = _kernel;
const float* bias = _bias;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<outch; p++)
{
Mat out = top_blob.channel(p);
@@ -982,7 +982,7 @@ static void conv5x5s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke

}

static void conv5x5s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
static void conv5x5s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
int w = bottom_blob.w;
int inch = bottom_blob.c;
@@ -996,7 +996,7 @@ static void conv5x5s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
const float* kernel = _kernel;
const float* bias = _bias;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<outch; p++)
{
Mat out = top_blob.channel(p);


+ 4
- 4
src/layer/arm/convolution_7x7.h View File

@@ -16,7 +16,7 @@
#include <arm_neon.h>
#endif // __ARM_NEON

static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
int w = bottom_blob.w;
int inch = bottom_blob.c;
@@ -28,7 +28,7 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
const float* kernel = _kernel;
const float* bias = _bias;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<outch; p++)
{
Mat out = top_blob.channel(p);
@@ -706,7 +706,7 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke

}

static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
int w = bottom_blob.w;
int inch = bottom_blob.c;
@@ -720,7 +720,7 @@ static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
const float* kernel = _kernel;
const float* bias = _bias;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<outch; p++)
{
Mat out = top_blob.channel(p);


+ 25
- 23
src/layer/arm/convolution_arm.cpp View File

@@ -75,10 +75,11 @@ int Convolution_arm::load_model(const ModelBin& mb)
return 0;
}

int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv_func conv) const
int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv_func conv, const Option& opt) const
{
int w = bottom_blob.w;
int h = bottom_blob.h;
size_t elemsize = bottom_blob.elemsize;

const int kernel_size = kernel_w;
const int stride = stride_w;
@@ -88,7 +89,7 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv
Mat bottom_blob_bordered = bottom_blob;
if (pad_w > 0 || pad_h > 0)
{
copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f);
copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
if (bottom_blob_bordered.empty())
return -100;

@@ -101,7 +102,7 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv
int hpad = kernel_extent + (h - 1) / stride * stride - h;
if (wpad > 0 || hpad > 0)
{
copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f);
copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
if (bottom_blob_bordered.empty())
return -100;
}
@@ -113,7 +114,7 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv
int outw = (w - kernel_extent) / stride + 1;
int outh = (h - kernel_extent) / stride + 1;

top_blob.create(outw, outh, num_output);
top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;

@@ -132,7 +133,7 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv

if (inner_bottom_blob.w != inner_w || inner_bottom_blob.h != inner_h)
{
inner_bottom_blob.create(inner_w, inner_h, bottom_blob.c);
inner_bottom_blob.create(inner_w, inner_h, bottom_blob.c, elemsize, opt.workspace_allocator);

if (inner_bottom_blob.empty())
{
@@ -142,7 +143,7 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv

if (inner_top_blob.w != inner_outw || inner_top_blob.h != inner_outh)
{
inner_top_blob.create(inner_outw, inner_outh, num_output);
inner_top_blob.create(inner_outw, inner_outh, num_output, elemsize, opt.workspace_allocator);

if (inner_top_blob.empty())
{
@@ -150,7 +151,7 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv
}
}

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int c = 0; c < bottom_blob.c; c ++)
{
float *outptr = (float *) inner_bottom_blob.channel(c);
@@ -166,9 +167,9 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv
}
}

conv(inner_bottom_blob, inner_top_blob, weight_data, bias_data);
conv(inner_bottom_blob, inner_top_blob, weight_data, bias_data, opt);

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int c = 0; c < num_output; c ++)
{
float *outptr = (float *) top_blob.channel(c) + x * outw + y;
@@ -188,19 +189,19 @@ int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv
return 0;
}

int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
// convolv with NxN kernel
// value = value + bias

if (bottom_blob.dims != 3)
{
return Convolution::forward(bottom_blob, top_blob);
return Convolution::forward(bottom_blob, top_blob, opt);
}

if (kernel_w != kernel_h || stride_w != stride_h)
{
return Convolution::forward(bottom_blob, top_blob);
return Convolution::forward(bottom_blob, top_blob, opt);
}

const int kernel_size = kernel_w;
@@ -208,10 +209,10 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const

if (kernel_size > 7 || stride > 4 || dilation_w != dilation_h)
{
return Convolution::forward(bottom_blob, top_blob);
return Convolution::forward(bottom_blob, top_blob, opt);
}

typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&);
typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&);

// kernel_size x stride
conv_func conv_func_table[7][4] =
@@ -263,22 +264,23 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
conv_func conv = conv_func_table[kernel_size-1][stride-1];
if (!conv)
{
return Convolution::forward(bottom_blob, top_blob);
return Convolution::forward(bottom_blob, top_blob, opt);
}

if (dilation_w != 1)
{
return forwardDilation(bottom_blob, top_blob, conv);
return forwardDilation(bottom_blob, top_blob, conv, opt);
}

int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;

Mat bottom_blob_bordered = bottom_blob;
if (pad_w > 0 || pad_h > 0)
{
copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f);
copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
if (bottom_blob_bordered.empty())
return -100;

@@ -291,7 +293,7 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
int hpad = kernel_size + (h - 1) / stride * stride - h;
if (wpad > 0 || hpad > 0)
{
copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f);
copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
if (bottom_blob_bordered.empty())
return -100;
}
@@ -303,21 +305,21 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
int outw = (w - kernel_size) / stride + 1;
int outh = (h - kernel_size) / stride + 1;

top_blob.create(outw, outh, num_output);
top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;

if (use_winograd3x3 && w <= 120 && h <= 120)
{
// conv3x3s1_winograd64_neon4(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data);
conv3x3s1_winograd64_neon5(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data);
// conv3x3s1_winograd64_neon4(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt);
conv3x3s1_winograd64_neon5(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt);
}
else if (use_sgemm1x1)
{
conv1x1s1_sgemm_neon(bottom_blob_bordered, top_blob, weight_1x1_sgemm_data, bias_data);
conv1x1s1_sgemm_neon(bottom_blob_bordered, top_blob, weight_1x1_sgemm_data, bias_data, opt);
}
else
conv(bottom_blob_bordered, top_blob, weight_data, bias_data);
conv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);

return 0;
}


+ 3
- 3
src/layer/arm/convolution_arm.h View File

@@ -19,7 +19,7 @@

namespace ncnn {

typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&);
typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&);

class Convolution_arm : public Convolution
{
@@ -28,8 +28,8 @@ public:

virtual int load_model(const ModelBin& mb);

virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
virtual int forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv_func conv) const;
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
virtual int forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv_func conv, const Option& opt) const;

public:
bool use_winograd3x3;


+ 4
- 4
src/layer/arm/convolutiondepthwise_3x3.h View File

@@ -16,7 +16,7 @@
#include <arm_neon.h>
#endif // __ARM_NEON

static void convdw3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
static void convdw3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
int w = bottom_blob.w;

@@ -28,7 +28,7 @@ static void convdw3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _
const float* kernel = _kernel;
const float* bias = _bias;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int g=0; g<group; g++)
{
Mat out = top_blob.channel(g);
@@ -577,7 +577,7 @@ static void convdw3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _
}
}

static void convdw3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
static void convdw3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
int w = bottom_blob.w;

@@ -591,7 +591,7 @@ static void convdw3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _
const float* kernel = _kernel;
const float* bias = _bias;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int g=0; g<group; g++)
{
Mat out = top_blob.channel(g);


+ 10
- 9
src/layer/arm/convolutiondepthwise_arm.cpp View File

@@ -102,7 +102,7 @@ int ConvolutionDepthWise_arm::load_model(const ModelBin& mb)
return 0;
}

int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
// convolv with NxN kernel
// value = value + bias
@@ -110,6 +110,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;

if (channels % group != 0 || num_output % group != 0)
{
@@ -123,7 +124,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
Mat bottom_blob_bordered = bottom_blob;
if (pad_w > 0 || pad_h > 0)
{
copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f);
copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
if (bottom_blob_bordered.empty())
return -100;

@@ -136,7 +137,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
if (wpad > 0 || hpad > 0)
{
copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f);
copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
if (bottom_blob_bordered.empty())
return -100;
}
@@ -148,7 +149,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
int outw = (w - kernel_extent_w) / stride_w + 1;
int outh = (h - kernel_extent_h) / stride_h + 1;

top_blob.create(outw, outh, num_output);
top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;

@@ -161,12 +162,12 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
{
if (stride_w == 1 && stride_h == 1)
{
convdw3x3s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data);
convdw3x3s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
return 0;
}
else if (stride_w == 2 && stride_h == 2)
{
convdw3x3s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data);
convdw3x3s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
return 0;
}
}
@@ -176,7 +177,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
omp_set_nested(0);
#endif

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int g=0; g<group; g++)
{
Mat bottom_blob_bordered_g(w, h, 1, bottom_blob_bordered.channel(g));
@@ -213,7 +214,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
op->load_model(ModelBinFromMatArray(weights));

// forward
op->forward(bottom_blob_bordered_g, top_blob_g);
op->forward(bottom_blob_bordered_g, top_blob_g, opt);

delete op;
}
@@ -235,7 +236,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
const ncnn::Layer* op = group_ops[g];

// forward
op->forward(bottom_blob_bordered_g, top_blob_g);
op->forward(bottom_blob_bordered_g, top_blob_g, opt);
}

return 0;


+ 1
- 1
src/layer/arm/convolutiondepthwise_arm.h View File

@@ -27,7 +27,7 @@ public:

virtual int load_model(const ModelBin& mb);

virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

public:
std::vector<ncnn::Layer*> group_ops;


+ 4
- 4
src/layer/arm/deconvolution_3x3.h View File

@@ -16,7 +16,7 @@
#include <arm_neon.h>
#endif // __ARM_NEON

static void deconv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
static void deconv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
@@ -28,7 +28,7 @@ static void deconv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _
const float* kernel = _kernel;
const float* bias = _bias;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<outch; p++)
{
Mat out = top_blob.channel(p);
@@ -237,7 +237,7 @@ static void deconv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _
}
}

static void deconv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
static void deconv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
@@ -249,7 +249,7 @@ static void deconv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _
const float* kernel = _kernel;
const float* bias = _bias;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<outch; p++)
{
Mat out = top_blob.channel(p);


+ 4
- 4
src/layer/arm/deconvolution_4x4.h View File

@@ -16,7 +16,7 @@
#include <arm_neon.h>
#endif // __ARM_NEON

static void deconv4x4s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
static void deconv4x4s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
@@ -28,7 +28,7 @@ static void deconv4x4s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _
const float* kernel = _kernel;
const float* bias = _bias;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<outch; p++)
{
Mat out = top_blob.channel(p);
@@ -185,7 +185,7 @@ static void deconv4x4s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _
}
}

static void deconv4x4s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
static void deconv4x4s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
@@ -197,7 +197,7 @@ static void deconv4x4s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _
const float* kernel = _kernel;
const float* bias = _bias;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<outch; p++)
{
Mat out = top_blob.channel(p);


+ 26
- 13
src/layer/arm/deconvolution_arm.cpp View File

@@ -21,14 +21,14 @@ namespace ncnn {

DEFINE_LAYER_CREATOR(Deconvolution_arm)

int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
// deconvolv with NxN kernel
// value = value + bias

if (kernel_w != kernel_h || stride_w != stride_h)
{
return Deconvolution::forward(bottom_blob, top_blob);
return Deconvolution::forward(bottom_blob, top_blob, opt);
}

const int kernel_size = kernel_w;
@@ -36,10 +36,10 @@ int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const

if ((kernel_size != 3 && kernel_size != 4) || stride > 2 || dilation_w != 1 || dilation_h != 1)
{
return Deconvolution::forward(bottom_blob, top_blob);
return Deconvolution::forward(bottom_blob, top_blob, opt);
}

typedef void (*deconv_func)(const Mat&, Mat&, const Mat&, const Mat&);
typedef void (*deconv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&);

// kernel_size x stride
deconv_func deconv_func_table[2][2] =
@@ -57,33 +57,46 @@ int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
deconv_func deconv = deconv_func_table[kernel_size-3][stride-1];
if (!deconv)
{
return Deconvolution::forward(bottom_blob, top_blob);
return Deconvolution::forward(bottom_blob, top_blob, opt);
}

int w = bottom_blob.w;
int h = bottom_blob.h;
size_t elemsize = bottom_blob.elemsize;

int outw = (w - 1) * stride + kernel_size;
int outh = (h - 1) * stride + kernel_size;

Mat top_blob_bordered = top_blob;
top_blob_bordered.create(outw, outh, num_output);
if (top_blob_bordered.empty())
return -100;

deconv(bottom_blob, top_blob_bordered, weight_data, bias_data);
Mat top_blob_bordered;
if (pad_w > 0 || pad_h > 0)
{
top_blob_bordered.create(outw, outh, num_output, elemsize, opt.workspace_allocator);
if (top_blob_bordered.empty())
return -100;
}
else
{
top_blob_bordered = top_blob;
top_blob_bordered.create(outw, outh, num_output, elemsize, opt.blob_allocator);
if (top_blob_bordered.empty())
return -100;
}

top_blob = top_blob_bordered;
deconv(bottom_blob, top_blob_bordered, weight_data, bias_data, opt);

if (pad_w > 0 || pad_h > 0)
{
copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w);
copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w, opt.blob_allocator, opt.num_threads);
if (top_blob.empty())
return -100;

outw = top_blob.w;
outh = top_blob.h;
}
else
{
top_blob = top_blob_bordered;
}

return 0;
}


+ 1
- 1
src/layer/arm/deconvolution_arm.h View File

@@ -22,7 +22,7 @@ namespace ncnn {
class Deconvolution_arm : public Deconvolution
{
public:
virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
};

} // namespace ncnn


+ 24
- 11
src/layer/arm/deconvolutiondepthwise_arm.cpp View File

@@ -24,7 +24,7 @@ namespace ncnn {

DEFINE_LAYER_CREATOR(DeconvolutionDepthWise_arm)

int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
// convolv with NxN kernel
// value = value + bias
@@ -32,6 +32,7 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) c
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;

if (channels % group != 0 || num_output % group != 0)
{
@@ -45,10 +46,20 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) c
int outw = (w - 1) * stride_w + kernel_extent_w;
int outh = (h - 1) * stride_h + kernel_extent_h;

Mat top_blob_bordered = top_blob;
top_blob_bordered.create(outw, outh, num_output);
if (top_blob_bordered.empty())
return -100;
Mat top_blob_bordered;
if (pad_w > 0 || pad_h > 0)
{
top_blob_bordered.create(outw, outh, num_output, elemsize, opt.workspace_allocator);
if (top_blob_bordered.empty())
return -100;
}
else
{
top_blob_bordered = top_blob;
top_blob_bordered.create(outw, outh, num_output, elemsize, opt.blob_allocator);
if (top_blob_bordered.empty())
return -100;
}

const int maxk = kernel_w * kernel_h;

@@ -60,7 +71,7 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) c
omp_set_nested(0);
#endif

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int g=0; g<group; g++)
{
Mat bottom_blob_g(w, h, 1, bottom_blob.channel(g).data);
@@ -98,7 +109,7 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) c
op->load_model(ModelBinFromMatArray(weights));

// forward
op->forward(bottom_blob_g, top_blob_bordered_g);
op->forward(bottom_blob_g, top_blob_bordered_g, opt);

delete op;
}
@@ -148,23 +159,25 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) c
op->load_model(ModelBinFromMatArray(weights));

// forward
op->forward(bottom_blob_g, top_blob_bordered_g);
op->forward(bottom_blob_g, top_blob_bordered_g, opt);

delete op;
}
}

top_blob = top_blob_bordered;

if (pad_w > 0 || pad_h > 0)
{
copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w);
copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w, opt.blob_allocator, opt.num_threads);
if (top_blob.empty())
return -100;

outw = top_blob.w;
outh = top_blob.h;
}
else
{
top_blob = top_blob_bordered;
}

return 0;



+ 1
- 1
src/layer/arm/deconvolutiondepthwise_arm.h View File

@@ -22,7 +22,7 @@ namespace ncnn {
class DeconvolutionDepthWise_arm : public DeconvolutionDepthWise
{
public:
virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
};

} // namespace ncnn


+ 11
- 10
src/layer/arm/eltwise_arm.cpp View File

@@ -22,16 +22,17 @@ namespace ncnn {

DEFINE_LAYER_CREATOR(Eltwise_arm)

int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
const Mat& bottom_blob = bottom_blobs[0];
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int size = w * h;

Mat& top_blob = top_blobs[0];
top_blob.create(w, h, channels);
top_blob.create(w, h, channels, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;

@@ -39,7 +40,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
{
// first blob
const Mat& bottom_blob1 = bottom_blobs[1];
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);
@@ -117,7 +118,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
for (size_t b=2; b<bottom_blobs.size(); b++)
{
const Mat& bottom_blob1 = bottom_blobs[b];
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob1.channel(q);
@@ -193,7 +194,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
{
// first blob
const Mat& bottom_blob1 = bottom_blobs[1];
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);
@@ -271,7 +272,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
for (size_t b=2; b<bottom_blobs.size(); b++)
{
const Mat& bottom_blob1 = bottom_blobs[b];
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob1.channel(q);
@@ -349,7 +350,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
const Mat& bottom_blob1 = bottom_blobs[1];
float coeff0 = coeffs_ptr[0];
float coeff1 = coeffs_ptr[1];
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);
@@ -436,7 +437,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
{
const Mat& bottom_blob1 = bottom_blobs[b];
float coeff = coeffs_ptr[b];
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob1.channel(q);
@@ -514,7 +515,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
{
// first blob
const Mat& bottom_blob1 = bottom_blobs[1];
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);
@@ -592,7 +593,7 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
for (size_t b=2; b<bottom_blobs.size(); b++)
{
const Mat& bottom_blob1 = bottom_blobs[b];
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob1.channel(q);


+ 1
- 1
src/layer/arm/eltwise_arm.h View File

@@ -22,7 +22,7 @@ namespace ncnn {
class Eltwise_arm : public Eltwise
{
public:
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
};

} // namespace ncnn


+ 5
- 4
src/layer/arm/innerproduct_arm.cpp View File

@@ -22,14 +22,15 @@ namespace ncnn {

DEFINE_LAYER_CREATOR(InnerProduct_arm)

int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int size = w * h;

top_blob.create(num_output);
top_blob.create(num_output, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;

@@ -38,7 +39,7 @@ int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
int nn_num_output = num_output >> 2;
int remain_num_output_start = nn_num_output << 2;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int pp=0; pp<nn_num_output; pp++)
{
int p = pp * 4;
@@ -143,7 +144,7 @@ int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
}

// num_output
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=remain_num_output_start; p<num_output; p++)
{
float sum = 0.f;


+ 1
- 1
src/layer/arm/innerproduct_arm.h View File

@@ -22,7 +22,7 @@ namespace ncnn {
class InnerProduct_arm : public InnerProduct
{
public:
virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
};

} // namespace ncnn


+ 8
- 7
src/layer/arm/lrn_arm.cpp View File

@@ -24,20 +24,21 @@ namespace ncnn {

DEFINE_LAYER_CREATOR(LRN_arm)

int LRN_arm::forward_inplace(Mat& bottom_top_blob) const
int LRN_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
size_t elemsize = bottom_top_blob.elemsize;
int size = w * h;

// squared values with local_size padding
Mat square_blob;
square_blob.create(w, h, channels);
square_blob.create(w, h, channels, elemsize, opt.workspace_allocator);
if (square_blob.empty())
return -100;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_top_blob.channel(q);
@@ -73,14 +74,14 @@ int LRN_arm::forward_inplace(Mat& bottom_top_blob) const
if (region_type == NormRegion_ACROSS_CHANNELS)
{
Mat square_sum;
square_sum.create(w, h, channels);
square_sum.create(w, h, channels, elemsize, opt.workspace_allocator);
if (square_sum.empty())
return -100;
square_sum.fill(0.f);

const float alpha_div_size = alpha / local_size;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
// square sum
@@ -165,7 +166,7 @@ int LRN_arm::forward_inplace(Mat& bottom_top_blob) const
int pad = local_size / 2;
if (pad > 0)
{
copy_make_border(square_blob, square_blob_bordered, pad, local_size - pad - 1, pad, local_size - pad - 1, BORDER_CONSTANT, 0.f);
copy_make_border(square_blob, square_blob_bordered, pad, local_size - pad - 1, pad, local_size - pad - 1, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
if (square_blob_bordered.empty())
return -100;

@@ -196,7 +197,7 @@ int LRN_arm::forward_inplace(Mat& bottom_top_blob) const
}
}

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);


+ 1
- 1
src/layer/arm/lrn_arm.h View File

@@ -22,7 +22,7 @@ namespace ncnn {
class LRN_arm : public LRN
{
public:
virtual int forward_inplace(Mat& bottom_top_blob) const;
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn


+ 2
- 2
src/layer/arm/pooling_2x2.h View File

@@ -16,7 +16,7 @@
#include <arm_neon.h>
#endif // __ARM_NEON

static void pooling2x2s2_max_neon(const Mat& bottom_blob, Mat& top_blob)
static void pooling2x2s2_max_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
{
int w = bottom_blob.w;
int inch = bottom_blob.c;
@@ -26,7 +26,7 @@ static void pooling2x2s2_max_neon(const Mat& bottom_blob, Mat& top_blob)
const int tailstep = w - 2*outw + w;
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<inch; q++)
{
const float* img0 = bottom_blob.channel(q);


+ 2
- 2
src/layer/arm/pooling_3x3.h View File

@@ -16,7 +16,7 @@
#include <arm_neon.h>
#endif // __ARM_NEON

static void pooling3x3s2_max_neon(const Mat& bottom_blob, Mat& top_blob)
static void pooling3x3s2_max_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
{
int w = bottom_blob.w;
int inch = bottom_blob.c;
@@ -26,7 +26,7 @@ static void pooling3x3s2_max_neon(const Mat& bottom_blob, Mat& top_blob)

const int tailstep = w - 2*outw + w;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<inch; q++)
{
const float* img0 = bottom_blob.channel(q);


+ 11
- 10
src/layer/arm/pooling_arm.cpp View File

@@ -21,14 +21,14 @@ namespace ncnn {

DEFINE_LAYER_CREATOR(Pooling_arm)

int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
// max value in NxN window
// avg value in NxN window

if (kernel_w != kernel_h || stride_w != stride_h)
{
return Pooling::forward(bottom_blob, top_blob);
return Pooling::forward(bottom_blob, top_blob, opt);
}

const int kernel_size = kernel_w;
@@ -36,17 +36,18 @@ int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) const

if (pooling_type != PoolMethod_MAX || stride != 2 || global_pooling == 1)
{
return Pooling::forward(bottom_blob, top_blob);
return Pooling::forward(bottom_blob, top_blob, opt);
}

if (kernel_size != 2 && kernel_size != 3)
{
return Pooling::forward(bottom_blob, top_blob);
return Pooling::forward(bottom_blob, top_blob, opt);
}

int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;

Mat bottom_blob_bordered = bottom_blob;

@@ -73,7 +74,7 @@ int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
if (htail != 0)
htailpad = stride_h - htail;

copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom + htailpad, pad_left, pad_right + wtailpad, BORDER_CONSTANT, pad_value);
copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom + htailpad, pad_left, pad_right + wtailpad, BORDER_CONSTANT, pad_value, opt.workspace_allocator, opt.num_threads);
if (bottom_blob_bordered.empty())
return -100;

@@ -82,7 +83,7 @@ int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
}
else if (pad_mode == 1) // valid padding
{
copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom, pad_left, pad_right, BORDER_CONSTANT, pad_value);
copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom, pad_left, pad_right, BORDER_CONSTANT, pad_value, opt.workspace_allocator, opt.num_threads);
if (bottom_blob_bordered.empty())
return -100;

@@ -95,7 +96,7 @@ int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
int hpad = kernel_h + (h - 1) / stride_h * stride_h - h;
if (wpad > 0 || hpad > 0)
{
copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, pad_value);
copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, pad_value, opt.workspace_allocator, opt.num_threads);
if (bottom_blob_bordered.empty())
return -100;
}
@@ -107,14 +108,14 @@ int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
int outw = (w - kernel_w) / stride_w + 1;
int outh = (h - kernel_h) / stride_h + 1;

top_blob.create(outw, outh, channels);
top_blob.create(outw, outh, channels, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;

if (kernel_size == 2)
pooling2x2s2_max_neon(bottom_blob_bordered, top_blob);
pooling2x2s2_max_neon(bottom_blob_bordered, top_blob, opt);
if (kernel_size == 3)
pooling3x3s2_max_neon(bottom_blob_bordered, top_blob);
pooling3x3s2_max_neon(bottom_blob_bordered, top_blob, opt);

return 0;
}


+ 1
- 1
src/layer/arm/pooling_arm.h View File

@@ -22,7 +22,7 @@ namespace ncnn {
class Pooling_arm : public Pooling
{
public:
virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
};

} // namespace ncnn


+ 3
- 3
src/layer/arm/prelu_arm.cpp View File

@@ -22,11 +22,11 @@ namespace ncnn {

DEFINE_LAYER_CREATOR(PReLU_arm)

int PReLU_arm::forward_inplace(Mat& bottom_top_blob) const
int PReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int dims = bottom_top_blob.dims;
if (dims != 3)
return PReLU::forward_inplace(bottom_top_blob);
return PReLU::forward_inplace(bottom_top_blob, opt);

int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
@@ -35,7 +35,7 @@ int PReLU_arm::forward_inplace(Mat& bottom_top_blob) const

const float* slope_data_ptr = slope_data;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);


+ 1
- 1
src/layer/arm/prelu_arm.h View File

@@ -22,7 +22,7 @@ namespace ncnn {
class PReLU_arm : public PReLU
{
public:
virtual int forward_inplace(Mat& bottom_top_blob) const;
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn


+ 3
- 3
src/layer/arm/relu_arm.cpp View File

@@ -22,7 +22,7 @@ namespace ncnn {

DEFINE_LAYER_CREATOR(ReLU_arm)

int ReLU_arm::forward_inplace(Mat& bottom_top_blob) const
int ReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
@@ -31,7 +31,7 @@ int ReLU_arm::forward_inplace(Mat& bottom_top_blob) const

if (slope == 0.f)
{
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
@@ -85,7 +85,7 @@ int ReLU_arm::forward_inplace(Mat& bottom_top_blob) const
}
else
{
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);


+ 1
- 1
src/layer/arm/relu_arm.h View File

@@ -22,7 +22,7 @@ namespace ncnn {
class ReLU_arm : public ReLU
{
public:
virtual int forward_inplace(Mat& bottom_top_blob) const;
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn


+ 4
- 4
src/layer/arm/scale_arm.cpp View File

@@ -22,11 +22,11 @@ namespace ncnn {

DEFINE_LAYER_CREATOR(Scale_arm)

int Scale_arm::forward_inplace(Mat& bottom_top_blob) const
int Scale_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int dims = bottom_top_blob.dims;
if (dims != 3)
return Scale::forward_inplace(bottom_top_blob);
return Scale::forward_inplace(bottom_top_blob, opt);

int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
@@ -37,7 +37,7 @@ int Scale_arm::forward_inplace(Mat& bottom_top_blob) const
{
const float* scale_ptr = scale_data;
const float* bias_ptr = bias_data;
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
@@ -76,7 +76,7 @@ int Scale_arm::forward_inplace(Mat& bottom_top_blob) const
else
{
const float* scale_ptr = scale_data;
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);


+ 1
- 1
src/layer/arm/scale_arm.h View File

@@ -22,7 +22,7 @@ namespace ncnn {
class Scale_arm : public Scale
{
public:
virtual int forward_inplace(Mat& bottom_top_blob) const;
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn


+ 2
- 2
src/layer/arm/sigmoid_arm.cpp View File

@@ -25,14 +25,14 @@ namespace ncnn {

DEFINE_LAYER_CREATOR(Sigmoid_arm)

int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob) const
int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);


+ 1
- 1
src/layer/arm/sigmoid_arm.h View File

@@ -22,7 +22,7 @@ namespace ncnn {
class Sigmoid_arm : public Sigmoid
{
public:
virtual int forward_inplace(Mat& bottom_top_blob) const;
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn


+ 7
- 6
src/layer/arm/softmax_arm.cpp View File

@@ -25,12 +25,12 @@ namespace ncnn {

DEFINE_LAYER_CREATOR(Softmax_arm)

int Softmax_arm::forward_inplace(Mat& bottom_top_blob) const
int Softmax_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int dims = bottom_top_blob.dims;

if (dims != 3 || axis != 0)
return Softmax::forward_inplace(bottom_top_blob);
return Softmax::forward_inplace(bottom_top_blob, opt);

// value = exp( value - global max value )
// sum all value
@@ -39,10 +39,11 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob) const
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
size_t elemsize = bottom_top_blob.elemsize;
int size = w * h;

Mat max;
max.create(w, h);
max.create(w, h, elemsize, opt.workspace_allocator);
if (max.empty())
return -100;
max.fill(-FLT_MAX);
@@ -57,7 +58,7 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob) const
}
}

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
@@ -95,7 +96,7 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob) const
}

Mat sum;
sum.create(w, h);
sum.create(w, h, elemsize, opt.workspace_allocator);
if (sum.empty())
return -100;
sum.fill(0.f);
@@ -133,7 +134,7 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob) const
}
}

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);


+ 1
- 1
src/layer/arm/softmax_arm.h View File

@@ -22,7 +22,7 @@ namespace ncnn {
class Softmax_arm : public Softmax
{
public:
virtual int forward_inplace(Mat& bottom_top_blob) const;
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn


+ 4
- 4
src/layer/batchnorm.cpp View File

@@ -68,7 +68,7 @@ int BatchNorm::load_model(const ModelBin& mb)
return 0;
}

int BatchNorm::forward_inplace(Mat& bottom_top_blob) const
int BatchNorm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
// a = bias - slope * mean / sqrt(var)
// b = slope / sqrt(var)
@@ -82,7 +82,7 @@ int BatchNorm::forward_inplace(Mat& bottom_top_blob) const

float* ptr = bottom_top_blob;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int i=0; i<w; i++)
{
ptr[i] = b_data[i] * ptr[i] + a_data[i];
@@ -94,7 +94,7 @@ int BatchNorm::forward_inplace(Mat& bottom_top_blob) const
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int i=0; i<h; i++)
{
float* ptr = bottom_top_blob.row(i);
@@ -114,7 +114,7 @@ int BatchNorm::forward_inplace(Mat& bottom_top_blob) const
int h = bottom_top_blob.h;
int size = w * h;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);


+ 1
- 1
src/layer/batchnorm.h View File

@@ -28,7 +28,7 @@ public:

virtual int load_model(const ModelBin& mb);

virtual int forward_inplace(Mat& bottom_top_blob) const;
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

public:
// param


+ 2
- 2
src/layer/bias.cpp View File

@@ -40,14 +40,14 @@ int Bias::load_model(const ModelBin& mb)
return 0;
}

int Bias::forward_inplace(Mat& bottom_top_blob) const
int Bias::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);


+ 1
- 1
src/layer/bias.h View File

@@ -28,7 +28,7 @@ public:

virtual int load_model(const ModelBin& mb);

virtual int forward_inplace(Mat& bottom_top_blob) const;
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

public:
// param


+ 41
- 40
src/layer/binaryop.cpp View File

@@ -43,7 +43,7 @@ int BinaryOp::load_param(const ParamDict& pd)
}

template<typename Op>
static int binary_op(const Mat& a, const Mat& b, Mat& c)
static int binary_op(const Mat& a, const Mat& b, Mat& c, const Option& opt)
{
Op op;

@@ -51,6 +51,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
int h = a.h;
int channels = a.c;
int size = w * h;
size_t elemsize = a.elemsize;

int w1 = b.w;
int h1 = b.h;
@@ -59,13 +60,13 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)

if (a.dims == 3)
{
c.create(w, h, channels);
c.create(w, h, channels, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

if (b.dims == 3)
{
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = a.channel(q);
@@ -83,7 +84,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)

if (b.dims == 2)
{
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = a.channel(q);
@@ -111,7 +112,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
if (b.w == 1)
{
const float b0 = b[0];
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = a.channel(q);
@@ -126,7 +127,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
return 0;
}

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = a.channel(q);
@@ -146,11 +147,11 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
{
if (b.dims == 3)
{
c.create(w1, h1, channels1);
c.create(w1, h1, channels1, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels1; q++)
{
const float* ptr = (const float*)a + h1 * q;
@@ -173,7 +174,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
return 0;
}

c.create(w, h);
c.create(w, h, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

@@ -189,7 +190,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)

if (b.dims == 1)
{
c.create(w, h);
c.create(w, h, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

@@ -228,12 +229,12 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
{
if (b.dims == 3)
{
c.create(w1, h1, channels1);
c.create(w1, h1, channels1, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

const float a0 = a[0];
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels1; q++)
{
const float* ptr1 = b.channel(q);
@@ -250,7 +251,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)

if (b.dims == 2)
{
c.create(w1, h1);
c.create(w1, h1, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

@@ -265,7 +266,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)

if (b.dims == 1)
{
c.create(w1);
c.create(w1, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

@@ -281,11 +282,11 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)

if (b.dims == 3)
{
c.create(w1, h1, channels1);
c.create(w1, h1, channels1, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels1; q++)
{
const float a0 = a[q];
@@ -303,7 +304,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)

if (b.dims == 2)
{
c.create(w1, h1);
c.create(w1, h1, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

@@ -327,7 +328,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)

if (b.dims == 1)
{
c.create(w);
c.create(w, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

@@ -353,7 +354,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
}

template<typename Op>
static int binary_op_scalar_inplace(Mat& a, float b)
static int binary_op_scalar_inplace(Mat& a, float b, const Option& opt)
{
Op op;

@@ -362,7 +363,7 @@ static int binary_op_scalar_inplace(Mat& a, float b)
int channels = a.c;
int size = w * h;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = a.channel(q);
@@ -401,7 +402,7 @@ struct binary_op_rdiv : std::binary_function<T,T,T> {
T operator() (const T& x, const T& y) const { return y / x; }
};

int BinaryOp::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
int BinaryOp::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
const Mat& bottom_blob = bottom_blobs[0];
const Mat& bottom_blob1 = bottom_blobs[1];
@@ -409,63 +410,63 @@ int BinaryOp::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to
Mat& top_blob = top_blobs[0];

if (op_type == Operation_ADD)
return binary_op< std::plus<float> >(bottom_blob, bottom_blob1, top_blob);
return binary_op< std::plus<float> >(bottom_blob, bottom_blob1, top_blob, opt);

if (op_type == Operation_SUB)
return binary_op< std::minus<float> >(bottom_blob, bottom_blob1, top_blob);
return binary_op< std::minus<float> >(bottom_blob, bottom_blob1, top_blob, opt);

if (op_type == Operation_MUL)
return binary_op< std::multiplies<float> >(bottom_blob, bottom_blob1, top_blob);
return binary_op< std::multiplies<float> >(bottom_blob, bottom_blob1, top_blob, opt);

if (op_type == Operation_DIV)
return binary_op< std::divides<float> >(bottom_blob, bottom_blob1, top_blob);
return binary_op< std::divides<float> >(bottom_blob, bottom_blob1, top_blob, opt);

if (op_type == Operation_MAX)
return binary_op< binary_op_max<float> >(bottom_blob, bottom_blob1, top_blob);
return binary_op< binary_op_max<float> >(bottom_blob, bottom_blob1, top_blob, opt);

if (op_type == Operation_MIN)
return binary_op< binary_op_min<float> >(bottom_blob, bottom_blob1, top_blob);
return binary_op< binary_op_min<float> >(bottom_blob, bottom_blob1, top_blob, opt);

if (op_type == Operation_POW)
return binary_op< binary_op_pow<float> >(bottom_blob, bottom_blob1, top_blob);
return binary_op< binary_op_pow<float> >(bottom_blob, bottom_blob1, top_blob, opt);

if (op_type == Operation_RSUB)
return binary_op< binary_op_rsub<float> >(bottom_blob, bottom_blob1, top_blob);
return binary_op< binary_op_rsub<float> >(bottom_blob, bottom_blob1, top_blob, opt);

if (op_type == Operation_RDIV)
return binary_op< binary_op_rdiv<float> >(bottom_blob, bottom_blob1, top_blob);
return binary_op< binary_op_rdiv<float> >(bottom_blob, bottom_blob1, top_blob, opt);

return 0;
}

int BinaryOp::forward_inplace(Mat& bottom_top_blob) const
int BinaryOp::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
if (op_type == Operation_ADD)
return binary_op_scalar_inplace< std::plus<float> >(bottom_top_blob, b);
return binary_op_scalar_inplace< std::plus<float> >(bottom_top_blob, b, opt);

if (op_type == Operation_SUB)
return binary_op_scalar_inplace< std::minus<float> >(bottom_top_blob, b);
return binary_op_scalar_inplace< std::minus<float> >(bottom_top_blob, b, opt);

if (op_type == Operation_MUL)
return binary_op_scalar_inplace< std::multiplies<float> >(bottom_top_blob, b);
return binary_op_scalar_inplace< std::multiplies<float> >(bottom_top_blob, b, opt);

if (op_type == Operation_DIV)
return binary_op_scalar_inplace< std::divides<float> >(bottom_top_blob, b);
return binary_op_scalar_inplace< std::divides<float> >(bottom_top_blob, b, opt);

if (op_type == Operation_MAX)
return binary_op_scalar_inplace< binary_op_max<float> >(bottom_top_blob, b);
return binary_op_scalar_inplace< binary_op_max<float> >(bottom_top_blob, b, opt);

if (op_type == Operation_MIN)
return binary_op_scalar_inplace< binary_op_min<float> >(bottom_top_blob, b);
return binary_op_scalar_inplace< binary_op_min<float> >(bottom_top_blob, b, opt);

if (op_type == Operation_POW)
return binary_op_scalar_inplace< binary_op_pow<float> >(bottom_top_blob, b);
return binary_op_scalar_inplace< binary_op_pow<float> >(bottom_top_blob, b, opt);

if (op_type == Operation_RSUB)
return binary_op_scalar_inplace< binary_op_rsub<float> >(bottom_top_blob, b);
return binary_op_scalar_inplace< binary_op_rsub<float> >(bottom_top_blob, b, opt);

if (op_type == Operation_RDIV)
return binary_op_scalar_inplace< binary_op_rdiv<float> >(bottom_top_blob, b);
return binary_op_scalar_inplace< binary_op_rdiv<float> >(bottom_top_blob, b, opt);

return 0;
}


+ 2
- 2
src/layer/binaryop.h View File

@@ -26,9 +26,9 @@ public:

virtual int load_param(const ParamDict& pd);

virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

virtual int forward_inplace(Mat& bottom_top_blob) const;
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

enum {
Operation_ADD = 0,


+ 2
- 2
src/layer/bnll.cpp View File

@@ -25,14 +25,14 @@ BNLL::BNLL()
support_inplace = true;
}

int BNLL::forward_inplace(Mat& bottom_top_blob) const
int BNLL::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);


+ 1
- 1
src/layer/bnll.h View File

@@ -24,7 +24,7 @@ class BNLL : public Layer
public:
BNLL();

virtual int forward_inplace(Mat& bottom_top_blob) const;
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

public:
};


+ 2
- 2
src/layer/clip.cpp View File

@@ -34,14 +34,14 @@ int Clip::load_param(const ParamDict& pd)
return 0;
}

int Clip::forward_inplace(Mat& bottom_top_blob) const
int Clip::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);


+ 1
- 1
src/layer/clip.h View File

@@ -26,7 +26,7 @@ public:

virtual int load_param(const ParamDict& pd);

virtual int forward_inplace(Mat& bottom_top_blob) const;
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

public:
float min;


+ 10
- 10
src/layer/concat.cpp View File

@@ -31,7 +31,7 @@ int Concat::load_param(const ParamDict& pd)
return 0;
}

int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
int dims = bottom_blobs[0].dims;
size_t elemsize = bottom_blobs[0].elemsize;
@@ -48,7 +48,7 @@ int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_
}

Mat& top_blob = top_blobs[0];
top_blob.create(top_w, elemsize);
top_blob.create(top_w, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;

@@ -82,7 +82,7 @@ int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_
}

Mat& top_blob = top_blobs[0];
top_blob.create(w, top_h, elemsize);
top_blob.create(w, top_h, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;

@@ -116,11 +116,11 @@ int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_
}

Mat& top_blob = top_blobs[0];
top_blob.create(top_w, h, elemsize);
top_blob.create(top_w, h, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int i=0; i<h; i++)
{
float* outptr = top_blob.row(i);
@@ -153,7 +153,7 @@ int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_
}

Mat& top_blob = top_blobs[0];
top_blob.create(w, h, top_channels, elemsize);
top_blob.create(w, h, top_channels, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;

@@ -190,11 +190,11 @@ int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_
}

Mat& top_blob = top_blobs[0];
top_blob.create(w, top_h, channels, elemsize);
top_blob.create(w, top_h, channels, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* outptr = top_blob.channel(q);
@@ -230,11 +230,11 @@ int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_
}

Mat& top_blob = top_blobs[0];
top_blob.create(top_w, h, channels, elemsize);
top_blob.create(top_w, h, channels, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* outptr = top_blob.channel(q);


+ 1
- 1
src/layer/concat.h View File

@@ -26,7 +26,7 @@ public:

virtual int load_param(const ParamDict& pd);

virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

public:
int axis;


+ 7
- 6
src/layer/convolution.cpp View File

@@ -59,7 +59,7 @@ int Convolution::load_model(const ModelBin& mb)
return 0;
}

int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const
int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
// convolv with NxN kernel
// value = value + bias
@@ -89,7 +89,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const
op->load_model(ModelBinFromMatArray(weights));

// forward
op->forward(bottom_blob, top_blob);
op->forward(bottom_blob, top_blob, opt);

delete op;

@@ -100,6 +100,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;

// fprintf(stderr, "Convolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d\n", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);

@@ -109,7 +110,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const
Mat bottom_blob_bordered = bottom_blob;
if (pad_w > 0 || pad_h > 0)
{
copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f);
copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
if (bottom_blob_bordered.empty())
return -100;

@@ -122,7 +123,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const
int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
if (wpad > 0 || hpad > 0)
{
copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f);
copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
if (bottom_blob_bordered.empty())
return -100;
}
@@ -134,7 +135,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const
int outw = (w - kernel_extent_w) / stride_w + 1;
int outh = (h - kernel_extent_h) / stride_h + 1;

top_blob.create(outw, outh, num_output);
top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;

@@ -160,7 +161,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const
}

// num_output
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<num_output; p++)
{
float* outptr = top_blob.channel(p);


+ 1
- 1
src/layer/convolution.h View File

@@ -28,7 +28,7 @@ public:

virtual int load_model(const ModelBin& mb);

virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

public:
// param


+ 8
- 7
src/layer/convolutiondepthwise.cpp View File

@@ -64,7 +64,7 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb)
return 0;
}

int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
// convolv with NxN kernel
// value = value + bias
@@ -72,6 +72,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;

if (channels % group != 0 || num_output % group != 0)
{
@@ -87,7 +88,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
Mat bottom_blob_bordered = bottom_blob;
if (pad_w > 0 || pad_h > 0)
{
copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f);
copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
if (bottom_blob_bordered.empty())
return -100;

@@ -100,7 +101,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
if (wpad > 0 || hpad > 0)
{
copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f);
copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
if (bottom_blob_bordered.empty())
return -100;
}
@@ -112,7 +113,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
int outw = (w - kernel_extent_w) / stride_w + 1;
int outh = (h - kernel_extent_h) / stride_h + 1;

top_blob.create(outw, outh, num_output);
top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;

@@ -140,7 +141,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
// depth-wise
if (channels == group && group == num_output)
{
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int g=0; g<group; g++)
{
float* outptr = top_blob.channel(g);
@@ -179,9 +180,9 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
const int num_output_g = num_output / group;

#ifdef _WIN32
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
#else // _WIN32
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) num_threads(opt.num_threads)
#endif // _WIN32
for (int g=0; g<group; g++)
{


+ 1
- 1
src/layer/convolutiondepthwise.h View File

@@ -28,7 +28,7 @@ public:

virtual int load_model(const ModelBin& mb);

virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

public:
// param


+ 4
- 4
src/layer/crop.cpp View File

@@ -39,7 +39,7 @@ int Crop::load_param(const ParamDict& pd)
return 0;
}

int Crop::forward(const Mat& bottom_blob, Mat& top_blob) const
int Crop::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
int w = bottom_blob.w;
int h = bottom_blob.h;
@@ -56,14 +56,14 @@ int Crop::forward(const Mat& bottom_blob, Mat& top_blob) const
int left = woffset;
int right = w - _outw - woffset;

copy_cut_border(bottom_blob_sliced, top_blob, top, bottom, left, right);
copy_cut_border(bottom_blob_sliced, top_blob, top, bottom, left, right, opt.blob_allocator, opt.num_threads);
if (top_blob.empty())
return -100;

return 0;
}

int Crop::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
int Crop::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
const Mat& bottom_blob = bottom_blobs[0];
const Mat& reference_blob = bottom_blobs[1];
@@ -85,7 +85,7 @@ int Crop::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl

Mat& top_blob = top_blobs[0];

copy_cut_border(bottom_blob_sliced, top_blob, top, bottom, left, right);
copy_cut_border(bottom_blob_sliced, top_blob, top, bottom, left, right, opt.blob_allocator, opt.num_threads);
if (top_blob.empty())
return -100;



+ 2
- 2
src/layer/crop.h View File

@@ -26,9 +26,9 @@ public:

virtual int load_param(const ParamDict& pd);

virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

public:
int woffset;


+ 22
- 9
src/layer/deconvolution.cpp View File

@@ -57,7 +57,7 @@ int Deconvolution::load_model(const ModelBin& mb)
return 0;
}

int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const
int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
// backward strided convolv with NxN kernel
// value = value + bias
@@ -65,6 +65,7 @@ int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;

// fprintf(stderr, "Deconvolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d\n", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);

@@ -74,10 +75,20 @@ int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const
int outw = (w - 1) * stride_w + kernel_extent_w;
int outh = (h - 1) * stride_h + kernel_extent_h;

Mat top_blob_bordered = top_blob;
top_blob_bordered.create(outw, outh, num_output);
if (top_blob_bordered.empty())
return -100;
Mat top_blob_bordered;
if (pad_w > 0 || pad_h > 0)
{
top_blob_bordered.create(outw, outh, num_output, elemsize, opt.workspace_allocator);
if (top_blob_bordered.empty())
return -100;
}
else
{
top_blob_bordered = top_blob;
top_blob_bordered.create(outw, outh, num_output, elemsize, opt.blob_allocator);
if (top_blob_bordered.empty())
return -100;
}

const int maxk = kernel_w * kernel_h;

@@ -101,7 +112,7 @@ int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const
}

// num_output
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<num_output; p++)
{
Mat out = top_blob_bordered.channel(p);
@@ -136,17 +147,19 @@ int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const
}
}

top_blob = top_blob_bordered;

if (pad_w > 0 || pad_h > 0)
{
copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w);
copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w, opt.blob_allocator, opt.num_threads);
if (top_blob.empty())
return -100;

outw = top_blob.w;
outh = top_blob.h;
}
else
{
top_blob = top_blob_bordered;
}

return 0;
}


+ 1
- 1
src/layer/deconvolution.h View File

@@ -28,7 +28,7 @@ public:

virtual int load_model(const ModelBin& mb);

virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

public:
// param


+ 23
- 10
src/layer/deconvolutiondepthwise.cpp View File

@@ -58,7 +58,7 @@ int DeconvolutionDepthWise::load_model(const ModelBin& mb)
return 0;
}

int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
// deconvolv with NxN kernel
// value = value + bias
@@ -66,6 +66,7 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;

if (channels % group != 0 || num_output % group != 0)
{
@@ -79,10 +80,20 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
int outw = (w - 1) * stride_w + kernel_extent_w;
int outh = (h - 1) * stride_h + kernel_extent_h;

Mat top_blob_bordered = top_blob;
top_blob_bordered.create(outw, outh, num_output);
if (top_blob_bordered.empty())
return -100;
Mat top_blob_bordered;
if (pad_w > 0 || pad_h > 0)
{
top_blob_bordered.create(outw, outh, num_output, elemsize, opt.workspace_allocator);
if (top_blob_bordered.empty())
return -100;
}
else
{
top_blob_bordered = top_blob;
top_blob_bordered.create(outw, outh, num_output, elemsize, opt.blob_allocator);
if (top_blob_bordered.empty())
return -100;
}

const int maxk = kernel_w * kernel_h;

@@ -108,7 +119,7 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
// depth-wise
if (channels == group && group == num_output)
{
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int g=0; g<group; g++)
{
const float* inptr = bottom_blob.channel(g);
@@ -141,7 +152,7 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
const int channels_g = channels / group;
const int num_output_g = num_output / group;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int g = 0; g < group; g++)
{
const float* weight_data_ptr = (const float*)weight_data + maxk * channels_g * num_output_g * g;
@@ -180,17 +191,19 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
}
}

top_blob = top_blob_bordered;

if (pad_w > 0 || pad_h > 0)
{
copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w);
copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w, opt.blob_allocator, opt.num_threads);
if (top_blob.empty())
return -100;

outw = top_blob.w;
outh = top_blob.h;
}
else
{
top_blob = top_blob_bordered;
}

return 0;
}


+ 1
- 1
src/layer/deconvolutiondepthwise.h View File

@@ -28,7 +28,7 @@ public:

virtual int load_model(const ModelBin& mb);

virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

public:
// param


+ 5
- 5
src/layer/detectionoutput.cpp View File

@@ -141,7 +141,7 @@ static void nms_sorted_bboxes(const std::vector<BBoxRect>& bboxes, std::vector<i
}
}

int DetectionOutput::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
int DetectionOutput::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
const Mat& location = bottom_blobs[0];
const Mat& confidence = bottom_blobs[1];
@@ -151,7 +151,7 @@ int DetectionOutput::forward(const std::vector<Mat>& bottom_blobs, std::vector<M

// apply location with priorbox
Mat bboxes;
bboxes.create(4, num_prior);
bboxes.create(4, num_prior, 4u, opt.workspace_allocator);
if (bboxes.empty())
return -100;

@@ -159,7 +159,7 @@ int DetectionOutput::forward(const std::vector<Mat>& bottom_blobs, std::vector<M
const float* priorbox_ptr = priorbox.row(0);
const float* variance_ptr = priorbox.row(1);

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int i = 0; i < num_prior; i++)
{
const float* loc = location_ptr + i * 4;
@@ -192,7 +192,7 @@ int DetectionOutput::forward(const std::vector<Mat>& bottom_blobs, std::vector<M
all_class_bbox_scores.resize(num_class);

// start from 1 to ignore background class
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int i = 1; i < num_class; i++)
{
// filter by confidence_threshold
@@ -262,7 +262,7 @@ int DetectionOutput::forward(const std::vector<Mat>& bottom_blobs, std::vector<M
int num_detected = bbox_rects.size();

Mat& top_blob = top_blobs[0];
top_blob.create(6, num_detected);
top_blob.create(6, num_detected, 4u, opt.blob_allocator);
if (top_blob.empty())
return -100;



+ 1
- 1
src/layer/detectionoutput.h View File

@@ -26,7 +26,7 @@ public:

virtual int load_param(const ParamDict& pd);

virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

public:
int num_class;


+ 2
- 2
src/layer/dropout.cpp View File

@@ -31,7 +31,7 @@ int Dropout::load_param(const ParamDict& pd)
return 0;
}

int Dropout::forward_inplace(Mat& bottom_top_blob) const
int Dropout::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
if (scale == 1.f)
{
@@ -43,7 +43,7 @@ int Dropout::forward_inplace(Mat& bottom_top_blob) const
int channels = bottom_top_blob.c;
int size = w * h;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);


+ 1
- 1
src/layer/dropout.h View File

@@ -26,7 +26,7 @@ public:

virtual int load_param(const ParamDict& pd);

virtual int forward_inplace(Mat& bottom_top_blob) const;
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

public:
float scale;


+ 11
- 10
src/layer/eltwise.cpp View File

@@ -31,16 +31,17 @@ int Eltwise::load_param(const ParamDict& pd)
return 0;
}

int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
const Mat& bottom_blob = bottom_blobs[0];
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int size = w * h;

Mat& top_blob = top_blobs[0];
top_blob.create(w, h, channels);
top_blob.create(w, h, channels, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;

@@ -48,7 +49,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
{
// first blob
const Mat& bottom_blob1 = bottom_blobs[1];
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);
@@ -64,7 +65,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
for (size_t b=2; b<bottom_blobs.size(); b++)
{
const Mat& bottom_blob1 = bottom_blobs[b];
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob1.channel(q);
@@ -83,7 +84,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
{
// first blob
const Mat& bottom_blob1 = bottom_blobs[1];
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);
@@ -99,7 +100,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
for (size_t b=2; b<bottom_blobs.size(); b++)
{
const Mat& bottom_blob1 = bottom_blobs[b];
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob1.channel(q);
@@ -118,7 +119,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
const Mat& bottom_blob1 = bottom_blobs[1];
float coeff0 = coeffs[0];
float coeff1 = coeffs[1];
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);
@@ -135,7 +136,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
{
const Mat& bottom_blob1 = bottom_blobs[b];
float coeff = coeffs[b];
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob1.channel(q);
@@ -153,7 +154,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
{
// first blob
const Mat& bottom_blob1 = bottom_blobs[1];
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);
@@ -169,7 +170,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
for (size_t b=2; b<bottom_blobs.size(); b++)
{
const Mat& bottom_blob1 = bottom_blobs[b];
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob1.channel(q);


+ 1
- 1
src/layer/eltwise.h View File

@@ -26,7 +26,7 @@ public:

virtual int load_param(const ParamDict& pd);

virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

enum { Operation_PROD = 0, Operation_SUM = 1, Operation_MAX = 2 };



+ 2
- 2
src/layer/elu.cpp View File

@@ -32,14 +32,14 @@ int ELU::load_param(const ParamDict& pd)
return 0;
}

int ELU::forward_inplace(Mat& bottom_top_blob) const
int ELU::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);


+ 1
- 1
src/layer/elu.h View File

@@ -26,7 +26,7 @@ public:

virtual int load_param(const ParamDict& pd);

virtual int forward_inplace(Mat& bottom_top_blob) const;
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

public:
float alpha;


+ 3
- 3
src/layer/embed.cpp View File

@@ -51,16 +51,16 @@ int Embed::load_model(const ModelBin& mb)
return 0;
}

int Embed::forward(const Mat& bottom_blob, Mat& top_blob) const
int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
int words = bottom_blob.total();

top_blob.create(num_output, words);
top_blob.create(num_output, words, 4u, opt.blob_allocator);
if (top_blob.empty())
return -100;

// num_output
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<words; q++)
{
float* outptr = top_blob.row(q);


+ 1
- 1
src/layer/embed.h View File

@@ -28,7 +28,7 @@ public:

virtual int load_model(const ModelBin& mb);

virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

public:
// param


+ 3
- 3
src/layer/exp.cpp View File

@@ -34,7 +34,7 @@ int Exp::load_param(const ParamDict& pd)
return 0;
}

int Exp::forward_inplace(Mat& bottom_top_blob) const
int Exp::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
@@ -43,7 +43,7 @@ int Exp::forward_inplace(Mat& bottom_top_blob) const

if (base == -1.f)
{
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
@@ -56,7 +56,7 @@ int Exp::forward_inplace(Mat& bottom_top_blob) const
}
else
{
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);


+ 1
- 1
src/layer/exp.h View File

@@ -26,7 +26,7 @@ public:

virtual int load_param(const ParamDict& pd);

virtual int forward_inplace(Mat& bottom_top_blob) const;
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

public:
float base;


+ 9
- 9
src/layer/expanddims.cpp View File

@@ -33,7 +33,7 @@ int ExpandDims::load_param(const ParamDict& pd)
return 0;
}

int ExpandDims::forward(const Mat& bottom_blob, Mat& top_blob) const
int ExpandDims::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
int w = bottom_blob.w;
int h = bottom_blob.h;
@@ -46,28 +46,28 @@ int ExpandDims::forward(const Mat& bottom_blob, Mat& top_blob) const
if (expand_w)
{
if (expand_h)
top_blob = bottom_blob.reshape(1, 1, w);
top_blob = bottom_blob.reshape(1, 1, w, opt.blob_allocator);
else if (expand_c)
top_blob = bottom_blob.reshape(1, w, 1);
top_blob = bottom_blob.reshape(1, w, 1, opt.blob_allocator);
else
top_blob = bottom_blob.reshape(1, w);
top_blob = bottom_blob.reshape(1, w, opt.blob_allocator);
}
else if (expand_h)
{
if (expand_c)
top_blob = bottom_blob.reshape(w, 1, 1);
top_blob = bottom_blob.reshape(w, 1, 1, opt.blob_allocator);
else
top_blob = bottom_blob.reshape(w, 1);
top_blob = bottom_blob.reshape(w, 1, opt.blob_allocator);
}
}
else if (dims == 2)
{
if (expand_w)
top_blob = bottom_blob.reshape(1, w, h);
top_blob = bottom_blob.reshape(1, w, h, opt.blob_allocator);
else if (expand_h)
top_blob = bottom_blob.reshape(w, 1, h);
top_blob = bottom_blob.reshape(w, 1, h, opt.blob_allocator);
else if (expand_c)
top_blob = bottom_blob.reshape(w, h, 1);
top_blob = bottom_blob.reshape(w, h, 1, opt.blob_allocator);
}

if (top_blob.empty())


+ 1
- 1
src/layer/expanddims.h View File

@@ -26,7 +26,7 @@ public:

virtual int load_param(const ParamDict& pd);

virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

public:
int expand_w;


+ 4
- 3
src/layer/flatten.cpp View File

@@ -24,18 +24,19 @@ Flatten::Flatten()
support_inplace = false;
}

int Flatten::forward(const Mat& bottom_blob, Mat& top_blob) const
int Flatten::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int size = w * h;

top_blob.create(size * channels);
top_blob.create(size * channels, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);


+ 1
- 1
src/layer/flatten.h View File

@@ -24,7 +24,7 @@ class Flatten : public Layer
public:
Flatten();

virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
};

} // namespace ncnn


+ 4
- 3
src/layer/innerproduct.cpp View File

@@ -49,19 +49,20 @@ int InnerProduct::load_model(const ModelBin& mb)
return 0;
}

int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob) const
int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int size = w * h;

top_blob.create(num_output);
top_blob.create(num_output, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;

// num_output
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<num_output; p++)
{
float sum = 0.f;


+ 1
- 1
src/layer/innerproduct.h View File

@@ -28,7 +28,7 @@ public:

virtual int load_model(const ModelBin& mb);

virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

public:
// param


+ 1
- 1
src/layer/input.cpp View File

@@ -33,7 +33,7 @@ int Input::load_param(const ParamDict& pd)
return 0;
}

int Input::forward_inplace(Mat& /*bottom_top_blob*/) const
int Input::forward_inplace(Mat& /*bottom_top_blob*/, const Option& /*opt*/) const
{
return 0;
}


+ 1
- 1
src/layer/input.h View File

@@ -26,7 +26,7 @@ public:

virtual int load_param(const ParamDict& pd);

virtual int forward_inplace(Mat& bottom_top_blob) const;
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

public:
int w;


+ 2
- 2
src/layer/instancenorm.cpp View File

@@ -46,7 +46,7 @@ int InstanceNorm::load_model(const ModelBin& mb)
return 0;
}

int InstanceNorm::forward_inplace(Mat& bottom_top_blob) const
int InstanceNorm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
// x = (x - mean) / (sqrt(var) + eps) * gamma + beta

@@ -54,7 +54,7 @@ int InstanceNorm::forward_inplace(Mat& bottom_top_blob) const
int h = bottom_top_blob.h;
int size = w * h;

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);


+ 1
- 1
src/layer/instancenorm.h View File

@@ -28,7 +28,7 @@ public:

virtual int load_model(const ModelBin& mb);

virtual int forward_inplace(Mat& bottom_top_blob) const;
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

public:
// param


+ 6
- 4
src/layer/interp.cpp View File

@@ -35,11 +35,13 @@ int Interp::load_param(const ParamDict& pd)
return 0;
}

int Interp::forward(const Mat &bottom_blob, Mat &top_blob) const
int Interp::forward(const Mat &bottom_blob, Mat &top_blob, const Option& opt) const
{
int h = bottom_blob.h;
int w = bottom_blob.w;
int c = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;

int oh = output_height;
int ow = output_width;
if (bottom_blob.dims == 1)
@@ -58,13 +60,13 @@ int Interp::forward(const Mat &bottom_blob, Mat &top_blob) const
top_blob = bottom_blob;
return 0;
}
top_blob.create(ow, oh, c);
top_blob.create(ow, oh, c, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;

if (bottom_blob.dims == 1)
{
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < c; ++q)
{
Mat top_blob_c = top_blob.channel(q);
@@ -76,7 +78,7 @@ int Interp::forward(const Mat &bottom_blob, Mat &top_blob) const

if (resize_type == 1)//nearest
{
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < c; ++q)
{
const float *ptr = bottom_blob.channel(q);


+ 1
- 1
src/layer/interp.h View File

@@ -26,7 +26,7 @@ public:

virtual int load_param(const ParamDict& pd);

virtual int forward(const Mat &bottom_blob, Mat &top_blob) const;
virtual int forward(const Mat &bottom_blob, Mat &top_blob, const Option& opt) const;

public:
// param


+ 3
- 3
src/layer/log.cpp View File

@@ -34,7 +34,7 @@ int Log::load_param(const ParamDict& pd)
return 0;
}

int Log::forward_inplace(Mat& bottom_top_blob) const
int Log::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
@@ -43,7 +43,7 @@ int Log::forward_inplace(Mat& bottom_top_blob) const

if (base == -1.f)
{
#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
@@ -58,7 +58,7 @@ int Log::forward_inplace(Mat& bottom_top_blob) const
{
float log_base_inv = 1.f / log(base);

#pragma omp parallel for
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);


Some files were not shown because too many files changed in this diff

Loading…
Cancel
Save