Browse Source

Optimization of the softmax layer (#914)

* Optimize the loop structure to improve the speed of the softmax layer & Reduce memory consumption

* use 4 space instead of tab
tags/20190611
gfjiangly nihui 7 years ago
parent
commit
9ffe2b84e9
1 changed files with 13 additions and 141 deletions
  1. +13
    -141
      src/layer/softmax.cpp

+ 13
- 141
src/layer/softmax.cpp View File

@@ -77,14 +77,10 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
max = std::max(max, ptr[i]);
}

for (int i=0; i<w; i++)
{
ptr[i] = exp(ptr[i] - max);
}

float sum = 0.f;
for (int i=0; i<w; i++)
{
ptr[i] = exp(ptr[i] - max);
sum += ptr[i];
}

@@ -116,26 +112,18 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
}
}

for (int i=0; i<h; i++)
{
float* ptr = bottom_top_blob.row(i);
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - max[j]);
}
}

Mat sum;
sum.create(w, elemsize, opt.workspace_allocator);
if (sum.empty())
return -100;
sum.fill(0.f);

for (int i=0; i<h; i++)
for (int i = 0; i<h; i++)
{
const float* ptr = bottom_top_blob.row(i);
float* ptr = bottom_top_blob.row(i);
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - max[j]);
sum[j] += ptr[j];
}
}
@@ -157,58 +145,22 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;

Mat max;
max.create(h, elemsize, opt.workspace_allocator);
if (max.empty())
return -100;

for (int i=0; i<h; i++)
{
const float* ptr = bottom_top_blob.row(i);

float* ptr = bottom_top_blob.row(i);
float m = -FLT_MAX;
for (int j=0; j<w; j++)
{
m = std::max(m, ptr[j]);
}

max[i] = m;
}

for (int i=0; i<h; i++)
{
float* ptr = bottom_top_blob.row(i);

float m = max[i];
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - m);
}
}

Mat sum;
sum.create(h, elemsize, opt.workspace_allocator);
if (sum.empty())
return -100;

for (int i=0; i<h; i++)
{
const float* ptr = bottom_top_blob.row(i);

float s = 0.f;
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - m);
s += ptr[j];
}

sum[i] = s;
}

for (int i=0; i<h; i++)
{
float* ptr = bottom_top_blob.row(i);

float s = sum[i];
for (int j=0; j<w; j++)
{
ptr[j] /= s;
@@ -240,17 +192,6 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
}
}

#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);

for (int i=0; i<size; i++)
{
ptr[i] = exp(ptr[i] - max[i]);
}
}

Mat sum;
sum.create(w, h, elemsize, opt.workspace_allocator);
if (sum.empty())
@@ -258,10 +199,11 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
sum.fill(0.f);
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_top_blob.channel(q);
float* ptr = bottom_top_blob.channel(q);

for (int i=0; i<size; i++)
{
ptr[i] = exp(ptr[i] - max[i]);
sum[i] += ptr[i];
}
}
@@ -308,23 +250,6 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
}
}

#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
float* maxptr = max.row(q);

for (int i=0; i<h; i++)
{
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - maxptr[j]);
}

ptr += w;
}
}

Mat sum;
sum.create(w, channels, elemsize, opt.workspace_allocator);
if (sum.empty())
@@ -333,13 +258,15 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_top_blob.channel(q);
float* ptr = bottom_top_blob.channel(q);
float* maxptr = max.row(q);
float* sumptr = sum.row(q);

for (int i=0; i<h; i++)
{
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - maxptr[j]);
sumptr[j] += ptr[j];
}

@@ -373,16 +300,10 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;

Mat max;
max.create(h, channels, elemsize, opt.workspace_allocator);
if (max.empty())
return -100;
max.fill(-FLT_MAX);
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_top_blob.channel(q);
float* maxptr = max.row(q);
float* ptr = bottom_top_blob.channel(q);

for (int i=0; i<h; i++)
{
@@ -392,62 +313,13 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
max = std::max(max, ptr[j]);
}

maxptr[i] = max;
ptr += w;
}
}

#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
float* maxptr = max.row(q);

for (int i=0; i<h; i++)
{
float max = maxptr[i];
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - max);
}

ptr += w;
}
}

Mat sum;
sum.create(h, channels, elemsize, opt.workspace_allocator);
if (sum.empty())
return -100;
sum.fill(0.f);
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_top_blob.channel(q);
float* sumptr = sum.row(q);

for (int i=0; i<h; i++)
{
float sum = 0.f;
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - max);
sum += ptr[j];
}

sumptr[i] = sum;
ptr += w;
}
}

#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
float* sumptr = sum.row(q);

for (int i=0; i<h; i++)
{
float sum = sumptr[i];
for (int j=0; j<w; j++)
{
ptr[j] /= sum;


Loading…
Cancel
Save