Optimization of the softmax layer (#914)

* Optimize the loop structure to improve the speed of the softmax layer & Reduce memory consumption * use 4 space instead of tab
7 years ago · 9ffe2b84e9
--- a/src/layer/softmax.cpp
+++ b/src/layer/softmax.cpp
@@ -77,14 +77,10 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
            max = std::max(max, ptr[i]);
        }

        for (int i=0; i<w; i++)
        {
            ptr[i] = exp(ptr[i] - max);
        }

        float sum = 0.f;
        for (int i=0; i<w; i++)
        {
            ptr[i] = exp(ptr[i] - max);
            sum += ptr[i];
        }

@@ -116,26 +112,18 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
            }
        }

        for (int i=0; i<h; i++)
        {
            float* ptr = bottom_top_blob.row(i);
            for (int j=0; j<w; j++)
            {
                ptr[j] = exp(ptr[j] - max[j]);
            }
        }

        Mat sum;
        sum.create(w, elemsize, opt.workspace_allocator);
        if (sum.empty())
            return -100;
        sum.fill(0.f);

        for (int i=0; i<h; i++)
        for (int i = 0; i<h; i++)
        {
            const float* ptr = bottom_top_blob.row(i);
            float* ptr = bottom_top_blob.row(i);
            for (int j=0; j<w; j++)
            {
                ptr[j] = exp(ptr[j] - max[j]);
                sum[j] += ptr[j];
            }
        }
@@ -157,58 +145,22 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
        int w = bottom_top_blob.w;
        int h = bottom_top_blob.h;

        Mat max;
        max.create(h, elemsize, opt.workspace_allocator);
        if (max.empty())
            return -100;

        for (int i=0; i<h; i++)
        {
            const float* ptr = bottom_top_blob.row(i);

            float* ptr = bottom_top_blob.row(i);
            float m = -FLT_MAX;
            for (int j=0; j<w; j++)
            {
                m = std::max(m, ptr[j]);
            }

            max[i] = m;
        }

        for (int i=0; i<h; i++)
        {
            float* ptr = bottom_top_blob.row(i);

            float m = max[i];
            for (int j=0; j<w; j++)
            {
                ptr[j] = exp(ptr[j] - m);
            }
        }

        Mat sum;
        sum.create(h, elemsize, opt.workspace_allocator);
        if (sum.empty())
            return -100;

        for (int i=0; i<h; i++)
        {
            const float* ptr = bottom_top_blob.row(i);

            float s = 0.f;
            for (int j=0; j<w; j++)
            {
                ptr[j] = exp(ptr[j] - m);
                s += ptr[j];
            }

            sum[i] = s;
        }

        for (int i=0; i<h; i++)
        {
            float* ptr = bottom_top_blob.row(i);

            float s = sum[i];
            for (int j=0; j<w; j++)
            {
                ptr[j] /= s;
@@ -240,17 +192,6 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
            }
        }

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);

            for (int i=0; i<size; i++)
            {
                ptr[i] = exp(ptr[i] - max[i]);
            }
        }

        Mat sum;
        sum.create(w, h, elemsize, opt.workspace_allocator);
        if (sum.empty())
@@ -258,10 +199,11 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
        sum.fill(0.f);
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_top_blob.channel(q);
            float* ptr = bottom_top_blob.channel(q);

            for (int i=0; i<size; i++)
            {
                ptr[i] = exp(ptr[i] - max[i]);
                sum[i] += ptr[i];
            }
        }
@@ -308,23 +250,6 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
            }
        }

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);
            float* maxptr = max.row(q);

            for (int i=0; i<h; i++)
            {
                for (int j=0; j<w; j++)
                {
                    ptr[j] = exp(ptr[j] - maxptr[j]);
                }

                ptr += w;
            }
        }

        Mat sum;
        sum.create(w, channels, elemsize, opt.workspace_allocator);
        if (sum.empty())
@@ -333,13 +258,15 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_top_blob.channel(q);
            float* ptr = bottom_top_blob.channel(q);
            float* maxptr = max.row(q);
            float* sumptr = sum.row(q);

            for (int i=0; i<h; i++)
            {
                for (int j=0; j<w; j++)
                {
                    ptr[j] = exp(ptr[j] - maxptr[j]);
                    sumptr[j] += ptr[j];
                }

@@ -373,16 +300,10 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
        int h = bottom_top_blob.h;
        int channels = bottom_top_blob.c;

        Mat max;
        max.create(h, channels, elemsize, opt.workspace_allocator);
        if (max.empty())
            return -100;
        max.fill(-FLT_MAX);
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_top_blob.channel(q);
            float* maxptr = max.row(q);
            float* ptr = bottom_top_blob.channel(q);

            for (int i=0; i<h; i++)
            {
@@ -392,62 +313,13 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
                    max = std::max(max, ptr[j]);
                }

                maxptr[i] = max;
                ptr += w;
            }
        }

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);
            float* maxptr = max.row(q);

            for (int i=0; i<h; i++)
            {
                float max = maxptr[i];
                for (int j=0; j<w; j++)
                {
                    ptr[j] = exp(ptr[j] - max);
                }

                ptr += w;
            }
        }

        Mat sum;
        sum.create(h, channels, elemsize, opt.workspace_allocator);
        if (sum.empty())
            return -100;
        sum.fill(0.f);
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_top_blob.channel(q);
            float* sumptr = sum.row(q);

            for (int i=0; i<h; i++)
            {
                float sum = 0.f;
                for (int j=0; j<w; j++)
                {
                    ptr[j] = exp(ptr[j] - max);
                    sum += ptr[j];
                }

                sumptr[i] = sum;
                ptr += w;
            }
        }

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);
            float* sumptr = sum.row(q);

            for (int i=0; i<h; i++)
            {
                float sum = sumptr[i];
                for (int j=0; j<w; j++)
                {
                    ptr[j] /= sum;