merge concatv2 dropoutv2 softmaxv2

8 years ago · 91c08a390a
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -135,11 +135,8 @@ ncnn_add_layer(ExpandDims)
 ncnn_add_layer(Normalize)
 ncnn_add_layer(Permute)
 ncnn_add_layer(PriorBox)
 ncnn_add_layer(ConcatV2)
 ncnn_add_layer(SoftmaxV2)
 ncnn_add_layer(DetectionOutput)
 ncnn_add_layer(Interp)
 ncnn_add_layer(DropoutV2)

 add_library(ncnn STATIC ${ncnn_SRCS})

--- a/src/layer/arm/softmax_arm.cpp
+++ b/src/layer/arm/softmax_arm.cpp
@@ -27,6 +27,9 @@ DEFINE_LAYER_CREATOR(Softmax_arm)

 int Softmax_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
 {
    if (axis != 0)
        return Softmax::forward(bottom_blob, top_blob);

    // value = exp( value - global max value )
    // sum all value
    // value = value / sum
@@ -156,6 +159,9 @@ int Softmax_arm::forward(const Mat& bottom_blob, Mat& top_blob) const

 int Softmax_arm::forward_inplace(Mat& bottom_top_blob) const
 {
    if (axis != 0)
        return Softmax::forward_inplace(bottom_top_blob);

    // value = exp( value - global max value )
    // sum all value
    // value = value / sum
--- a/src/layer/concat.cpp
+++ b/src/layer/concat.cpp
@@ -20,13 +20,22 @@ DEFINE_LAYER_CREATOR(Concat)

 // Construct a Concat layer with both layer flags cleared.
 // NOTE(review): presumably these flags select the multi-blob forward()
 // overload and disable in-place execution — confirm against Layer.
 Concat::Concat()
 {
    support_inplace = false;
    one_blob_only = false;
 }

 int Concat::load_param(const ParamDict& pd)
 {
    axis = pd.get(0, 0);

    return 0;
 }

 int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
 {
    int dims = bottom_blobs[0].dims;

    if (dims == 1)
    if (dims == 1) // axis == 0
    {
        // concat vector
        // total length
@@ -61,7 +70,7 @@ int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_
        return 0;
    }

    if (dims == 2)
    if (dims == 2 && axis == 0)
    {
        // concat image
        int w = bottom_blobs[0].w;
@@ -98,38 +107,168 @@ int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_
        return 0;
    }

    int w = bottom_blobs[0].w;
    int h = bottom_blobs[0].h;
    if (dims == 2 && axis == 1)
    {
        // interleave image row
        int h = bottom_blobs[0].h;

        // total width
        int top_w = 0;
        for (size_t b=0; b<bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_w += bottom_blob.w;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(top_w, h);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for
        for (int i=0; i<h; i++)
        {
            float* outptr = top_blob.row(i);
            for (size_t b=0; b<bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob = bottom_blobs[b];

                const float* ptr = bottom_blob.row(i);
                for (int j=0; j<bottom_blob.w; j++)
                {
                    outptr[j] = ptr[j];
                }

                outptr += bottom_blob.w;
            }
        }

    // total channels
    int top_channels = 0;
    for (size_t b=0; b<bottom_blobs.size(); b++)
        return 0;
    }

    if (dims == 3 && axis == 0)
    {
        const Mat& bottom_blob = bottom_blobs[b];
        top_channels += bottom_blob.c;
        // concat dim
        int w = bottom_blobs[0].w;
        int h = bottom_blobs[0].h;

        // total channels
        int top_channels = 0;
        for (size_t b=0; b<bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_channels += bottom_blob.c;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(w, h, top_channels);
        if (top_blob.empty())
            return -100;

        int q = 0;
        for (size_t b=0; b<bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];

            int channels = bottom_blob.c;
            int size = bottom_blob.cstep * channels;

            const float* ptr = bottom_blob;
            float* outptr = top_blob.channel(q);
            for (int i=0; i<size; i++)
            {
                outptr[i] = ptr[i];
            }

            q += channels;
        }

        return 0;
    }

    Mat& top_blob = top_blobs[0];
    top_blob.create(w, h, top_channels);
    if (top_blob.empty())
        return -100;
    if (dims == 3 && axis == 1)
    {
        // interleave dim height
        int w = bottom_blobs[0].w;
        int channels = bottom_blobs[0].c;

        // total height
        int top_h = 0;
        for (size_t b=0; b<bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_h += bottom_blob.h;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(w, top_h, channels);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            float* outptr = top_blob.channel(q);

            for (size_t b=0; b<bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob = bottom_blobs[b];

                int size = bottom_blob.w * bottom_blob.h;

    int q = 0;
    for (size_t b=0; b<bottom_blobs.size(); b++)
                const float* ptr = bottom_blob.channel(q);
                for (int i=0; i<size; i++)
                {
                    outptr[i] = ptr[i];
                }
            }
        }

        return 0;
    }

    if (dims == 3 && axis == 2)
    {
        const Mat& bottom_blob = bottom_blobs[b];
        // interleave dim width
        int h = bottom_blobs[0].h;
        int channels = bottom_blobs[0].c;

        int channels = bottom_blob.c;
        int size = bottom_blob.cstep * channels;
        // total height
        int top_w = 0;
        for (size_t b=0; b<bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_w += bottom_blob.w;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(top_w, h, channels);
        if (top_blob.empty())
            return -100;

        const float* ptr = bottom_blob;
        float* outptr = top_blob.channel(q);
        for (int i=0; i<size; i++)
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            outptr[i] = ptr[i];
            float* outptr = top_blob.channel(q);

            for (int i=0; i<h; i++)
            {
                for (size_t b=0; b<bottom_blobs.size(); b++)
                {
                    const Mat& bottom_blob = bottom_blobs[b];

                    const float* ptr = bottom_blob.channel(q).row(i);
                    for (int j=0; j<bottom_blob.w; j++)
                    {
                        outptr[j] = ptr[j];
                    }

                    outptr += bottom_blob.w;
                }
            }
        }

        q += channels;
        return 0;
    }

    return 0;
--- a/src/layer/concat.h
+++ b/src/layer/concat.h
@@ -24,9 +24,12 @@ class Concat : public Layer
 public:
    Concat();

    virtual int load_param(const ParamDict& pd);

    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;

 public:
    int axis;
 };

 } // namespace ncnn
--- a/src/layer/concatv2.cpp
+++ b/src/layer/concatv2.cpp
@@ -1,277 +0,0 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "concatv2.h"

 namespace ncnn {

 DEFINE_LAYER_CREATOR(ConcatV2)

 // Construct a ConcatV2 layer with both layer flags cleared.
 // NOTE(review): presumably these flags select the multi-blob forward()
 // overload and disable in-place execution — confirm against Layer.
 ConcatV2::ConcatV2()
 {
    support_inplace = false;
    one_blob_only = false;
 }

 int ConcatV2::load_param(const ParamDict& pd)
 {
    axis = pd.get(0, 0);

    return 0;
 }

 // Concatenate every blob in bottom_blobs into top_blobs[0] along `axis`.
 //
 // The dimensionality is taken from bottom_blobs[0]; the extents that are not
 // being concatenated are also taken from bottom_blobs[0].
 //
 //   dims == 1            : widths are summed (axis is not consulted)
 //   dims == 2, axis == 0 : heights are summed, rows are appended
 //   dims == 2, axis == 1 : widths are summed, each output row is built
 //                          from the matching row of every input
 //   dims == 3, axis == 0 : channels are summed
 //   dims == 3, axis == 1 : heights are summed inside every channel
 //   dims == 3, axis == 2 : widths are summed inside every row
 //
 // Returns 0 on success, and also when no dims/axis combination matches (in
 // that case top_blobs[0] is left untouched). Returns -100 when the output
 // blob is empty after create().
 //
 // NOTE(review): bottom_blobs[0] and top_blobs[0] are accessed without a size
 // check — callers are assumed to pass at least one input and one output.
 int ConcatV2::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
 {
    int dims = bottom_blobs[0].dims;

    if (dims == 1) // axis == 0
    {
        // concat vector
        // total length
        int top_w = 0;
        for (size_t b=0; b<bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_w += bottom_blob.w;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(top_w);
        if (top_blob.empty())
            return -100;

        float* outptr = top_blob;
        for (size_t b=0; b<bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];

            int w = bottom_blob.w;

            const float* ptr = bottom_blob;
            for (int i=0; i<w; i++)
            {
                outptr[i] = ptr[i];
            }

            outptr += w;
        }

        return 0;
    }

    if (dims == 2 && axis == 0)
    {
        // concat image
        int w = bottom_blobs[0].w;

        // total height
        int top_h = 0;
        for (size_t b=0; b<bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_h += bottom_blob.h;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(w, top_h);
        if (top_blob.empty())
            return -100;

        float* outptr = top_blob;
        for (size_t b=0; b<bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];

            // all rows of this input are copied as one run of w * h floats
            int size = w * bottom_blob.h;

            const float* ptr = bottom_blob;
            for (int i=0; i<size; i++)
            {
                outptr[i] = ptr[i];
            }

            outptr += size;
        }

        return 0;
    }

    if (dims == 2 && axis == 1)
    {
        // interleave image row
        int h = bottom_blobs[0].h;

        // total width
        int top_w = 0;
        for (size_t b=0; b<bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_w += bottom_blob.w;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(top_w, h);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for
        for (int i=0; i<h; i++)
        {
            float* outptr = top_blob.row(i);
            for (size_t b=0; b<bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob = bottom_blobs[b];

                const float* ptr = bottom_blob.row(i);
                for (int j=0; j<bottom_blob.w; j++)
                {
                    outptr[j] = ptr[j];
                }

                outptr += bottom_blob.w;
            }
        }

        return 0;
    }

    if (dims == 3 && axis == 0)
    {
        // concat dim
        int w = bottom_blobs[0].w;
        int h = bottom_blobs[0].h;

        // total channels
        int top_channels = 0;
        for (size_t b=0; b<bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_channels += bottom_blob.c;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(w, h, top_channels);
        if (top_blob.empty())
            return -100;

        // q is the first output channel that the current input is written to
        int q = 0;
        for (size_t b=0; b<bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];

            int channels = bottom_blob.c;
            // NOTE(review): the whole input is copied as one run of
            // cstep * channels floats, which relies on top_blob using the
            // same per-channel step as the input — confirm.
            int size = bottom_blob.cstep * channels;

            const float* ptr = bottom_blob;
            float* outptr = top_blob.channel(q);
            for (int i=0; i<size; i++)
            {
                outptr[i] = ptr[i];
            }

            q += channels;
        }

        return 0;
    }

    if (dims == 3 && axis == 1)
    {
        // interleave dim height
        int w = bottom_blobs[0].w;
        int channels = bottom_blobs[0].c;

        // total height
        int top_h = 0;
        for (size_t b=0; b<bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_h += bottom_blob.h;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(w, top_h, channels);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            float* outptr = top_blob.channel(q);

            for (size_t b=0; b<bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob = bottom_blobs[b];

                int size = bottom_blob.w * bottom_blob.h;

                const float* ptr = bottom_blob.channel(q);
                for (int i=0; i<size; i++)
                {
                    outptr[i] = ptr[i];
                }

                // step past the rows just written so the next input is
                // appended below them instead of overwriting them
                outptr += size;
            }
        }

        return 0;
    }

    if (dims == 3 && axis == 2)
    {
        // interleave dim width
        int h = bottom_blobs[0].h;
        int channels = bottom_blobs[0].c;

        // total width
        int top_w = 0;
        for (size_t b=0; b<bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_w += bottom_blob.w;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(top_w, h, channels);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            // outptr walks the output channel continuously: after the last
            // input of row i it already points at the start of row i + 1
            float* outptr = top_blob.channel(q);

            for (int i=0; i<h; i++)
            {
                for (size_t b=0; b<bottom_blobs.size(); b++)
                {
                    const Mat& bottom_blob = bottom_blobs[b];

                    const float* ptr = bottom_blob.channel(q).row(i);
                    for (int j=0; j<bottom_blob.w; j++)
                    {
                        outptr[j] = ptr[j];
                    }

                    outptr += bottom_blob.w;
                }
            }
        }

        return 0;
    }

    // unsupported dims / axis combination: nothing is written
    return 0;
 }

 } // namespace ncnn
--- a/src/layer/concatv2.h
+++ b/src/layer/concatv2.h
@@ -1,37 +0,0 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_CONCATV2_H
 #define LAYER_CONCATV2_H

 #include "layer.h"

 namespace ncnn {

 // Layer that joins several input blobs into a single output blob along `axis`.
 class ConcatV2 : public Layer
 {
 public:
    ConcatV2();

    // Reads the layer parameters from pd. Returns an int status code.
    virtual int load_param(const ParamDict& pd);

    // Multi-blob forward pass: reads bottom_blobs and writes the result into
    // top_blobs. Returns an int status code.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;

 public:
    // Axis along which the inputs are concatenated.
    int axis;
 };

 } // namespace ncnn

 #endif // LAYER_CONCATV2_H
--- a/src/layer/dropout.cpp
+++ b/src/layer/dropout.cpp
@@ -24,14 +24,68 @@ Dropout::Dropout()
    support_inplace = true;
 }

 int Dropout::load_param(const ParamDict& pd)
 {
    scale = pd.get(0, 1.f);

    return 0;
 }

 int Dropout::forward(const Mat& bottom_blob, Mat& top_blob) const
 {
    top_blob = bottom_blob;
    if (scale == 1.f)
    {
        top_blob = bottom_blob;
        return 0;
    }

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int size = w * h;

    top_blob.create(w, h, channels);
    if (top_blob.empty())
        return -100;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        const float* ptr = bottom_blob.channel(q);
        float* outptr = top_blob.channel(q);

        for (int i=0; i<size; i++)
        {
            outptr[i] = ptr[i] * scale;
        }
    }

    return 0;
 }

 int Dropout::forward_inplace(Mat& /*bottom_top_blob*/) const
 int Dropout::forward_inplace(Mat& bottom_top_blob) const
 {
    if (scale == 1.f)
    {
        return 0;
    }

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        for (int i=0; i<size; i++)
        {
            ptr[i] = ptr[i] * scale;
        }
    }

    return 0;
 }

--- a/src/layer/dropout.h
+++ b/src/layer/dropout.h
@@ -24,10 +24,14 @@ class Dropout : public Layer
 public:
    Dropout();

    virtual int load_param(const ParamDict& pd);

    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

    virtual int forward_inplace(Mat& bottom_top_blob) const;

 public:
    float scale;
 };

 } // namespace ncnn
--- a/src/layer/dropoutv2.cpp
+++ b/src/layer/dropoutv2.cpp
@@ -1,92 +0,0 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "dropoutv2.h"

 namespace ncnn {

 DEFINE_LAYER_CREATOR(DropoutV2)

 // Construct a DropoutV2 layer with both layer flags set.
 // NOTE(review): presumably these flags select the single-Mat forward()
 // overload and allow forward_inplace() to be used — confirm against Layer.
 DropoutV2::DropoutV2()
 {
    support_inplace = true;
    one_blob_only = true;
 }

 int DropoutV2::load_param(const ParamDict& pd)
 {
    scale = pd.get(0, 1.f);

    return 0;
 }

 // Write bottom_blob multiplied element-wise by `scale` into top_blob.
 //
 // When scale is exactly 1 the input is assigned to top_blob and no
 // arithmetic is performed. Otherwise top_blob is created as
 // (w, h, channels) and filled channel by channel.
 //
 // Returns 0 on success, -100 when top_blob is empty after create().
 int DropoutV2::forward(const Mat& bottom_blob, Mat& top_blob) const
 {
    // identity case: hand the input straight through
    if (scale == 1.f)
    {
        top_blob = bottom_blob;
        return 0;
    }

    const int width = bottom_blob.w;
    const int height = bottom_blob.h;
    const int channels = bottom_blob.c;
    const int plane = width * height; // elements processed per channel

    top_blob.create(width, height, channels);
    if (top_blob.empty())
        return -100;

    #pragma omp parallel for
    for (int c = 0; c < channels; c++)
    {
        const float* src = bottom_blob.channel(c);
        float* dst = top_blob.channel(c);

        for (int k = 0; k < plane; k++)
        {
            dst[k] = src[k] * scale;
        }
    }

    return 0;
 }

 // Multiply every element of bottom_top_blob by `scale`, in place.
 //
 // Nothing is touched when scale is exactly 1. Always returns 0.
 int DropoutV2::forward_inplace(Mat& bottom_top_blob) const
 {
    // identity case: the blob already holds the result
    if (scale == 1.f)
        return 0;

    const int channels = bottom_top_blob.c;
    const int plane = bottom_top_blob.w * bottom_top_blob.h; // elements per channel

    #pragma omp parallel for
    for (int c = 0; c < channels; c++)
    {
        float* data = bottom_top_blob.channel(c);

        for (int k = 0; k < plane; k++)
        {
            data[k] *= scale;
        }
    }

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/dropoutv2.h
+++ b/src/layer/dropoutv2.h
@@ -1,39 +0,0 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_DROPOUTV2_H
 #define LAYER_DROPOUTV2_H

 #include "layer.h"

 namespace ncnn {

 // Layer that multiplies its input by a constant factor `scale`.
 class DropoutV2 : public Layer
 {
 public:
    DropoutV2();

    // Reads the layer parameters from pd. Returns an int status code.
    virtual int load_param(const ParamDict& pd);

    // Out-of-place forward pass: reads bottom_blob and writes top_blob.
    // Returns an int status code.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

    // In-place forward pass: bottom_top_blob is both input and output.
    // Returns an int status code.
    virtual int forward_inplace(Mat& bottom_top_blob) const;

 public:
    // Factor applied to every element.
    float scale;
 };

 } // namespace ncnn

 #endif // LAYER_DROPOUTV2_H
--- a/src/layer/softmax.cpp
+++ b/src/layer/softmax.cpp
@@ -24,7 +24,14 @@ DEFINE_LAYER_CREATOR(Softmax)
 Softmax::Softmax()
 {
    one_blob_only = true;
    support_inplace = true;
    support_inplace = false;
 }

 int Softmax::load_param(const ParamDict& pd)
 {
    axis = pd.get(0, 0);

    return 0;
 }

 int Softmax::forward(const Mat& bottom_blob, Mat& top_blob) const
@@ -33,140 +40,264 @@ int Softmax::forward(const Mat& bottom_blob, Mat& top_blob) const
    // sum all value
    // value = value / sum

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int size = w * h;

    top_blob.create(w, h, channels);
    if (top_blob.empty())
        return -100;

    Mat max;
    max.create(w, h);
    if (max.empty())
        return -100;
    max.fill(-FLT_MAX);
    for (int q=0; q<channels; q++)
    if (axis == 0)
    {
        const float* ptr = bottom_blob.channel(q);
        float* maxptr = max;
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int channels = bottom_blob.c;
        int size = w * h;

        top_blob.create(w, h, channels);
        if (top_blob.empty())
            return -100;

        for (int i=0; i<size; i++)
        Mat max;
        max.create(w, h);
        if (max.empty())
            return -100;
        max.fill(-FLT_MAX);
        for (int q=0; q<channels; q++)
        {
            maxptr[i] = std::max(maxptr[i], ptr[i]);
        }
    }
            const float* ptr = bottom_blob.channel(q);
            float* maxptr = max;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        const float* ptr = bottom_blob.channel(q);
        float* outptr = top_blob.channel(q);
        float* maxptr = max;
            for (int i=0; i<size; i++)
            {
                maxptr[i] = std::max(maxptr[i], ptr[i]);
            }
        }

        for (int i=0; i<size; i++)
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            outptr[i] = exp(ptr[i] - maxptr[i]);
            const float* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);
            float* maxptr = max;

            for (int i=0; i<size; i++)
            {
                outptr[i] = exp(ptr[i] - maxptr[i]);
            }
        }
    }

    Mat sum;
    sum.create(w, h);
    if (sum.empty())
        return -100;
    sum.fill(0.f);
    for (int q=0; q<channels; q++)
    {
        const float* outptr = top_blob.channel(q);
        float* sumptr = sum;
        Mat sum;
        sum.create(w, h);
        if (sum.empty())
            return -100;
        sum.fill(0.f);
        for (int q=0; q<channels; q++)
        {
            const float* outptr = top_blob.channel(q);
            float* sumptr = sum;

        for (int i=0; i<size; i++)
            for (int i=0; i<size; i++)
            {
                sumptr[i] += outptr[i];
            }
        }

        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            sumptr[i] += outptr[i];
            float* outptr = top_blob.channel(q);
            float* sumptr = sum;

            for (int i=0; i<size; i++)
            {
                outptr[i] /= sumptr[i];
            }
        }
    }

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    }
    else if (axis == 1)
    {
        float* outptr = top_blob.channel(q);
        float* sumptr = sum;
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int channels = bottom_blob.c;
        int size = w * h;

        for (int i=0; i<size; i++)
        top_blob.create(w, h, channels);
        if (top_blob.empty())
            return -100;

        Mat max;
        max.create(h, channels);
        if (max.empty())
            return -100;
        max.fill(-FLT_MAX);
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            outptr[i] /= sumptr[i];
            const float* ptr = bottom_blob.channel(q);
            float* maxptr = max.row(q);

            for (int i=0; i<h; i++)
            {
                float max = -FLT_MAX;
                for (int j=0; j<w; j++)
                {
                    max = std::max(max, ptr[j]);
                }

                maxptr[i] = max;
                ptr += w;
            }
        }
    }

    return 0;
 }
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);
            float* maxptr = max.row(q);

 int Softmax::forward_inplace(Mat& bottom_top_blob) const
 {
    // value = exp( value - global max value )
    // sum all value
    // value = value / sum
            for (int i=0; i<h; i++)
            {
                float max = maxptr[i];
                for (int j=0; j<w; j++)
                {
                    outptr[j] = exp(ptr[j] - max);
                }

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    Mat max;
    max.create(w, h);
    if (max.empty())
        return -100;
    max.fill(-FLT_MAX);
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
        float* maxptr = max;
                ptr += w;
                outptr += w;
            }
        }

        for (int i=0; i<size; i++)
        Mat sum;
        sum.create(h, channels);
        if (sum.empty())
            return -100;
        sum.fill(0.f);
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            maxptr[i] = std::max(maxptr[i], ptr[i]);
        }
    }
            const float* outptr = top_blob.channel(q);
            float* sumptr = sum.row(q);

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
        float* maxptr = max;
            for (int i=0; i<h; i++)
            {
                float sum = 0.f;
                for (int j=0; j<w; j++)
                {
                    sum += outptr[j];
                }

                sumptr[i] = sum;
                outptr += w;
            }
        }

        for (int i=0; i<size; i++)
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            ptr[i] = exp(ptr[i] - maxptr[i]);
            float* outptr = top_blob.channel(q);
            float* sumptr = sum.row(q);

            for (int i=0; i<h; i++)
            {
                float sum = sumptr[i];
                for (int j=0; j<w; j++)
                {
                    outptr[j] /= sum;
                }

                outptr += w;
            }
        }
    }

    Mat sum;
    sum.create(w, h);
    if (sum.empty())
        return -100;
    sum.fill(0.f);
    for (int q=0; q<channels; q++)
    }
    else if (axis == 2)
    {
        const float* ptr = bottom_top_blob.channel(q);
        float* sumptr = sum;
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int channels = bottom_blob.c;
        int size = w * h;

        for (int i=0; i<size; i++)
        top_blob.create(w, h, channels);
        if (top_blob.empty())
            return -100;

        Mat max;
        max.create(w, channels);
        if (max.empty())
            return -100;
        max.fill(-FLT_MAX);
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            sumptr[i] += ptr[i];
            const float* ptr = bottom_blob.channel(q);
            float* maxptr = max.row(q);

            for (int i=0; i<h; i++)
            {
                for (int j=0; j<w; j++)
                {
                    maxptr[j] = std::max(maxptr[j], ptr[j]);
                }

                ptr += w;
            }
        }
    }

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
        float* sumptr = sum;
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);
            float* maxptr = max.row(q);

            for (int i=0; i<h; i++)
            {
                for (int j=0; j<w; j++)
                {
                    outptr[j] = exp(ptr[j] - maxptr[j]);
                }

                ptr += w;
                outptr += w;
            }
        }

        Mat sum;
        sum.create(w, channels);
        if (sum.empty())
            return -100;
        sum.fill(0.f);
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* outptr = top_blob.channel(q);
            float* sumptr = sum.row(q);

            for (int i=0; i<h; i++)
            {
                for (int j=0; j<w; j++)
                {
                    sumptr[j] += outptr[j];
                }

                outptr += w;
            }
        }

        for (int i=0; i<size; i++)
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            ptr[i] /= sumptr[i];
            float* outptr = top_blob.channel(q);
            float* sumptr = sum.row(q);

            for (int i=0; i<h; i++)
            {
                for (int j=0; j<w; j++)
                {
                    outptr[j] /= sumptr[j];
                }

                outptr += w;
            }
        }

    }

    return 0;
--- a/src/layer/softmax.h
+++ b/src/layer/softmax.h
@@ -24,11 +24,12 @@ class Softmax : public Layer
 public:
    Softmax();

    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
    virtual int load_param(const ParamDict& pd);

    virtual int forward_inplace(Mat& bottom_top_blob) const;
    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

 public:
    int axis;
 };

 } // namespace ncnn
--- a/src/layer/softmaxv2.cpp
+++ b/src/layer/softmaxv2.cpp
@@ -1,306 +0,0 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "softmaxv2.h"
 #include <float.h>
 #include <math.h>
 #include <algorithm>

 namespace ncnn {

 DEFINE_LAYER_CREATOR(SoftmaxV2)

 // Sets the layer capability flags for SoftmaxV2.
 SoftmaxV2::SoftmaxV2()
 {
    // forward() writes its result into a separately created top blob,
    // so in-place execution is not offered.
    support_inplace = false;

    // The layer is driven through the single-blob forward() overload.
    one_blob_only = true;
 }

 // Loads the softmax axis from the layer parameter dictionary.
 // Always returns 0.
 int SoftmaxV2::load_param(const ParamDict& pd)
 {
    // The axis is stored under parameter id 0; the second argument to get()
    // is presumably the value used when the id is absent - confirm against ParamDict.
    const int axis_param_id = 0;
    const int axis_fallback = 0;

    axis = pd.get(axis_param_id, axis_fallback);
    return 0;
 }

 // Computes a softmax of bottom_blob into top_blob along the configured axis.
 //
 // Within every reduction group the values are shifted by the group maximum
 // before exponentiation, then divided by the sum of the exponentials:
 //   axis == 0: one group per (x, y) position, spanning all channels
 //   axis == 1: one group per row of each channel, spanning the w values of that row
 //   axis == 2: one group per column of each channel, spanning the h values of that column
 // Any other axis value leaves top_blob untouched and still returns 0.
 //
 // NOTE(review): axis 1 reduces along w and axis 2 reduces along h. If the axes
 // are meant to follow (c, h, w) order these two look swapped - confirm with the
 // converter before relying on either.
 //
 // Returns 0 on success, or -100 when top_blob or a scratch Mat is empty after create().
 int SoftmaxV2::forward(const Mat& bottom_blob, Mat& top_blob) const
 {
    // value = exp( value - global max value )
    // sum all value
    // value = value / sum

    if (axis == 0)
    {
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int channels = bottom_blob.c;
        int size = w * h;

        top_blob.create(w, h, channels);
        if (top_blob.empty())
            return -100;

        // per-position maximum over all channels
        Mat max;
        max.create(w, h);
        if (max.empty())
            return -100;
        max.fill(-FLT_MAX);
        // not parallelized: every channel updates the same max buffer
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            float* maxptr = max;

            for (int i=0; i<size; i++)
            {
                maxptr[i] = std::max(maxptr[i], ptr[i]);
            }
        }

        // exponentiate the shifted values
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);
            float* maxptr = max;

            for (int i=0; i<size; i++)
            {
                outptr[i] = exp(ptr[i] - maxptr[i]);
            }
        }

        // per-position sum of the exponentials over all channels
        Mat sum;
        sum.create(w, h);
        if (sum.empty())
            return -100;
        sum.fill(0.f);
        // not parallelized: every channel accumulates into the same sum buffer
        for (int q=0; q<channels; q++)
        {
            const float* outptr = top_blob.channel(q);
            float* sumptr = sum;

            for (int i=0; i<size; i++)
            {
                sumptr[i] += outptr[i];
            }
        }

        // normalize
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            float* outptr = top_blob.channel(q);
            float* sumptr = sum;

            for (int i=0; i<size; i++)
            {
                outptr[i] /= sumptr[i];
            }
        }

    }
    else if (axis == 1)
    {
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int channels = bottom_blob.c;

        top_blob.create(w, h, channels);
        if (top_blob.empty())
            return -100;

        // one maximum per (channel, row): max.row(q)[i] covers row i of channel q
        Mat max;
        max.create(h, channels);
        if (max.empty())
            return -100;
        max.fill(-FLT_MAX);
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            float* maxptr = max.row(q);

            for (int i=0; i<h; i++)
            {
                float row_max = -FLT_MAX;
                for (int j=0; j<w; j++)
                {
                    row_max = std::max(row_max, ptr[j]);
                }

                maxptr[i] = row_max;
                ptr += w;
            }
        }

        // exponentiate the shifted values
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);
            float* maxptr = max.row(q);

            for (int i=0; i<h; i++)
            {
                const float row_max = maxptr[i];
                for (int j=0; j<w; j++)
                {
                    outptr[j] = exp(ptr[j] - row_max);
                }

                ptr += w;
                outptr += w;
            }
        }

        // one sum per (channel, row)
        Mat sum;
        sum.create(h, channels);
        if (sum.empty())
            return -100;
        sum.fill(0.f);
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* outptr = top_blob.channel(q);
            float* sumptr = sum.row(q);

            for (int i=0; i<h; i++)
            {
                float row_sum = 0.f;
                for (int j=0; j<w; j++)
                {
                    row_sum += outptr[j];
                }

                sumptr[i] = row_sum;
                outptr += w;
            }
        }

        // normalize
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            float* outptr = top_blob.channel(q);
            float* sumptr = sum.row(q);

            for (int i=0; i<h; i++)
            {
                const float row_sum = sumptr[i];
                for (int j=0; j<w; j++)
                {
                    outptr[j] /= row_sum;
                }

                outptr += w;
            }
        }

    }
    else if (axis == 2)
    {
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int channels = bottom_blob.c;

        top_blob.create(w, h, channels);
        if (top_blob.empty())
            return -100;

        // one maximum per (channel, column): max.row(q)[j] covers column j of channel q
        Mat max;
        max.create(w, channels);
        if (max.empty())
            return -100;
        max.fill(-FLT_MAX);
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            float* maxptr = max.row(q);

            for (int i=0; i<h; i++)
            {
                for (int j=0; j<w; j++)
                {
                    maxptr[j] = std::max(maxptr[j], ptr[j]);
                }

                ptr += w;
            }
        }

        // exponentiate the shifted values
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);
            float* maxptr = max.row(q);

            for (int i=0; i<h; i++)
            {
                for (int j=0; j<w; j++)
                {
                    outptr[j] = exp(ptr[j] - maxptr[j]);
                }

                ptr += w;
                outptr += w;
            }
        }

        // one sum per (channel, column)
        Mat sum;
        sum.create(w, channels);
        if (sum.empty())
            return -100;
        sum.fill(0.f);
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* outptr = top_blob.channel(q);
            float* sumptr = sum.row(q);

            for (int i=0; i<h; i++)
            {
                for (int j=0; j<w; j++)
                {
                    sumptr[j] += outptr[j];
                }

                outptr += w;
            }
        }

        // normalize
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            float* outptr = top_blob.channel(q);
            float* sumptr = sum.row(q);

            for (int i=0; i<h; i++)
            {
                for (int j=0; j<w; j++)
                {
                    outptr[j] /= sumptr[j];
                }

                outptr += w;
            }
        }

    }

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/softmaxv2.h
+++ b/src/layer/softmaxv2.h
@@ -1,37 +0,0 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_SOFTMAXV2_H
 #define LAYER_SOFTMAXV2_H

 #include "layer.h"

 namespace ncnn {

 // Softmax layer variant that carries an explicit reduction axis.
 class SoftmaxV2 : public Layer
 {
 public:
    SoftmaxV2();

    // Reads the layer parameters (the axis) from pd; returns an int status code.
    virtual int load_param(const ParamDict& pd);

    // Computes the softmax of bottom_blob into top_blob; returns an int status code.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;

 public:
    // Selects which dimension the softmax is normalized over; the
    // implementation handles the values 0, 1 and 2.
    int axis;
 };

 } // namespace ncnn

 #endif // LAYER_SOFTMAXV2_H
--- a/tools/caffe2ncnn.cpp
+++ b/tools/caffe2ncnn.cpp
@@ -324,15 +324,7 @@ int main(int argc, char** argv)

        // layer definition line, repeated
        // [type] [name] [bottom blob count] [top blob count] [bottom blobs] [top blobs] [layer specific params]
        if (layer.type() == "Concat")
        {
            const caffe::ConcatParameter& concat_param = layer.concat_param();
            if (concat_param.axis() != 1)
                fprintf(pp, "%-16s", "ConcatV2");
            else
                fprintf(pp, "%-16s", "Concat");
        }
        else if (layer.type() == "Convolution")
        if (layer.type() == "Convolution")
        {
            const caffe::ConvolutionParameter& convolution_param = layer.convolution_param();
            if (convolution_param.group() != 1)
@@ -340,14 +332,6 @@ int main(int argc, char** argv)
            else
                fprintf(pp, "%-16s", "Convolution");
        }
        else if (layer.type() == "Dropout")
        {
            const caffe::DropoutParameter& dropout_param = layer.dropout_param();
            if (!dropout_param.scale_train())
                fprintf(pp, "%-16s", "DropoutV2");
            else
                fprintf(pp, "%-16s", "Dropout");
        }
        else if (layer.type() == "Python")
        {
            const caffe::PythonParameter& python_param = layer.python_param();
@@ -357,14 +341,6 @@ int main(int argc, char** argv)
            else
                fprintf(pp, "%-16s", python_layer_name.c_str());
        }
        else if (layer.type() == "Softmax")
        {
            const caffe::SoftmaxParameter& softmax_param = layer.softmax_param();
            if (softmax_param.axis() != 1)
                fprintf(pp, "%-16s", "SoftmaxV2");
            else
                fprintf(pp, "%-16s", "Softmax");
        }
        else
        {
            fprintf(pp, "%-16s", layer.type().c_str());
@@ -467,11 +443,8 @@ int main(int argc, char** argv)
        else if (layer.type() == "Concat")
        {
            const caffe::ConcatParameter& concat_param = layer.concat_param();
            if (concat_param.axis() != 1)
            {
                int dim = concat_param.axis() >= 1 ? concat_param.axis() - 1 : 0;
                fprintf(pp, " 0=%d", dim);
            }
            int dim = concat_param.axis() - 1;
            fprintf(pp, " 0=%d", dim);
        }
        else if (layer.type() == "Convolution")
        {
@@ -604,11 +577,8 @@ int main(int argc, char** argv)
        else if (layer.type() == "Dropout")
        {
            const caffe::DropoutParameter& dropout_param = layer.dropout_param();
            if (!dropout_param.scale_train())
            {
                float scale = 1.f - dropout_param.dropout_ratio();
                fprintf(pp, " 0=%f", scale);
            }
            float scale = 1.f - dropout_param.dropout_ratio();
            fprintf(pp, " 0=%f", scale);
        }
        else if (layer.type() == "Eltwise")
        {
@@ -1013,11 +983,8 @@ int main(int argc, char** argv)
        else if (layer.type() == "Softmax")
        {
            const caffe::SoftmaxParameter& softmax_param = layer.softmax_param();
            if (softmax_param.axis() != 1)
            {
                int dim = softmax_param.axis() >= 1 ? softmax_param.axis() - 1 : 0;
                fprintf(pp, " 0=%d", dim);
            }
            int dim = softmax_param.axis() - 1;
            fprintf(pp, " 0=%d", dim);
        }
        else if (layer.type() == "Threshold")
        {