Add element type storage support to Mat; move the data member to be the first member so that a pointer to a Mat is also a pointer to its data; add convenient index access for float vectors

8 years ago · a84ba8fc0f
--- a/examples/fasterrcnn.cpp
+++ b/examples/fasterrcnn.cpp
@@ -142,9 +142,9 @@ static int detect_fasterrcnn(const cv::Mat& bgr, std::vector<Object>& objects)
    in.substract_mean_normalize(mean_vals, 0);

    ncnn::Mat im_info(3);
    im_info.data[0] = h;
    im_info.data[1] = w;
    im_info.data[2] = scale;
    im_info[0] = h;
    im_info[1] = w;
    im_info[2] = scale;

    // step1, extract feature and all rois
    ncnn::Extractor ex1 = fasterrcnn.create_extractor();
@@ -182,7 +182,7 @@ static int detect_fasterrcnn(const cv::Mat& bgr, std::vector<Object>& objects)
        float score = 0.f;
        for (int i=0; i<num_class; i++)
        {
            float class_score = cls_prob.channel(i).data[0];
            float class_score = cls_prob.channel(i)[0];
            if (class_score > score)
            {
                label = i;
@@ -197,10 +197,10 @@ static int detect_fasterrcnn(const cv::Mat& bgr, std::vector<Object>& objects)
 //         fprintf(stderr, "%d = %f\n", label, score);

        // unscale to image size
        float x1 = roi.data[0] / scale;
        float y1 = roi.data[1] / scale;
        float x2 = roi.data[2] / scale;
        float y2 = roi.data[3] / scale;
        float x1 = roi[0] / scale;
        float y1 = roi[1] / scale;
        float x2 = roi[2] / scale;
        float y2 = roi[3] / scale;

        float pb_w = x2 - x1 + 1;
        float pb_h = y2 - y1 + 1;
--- a/examples/squeezenet.cpp
+++ b/examples/squeezenet.cpp
@@ -42,7 +42,7 @@ static int detect_squeezenet(const cv::Mat& bgr, std::vector<float>& cls_scores)
    cls_scores.resize(out.c);
    for (int j=0; j<out.c; j++)
    {
        const float* prob = out.data + out.cstep * j;
        const float* prob = out.channel(j);
        cls_scores[j] = prob[0];
    }

--- a/src/layer/arm/convolution_3x3.h
+++ b/src/layer/arm/convolution_3x3.h
@@ -519,7 +519,7 @@ static void conv3x3s1_winograd64_transform_kernel_neon(const Mat& kernel, Mat& k
    {
        for (int q = 0; q<inch; q++)
        {
            const float* kernel0 = kernel.data + p*inch * 9 + q * 9;
            const float* kernel0 = (const float*)kernel + p*inch * 9 + q * 9;
            float* kernel_tm0 = kernel_tm.channel(p).row(q);

            // transform kernel, transposed
--- a/src/layer/arm/convolutiondepthwise_arm.cpp
+++ b/src/layer/arm/convolutiondepthwise_arm.cpp
@@ -169,10 +169,10 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
        {
            Mat bottom_blob_bordered_g = bottom_blob_bordered.channel(g);
            Mat top_blob_g = top_blob.channel(g);
            Mat weight_data_g(maxk, (float*)(weight_data + maxk * g));
            Mat weight_data_g(maxk, (void*)((const float*)weight_data + maxk * g));
            Mat bias_data_g;
            if (bias_term)
                bias_data_g = Mat(1, (float*)(bias_data + g));
                bias_data_g = Mat(1, (void*)((const float*)bias_data + g));

            conv(bottom_blob_bordered_g, top_blob_g, weight_data_g, bias_data_g);
        }
@@ -190,10 +190,10 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
    {
        Mat bottom_blob_bordered_g(w, h, channels_g, bottom_blob_bordered.channel(channels_g * g));
        Mat top_blob_g(outw, outh, num_output_g, top_blob.channel(num_output_g * g));
        Mat weight_data_g(maxk * channels_g * num_output_g, (float*)(weight_data + maxk * channels_g * num_output_g * g));
        Mat weight_data_g(maxk * channels_g * num_output_g, (void*)((const float*)weight_data + maxk * channels_g * num_output_g * g));
        Mat bias_data_g;
        if (bias_term)
            bias_data_g = Mat(num_output_g, (float*)(bias_data + num_output_g * g));
            bias_data_g = Mat(num_output_g, (void*)((const float*)bias_data + num_output_g * g));

        conv(bottom_blob_bordered_g, top_blob_g, weight_data_g, bias_data_g);
    }
--- a/src/layer/arm/deconvolution_3x3.h
+++ b/src/layer/arm/deconvolution_3x3.h
@@ -57,7 +57,7 @@ static void deconv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _

            for (int i = 0; i < h; i++)
            {
                float* outptr = out.data + out.w * i;
                float* outptr = out.row(i);

                float* outptr0 = outptr;
                float* outptr1 = outptr + outw;
@@ -278,7 +278,7 @@ static void deconv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _

            for (int i = 0; i < h; i++)
            {
                float* outptr = out.data + outw * i*2;
                float* outptr = out.row(i*2);

                float* outptr0 = outptr;
                float* outptr1 = outptr0 + outw;
--- a/src/layer/arm/deconvolution_4x4.h
+++ b/src/layer/arm/deconvolution_4x4.h
@@ -59,7 +59,7 @@ static void deconv4x4s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _

            for (int i = 0; i < h; i++)
            {
                float* outptr = out.data + out.w * i;
                float* outptr = out.row(i);

                float* outptr0 = outptr;
                float* outptr1 = outptr0 + outw;
@@ -228,7 +228,7 @@ static void deconv4x4s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _

            for (int i = 0; i < h; i++)
            {
                float* outptr = out.data + out.w * i*2;
                float* outptr = out.row(i*2);

                float* outptr0 = outptr;
                float* outptr1 = outptr0 + outw;
--- a/src/layer/arm/deconvolutiondepthwise_arm.cpp
+++ b/src/layer/arm/deconvolutiondepthwise_arm.cpp
@@ -90,11 +90,11 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) c
        {
            Mat top_blob_bordered_g = top_blob_bordered.channel(g);
            Mat bottom_blob_g = bottom_blob.channel(g);
            Mat weight_data_g(maxk, (float*)(weight_data + maxk * g));
            Mat weight_data_g(maxk, (void*)((const float*)weight_data + maxk * g));

            Mat bias_data_g;
            if (bias_term)
                bias_data_g = Mat(1, (float*)(bias_data + g));
                bias_data_g = Mat(1, (void*)((const float*)bias_data + g));

            deconv(bottom_blob_g, top_blob_bordered_g, weight_data_g, bias_data_g);
        }
@@ -110,10 +110,10 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) c
        {
            Mat top_blob_bordered_g(outw, outh, num_output_g, top_blob_bordered.channel(num_output_g * g));
            Mat bottom_blob_g(w, h, channels_g, bottom_blob.channel(channels_g * g).data);
            Mat weight_data_g(maxk * channels_g * num_output_g, (float*)(weight_data + maxk * channels_g * num_output_g * g));
            Mat weight_data_g(maxk * channels_g * num_output_g, (void*)((const float*)weight_data + maxk * channels_g * num_output_g * g));
            Mat bias_data_g;
            if (bias_term)
                bias_data_g = Mat(num_output_g, (float*)(bias_data + num_output_g * g));
                bias_data_g = Mat(num_output_g, (void*)((const float*)bias_data + num_output_g * g));

            deconv(bottom_blob_g, top_blob_bordered_g, weight_data_g, bias_data_g);
        }
--- a/src/layer/arm/innerproduct_arm.cpp
+++ b/src/layer/arm/innerproduct_arm.cpp
@@ -44,7 +44,7 @@ int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
            float sum = 0.f;

            if (bias_term)
                sum = bias_data.data[p];
                sum = bias_data[p];

            const float* w = weight_data_ptr + channels * p;

@@ -73,7 +73,7 @@ int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
        float sum = 0.f;

        if (bias_term)
            sum = bias_data.data[p];
            sum = bias_data[p];

        const float* w = weight_data_ptr + size * channels * p;

--- a/src/layer/arm/slice_arm.cpp
+++ b/src/layer/arm/slice_arm.cpp
@@ -29,7 +29,7 @@ int Slice_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& t
    int channels = bottom_blob.c;

    int q = 0;
    const int* slices_ptr = (const int*)slices.data;
    const int* slices_ptr = slices;
    for (size_t i=0; i<top_blobs.size(); i++)
    {
        int slice = slices_ptr[i];
@@ -46,7 +46,7 @@ int Slice_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& t
        int size = bottom_blob.cstep * slice;

        const float* ptr = bottom_blob.channel(q);
        float* outptr = top_blob.data;
        float* outptr = top_blob;

 #if __ARM_NEON
        int nn = size >> 3;
--- a/src/layer/batchnorm.cpp
+++ b/src/layer/batchnorm.cpp
@@ -56,17 +56,12 @@ int BatchNorm::load_model(const ModelBin& mb)
    b_data.create(channels);
    if (b_data.empty())
        return -100;
    const float* slope_data_ptr = slope_data;
    const float* mean_data_ptr = mean_data;
    const float* var_data_ptr = var_data;
    const float* bias_data_ptr = bias_data;
    float* a_data_ptr = a_data;
    float* b_data_ptr = b_data;

    for (int i=0; i<channels; i++)
    {
        float sqrt_var = sqrt(var_data_ptr[i]);
        a_data_ptr[i] = bias_data_ptr[i] - slope_data_ptr[i] * mean_data_ptr[i] / sqrt_var;
        b_data_ptr[i] = slope_data_ptr[i] / sqrt_var;
        float sqrt_var = sqrt(var_data[i]);
        a_data[i] = bias_data[i] - slope_data[i] * mean_data[i] / sqrt_var;
        b_data[i] = slope_data[i] / sqrt_var;
    }

    return 0;
@@ -82,15 +77,13 @@ int BatchNorm::forward_inplace(Mat& bottom_top_blob) const
    int h = bottom_top_blob.h;
    int size = w * h;

    const float* a_data_ptr = a_data;
    const float* b_data_ptr = b_data;
    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        float a = a_data_ptr[q];
        float b = b_data_ptr[q];
        float a = a_data[q];
        float b = b_data[q];

        for (int i=0; i<size; i++)
        {
--- a/src/layer/bias.cpp
+++ b/src/layer/bias.cpp
@@ -47,13 +47,12 @@ int Bias::forward_inplace(Mat& bottom_top_blob) const
    int channels = bottom_top_blob.c;
    int size = w * h;

    const float* bias_ptr = bias_data;
    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        float bias = bias_ptr[q];
        float bias = bias_data[q];

        for (int i=0; i<size; i++)
        {
--- a/src/layer/binaryop.cpp
+++ b/src/layer/binaryop.cpp
@@ -79,7 +79,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
            for (int q=0; q<channels; q++)
            {
                const float* ptr = a.channel(q);
                const float* ptr1 = b.data + h * q;
                const float* ptr1 = (const float*)b + h * q;
                float* outptr = c.channel(q);

                for (int y=0; y<h; y++)
@@ -102,7 +102,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
        {
            if (b.w == 1)
            {
                const float b0 = b.data[0];
                const float b0 = b[0];
                #pragma omp parallel for
                for (int q=0; q<channels; q++)
                {
@@ -125,7 +125,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
            for (int q=0; q<channels; q++)
            {
                const float* ptr = a.channel(q);
                const float b0 = b.data[q];
                const float b0 = b[q];
                float* outptr = c.channel(q);

                for (int i=0; i<size; i++)
@@ -151,7 +151,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
            #pragma omp parallel for
            for (int q=0; q<channels1; q++)
            {
                const float* ptr = a.data + h1 * q;
                const float* ptr = (const float*)a + h1 * q;
                const float* ptr1 = b.channel(q);
                float* outptr = c.channel(q);

@@ -177,13 +177,9 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)

        if (b.dims == 2)
        {
            const float* ptr = a;
            const float* ptr1 = b;
            float* outptr = c;

            for (int i=0; i<size; i++)
            {
                outptr[i] = op(ptr[i], ptr1[i]);
                c[i] = op(a[i], b[i]);
            }

            return 0;
@@ -197,25 +193,21 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)

            if (b.w == 1)
            {
                const float* ptr = a;
                const float b0 = b.data[0];
                float* outptr = c;

                const float b0 = b[0];
                for (int i=0; i<size; i++)
                {
                    outptr[i] = op(ptr[i], b0);
                    c[i] = op(a[i], b0);
                }

                return 0;
            }

            const float* ptr = a;
            const float* ptr1 = b;
            float* outptr = c;

            for (int y=0; y<h; y++)
            {
                const float b0 = ptr1[y];
                const float b0 = b[y];
                for (int x=0; x<w; x++)
                {
                    outptr[x] = op(ptr[x], b0);
@@ -238,7 +230,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
                if (c.empty())
                    return -100;

                const float a0 = a.data[0];
                const float a0 = a[0];
                #pragma omp parallel for
                for (int q=0; q<channels1; q++)
                {
@@ -263,13 +255,10 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
                if (c.empty())
                    return -100;

                const float a0 = a.data[0];
                const float* ptr1 = b;
                float* outptr = c;

                const float a0 = a[0];
                for (int i=0; i<size1; i++)
                {
                    outptr[i] = op(a0, ptr1[i]);
                    c[i] = op(a0, b[i]);
                }

                return 0;
@@ -281,13 +270,10 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
                if (c.empty())
                    return -100;

                const float a0 = a.data[0];
                const float* ptr1 = b;
                float* outptr = c;

                const float a0 = a[0];
                for (int i=0; i<size1; i++)
                {
                    outptr[i] = op(a0, ptr1[i]);
                    c[i] = op(a0, b[i]);
                }

                return 0;
@@ -303,7 +289,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
            #pragma omp parallel for
            for (int q=0; q<channels1; q++)
            {
                const float a0 = a.data[q];
                const float a0 = a[q];
                const float* ptr1 = b.channel(q);
                float* outptr = c.channel(q);

@@ -325,13 +311,12 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
            if (c.empty())
                return -100;

            const float* ptr = a;
            const float* ptr1 = b;
            float* outptr = c;

            for (int y=0; y<h1; y++)
            {
                const float a0 = ptr[y];
                const float a0 = a[y];
                for (int x=0; x<w1; x++)
                {
                    outptr[x] = op(a0, ptr1[x]);
@@ -352,25 +337,18 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)

            if (b.w == 1)
            {
                const float* ptr = a;
                const float b0 = b.data[0];
                float* outptr = c;

                const float b0 = b[0];
                for (int i=0; i<size; i++)
                {
                    outptr[i] = op(ptr[i], b0);
                    c[i] = op(a[i], b0);
                }

                return 0;
            }

            const float* ptr = a;
            const float* ptr1 = b;
            float* outptr = c;

            for (int i=0; i<size; i++)
            {
                outptr[i] = op(ptr[i], ptr1[i]);
                c[i] = op(a[i], b[i]);
            }
        }
    }
--- a/src/layer/convolution.cpp
+++ b/src/layer/convolution.cpp
@@ -125,7 +125,6 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const
    }

    // num_output
    const float* weight_data_ptr = weight_data;
    #pragma omp parallel for
    for (int p=0; p<num_output; p++)
    {
@@ -138,9 +137,9 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const
                float sum = 0.f;

                if (bias_term)
                    sum = bias_data.data[p];
                    sum = bias_data[p];

                const float* kptr = weight_data_ptr + maxk * channels * p;
                const float* kptr = (const float*)weight_data + maxk * channels * p;

                // channels
                for (int q=0; q<channels; q++)
--- a/src/layer/convolutiondepthwise.cpp
+++ b/src/layer/convolutiondepthwise.cpp
@@ -118,7 +118,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
        for (int g=0; g<group; g++)
        {
            float* outptr = top_blob.channel(g);
            const float* kptr = weight_data + maxk * g;
            const float* kptr = (const float*)weight_data + maxk * g;
            const Mat m = bottom_blob_bordered.channel(g);

            for (int i = 0; i < outh; i++)
@@ -128,7 +128,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
                    float sum = 0.f;

                    if (bias_term)
                        sum = bias_data.data[g];
                        sum = bias_data[g];

                    const float* sptr = m.row(i*stride_h) + j*stride_w;

@@ -158,7 +158,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
        for (int p=0; p<num_output_g; p++)
        {
            float* outptr = top_blob.channel(g * num_output_g + p);
            const float* weight_data_ptr = weight_data + maxk * channels_g * num_output_g * g;
            const float* weight_data_ptr = (const float*)weight_data + maxk * channels_g * num_output_g * g;

            for (int i = 0; i < outh; i++)
            {
@@ -167,7 +167,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
                    float sum = 0.f;

                    if (bias_term)
                        sum = bias_data.data[num_output_g * g + p];
                        sum = bias_data[num_output_g * g + p];

                    const float* kptr = weight_data_ptr + maxk * channels_g * p;

--- a/src/layer/deconvolution.cpp
+++ b/src/layer/deconvolution.cpp
@@ -101,13 +101,12 @@ int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const
    }

    // num_output
    const float* weight_data_ptr = weight_data;
    #pragma omp parallel for
    for (int p=0; p<num_output; p++)
    {
        Mat out = top_blob_bordered.channel(p);

        const float bias = bias_term ? bias_data.data[p] : 0.f;
        const float bias = bias_term ? bias_data[p] : 0.f;

        out.fill(bias);

@@ -117,13 +116,13 @@ int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const
            {
                float* outptr = out.row(i*stride_h) + j*stride_w;

                const float* kptr = weight_data_ptr + maxk * channels * p;
                const float* kptr = (const float*)weight_data + maxk * channels * p;

                // channels
                for (int q=0; q<channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    float val = *(m.data + m.w * i + j);
                    float val = *(m.row(i) + j);

                    for (int k = 0; k < maxk; k++)
                    {
--- a/src/layer/deconvolutiondepthwise.cpp
+++ b/src/layer/deconvolutiondepthwise.cpp
@@ -92,10 +92,10 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
        for (int g=0; g<group; g++)
        {
            const float* inptr = bottom_blob.channel(g);
            const float* kptr = weight_data + maxk * g;
            const float* kptr = (const float*)weight_data + maxk * g;
            Mat m = top_blob_bordered.channel(g);

            const float bias = bias_term ? bias_data.data[g] : 0.f;
            const float bias = bias_term ? bias_data[g] : 0.f;

            m.fill(bias);

@@ -124,12 +124,12 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
        #pragma omp parallel for
        for (int g = 0; g < group; g++)
        {
            const float* weight_data_ptr = weight_data + maxk * channels_g * num_output_g * g;
            const float* weight_data_ptr = (const float*)weight_data + maxk * channels_g * num_output_g * g;
            for (int p = 0; p < num_output_g; p++)
            {
                Mat out = top_blob_bordered.channel(g * num_output_g + p);

                const float bias = bias_term ? bias_data.data[g * num_output_g + p] : 0.f;
                const float bias = bias_term ? bias_data[g * num_output_g + p] : 0.f;

                out.fill(bias);

@@ -145,7 +145,7 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
                        for (int q = 0; q < channels_g; q++)
                        {
                            const Mat m = bottom_blob.channel(channels_g * g + q);
                            float val = *(m.data + w * i + j);
                            float val = *(m.row(i) + j);

                            for (int k = 0; k < maxk; k++)
                            {
--- a/src/layer/detectionoutput.cpp
+++ b/src/layer/detectionoutput.cpp
@@ -201,7 +201,7 @@ int DetectionOutput::forward(const std::vector<Mat>& bottom_blobs, std::vector<M

        for (int j = 0; j < num_prior; j++)
        {
            float score = confidence.data[j * num_class + i];
            float score = confidence[j * num_class + i];

            if (score > confidence_threshold)
            {
--- a/src/layer/eltwise.cpp
+++ b/src/layer/eltwise.cpp
@@ -114,12 +114,10 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
        }
        else
        {
            const float* coeffs_ptr = coeffs;

            // first blob
            const Mat& bottom_blob1 = bottom_blobs[1];
            float coeff0 = coeffs_ptr[0];
            float coeff1 = coeffs_ptr[1];
            float coeff0 = coeffs[0];
            float coeff1 = coeffs[1];
            #pragma omp parallel for
            for (int q=0; q<channels; q++)
            {
@@ -136,7 +134,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
            for (size_t b=2; b<bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob1 = bottom_blobs[b];
                float coeff = coeffs_ptr[b];
                float coeff = coeffs[b];
                #pragma omp parallel for
                for (int q=0; q<channels; q++)
                {
--- a/src/layer/embed.cpp
+++ b/src/layer/embed.cpp
@@ -65,7 +65,7 @@ int Embed::forward(const Mat& bottom_blob, Mat& top_blob) const
    #pragma omp parallel for
    for (int q=0; q<words; q++)
    {
        float* outptr = top_blob.data + top_blob.w * q;
        float* outptr = (float*)top_blob + top_blob.w * q;

        int word_index = (int)word_ptr[q];

@@ -79,7 +79,7 @@ int Embed::forward(const Mat& bottom_blob, Mat& top_blob) const
        {
            for (int p=0; p<num_output; p++)
            {
                outptr[p] += bias_data.data[p];
                outptr[p] += bias_data[p];
            }
        }
    }
--- a/src/layer/flatten.cpp
+++ b/src/layer/flatten.cpp
@@ -39,7 +39,7 @@ int Flatten::forward(const Mat& bottom_blob, Mat& top_blob) const
    for (int q=0; q<channels; q++)
    {
        const float* ptr = bottom_blob.channel(q);
        float* outptr = top_blob.data + size * q;
        float* outptr = (float*)top_blob + size * q;

        for (int i=0; i<size; i++)
        {
--- a/src/layer/innerproduct.cpp
+++ b/src/layer/innerproduct.cpp
@@ -61,7 +61,6 @@ int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob) const
        return -100;

    // num_output
    const float* weight_data_ptr = weight_data;
    #pragma omp parallel for
    for (int p=0; p<num_output; p++)
    {
@@ -69,12 +68,12 @@ int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob) const
        float sum = 0.f;

        if (bias_term)
            sum = bias_data.data[p];
            sum = bias_data[p];

        // channels
        for (int q=0; q<channels; q++)
        {
            const float* w = weight_data_ptr + size * channels * p + size * q;
            const float* w = (const float*)weight_data + size * channels * p + size * q;
            const float* m = bottom_blob.channel(q);

            for (int i = 0; i < size; i++)
--- a/src/layer/lstm.cpp
+++ b/src/layer/lstm.cpp
@@ -97,7 +97,7 @@ int LSTM::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
        //                0       otherwise
        // calculate hidden
        // gate_input_t := W_hc * h_conted_{t-1} + W_xc * x_t + b_c
        const float cont = cont_blob.data[t];
        const float cont = cont_blob[t];
        const Mat x = input_blob.channel(t);
        float* hidden_data = hidden;
        for (int q=0; q<num_output; q++)
@@ -105,18 +105,18 @@ int LSTM::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
            float h_cont = cont ? hidden_data[q] : 0.f;

            const float* x_data = x;
            const float* bias_c_data_ptr = bias_c_data.data + 4 * q;
            float* gates_data = gates.data + 4 * q;
            const float* bias_c_data_ptr = (const float*)bias_c_data + 4 * q;
            float* gates_data = (float*)gates + 4 * q;

            // gate I F O G
            const float* weight_hc_data_I = weight_hc_data.data + weight_hc_data.w * q;
            const float* weight_xc_data_I = weight_xc_data.data + weight_xc_data.w * q;
            const float* weight_hc_data_F = weight_hc_data.data + weight_hc_data.w * q + size;
            const float* weight_xc_data_F = weight_xc_data.data + weight_xc_data.w * q + size;
            const float* weight_hc_data_O = weight_hc_data.data + weight_hc_data.w * q + size*2;
            const float* weight_xc_data_O = weight_xc_data.data + weight_xc_data.w * q + size*2;
            const float* weight_hc_data_G = weight_hc_data.data + weight_hc_data.w * q + size*3;
            const float* weight_xc_data_G = weight_xc_data.data + weight_xc_data.w * q + size*3;
            const float* weight_hc_data_I = (const float*)weight_hc_data + weight_hc_data.w * q;
            const float* weight_xc_data_I = (const float*)weight_xc_data + weight_xc_data.w * q;
            const float* weight_hc_data_F = (const float*)weight_hc_data + weight_hc_data.w * q + size;
            const float* weight_xc_data_F = (const float*)weight_xc_data + weight_xc_data.w * q + size;
            const float* weight_hc_data_O = (const float*)weight_hc_data + weight_hc_data.w * q + size*2;
            const float* weight_xc_data_O = (const float*)weight_xc_data + weight_xc_data.w * q + size*2;
            const float* weight_hc_data_G = (const float*)weight_hc_data + weight_hc_data.w * q + size*3;
            const float* weight_xc_data_G = (const float*)weight_xc_data + weight_xc_data.w * q + size*3;

            float I = bias_c_data_ptr[0];
            float F = bias_c_data_ptr[1];
@@ -148,7 +148,7 @@ int LSTM::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
        float* output_data = output;
        for (int q=0; q<num_output; q++)
        {
            float* gates_data = gates.data + 4 * q;
            float* gates_data = (float*)gates + 4 * q;

            float I = gates_data[0];
            float F = gates_data[1];
--- a/src/layer/mvn.cpp
+++ b/src/layer/mvn.cpp
@@ -49,20 +49,19 @@ int MVN::forward(const Mat& bottom_blob, Mat& top_blob) const
    Mat sum(channels);
    if (sum.empty())
        return -100;
    float* sum_ptr = sum;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        const float* ptr = bottom_blob.channel(q);

        float sum = 0.f;
        float s = 0.f;
        for (int i=0; i<size; i++)
        {
            sum += ptr[i];
            s += ptr[i];
        }

        sum_ptr[q] = sum;
        sum[q] = s;
    }

    if (across_channels)
@@ -71,7 +70,7 @@ int MVN::forward(const Mat& bottom_blob, Mat& top_blob) const
        float mean = 0.f;
        for (int q=0; q<channels; q++)
        {
            mean += sum_ptr[q];
            mean += sum[q];
        }
        mean = mean / (channels * size);

@@ -96,7 +95,7 @@ int MVN::forward(const Mat& bottom_blob, Mat& top_blob) const
        {
            const float* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);
            float mean = sum_ptr[q] / size;
            float mean = sum[q] / size;

            for (int i=0; i<size; i++)
            {
@@ -111,20 +110,19 @@ int MVN::forward(const Mat& bottom_blob, Mat& top_blob) const
        Mat sqsum(channels);
        if (sqsum.empty())
            return -100;
        float* sqsum_ptr = sqsum;

        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = top_blob.channel(q);

            float sum = 0.f;
            float s = 0.f;
            for (int i=0; i<size; i++)
            {
                sum += ptr[i] * ptr[i];
                s += ptr[i] * ptr[i];
            }

            sqsum_ptr[q] = sum;
            sqsum[q] = s;
        }

        if (across_channels)
@@ -133,7 +131,7 @@ int MVN::forward(const Mat& bottom_blob, Mat& top_blob) const
            float sqmean = 0.f;
            for (int q=0; q<channels; q++)
            {
                sqmean += sqsum_ptr[q];
                sqmean += sqsum[q];
            }
            sqmean = sqmean / (channels * size);

@@ -160,7 +158,7 @@ int MVN::forward(const Mat& bottom_blob, Mat& top_blob) const
            for (int q=0; q<channels; q++)
            {
                float* outptr = top_blob.channel(q);
                float sqmean = sqsum_ptr[q] / size;
                float sqmean = sqsum[q] / size;
                float norm_var = sqrt(sqmean) + eps;
                float norm_var_inv = 1.f / norm_var;

--- a/src/layer/normalize.cpp
+++ b/src/layer/normalize.cpp
@@ -63,7 +63,6 @@ int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const
        if (square_sum_blob.empty())
            return -100;

        float* square_sum_ptr = square_sum_blob;
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
@@ -75,14 +74,14 @@ int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const
                ssum += ptr[i] * ptr[i];
            }

            square_sum_ptr[q] = ssum;
            square_sum_blob[q] = ssum;
        }

        // sum + eps
        float ssum = eps;
        for (int q=0; q<channels; q++)
        {
            ssum += square_sum_ptr[q];
            ssum += square_sum_blob[q];
        }

        // 1 / sqrt(ssum)
@@ -90,7 +89,7 @@ int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const

        if (channel_shared)
        {
            float scale = a * scale_data.data[0];
            float scale = a * scale_data[0];

            #pragma omp parallel for
            for (int q=0; q<channels; q++)
@@ -111,7 +110,7 @@ int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const
            {
                const float* ptr = bottom_blob.channel(q);
                float* outptr = top_blob.channel(q);
                float scale = a * scale_data.data[q];
                float scale = a * scale_data[q];

                for (int i=0; i<size; i++)
                {
@@ -124,15 +123,13 @@ int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const
    {
        // square sum, 1 / sqrt(ssum)
        Mat square_sum_blob;
        square_sum_blob.create(w, h);
        square_sum_blob.create(size);
        if (square_sum_blob.empty())
            return -100;

        float* ssptr = square_sum_blob;

        if (channel_shared)
        {
            float scale = scale_data.data[0];
            float scale = scale_data[0];

            #pragma omp parallel for
            for (int i=0; i<size; i++)
@@ -144,7 +141,7 @@ int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const
                    ssum += ptr[i] * ptr[i];
                }

                ssptr[i] = 1.f / sqrt(ssum) * scale;
                square_sum_blob[i] = 1.f / sqrt(ssum) * scale;
            }

            #pragma omp parallel for
@@ -155,7 +152,7 @@ int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const

                for (int i=0; i<size; i++)
                {
                    outptr[i] = ptr[i] * ssptr[i];
                    outptr[i] = ptr[i] * square_sum_blob[i];
                }
            }
        }
@@ -171,7 +168,7 @@ int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const
                    ssum += ptr[i] * ptr[i];
                }

                ssptr[i] = 1.f / sqrt(ssum);
                square_sum_blob[i] = 1.f / sqrt(ssum);
            }

            #pragma omp parallel for
@@ -179,11 +176,11 @@ int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const
            {
                const float* ptr = bottom_blob.channel(q);
                float* outptr = top_blob.channel(q);
                float scale = scale_data.data[q];
                float scale = scale_data[q];

                for (int i=0; i<size; i++)
                {
                    outptr[i] = ptr[i] * ssptr[i] * scale;
                    outptr[i] = ptr[i] * square_sum_blob[i] * scale;
                }
            }
        }
--- a/src/layer/pooling.cpp
+++ b/src/layer/pooling.cpp
@@ -249,7 +249,8 @@ int Pooling::forward(const Mat& bottom_blob, Mat& top_blob) const
            {
                const float scale = (float)kernel_w / wtail;

                outptr = top_blob.channel(q) + outw - 1;
                outptr = top_blob.channel(q);
                outptr += outw - 1;
                for (int i = 0; i < outh; i++)
                {
                    *outptr *= scale;
--- a/src/layer/prelu.cpp
+++ b/src/layer/prelu.cpp
@@ -47,13 +47,11 @@ int PReLU::forward_inplace(Mat& bottom_top_blob) const
    int channels = bottom_top_blob.c;
    int size = w * h;

    const float* slope_data_ptr = slope_data;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
        float slope = num_slope > 1 ? slope_data_ptr[q] : slope_data_ptr[0];
        float slope = num_slope > 1 ? slope_data[q] : slope_data[0];

        for (int i=0; i<size; i++)
        {
--- a/src/layer/priorbox.cpp
+++ b/src/layer/priorbox.cpp
@@ -79,7 +79,7 @@ int PriorBox::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to
    #pragma omp parallel for
    for (int i = 0; i < h; i++)
    {
        float* box = top_blob.data + i * w * num_prior * 4;
        float* box = (float*)top_blob + i * w * num_prior * 4;

        float center_x = offset * step_w;
        float center_y = offset * step_h + i * step_h;
@@ -91,7 +91,7 @@ int PriorBox::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to

            for (int k = 0; k < num_min_size; k++)
            {
                float min_size = min_sizes.data[k];
                float min_size = min_sizes[k];

                // min size box
                box_w = box_h = min_size;
@@ -105,7 +105,7 @@ int PriorBox::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to

                if (num_max_size > 0)
                {
                    float max_size = max_sizes.data[k];
                    float max_size = max_sizes[k];

                    // max size box
                    box_w = box_h = sqrt(min_size * max_size);
--- a/src/layer/proposal.cpp
+++ b/src/layer/proposal.cpp
@@ -28,14 +28,14 @@ Proposal::Proposal()

    // TODO load from param
    ratios.create(3);
    ratios.data[0] = 0.5f;
    ratios.data[1] = 1.f;
    ratios.data[2] = 2.f;
    ratios[0] = 0.5f;
    ratios[1] = 1.f;
    ratios[2] = 2.f;

    scales.create(3);
    scales.data[0] = 8.f;
    scales.data[1] = 16.f;
    scales.data[2] = 32.f;
    scales[0] = 8.f;
    scales[1] = 16.f;
    scales[2] = 32.f;
 }

 static Mat generate_anchors(int base_size, const Mat& ratios, const Mat& scales)
@@ -51,14 +51,14 @@ static Mat generate_anchors(int base_size, const Mat& ratios, const Mat& scales)

    for (int i = 0; i < num_ratio; i++)
    {
        float ar = ratios.data[i];
        float ar = ratios[i];

        int r_w = round(base_size / sqrt(ar));
        int r_h = round(r_w * ar);//round(base_size * sqrt(ar));

        for (int j = 0; j < num_scale; j++)
        {
            float scale = scales.data[j];
            float scale = scales[j];

            float rs_w = r_w * scale;
            float rs_h = r_h * scale;
@@ -269,8 +269,8 @@ int Proposal::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to
    }

    // clip predicted boxes to image
    float im_w = im_info_blob.data[1];
    float im_h = im_info_blob.data[0];
    float im_w = im_info_blob[1];
    float im_h = im_info_blob[0];

    #pragma omp parallel for
    for (int q=0; q<num_anchors; q++)
@@ -293,7 +293,7 @@ int Proposal::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to
    std::vector<Rect> proposal_boxes;
    std::vector<float> scores;

    float im_scale = im_info_blob.data[2];
    float im_scale = im_info_blob[2];
    float min_boxsize = min_size * im_scale;

    for (int q=0; q<num_anchors; q++)
--- a/src/layer/reduction.cpp
+++ b/src/layer/reduction.cpp
@@ -82,7 +82,7 @@ static int reduction_op(const Mat& a, Mat& b, float v0, int dim, float coeff)
        Mat sums(channels);
        if (sums.empty())
            return -100;
        float* sums_ptr = sums;

        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
@@ -94,22 +94,19 @@ static int reduction_op(const Mat& a, Mat& b, float v0, int dim, float coeff)
                sum = op(sum, ptr[i]);
            }

            sums_ptr[q] = sum;
            sums[q] = sum;
        }

        float* outptr = b;

        float sum = v0;
        for (int i=0; i<channels; i++)
        {
            sum = op2(sum, sums_ptr[i]);
            sum = op2(sum, sums[i]);
        }

        outptr[0] = sum * coeff;
        b[0] = sum * coeff;
    }
    else if (dim == 1)
    {
        float* outptr = b;
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
@@ -121,7 +118,7 @@ static int reduction_op(const Mat& a, Mat& b, float v0, int dim, float coeff)
                sum = op(sum, ptr[i]);
            }

            outptr[q] = sum * coeff;
            b[q] = sum * coeff;
        }
    }
    else if (dim == 2)
@@ -173,19 +170,18 @@ static int reduction_op(const Mat& a, Mat& b, float v0, int dim, float coeff)

        b.fill(v0);

        float* outptr = b;
        for (int q=0; q<channels; q++)
        {
            const float* mins_ptr = mins.channel(q);
            for (int j=0; j<w; j++)
            {
                outptr[j] = op2(outptr[j], mins_ptr[j]);
                b[j] = op2(b[j], mins_ptr[j]);
            }
        }

        for (int j=0; j<w; j++)
        {
            outptr[j] *= coeff;
            b[j] *= coeff;
        }
    }
    else if (dim == -2)
@@ -195,18 +191,16 @@ static int reduction_op(const Mat& a, Mat& b, float v0, int dim, float coeff)
        for (int q=0; q<channels; q++)
        {
            const float* ptr = a.channel(q);
            float* outptr = b;

            for (int i=0; i<size; i++)
            {
                outptr[i] = op(outptr[i], ptr[i]);
                b[i] = op(b[i], ptr[i]);
            }
        }

        float* outptr = b;
        for (int i=0; i<size; i++)
        {
            outptr[i] *= coeff;
            b[i] *= coeff;
        }
    }

@@ -257,15 +251,13 @@ int Reduction::forward(const Mat& bottom_blob, Mat& top_blob) const

        if (dim == 0)
        {
            float* outptr = top_blob;
            outptr[0] /= channels * size;
            top_blob[0] /= channels * size;
        }
        else if (dim == 1)
        {
            float* outptr = top_blob;
            for (int q=0; q<channels; q++)
            {
                outptr[q] /= size;
                top_blob[q] /= size;
            }
        }
        else if (dim == 2)
@@ -282,18 +274,16 @@ int Reduction::forward(const Mat& bottom_blob, Mat& top_blob) const
        }
        else if (dim == -1)
        {
            float* outptr = top_blob;
            for (int j=0; j<w; j++)
            {
                outptr[j] /= h * channels;
                top_blob[j] /= h * channels;
            }
        }
        else if (dim == -2)
        {
            float* outptr = top_blob;
            for (int i=0; i<size; i++)
            {
                outptr[i] /= channels;
                top_blob[i] /= channels;
            }
        }
    }
--- a/src/layer/rnn.cpp
+++ b/src/layer/rnn.cpp
@@ -92,18 +92,18 @@ int RNN::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blo
        //                0       otherwise
        // calculate hidden
        // h_t = tanh( W_hh * h_cont_{t-1} + W_xh * x_t + b_h )
        const float cont = cont_blob.data[t];
        const float cont = cont_blob[t];
        const Mat x = input_blob.channel(t);
        float* hidden_data = hidden;
        for (int q=0; q<num_output; q++)
        {
            float h_cont = cont ? hidden_data[q] : 0.f;

            const float* weight_hh_data_ptr = weight_hh_data.data + weight_hh_data.w * q;
            const float* weight_xh_data_ptr = weight_xh_data.data + weight_xh_data.w * q;
            const float* weight_hh_data_ptr = (const float*)weight_hh_data + weight_hh_data.w * q;
            const float* weight_xh_data_ptr = (const float*)weight_xh_data + weight_xh_data.w * q;
            const float* x_data = x;

            float s0 = bias_h_data.data[q];
            float s0 = bias_h_data[q];
            for (int i=0; i<size; i++)
            {
                s0 += weight_hh_data_ptr[i] * h_cont + weight_xh_data_ptr[i] * x_data[i];
@@ -118,9 +118,9 @@ int RNN::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blo
        float* output_data = output;
        for (int q=0; q<num_output; q++)
        {
            const float* weight_ho_data_ptr = weight_ho_data.data + weight_ho_data.w * q;
            const float* weight_ho_data_ptr = (const float*)weight_ho_data + weight_ho_data.w * q;

            float s0 = bias_o_data.data[q];
            float s0 = bias_o_data[q];
            for (int i=0; i<size; i++)
            {
                s0 += weight_ho_data_ptr[i] * hidden_data[i];
--- a/src/layer/scale.cpp
+++ b/src/layer/scale.cpp
@@ -66,14 +66,13 @@ int Scale::forward_inplace(std::vector<Mat>& bottom_top_blobs) const

    if (bias_term)
    {
        const float* bias_ptr = bias_data;
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);

            float s = scale_blob.channel(q)[0];
            float bias = bias_ptr[q];
            float bias = bias_data[q];

            for (int i=0; i<size; i++)
            {
@@ -109,15 +108,13 @@ int Scale::forward_inplace(Mat& bottom_top_blob) const

    if (bias_term)
    {
        const float* scale_ptr = scale_data;
        const float* bias_ptr = bias_data;
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);

            float s = scale_ptr[q];
            float bias = bias_ptr[q];
            float s = scale_data[q];
            float bias = bias_data[q];

            for (int i=0; i<size; i++)
            {
@@ -127,13 +124,12 @@ int Scale::forward_inplace(Mat& bottom_top_blob) const
    }
    else
    {
        const float* scale_ptr = scale_data;
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);

            float s = scale_ptr[q];
            float s = scale_data[q];

            for (int i=0; i<size; i++)
            {
--- a/src/layer/slice.cpp
+++ b/src/layer/slice.cpp
@@ -37,7 +37,7 @@ int Slice::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_b
    int channels = bottom_blob.c;

    int q = 0;
    const int* slices_ptr = (const int*)slices.data;
    const int* slices_ptr = slices;
    for (size_t i=0; i<top_blobs.size(); i++)
    {
        int slice = slices_ptr[i];
@@ -54,11 +54,8 @@ int Slice::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_b
        int size = bottom_blob.cstep * slice;

        const float* ptr = bottom_blob.channel(q);
        float* outptr = top_blob.data;
        for (int j=0; j<size; j++)
        {
            outptr[j] = ptr[j];
        }
        float* outptr = top_blob;
        memcpy(outptr, ptr, size * sizeof(float));

        q += slice;
    }
--- a/src/layer/softmax.cpp
+++ b/src/layer/softmax.cpp
@@ -84,13 +84,12 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
            return -100;
        max.fill(-FLT_MAX);

        float* maxptr = max;
        for (int i=0; i<h; i++)
        {
            const float* ptr = bottom_top_blob.row(i);
            for (int j=0; j<w; j++)
            {
                maxptr[j] = std::max(maxptr[j], ptr[j]);
                max[j] = std::max(max[j], ptr[j]);
            }
        }

@@ -99,7 +98,7 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
            float* ptr = bottom_top_blob.row(i);
            for (int j=0; j<w; j++)
            {
                ptr[j] = exp(ptr[j] - maxptr[j]);
                ptr[j] = exp(ptr[j] - max[j]);
            }
        }

@@ -109,13 +108,12 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
            return -100;
        sum.fill(0.f);

        float* sumptr = sum;
        for (int i=0; i<h; i++)
        {
            const float* ptr = bottom_top_blob.row(i);
            for (int j=0; j<w; j++)
            {
                sumptr[j] += ptr[j];
                sum[j] += ptr[j];
            }
        }

@@ -124,7 +122,7 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
            float* ptr = bottom_top_blob.row(i);
            for (int j=0; j<w; j++)
            {
                ptr[j] /= sumptr[j];
                ptr[j] /= sum[j];
            }
        }

@@ -141,7 +139,6 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
        if (max.empty())
            return -100;

        float* maxptr = max;
        for (int i=0; i<h; i++)
        {
            const float* ptr = bottom_top_blob.row(i);
@@ -152,14 +149,14 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
                m = std::max(m, ptr[j]);
            }

            maxptr[i] = m;
            max[i] = m;
        }

        for (int i=0; i<h; i++)
        {
            float* ptr = bottom_top_blob.row(i);

            float m = maxptr[i];
            float m = max[i];
            for (int j=0; j<w; j++)
            {
                ptr[j] = exp(ptr[j] - m);
@@ -171,7 +168,6 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
        if (sum.empty())
            return -100;

        float* sumptr = sum;
        for (int i=0; i<h; i++)
        {
            const float* ptr = bottom_top_blob.row(i);
@@ -182,14 +178,14 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
                s += ptr[j];
            }

            sumptr[i] = s;
            sum[i] = s;
        }

        for (int i=0; i<h; i++)
        {
            float* ptr = bottom_top_blob.row(i);

            float s = sumptr[i];
            float s = sum[i];
            for (int j=0; j<w; j++)
            {
                ptr[j] /= s;
@@ -214,11 +210,10 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_top_blob.channel(q);
            float* maxptr = max;

            for (int i=0; i<size; i++)
            {
                maxptr[i] = std::max(maxptr[i], ptr[i]);
                max[i] = std::max(max[i], ptr[i]);
            }
        }

@@ -226,11 +221,10 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);
            float* maxptr = max;

            for (int i=0; i<size; i++)
            {
                ptr[i] = exp(ptr[i] - maxptr[i]);
                ptr[i] = exp(ptr[i] - max[i]);
            }
        }

@@ -242,11 +236,10 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_top_blob.channel(q);
            float* sumptr = sum;

            for (int i=0; i<size; i++)
            {
                sumptr[i] += ptr[i];
                sum[i] += ptr[i];
            }
        }

@@ -254,11 +247,10 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);
            float* sumptr = sum;

            for (int i=0; i<size; i++)
            {
                ptr[i] /= sumptr[i];
                ptr[i] /= sum[i];
            }
        }

--- a/src/layer/spp.cpp
+++ b/src/layer/spp.cpp
@@ -113,7 +113,7 @@ int SPP::forward(const Mat& bottom_blob, Mat& top_blob) const
                {
                    for (int j = 0; j < outw; j++)
                    {
                        const float* sptr = m.data + m.w * i*stride_h + j*stride_w;
                        const float* sptr = m.row(i*stride_h) + j*stride_w;

                        float max = sptr[0];

@@ -142,7 +142,7 @@ int SPP::forward(const Mat& bottom_blob, Mat& top_blob) const
                {
                    for (int j = 0; j < outw; j++)
                    {
                        const float* sptr = m.data + m.w * i*stride_h + j*stride_w;
                        const float* sptr = m.row(i*stride_h) + j*stride_w;

                        float sum = 0;

--- a/src/layer/unaryop.cpp
+++ b/src/layer/unaryop.cpp
@@ -40,12 +40,10 @@ static int unary_op_inplace(Mat& a)

    int size = a.total();

    float* ptr = a;

    #pragma omp parallel for
    for (int i=0; i<size; i++)
    {
        ptr[i] = op(ptr[i]);
        a[i] = op(a[i]);
    }

    return 0;
--- a/src/mat.cpp
+++ b/src/mat.cpp
@@ -32,7 +32,7 @@ void Mat::substract_mean_normalize(const float* mean_vals, const float* norm_val
        #pragma omp parallel for
        for (int q=0; q<c; q++)
        {
            float* ptr = data + cstep * q;
            float* ptr = channel(q);//data + cstep * q;
            const float mean = mean_vals[q];

 #if __ARM_NEON
@@ -87,7 +87,7 @@ void Mat::substract_mean_normalize(const float* mean_vals, const float* norm_val
        #pragma omp parallel for
        for (int q=0; q<c; q++)
        {
            float* ptr = data + cstep * q;
            float* ptr = channel(q);//data + cstep * q;
            const float norm = norm_vals[q];

 #if __ARM_NEON
@@ -142,7 +142,7 @@ void Mat::substract_mean_normalize(const float* mean_vals, const float* norm_val
        #pragma omp parallel for
        for (int q=0; q<c; q++)
        {
            float* ptr = data + cstep * q;
            float* ptr = channel(q);//data + cstep * q;
            const float mean = mean_vals[q];
            const float norm = norm_vals[q];

@@ -257,7 +257,7 @@ Mat Mat::from_float16(const unsigned short* data, int size)
    if (m.empty())
        return m;

    float* ptr = m.data;
    float* ptr = m;//.data;

 #if __ARM_NEON && (__ARM_FP & 2)
    int nn = cpu_support_arm_vfpv4() ? size >> 2 : 0;
@@ -324,8 +324,8 @@ static void copy_make_border_image(const Mat& src, Mat& dst, int top, int left,
    int w = dst.w;
    int h = dst.h;

    const float* ptr = src.data;
    float* outptr = dst.data;
    const float* ptr = src;//.data;
    float* outptr = dst;//.data;

    if (type == BORDER_CONSTANT)
    {
@@ -508,8 +508,8 @@ static void copy_cut_border_image(const Mat& src, Mat& dst, int top, int left)
    int w = dst.w;
    int h = dst.h;

    const float* ptr = src.data + src.w * top + left;
    float* outptr = dst.data;
    const float* ptr = src.row(top) + left;//.data + src.w * top + left;
    float* outptr = dst;//.data;

    for (int y = 0; y < h; y++)
    {
--- a/src/mat.h
+++ b/src/mat.h
@@ -30,25 +30,26 @@ public:
    // empty
    Mat();
    // vec
    Mat(int w);
    Mat(int w, size_t elemsize = 4);
    // image
    Mat(int w, int h);
    Mat(int w, int h, size_t elemsize = 4);
    // dim
    Mat(int w, int h, int c);
    Mat(int w, int h, int c, size_t elemsize = 4);
    // copy
    Mat(const Mat& m);
    // external vec
    Mat(int w, float* data);
    Mat(int w, void* data, size_t elemsize = 4);
    // external image
    Mat(int w, int h, float* data);
    Mat(int w, int h, void* data, size_t elemsize = 4);
    // external dim
    Mat(int w, int h, int c, float* data);
    Mat(int w, int h, int c, void* data, size_t elemsize = 4);
    // release
    ~Mat();
    // assign
    Mat& operator=(const Mat& m);
    // set all
    void fill(float v);
    template <typename T> void fill(T v);
    // deep copy
    Mat clone() const;
    // reshape vec
@@ -58,11 +59,11 @@ public:
    // reshape dim
    Mat reshape(int w, int h, int c) const;
    // allocate vec
    void create(int w);
    void create(int w, size_t elemsize = 4);
    // allocate image
    void create(int w, int h);
    void create(int w, int h, size_t elemsize = 4);
    // allocate dim
    void create(int w, int h, int c);
    void create(int w, int h, int c, size_t elemsize = 4);
    // refcount++
    void addref();
    // refcount--
@@ -76,8 +77,16 @@ public:
    const Mat channel(int c) const;
    float* row(int y);
    const float* row(int y) const;
    operator float*();
    operator const float*() const;
    template<typename T> T* row(int y);
    template<typename T> const T* row(int y) const;

    // access raw data
    template<typename T> operator T*();
    template<typename T> operator const T*() const;

    // convenient access float vec element
    float& operator[](int i);
    const float& operator[](int i) const;

    enum
    {
@@ -119,15 +128,23 @@ public:
    // convenient construct from half precision floating point data
    static Mat from_float16(const unsigned short* data, int size);

    // the dimensionality
    int dims;
    // pointer to the data
    float* data;
    void* data;

    // pointer to the reference counter;
    // pointer to the reference counter
    // when points to user-allocated data, the pointer is NULL
    int* refcount;

    // element size in bytes
    // 4 = float32/int32
    // 2 = float16
    // 1 = int8/uint8
    // 0 = empty
    size_t elemsize;

    // the dimensionality
    int dims;

    int w;
    int h;
    int c;
@@ -217,30 +234,30 @@ static inline void NCNN_XADD(int* addr, int delta) { int tmp = *addr; *addr += d
 #endif

 inline Mat::Mat()
    : dims(0), data(0), refcount(0), w(0), h(0), c(0), cstep(0)
    : data(0), refcount(0), elemsize(0), dims(0), w(0), h(0), c(0), cstep(0)
 {
 }

 inline Mat::Mat(int _w)
    : dims(0), data(0), refcount(0)
 inline Mat::Mat(int _w, size_t _elemsize)
    : data(0), refcount(0), dims(0)
 {
    create(_w);
    create(_w, _elemsize);
 }

 inline Mat::Mat(int _w, int _h)
    : dims(0), data(0), refcount(0)
 inline Mat::Mat(int _w, int _h, size_t _elemsize)
    : data(0), refcount(0), dims(0)
 {
    create(_w, _h);
    create(_w, _h, _elemsize);
 }

 inline Mat::Mat(int _w, int _h, int _c)
    : dims(0), data(0), refcount(0)
 inline Mat::Mat(int _w, int _h, int _c, size_t _elemsize)
    : data(0), refcount(0), dims(0)
 {
    create(_w, _h, _c);
    create(_w, _h, _c, _elemsize);
 }

 inline Mat::Mat(const Mat& m)
    : dims(m.dims), data(m.data), refcount(m.refcount)
    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), dims(m.dims)
 {
    if (refcount)
        NCNN_XADD(refcount, 1);
@@ -252,8 +269,8 @@ inline Mat::Mat(const Mat& m)
    cstep = m.cstep;
 }

 inline Mat::Mat(int _w, float* _data)
    : dims(1), data(_data), refcount(0)
 inline Mat::Mat(int _w, void* _data, size_t _elemsize)
    : data(_data), refcount(0), elemsize(_elemsize), dims(1)
 {
    w = _w;
    h = 1;
@@ -262,8 +279,8 @@ inline Mat::Mat(int _w, float* _data)
    cstep = w;
 }

 inline Mat::Mat(int _w, int _h, float* _data)
    : dims(2), data(_data), refcount(0)
 inline Mat::Mat(int _w, int _h, void* _data, size_t _elemsize)
    : data(_data), refcount(0), elemsize(_elemsize), dims(2)
 {
    w = _w;
    h = _h;
@@ -272,14 +289,14 @@ inline Mat::Mat(int _w, int _h, float* _data)
    cstep = w * h;
 }

 inline Mat::Mat(int _w, int _h, int _c, float* _data)
    : dims(3), data(_data), refcount(0)
 inline Mat::Mat(int _w, int _h, int _c, void* _data, size_t _elemsize)
    : data(_data), refcount(0), elemsize(_elemsize), dims(3)
 {
    w = _w;
    h = _h;
    c = _c;

    cstep = alignSize(w * h * sizeof(float), 16) >> 2;
    cstep = alignSize(w * h * elemsize, 16) / elemsize;
 }

 inline Mat::~Mat()
@@ -297,10 +314,11 @@ inline Mat& Mat::operator=(const Mat& m)

    release();

    dims = m.dims;
    data = m.data;
    refcount = m.refcount;
    elemsize = m.elemsize;

    dims = m.dims;
    w = m.w;
    h = m.h;
    c = m.c;
@@ -313,7 +331,7 @@ inline Mat& Mat::operator=(const Mat& m)
 inline void Mat::fill(float _v)
 {
    int size = total();
    float* ptr = data;
    float* ptr = (float*)data;

 #if __ARM_NEON
    int nn = size >> 2;
@@ -354,6 +372,17 @@ inline void Mat::fill(float _v)
    }
 }

 // Set every element of the Mat to _v, treating the storage as an array of T.
 // Exactly total() elements are written through a T* view of data.
 // NOTE(review): nothing here checks that sizeof(T) matches elemsize;
 // presumably the caller picks T to match the Mat's element type - confirm.
 template <typename T>
 inline void Mat::fill(T _v)
 {
    // total() returns size_t; keep the count and index unsigned so a very
    // large Mat is not truncated by a narrowing conversion to int.
    const size_t count = total();
    T* ptr = (T*)data;
    for (size_t i = 0; i < count; i++)
    {
        ptr[i] = _v;
    }
 }

 inline Mat Mat::clone() const
 {
    if (empty())
@@ -361,15 +390,15 @@ inline Mat Mat::clone() const

    Mat m;
    if (dims == 1)
        m.create(w);
        m.create(w, elemsize);
    else if (dims == 2)
        m.create(w, h);
        m.create(w, h, elemsize);
    else if (dims == 3)
        m.create(w, h, c);
        m.create(w, h, c, elemsize);

    if (total() > 0)
    {
        memcpy(m.data, data, total() * sizeof(float));
        memcpy(m.data, data, total() * elemsize);
    }

    return m;
@@ -383,14 +412,14 @@ inline Mat Mat::reshape(int _w) const
    if (dims == 3 && cstep != (size_t)w * h)
    {
        Mat m;
        m.create(_w);
        m.create(_w, elemsize);

        // flatten
        for (int i=0; i<c; i++)
        {
            const float* ptr = data + i * cstep;
            float* mptr = m.data + i * w * h;
            memcpy(mptr, ptr, w * h * sizeof(float));
            const void* ptr = (unsigned char*)data + i * cstep * elemsize;
            void* mptr = (unsigned char*)m.data + i * w * h * elemsize;
            memcpy(mptr, ptr, w * h * elemsize);
        }

        return m;
@@ -399,7 +428,6 @@ inline Mat Mat::reshape(int _w) const
    Mat m = *this;

    m.dims = 1;

    m.w = _w;
    m.h = 1;
    m.c = 1;
@@ -417,14 +445,14 @@ inline Mat Mat::reshape(int _w, int _h) const
    if (dims == 3 && cstep != (size_t)w * h)
    {
        Mat m;
        m.create(_w, _h);
        m.create(_w, _h, elemsize);

        // flatten
        for (int i=0; i<c; i++)
        {
            const float* ptr = data + i * cstep;
            float* mptr = m.data + i * w * h;
            memcpy(mptr, ptr, w * h * sizeof(float));
            const void* ptr = (unsigned char*)data + i * cstep * elemsize;
            void* mptr = (unsigned char*)m.data + i * w * h * elemsize;
            memcpy(mptr, ptr, w * h * elemsize);
        }

        return m;
@@ -433,7 +461,6 @@ inline Mat Mat::reshape(int _w, int _h) const
    Mat m = *this;

    m.dims = 2;

    m.w = _w;
    m.h = _h;
    m.c = 1;
@@ -450,17 +477,17 @@ inline Mat Mat::reshape(int _w, int _h, int _c) const

    if (dims < 3)
    {
        if ((size_t)_w * _h != alignSize(_w * _h * sizeof(float), 16) >> 2)
        if ((size_t)_w * _h != alignSize(_w * _h * elemsize, 16) / elemsize)
        {
            Mat m;
            m.create(_w, _h, _c);
            m.create(_w, _h, _c, elemsize);

            // align channel
            for (int i=0; i<_c; i++)
            {
                const float* ptr = data + i * _w * _h;
                float* mptr = m.data + i * m.cstep;
                memcpy(mptr, ptr, _w * _h * sizeof(float));
                const void* ptr = (unsigned char*)data + i * _w * _h * elemsize;
                void* mptr = (unsigned char*)m.data + i * m.cstep * m.elemsize;
                memcpy(mptr, ptr, _w * _h * elemsize);
            }

            return m;
@@ -476,22 +503,22 @@ inline Mat Mat::reshape(int _w, int _h, int _c) const
    Mat m = *this;

    m.dims = 3;

    m.w = _w;
    m.h = _h;
    m.c = _c;

    m.cstep = alignSize(_w * _h * sizeof(float), 16) >> 2;
    m.cstep = alignSize(_w * _h * elemsize, 16) / elemsize;

    return m;
 }

 inline void Mat::create(int _w)
 inline void Mat::create(int _w, size_t _elemsize)
 {
    release();

    dims = 1;
    elemsize = _elemsize;

    dims = 1;
    w = _w;
    h = 1;
    c = 1;
@@ -500,19 +527,20 @@ inline void Mat::create(int _w)

    if (total() > 0)
    {
        size_t totalsize = total() * sizeof(float);
        data = (float*)fastMalloc(totalsize + (int)sizeof(*refcount));
        size_t totalsize = total() * elemsize;
        data = fastMalloc(totalsize + (int)sizeof(*refcount));
        refcount = (int*)(((unsigned char*)data) + totalsize);
        *refcount = 1;
    }
 }

 inline void Mat::create(int _w, int _h)
 inline void Mat::create(int _w, int _h, size_t _elemsize)
 {
    release();

    dims = 2;
    elemsize = _elemsize;

    dims = 2;
    w = _w;
    h = _h;
    c = 1;
@@ -521,29 +549,30 @@ inline void Mat::create(int _w, int _h)

    if (total() > 0)
    {
        size_t totalsize = total() * sizeof(float);
        data = (float*)fastMalloc(totalsize + (int)sizeof(*refcount));
        size_t totalsize = total() * elemsize;
        data = fastMalloc(totalsize + (int)sizeof(*refcount));
        refcount = (int*)(((unsigned char*)data) + totalsize);
        *refcount = 1;
    }
 }

 inline void Mat::create(int _w, int _h, int _c)
 inline void Mat::create(int _w, int _h, int _c, size_t _elemsize)
 {
    release();

    dims = 3;
    elemsize = _elemsize;

    dims = 3;
    w = _w;
    h = _h;
    c = _c;

    cstep = alignSize(w * h * sizeof(float), 16) >> 2;
    cstep = alignSize(w * h * elemsize, 16) / elemsize;

    if (total() > 0)
    {
        size_t totalsize = total() * sizeof(float);
        data = (float*)fastMalloc(totalsize + (int)sizeof(*refcount));
        size_t totalsize = total() * elemsize;
        data = fastMalloc(totalsize + (int)sizeof(*refcount));
        refcount = (int*)(((unsigned char*)data) + totalsize);
        *refcount = 1;
    }
@@ -560,9 +589,11 @@ inline void Mat::release()
    if (refcount && NCNN_XADD(refcount, -1) == 1)
        fastFree(data);

    dims = 0;
    data = 0;

    elemsize = 0;

    dims = 0;
    w = 0;
    h = 0;
    c = 0;
@@ -584,32 +615,56 @@ inline size_t Mat::total() const

 inline Mat Mat::channel(int c)
 {
    // Diff view of the channel accessor: both variants build a w x h Mat on a
    // pointer offset by cstep * c from data. The first line offsets data
    // directly (appears to be the pre-change form); the second casts data to
    // unsigned char*, scales the offset by elemsize so it is expressed in
    // bytes, and forwards elemsize to the Mat constructor.
    return Mat(w, h, data + cstep * c);
    return Mat(w, h, (unsigned char*)data + cstep * c * elemsize, elemsize);
 }

 inline const Mat Mat::channel(int c) const
 {
    // Const overload of the channel accessor, shown as a diff: the first line
    // offsets data directly by cstep * c (appears to be the pre-change form);
    // the second computes the same offset in bytes (cstep * c * elemsize on an
    // unsigned char* view of data) and passes elemsize to the Mat constructor.
    return Mat(w, h, data + cstep * c);
    return Mat(w, h, (unsigned char*)data + cstep * c * elemsize, elemsize);
 }

 inline float* Mat::row(int y)
 {
    // Returns a float pointer w * y elements past the start of data.
    // Diff view: the first line uses data as-is (appears to be the pre-change
    // form); the second casts data to float* before the pointer arithmetic.
    return data + w * y;
    return (float*)data + w * y;
 }

 inline const float* Mat::row(int y) const
 {
    // Const overload: returns a const float pointer w * y elements past the
    // start of data. Diff view: the first line uses data as-is (appears to be
    // the pre-change form); the second casts data to const float* first.
    return data + w * y;
    return (const float*)data + w * y;
 }

 template <typename T>
 inline T* Mat::row(int y)
 {
    // View the storage as an array of T and step w * y elements forward.
    T* const first = (T*)data;
    return first + w * y;
 }

 template <typename T>
 inline const T* Mat::row(int y) const
 {
    // Read-only variant: view the storage as const T and step w * y elements forward.
    const T* const first = (const T*)data;
    return first + w * y;
 }

 template <typename T>
 inline Mat::operator T*()
 {
    // Conversion to a typed pointer: hands back data reinterpreted as T*.
    T* const typed = (T*)data;
    return typed;
 }

 template <typename T>
 inline Mat::operator const T*() const
 {
    // Read-only conversion: hands back data reinterpreted as const T*.
    const T* const typed = (const T*)data;
    return typed;
 }

 // Diff view: the first signature/return pair is the float* conversion
 // operator (appears to be the pre-change form); the second pair is
 // operator[], which returns a reference to the i-th float in data.
 // No bounds check is performed on i.
 inline Mat::operator float*()
 inline float& Mat::operator[](int i)
 {
    return data;
    return ((float*)data)[i];
 }

 // Diff view: the first signature/return pair is the const float* conversion
 // operator (appears to be the pre-change form); the second pair is the const
 // operator[], which returns a const reference to the i-th float in data.
 // No bounds check is performed on i.
 inline Mat::operator const float*() const
 inline const float& Mat::operator[](int i) const
 {
    return data;
    return ((const float*)data)[i];
 }

 } // namespace ncnn
--- a/src/paramdict.cpp
+++ b/src/paramdict.cpp
@@ -121,9 +121,15 @@ int ParamDict::load_param(FILE* fp)
                bool is_float = vstr_is_float(vstr);

                if (is_float)
                    nscan = sscanf(vstr, "%f", &params[id].v.data[j]);
                {
                    float* ptr = params[id].v;
                    nscan = sscanf(vstr, "%f", &ptr[j]);
                }
                else
                    nscan = sscanf(vstr, "%d", (int*)&params[id].v.data[j]);
                {
                    int* ptr = params[id].v;
                    nscan = sscanf(vstr, "%d", &ptr[j]);
                }
                if (nscan != 1)
                {
                    fprintf(stderr, "ParamDict parse array element fail\n");
@@ -196,10 +202,8 @@ int ParamDict::load_param_bin(FILE* fp)

            params[id].v.create(len);

            for (int j = 0; j < len; j++)
            {
                fread(&params[id].v.data[j], sizeof(float), 1, fp);
            }
            float* ptr = params[id].v;
            fread(ptr, sizeof(float), len, fp);
        }
        else
        {
@@ -237,11 +241,8 @@ int ParamDict::load_param(const unsigned char*& mem)

            params[id].v.create(len);

            for (int j = 0; j < len; j++)
            {
                params[id].v.data[j] = *(float*)(mem);
                mem += 4;
            }
            // Copy all len 4-byte elements in one go, then advance the read cursor
            // past everything that was consumed. Advancing by a single element
            // (4 bytes) would leave mem pointing into the middle of this array.
            memcpy(params[id].v.data, mem, len * 4);
            mem += len * 4;
        }
        else
        {