Browse Source

Add element type storage support to Mat; move the data member to be the first member so that a pointer to a Mat is also a pointer to its data; add convenient index access for float vectors

tags/20180129
nihuini 8 years ago
parent
commit
a84ba8fc0f
38 changed files with 318 additions and 330 deletions
  1. +8
    -8
      examples/fasterrcnn.cpp
  2. +1
    -1
      examples/squeezenet.cpp
  3. +1
    -1
      src/layer/arm/convolution_3x3.h
  4. +4
    -4
      src/layer/arm/convolutiondepthwise_arm.cpp
  5. +2
    -2
      src/layer/arm/deconvolution_3x3.h
  6. +2
    -2
      src/layer/arm/deconvolution_4x4.h
  7. +4
    -4
      src/layer/arm/deconvolutiondepthwise_arm.cpp
  8. +2
    -2
      src/layer/arm/innerproduct_arm.cpp
  9. +2
    -2
      src/layer/arm/slice_arm.cpp
  10. +6
    -13
      src/layer/batchnorm.cpp
  11. +1
    -2
      src/layer/bias.cpp
  12. +18
    -40
      src/layer/binaryop.cpp
  13. +2
    -3
      src/layer/convolution.cpp
  14. +4
    -4
      src/layer/convolutiondepthwise.cpp
  15. +3
    -4
      src/layer/deconvolution.cpp
  16. +5
    -5
      src/layer/deconvolutiondepthwise.cpp
  17. +1
    -1
      src/layer/detectionoutput.cpp
  18. +3
    -5
      src/layer/eltwise.cpp
  19. +2
    -2
      src/layer/embed.cpp
  20. +1
    -1
      src/layer/flatten.cpp
  21. +2
    -3
      src/layer/innerproduct.cpp
  22. +12
    -12
      src/layer/lstm.cpp
  23. +10
    -12
      src/layer/mvn.cpp
  24. +11
    -14
      src/layer/normalize.cpp
  25. +2
    -1
      src/layer/pooling.cpp
  26. +1
    -3
      src/layer/prelu.cpp
  27. +3
    -3
      src/layer/priorbox.cpp
  28. +11
    -11
      src/layer/proposal.cpp
  29. +13
    -23
      src/layer/reduction.cpp
  30. +6
    -6
      src/layer/rnn.cpp
  31. +4
    -8
      src/layer/scale.cpp
  32. +3
    -6
      src/layer/slice.cpp
  33. +12
    -20
      src/layer/softmax.cpp
  34. +2
    -2
      src/layer/spp.cpp
  35. +1
    -3
      src/layer/unaryop.cpp
  36. +8
    -8
      src/mat.cpp
  37. +133
    -78
      src/mat.h
  38. +12
    -11
      src/paramdict.cpp

+ 8
- 8
examples/fasterrcnn.cpp View File

@@ -142,9 +142,9 @@ static int detect_fasterrcnn(const cv::Mat& bgr, std::vector<Object>& objects)
in.substract_mean_normalize(mean_vals, 0);

ncnn::Mat im_info(3);
im_info.data[0] = h;
im_info.data[1] = w;
im_info.data[2] = scale;
im_info[0] = h;
im_info[1] = w;
im_info[2] = scale;

// step1, extract feature and all rois
ncnn::Extractor ex1 = fasterrcnn.create_extractor();
@@ -182,7 +182,7 @@ static int detect_fasterrcnn(const cv::Mat& bgr, std::vector<Object>& objects)
float score = 0.f;
for (int i=0; i<num_class; i++)
{
float class_score = cls_prob.channel(i).data[0];
float class_score = cls_prob.channel(i)[0];
if (class_score > score)
{
label = i;
@@ -197,10 +197,10 @@ static int detect_fasterrcnn(const cv::Mat& bgr, std::vector<Object>& objects)
// fprintf(stderr, "%d = %f\n", label, score);

// unscale to image size
float x1 = roi.data[0] / scale;
float y1 = roi.data[1] / scale;
float x2 = roi.data[2] / scale;
float y2 = roi.data[3] / scale;
float x1 = roi[0] / scale;
float y1 = roi[1] / scale;
float x2 = roi[2] / scale;
float y2 = roi[3] / scale;

float pb_w = x2 - x1 + 1;
float pb_h = y2 - y1 + 1;


+ 1
- 1
examples/squeezenet.cpp View File

@@ -42,7 +42,7 @@ static int detect_squeezenet(const cv::Mat& bgr, std::vector<float>& cls_scores)
cls_scores.resize(out.c);
for (int j=0; j<out.c; j++)
{
const float* prob = out.data + out.cstep * j;
const float* prob = out.channel(j);
cls_scores[j] = prob[0];
}



+ 1
- 1
src/layer/arm/convolution_3x3.h View File

@@ -519,7 +519,7 @@ static void conv3x3s1_winograd64_transform_kernel_neon(const Mat& kernel, Mat& k
{
for (int q = 0; q<inch; q++)
{
const float* kernel0 = kernel.data + p*inch * 9 + q * 9;
const float* kernel0 = (const float*)kernel + p*inch * 9 + q * 9;
float* kernel_tm0 = kernel_tm.channel(p).row(q);

// transform kernel, transposed


+ 4
- 4
src/layer/arm/convolutiondepthwise_arm.cpp View File

@@ -169,10 +169,10 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
{
Mat bottom_blob_bordered_g = bottom_blob_bordered.channel(g);
Mat top_blob_g = top_blob.channel(g);
Mat weight_data_g(maxk, (float*)(weight_data + maxk * g));
Mat weight_data_g(maxk, (void*)((const float*)weight_data + maxk * g));
Mat bias_data_g;
if (bias_term)
bias_data_g = Mat(1, (float*)(bias_data + g));
bias_data_g = Mat(1, (void*)((const float*)bias_data + g));

conv(bottom_blob_bordered_g, top_blob_g, weight_data_g, bias_data_g);
}
@@ -190,10 +190,10 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
{
Mat bottom_blob_bordered_g(w, h, channels_g, bottom_blob_bordered.channel(channels_g * g));
Mat top_blob_g(outw, outh, num_output_g, top_blob.channel(num_output_g * g));
Mat weight_data_g(maxk * channels_g * num_output_g, (float*)(weight_data + maxk * channels_g * num_output_g * g));
Mat weight_data_g(maxk * channels_g * num_output_g, (void*)((const float*)weight_data + maxk * channels_g * num_output_g * g));
Mat bias_data_g;
if (bias_term)
bias_data_g = Mat(num_output_g, (float*)(bias_data + num_output_g * g));
bias_data_g = Mat(num_output_g, (void*)((const float*)bias_data + num_output_g * g));

conv(bottom_blob_bordered_g, top_blob_g, weight_data_g, bias_data_g);
}


+ 2
- 2
src/layer/arm/deconvolution_3x3.h View File

@@ -57,7 +57,7 @@ static void deconv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _

for (int i = 0; i < h; i++)
{
float* outptr = out.data + out.w * i;
float* outptr = out.row(i);

float* outptr0 = outptr;
float* outptr1 = outptr + outw;
@@ -278,7 +278,7 @@ static void deconv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _

for (int i = 0; i < h; i++)
{
float* outptr = out.data + outw * i*2;
float* outptr = out.row(i*2);

float* outptr0 = outptr;
float* outptr1 = outptr0 + outw;


+ 2
- 2
src/layer/arm/deconvolution_4x4.h View File

@@ -59,7 +59,7 @@ static void deconv4x4s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _

for (int i = 0; i < h; i++)
{
float* outptr = out.data + out.w * i;
float* outptr = out.row(i);

float* outptr0 = outptr;
float* outptr1 = outptr0 + outw;
@@ -228,7 +228,7 @@ static void deconv4x4s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _

for (int i = 0; i < h; i++)
{
float* outptr = out.data + out.w * i*2;
float* outptr = out.row(i*2);

float* outptr0 = outptr;
float* outptr1 = outptr0 + outw;


+ 4
- 4
src/layer/arm/deconvolutiondepthwise_arm.cpp View File

@@ -90,11 +90,11 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) c
{
Mat top_blob_bordered_g = top_blob_bordered.channel(g);
Mat bottom_blob_g = bottom_blob.channel(g);
Mat weight_data_g(maxk, (float*)(weight_data + maxk * g));
Mat weight_data_g(maxk, (void*)((const float*)weight_data + maxk * g));

Mat bias_data_g;
if (bias_term)
bias_data_g = Mat(1, (float*)(bias_data + g));
bias_data_g = Mat(1, (void*)((const float*)bias_data + g));

deconv(bottom_blob_g, top_blob_bordered_g, weight_data_g, bias_data_g);
}
@@ -110,10 +110,10 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) c
{
Mat top_blob_bordered_g(outw, outh, num_output_g, top_blob_bordered.channel(num_output_g * g));
Mat bottom_blob_g(w, h, channels_g, bottom_blob.channel(channels_g * g).data);
Mat weight_data_g(maxk * channels_g * num_output_g, (float*)(weight_data + maxk * channels_g * num_output_g * g));
Mat weight_data_g(maxk * channels_g * num_output_g, (void*)((const float*)weight_data + maxk * channels_g * num_output_g * g));
Mat bias_data_g;
if (bias_term)
bias_data_g = Mat(num_output_g, (float*)(bias_data + num_output_g * g));
bias_data_g = Mat(num_output_g, (void*)((const float*)bias_data + num_output_g * g));

deconv(bottom_blob_g, top_blob_bordered_g, weight_data_g, bias_data_g);
}


+ 2
- 2
src/layer/arm/innerproduct_arm.cpp View File

@@ -44,7 +44,7 @@ int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
float sum = 0.f;

if (bias_term)
sum = bias_data.data[p];
sum = bias_data[p];

const float* w = weight_data_ptr + channels * p;

@@ -73,7 +73,7 @@ int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
float sum = 0.f;

if (bias_term)
sum = bias_data.data[p];
sum = bias_data[p];

const float* w = weight_data_ptr + size * channels * p;



+ 2
- 2
src/layer/arm/slice_arm.cpp View File

@@ -29,7 +29,7 @@ int Slice_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& t
int channels = bottom_blob.c;

int q = 0;
const int* slices_ptr = (const int*)slices.data;
const int* slices_ptr = slices;
for (size_t i=0; i<top_blobs.size(); i++)
{
int slice = slices_ptr[i];
@@ -46,7 +46,7 @@ int Slice_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& t
int size = bottom_blob.cstep * slice;

const float* ptr = bottom_blob.channel(q);
float* outptr = top_blob.data;
float* outptr = top_blob;

#if __ARM_NEON
int nn = size >> 3;


+ 6
- 13
src/layer/batchnorm.cpp View File

@@ -56,17 +56,12 @@ int BatchNorm::load_model(const ModelBin& mb)
b_data.create(channels);
if (b_data.empty())
return -100;
const float* slope_data_ptr = slope_data;
const float* mean_data_ptr = mean_data;
const float* var_data_ptr = var_data;
const float* bias_data_ptr = bias_data;
float* a_data_ptr = a_data;
float* b_data_ptr = b_data;

for (int i=0; i<channels; i++)
{
float sqrt_var = sqrt(var_data_ptr[i]);
a_data_ptr[i] = bias_data_ptr[i] - slope_data_ptr[i] * mean_data_ptr[i] / sqrt_var;
b_data_ptr[i] = slope_data_ptr[i] / sqrt_var;
float sqrt_var = sqrt(var_data[i]);
a_data[i] = bias_data[i] - slope_data[i] * mean_data[i] / sqrt_var;
b_data[i] = slope_data[i] / sqrt_var;
}

return 0;
@@ -82,15 +77,13 @@ int BatchNorm::forward_inplace(Mat& bottom_top_blob) const
int h = bottom_top_blob.h;
int size = w * h;

const float* a_data_ptr = a_data;
const float* b_data_ptr = b_data;
#pragma omp parallel for
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);

float a = a_data_ptr[q];
float b = b_data_ptr[q];
float a = a_data[q];
float b = b_data[q];

for (int i=0; i<size; i++)
{


+ 1
- 2
src/layer/bias.cpp View File

@@ -47,13 +47,12 @@ int Bias::forward_inplace(Mat& bottom_top_blob) const
int channels = bottom_top_blob.c;
int size = w * h;

const float* bias_ptr = bias_data;
#pragma omp parallel for
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);

float bias = bias_ptr[q];
float bias = bias_data[q];

for (int i=0; i<size; i++)
{


+ 18
- 40
src/layer/binaryop.cpp View File

@@ -79,7 +79,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
for (int q=0; q<channels; q++)
{
const float* ptr = a.channel(q);
const float* ptr1 = b.data + h * q;
const float* ptr1 = (const float*)b + h * q;
float* outptr = c.channel(q);

for (int y=0; y<h; y++)
@@ -102,7 +102,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
{
if (b.w == 1)
{
const float b0 = b.data[0];
const float b0 = b[0];
#pragma omp parallel for
for (int q=0; q<channels; q++)
{
@@ -125,7 +125,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
for (int q=0; q<channels; q++)
{
const float* ptr = a.channel(q);
const float b0 = b.data[q];
const float b0 = b[q];
float* outptr = c.channel(q);

for (int i=0; i<size; i++)
@@ -151,7 +151,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
#pragma omp parallel for
for (int q=0; q<channels1; q++)
{
const float* ptr = a.data + h1 * q;
const float* ptr = (const float*)a + h1 * q;
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);

@@ -177,13 +177,9 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)

if (b.dims == 2)
{
const float* ptr = a;
const float* ptr1 = b;
float* outptr = c;

for (int i=0; i<size; i++)
{
outptr[i] = op(ptr[i], ptr1[i]);
c[i] = op(a[i], b[i]);
}

return 0;
@@ -197,25 +193,21 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)

if (b.w == 1)
{
const float* ptr = a;
const float b0 = b.data[0];
float* outptr = c;

const float b0 = b[0];
for (int i=0; i<size; i++)
{
outptr[i] = op(ptr[i], b0);
c[i] = op(a[i], b0);
}

return 0;
}

const float* ptr = a;
const float* ptr1 = b;
float* outptr = c;

for (int y=0; y<h; y++)
{
const float b0 = ptr1[y];
const float b0 = b[y];
for (int x=0; x<w; x++)
{
outptr[x] = op(ptr[x], b0);
@@ -238,7 +230,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
if (c.empty())
return -100;

const float a0 = a.data[0];
const float a0 = a[0];
#pragma omp parallel for
for (int q=0; q<channels1; q++)
{
@@ -263,13 +255,10 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
if (c.empty())
return -100;

const float a0 = a.data[0];
const float* ptr1 = b;
float* outptr = c;

const float a0 = a[0];
for (int i=0; i<size1; i++)
{
outptr[i] = op(a0, ptr1[i]);
c[i] = op(a0, b[i]);
}

return 0;
@@ -281,13 +270,10 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
if (c.empty())
return -100;

const float a0 = a.data[0];
const float* ptr1 = b;
float* outptr = c;

const float a0 = a[0];
for (int i=0; i<size1; i++)
{
outptr[i] = op(a0, ptr1[i]);
c[i] = op(a0, b[i]);
}

return 0;
@@ -303,7 +289,7 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
#pragma omp parallel for
for (int q=0; q<channels1; q++)
{
const float a0 = a.data[q];
const float a0 = a[q];
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);

@@ -325,13 +311,12 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)
if (c.empty())
return -100;

const float* ptr = a;
const float* ptr1 = b;
float* outptr = c;

for (int y=0; y<h1; y++)
{
const float a0 = ptr[y];
const float a0 = a[y];
for (int x=0; x<w1; x++)
{
outptr[x] = op(a0, ptr1[x]);
@@ -352,25 +337,18 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c)

if (b.w == 1)
{
const float* ptr = a;
const float b0 = b.data[0];
float* outptr = c;

const float b0 = b[0];
for (int i=0; i<size; i++)
{
outptr[i] = op(ptr[i], b0);
c[i] = op(a[i], b0);
}

return 0;
}

const float* ptr = a;
const float* ptr1 = b;
float* outptr = c;

for (int i=0; i<size; i++)
{
outptr[i] = op(ptr[i], ptr1[i]);
c[i] = op(a[i], b[i]);
}
}
}


+ 2
- 3
src/layer/convolution.cpp View File

@@ -125,7 +125,6 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const
}

// num_output
const float* weight_data_ptr = weight_data;
#pragma omp parallel for
for (int p=0; p<num_output; p++)
{
@@ -138,9 +137,9 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const
float sum = 0.f;

if (bias_term)
sum = bias_data.data[p];
sum = bias_data[p];

const float* kptr = weight_data_ptr + maxk * channels * p;
const float* kptr = (const float*)weight_data + maxk * channels * p;

// channels
for (int q=0; q<channels; q++)


+ 4
- 4
src/layer/convolutiondepthwise.cpp View File

@@ -118,7 +118,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
for (int g=0; g<group; g++)
{
float* outptr = top_blob.channel(g);
const float* kptr = weight_data + maxk * g;
const float* kptr = (const float*)weight_data + maxk * g;
const Mat m = bottom_blob_bordered.channel(g);

for (int i = 0; i < outh; i++)
@@ -128,7 +128,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
float sum = 0.f;

if (bias_term)
sum = bias_data.data[g];
sum = bias_data[g];

const float* sptr = m.row(i*stride_h) + j*stride_w;

@@ -158,7 +158,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
for (int p=0; p<num_output_g; p++)
{
float* outptr = top_blob.channel(g * num_output_g + p);
const float* weight_data_ptr = weight_data + maxk * channels_g * num_output_g * g;
const float* weight_data_ptr = (const float*)weight_data + maxk * channels_g * num_output_g * g;

for (int i = 0; i < outh; i++)
{
@@ -167,7 +167,7 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
float sum = 0.f;

if (bias_term)
sum = bias_data.data[num_output_g * g + p];
sum = bias_data[num_output_g * g + p];

const float* kptr = weight_data_ptr + maxk * channels_g * p;



+ 3
- 4
src/layer/deconvolution.cpp View File

@@ -101,13 +101,12 @@ int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const
}

// num_output
const float* weight_data_ptr = weight_data;
#pragma omp parallel for
for (int p=0; p<num_output; p++)
{
Mat out = top_blob_bordered.channel(p);

const float bias = bias_term ? bias_data.data[p] : 0.f;
const float bias = bias_term ? bias_data[p] : 0.f;

out.fill(bias);

@@ -117,13 +116,13 @@ int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const
{
float* outptr = out.row(i*stride_h) + j*stride_w;

const float* kptr = weight_data_ptr + maxk * channels * p;
const float* kptr = (const float*)weight_data + maxk * channels * p;

// channels
for (int q=0; q<channels; q++)
{
const Mat m = bottom_blob.channel(q);
float val = *(m.data + m.w * i + j);
float val = *(m.row(i) + j);

for (int k = 0; k < maxk; k++)
{


+ 5
- 5
src/layer/deconvolutiondepthwise.cpp View File

@@ -92,10 +92,10 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
for (int g=0; g<group; g++)
{
const float* inptr = bottom_blob.channel(g);
const float* kptr = weight_data + maxk * g;
const float* kptr = (const float*)weight_data + maxk * g;
Mat m = top_blob_bordered.channel(g);

const float bias = bias_term ? bias_data.data[g] : 0.f;
const float bias = bias_term ? bias_data[g] : 0.f;

m.fill(bias);

@@ -124,12 +124,12 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
#pragma omp parallel for
for (int g = 0; g < group; g++)
{
const float* weight_data_ptr = weight_data + maxk * channels_g * num_output_g * g;
const float* weight_data_ptr = (const float*)weight_data + maxk * channels_g * num_output_g * g;
for (int p = 0; p < num_output_g; p++)
{
Mat out = top_blob_bordered.channel(g * num_output_g + p);

const float bias = bias_term ? bias_data.data[g * num_output_g + p] : 0.f;
const float bias = bias_term ? bias_data[g * num_output_g + p] : 0.f;

out.fill(bias);

@@ -145,7 +145,7 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
for (int q = 0; q < channels_g; q++)
{
const Mat m = bottom_blob.channel(channels_g * g + q);
float val = *(m.data + w * i + j);
float val = *(m.row(i) + j);

for (int k = 0; k < maxk; k++)
{


+ 1
- 1
src/layer/detectionoutput.cpp View File

@@ -201,7 +201,7 @@ int DetectionOutput::forward(const std::vector<Mat>& bottom_blobs, std::vector<M

for (int j = 0; j < num_prior; j++)
{
float score = confidence.data[j * num_class + i];
float score = confidence[j * num_class + i];

if (score > confidence_threshold)
{


+ 3
- 5
src/layer/eltwise.cpp View File

@@ -114,12 +114,10 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
}
else
{
const float* coeffs_ptr = coeffs;

// first blob
const Mat& bottom_blob1 = bottom_blobs[1];
float coeff0 = coeffs_ptr[0];
float coeff1 = coeffs_ptr[1];
float coeff0 = coeffs[0];
float coeff1 = coeffs[1];
#pragma omp parallel for
for (int q=0; q<channels; q++)
{
@@ -136,7 +134,7 @@ int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
for (size_t b=2; b<bottom_blobs.size(); b++)
{
const Mat& bottom_blob1 = bottom_blobs[b];
float coeff = coeffs_ptr[b];
float coeff = coeffs[b];
#pragma omp parallel for
for (int q=0; q<channels; q++)
{


+ 2
- 2
src/layer/embed.cpp View File

@@ -65,7 +65,7 @@ int Embed::forward(const Mat& bottom_blob, Mat& top_blob) const
#pragma omp parallel for
for (int q=0; q<words; q++)
{
float* outptr = top_blob.data + top_blob.w * q;
float* outptr = (float*)top_blob + top_blob.w * q;

int word_index = (int)word_ptr[q];

@@ -79,7 +79,7 @@ int Embed::forward(const Mat& bottom_blob, Mat& top_blob) const
{
for (int p=0; p<num_output; p++)
{
outptr[p] += bias_data.data[p];
outptr[p] += bias_data[p];
}
}
}


+ 1
- 1
src/layer/flatten.cpp View File

@@ -39,7 +39,7 @@ int Flatten::forward(const Mat& bottom_blob, Mat& top_blob) const
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);
float* outptr = top_blob.data + size * q;
float* outptr = (float*)top_blob + size * q;

for (int i=0; i<size; i++)
{


+ 2
- 3
src/layer/innerproduct.cpp View File

@@ -61,7 +61,6 @@ int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob) const
return -100;

// num_output
const float* weight_data_ptr = weight_data;
#pragma omp parallel for
for (int p=0; p<num_output; p++)
{
@@ -69,12 +68,12 @@ int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob) const
float sum = 0.f;

if (bias_term)
sum = bias_data.data[p];
sum = bias_data[p];

// channels
for (int q=0; q<channels; q++)
{
const float* w = weight_data_ptr + size * channels * p + size * q;
const float* w = (const float*)weight_data + size * channels * p + size * q;
const float* m = bottom_blob.channel(q);

for (int i = 0; i < size; i++)


+ 12
- 12
src/layer/lstm.cpp View File

@@ -97,7 +97,7 @@ int LSTM::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
// 0 otherwise
// calculate hidden
// gate_input_t := W_hc * h_conted_{t-1} + W_xc * x_t + b_c
const float cont = cont_blob.data[t];
const float cont = cont_blob[t];
const Mat x = input_blob.channel(t);
float* hidden_data = hidden;
for (int q=0; q<num_output; q++)
@@ -105,18 +105,18 @@ int LSTM::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
float h_cont = cont ? hidden_data[q] : 0.f;

const float* x_data = x;
const float* bias_c_data_ptr = bias_c_data.data + 4 * q;
float* gates_data = gates.data + 4 * q;
const float* bias_c_data_ptr = (const float*)bias_c_data + 4 * q;
float* gates_data = (float*)gates + 4 * q;

// gate I F O G
const float* weight_hc_data_I = weight_hc_data.data + weight_hc_data.w * q;
const float* weight_xc_data_I = weight_xc_data.data + weight_xc_data.w * q;
const float* weight_hc_data_F = weight_hc_data.data + weight_hc_data.w * q + size;
const float* weight_xc_data_F = weight_xc_data.data + weight_xc_data.w * q + size;
const float* weight_hc_data_O = weight_hc_data.data + weight_hc_data.w * q + size*2;
const float* weight_xc_data_O = weight_xc_data.data + weight_xc_data.w * q + size*2;
const float* weight_hc_data_G = weight_hc_data.data + weight_hc_data.w * q + size*3;
const float* weight_xc_data_G = weight_xc_data.data + weight_xc_data.w * q + size*3;
const float* weight_hc_data_I = (const float*)weight_hc_data + weight_hc_data.w * q;
const float* weight_xc_data_I = (const float*)weight_xc_data + weight_xc_data.w * q;
const float* weight_hc_data_F = (const float*)weight_hc_data + weight_hc_data.w * q + size;
const float* weight_xc_data_F = (const float*)weight_xc_data + weight_xc_data.w * q + size;
const float* weight_hc_data_O = (const float*)weight_hc_data + weight_hc_data.w * q + size*2;
const float* weight_xc_data_O = (const float*)weight_xc_data + weight_xc_data.w * q + size*2;
const float* weight_hc_data_G = (const float*)weight_hc_data + weight_hc_data.w * q + size*3;
const float* weight_xc_data_G = (const float*)weight_xc_data + weight_xc_data.w * q + size*3;

float I = bias_c_data_ptr[0];
float F = bias_c_data_ptr[1];
@@ -148,7 +148,7 @@ int LSTM::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
float* output_data = output;
for (int q=0; q<num_output; q++)
{
float* gates_data = gates.data + 4 * q;
float* gates_data = (float*)gates + 4 * q;

float I = gates_data[0];
float F = gates_data[1];


+ 10
- 12
src/layer/mvn.cpp View File

@@ -49,20 +49,19 @@ int MVN::forward(const Mat& bottom_blob, Mat& top_blob) const
Mat sum(channels);
if (sum.empty())
return -100;
float* sum_ptr = sum;

#pragma omp parallel for
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);

float sum = 0.f;
float s = 0.f;
for (int i=0; i<size; i++)
{
sum += ptr[i];
s += ptr[i];
}

sum_ptr[q] = sum;
sum[q] = s;
}

if (across_channels)
@@ -71,7 +70,7 @@ int MVN::forward(const Mat& bottom_blob, Mat& top_blob) const
float mean = 0.f;
for (int q=0; q<channels; q++)
{
mean += sum_ptr[q];
mean += sum[q];
}
mean = mean / (channels * size);

@@ -96,7 +95,7 @@ int MVN::forward(const Mat& bottom_blob, Mat& top_blob) const
{
const float* ptr = bottom_blob.channel(q);
float* outptr = top_blob.channel(q);
float mean = sum_ptr[q] / size;
float mean = sum[q] / size;

for (int i=0; i<size; i++)
{
@@ -111,20 +110,19 @@ int MVN::forward(const Mat& bottom_blob, Mat& top_blob) const
Mat sqsum(channels);
if (sqsum.empty())
return -100;
float* sqsum_ptr = sqsum;

#pragma omp parallel for
for (int q=0; q<channels; q++)
{
const float* ptr = top_blob.channel(q);

float sum = 0.f;
float s = 0.f;
for (int i=0; i<size; i++)
{
sum += ptr[i] * ptr[i];
s += ptr[i] * ptr[i];
}

sqsum_ptr[q] = sum;
sqsum[q] = s;
}

if (across_channels)
@@ -133,7 +131,7 @@ int MVN::forward(const Mat& bottom_blob, Mat& top_blob) const
float sqmean = 0.f;
for (int q=0; q<channels; q++)
{
sqmean += sqsum_ptr[q];
sqmean += sqsum[q];
}
sqmean = sqmean / (channels * size);

@@ -160,7 +158,7 @@ int MVN::forward(const Mat& bottom_blob, Mat& top_blob) const
for (int q=0; q<channels; q++)
{
float* outptr = top_blob.channel(q);
float sqmean = sqsum_ptr[q] / size;
float sqmean = sqsum[q] / size;
float norm_var = sqrt(sqmean) + eps;
float norm_var_inv = 1.f / norm_var;



+ 11
- 14
src/layer/normalize.cpp View File

@@ -63,7 +63,6 @@ int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const
if (square_sum_blob.empty())
return -100;

float* square_sum_ptr = square_sum_blob;
#pragma omp parallel for
for (int q=0; q<channels; q++)
{
@@ -75,14 +74,14 @@ int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const
ssum += ptr[i] * ptr[i];
}

square_sum_ptr[q] = ssum;
square_sum_blob[q] = ssum;
}

// sum + eps
float ssum = eps;
for (int q=0; q<channels; q++)
{
ssum += square_sum_ptr[q];
ssum += square_sum_blob[q];
}

// 1 / sqrt(ssum)
@@ -90,7 +89,7 @@ int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const

if (channel_shared)
{
float scale = a * scale_data.data[0];
float scale = a * scale_data[0];

#pragma omp parallel for
for (int q=0; q<channels; q++)
@@ -111,7 +110,7 @@ int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const
{
const float* ptr = bottom_blob.channel(q);
float* outptr = top_blob.channel(q);
float scale = a * scale_data.data[q];
float scale = a * scale_data[q];

for (int i=0; i<size; i++)
{
@@ -124,15 +123,13 @@ int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const
{
// square sum, 1 / sqrt(ssum)
Mat square_sum_blob;
square_sum_blob.create(w, h);
square_sum_blob.create(size);
if (square_sum_blob.empty())
return -100;

float* ssptr = square_sum_blob;

if (channel_shared)
{
float scale = scale_data.data[0];
float scale = scale_data[0];

#pragma omp parallel for
for (int i=0; i<size; i++)
@@ -144,7 +141,7 @@ int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const
ssum += ptr[i] * ptr[i];
}

ssptr[i] = 1.f / sqrt(ssum) * scale;
square_sum_blob[i] = 1.f / sqrt(ssum) * scale;
}

#pragma omp parallel for
@@ -155,7 +152,7 @@ int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const

for (int i=0; i<size; i++)
{
outptr[i] = ptr[i] * ssptr[i];
outptr[i] = ptr[i] * square_sum_blob[i];
}
}
}
@@ -171,7 +168,7 @@ int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const
ssum += ptr[i] * ptr[i];
}

ssptr[i] = 1.f / sqrt(ssum);
square_sum_blob[i] = 1.f / sqrt(ssum);
}

#pragma omp parallel for
@@ -179,11 +176,11 @@ int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const
{
const float* ptr = bottom_blob.channel(q);
float* outptr = top_blob.channel(q);
float scale = scale_data.data[q];
float scale = scale_data[q];

for (int i=0; i<size; i++)
{
outptr[i] = ptr[i] * ssptr[i] * scale;
outptr[i] = ptr[i] * square_sum_blob[i] * scale;
}
}
}


+ 2
- 1
src/layer/pooling.cpp View File

@@ -249,7 +249,8 @@ int Pooling::forward(const Mat& bottom_blob, Mat& top_blob) const
{
const float scale = (float)kernel_w / wtail;

outptr = top_blob.channel(q) + outw - 1;
outptr = top_blob.channel(q);
outptr += outw - 1;
for (int i = 0; i < outh; i++)
{
*outptr *= scale;


+ 1
- 3
src/layer/prelu.cpp View File

@@ -47,13 +47,11 @@ int PReLU::forward_inplace(Mat& bottom_top_blob) const
int channels = bottom_top_blob.c;
int size = w * h;

const float* slope_data_ptr = slope_data;

#pragma omp parallel for
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
float slope = num_slope > 1 ? slope_data_ptr[q] : slope_data_ptr[0];
float slope = num_slope > 1 ? slope_data[q] : slope_data[0];

for (int i=0; i<size; i++)
{


+ 3
- 3
src/layer/priorbox.cpp View File

@@ -79,7 +79,7 @@ int PriorBox::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to
#pragma omp parallel for
for (int i = 0; i < h; i++)
{
float* box = top_blob.data + i * w * num_prior * 4;
float* box = (float*)top_blob + i * w * num_prior * 4;

float center_x = offset * step_w;
float center_y = offset * step_h + i * step_h;
@@ -91,7 +91,7 @@ int PriorBox::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to

for (int k = 0; k < num_min_size; k++)
{
float min_size = min_sizes.data[k];
float min_size = min_sizes[k];

// min size box
box_w = box_h = min_size;
@@ -105,7 +105,7 @@ int PriorBox::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to

if (num_max_size > 0)
{
float max_size = max_sizes.data[k];
float max_size = max_sizes[k];

// max size box
box_w = box_h = sqrt(min_size * max_size);


+ 11
- 11
src/layer/proposal.cpp View File

@@ -28,14 +28,14 @@ Proposal::Proposal()

// TODO load from param
ratios.create(3);
ratios.data[0] = 0.5f;
ratios.data[1] = 1.f;
ratios.data[2] = 2.f;
ratios[0] = 0.5f;
ratios[1] = 1.f;
ratios[2] = 2.f;

scales.create(3);
scales.data[0] = 8.f;
scales.data[1] = 16.f;
scales.data[2] = 32.f;
scales[0] = 8.f;
scales[1] = 16.f;
scales[2] = 32.f;
}

static Mat generate_anchors(int base_size, const Mat& ratios, const Mat& scales)
@@ -51,14 +51,14 @@ static Mat generate_anchors(int base_size, const Mat& ratios, const Mat& scales)

for (int i = 0; i < num_ratio; i++)
{
float ar = ratios.data[i];
float ar = ratios[i];

int r_w = round(base_size / sqrt(ar));
int r_h = round(r_w * ar);//round(base_size * sqrt(ar));

for (int j = 0; j < num_scale; j++)
{
float scale = scales.data[j];
float scale = scales[j];

float rs_w = r_w * scale;
float rs_h = r_h * scale;
@@ -269,8 +269,8 @@ int Proposal::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to
}

// clip predicted boxes to image
float im_w = im_info_blob.data[1];
float im_h = im_info_blob.data[0];
float im_w = im_info_blob[1];
float im_h = im_info_blob[0];

#pragma omp parallel for
for (int q=0; q<num_anchors; q++)
@@ -293,7 +293,7 @@ int Proposal::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to
std::vector<Rect> proposal_boxes;
std::vector<float> scores;

float im_scale = im_info_blob.data[2];
float im_scale = im_info_blob[2];
float min_boxsize = min_size * im_scale;

for (int q=0; q<num_anchors; q++)


+ 13
- 23
src/layer/reduction.cpp View File

@@ -82,7 +82,7 @@ static int reduction_op(const Mat& a, Mat& b, float v0, int dim, float coeff)
Mat sums(channels);
if (sums.empty())
return -100;
float* sums_ptr = sums;
#pragma omp parallel for
for (int q=0; q<channels; q++)
{
@@ -94,22 +94,19 @@ static int reduction_op(const Mat& a, Mat& b, float v0, int dim, float coeff)
sum = op(sum, ptr[i]);
}

sums_ptr[q] = sum;
sums[q] = sum;
}

float* outptr = b;

float sum = v0;
for (int i=0; i<channels; i++)
{
sum = op2(sum, sums_ptr[i]);
sum = op2(sum, sums[i]);
}

outptr[0] = sum * coeff;
b[0] = sum * coeff;
}
else if (dim == 1)
{
float* outptr = b;
#pragma omp parallel for
for (int q=0; q<channels; q++)
{
@@ -121,7 +118,7 @@ static int reduction_op(const Mat& a, Mat& b, float v0, int dim, float coeff)
sum = op(sum, ptr[i]);
}

outptr[q] = sum * coeff;
b[q] = sum * coeff;
}
}
else if (dim == 2)
@@ -173,19 +170,18 @@ static int reduction_op(const Mat& a, Mat& b, float v0, int dim, float coeff)

b.fill(v0);

float* outptr = b;
for (int q=0; q<channels; q++)
{
const float* mins_ptr = mins.channel(q);
for (int j=0; j<w; j++)
{
outptr[j] = op2(outptr[j], mins_ptr[j]);
b[j] = op2(b[j], mins_ptr[j]);
}
}

for (int j=0; j<w; j++)
{
outptr[j] *= coeff;
b[j] *= coeff;
}
}
else if (dim == -2)
@@ -195,18 +191,16 @@ static int reduction_op(const Mat& a, Mat& b, float v0, int dim, float coeff)
for (int q=0; q<channels; q++)
{
const float* ptr = a.channel(q);
float* outptr = b;

for (int i=0; i<size; i++)
{
outptr[i] = op(outptr[i], ptr[i]);
b[i] = op(b[i], ptr[i]);
}
}

float* outptr = b;
for (int i=0; i<size; i++)
{
outptr[i] *= coeff;
b[i] *= coeff;
}
}

@@ -257,15 +251,13 @@ int Reduction::forward(const Mat& bottom_blob, Mat& top_blob) const

if (dim == 0)
{
float* outptr = top_blob;
outptr[0] /= channels * size;
top_blob[0] /= channels * size;
}
else if (dim == 1)
{
float* outptr = top_blob;
for (int q=0; q<channels; q++)
{
outptr[q] /= size;
top_blob[q] /= size;
}
}
else if (dim == 2)
@@ -282,18 +274,16 @@ int Reduction::forward(const Mat& bottom_blob, Mat& top_blob) const
}
else if (dim == -1)
{
float* outptr = top_blob;
for (int j=0; j<w; j++)
{
outptr[j] /= h * channels;
top_blob[j] /= h * channels;
}
}
else if (dim == -2)
{
float* outptr = top_blob;
for (int i=0; i<size; i++)
{
outptr[i] /= channels;
top_blob[i] /= channels;
}
}
}


+ 6
- 6
src/layer/rnn.cpp View File

@@ -92,18 +92,18 @@ int RNN::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blo
// 0 otherwise
// calculate hidden
// h_t = tanh( W_hh * h_cont_{t-1} + W_xh * x_t + b_h )
const float cont = cont_blob.data[t];
const float cont = cont_blob[t];
const Mat x = input_blob.channel(t);
float* hidden_data = hidden;
for (int q=0; q<num_output; q++)
{
float h_cont = cont ? hidden_data[q] : 0.f;

const float* weight_hh_data_ptr = weight_hh_data.data + weight_hh_data.w * q;
const float* weight_xh_data_ptr = weight_xh_data.data + weight_xh_data.w * q;
const float* weight_hh_data_ptr = (const float*)weight_hh_data + weight_hh_data.w * q;
const float* weight_xh_data_ptr = (const float*)weight_xh_data + weight_xh_data.w * q;
const float* x_data = x;

float s0 = bias_h_data.data[q];
float s0 = bias_h_data[q];
for (int i=0; i<size; i++)
{
s0 += weight_hh_data_ptr[i] * h_cont + weight_xh_data_ptr[i] * x_data[i];
@@ -118,9 +118,9 @@ int RNN::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blo
float* output_data = output;
for (int q=0; q<num_output; q++)
{
const float* weight_ho_data_ptr = weight_ho_data.data + weight_ho_data.w * q;
const float* weight_ho_data_ptr = (const float*)weight_ho_data + weight_ho_data.w * q;

float s0 = bias_o_data.data[q];
float s0 = bias_o_data[q];
for (int i=0; i<size; i++)
{
s0 += weight_ho_data_ptr[i] * hidden_data[i];


+ 4
- 8
src/layer/scale.cpp View File

@@ -66,14 +66,13 @@ int Scale::forward_inplace(std::vector<Mat>& bottom_top_blobs) const

if (bias_term)
{
const float* bias_ptr = bias_data;
#pragma omp parallel for
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);

float s = scale_blob.channel(q)[0];
float bias = bias_ptr[q];
float bias = bias_data[q];

for (int i=0; i<size; i++)
{
@@ -109,15 +108,13 @@ int Scale::forward_inplace(Mat& bottom_top_blob) const

if (bias_term)
{
const float* scale_ptr = scale_data;
const float* bias_ptr = bias_data;
#pragma omp parallel for
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);

float s = scale_ptr[q];
float bias = bias_ptr[q];
float s = scale_data[q];
float bias = bias_data[q];

for (int i=0; i<size; i++)
{
@@ -127,13 +124,12 @@ int Scale::forward_inplace(Mat& bottom_top_blob) const
}
else
{
const float* scale_ptr = scale_data;
#pragma omp parallel for
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);

float s = scale_ptr[q];
float s = scale_data[q];

for (int i=0; i<size; i++)
{


+ 3
- 6
src/layer/slice.cpp View File

@@ -37,7 +37,7 @@ int Slice::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_b
int channels = bottom_blob.c;

int q = 0;
const int* slices_ptr = (const int*)slices.data;
const int* slices_ptr = slices;
for (size_t i=0; i<top_blobs.size(); i++)
{
int slice = slices_ptr[i];
@@ -54,11 +54,8 @@ int Slice::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_b
int size = bottom_blob.cstep * slice;

const float* ptr = bottom_blob.channel(q);
float* outptr = top_blob.data;
for (int j=0; j<size; j++)
{
outptr[j] = ptr[j];
}
float* outptr = top_blob;
memcpy(outptr, ptr, size * sizeof(float));

q += slice;
}


+ 12
- 20
src/layer/softmax.cpp View File

@@ -84,13 +84,12 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
return -100;
max.fill(-FLT_MAX);

float* maxptr = max;
for (int i=0; i<h; i++)
{
const float* ptr = bottom_top_blob.row(i);
for (int j=0; j<w; j++)
{
maxptr[j] = std::max(maxptr[j], ptr[j]);
max[j] = std::max(max[j], ptr[j]);
}
}

@@ -99,7 +98,7 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
float* ptr = bottom_top_blob.row(i);
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - maxptr[j]);
ptr[j] = exp(ptr[j] - max[j]);
}
}

@@ -109,13 +108,12 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
return -100;
sum.fill(0.f);

float* sumptr = sum;
for (int i=0; i<h; i++)
{
const float* ptr = bottom_top_blob.row(i);
for (int j=0; j<w; j++)
{
sumptr[j] += ptr[j];
sum[j] += ptr[j];
}
}

@@ -124,7 +122,7 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
float* ptr = bottom_top_blob.row(i);
for (int j=0; j<w; j++)
{
ptr[j] /= sumptr[j];
ptr[j] /= sum[j];
}
}

@@ -141,7 +139,6 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
if (max.empty())
return -100;

float* maxptr = max;
for (int i=0; i<h; i++)
{
const float* ptr = bottom_top_blob.row(i);
@@ -152,14 +149,14 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
m = std::max(m, ptr[j]);
}

maxptr[i] = m;
max[i] = m;
}

for (int i=0; i<h; i++)
{
float* ptr = bottom_top_blob.row(i);

float m = maxptr[i];
float m = max[i];
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - m);
@@ -171,7 +168,6 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
if (sum.empty())
return -100;

float* sumptr = sum;
for (int i=0; i<h; i++)
{
const float* ptr = bottom_top_blob.row(i);
@@ -182,14 +178,14 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
s += ptr[j];
}

sumptr[i] = s;
sum[i] = s;
}

for (int i=0; i<h; i++)
{
float* ptr = bottom_top_blob.row(i);

float s = sumptr[i];
float s = sum[i];
for (int j=0; j<w; j++)
{
ptr[j] /= s;
@@ -214,11 +210,10 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_top_blob.channel(q);
float* maxptr = max;

for (int i=0; i<size; i++)
{
maxptr[i] = std::max(maxptr[i], ptr[i]);
max[i] = std::max(max[i], ptr[i]);
}
}

@@ -226,11 +221,10 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
float* maxptr = max;

for (int i=0; i<size; i++)
{
ptr[i] = exp(ptr[i] - maxptr[i]);
ptr[i] = exp(ptr[i] - max[i]);
}
}

@@ -242,11 +236,10 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_top_blob.channel(q);
float* sumptr = sum;

for (int i=0; i<size; i++)
{
sumptr[i] += ptr[i];
sum[i] += ptr[i];
}
}

@@ -254,11 +247,10 @@ int Softmax::forward_inplace(Mat& bottom_top_blob) const
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
float* sumptr = sum;

for (int i=0; i<size; i++)
{
ptr[i] /= sumptr[i];
ptr[i] /= sum[i];
}
}



+ 2
- 2
src/layer/spp.cpp View File

@@ -113,7 +113,7 @@ int SPP::forward(const Mat& bottom_blob, Mat& top_blob) const
{
for (int j = 0; j < outw; j++)
{
const float* sptr = m.data + m.w * i*stride_h + j*stride_w;
const float* sptr = m.row(i*stride_h) + j*stride_w;

float max = sptr[0];

@@ -142,7 +142,7 @@ int SPP::forward(const Mat& bottom_blob, Mat& top_blob) const
{
for (int j = 0; j < outw; j++)
{
const float* sptr = m.data + m.w * i*stride_h + j*stride_w;
const float* sptr = m.row(i*stride_h) + j*stride_w;

float sum = 0;



+ 1
- 3
src/layer/unaryop.cpp View File

@@ -40,12 +40,10 @@ static int unary_op_inplace(Mat& a)

int size = a.total();

float* ptr = a;

#pragma omp parallel for
for (int i=0; i<size; i++)
{
ptr[i] = op(ptr[i]);
a[i] = op(a[i]);
}

return 0;


+ 8
- 8
src/mat.cpp View File

@@ -32,7 +32,7 @@ void Mat::substract_mean_normalize(const float* mean_vals, const float* norm_val
#pragma omp parallel for
for (int q=0; q<c; q++)
{
float* ptr = data + cstep * q;
float* ptr = channel(q);//data + cstep * q;
const float mean = mean_vals[q];

#if __ARM_NEON
@@ -87,7 +87,7 @@ void Mat::substract_mean_normalize(const float* mean_vals, const float* norm_val
#pragma omp parallel for
for (int q=0; q<c; q++)
{
float* ptr = data + cstep * q;
float* ptr = channel(q);//data + cstep * q;
const float norm = norm_vals[q];

#if __ARM_NEON
@@ -142,7 +142,7 @@ void Mat::substract_mean_normalize(const float* mean_vals, const float* norm_val
#pragma omp parallel for
for (int q=0; q<c; q++)
{
float* ptr = data + cstep * q;
float* ptr = channel(q);//data + cstep * q;
const float mean = mean_vals[q];
const float norm = norm_vals[q];

@@ -257,7 +257,7 @@ Mat Mat::from_float16(const unsigned short* data, int size)
if (m.empty())
return m;

float* ptr = m.data;
float* ptr = m;//.data;

#if __ARM_NEON && (__ARM_FP & 2)
int nn = cpu_support_arm_vfpv4() ? size >> 2 : 0;
@@ -324,8 +324,8 @@ static void copy_make_border_image(const Mat& src, Mat& dst, int top, int left,
int w = dst.w;
int h = dst.h;

const float* ptr = src.data;
float* outptr = dst.data;
const float* ptr = src;//.data;
float* outptr = dst;//.data;

if (type == BORDER_CONSTANT)
{
@@ -508,8 +508,8 @@ static void copy_cut_border_image(const Mat& src, Mat& dst, int top, int left)
int w = dst.w;
int h = dst.h;

const float* ptr = src.data + src.w * top + left;
float* outptr = dst.data;
const float* ptr = src.row(top) + left;//.data + src.w * top + left;
float* outptr = dst;//.data;

for (int y = 0; y < h; y++)
{


+ 133
- 78
src/mat.h View File

@@ -30,25 +30,26 @@ public:
// empty
Mat();
// vec
Mat(int w);
Mat(int w, size_t elemsize = 4);
// image
Mat(int w, int h);
Mat(int w, int h, size_t elemsize = 4);
// dim
Mat(int w, int h, int c);
Mat(int w, int h, int c, size_t elemsize = 4);
// copy
Mat(const Mat& m);
// external vec
Mat(int w, float* data);
Mat(int w, void* data, size_t elemsize = 4);
// external image
Mat(int w, int h, float* data);
Mat(int w, int h, void* data, size_t elemsize = 4);
// external dim
Mat(int w, int h, int c, float* data);
Mat(int w, int h, int c, void* data, size_t elemsize = 4);
// release
~Mat();
// assign
Mat& operator=(const Mat& m);
// set all
void fill(float v);
template <typename T> void fill(T v);
// deep copy
Mat clone() const;
// reshape vec
@@ -58,11 +59,11 @@ public:
// reshape dim
Mat reshape(int w, int h, int c) const;
// allocate vec
void create(int w);
void create(int w, size_t elemsize = 4);
// allocate image
void create(int w, int h);
void create(int w, int h, size_t elemsize = 4);
// allocate dim
void create(int w, int h, int c);
void create(int w, int h, int c, size_t elemsize = 4);
// refcount++
void addref();
// refcount--
@@ -76,8 +77,16 @@ public:
const Mat channel(int c) const;
float* row(int y);
const float* row(int y) const;
operator float*();
operator const float*() const;
template<typename T> T* row(int y);
template<typename T> const T* row(int y) const;

// access raw data
template<typename T> operator T*();
template<typename T> operator const T*() const;

// convenient access float vec element
float& operator[](int i);
const float& operator[](int i) const;

enum
{
@@ -119,15 +128,23 @@ public:
// convenient construct from half precision floating point data
static Mat from_float16(const unsigned short* data, int size);

// the dimensionality
int dims;
// pointer to the data
float* data;
void* data;

// pointer to the reference counter;
// pointer to the reference counter
// when the Mat points to user-allocated data, this pointer is NULL
int* refcount;

// element size in bytes
// 4 = float32/int32
// 2 = float16
// 1 = int8/uint8
// 0 = empty
size_t elemsize;

// the dimensionality
int dims;

int w;
int h;
int c;
@@ -217,30 +234,30 @@ static inline void NCNN_XADD(int* addr, int delta) { int tmp = *addr; *addr += d
#endif

inline Mat::Mat()
: dims(0), data(0), refcount(0), w(0), h(0), c(0), cstep(0)
: data(0), refcount(0), elemsize(0), dims(0), w(0), h(0), c(0), cstep(0)
{
}

inline Mat::Mat(int _w)
: dims(0), data(0), refcount(0)
inline Mat::Mat(int _w, size_t _elemsize)
: data(0), refcount(0), dims(0)
{
create(_w);
create(_w, _elemsize);
}

inline Mat::Mat(int _w, int _h)
: dims(0), data(0), refcount(0)
inline Mat::Mat(int _w, int _h, size_t _elemsize)
: data(0), refcount(0), dims(0)
{
create(_w, _h);
create(_w, _h, _elemsize);
}

inline Mat::Mat(int _w, int _h, int _c)
: dims(0), data(0), refcount(0)
inline Mat::Mat(int _w, int _h, int _c, size_t _elemsize)
: data(0), refcount(0), dims(0)
{
create(_w, _h, _c);
create(_w, _h, _c, _elemsize);
}

inline Mat::Mat(const Mat& m)
: dims(m.dims), data(m.data), refcount(m.refcount)
: data(m.data), refcount(m.refcount), elemsize(m.elemsize), dims(m.dims)
{
if (refcount)
NCNN_XADD(refcount, 1);
@@ -252,8 +269,8 @@ inline Mat::Mat(const Mat& m)
cstep = m.cstep;
}

inline Mat::Mat(int _w, float* _data)
: dims(1), data(_data), refcount(0)
inline Mat::Mat(int _w, void* _data, size_t _elemsize)
: data(_data), refcount(0), elemsize(_elemsize), dims(1)
{
w = _w;
h = 1;
@@ -262,8 +279,8 @@ inline Mat::Mat(int _w, float* _data)
cstep = w;
}

inline Mat::Mat(int _w, int _h, float* _data)
: dims(2), data(_data), refcount(0)
inline Mat::Mat(int _w, int _h, void* _data, size_t _elemsize)
: data(_data), refcount(0), elemsize(_elemsize), dims(2)
{
w = _w;
h = _h;
@@ -272,14 +289,14 @@ inline Mat::Mat(int _w, int _h, float* _data)
cstep = w * h;
}

inline Mat::Mat(int _w, int _h, int _c, float* _data)
: dims(3), data(_data), refcount(0)
inline Mat::Mat(int _w, int _h, int _c, void* _data, size_t _elemsize)
: data(_data), refcount(0), elemsize(_elemsize), dims(3)
{
w = _w;
h = _h;
c = _c;

cstep = alignSize(w * h * sizeof(float), 16) >> 2;
cstep = alignSize(w * h * elemsize, 16) / elemsize;
}

inline Mat::~Mat()
@@ -297,10 +314,11 @@ inline Mat& Mat::operator=(const Mat& m)

release();

dims = m.dims;
data = m.data;
refcount = m.refcount;
elemsize = m.elemsize;

dims = m.dims;
w = m.w;
h = m.h;
c = m.c;
@@ -313,7 +331,7 @@ inline Mat& Mat::operator=(const Mat& m)
inline void Mat::fill(float _v)
{
int size = total();
float* ptr = data;
float* ptr = (float*)data;

#if __ARM_NEON
int nn = size >> 2;
@@ -354,6 +372,17 @@ inline void Mat::fill(float _v)
}
}

// Set every element to _v, viewing the buffer as an array of T.
// The element count is taken from total(); elemsize is not consulted here.
template <typename T>
inline void Mat::fill(T _v)
{
    T* cursor = (T*)data;
    for (int remaining = total(); remaining > 0; remaining--)
    {
        *cursor++ = _v;
    }
}

inline Mat Mat::clone() const
{
if (empty())
@@ -361,15 +390,15 @@ inline Mat Mat::clone() const

Mat m;
if (dims == 1)
m.create(w);
m.create(w, elemsize);
else if (dims == 2)
m.create(w, h);
m.create(w, h, elemsize);
else if (dims == 3)
m.create(w, h, c);
m.create(w, h, c, elemsize);

if (total() > 0)
{
memcpy(m.data, data, total() * sizeof(float));
memcpy(m.data, data, total() * elemsize);
}

return m;
@@ -383,14 +412,14 @@ inline Mat Mat::reshape(int _w) const
if (dims == 3 && cstep != (size_t)w * h)
{
Mat m;
m.create(_w);
m.create(_w, elemsize);

// flatten
for (int i=0; i<c; i++)
{
const float* ptr = data + i * cstep;
float* mptr = m.data + i * w * h;
memcpy(mptr, ptr, w * h * sizeof(float));
const void* ptr = (unsigned char*)data + i * cstep * elemsize;
void* mptr = (unsigned char*)m.data + i * w * h * elemsize;
memcpy(mptr, ptr, w * h * elemsize);
}

return m;
@@ -399,7 +428,6 @@ inline Mat Mat::reshape(int _w) const
Mat m = *this;

m.dims = 1;

m.w = _w;
m.h = 1;
m.c = 1;
@@ -417,14 +445,14 @@ inline Mat Mat::reshape(int _w, int _h) const
if (dims == 3 && cstep != (size_t)w * h)
{
Mat m;
m.create(_w, _h);
m.create(_w, _h, elemsize);

// flatten
for (int i=0; i<c; i++)
{
const float* ptr = data + i * cstep;
float* mptr = m.data + i * w * h;
memcpy(mptr, ptr, w * h * sizeof(float));
const void* ptr = (unsigned char*)data + i * cstep * elemsize;
void* mptr = (unsigned char*)m.data + i * w * h * elemsize;
memcpy(mptr, ptr, w * h * elemsize);
}

return m;
@@ -433,7 +461,6 @@ inline Mat Mat::reshape(int _w, int _h) const
Mat m = *this;

m.dims = 2;

m.w = _w;
m.h = _h;
m.c = 1;
@@ -450,17 +477,17 @@ inline Mat Mat::reshape(int _w, int _h, int _c) const

if (dims < 3)
{
if ((size_t)_w * _h != alignSize(_w * _h * sizeof(float), 16) >> 2)
if ((size_t)_w * _h != alignSize(_w * _h * elemsize, 16) / elemsize)
{
Mat m;
m.create(_w, _h, _c);
m.create(_w, _h, _c, elemsize);

// align channel
for (int i=0; i<_c; i++)
{
const float* ptr = data + i * _w * _h;
float* mptr = m.data + i * m.cstep;
memcpy(mptr, ptr, _w * _h * sizeof(float));
const void* ptr = (unsigned char*)data + i * _w * _h * elemsize;
void* mptr = (unsigned char*)m.data + i * m.cstep * m.elemsize;
memcpy(mptr, ptr, _w * _h * elemsize);
}

return m;
@@ -476,22 +503,22 @@ inline Mat Mat::reshape(int _w, int _h, int _c) const
Mat m = *this;

m.dims = 3;

m.w = _w;
m.h = _h;
m.c = _c;

m.cstep = alignSize(_w * _h * sizeof(float), 16) >> 2;
m.cstep = alignSize(_w * _h * elemsize, 16) / elemsize;

return m;
}

inline void Mat::create(int _w)
inline void Mat::create(int _w, size_t _elemsize)
{
release();

dims = 1;
elemsize = _elemsize;

dims = 1;
w = _w;
h = 1;
c = 1;
@@ -500,19 +527,20 @@ inline void Mat::create(int _w)

if (total() > 0)
{
size_t totalsize = total() * sizeof(float);
data = (float*)fastMalloc(totalsize + (int)sizeof(*refcount));
size_t totalsize = total() * elemsize;
data = fastMalloc(totalsize + (int)sizeof(*refcount));
refcount = (int*)(((unsigned char*)data) + totalsize);
*refcount = 1;
}
}

inline void Mat::create(int _w, int _h)
inline void Mat::create(int _w, int _h, size_t _elemsize)
{
release();

dims = 2;
elemsize = _elemsize;

dims = 2;
w = _w;
h = _h;
c = 1;
@@ -521,29 +549,30 @@ inline void Mat::create(int _w, int _h)

if (total() > 0)
{
size_t totalsize = total() * sizeof(float);
data = (float*)fastMalloc(totalsize + (int)sizeof(*refcount));
size_t totalsize = total() * elemsize;
data = fastMalloc(totalsize + (int)sizeof(*refcount));
refcount = (int*)(((unsigned char*)data) + totalsize);
*refcount = 1;
}
}

inline void Mat::create(int _w, int _h, int _c)
inline void Mat::create(int _w, int _h, int _c, size_t _elemsize)
{
release();

dims = 3;
elemsize = _elemsize;

dims = 3;
w = _w;
h = _h;
c = _c;

cstep = alignSize(w * h * sizeof(float), 16) >> 2;
cstep = alignSize(w * h * elemsize, 16) / elemsize;

if (total() > 0)
{
size_t totalsize = total() * sizeof(float);
data = (float*)fastMalloc(totalsize + (int)sizeof(*refcount));
size_t totalsize = total() * elemsize;
data = fastMalloc(totalsize + (int)sizeof(*refcount));
refcount = (int*)(((unsigned char*)data) + totalsize);
*refcount = 1;
}
@@ -560,9 +589,11 @@ inline void Mat::release()
if (refcount && NCNN_XADD(refcount, -1) == 1)
fastFree(data);

dims = 0;
data = 0;

elemsize = 0;

dims = 0;
w = 0;
h = 0;
c = 0;
@@ -584,32 +615,56 @@ inline size_t Mat::total() const

inline Mat Mat::channel(int c)
{
return Mat(w, h, data + cstep * c);
return Mat(w, h, (unsigned char*)data + cstep * c * elemsize, elemsize);
}

inline const Mat Mat::channel(int c) const
{
return Mat(w, h, data + cstep * c);
return Mat(w, h, (unsigned char*)data + cstep * c * elemsize, elemsize);
}

inline float* Mat::row(int y)
{
return data + w * y;
return (float*)data + w * y;
}

inline const float* Mat::row(int y) const
{
return data + w * y;
return (const float*)data + w * y;
}

// Pointer to the first element of row y, with the buffer viewed as T elements.
// The offset is w * y elements from the start of data.
template <typename T>
inline T* Mat::row(int y)
{
    T* first = (T*)data;
    return first + w * y;
}

// Read-only pointer to the first element of row y, with the buffer viewed as T elements.
// The offset is w * y elements from the start of data.
template <typename T>
inline const T* Mat::row(int y) const
{
    const T* first = (const T*)data;
    return first + w * y;
}

// Implicit conversion to a typed pointer: reinterprets the raw data pointer as T*.
template <typename T>
inline Mat::operator T*()
{
    T* raw = (T*)data;
    return raw;
}

// Implicit conversion to a read-only typed pointer: reinterprets the raw data pointer as const T*.
template <typename T>
inline Mat::operator const T*() const
{
    const T* raw = (const T*)data;
    return raw;
}

inline Mat::operator float*()
inline float& Mat::operator[](int i)
{
return data;
return ((float*)data)[i];
}

inline Mat::operator const float*() const
inline const float& Mat::operator[](int i) const
{
return data;
return ((const float*)data)[i];
}

} // namespace ncnn


+ 12
- 11
src/paramdict.cpp View File

@@ -121,9 +121,15 @@ int ParamDict::load_param(FILE* fp)
bool is_float = vstr_is_float(vstr);

if (is_float)
nscan = sscanf(vstr, "%f", &params[id].v.data[j]);
{
float* ptr = params[id].v;
nscan = sscanf(vstr, "%f", &ptr[j]);
}
else
nscan = sscanf(vstr, "%d", (int*)&params[id].v.data[j]);
{
int* ptr = params[id].v;
nscan = sscanf(vstr, "%d", &ptr[j]);
}
if (nscan != 1)
{
fprintf(stderr, "ParamDict parse array element fail\n");
@@ -196,10 +202,8 @@ int ParamDict::load_param_bin(FILE* fp)

params[id].v.create(len);

for (int j = 0; j < len; j++)
{
fread(&params[id].v.data[j], sizeof(float), 1, fp);
}
float* ptr = params[id].v;
fread(ptr, sizeof(float), len, fp);
}
else
{
@@ -237,11 +241,8 @@ int ParamDict::load_param(const unsigned char*& mem)

params[id].v.create(len);

for (int j = 0; j < len; j++)
{
params[id].v.data[j] = *(float*)(mem);
mem += 4;
}
memcpy(params[id].v.data, mem, len * 4);
mem += 4;
}
else
{


Loading…
Cancel
Save