Browse Source

binaryop broadcasting special type 3 4 for lhs

tags/20200413
nihuini 6 years ago
parent
commit
6077066b02
10 changed files with 364 additions and 29 deletions
  1. +2
    -0
      docs/developer-guide/binaryop-broadcasting.md
  2. +210
    -20
      src/layer/arm/binaryop_arm.cpp
  3. +61
    -5
      src/layer/binaryop.cpp
  4. +9
    -2
      src/layer/vulkan/binaryop_vulkan.cpp
  5. +14
    -0
      src/layer/vulkan/shader/binaryop_broadcast.comp
  6. +9
    -1
      src/layer/vulkan/shader/binaryop_broadcast_a1_pack4.comp
  7. +9
    -1
      src/layer/vulkan/shader/binaryop_broadcast_a1_pack8.comp
  8. +7
    -0
      src/layer/vulkan/shader/binaryop_broadcast_pack4.comp
  9. +7
    -0
      src/layer/vulkan/shader/binaryop_broadcast_pack8.comp
  10. +36
    -0
      tests/test_binaryop.cpp

+ 2
- 0
docs/developer-guide/binaryop-broadcasting.md View File

@@ -34,3 +34,5 @@ some special broadcasting rule exists for model compatibility
|---|---|---|---|
|1|[2,3,4]|[1,1,4]|[2,3,4]|
|2|[2,3,4]|[2,3,1]|[2,3,4]|
|3|[1,1,4]|[2,3,4]|[2,3,4]|
|4|[2,3,1]|[2,3,4]|[2,3,4]|

+ 210
- 20
src/layer/arm/binaryop_arm.cpp View File

@@ -60,21 +60,21 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt

if (a.dims == 3)
{
c.create(w, h, channels, elemsize, elempack, opt.blob_allocator);
if (c.empty())
return -100;

if (b.dims == 3)
{
if (w1 == 1 && h1 == 1 && channels1 == channels)
{
// special type 1
c.create(w, h, channels, elemsize, elempack, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = a.channel(q);
float* outptr = c.channel(q);
const float* b0 = b.channel(q);
float* outptr = c.channel(q);
float32x4_t _b0 = vld1q_f32(b0);
for (int i = 0; i < size; i++)
{
@@ -92,6 +92,10 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
if (w1 == w && h1 == h && channels1 == 1 && elempack1 == 1)
{
// special type 2
c.create(w, h, channels, elemsize, elempack, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
@@ -113,7 +117,66 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
return 0;
}

if (w == 1 && h == 1 && channels1 == channels)
{
// special type 3
c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels1; q++)
{
const float* a0 = a.channel(q);
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);
float32x4_t _a0 = vld1q_f32(a0);
for (int i = 0; i < size1; i++)
{
float32x4_t _p1 = vld1q_f32(ptr1);
float32x4_t _outp = op(_a0, _p1);
vst1q_f32(outptr, _outp);
ptr1 += 4;
outptr += 4;
}
}

return 0;
}

if (w1 == w && h1 == h && channels == 1 && elempack == 1)
{
// special type 4
c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels1; q++)
{
const float* ptr = a;
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);
for (int i = 0; i < size1; i++)
{
float32x4_t _p = vld1q_dup_f32(ptr);
float32x4_t _p1 = vld1q_f32(ptr1);
float32x4_t _outp = op(_p, _p1);
vst1q_f32(outptr, _outp);
ptr += 1;
ptr1 += 4;
outptr += 4;
}
}

return 0;
}

// type 19
c.create(w, h, channels, elemsize, elempack, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
@@ -136,6 +199,10 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
return 0;
}

c.create(w, h, channels, elemsize, elempack, opt.blob_allocator);
if (c.empty())
return -100;

if (b.dims == 2)
{
// type 18
@@ -216,7 +283,7 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
if (b.dims == 3)
{
// type 14
c.create(w1, h1, channels1, elemsize, elempack, opt.blob_allocator);
c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator);
if (c.empty())
return -100;

@@ -396,7 +463,7 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
if (b.dims == 3)
{
// type 9
c.create(w1, h1, channels1, elemsize, elempack, opt.blob_allocator);
c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator);
if (c.empty())
return -100;

@@ -423,7 +490,7 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
if (b.dims == 2)
{
// type 8
c.create(w1, h1, elemsize, elempack, opt.blob_allocator);
c.create(w1, h1, elemsize1, elempack1, opt.blob_allocator);
if (c.empty())
return -100;

@@ -681,15 +748,15 @@ static int binary_op_pack4_bf16s(const Mat& a, const Mat& b, Mat& c, const Optio

if (a.dims == 3)
{
c.create(w, h, channels, elemsize, elempack, opt.blob_allocator);
if (c.empty())
return -100;

if (b.dims == 3)
{
if (w1 == 1 && h1 == 1 && channels1 == channels)
{
// special type 1
c.create(w, h, channels, elemsize, elempack, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
@@ -713,6 +780,10 @@ static int binary_op_pack4_bf16s(const Mat& a, const Mat& b, Mat& c, const Optio
if (w1 == w && h1 == h && channels1 == 1 && elempack1 == 1)
{
// special type 2
c.create(w, h, channels, elemsize, elempack, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
@@ -734,7 +805,66 @@ static int binary_op_pack4_bf16s(const Mat& a, const Mat& b, Mat& c, const Optio
return 0;
}

if (w == 1 && h == 1 && channels1 == channels)
{
// special type 3
c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels1; q++)
{
const unsigned short* a0 = a.channel(q);
unsigned short* outptr = c.channel(q);
const unsigned short* ptr1 = b.channel(q);
float32x4_t _a0 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(a0), 16));
for (int i = 0; i < size1; i++)
{
float32x4_t _p1 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(ptr1), 16));
float32x4_t _outp = op(_a0, _p1);
vst1_u16(outptr, vshrn_n_u32(vreinterpretq_u32_f32(_outp), 16));
ptr1 += 4;
outptr += 4;
}
}

return 0;
}

if (w1 == w && h1 == h && channels == 1 && elempack == 1)
{
// special type 4
c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels1; q++)
{
const unsigned short* ptr = a;
const unsigned short* ptr1 = b.channel(q);
unsigned short* outptr = c.channel(q);
for (int i = 0; i < size1; i++)
{
float32x4_t _p = vdupq_n_f32(bfloat16_to_float32(*ptr));
float32x4_t _p1 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(ptr1), 16));
float32x4_t _outp = op(_p, _p1);
vst1_u16(outptr, vshrn_n_u32(vreinterpretq_u32_f32(_outp), 16));
ptr += 1;
ptr1 += 4;
outptr += 4;
}
}

return 0;
}

// type 19
c.create(w, h, channels, elemsize, elempack, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
@@ -757,6 +887,10 @@ static int binary_op_pack4_bf16s(const Mat& a, const Mat& b, Mat& c, const Optio
return 0;
}

c.create(w, h, channels, elemsize, elempack, opt.blob_allocator);
if (c.empty())
return -100;

if (b.dims == 2)
{
// type 18
@@ -837,7 +971,7 @@ static int binary_op_pack4_bf16s(const Mat& a, const Mat& b, Mat& c, const Optio
if (b.dims == 3)
{
// type 14
c.create(w1, h1, channels1, elemsize, elempack, opt.blob_allocator);
c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator);
if (c.empty())
return -100;

@@ -1017,7 +1151,7 @@ static int binary_op_pack4_bf16s(const Mat& a, const Mat& b, Mat& c, const Optio
if (b.dims == 3)
{
// type 9
c.create(w1, h1, channels1, elemsize, elempack, opt.blob_allocator);
c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator);
if (c.empty())
return -100;

@@ -1044,7 +1178,7 @@ static int binary_op_pack4_bf16s(const Mat& a, const Mat& b, Mat& c, const Optio
if (b.dims == 2)
{
// type 8
c.create(w1, h1, elemsize, elempack, opt.blob_allocator);
c.create(w1, h1, elemsize1, elempack1, opt.blob_allocator);
if (c.empty())
return -100;

@@ -1162,21 +1296,21 @@ static int binary_op_bf16s(const Mat& a, const Mat& b, Mat& c, const Option& opt

if (a.dims == 3)
{
c.create(w, h, channels, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

if (b.dims == 3)
{
if (w1 == 1 && h1 == 1 && channels1 == channels)
{
// special type 1
c.create(w, h, channels, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const unsigned short* ptr = a.channel(q);
unsigned short* outptr = c.channel(q);
const unsigned short* b0 = b.channel(q);
unsigned short* outptr = c.channel(q);
for (int i = 0; i < size; i++)
{
outptr[i] = float32_to_bfloat16(op(bfloat16_to_float32(ptr[i]), bfloat16_to_float32(b0[0])));
@@ -1189,6 +1323,10 @@ static int binary_op_bf16s(const Mat& a, const Mat& b, Mat& c, const Option& opt
if (w1 == w && h1 == h && channels1 == 1)
{
// special type 2
c.create(w, h, channels, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
@@ -1204,7 +1342,55 @@ static int binary_op_bf16s(const Mat& a, const Mat& b, Mat& c, const Option& opt
return 0;
}

if (w == 1 && h == 1 && channels1 == channels)
{
// special type 3
c.create(w1, h1, channels1, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels1; q++)
{
const unsigned short* a0 = a.channel(q);
const unsigned short* ptr1 = b.channel(q);
unsigned short* outptr = c.channel(q);
for (int i = 0; i < size1; i++)
{
outptr[i] = float32_to_bfloat16(op(bfloat16_to_float32(a0[0]), bfloat16_to_float32(ptr1[i])));
}
}

return 0;
}

if (w1 == w && h1 == h && channels == 1)
{
// special type 4
c.create(w1, h1, channels1, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels1; q++)
{
const unsigned short* ptr = a;
const unsigned short* ptr1 = b.channel(q);
unsigned short* outptr = c.channel(q);
for (int i = 0; i < size1; i++)
{
outptr[i] = float32_to_bfloat16(op(bfloat16_to_float32(ptr[i]), bfloat16_to_float32(ptr1[i])));
}
}

return 0;
}

// type 19
c.create(w, h, channels, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
@@ -1221,6 +1407,10 @@ static int binary_op_bf16s(const Mat& a, const Mat& b, Mat& c, const Option& opt
return 0;
}

c.create(w, h, channels, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

if (b.dims == 2)
{
// type 18


+ 61
- 5
src/layer/binaryop.cpp View File

@@ -63,21 +63,21 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c, const Option& opt)

if (a.dims == 3)
{
c.create(w, h, channels, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

if (b.dims == 3)
{
if (w1 == 1 && h1 == 1 && channels1 == channels)
{
// special type 1
c.create(w, h, channels, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = a.channel(q);
float* outptr = c.channel(q);
const float* b0 = b.channel(q);
float* outptr = c.channel(q);
for (int i = 0; i < size; i++)
{
outptr[i] = op(ptr[i], b0[0]);
@@ -90,6 +90,10 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c, const Option& opt)
if (w1 == w && h1 == h && channels1 == 1)
{
// special type 2
c.create(w, h, channels, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
@@ -105,7 +109,55 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c, const Option& opt)
return 0;
}

if (w == 1 && h == 1 && channels1 == channels)
{
// special type 3
c.create(w1, h1, channels1, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels1; q++)
{
const float* a0 = a.channel(q);
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);
for (int i = 0; i < size1; i++)
{
outptr[i] = op(a0[0], ptr1[i]);
}
}

return 0;
}

if (w1 == w && h1 == h && channels == 1)
{
// special type 4
c.create(w1, h1, channels1, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels1; q++)
{
const float* ptr = a;
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);
for (int i = 0; i < size1; i++)
{
outptr[i] = op(ptr[i], ptr1[i]);
}
}

return 0;
}

// type 19
c.create(w, h, channels, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
@@ -122,6 +174,10 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c, const Option& opt)
return 0;
}

c.create(w, h, channels, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

if (b.dims == 2)
{
// type 18


+ 9
- 2
src/layer/vulkan/binaryop_vulkan.cpp View File

@@ -228,7 +228,8 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt)
pipeline_binaryop_broadcast_pack4->create(LayerShaderType::binaryop_broadcast_pack4, opt, specializations);
}

if (shape.dims == 0 || (shape.dims == 1 && shape.w == 1 && elempack == 1 && elempack1 == 4))
if (shape.dims == 0 || (shape.dims == 1 && shape.w == 1 && elempack == 1 && elempack1 == 4)
|| (shape.dims == 3 && shape1.dims == 3 && shape1.w == shape.w && shape1.h == shape.h && shape.c == 1 && elempack == 1 && elempack1 == 4))
{
pipeline_binaryop_broadcast_a1_pack4 = new Pipeline(vkdev);
pipeline_binaryop_broadcast_a1_pack4->set_optimal_local_size_xyz(local_size_xyz);
@@ -251,7 +252,8 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt)
pipeline_binaryop_broadcast_pack8->create(LayerShaderType::binaryop_broadcast_pack8, opt, specializations);
}

if ((opt.use_shader_pack8 && shape.dims == 0) || (shape.dims == 1 && shape.w == 1 && elempack == 1 && elempack1 == 8))
if ((opt.use_shader_pack8 && shape.dims == 0) || (shape.dims == 1 && shape.w == 1 && elempack == 1 && elempack1 == 8)
|| (shape.dims == 3 && shape1.dims == 3 && shape1.w == shape.w && shape1.h == shape.h && shape.c == 1 && elempack == 1 && elempack1 == 8))
{
pipeline_binaryop_broadcast_a1_pack8 = new Pipeline(vkdev);
pipeline_binaryop_broadcast_a1_pack8->set_optimal_local_size_xyz(local_size_xyz);
@@ -391,6 +393,11 @@ int BinaryOp_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector
// special type 2
pipeline = out_elempack == 8 ? pipeline_binaryop_broadcast_b1_pack8 : pipeline_binaryop_broadcast_b1_pack4;
}
else if (bottom_blob.dims == 3 && bottom_blob1.dims == 3 && bottom_blob1.w == bottom_blob.w && bottom_blob1.h == bottom_blob.h && bottom_blob.c == 1 && bottom_blob.elempack == 1)
{
// special type 4
pipeline = out_elempack == 8 ? pipeline_binaryop_broadcast_a1_pack8 : pipeline_binaryop_broadcast_a1_pack4;
}
else
{
pipeline = out_elempack == 8 ? pipeline_binaryop_broadcast_pack8 : pipeline_binaryop_broadcast_pack4;


+ 14
- 0
src/layer/vulkan/shader/binaryop_broadcast.comp View File

@@ -102,6 +102,20 @@ void main()
ai = gi;
bi = gy * psc(bw) + gx;
}

if (psc(aw) == 1 && psc(ah) == 1)
{
// special type 3
ai = gz * psc(acstep);
bi = gi;
}

if (psc(bw) == psc(aw) && psc(bh) == psc(ah) && psc(ac) == 1)
{
// special type 4
ai = gy * psc(aw) + gx;
bi = gi;
}
}

if (psc(bdims) == 2)


+ 9
- 1
src/layer/vulkan/shader/binaryop_broadcast_a1_pack4.comp View File

@@ -82,8 +82,16 @@ void main()

const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

int ai = 0;

if (psc(bw) == psc(aw) && psc(bh) == psc(ah) && psc(ac) == 1)
{
// special type 4
ai = gy * psc(aw) + gx;
}

// type 2 3 4
afpvec4 v1 = afpvec4(buffer_ld1(a_blob_data, 0));
afpvec4 v1 = afpvec4(buffer_ld1(a_blob_data, ai));
afpvec4 v2 = buffer_ld4(b_blob_data, gi);

afpvec4 res;


+ 9
- 1
src/layer/vulkan/shader/binaryop_broadcast_a1_pack8.comp View File

@@ -83,8 +83,16 @@ void main()

const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

int ai = 0;

if (psc(bw) == psc(aw) && psc(bh) == psc(ah) && psc(ac) == 1)
{
// special type 2
ai = gy * psc(bw) + gx;
}

// type 2 3 4
afpvec4 v1 = afpvec4(buffer_ld1(a_blob_data, 0));
afpvec4 v1 = afpvec4(buffer_ld1(a_blob_data, ai));
afpvec8 v2 = buffer_ld8(b_blob_data, gi);

afpvec8 res;


+ 7
- 0
src/layer/vulkan/shader/binaryop_broadcast_pack4.comp View File

@@ -95,6 +95,13 @@ void main()
ai = gi;
bi = gz * psc(bcstep);
}

if (psc(aw) == 1 && psc(ah) == 1)
{
// special type 3
ai = gz * psc(acstep);
bi = gi;
}
}

if (psc(bdims) == 2)


+ 7
- 0
src/layer/vulkan/shader/binaryop_broadcast_pack8.comp View File

@@ -96,6 +96,13 @@ void main()
ai = gi;
bi = gz * psc(bcstep);
}

if (psc(aw) == 1 && psc(ah) == 1)
{
// special type 3
ai = gz * psc(acstep);
bi = gi;
}
}

if (psc(bdims) == 2)


+ 36
- 0
tests/test_binaryop.cpp View File

@@ -452,6 +452,40 @@ static int test_binaryop_s2()
return 0;
}

static int test_binaryop_s3()
{
for (int op_type=0; op_type<OP_TYPE_MAX; op_type++)
{
int ret = 0
|| test_binaryop(RandomMat(1, 1, 2), RandomMat(2, 3, 2), op_type)
|| test_binaryop(RandomMat(1, 1, 4), RandomMat(2, 3, 4), op_type)
|| test_binaryop(RandomMat(1, 1, 8), RandomMat(2, 3, 8), op_type)
;

if (ret != 0)
return -1;
}

return 0;
}

static int test_binaryop_s4()
{
for (int op_type=0; op_type<OP_TYPE_MAX; op_type++)
{
int ret = 0
|| test_binaryop(RandomMat(2, 3, 1), RandomMat(2, 3, 2), op_type)
|| test_binaryop(RandomMat(2, 3, 1), RandomMat(2, 3, 4), op_type)
|| test_binaryop(RandomMat(2, 3, 1), RandomMat(2, 3, 8), op_type)
;

if (ret != 0)
return -1;
}

return 0;
}

int main()
{
SRAND(7767517);
@@ -478,5 +512,7 @@ int main()
|| test_binaryop_19()
|| test_binaryop_s1()
|| test_binaryop_s2()
|| test_binaryop_s3()
|| test_binaryop_s4()
;
}

Loading…
Cancel
Save