Browse Source

binary4d, unary4d (#3443)

tags/20220216
nihui GitHub 4 years ago
parent
commit
3a83704c38
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
34 changed files with 6153 additions and 1167 deletions
  1. +68
    -0
      .github/workflows/test-coverage.yml
  2. +11
    -1
      docs/developer-guide/binaryop-broadcasting.md
  3. +1675
    -91
      src/layer/arm/binaryop_arm.cpp
  4. +12
    -6
      src/layer/arm/unaryop_arm.cpp
  5. +250
    -6
      src/layer/binaryop.cpp
  6. +280
    -6
      src/layer/mips/binaryop_mips.cpp
  7. +2
    -1
      src/layer/mips/unaryop_mips.cpp
  8. +2285
    -836
      src/layer/riscv/binaryop_riscv.cpp
  9. +9
    -11
      src/layer/riscv/binaryop_riscv.h
  10. +6
    -6
      src/layer/riscv/convolution_3x3_packn.h
  11. +6
    -6
      src/layer/riscv/convolution_3x3_packn_fp16s.h
  12. +1
    -1
      src/layer/riscv/convolution_packnto1_fp16s.h
  13. +3
    -3
      src/layer/riscv/convolution_sgemm_packn.h
  14. +43
    -3
      src/layer/riscv/convolution_sgemm_packn_fp16s.h
  15. +10
    -10
      src/layer/riscv/convolution_sgemm_packnto1.h
  16. +10
    -10
      src/layer/riscv/convolution_sgemm_packnto1_fp16s.h
  17. +1
    -1
      src/layer/riscv/deconvolution_packnto1_fp16s.h
  18. +82
    -0
      src/layer/riscv/riscv_usability.h
  19. +39
    -0
      src/layer/riscv/riscv_v_071_fix.h
  20. +4
    -2
      src/layer/riscv/unaryop_riscv.cpp
  21. +198
    -74
      src/layer/vulkan/binaryop_vulkan.cpp
  22. +1
    -1
      src/layer/vulkan/relu_vulkan.cpp
  23. +156
    -19
      src/layer/vulkan/shader/binaryop_broadcast.comp
  24. +155
    -18
      src/layer/vulkan/shader/binaryop_broadcast_pack4.comp
  25. +155
    -18
      src/layer/vulkan/shader/binaryop_broadcast_pack8.comp
  26. +11
    -4
      src/layer/vulkan/unaryop_vulkan.cpp
  27. +566
    -18
      src/layer/x86/binaryop_x86.cpp
  28. +1
    -1
      src/net.cpp
  29. +92
    -2
      tests/test_binaryop.cpp
  30. +13
    -4
      tests/test_unaryop.cpp
  31. +2
    -2
      tests/testutil.h
  32. +2
    -2
      toolchains/c906-v222.toolchain.cmake
  33. +2
    -2
      toolchains/c906-v223.toolchain.cmake
  34. +2
    -2
      toolchains/c906.toolchain.cmake

+ 68
- 0
.github/workflows/test-coverage.yml View File

@@ -395,6 +395,40 @@ jobs:
token: ${{ secrets.CODECOV_TOKEN }}
file: build-arm82/lcov.info

linux-gcc-arm82-omp:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2

- name: lcov
run: sudo apt-get install lcov
- name: cache-qemu
id: cache-qemu
uses: actions/cache@v2.1.7
with:
path: qemu-install
key: qemu-aarch64-install-1
- name: checkout-qemu
if: steps.cache-qemu.outputs.cache-hit != 'true'
uses: actions/checkout@v2
with:
repository: qemu/qemu
path: qemu
ref: 8746309137ba470d1b2e8f5ce86ac228625db940
- name: qemu
if: steps.cache-qemu.outputs.cache-hit != 'true'
run: |
cd qemu
./configure --prefix=install --target-list=aarch64-linux-user --disable-system
make -j2
make install
cp -r aarch64-linux-user/install $GITHUB_WORKSPACE/qemu-install

- name: aarch64-gnu-toolchain
run: |
sudo apt-get update
sudo apt-get install g++-aarch64-linux-gnu

- name: build-arm82-omp
run: |
mkdir build-arm82-omp && cd build-arm82-omp
@@ -418,6 +452,40 @@ jobs:
token: ${{ secrets.CODECOV_TOKEN }}
file: build-arm82-omp/lcov.info

linux-gcc-arm82dot-omp:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2

- name: lcov
run: sudo apt-get install lcov
- name: cache-qemu
id: cache-qemu
uses: actions/cache@v2.1.7
with:
path: qemu-install
key: qemu-aarch64-install-1
- name: checkout-qemu
if: steps.cache-qemu.outputs.cache-hit != 'true'
uses: actions/checkout@v2
with:
repository: qemu/qemu
path: qemu
ref: 8746309137ba470d1b2e8f5ce86ac228625db940
- name: qemu
if: steps.cache-qemu.outputs.cache-hit != 'true'
run: |
cd qemu
./configure --prefix=install --target-list=aarch64-linux-user --disable-system
make -j2
make install
cp -r aarch64-linux-user/install $GITHUB_WORKSPACE/qemu-install

- name: aarch64-gnu-toolchain
run: |
sudo apt-get update
sudo apt-get install g++-aarch64-linux-gnu

- name: build-arm82dot-omp
run: |
mkdir build-arm82dot-omp && cd build-arm82dot-omp


+ 11
- 1
docs/developer-guide/binaryop-broadcasting.md View File

@@ -4,7 +4,7 @@ ncnn BinaryOp accepts blobs with different shape

C = BinaryOp(A, B)

shape notation convention is [w], [w,h], [w,h,c]
shape notation convention is [w], [w,h], [w,h,c], [w,h,d,c]

|type|A|B|C|
|---|---|---|---|
@@ -27,6 +27,16 @@ shape notation convention is [w], [w,h], [w,h,c]
|17|[2,3,4]|[4]|[2,3,4]|
|18|[2,3,4]|[3,4]|[2,3,4]|
|19|[2,3,4]|[2,3,4]|[2,3,4]|
|20|[1]|[2,3,4,5]|[2,3,4,5]|
|21|[5]|[2,3,4,5]|[2,3,4,5]|
|22|[4,5]|[2,3,4,5]|[2,3,4,5]|
|23|[3,4,5]|[2,3,4,5]|[2,3,4,5]|
|24|[2,3,4,5]|scalar|[2,3,4,5]|
|25|[2,3,4,5]|[1]|[2,3,4,5]|
|26|[2,3,4,5]|[5]|[2,3,4,5]|
|27|[2,3,4,5]|[4,5]|[2,3,4,5]|
|28|[2,3,4,5]|[3,4,5]|[2,3,4,5]|
|29|[2,3,4,5]|[2,3,4,5]|[2,3,4,5]|

some special broadcasting rule exists for model compatibility



+ 1675
- 91
src/layer/arm/binaryop_arm.cpp
File diff suppressed because it is too large
View File


+ 12
- 6
src/layer/arm/unaryop_arm.cpp View File

@@ -48,8 +48,9 @@ static int unary_op_inplace_pack4(Mat& a, const Option& opt)

int w = a.w;
int h = a.h;
int d = a.d;
int channels = a.c;
int size = w * h;
int size = w * h * d;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
@@ -341,8 +342,9 @@ static int unary_op_inplace_pack8_fp16s(Mat& a, const Option& opt)

int w = a.w;
int h = a.h;
int d = a.d;
int channels = a.c;
int size = w * h;
int size = w * h * d;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
@@ -554,8 +556,9 @@ static int unary_op_inplace_pack4_fp16s(Mat& a, const Option& opt)

int w = a.w;
int h = a.h;
int d = a.d;
int channels = a.c;
int size = w * h;
int size = w * h * d;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
@@ -751,8 +754,9 @@ static int unary_op_inplace_fp16s(Mat& a, const Option& opt)

int w = a.w;
int h = a.h;
int d = a.d;
int channels = a.c;
int size = w * h;
int size = w * h * d;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
@@ -1082,8 +1086,9 @@ static int unary_op_inplace_pack4_bf16s(Mat& a, const Option& opt)

int w = a.w;
int h = a.h;
int d = a.d;
int channels = a.c;
int size = w * h;
int size = w * h * d;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
@@ -1111,8 +1116,9 @@ static int unary_op_inplace_bf16s(Mat& a, const Option& opt)

int w = a.w;
int h = a.h;
int d = a.d;
int channels = a.c;
int size = w * h;
int size = w * h * d;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)


+ 250
- 6
src/layer/binaryop.cpp View File

@@ -49,17 +49,181 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c, const Option& opt)

int w = a.w;
int h = a.h;
int d = a.d;
int channels = a.c;
int size = w * h;
int size = w * h * d;
size_t elemsize = a.elemsize;

int w1 = b.w;
int h1 = b.h;
int d1 = b.d;
int channels1 = b.c;
int size1 = w1 * h1;
int size1 = w1 * h1 * d1;

if (a.dims == 3)
if (a.dims == 4)
{
if (b.dims == 4)
{
// type 29
c.create(w, h, d, channels, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = a.channel(q);
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);

for (int i = 0; i < size; i++)
{
outptr[i] = op(ptr[i], ptr1[i]);
}
}

return 0;
}

c.create(w, h, d, channels, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

if (b.dims == 3)
{
// type 28
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = a.channel(q);
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);

for (int z = 0; z < d; z++)
{
for (int y = 0; y < h; y++)
{
const float b0 = ptr1[y];
for (int x = 0; x < w; x++)
{
outptr[x] = op(ptr[x], b0);
}

ptr += w;
outptr += w;
}

ptr1 += h;
}
}

return 0;
}

if (b.dims == 2)
{
// type 27
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = a.channel(q);
const float* ptr1 = b.row(q);
float* outptr = c.channel(q);

for (int z = 0; z < d; z++)
{
const float b0 = ptr1[z];
for (int y = 0; y < h; y++)
{
for (int x = 0; x < w; x++)
{
outptr[x] = op(ptr[x], b0);
}

ptr += w;
outptr += w;
}
}
}

return 0;
}

if (b.dims == 1)
{
if (b.w == 1)
{
// type 25
const float b0 = b[0];
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = a.channel(q);
float* outptr = c.channel(q);

for (int i = 0; i < size; i++)
{
outptr[i] = op(ptr[i], b0);
}
}

return 0;
}

// type 26
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = a.channel(q);
const float b0 = b[q];
float* outptr = c.channel(q);

for (int i = 0; i < size; i++)
{
outptr[i] = op(ptr[i], b0);
}
}

return 0;
}
}
else if (a.dims == 3)
{
if (b.dims == 4)
{
// type 23
c.create(w1, h1, d1, channels1, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels1; q++)
{
const float* ptr = a.channel(q);
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);

for (int z = 0; z < d1; z++)
{
for (int y = 0; y < h1; y++)
{
const float a0 = ptr[y];
for (int x = 0; x < w1; x++)
{
outptr[x] = op(a0, ptr1[x]);
}

ptr1 += w1;
outptr += w1;
}

ptr += h1;
}
}

return 0;
}

if (b.dims == 3)
{
if (w1 == 1 && h1 == 1 && channels1 == channels)
@@ -359,6 +523,39 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c, const Option& opt)
}
else if (a.dims == 2)
{
if (b.dims == 4)
{
// type 22
c.create(w1, h1, d1, channels1, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels1; q++)
{
const float* ptr = a.row(q);
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);

for (int z = 0; z < d1; z++)
{
const float a0 = ptr[z];
for (int y = 0; y < h1; y++)
{
for (int x = 0; x < w1; x++)
{
outptr[x] = op(a0, ptr1[x]);
}

ptr1 += w1;
outptr += w1;
}
}
}

return 0;
}

if (b.dims == 3)
{
// type 14
@@ -445,6 +642,29 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c, const Option& opt)
{
if (a.w == 1)
{
if (b.dims == 4)
{
// type 20
c.create(w1, h1, d1, channels1, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

const float a0 = a[0];
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels1; q++)
{
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);

for (int i = 0; i < size1; i++)
{
outptr[i] = op(a0, ptr1[i]);
}
}

return 0;
}

if (b.dims == 3)
{
// type 4
@@ -501,6 +721,29 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c, const Option& opt)
}
}

if (b.dims == 4)
{
// type 21
c.create(w1, h1, d1, channels1, elemsize, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels1; q++)
{
const float a0 = a[q];
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);

for (int i = 0; i < size1; i++)
{
outptr[i] = op(a0, ptr1[i]);
}
}

return 0;
}

if (b.dims == 3)
{
// type 9
@@ -585,8 +828,9 @@ static int binary_op_scalar_inplace(Mat& a, float b, const Option& opt)

int w = a.w;
int h = a.h;
int d = a.d;
int channels = a.c;
int size = w * h;
int size = w * h * d;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
@@ -703,10 +947,10 @@ int BinaryOp::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to
return binary_op<binary_op_pow>(bottom_blob, bottom_blob1, top_blob, opt);

if (op_type == Operation_RSUB)
return binary_op<binary_op_rsub>(bottom_blob, bottom_blob1, top_blob, opt);
return binary_op<binary_op_sub>(bottom_blob1, bottom_blob, top_blob, opt);

if (op_type == Operation_RDIV)
return binary_op<binary_op_rdiv>(bottom_blob, bottom_blob1, top_blob, opt);
return binary_op<binary_op_div>(bottom_blob1, bottom_blob, top_blob, opt);

return 0;
}


+ 280
- 6
src/layer/mips/binaryop_mips.cpp View File

@@ -41,20 +41,203 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt

int w = a.w;
int h = a.h;
int d = a.d;
int channels = a.c;
int size = w * h;
int size = w * h * d;
size_t elemsize = a.elemsize;
int elempack = a.elempack;

int w1 = b.w;
int h1 = b.h;
int d1 = b.d;
int channels1 = b.c;
int size1 = w1 * h1;
int size1 = w1 * h1 * d1;
size_t elemsize1 = b.elemsize;
int elempack1 = b.elempack;

if (a.dims == 3)
if (a.dims == 4)
{
if (b.dims == 4)
{
// type 29
c.create(w, h, d, channels, elemsize, elempack, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = a.channel(q);
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);

for (int i = 0; i < size; i++)
{
v4f32 _p = (v4f32)__msa_ld_w(ptr, 0);
v4f32 _p1 = (v4f32)__msa_ld_w(ptr1, 0);
v4f32 _outp = op(_p, _p1);
__msa_st_w((v4i32)_outp, outptr, 0);
ptr += 4;
ptr1 += 4;
outptr += 4;
}
}

return 0;
}

c.create(w, h, d, channels, elemsize, elempack, opt.blob_allocator);
if (c.empty())
return -100;

if (b.dims == 3)
{
// type 28
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = a.channel(q);
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);

for (int z = 0; z < d; z++)
{
for (int y = 0; y < h; y++)
{
v4f32 _b0 = (v4f32)__msa_ld_w(ptr1, 0);
for (int x = 0; x < w; x++)
{
v4f32 _p = (v4f32)__msa_ld_w(ptr, 0);
v4f32 _outp = op(_p, _b0);
__msa_st_w((v4i32)_outp, outptr, 0);
ptr += 4;
outptr += 4;
}

ptr1 += 4;
}
}
}

return 0;
}

if (b.dims == 2)
{
// type 27
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = a.channel(q);
const float* ptr1 = b.row(q);
float* outptr = c.channel(q);

for (int z = 0; z < d; z++)
{
v4f32 _b0 = (v4f32)__msa_ld_w(ptr1, 0);
for (int y = 0; y < h; y++)
{
for (int x = 0; x < w; x++)
{
v4f32 _p = (v4f32)__msa_ld_w(ptr, 0);
v4f32 _outp = op(_p, _b0);
__msa_st_w((v4i32)_outp, outptr, 0);
ptr += 4;
outptr += 4;
}
}

ptr1 += 4;
}
}

return 0;
}

if (b.dims == 1)
{
if (b.w == 1 && elempack1 == 1)
{
// type 25
v4f32 _b0 = __msa_fill_w_f32(b[0]);
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = a.channel(q);
float* outptr = c.channel(q);

for (int i = 0; i < size; i++)
{
v4f32 _p = (v4f32)__msa_ld_w(ptr, 0);
v4f32 _outp = op(_p, _b0);
__msa_st_w((v4i32)_outp, outptr, 0);
ptr += 4;
outptr += 4;
}
}

return 0;
}

// type 26
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = a.channel(q);
v4f32 _b0 = (v4f32)__msa_ld_w((const float*)b + q * 4, 0);
float* outptr = c.channel(q);

for (int i = 0; i < size; i++)
{
v4f32 _p = (v4f32)__msa_ld_w(ptr, 0);
v4f32 _outp = op(_p, _b0);
__msa_st_w((v4i32)_outp, outptr, 0);
ptr += 4;
outptr += 4;
}
}

return 0;
}
}
else if (a.dims == 3)
{
if (b.dims == 4)
{
// type 23
c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels1; q++)
{
const float* ptr = a.channel(q);
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);

for (int z = 0; z < d1; z++)
{
for (int y = 0; y < h1; y++)
{
v4f32 _a0 = (v4f32)__msa_ld_w(ptr, 0);
for (int x = 0; x < w1; x++)
{
v4f32 _p = (v4f32)__msa_ld_w(ptr1, 0);
v4f32 _outp = op(_a0, _p);
__msa_st_w((v4i32)_outp, outptr, 0);
ptr1 += 4;
outptr += 4;
}

ptr += 4;
}
}
}

return 0;
}

if (b.dims == 3)
{
if (w1 == 1 && h1 == 1 && channels1 == channels)
@@ -417,6 +600,42 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
}
else if (a.dims == 2)
{
if (b.dims == 4)
{
// type 22
c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels1; q++)
{
const float* ptr = a.row(q);
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);

for (int z = 0; z < d1; z++)
{
v4f32 _a0 = (v4f32)__msa_ld_w(ptr, 0);
for (int y = 0; y < h1; y++)
{
for (int x = 0; x < w1; x++)
{
v4f32 _p = (v4f32)__msa_ld_w(ptr1, 0);
v4f32 _outp = op(_a0, _p);
__msa_st_w((v4i32)_outp, outptr, 0);
ptr1 += 4;
outptr += 4;
}
}

ptr += 4;
}
}

return 0;
}

if (b.dims == 3)
{
// type 14
@@ -530,6 +749,33 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
{
if (a.w == 1 && elempack == 1)
{
if (b.dims == 4)
{
// type 20
c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator);
if (c.empty())
return -100;

v4f32 _a0 = __msa_fill_w_f32(a[0]);
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels1; q++)
{
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);

for (int i = 0; i < size1; i++)
{
v4f32 _p1 = (v4f32)__msa_ld_w(ptr1, 0);
v4f32 _outp = op(_a0, _p1);
__msa_st_w((v4i32)_outp, outptr, 0);
ptr1 += 4;
outptr += 4;
}
}

return 0;
}

if (b.dims == 3)
{
// type 4
@@ -605,6 +851,33 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
}
}

if (b.dims == 4)
{
// type 21
c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels1; q++)
{
v4f32 _a0 = (v4f32)__msa_ld_w((const float*)a + q * 4, 0);
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);

for (int i = 0; i < size1; i++)
{
v4f32 _p1 = (v4f32)__msa_ld_w(ptr1, 0);
v4f32 _outp = op(_a0, _p1);
__msa_st_w((v4i32)_outp, outptr, 0);
ptr1 += 4;
outptr += 4;
}
}

return 0;
}

if (b.dims == 3)
{
// type 9
@@ -717,8 +990,9 @@ static int binary_op_scalar_inplace_pack4(Mat& a, float b, const Option& opt)

int w = a.w;
int h = a.h;
int d = a.d;
int channels = a.c;
int size = w * h;
int size = w * h * d;

v4f32 _b = __msa_fill_w_f32(b);

@@ -847,10 +1121,10 @@ int BinaryOp_mips::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat
return binary_op_pack4<binary_op_pow_pack4>(bottom_blob, bottom_blob1, top_blob, opt);

if (op_type == Operation_RSUB)
return binary_op_pack4<binary_op_rsub_pack4>(bottom_blob, bottom_blob1, top_blob, opt);
return binary_op_pack4<binary_op_sub_pack4>(bottom_blob1, bottom_blob, top_blob, opt);

if (op_type == Operation_RDIV)
return binary_op_pack4<binary_op_rdiv_pack4>(bottom_blob, bottom_blob1, top_blob, opt);
return binary_op_pack4<binary_op_div_pack4>(bottom_blob1, bottom_blob, top_blob, opt);
}
#endif // __mips_msa



+ 2
- 1
src/layer/mips/unaryop_mips.cpp View File

@@ -38,8 +38,9 @@ static int unary_op_inplace_pack4(Mat& a, const Option& opt)

int w = a.w;
int h = a.h;
int d = a.d;
int channels = a.c;
int size = w * h;
int size = w * h * d;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)


+ 2285
- 836
src/layer/riscv/binaryop_riscv.cpp
File diff suppressed because it is too large
View File


+ 9
- 11
src/layer/riscv/binaryop_riscv.h View File

@@ -1,8 +1,6 @@
// Xavier Hsinyuan is pleased to support the open source community by making
// ncnn available.
// Xavier Hsinyuan is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 Xavier Hsinyuan <thelastlinex@hotmail.com>. All rights
// reserved.
// Copyright (C) 2021 Xavier Hsinyuan <thelastlinex@hotmail.com>. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this
// file except in compliance with the License. You may obtain a copy of the
@@ -15,10 +13,12 @@
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.

#ifndef LAYER_BINARYOP_RISCV_H
#define LAYER_BINARYOP_RISCV_H

#include "binaryop.h"

namespace ncnn {

class BinaryOp_riscv : virtual public BinaryOp
@@ -26,19 +26,17 @@ class BinaryOp_riscv : virtual public BinaryOp
public:
BinaryOp_riscv();

virtual int forward(const std::vector<Mat>& bottom_blobs,
std::vector<Mat>& top_blobs, const Option& opt) const;
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
#if __riscv_vector && __riscv_zfh
int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;

int forward_fp16sa(const std::vector<Mat>& bottom_blobs,
std::vector<Mat>& top_blobs, const Option& opt) const;
int forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn

#endif // LAYER_BINARYOP_RISCV_H
#endif // LAYER_BINARYOP_RISCV_H

+ 6
- 6
src/layer/riscv/convolution_3x3_packn.h View File

@@ -324,7 +324,7 @@ static void conv3x3s1_winograd64_packn_rvv(const Mat& bottom_blob, Mat& top_blob

for (int q = 0; q < inch; q++)
{
#if RVV_SPEC_0_7
#if C906
for (int l = 0; l < packn; l++)
{
tmpptr[0] = r0[l];
@@ -365,7 +365,7 @@ static void conv3x3s1_winograd64_packn_rvv(const Mat& bottom_blob, Mat& top_blob

for (int q = 0; q < inch; q++)
{
#if RVV_SPEC_0_7
#if C906
for (int l = 0; l < packn; l++)
{
tmpptr[0] = r0[l];
@@ -398,7 +398,7 @@ static void conv3x3s1_winograd64_packn_rvv(const Mat& bottom_blob, Mat& top_blob

for (int q = 0; q < inch; q++)
{
#if RVV_SPEC_0_7
#if C906
for (int l = 0; l < packn; l++)
{
tmpptr[0] = r0[l];
@@ -999,7 +999,7 @@ static void conv3x3s1_winograd42_packn_rvv(const Mat& bottom_blob, Mat& top_blob

for (int q = 0; q < inch; q++)
{
#if RVV_SPEC_0_7
#if C906
for (int l = 0; l < packn; l++)
{
tmpptr[0] = r0[l];
@@ -1040,7 +1040,7 @@ static void conv3x3s1_winograd42_packn_rvv(const Mat& bottom_blob, Mat& top_blob

for (int q = 0; q < inch; q++)
{
#if RVV_SPEC_0_7
#if C906
for (int l = 0; l < packn; l++)
{
tmpptr[0] = r0[l];
@@ -1073,7 +1073,7 @@ static void conv3x3s1_winograd42_packn_rvv(const Mat& bottom_blob, Mat& top_blob

for (int q = 0; q < inch; q++)
{
#if RVV_SPEC_0_7
#if C906
for (int l = 0; l < packn; l++)
{
tmpptr[0] = r0[l];


+ 6
- 6
src/layer/riscv/convolution_3x3_packn_fp16s.h View File

@@ -324,7 +324,7 @@ static void conv3x3s1_winograd64_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& t

for (int q = 0; q < inch; q++)
{
#if RVV_SPEC_0_7
#if C906
for (int l = 0; l < packn; l++)
{
tmpptr[0] = r0[l];
@@ -365,7 +365,7 @@ static void conv3x3s1_winograd64_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& t

for (int q = 0; q < inch; q++)
{
#if RVV_SPEC_0_7
#if C906
for (int l = 0; l < packn; l++)
{
tmpptr[0] = r0[l];
@@ -398,7 +398,7 @@ static void conv3x3s1_winograd64_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& t

for (int q = 0; q < inch; q++)
{
#if RVV_SPEC_0_7
#if C906
for (int l = 0; l < packn; l++)
{
tmpptr[0] = r0[l];
@@ -999,7 +999,7 @@ static void conv3x3s1_winograd42_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& t

for (int q = 0; q < inch; q++)
{
#if RVV_SPEC_0_7
#if C906
for (int l = 0; l < packn; l++)
{
tmpptr[0] = r0[l];
@@ -1040,7 +1040,7 @@ static void conv3x3s1_winograd42_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& t

for (int q = 0; q < inch; q++)
{
#if RVV_SPEC_0_7
#if C906
for (int l = 0; l < packn; l++)
{
tmpptr[0] = r0[l];
@@ -1073,7 +1073,7 @@ static void conv3x3s1_winograd42_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& t

for (int q = 0; q < inch; q++)
{
#if RVV_SPEC_0_7
#if C906
for (int l = 0; l < packn; l++)
{
tmpptr[0] = r0[l];


+ 1
- 1
src/layer/riscv/convolution_packnto1_fp16s.h View File

@@ -84,7 +84,7 @@ static void convolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob
}
}

#ifdef RVV_SPEC_0_7
#if C906
// TODO
std::vector<float> ss(packn);
vse32_v_f32m2((float*)ss.data(), _sum, vl);


+ 3
- 3
src/layer/riscv/convolution_sgemm_packn.h View File

@@ -54,7 +54,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons

for (int k = 0; k < maxk; k++)
{
#if RVV_SPEC_0_7
#if C906
for (int l = 0; l < packn; l++)
{
tmpptr[0] = img0[l];
@@ -103,7 +103,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons

for (int k = 0; k < maxk; k++)
{
#if RVV_SPEC_0_7
#if C906
for (int l = 0; l < packn; l++)
{
tmpptr[0] = img0[l];
@@ -144,7 +144,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons

for (int k = 0; k < maxk; k++)
{
#if RVV_SPEC_0_7
#if C906
for (int l = 0; l < packn; l++)
{
tmpptr[0] = img0[l];


+ 43
- 3
src/layer/riscv/convolution_sgemm_packn_fp16s.h View File

@@ -54,7 +54,8 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo

for (int k = 0; k < maxk; k++)
{
#if RVV_SPEC_0_7
#if C906
#ifdef RVV_SPEC_0_7
asm volatile(
"mv t3, %[LEN] \n\t"
"mv t1, %[SRC] \n\t"
@@ -83,7 +84,22 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo

img0 += size * packn;
tmpptr += packn * 8;
#else
for (int l = 0; l < packn; l++)
{
tmpptr[0] = img0[l];
tmpptr[1] = img0[l + packn];
tmpptr[2] = img0[l + packn * 2];
tmpptr[3] = img0[l + packn * 3];
tmpptr[4] = img0[l + packn * 4];
tmpptr[5] = img0[l + packn * 5];
tmpptr[6] = img0[l + packn * 6];
tmpptr[7] = img0[l + packn * 7];
tmpptr += 8;
}

img0 += size * packn;
#endif
#else
vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl);
vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl);
@@ -118,7 +134,8 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo

for (int k = 0; k < maxk; k++)
{
#if RVV_SPEC_0_7
#if C906
#ifdef RVV_SPEC_0_7
asm volatile(
"mv t3, %[LEN] \n\t"
"mv t1, %[SRC] \n\t"
@@ -138,6 +155,18 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo

img0 += size * packn;
tmpptr += packn * 4;
#else
for (int l = 0; l < packn; l++)
{
tmpptr[0] = img0[l];
tmpptr[1] = img0[l + packn];
tmpptr[2] = img0[l + packn * 2];
tmpptr[3] = img0[l + packn * 3];
tmpptr += 4;
}

img0 += size * packn;
#endif
#else
vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl);
vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl);
@@ -169,7 +198,8 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo

for (int k = 0; k < maxk; k++)
{
#if RVV_SPEC_0_7
#if C906
#ifdef RVV_SPEC_0_7
asm volatile(
"mv t3, %[LEN] \n\t"
"mv t1, %[SRC] \n\t"
@@ -185,6 +215,16 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo

img0 += size * packn;
tmpptr += packn * 2;
#else
for (int l = 0; l < packn; l++)
{
tmpptr[0] = img0[l];
tmpptr[1] = img0[l + packn];
tmpptr += 2;
}

img0 += size * packn;
#endif
#else
vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl);
vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl);


+ 10
- 10
src/layer/riscv/convolution_sgemm_packnto1.h View File

@@ -53,7 +53,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c

for (int k = 0; k < maxk; k++)
{
#if RVV_SPEC_0_7
#if C906
for (int l = 0; l < packn; l++)
{
tmpptr[0] = img0[l];
@@ -102,7 +102,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c

for (int k = 0; k < maxk; k++)
{
#if RVV_SPEC_0_7
#if C906
for (int l = 0; l < packn; l++)
{
tmpptr[0] = img0[l];
@@ -143,7 +143,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c

for (int k = 0; k < maxk; k++)
{
#if RVV_SPEC_0_7
#if C906
for (int l = 0; l < packn; l++)
{
tmpptr[0] = img0[l];
@@ -240,7 +240,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c
kptr0 += packn;
}

#if RVV_SPEC_0_7
#if C906
vsse32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, vl);
vsse32_v_f32m1(outptr0 + 1, top_blob.cstep * sizeof(float), _sum1, vl);
vsse32_v_f32m1(outptr0 + 2, top_blob.cstep * sizeof(float), _sum2, vl);
@@ -281,7 +281,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c
kptr0 += packn;
}

#if RVV_SPEC_0_7
#if C906
vsse32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, vl);
vsse32_v_f32m1(outptr0 + 1, top_blob.cstep * sizeof(float), _sum1, vl);
vsse32_v_f32m1(outptr0 + 2, top_blob.cstep * sizeof(float), _sum2, vl);
@@ -312,7 +312,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c
kptr0 += packn;
}

#if RVV_SPEC_0_7
#if C906
vsse32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, vl);
vsse32_v_f32m1(outptr0 + 1, top_blob.cstep * sizeof(float), _sum1, vl);
#else
@@ -393,7 +393,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c
kptr0 += packn;
}

#ifdef RVV_SPEC_0_7
#if C906
// TODO
std::vector<float> ss0(packn);
std::vector<float> ss1(packn);
@@ -473,7 +473,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c
kptr0 += packn;
}

#ifdef RVV_SPEC_0_7
#if C906
// TODO
std::vector<float> ss0(packn);
std::vector<float> ss1(packn);
@@ -527,7 +527,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c
kptr0 += packn;
}

#ifdef RVV_SPEC_0_7
#if C906
// TODO
std::vector<float> ss0(packn);
std::vector<float> ss1(packn);
@@ -568,7 +568,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c
kptr0 += packn;
}

#ifdef RVV_SPEC_0_7
#if C906
// TODO
std::vector<float> ss0(packn);
vse32_v_f32m1((float*)ss0.data(), _sum0, vl);


+ 10
- 10
src/layer/riscv/convolution_sgemm_packnto1_fp16s.h View File

@@ -53,7 +53,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_

for (int k = 0; k < maxk; k++)
{
#if RVV_SPEC_0_7
#if C906
for (int l = 0; l < packn; l++)
{
tmpptr[0] = img0[l];
@@ -102,7 +102,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_

for (int k = 0; k < maxk; k++)
{
#if RVV_SPEC_0_7
#if C906
for (int l = 0; l < packn; l++)
{
tmpptr[0] = img0[l];
@@ -143,7 +143,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_

for (int k = 0; k < maxk; k++)
{
#if RVV_SPEC_0_7
#if C906
for (int l = 0; l < packn; l++)
{
tmpptr[0] = img0[l];
@@ -240,7 +240,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_
kptr0 += packn;
}

#if RVV_SPEC_0_7
#if C906
vsse16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, vl);
vsse16_v_f16m1(outptr0 + 1, top_blob.cstep * sizeof(__fp16), _sum1, vl);
vsse16_v_f16m1(outptr0 + 2, top_blob.cstep * sizeof(__fp16), _sum2, vl);
@@ -281,7 +281,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_
kptr0 += packn;
}

#if RVV_SPEC_0_7
#if C906
vsse16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, vl);
vsse16_v_f16m1(outptr0 + 1, top_blob.cstep * sizeof(__fp16), _sum1, vl);
vsse16_v_f16m1(outptr0 + 2, top_blob.cstep * sizeof(__fp16), _sum2, vl);
@@ -312,7 +312,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_
kptr0 += packn;
}

#if RVV_SPEC_0_7
#if C906
vsse16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, vl);
vsse16_v_f16m1(outptr0 + 1, top_blob.cstep * sizeof(__fp16), _sum1, vl);
#else
@@ -393,7 +393,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_
kptr0 += packn;
}

#ifdef RVV_SPEC_0_7
#if C906
// TODO
std::vector<__fp16> ss0(packn);
std::vector<__fp16> ss1(packn);
@@ -473,7 +473,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_
kptr0 += packn;
}

#ifdef RVV_SPEC_0_7
#if C906
// TODO
std::vector<__fp16> ss0(packn);
std::vector<__fp16> ss1(packn);
@@ -527,7 +527,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_
kptr0 += packn;
}

#ifdef RVV_SPEC_0_7
#if C906
// TODO
std::vector<__fp16> ss0(packn);
std::vector<__fp16> ss1(packn);
@@ -568,7 +568,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_
kptr0 += packn;
}

#ifdef RVV_SPEC_0_7
#if C906
// TODO
std::vector<__fp16> ss0(packn);
vse16_v_f16m1((__fp16*)ss0.data(), _sum0, vl);


+ 1
- 1
src/layer/riscv/deconvolution_packnto1_fp16s.h View File

@@ -91,7 +91,7 @@ static void deconvolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_bl
kptr += maxk * packn;
}

#ifdef RVV_SPEC_0_7
#if C906
// TODO
std::vector<float> ss(packn);
vse32_v_f32m2((float*)ss.data(), _sum, vl);


+ 82
- 0
src/layer/riscv/riscv_usability.h View File

@@ -15,6 +15,14 @@
#ifndef RISCV_USABILITY_H
#define RISCV_USABILITY_H

#if __riscv_vector
#ifdef RVV_SPEC_0_7
#include "riscv_v_071_fix.h"
#else
#include <riscv_vector.h>
#endif
#endif // __riscv_vector

#if __riscv_vector
static inline int csrr_vl()
{
@@ -45,6 +53,80 @@ static inline int csrr_vlenb()
: "memory");
return a;
}

static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr)
{
const int packn = csrr_vlenb() / 4;
const word_type vl = vsetvl_e32m8(packn * 8);

// NOTE vloxei8_v_f32m8 gets illegal instruction on d1 --- nihui

// 128bit
static const uint32_t index_128bit[32] = {
0, 4, 8, 12,
0, 4, 8, 12,
0, 4, 8, 12,
0, 4, 8, 12,
0, 4, 8, 12,
0, 4, 8, 12,
0, 4, 8, 12,
0, 4, 8, 12
};

// 256bit
static const uint32_t index_256bit[64] = {
0, 4, 8, 12, 16, 20, 24, 28,
0, 4, 8, 12, 16, 20, 24, 28,
0, 4, 8, 12, 16, 20, 24, 28,
0, 4, 8, 12, 16, 20, 24, 28,
0, 4, 8, 12, 16, 20, 24, 28,
0, 4, 8, 12, 16, 20, 24, 28,
0, 4, 8, 12, 16, 20, 24, 28,
0, 4, 8, 12, 16, 20, 24, 28
};

const uint32_t* index = packn == 4 ? index_128bit : index_256bit;
vuint32m8_t bindex = vle32_v_u32m8(index, vl);
return vloxei32_v_f32m8(ptr, bindex, vl);
}

#if __riscv_zfh
static inline vfloat16m8_t vle16_v_f16m8_f16m1(const __fp16* ptr)
{
const int packn = csrr_vlenb() / 2;
const word_type vl = vsetvl_e16m8(packn * 8);

// NOTE vloxei8_v_f16m8 gets illegal instruction on d1 --- nihui

// 128bit
static const uint16_t index_128bit[64] = {
0, 2, 4, 6, 8, 10, 12, 14,
0, 2, 4, 6, 8, 10, 12, 14,
0, 2, 4, 6, 8, 10, 12, 14,
0, 2, 4, 6, 8, 10, 12, 14,
0, 2, 4, 6, 8, 10, 12, 14,
0, 2, 4, 6, 8, 10, 12, 14,
0, 2, 4, 6, 8, 10, 12, 14,
0, 2, 4, 6, 8, 10, 12, 14
};

// 256bit
static const uint16_t index_256bit[128] = {
0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
};

const uint16_t* index = packn == 8 ? index_128bit : index_256bit;
vuint16m8_t bindex = vle16_v_u16m8(index, vl);
return vloxei16_v_f16m8(ptr, bindex, vl);
}
#endif // __riscv_zfh
#endif // __riscv_vector

#endif // RISCV_USABILITY_H

+ 39
- 0
src/layer/riscv/riscv_v_071_fix.h View File

@@ -96,6 +96,15 @@ typedef uint16x4xm1_t vuint16m1x4_t;
typedef uint16x4xm2_t vuint16m2x4_t;
typedef uint16x8xm1_t vuint16m1x8_t;

typedef uint8xm1_t vuint8m1_t;
typedef uint8xm2_t vuint8m2_t;
typedef uint8xm4_t vuint8m4_t;
typedef uint8xm8_t vuint8m8_t;

typedef uint8x4xm1_t vuint8m1x4_t;
typedef uint8x4xm2_t vuint8m2x4_t;
typedef uint8x8xm1_t vuint8m1x8_t;

#define vsetvl_e32m1(n) vsetvli(n, RVV_E32, RVV_M1)
#define vsetvl_e32m2(n) vsetvli(n, RVV_E32, RVV_M2)
#define vsetvl_e32m4(n) vsetvli(n, RVV_E32, RVV_M4)
@@ -132,6 +141,8 @@ typedef uint16x8xm1_t vuint16m1x8_t;
#define vsse32_v_f32m4 vssev_float32xm4
#define vsse32_v_f32m8 vssev_float32xm8

#define vloxei32_v_f32m8(a, i, vl) vlxev_float32xm8(a, reinterpret_cast<int32xm8_t>(i), vl)

#define vlseg2e32_v_f32m1x2 vlseg2ev_float32x2xm1
#define vsseg2e32_v_f32m1x2 vsseg2ev_float32x2xm1

@@ -617,6 +628,8 @@ static inline vfloat32m1_t vfredmax_vs_f32m8_f32m1(vfloat32m1_t dst, vfloat32m8_
#define vsse16_v_f16m4 vssev_float16xm4
#define vsse16_v_f16m8 vssev_float16xm8

#define vloxei16_v_f16m8(a, i, vl) vlxev_float16xm8(a, reinterpret_cast<int16xm8_t>(i), vl)

#define vlseg2e16_v_f16m1x2 vlseg2ev_float16x2xm1
#define vsseg2e16_v_f16m1x2 vsseg2ev_float16x2xm1

@@ -1690,6 +1703,32 @@ static inline vuint16m1x8_t vcreate_u16m1x8(vuint16m1_t v0, vuint16m1_t v1, vuin
#define vreinterpret_v_f16m4_u16m4(x) reinterpret_cast<vuint16m4_t>(x)
#define vreinterpret_v_f16m8_u16m8(x) reinterpret_cast<vuint16m8_t>(x)

/******************************** uint8 ********************************/
#define vle8_v_u8m1 vlev_uint8xm1
#define vle8_v_u8m2 vlev_uint8xm2
#define vle8_v_u8m4 vlev_uint8xm4
#define vle8_v_u8m8 vlev_uint8xm8

#define vse8_v_u8m1 vsev_uint8xm1
#define vse8_v_u8m2 vsev_uint8xm2
#define vse8_v_u8m4 vsev_uint8xm4
#define vse8_v_u8m8 vsev_uint8xm8

#define vlse8_v_u8m1 vlsev_uint8xm1
#define vlse8_v_u8m2 vlsev_uint8xm2
#define vlse8_v_u8m4 vlsev_uint8xm4
#define vlse8_v_u8m8 vlsev_uint8xm8

#define vsse8_v_u8m1 vssev_uint8xm1
#define vsse8_v_u8m2 vssev_uint8xm2
#define vsse8_v_u8m4 vssev_uint8xm4
#define vsse8_v_u8m8 vssev_uint8xm8

#define vmv_v_x_u8m1 vmvvx_unt8xm1
#define vmv_v_x_u8m2 vmvvx_unt8xm2
#define vmv_v_x_u8m4 vmvvx_unt8xm4
#define vmv_v_x_u8m8 vmvvx_unt8xm8

/******************************** mask ********************************/
#define vmxor_mm_b32 vmxormm_e32xm1
#define vmxor_mm_b16 vmxormm_e32xm2


+ 4
- 2
src/layer/riscv/unaryop_riscv.cpp View File

@@ -46,8 +46,9 @@ static int unary_op_inplace(Mat& a, const Option& opt)

int w = a.w;
int h = a.h;
int d = a.d;
int channels = a.c;
int size = w * h;
int size = w * h * d;
int elempack = a.elempack;

#pragma omp parallel for num_threads(opt.num_threads)
@@ -322,8 +323,9 @@ static int unary_op_inplace_fp16s(Mat& a, const Option& opt)

int w = a.w;
int h = a.h;
int d = a.d;
int channels = a.c;
int size = w * h;
int size = w * h * d;
int elempack = a.elempack;

#pragma omp parallel for num_threads(opt.num_threads)


+ 198
- 74
src/layer/vulkan/binaryop_vulkan.cpp View File

@@ -47,17 +47,17 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt)
int elempack = 1;
if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1;
if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1;
if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;
if (shape.dims == 3 || shape.dims == 4) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;

int elempack1 = 1;
if (shape1.dims == 1) elempack1 = opt.use_shader_pack8 && shape1.w % 8 == 0 ? 8 : shape1.w % 4 == 0 ? 4 : 1;
if (shape1.dims == 2) elempack1 = opt.use_shader_pack8 && shape1.h % 8 == 0 ? 8 : shape1.h % 4 == 0 ? 4 : 1;
if (shape1.dims == 3) elempack1 = opt.use_shader_pack8 && shape1.c % 8 == 0 ? 8 : shape1.c % 4 == 0 ? 4 : 1;
if (shape1.dims == 3 || shape1.dims == 4) elempack1 = opt.use_shader_pack8 && shape1.c % 8 == 0 ? 8 : shape1.c % 4 == 0 ? 4 : 1;

int out_elempack = 1;
if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 : 1;
if (out_shape.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4 : 1;
if (out_shape.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4 : 1;
if (out_shape.dims == 3 || out_shape.dims == 4) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4 : 1;

size_t elemsize;
size_t elemsize1;
@@ -85,19 +85,22 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt)
if (shape.dims == 1) shape_packed = Mat(shape.w / elempack, (void*)0, elemsize, elempack);
if (shape.dims == 2) shape_packed = Mat(shape.w, shape.h / elempack, (void*)0, elemsize, elempack);
if (shape.dims == 3) shape_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack);
if (shape.dims == 4) shape_packed = Mat(shape.w, shape.h, shape.d, shape.c / elempack, (void*)0, elemsize, elempack);

Mat shape1_packed;
if (shape1.dims == 1) shape1_packed = Mat(shape1.w / elempack1, (void*)0, elemsize1, elempack1);
if (shape1.dims == 2) shape1_packed = Mat(shape1.w, shape1.h / elempack1, (void*)0, elemsize1, elempack1);
if (shape1.dims == 3) shape1_packed = Mat(shape1.w, shape1.h, shape1.c / elempack1, (void*)0, elemsize1, elempack1);
if (shape1.dims == 4) shape1_packed = Mat(shape1.w, shape1.h, shape1.d, shape1.c / elempack1, (void*)0, elemsize1, elempack1);

Mat out_shape_packed;
if (out_shape.dims == 1) out_shape_packed = Mat(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack);
if (out_shape.dims == 2) out_shape_packed = Mat(out_shape.w, out_shape.h / out_elempack, (void*)0, out_elemsize, out_elempack);
if (out_shape.dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack);
if (out_shape.dims == 4) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.d, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack);

bool broadcast = true;
if (shape.dims == shape1.dims && shape.w == shape1.w && shape.h == shape1.h && shape.c == shape1.c)
if (shape.dims == shape1.dims && shape.w == shape1.w && shape.h == shape1.h && shape.d == shape1.d && shape.c == shape1.c)
{
broadcast = false;
}
@@ -111,17 +114,17 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt)
specializations[2].f = b;
specializations[3 + 0].i = shape_packed.dims;
specializations[3 + 1].i = shape_packed.w;
specializations[3 + 2].i = shape_packed.h;
specializations[3 + 2].i = shape_packed.h * shape_packed.d;
specializations[3 + 3].i = shape_packed.c;
specializations[3 + 4].i = shape_packed.cstep;
specializations[3 + 5].i = shape1_packed.dims;
specializations[3 + 6].i = shape1_packed.w;
specializations[3 + 7].i = shape1_packed.h;
specializations[3 + 7].i = shape1_packed.h * shape1_packed.d;
specializations[3 + 8].i = shape1_packed.c;
specializations[3 + 9].i = shape1_packed.cstep;
specializations[3 + 10].i = out_shape_packed.dims;
specializations[3 + 11].i = out_shape_packed.w;
specializations[3 + 12].i = out_shape_packed.h;
specializations[3 + 12].i = out_shape_packed.h * out_shape_packed.d;
specializations[3 + 13].i = out_shape_packed.c;
specializations[3 + 14].i = out_shape_packed.cstep;

@@ -144,6 +147,12 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt)
local_size_xyz.h = std::min(4, out_shape_packed.h);
local_size_xyz.c = std::min(4, out_shape_packed.c);
}
if (out_shape_packed.dims == 4)
{
local_size_xyz.w = std::min(4, out_shape_packed.w);
local_size_xyz.h = std::min(4, out_shape_packed.h * out_shape_packed.d);
local_size_xyz.c = std::min(4, out_shape_packed.c);
}

// pack1
if (shape.dims == 0 || elempack == 1)
@@ -173,23 +182,44 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt)
// broadcast
if (shape.dims == 0 || broadcast)
{
std::vector<vk_specialization_type> specializations(1 + 15);
std::vector<vk_specialization_type> specializations(1 + 18);
specializations[0].i = op_type;
specializations[1 + 0].i = shape_packed.dims;
specializations[1 + 1].i = shape_packed.w;
specializations[1 + 2].i = shape_packed.h;
specializations[1 + 3].i = shape_packed.c;
specializations[1 + 4].i = shape_packed.cstep;
specializations[1 + 5].i = shape1_packed.dims;
specializations[1 + 6].i = shape1_packed.w;
specializations[1 + 7].i = shape1_packed.h;
specializations[1 + 8].i = shape1_packed.c;
specializations[1 + 9].i = shape1_packed.cstep;
specializations[1 + 10].i = out_shape_packed.dims;
specializations[1 + 11].i = out_shape_packed.w;
specializations[1 + 12].i = out_shape_packed.h;
specializations[1 + 13].i = out_shape_packed.c;
specializations[1 + 14].i = out_shape_packed.cstep;
specializations[1 + 3].i = shape_packed.d;
specializations[1 + 4].i = shape_packed.c;
specializations[1 + 5].i = shape_packed.cstep;
specializations[1 + 6].i = shape1_packed.dims;
specializations[1 + 7].i = shape1_packed.w;
specializations[1 + 8].i = shape1_packed.h;
specializations[1 + 9].i = shape1_packed.d;
specializations[1 + 10].i = shape1_packed.c;
specializations[1 + 11].i = shape1_packed.cstep;
specializations[1 + 12].i = out_shape_packed.dims;
specializations[1 + 13].i = out_shape_packed.w;
specializations[1 + 14].i = out_shape_packed.h;
specializations[1 + 15].i = out_shape_packed.d;
specializations[1 + 16].i = out_shape_packed.c;
specializations[1 + 17].i = out_shape_packed.cstep;

std::vector<vk_specialization_type> specializations_broadcast_a1_b1(1 + 15);
specializations_broadcast_a1_b1[0].i = op_type;
specializations_broadcast_a1_b1[1 + 0].i = shape_packed.dims;
specializations_broadcast_a1_b1[1 + 1].i = shape_packed.w;
specializations_broadcast_a1_b1[1 + 2].i = shape_packed.h * shape_packed.d;
specializations_broadcast_a1_b1[1 + 3].i = shape_packed.c;
specializations_broadcast_a1_b1[1 + 4].i = shape_packed.cstep;
specializations_broadcast_a1_b1[1 + 5].i = shape1_packed.dims;
specializations_broadcast_a1_b1[1 + 6].i = shape1_packed.w;
specializations_broadcast_a1_b1[1 + 7].i = shape1_packed.h * shape1_packed.d;
specializations_broadcast_a1_b1[1 + 8].i = shape1_packed.c;
specializations_broadcast_a1_b1[1 + 9].i = shape1_packed.cstep;
specializations_broadcast_a1_b1[1 + 10].i = out_shape_packed.dims;
specializations_broadcast_a1_b1[1 + 11].i = out_shape_packed.w;
specializations_broadcast_a1_b1[1 + 12].i = out_shape_packed.h * out_shape_packed.d;
specializations_broadcast_a1_b1[1 + 13].i = out_shape_packed.c;
specializations_broadcast_a1_b1[1 + 14].i = out_shape_packed.cstep;

Mat local_size_xyz;
if (out_shape_packed.dims == 1)
@@ -210,6 +240,12 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt)
local_size_xyz.h = std::min(4, out_shape_packed.h);
local_size_xyz.c = std::min(4, out_shape_packed.c);
}
if (out_shape_packed.dims == 4)
{
local_size_xyz.w = std::min(4, out_shape_packed.w);
local_size_xyz.h = std::min(4, out_shape_packed.h * out_shape_packed.d);
local_size_xyz.c = std::min(4, out_shape_packed.c);
}

// pack1
if (shape.dims == 0 || (elempack == 1 && elempack1 == 1))
@@ -232,7 +268,7 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt)
{
pipeline_binaryop_broadcast_a1_pack4 = new Pipeline(vkdev);
pipeline_binaryop_broadcast_a1_pack4->set_optimal_local_size_xyz(local_size_xyz);
pipeline_binaryop_broadcast_a1_pack4->create(LayerShaderType::binaryop_broadcast_a1_pack4, opt, specializations);
pipeline_binaryop_broadcast_a1_pack4->create(LayerShaderType::binaryop_broadcast_a1_pack4, opt, specializations_broadcast_a1_b1);
}

if (shape.dims == 0 || (shape1.dims == 1 && shape1.w == 1 && elempack1 == 1 && elempack == 4)
@@ -240,7 +276,7 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt)
{
pipeline_binaryop_broadcast_b1_pack4 = new Pipeline(vkdev);
pipeline_binaryop_broadcast_b1_pack4->set_optimal_local_size_xyz(local_size_xyz);
pipeline_binaryop_broadcast_b1_pack4->create(LayerShaderType::binaryop_broadcast_b1_pack4, opt, specializations);
pipeline_binaryop_broadcast_b1_pack4->create(LayerShaderType::binaryop_broadcast_b1_pack4, opt, specializations_broadcast_a1_b1);
}

// pack8
@@ -256,7 +292,7 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt)
{
pipeline_binaryop_broadcast_a1_pack8 = new Pipeline(vkdev);
pipeline_binaryop_broadcast_a1_pack8->set_optimal_local_size_xyz(local_size_xyz);
pipeline_binaryop_broadcast_a1_pack8->create(LayerShaderType::binaryop_broadcast_a1_pack8, opt, specializations);
pipeline_binaryop_broadcast_a1_pack8->create(LayerShaderType::binaryop_broadcast_a1_pack8, opt, specializations_broadcast_a1_b1);
}

if ((opt.use_shader_pack8 && shape.dims == 0) || (shape1.dims == 1 && shape1.w == 1 && elempack1 == 1 && elempack == 8)
@@ -264,7 +300,7 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt)
{
pipeline_binaryop_broadcast_b1_pack8 = new Pipeline(vkdev);
pipeline_binaryop_broadcast_b1_pack8->set_optimal_local_size_xyz(local_size_xyz);
pipeline_binaryop_broadcast_b1_pack8->create(LayerShaderType::binaryop_broadcast_b1_pack8, opt, specializations);
pipeline_binaryop_broadcast_b1_pack8->create(LayerShaderType::binaryop_broadcast_b1_pack8, opt, specializations_broadcast_a1_b1);
}
}

@@ -324,7 +360,7 @@ int BinaryOp_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector
}
else // if (bottom_blob.dims == bottom_blob1.dims)
{
if (bottom_blob.w * bottom_blob.h * bottom_blob.c * bottom_blob.elempack >= bottom_blob1.w * bottom_blob1.h * bottom_blob1.c * bottom_blob1.elempack)
if (bottom_blob.w * bottom_blob.h * bottom_blob.d * bottom_blob.c * bottom_blob.elempack >= bottom_blob1.w * bottom_blob1.h * bottom_blob1.d * bottom_blob1.c * bottom_blob1.elempack)
{
top_blob.create_like(bottom_blob, opt.blob_vkallocator);
}
@@ -343,39 +379,63 @@ int BinaryOp_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector
bindings[1] = bottom_blob1;
bindings[2] = top_blob;

std::vector<vk_constant_type> constants(15);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h;
constants[3].i = bottom_blob.c;
constants[4].i = bottom_blob.cstep;
constants[5].i = bottom_blob1.dims;
constants[6].i = bottom_blob1.w;
constants[7].i = bottom_blob1.h;
constants[8].i = bottom_blob1.c;
constants[9].i = bottom_blob1.cstep;
constants[10].i = top_blob.dims;
constants[11].i = top_blob.w;
constants[12].i = top_blob.h;
constants[13].i = top_blob.c;
constants[14].i = top_blob.cstep;

bool broadcast = true;
if (bottom_blob.dims == bottom_blob1.dims
&& bottom_blob.w == bottom_blob1.w
&& bottom_blob.h == bottom_blob1.h
&& bottom_blob.d == bottom_blob1.d
&& bottom_blob.c == bottom_blob1.c
&& bottom_blob.elempack == bottom_blob1.elempack)
{
broadcast = false;
}

const Pipeline* pipeline = 0;
if (broadcast)
{
std::vector<vk_constant_type> constants(18);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h;
constants[3].i = bottom_blob.d;
constants[4].i = bottom_blob.c;
constants[5].i = bottom_blob.cstep;
constants[6].i = bottom_blob1.dims;
constants[7].i = bottom_blob1.w;
constants[8].i = bottom_blob1.h;
constants[9].i = bottom_blob1.d;
constants[10].i = bottom_blob1.c;
constants[11].i = bottom_blob1.cstep;
constants[12].i = top_blob.dims;
constants[13].i = top_blob.w;
constants[14].i = top_blob.h;
constants[15].i = top_blob.d;
constants[16].i = top_blob.c;
constants[17].i = top_blob.cstep;

std::vector<vk_constant_type> constants_broadcast_a1b1(15);
constants_broadcast_a1b1[0].i = bottom_blob.dims;
constants_broadcast_a1b1[1].i = bottom_blob.w;
constants_broadcast_a1b1[2].i = bottom_blob.h * bottom_blob.d;
constants_broadcast_a1b1[3].i = bottom_blob.c;
constants_broadcast_a1b1[4].i = bottom_blob.cstep;
constants_broadcast_a1b1[5].i = bottom_blob1.dims;
constants_broadcast_a1b1[6].i = bottom_blob1.w;
constants_broadcast_a1b1[7].i = bottom_blob1.h * bottom_blob1.d;
constants_broadcast_a1b1[8].i = bottom_blob1.c;
constants_broadcast_a1b1[9].i = bottom_blob1.cstep;
constants_broadcast_a1b1[10].i = top_blob.dims;
constants_broadcast_a1b1[11].i = top_blob.w;
constants_broadcast_a1b1[12].i = top_blob.h * top_blob.d;
constants_broadcast_a1b1[13].i = top_blob.c;
constants_broadcast_a1b1[14].i = top_blob.cstep;

bool broadcast_a1b1 = true;

const Pipeline* pipeline = 0;
if (bottom_blob.elempack == 1 && bottom_blob1.elempack == 1)
{
pipeline = pipeline_binaryop_broadcast;
broadcast_a1b1 = false;
}
else
{
@@ -400,18 +460,38 @@ int BinaryOp_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector
else
{
pipeline = out_elempack == 8 ? pipeline_binaryop_broadcast_pack8 : pipeline_binaryop_broadcast_pack4;
broadcast_a1b1 = false;
}
}

cmd.record_pipeline(pipeline, bindings, broadcast_a1b1 ? constants_broadcast_a1b1 : constants, top_blob);
}
else
{
pipeline = out_elempack == 8 ? pipeline_binaryop_pack8
: out_elempack == 4 ? pipeline_binaryop_pack4
: pipeline_binaryop;
std::vector<vk_constant_type> constants(15);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h * bottom_blob.d;
constants[3].i = bottom_blob.c;
constants[4].i = bottom_blob.cstep;
constants[5].i = bottom_blob1.dims;
constants[6].i = bottom_blob1.w;
constants[7].i = bottom_blob1.h * bottom_blob1.d;
constants[8].i = bottom_blob1.c;
constants[9].i = bottom_blob1.cstep;
constants[10].i = top_blob.dims;
constants[11].i = top_blob.w;
constants[12].i = top_blob.h * top_blob.d;
constants[13].i = top_blob.c;
constants[14].i = top_blob.cstep;

const Pipeline* pipeline = out_elempack == 8 ? pipeline_binaryop_pack8
: out_elempack == 4 ? pipeline_binaryop_pack4
: pipeline_binaryop;

cmd.record_pipeline(pipeline, bindings, constants, top_blob);
}

cmd.record_pipeline(pipeline, bindings, constants, top_blob);

return 0;
}

@@ -427,7 +507,7 @@ int BinaryOp_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, con
std::vector<vk_constant_type> constants(15);
constants[10].i = bottom_top_blob.dims;
constants[11].i = bottom_top_blob.w;
constants[12].i = bottom_top_blob.h;
constants[12].i = bottom_top_blob.h * bottom_top_blob.d;
constants[13].i = bottom_top_blob.c;
constants[14].i = bottom_top_blob.cstep;

@@ -458,7 +538,7 @@ int BinaryOp_vulkan::forward(const std::vector<VkImageMat>& bottom_blobs, std::v
}
else // if (bottom_blob.dims == bottom_blob1.dims)
{
if (bottom_blob.w * bottom_blob.h * bottom_blob.c * bottom_blob.elempack >= bottom_blob1.w * bottom_blob1.h * bottom_blob1.c * bottom_blob1.elempack)
if (bottom_blob.w * bottom_blob.h * bottom_blob.d * bottom_blob.c * bottom_blob.elempack >= bottom_blob1.w * bottom_blob1.h * bottom_blob1.d * bottom_blob1.c * bottom_blob1.elempack)
{
top_blob.create_like(bottom_blob, opt.blob_vkallocator);
}
@@ -477,39 +557,63 @@ int BinaryOp_vulkan::forward(const std::vector<VkImageMat>& bottom_blobs, std::v
bindings[1] = bottom_blob1;
bindings[2] = top_blob;

std::vector<vk_constant_type> constants(15);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h;
constants[3].i = bottom_blob.c;
constants[4].i = 0; //bottom_blob.cstep;
constants[5].i = bottom_blob1.dims;
constants[6].i = bottom_blob1.w;
constants[7].i = bottom_blob1.h;
constants[8].i = bottom_blob1.c;
constants[9].i = 0; //bottom_blob1.cstep;
constants[10].i = top_blob.dims;
constants[11].i = top_blob.w;
constants[12].i = top_blob.h;
constants[13].i = top_blob.c;
constants[14].i = 0; //top_blob.cstep;

bool broadcast = true;
if (bottom_blob.dims == bottom_blob1.dims
&& bottom_blob.w == bottom_blob1.w
&& bottom_blob.h == bottom_blob1.h
&& bottom_blob.d == bottom_blob1.d
&& bottom_blob.c == bottom_blob1.c
&& bottom_blob.elempack == bottom_blob1.elempack)
{
broadcast = false;
}

const Pipeline* pipeline = 0;
if (broadcast)
{
std::vector<vk_constant_type> constants(18);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h;
constants[3].i = bottom_blob.d;
constants[4].i = bottom_blob.c;
constants[5].i = 0; //bottom_blob.cstep;
constants[6].i = bottom_blob1.dims;
constants[7].i = bottom_blob1.w;
constants[8].i = bottom_blob1.h;
constants[9].i = bottom_blob1.d;
constants[10].i = bottom_blob1.c;
constants[11].i = 0; //bottom_blob1.cstep;
constants[12].i = top_blob.dims;
constants[13].i = top_blob.w;
constants[14].i = top_blob.h;
constants[15].i = top_blob.d;
constants[16].i = top_blob.c;
constants[17].i = 0; //top_blob.cstep;

std::vector<vk_constant_type> constants_broadcast_a1b1(15);
constants_broadcast_a1b1[0].i = bottom_blob.dims;
constants_broadcast_a1b1[1].i = bottom_blob.w;
constants_broadcast_a1b1[2].i = bottom_blob.h * bottom_blob.d;
constants_broadcast_a1b1[3].i = bottom_blob.c;
constants_broadcast_a1b1[4].i = 0; //bottom_blob.cstep;
constants_broadcast_a1b1[5].i = bottom_blob1.dims;
constants_broadcast_a1b1[6].i = bottom_blob1.w;
constants_broadcast_a1b1[7].i = bottom_blob1.h * bottom_blob1.d;
constants_broadcast_a1b1[8].i = bottom_blob1.c;
constants_broadcast_a1b1[9].i = 0; //bottom_blob1.cstep;
constants_broadcast_a1b1[10].i = top_blob.dims;
constants_broadcast_a1b1[11].i = top_blob.w;
constants_broadcast_a1b1[12].i = top_blob.h * top_blob.d;
constants_broadcast_a1b1[13].i = top_blob.c;
constants_broadcast_a1b1[14].i = 0; //top_blob.cstep;

bool broadcast_a1b1 = true;

const Pipeline* pipeline = 0;
if (bottom_blob.elempack == 1 && bottom_blob1.elempack == 1)
{
pipeline = pipeline_binaryop_broadcast;
broadcast_a1b1 = false;
}
else
{
@@ -534,18 +638,38 @@ int BinaryOp_vulkan::forward(const std::vector<VkImageMat>& bottom_blobs, std::v
else
{
pipeline = out_elempack == 8 ? pipeline_binaryop_broadcast_pack8 : pipeline_binaryop_broadcast_pack4;
broadcast_a1b1 = false;
}
}

cmd.record_pipeline(pipeline, bindings, broadcast_a1b1 ? constants_broadcast_a1b1 : constants, top_blob);
}
else
{
pipeline = out_elempack == 8 ? pipeline_binaryop_pack8
: out_elempack == 4 ? pipeline_binaryop_pack4
: pipeline_binaryop;
std::vector<vk_constant_type> constants(15);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h * bottom_blob.d;
constants[3].i = bottom_blob.c;
constants[4].i = 0; //bottom_blob.cstep;
constants[5].i = bottom_blob1.dims;
constants[6].i = bottom_blob1.w;
constants[7].i = bottom_blob1.h * bottom_blob1.d;
constants[8].i = bottom_blob1.c;
constants[9].i = 0; //bottom_blob1.cstep;
constants[10].i = top_blob.dims;
constants[11].i = top_blob.w;
constants[12].i = top_blob.h * top_blob.d;
constants[13].i = top_blob.c;
constants[14].i = 0; //top_blob.cstep;

const Pipeline* pipeline = out_elempack == 8 ? pipeline_binaryop_pack8
: out_elempack == 4 ? pipeline_binaryop_pack4
: pipeline_binaryop;

cmd.record_pipeline(pipeline, bindings, constants, top_blob);
}

cmd.record_pipeline(pipeline, bindings, constants, top_blob);

return 0;
}

@@ -561,7 +685,7 @@ int BinaryOp_vulkan::forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd
std::vector<vk_constant_type> constants(15);
constants[10].i = bottom_top_blob.dims;
constants[11].i = bottom_top_blob.w;
constants[12].i = bottom_top_blob.h;
constants[12].i = bottom_top_blob.h * bottom_top_blob.d;
constants[13].i = bottom_top_blob.c;
constants[14].i = 0; //bottom_top_blob.cstep;



+ 1
- 1
src/layer/vulkan/relu_vulkan.cpp View File

@@ -84,7 +84,7 @@ int ReLU_vulkan::create_pipeline(const Option& opt)
local_size_xyz.h = std::min(4, shape_packed.h);
local_size_xyz.c = std::min(4, shape_packed.c);
}
if (shape_packed.dims == 3)
if (shape_packed.dims == 4)
{
local_size_xyz.w = std::min(4, shape_packed.w);
local_size_xyz.h = std::min(4, shape_packed.h * shape_packed.d);


+ 156
- 19
src/layer/vulkan/shader/binaryop_broadcast.comp View File

@@ -27,20 +27,23 @@ layout (constant_id = 0) const int op_type = 0;
layout (constant_id = shape_constant_id_offset + 0) const int adims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int aw = 0;
layout (constant_id = shape_constant_id_offset + 2) const int ah = 0;
layout (constant_id = shape_constant_id_offset + 3) const int ac = 0;
layout (constant_id = shape_constant_id_offset + 4) const int acstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int bdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int bw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int bh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int bc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int bcstep = 0;

layout (constant_id = shape_constant_id_offset + 10) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 11) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 12) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 13) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 14) const int outcstep = 0;
layout (constant_id = shape_constant_id_offset + 3) const int ad = 0;
layout (constant_id = shape_constant_id_offset + 4) const int ac = 0;
layout (constant_id = shape_constant_id_offset + 5) const int acstep = 0;

layout (constant_id = shape_constant_id_offset + 6) const int bdims = 0;
layout (constant_id = shape_constant_id_offset + 7) const int bw = 0;
layout (constant_id = shape_constant_id_offset + 8) const int bh = 0;
layout (constant_id = shape_constant_id_offset + 9) const int bd = 0;
layout (constant_id = shape_constant_id_offset + 10) const int bc = 0;
layout (constant_id = shape_constant_id_offset + 11) const int bcstep = 0;

layout (constant_id = shape_constant_id_offset + 12) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 13) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 14) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 15) const int outd = 0;
layout (constant_id = shape_constant_id_offset + 16) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 17) const int outcstep = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D a_blob_3d;
@@ -57,18 +60,21 @@ layout (push_constant) uniform parameter
int adims;
int aw;
int ah;
int ad;
int ac;
int acstep;

int bdims;
int bw;
int bh;
int bd;
int bc;
int bcstep;

int outdims;
int outw;
int outh;
int outd;
int outc;
int outcstep;
} p;
@@ -79,7 +85,7 @@ void main()
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
if (gx >= psc(outw) || gy >= psc(outh) * psc(outd) || gz >= psc(outc))
return;

#if NCNN_image_shader
@@ -90,8 +96,58 @@ void main()
int by = gy;
int bz = gz;

if (psc(adims) == 3)
if (psc(adims) == 4)
{
int yd = gy / psc(outh);
int yh = gy % psc(outh);

if (psc(bdims) == 3)
{
// type 28
bx = yh;
by = yd;
bz = gz;
}

if (psc(bdims) == 2)
{
// type 27
bx = yd;
by = gz;
bz = 0;
}

if (psc(bdims) == 1)
{
if (psc(bw) == 1)
{
// type 25
bx = 0;
by = 0;
bz = 0;
}
else
{
// type 26
bx = gz;
by = 0;
bz = 0;
}
}
}
else if (psc(adims) == 3)
{
if (psc(bdims) == 4)
{
int yd = gy / psc(outh);
int yh = gy % psc(outh);

// type 23
ax = yh;
ay = yd;
az = gz;
}

if (psc(bdims) == 3)
{
if (psc(bw) == 1 && psc(bh) == 1)
@@ -173,6 +229,17 @@ void main()
}
else if (psc(adims) == 2)
{
if (psc(bdims) == 4)
{
int yd = gy / psc(outh);
int yh = gy % psc(outh);

// type 22
ax = yd;
ay = gz;
az = 0;
}

if (psc(bdims) == 3)
{
// type 14
@@ -203,13 +270,21 @@ void main()
{
if (psc(aw) == 1)
{
// type 2 3 4
// type 2 3 4 20
ax = 0;
ay = 0;
az = 0;
}
else
{
if (psc(bdims) == 4)
{
// type 21
ax = gz;
ay = 0;
az = 0;
}

if (psc(bdims) == 3)
{
// type 9
@@ -247,8 +322,53 @@ void main()
int ai;
int bi;

if (psc(adims) == 3)
if (psc(adims) == 4)
{
int yd = gy / psc(outh);
int yh = gy % psc(outh);

if (psc(bdims) == 3)
{
// type 28
ai = gi;
bi = gz * psc(bcstep) + yd * psc(bw) + yh;
}

if (psc(bdims) == 2)
{
// type 27
ai = gi;
bi = gz * psc(bw) + yd;
}

if (psc(bdims) == 1)
{
if (psc(bw) == 1)
{
// type 25
ai = gi;
bi = 0;
}
else
{
// type 26
ai = gi;
bi = gz;
}
}
}
else if (psc(adims) == 3)
{
if (psc(bdims) == 4)
{
int yd = gy / psc(outh);
int yh = gy % psc(outh);

// type 23
ai = gz * psc(acstep) + yd * psc(aw) + yh;
bi = gi;
}

if (psc(bdims) == 3)
{
if (psc(bw) == 1 && psc(bh) == 1)
@@ -333,6 +453,16 @@ void main()
}
else if (psc(adims) == 2)
{
if (psc(bdims) == 4)
{
int yd = gy / psc(outh);
int yh = gy % psc(outh);

// type 22
ai = gz * psc(aw) + yd;
bi = gi;
}

if (psc(bdims) == 3)
{
// type 14
@@ -360,12 +490,19 @@ void main()
{
if (psc(aw) == 1)
{
// type 2 3 4
// type 2 3 4 20
ai = 0;
bi = gi;
}
else
{
if (psc(bdims) == 4)
{
// type 21
ai = gz;
bi = gi;
}

if (psc(bdims) == 3)
{
// type 9


+ 155
- 18
src/layer/vulkan/shader/binaryop_broadcast_pack4.comp View File

@@ -27,20 +27,23 @@ layout (constant_id = 0) const int op_type = 0;
layout (constant_id = shape_constant_id_offset + 0) const int adims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int aw = 0;
layout (constant_id = shape_constant_id_offset + 2) const int ah = 0;
layout (constant_id = shape_constant_id_offset + 3) const int ac = 0;
layout (constant_id = shape_constant_id_offset + 4) const int acstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int bdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int bw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int bh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int bc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int bcstep = 0;

layout (constant_id = shape_constant_id_offset + 10) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 11) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 12) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 13) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 14) const int outcstep = 0;
layout (constant_id = shape_constant_id_offset + 3) const int ad = 0;
layout (constant_id = shape_constant_id_offset + 4) const int ac = 0;
layout (constant_id = shape_constant_id_offset + 5) const int acstep = 0;

layout (constant_id = shape_constant_id_offset + 6) const int bdims = 0;
layout (constant_id = shape_constant_id_offset + 7) const int bw = 0;
layout (constant_id = shape_constant_id_offset + 8) const int bh = 0;
layout (constant_id = shape_constant_id_offset + 9) const int bd = 0;
layout (constant_id = shape_constant_id_offset + 10) const int bc = 0;
layout (constant_id = shape_constant_id_offset + 11) const int bcstep = 0;

layout (constant_id = shape_constant_id_offset + 12) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 13) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 14) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 15) const int outd = 0;
layout (constant_id = shape_constant_id_offset + 16) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 17) const int outcstep = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D a_blob_3d;
@@ -57,18 +60,21 @@ layout (push_constant) uniform parameter
int adims;
int aw;
int ah;
int ad;
int ac;
int acstep;

int bdims;
int bw;
int bh;
int bd;
int bc;
int bcstep;

int outdims;
int outw;
int outh;
int outd;
int outc;
int outcstep;
} p;
@@ -79,7 +85,7 @@ void main()
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
if (gx >= psc(outw) || gy >= psc(outh) * psc(outd) || gz >= psc(outc))
return;

#if NCNN_image_shader
@@ -90,8 +96,58 @@ void main()
int by = gy;
int bz = gz;

if (psc(adims) == 3)
if (psc(adims) == 4)
{
int yd = gy / psc(outh);
int yh = gy % psc(outh);

if (psc(bdims) == 3)
{
// type 28
bx = yh;
by = yd;
bz = gz;
}

if (psc(bdims) == 2)
{
// type 27
bx = yd;
by = gz;
bz = 0;
}

if (psc(bdims) == 1)
{
if (psc(bw) == 1)
{
// type 25
bx = 0;
by = 0;
bz = 0;
}
else
{
// type 26
bx = gz;
by = 0;
bz = 0;
}
}
}
else if (psc(adims) == 3)
{
if (psc(bdims) == 4)
{
int yd = gy / psc(outh);
int yh = gy % psc(outh);

// type 23
ax = yh;
ay = yd;
az = gz;
}

if (psc(bdims) == 3)
{
if (psc(bw) == 1 && psc(bh) == 1)
@@ -173,6 +229,17 @@ void main()
}
else if (psc(adims) == 2)
{
if (psc(bdims) == 4)
{
int yd = gy / psc(outh);
int yh = gy % psc(outh);

// type 22
ax = yd;
ay = gz;
az = 0;
}

if (psc(bdims) == 3)
{
// type 14
@@ -203,13 +270,21 @@ void main()
{
if (psc(aw) == 1)
{
// type 2 3 4
// type 2 3 4 20
ax = 0;
ay = 0;
az = 0;
}
else
{
if (psc(bdims) == 4)
{
// type 21
ax = gz;
ay = 0;
az = 0;
}

if (psc(bdims) == 3)
{
// type 9
@@ -247,8 +322,53 @@ void main()
int ai;
int bi;

if (psc(adims) == 3)
if (psc(adims) == 4)
{
int yd = gy / psc(outh);
int yh = gy % psc(outh);

if (psc(bdims) == 3)
{
// type 28
ai = gi;
bi = gz * psc(bcstep) + yd * psc(bw) + yh;
}

if (psc(bdims) == 2)
{
// type 27
ai = gi;
bi = gz * psc(bw) + yd;
}

if (psc(bdims) == 1)
{
if (psc(bw) == 1)
{
// type 25
ai = gi;
bi = 0;
}
else
{
// type 26
ai = gi;
bi = gz;
}
}
}
else if (psc(adims) == 3)
{
if (psc(bdims) == 4)
{
int yd = gy / psc(outh);
int yh = gy % psc(outh);

// type 23
ai = gz * psc(acstep) + yd * psc(aw) + yh;
bi = gi;
}

if (psc(bdims) == 3)
{
if (psc(bw) == 1 && psc(bh) == 1)
@@ -310,6 +430,16 @@ void main()
}
else if (psc(adims) == 2)
{
if (psc(bdims) == 4)
{
int yd = gy / psc(outh);
int yh = gy % psc(outh);

// type 22
ai = gz * psc(aw) + yd;
bi = gi;
}

if (psc(bdims) == 3)
{
// type 14
@@ -326,6 +456,13 @@ void main()
}
else if (psc(adims) == 1)
{
if (psc(bdims) == 4)
{
// type 21
ai = gz;
bi = gi;
}

if (psc(bdims) == 3)
{
// type 9


+ 155
- 18
src/layer/vulkan/shader/binaryop_broadcast_pack8.comp View File

@@ -28,20 +28,23 @@ layout (constant_id = 0) const int op_type = 0;
layout (constant_id = shape_constant_id_offset + 0) const int adims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int aw = 0;
layout (constant_id = shape_constant_id_offset + 2) const int ah = 0;
layout (constant_id = shape_constant_id_offset + 3) const int ac = 0;
layout (constant_id = shape_constant_id_offset + 4) const int acstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int bdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int bw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int bh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int bc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int bcstep = 0;

layout (constant_id = shape_constant_id_offset + 10) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 11) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 12) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 13) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 14) const int outcstep = 0;
layout (constant_id = shape_constant_id_offset + 3) const int ad = 0;
layout (constant_id = shape_constant_id_offset + 4) const int ac = 0;
layout (constant_id = shape_constant_id_offset + 5) const int acstep = 0;

layout (constant_id = shape_constant_id_offset + 6) const int bdims = 0;
layout (constant_id = shape_constant_id_offset + 7) const int bw = 0;
layout (constant_id = shape_constant_id_offset + 8) const int bh = 0;
layout (constant_id = shape_constant_id_offset + 9) const int bd = 0;
layout (constant_id = shape_constant_id_offset + 10) const int bc = 0;
layout (constant_id = shape_constant_id_offset + 11) const int bcstep = 0;

layout (constant_id = shape_constant_id_offset + 12) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 13) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 14) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 15) const int outd = 0;
layout (constant_id = shape_constant_id_offset + 16) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 17) const int outcstep = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D a_blob_3d;
@@ -58,18 +61,21 @@ layout (push_constant) uniform parameter
int adims;
int aw;
int ah;
int ad;
int ac;
int acstep;

int bdims;
int bw;
int bh;
int bd;
int bc;
int bcstep;

int outdims;
int outw;
int outh;
int outd;
int outc;
int outcstep;
} p;
@@ -80,7 +86,7 @@ void main()
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
if (gx >= psc(outw) || gy >= psc(outh) * psc(outd) || gz >= psc(outc))
return;

#if NCNN_image_shader
@@ -91,8 +97,58 @@ void main()
int by = gy;
int bz = gz;

if (psc(adims) == 3)
if (psc(adims) == 4)
{
int yd = gy / psc(outh);
int yh = gy % psc(outh);

if (psc(bdims) == 3)
{
// type 28
bx = yh;
by = yd;
bz = gz;
}

if (psc(bdims) == 2)
{
// type 27
bx = yd;
by = gz;
bz = 0;
}

if (psc(bdims) == 1)
{
if (psc(bw) == 1)
{
// type 25
bx = 0;
by = 0;
bz = 0;
}
else
{
// type 26
bx = gz;
by = 0;
bz = 0;
}
}
}
else if (psc(adims) == 3)
{
if (psc(bdims) == 4)
{
int yd = gy / psc(outh);
int yh = gy % psc(outh);

// type 23
ax = yh;
ay = yd;
az = gz;
}

if (psc(bdims) == 3)
{
if (psc(bw) == 1 && psc(bh) == 1)
@@ -174,6 +230,17 @@ void main()
}
else if (psc(adims) == 2)
{
if (psc(bdims) == 4)
{
int yd = gy / psc(outh);
int yh = gy % psc(outh);

// type 22
ax = yd;
ay = gz;
az = 0;
}

if (psc(bdims) == 3)
{
// type 14
@@ -204,13 +271,21 @@ void main()
{
if (psc(aw) == 1)
{
// type 2 3 4
// type 2 3 4 20
ax = 0;
ay = 0;
az = 0;
}
else
{
if (psc(bdims) == 4)
{
// type 21
ax = gz;
ay = 0;
az = 0;
}

if (psc(bdims) == 3)
{
// type 9
@@ -248,8 +323,53 @@ void main()
int ai;
int bi;

if (psc(adims) == 3)
if (psc(adims) == 4)
{
int yd = gy / psc(outh);
int yh = gy % psc(outh);

if (psc(bdims) == 3)
{
// type 28
ai = gi;
bi = gz * psc(bcstep) + yd * psc(bw) + yh;
}

if (psc(bdims) == 2)
{
// type 27
ai = gi;
bi = gz * psc(bw) + yd;
}

if (psc(bdims) == 1)
{
if (psc(bw) == 1)
{
// type 25
ai = gi;
bi = 0;
}
else
{
// type 26
ai = gi;
bi = gz;
}
}
}
else if (psc(adims) == 3)
{
if (psc(bdims) == 4)
{
int yd = gy / psc(outh);
int yh = gy % psc(outh);

// type 23
ai = gz * psc(acstep) + yd * psc(aw) + yh;
bi = gi;
}

if (psc(bdims) == 3)
{
if (psc(bw) == 1 && psc(bh) == 1)
@@ -311,6 +431,16 @@ void main()
}
else if (psc(adims) == 2)
{
if (psc(bdims) == 4)
{
int yd = gy / psc(outh);
int yh = gy % psc(outh);

// type 22
ai = gz * psc(aw) + yd;
bi = gi;
}

if (psc(bdims) == 3)
{
// type 14
@@ -327,6 +457,13 @@ void main()
}
else if (psc(adims) == 1)
{
if (psc(bdims) == 4)
{
// type 21
ai = gz;
bi = gi;
}

if (psc(bdims) == 3)
{
// type 9


+ 11
- 4
src/layer/vulkan/unaryop_vulkan.cpp View File

@@ -35,7 +35,7 @@ int UnaryOp_vulkan::create_pipeline(const Option& opt)
int elempack = 1;
if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1;
if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1;
if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;
if (shape.dims == 3 || shape.dims == 4) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;

size_t elemsize;
if (opt.use_fp16_storage)
@@ -55,12 +55,13 @@ int UnaryOp_vulkan::create_pipeline(const Option& opt)
if (shape.dims == 1) shape_packed = Mat(shape.w / elempack, (void*)0, elemsize, elempack);
if (shape.dims == 2) shape_packed = Mat(shape.w, shape.h / elempack, (void*)0, elemsize, elempack);
if (shape.dims == 3) shape_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack);
if (shape.dims == 4) shape_packed = Mat(shape.w, shape.h, shape.d, shape.c / elempack, (void*)0, elemsize, elempack);

std::vector<vk_specialization_type> specializations(1 + 5);
specializations[0].i = op_type;
specializations[1 + 0].i = shape_packed.dims;
specializations[1 + 1].i = shape_packed.w;
specializations[1 + 2].i = shape_packed.h;
specializations[1 + 2].i = shape_packed.h * shape_packed.d;
specializations[1 + 3].i = shape_packed.c;
specializations[1 + 4].i = shape_packed.cstep;

@@ -83,6 +84,12 @@ int UnaryOp_vulkan::create_pipeline(const Option& opt)
local_size_xyz.h = std::min(4, shape_packed.h);
local_size_xyz.c = std::min(4, shape_packed.c);
}
if (shape_packed.dims == 4)
{
local_size_xyz.w = std::min(4, shape_packed.w);
local_size_xyz.h = std::min(4, shape_packed.h * shape_packed.d);
local_size_xyz.c = std::min(4, shape_packed.c);
}

// pack1
if (shape.dims == 0 || elempack == 1)
@@ -135,7 +142,7 @@ int UnaryOp_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, cons
std::vector<vk_constant_type> constants(5);
constants[0].i = bottom_top_blob.dims;
constants[1].i = bottom_top_blob.w;
constants[2].i = bottom_top_blob.h;
constants[2].i = bottom_top_blob.h * bottom_top_blob.d;
constants[3].i = bottom_top_blob.c;
constants[4].i = bottom_top_blob.cstep;

@@ -159,7 +166,7 @@ int UnaryOp_vulkan::forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd,
std::vector<vk_constant_type> constants(5);
constants[0].i = bottom_top_blob.dims;
constants[1].i = bottom_top_blob.w;
constants[2].i = bottom_top_blob.h;
constants[2].i = bottom_top_blob.h * bottom_top_blob.d;
constants[3].i = bottom_top_blob.c;
constants[4].i = 0; //bottom_top_blob.cstep;



+ 566
- 18
src/layer/x86/binaryop_x86.cpp View File

@@ -44,20 +44,203 @@ static int binary_op_pack8(const Mat& a, const Mat& b, Mat& c, const Option& opt

int w = a.w;
int h = a.h;
int d = a.d;
int channels = a.c;
int size = w * h;
int size = w * h * d;
size_t elemsize = a.elemsize;
int elempack = a.elempack;

int w1 = b.w;
int h1 = b.h;
int d1 = b.d;
int channels1 = b.c;
int size1 = w1 * h1;
int size1 = w1 * h1 * d1;
size_t elemsize1 = b.elemsize;
int elempack1 = b.elempack;

if (a.dims == 3)
if (a.dims == 4)
{
if (b.dims == 4)
{
// type 29
c.create(w, h, d, channels, elemsize, elempack, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = a.channel(q);
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);

for (int i = 0; i < size; i++)
{
__m256 _p = _mm256_loadu_ps(ptr);
__m256 _p1 = _mm256_loadu_ps(ptr1);
__m256 _outp = op(_p, _p1);
_mm256_storeu_ps(outptr, _outp);
ptr += 8;
ptr1 += 8;
outptr += 8;
}
}

return 0;
}

c.create(w, h, d, channels, elemsize, elempack, opt.blob_allocator);
if (c.empty())
return -100;

if (b.dims == 3)
{
// type 28
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = a.channel(q);
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);

for (int z = 0; z < d; z++)
{
for (int y = 0; y < h; y++)
{
__m256 _b0 = _mm256_loadu_ps(ptr1);
for (int x = 0; x < w; x++)
{
__m256 _p = _mm256_loadu_ps(ptr);
__m256 _outp = op(_p, _b0);
_mm256_storeu_ps(outptr, _outp);
ptr += 8;
outptr += 8;
}

ptr1 += 8;
}
}
}

return 0;
}

if (b.dims == 2)
{
// type 27
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = a.channel(q);
const float* ptr1 = b.row(q);
float* outptr = c.channel(q);

for (int z = 0; z < d; z++)
{
__m256 _b0 = _mm256_loadu_ps(ptr1);
for (int y = 0; y < h; y++)
{
for (int x = 0; x < w; x++)
{
__m256 _p = _mm256_loadu_ps(ptr);
__m256 _outp = op(_p, _b0);
_mm256_storeu_ps(outptr, _outp);
ptr += 8;
outptr += 8;
}
}

ptr1 += 8;
}
}

return 0;
}

if (b.dims == 1)
{
if (b.w == 1 && elempack1 == 1)
{
// type 25
__m256 _b0 = _mm256_set1_ps(b[0]);
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = a.channel(q);
float* outptr = c.channel(q);

for (int i = 0; i < size; i++)
{
__m256 _p = _mm256_loadu_ps(ptr);
__m256 _outp = op(_p, _b0);
_mm256_storeu_ps(outptr, _outp);
ptr += 8;
outptr += 8;
}
}

return 0;
}

// type 26
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = a.channel(q);
__m256 _b0 = _mm256_loadu_ps((const float*)b + q * 8);
float* outptr = c.channel(q);

for (int i = 0; i < size; i++)
{
__m256 _p = _mm256_loadu_ps(ptr);
__m256 _outp = op(_p, _b0);
_mm256_storeu_ps(outptr, _outp);
ptr += 8;
outptr += 8;
}
}

return 0;
}
}
else if (a.dims == 3)
{
if (b.dims == 4)
{
// type 23
c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels1; q++)
{
const float* ptr = a.channel(q);
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);

for (int z = 0; z < d1; z++)
{
for (int y = 0; y < h1; y++)
{
__m256 _a0 = _mm256_loadu_ps(ptr);
for (int x = 0; x < w1; x++)
{
__m256 _p = _mm256_loadu_ps(ptr1);
__m256 _outp = op(_a0, _p);
_mm256_storeu_ps(outptr, _outp);
ptr1 += 8;
outptr += 8;
}

ptr += 8;
}
}
}

return 0;
}

if (b.dims == 3)
{
if (w1 == 1 && h1 == 1 && channels1 == channels)
@@ -406,6 +589,42 @@ static int binary_op_pack8(const Mat& a, const Mat& b, Mat& c, const Option& opt
}
else if (a.dims == 2)
{
if (b.dims == 4)
{
// type 22
c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels1; q++)
{
const float* ptr = a.row(q);
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);

for (int z = 0; z < d1; z++)
{
__m256 _a0 = _mm256_loadu_ps(ptr);
for (int y = 0; y < h1; y++)
{
for (int x = 0; x < w1; x++)
{
__m256 _p = _mm256_loadu_ps(ptr1);
__m256 _outp = op(_a0, _p);
_mm256_storeu_ps(outptr, _outp);
ptr1 += 8;
outptr += 8;
}
}

ptr += 8;
}
}

return 0;
}

if (b.dims == 3)
{
// type 14
@@ -514,6 +733,33 @@ static int binary_op_pack8(const Mat& a, const Mat& b, Mat& c, const Option& opt
{
if (a.w == 1 && elempack == 1)
{
if (b.dims == 4)
{
// type 20
c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator);
if (c.empty())
return -100;

__m256 _a0 = _mm256_set1_ps(a[0]);
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels1; q++)
{
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);

for (int i = 0; i < size1; i++)
{
__m256 _p1 = _mm256_loadu_ps(ptr1);
__m256 _outp = op(_a0, _p1);
_mm256_storeu_ps(outptr, _outp);
ptr1 += 8;
outptr += 8;
}
}

return 0;
}

if (b.dims == 3)
{
// type 4
@@ -586,6 +832,33 @@ static int binary_op_pack8(const Mat& a, const Mat& b, Mat& c, const Option& opt
}
}

if (b.dims == 4)
{
// type 21
c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels1; q++)
{
__m256 _a0 = _mm256_loadu_ps((const float*)a + q * 8);
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);

for (int i = 0; i < size1; i++)
{
__m256 _p1 = _mm256_loadu_ps(ptr1);
__m256 _outp = op(_a0, _p1);
_mm256_storeu_ps(outptr, _outp);
ptr1 += 8;
outptr += 8;
}
}

return 0;
}

if (b.dims == 3)
{
// type 9
@@ -693,8 +966,9 @@ static int binary_op_scalar_inplace_pack8(Mat& a, float b, const Option& opt)

int w = a.w;
int h = a.h;
int d = a.d;
int channels = a.c;
int size = w * h;
int size = w * h * d;

__m256 _b = _mm256_set1_ps(b);

@@ -795,20 +1069,203 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt

int w = a.w;
int h = a.h;
int d = a.d;
int channels = a.c;
int size = w * h;
int size = w * h * d;
size_t elemsize = a.elemsize;
int elempack = a.elempack;

int w1 = b.w;
int h1 = b.h;
int d1 = b.d;
int channels1 = b.c;
int size1 = w1 * h1;
int size1 = w1 * h1 * d1;
size_t elemsize1 = b.elemsize;
int elempack1 = b.elempack;

if (a.dims == 3)
if (a.dims == 4)
{
if (b.dims == 4)
{
// type 29
c.create(w, h, d, channels, elemsize, elempack, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = a.channel(q);
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);

for (int i = 0; i < size; i++)
{
__m128 _p = _mm_loadu_ps(ptr);
__m128 _p1 = _mm_loadu_ps(ptr1);
__m128 _outp = op(_p, _p1);
_mm_storeu_ps(outptr, _outp);
ptr += 4;
ptr1 += 4;
outptr += 4;
}
}

return 0;
}

c.create(w, h, d, channels, elemsize, elempack, opt.blob_allocator);
if (c.empty())
return -100;

if (b.dims == 3)
{
// type 28
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = a.channel(q);
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);

for (int z = 0; z < d; z++)
{
for (int y = 0; y < h; y++)
{
__m128 _b0 = _mm_loadu_ps(ptr1);
for (int x = 0; x < w; x++)
{
__m128 _p = _mm_loadu_ps(ptr);
__m128 _outp = op(_p, _b0);
_mm_storeu_ps(outptr, _outp);
ptr += 4;
outptr += 4;
}

ptr1 += 4;
}
}
}

return 0;
}

if (b.dims == 2)
{
// type 27
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = a.channel(q);
const float* ptr1 = b.row(q);
float* outptr = c.channel(q);

for (int z = 0; z < d; z++)
{
__m128 _b0 = _mm_loadu_ps(ptr1);
for (int y = 0; y < h; y++)
{
for (int x = 0; x < w; x++)
{
__m128 _p = _mm_loadu_ps(ptr);
__m128 _outp = op(_p, _b0);
_mm_storeu_ps(outptr, _outp);
ptr += 4;
outptr += 4;
}
}

ptr1 += 4;
}
}

return 0;
}

if (b.dims == 1)
{
if (b.w == 1 && elempack1 == 1)
{
// type 25
__m128 _b0 = _mm_set1_ps(b[0]);
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = a.channel(q);
float* outptr = c.channel(q);

for (int i = 0; i < size; i++)
{
__m128 _p = _mm_loadu_ps(ptr);
__m128 _outp = op(_p, _b0);
_mm_storeu_ps(outptr, _outp);
ptr += 4;
outptr += 4;
}
}

return 0;
}

// type 26
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = a.channel(q);
__m128 _b0 = _mm_loadu_ps((const float*)b + q * 4);
float* outptr = c.channel(q);

for (int i = 0; i < size; i++)
{
__m128 _p = _mm_loadu_ps(ptr);
__m128 _outp = op(_p, _b0);
_mm_storeu_ps(outptr, _outp);
ptr += 4;
outptr += 4;
}
}

return 0;
}
}
else if (a.dims == 3)
{
if (b.dims == 4)
{
// type 23
c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels1; q++)
{
const float* ptr = a.channel(q);
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);

for (int z = 0; z < d1; z++)
{
for (int y = 0; y < h1; y++)
{
__m128 _a0 = _mm_loadu_ps(ptr);
for (int x = 0; x < w1; x++)
{
__m128 _p = _mm_loadu_ps(ptr1);
__m128 _outp = op(_a0, _p);
_mm_storeu_ps(outptr, _outp);
ptr1 += 4;
outptr += 4;
}

ptr += 4;
}
}
}

return 0;
}

if (b.dims == 3)
{
if (w1 == 1 && h1 == 1 && channels1 == channels)
@@ -1114,7 +1571,7 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
if (b.w == 1 && elempack1 == 1)
{
// type 16
__m128 _b0 = _mm_set1_ps(((const float*)b)[0]);
__m128 _b0 = _mm_set1_ps(b[0]);
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
@@ -1157,6 +1614,42 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
}
else if (a.dims == 2)
{
if (b.dims == 4)
{
// type 22
c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels1; q++)
{
const float* ptr = a.row(q);
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);

for (int z = 0; z < d1; z++)
{
__m128 _a0 = _mm_loadu_ps(ptr);
for (int y = 0; y < h1; y++)
{
for (int x = 0; x < w1; x++)
{
__m128 _p = _mm_loadu_ps(ptr1);
__m128 _outp = op(_a0, _p);
_mm_storeu_ps(outptr, _outp);
ptr1 += 4;
outptr += 4;
}
}

ptr += 4;
}
}

return 0;
}

if (b.dims == 3)
{
// type 14
@@ -1223,7 +1716,7 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
if (b.w == 1 && elempack1 == 1)
{
// type 11
__m128 _b0 = _mm_set1_ps(((const float*)b)[0]);
__m128 _b0 = _mm_set1_ps(b[0]);
const float* ptr = a;
float* outptr = c;
for (int i = 0; i < size; i++)
@@ -1265,6 +1758,33 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
{
if (a.w == 1 && elempack == 1)
{
if (b.dims == 4)
{
// type 20
c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator);
if (c.empty())
return -100;

__m128 _a0 = _mm_set1_ps(a[0]);
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels1; q++)
{
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);

for (int i = 0; i < size1; i++)
{
__m128 _p1 = _mm_loadu_ps(ptr1);
__m128 _outp = op(_a0, _p1);
_mm_storeu_ps(outptr, _outp);
ptr1 += 4;
outptr += 4;
}
}

return 0;
}

if (b.dims == 3)
{
// type 4
@@ -1272,7 +1792,7 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
if (c.empty())
return -100;

__m128 _a0 = _mm_set1_ps(((const float*)a)[0]);
__m128 _a0 = _mm_set1_ps(a[0]);
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels1; q++)
{
@@ -1299,7 +1819,7 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
if (c.empty())
return -100;

__m128 _a0 = _mm_set1_ps(((const float*)a)[0]);
__m128 _a0 = _mm_set1_ps(a[0]);
const float* ptr1 = b;
float* outptr = c;
for (int i = 0; i < size1; i++)
@@ -1321,7 +1841,7 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
if (c.empty())
return -100;

__m128 _a0 = _mm_set1_ps(((const float*)a)[0]);
__m128 _a0 = _mm_set1_ps(a[0]);
const float* ptr1 = b;
float* outptr = c;
for (int i = 0; i < w1; i++)
@@ -1337,6 +1857,33 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
}
}

if (b.dims == 4)
{
// type 21
c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator);
if (c.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels1; q++)
{
__m128 _a0 = _mm_loadu_ps((const float*)a + q * 4);
const float* ptr1 = b.channel(q);
float* outptr = c.channel(q);

for (int i = 0; i < size1; i++)
{
__m128 _p1 = _mm_loadu_ps(ptr1);
__m128 _outp = op(_a0, _p1);
_mm_storeu_ps(outptr, _outp);
ptr1 += 4;
outptr += 4;
}
}

return 0;
}

if (b.dims == 3)
{
// type 9
@@ -1402,7 +1949,7 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
if (b.w == 1 && elempack1 == 1)
{
// type 6
__m128 _b0 = _mm_set1_ps(((const float*)b)[0]);
__m128 _b0 = _mm_set1_ps(b[0]);
const float* ptr = a;
float* outptr = c;
for (int i = 0; i < w; i++)
@@ -1444,8 +1991,9 @@ static int binary_op_scalar_inplace_pack4(Mat& a, float b, const Option& opt)

int w = a.w;
int h = a.h;
int d = a.d;
int channels = a.c;
int size = w * h;
int size = w * h * d;

__m128 _b = _mm_set1_ps((float)b);

@@ -1574,10 +2122,10 @@ int BinaryOp_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>
return binary_op_pack8<binary_op_pow_pack8>(bottom_blob, bottom_blob1, top_blob, opt);

if (op_type == Operation_RSUB)
return binary_op_pack8<binary_op_rsub_pack8>(bottom_blob, bottom_blob1, top_blob, opt);
return binary_op_pack8<binary_op_sub_pack8>(bottom_blob1, bottom_blob, top_blob, opt);

if (op_type == Operation_RDIV)
return binary_op_pack8<binary_op_rdiv_pack8>(bottom_blob, bottom_blob1, top_blob, opt);
return binary_op_pack8<binary_op_div_pack8>(bottom_blob1, bottom_blob, top_blob, opt);
}
#endif // __AVX__

@@ -1605,10 +2153,10 @@ int BinaryOp_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>
return binary_op_pack4<binary_op_pow_pack4>(bottom_blob, bottom_blob1, top_blob, opt);

if (op_type == Operation_RSUB)
return binary_op_pack4<binary_op_rsub_pack4>(bottom_blob, bottom_blob1, top_blob, opt);
return binary_op_pack4<binary_op_sub_pack4>(bottom_blob1, bottom_blob, top_blob, opt);

if (op_type == Operation_RDIV)
return binary_op_pack4<binary_op_rdiv_pack4>(bottom_blob, bottom_blob1, top_blob, opt);
return binary_op_pack4<binary_op_div_pack4>(bottom_blob1, bottom_blob, top_blob, opt);
}
#endif // __SSE2__



+ 1
- 1
src/net.cpp View File

@@ -797,7 +797,7 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio
int elemcount = 0;
if (dims == 1) elemcount = bottom_blob.elempack * bottom_blob.w;
if (dims == 2) elemcount = bottom_blob.elempack * bottom_blob.h;
if (dims == 3) elemcount = bottom_blob.elempack * bottom_blob.c;
if (dims == 3 || dims == 4) elemcount = bottom_blob.elempack * bottom_blob.c;

int elembits = bottom_blob.elembits();



+ 92
- 2
tests/test_binaryop.cpp View File

@@ -50,7 +50,7 @@ static int test_binaryop(const ncnn::Mat& _a, const ncnn::Mat& _b)
int ret = test_layer<ncnn::BinaryOp>("BinaryOp", pd, weights, ab);
if (ret != 0)
{
fprintf(stderr, "test_binaryop failed a.dims=%d a=(%d %d %d) b.dims=%d b=(%d %d %d) op_type=%d\n", a.dims, a.w, a.h, a.c, b.dims, b.w, b.h, b.c, op_type);
fprintf(stderr, "test_binaryop failed a.dims=%d a=(%d %d %d %d) b.dims=%d b=(%d %d %d %d) op_type=%d\n", a.dims, a.w, a.h, a.d, a.c, b.dims, b.w, b.h, b.d, b.c, op_type);
}

return ret;
@@ -76,7 +76,7 @@ static int test_binaryop(const ncnn::Mat& _a, float b)
int ret = test_layer<ncnn::BinaryOp>("BinaryOp", pd, weights, a);
if (ret != 0)
{
fprintf(stderr, "test_binaryop failed a.dims=%d a=(%d %d %d) b=%f op_type=%d\n", a.dims, a.w, a.h, a.c, b, op_type);
fprintf(stderr, "test_binaryop failed a.dims=%d a=(%d %d %d %d) b=%f op_type=%d\n", a.dims, a.w, a.h, a.d, a.c, b, op_type);
}

return ret;
@@ -234,6 +234,86 @@ static int test_binaryop_19()
|| test_binaryop(RandomMat(11, 6, 16), RandomMat(11, 6, 16));
}

static int test_binaryop_20()
{
return 0
|| test_binaryop(RandomMat(1), RandomMat(11, 3, 4, 2))
|| test_binaryop(RandomMat(1), RandomMat(11, 3, 4, 4))
|| test_binaryop(RandomMat(1), RandomMat(11, 3, 4, 16));
}

static int test_binaryop_21()
{
return 0
|| test_binaryop(RandomMat(2), RandomMat(11, 3, 4, 2))
|| test_binaryop(RandomMat(4), RandomMat(11, 3, 4, 4))
|| test_binaryop(RandomMat(16), RandomMat(11, 3, 4, 16));
}

static int test_binaryop_22()
{
return 0
|| test_binaryop(RandomMat(4, 2), RandomMat(11, 3, 4, 2))
|| test_binaryop(RandomMat(4, 4), RandomMat(11, 3, 4, 4))
|| test_binaryop(RandomMat(4, 16), RandomMat(11, 3, 4, 16));
}

static int test_binaryop_23()
{
return 0
|| test_binaryop(RandomMat(3, 4, 2), RandomMat(11, 3, 4, 2))
|| test_binaryop(RandomMat(3, 4, 4), RandomMat(11, 3, 4, 4))
|| test_binaryop(RandomMat(3, 4, 16), RandomMat(11, 3, 4, 16));
}

static int test_binaryop_24()
{
return 0
|| test_binaryop(RandomMat(11, 3, 4, 2), 1.f)
|| test_binaryop(RandomMat(11, 3, 4, 4), 1.f)
|| test_binaryop(RandomMat(11, 3, 4, 16), 1.f);
}

static int test_binaryop_25()
{
return 0
|| test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(1))
|| test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(1))
|| test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(1));
}

static int test_binaryop_26()
{
return 0
|| test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(2))
|| test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(4))
|| test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(16));
}

static int test_binaryop_27()
{
return 0
|| test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(4, 2))
|| test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(4, 4))
|| test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(4, 16));
}

static int test_binaryop_28()
{
return 0
|| test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(3, 4, 2))
|| test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(3, 4, 4))
|| test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(3, 4, 16));
}

static int test_binaryop_29()
{
return 0
|| test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(11, 3, 4, 2))
|| test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(11, 3, 4, 4))
|| test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(11, 3, 4, 16));
}

static int test_binaryop_s1()
{
return 0
@@ -324,6 +404,16 @@ int main()
|| test_binaryop_17()
|| test_binaryop_18()
|| test_binaryop_19()
|| test_binaryop_20()
|| test_binaryop_21()
|| test_binaryop_22()
|| test_binaryop_23()
|| test_binaryop_24()
|| test_binaryop_25()
|| test_binaryop_26()
|| test_binaryop_27()
|| test_binaryop_28()
|| test_binaryop_29()
|| test_binaryop_s1()
|| test_binaryop_s2()
|| test_binaryop_s3()


+ 13
- 4
tests/test_unaryop.cpp View File

@@ -49,13 +49,21 @@ static int test_unaryop(const ncnn::Mat& _a)
int ret = test_layer<ncnn::UnaryOp>("UnaryOp", pd, weights, a);
if (ret != 0)
{
fprintf(stderr, "test_unaryop failed a.dims=%d a=(%d %d %d) op_type=%d\n", a.dims, a.w, a.h, a.c, op_type);
fprintf(stderr, "test_unaryop failed a.dims=%d a=(%d %d %d %d) op_type=%d\n", a.dims, a.w, a.h, a.d, a.c, op_type);
}

return ret;
}

static int test_unaryop_0()
{
return 0
|| test_unaryop(RandomMat(11, 3, 2, 16))
|| test_unaryop(RandomMat(10, 2, 2, 12))
|| test_unaryop(RandomMat(6, 1, 5, 13));
}

static int test_unaryop_1()
{
return 0
|| test_unaryop(RandomMat(11, 7, 16))
@@ -63,7 +71,7 @@ static int test_unaryop_0()
|| test_unaryop(RandomMat(6, 5, 13));
}

static int test_unaryop_1()
static int test_unaryop_2()
{
return 0
|| test_unaryop(RandomMat(12, 16))
@@ -71,7 +79,7 @@ static int test_unaryop_1()
|| test_unaryop(RandomMat(14, 15));
}

static int test_unaryop_2()
static int test_unaryop_3()
{
return 0
|| test_unaryop(RandomMat(128))
@@ -88,7 +96,8 @@ int main()
int ret = 0
|| test_unaryop_0()
|| test_unaryop_1()
|| test_unaryop_2();
|| test_unaryop_2()
|| test_unaryop_3();

if (ret != 0)
return ret;


+ 2
- 2
tests/testutil.h View File

@@ -457,7 +457,7 @@ int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<n
int elemcount = 0;
if (dims == 1) elemcount = a4[i].elempack * a4[i].w;
if (dims == 2) elemcount = a4[i].elempack * a4[i].h;
if (dims == 3) elemcount = a4[i].elempack * a4[i].c;
if (dims == 3 || dims == 4) elemcount = a4[i].elempack * a4[i].c;

int elembits = a4[i].elembits();

@@ -882,7 +882,7 @@ int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<n
int elemcount = 0;
if (dims == 1) elemcount = a4.elempack * a4.w;
if (dims == 2) elemcount = a4.elempack * a4.h;
if (dims == 3) elemcount = a4.elempack * a4.c;
if (dims == 3 || dims == 4) elemcount = a4.elempack * a4.c;

int elembits = a4.elembits();



+ 2
- 2
toolchains/c906-v222.toolchain.cmake View File

@@ -22,8 +22,8 @@ set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)

set(CMAKE_C_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static")
set(CMAKE_CXX_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static")
set(CMAKE_C_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -static")
set(CMAKE_CXX_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -static")

# cache flags
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags")


+ 2
- 2
toolchains/c906-v223.toolchain.cmake View File

@@ -22,8 +22,8 @@ set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)

set(CMAKE_C_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static")
set(CMAKE_CXX_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static")
set(CMAKE_C_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -static")
set(CMAKE_CXX_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -static")

# replace vfredsum_vs* with vfredusum_vs*
add_definitions(-Dvfredsum_vs_f32m1_f32m1=vfredusum_vs_f32m1_f32m1)


+ 2
- 2
toolchains/c906.toolchain.cmake View File

@@ -22,8 +22,8 @@ set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)

set(CMAKE_C_FLAGS "-march=rv64gcvxtheadc -mabi=lp64d -mtune=c906 -DRVV_SPEC_0_7 -D__riscv_zfh=1 -static")
set(CMAKE_CXX_FLAGS "-march=rv64gcvxtheadc -mabi=lp64d -mtune=c906 -DRVV_SPEC_0_7 -D__riscv_zfh=1 -static")
set(CMAKE_C_FLAGS "-march=rv64gcvxtheadc -mabi=lp64d -mtune=c906 -DRVV_SPEC_0_7 -D__riscv_zfh=1 -DC906=1 -static")
set(CMAKE_CXX_FLAGS "-march=rv64gcvxtheadc -mabi=lp64d -mtune=c906 -DRVV_SPEC_0_7 -D__riscv_zfh=1 -DC906=1 -static")

# cache flags
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags")


Loading…
Cancel
Save