| @@ -395,6 +395,40 @@ jobs: | |||
| token: ${{ secrets.CODECOV_TOKEN }} | |||
| file: build-arm82/lcov.info | |||
| linux-gcc-arm82-omp: | |||
| runs-on: ubuntu-20.04 | |||
| steps: | |||
| - uses: actions/checkout@v2 | |||
| - name: lcov | |||
| run: sudo apt-get install lcov | |||
| - name: cache-qemu | |||
| id: cache-qemu | |||
| uses: actions/cache@v2.1.7 | |||
| with: | |||
| path: qemu-install | |||
| key: qemu-aarch64-install-1 | |||
| - name: checkout-qemu | |||
| if: steps.cache-qemu.outputs.cache-hit != 'true' | |||
| uses: actions/checkout@v2 | |||
| with: | |||
| repository: qemu/qemu | |||
| path: qemu | |||
| ref: 8746309137ba470d1b2e8f5ce86ac228625db940 | |||
| - name: qemu | |||
| if: steps.cache-qemu.outputs.cache-hit != 'true' | |||
| run: | | |||
| cd qemu | |||
| ./configure --prefix=install --target-list=aarch64-linux-user --disable-system | |||
| make -j2 | |||
| make install | |||
| cp -r aarch64-linux-user/install $GITHUB_WORKSPACE/qemu-install | |||
| - name: aarch64-gnu-toolchain | |||
| run: | | |||
| sudo apt-get update | |||
| sudo apt-get install g++-aarch64-linux-gnu | |||
| - name: build-arm82-omp | |||
| run: | | |||
| mkdir build-arm82-omp && cd build-arm82-omp | |||
| @@ -418,6 +452,40 @@ jobs: | |||
| token: ${{ secrets.CODECOV_TOKEN }} | |||
| file: build-arm82-omp/lcov.info | |||
| linux-gcc-arm82dot-omp: | |||
| runs-on: ubuntu-20.04 | |||
| steps: | |||
| - uses: actions/checkout@v2 | |||
| - name: lcov | |||
| run: sudo apt-get install lcov | |||
| - name: cache-qemu | |||
| id: cache-qemu | |||
| uses: actions/cache@v2.1.7 | |||
| with: | |||
| path: qemu-install | |||
| key: qemu-aarch64-install-1 | |||
| - name: checkout-qemu | |||
| if: steps.cache-qemu.outputs.cache-hit != 'true' | |||
| uses: actions/checkout@v2 | |||
| with: | |||
| repository: qemu/qemu | |||
| path: qemu | |||
| ref: 8746309137ba470d1b2e8f5ce86ac228625db940 | |||
| - name: qemu | |||
| if: steps.cache-qemu.outputs.cache-hit != 'true' | |||
| run: | | |||
| cd qemu | |||
| ./configure --prefix=install --target-list=aarch64-linux-user --disable-system | |||
| make -j2 | |||
| make install | |||
| cp -r aarch64-linux-user/install $GITHUB_WORKSPACE/qemu-install | |||
| - name: aarch64-gnu-toolchain | |||
| run: | | |||
| sudo apt-get update | |||
| sudo apt-get install g++-aarch64-linux-gnu | |||
| - name: build-arm82dot-omp | |||
| run: | | |||
| mkdir build-arm82dot-omp && cd build-arm82dot-omp | |||
| @@ -4,7 +4,7 @@ ncnn BinaryOp accepts blobs with different shape | |||
| C = BinaryOp(A, B) | |||
| shape notation convention is [w], [w,h], [w,h,c] | |||
| shape notation convention is [w], [w,h], [w,h,c], [w,h,d,c] | |||
| |type|A|B|C| | |||
| |---|---|---|---| | |||
| @@ -27,6 +27,16 @@ shape notation convention is [w], [w,h], [w,h,c] | |||
| |17|[2,3,4]|[4]|[2,3,4]| | |||
| |18|[2,3,4]|[3,4]|[2,3,4]| | |||
| |19|[2,3,4]|[2,3,4]|[2,3,4]| | |||
| |20|[1]|[2,3,4,5]|[2,3,4,5]| | |||
| |21|[5]|[2,3,4,5]|[2,3,4,5]| | |||
| |22|[4,5]|[2,3,4,5]|[2,3,4,5]| | |||
| |23|[3,4,5]|[2,3,4,5]|[2,3,4,5]| | |||
| |24|[2,3,4,5]|scalar|[2,3,4,5]| | |||
| |25|[2,3,4,5]|[1]|[2,3,4,5]| | |||
| |26|[2,3,4,5]|[5]|[2,3,4,5]| | |||
| |27|[2,3,4,5]|[4,5]|[2,3,4,5]| | |||
| |28|[2,3,4,5]|[3,4,5]|[2,3,4,5]| | |||
| |29|[2,3,4,5]|[2,3,4,5]|[2,3,4,5]| | |||
| some special broadcasting rule exists for model compatibility | |||
| @@ -48,8 +48,9 @@ static int unary_op_inplace_pack4(Mat& a, const Option& opt) | |||
| int w = a.w; | |||
| int h = a.h; | |||
| int d = a.d; | |||
| int channels = a.c; | |||
| int size = w * h; | |||
| int size = w * h * d; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| @@ -341,8 +342,9 @@ static int unary_op_inplace_pack8_fp16s(Mat& a, const Option& opt) | |||
| int w = a.w; | |||
| int h = a.h; | |||
| int d = a.d; | |||
| int channels = a.c; | |||
| int size = w * h; | |||
| int size = w * h * d; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| @@ -554,8 +556,9 @@ static int unary_op_inplace_pack4_fp16s(Mat& a, const Option& opt) | |||
| int w = a.w; | |||
| int h = a.h; | |||
| int d = a.d; | |||
| int channels = a.c; | |||
| int size = w * h; | |||
| int size = w * h * d; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| @@ -751,8 +754,9 @@ static int unary_op_inplace_fp16s(Mat& a, const Option& opt) | |||
| int w = a.w; | |||
| int h = a.h; | |||
| int d = a.d; | |||
| int channels = a.c; | |||
| int size = w * h; | |||
| int size = w * h * d; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| @@ -1082,8 +1086,9 @@ static int unary_op_inplace_pack4_bf16s(Mat& a, const Option& opt) | |||
| int w = a.w; | |||
| int h = a.h; | |||
| int d = a.d; | |||
| int channels = a.c; | |||
| int size = w * h; | |||
| int size = w * h * d; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| @@ -1111,8 +1116,9 @@ static int unary_op_inplace_bf16s(Mat& a, const Option& opt) | |||
| int w = a.w; | |||
| int h = a.h; | |||
| int d = a.d; | |||
| int channels = a.c; | |||
| int size = w * h; | |||
| int size = w * h * d; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| @@ -49,17 +49,181 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c, const Option& opt) | |||
| int w = a.w; | |||
| int h = a.h; | |||
| int d = a.d; | |||
| int channels = a.c; | |||
| int size = w * h; | |||
| int size = w * h * d; | |||
| size_t elemsize = a.elemsize; | |||
| int w1 = b.w; | |||
| int h1 = b.h; | |||
| int d1 = b.d; | |||
| int channels1 = b.c; | |||
| int size1 = w1 * h1; | |||
| int size1 = w1 * h1 * d1; | |||
| if (a.dims == 3) | |||
| if (a.dims == 4) | |||
| { | |||
| if (b.dims == 4) | |||
| { | |||
| // type 29 | |||
| c.create(w, h, d, channels, elemsize, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const float* ptr = a.channel(q); | |||
| const float* ptr1 = b.channel(q); | |||
| float* outptr = c.channel(q); | |||
| for (int i = 0; i < size; i++) | |||
| { | |||
| outptr[i] = op(ptr[i], ptr1[i]); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| c.create(w, h, d, channels, elemsize, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| if (b.dims == 3) | |||
| { | |||
| // type 28 | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const float* ptr = a.channel(q); | |||
| const float* ptr1 = b.channel(q); | |||
| float* outptr = c.channel(q); | |||
| for (int z = 0; z < d; z++) | |||
| { | |||
| for (int y = 0; y < h; y++) | |||
| { | |||
| const float b0 = ptr1[y]; | |||
| for (int x = 0; x < w; x++) | |||
| { | |||
| outptr[x] = op(ptr[x], b0); | |||
| } | |||
| ptr += w; | |||
| outptr += w; | |||
| } | |||
| ptr1 += h; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| if (b.dims == 2) | |||
| { | |||
| // type 27 | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const float* ptr = a.channel(q); | |||
| const float* ptr1 = b.row(q); | |||
| float* outptr = c.channel(q); | |||
| for (int z = 0; z < d; z++) | |||
| { | |||
| const float b0 = ptr1[z]; | |||
| for (int y = 0; y < h; y++) | |||
| { | |||
| for (int x = 0; x < w; x++) | |||
| { | |||
| outptr[x] = op(ptr[x], b0); | |||
| } | |||
| ptr += w; | |||
| outptr += w; | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| if (b.dims == 1) | |||
| { | |||
| if (b.w == 1) | |||
| { | |||
| // type 25 | |||
| const float b0 = b[0]; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const float* ptr = a.channel(q); | |||
| float* outptr = c.channel(q); | |||
| for (int i = 0; i < size; i++) | |||
| { | |||
| outptr[i] = op(ptr[i], b0); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| // type 26 | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const float* ptr = a.channel(q); | |||
| const float b0 = b[q]; | |||
| float* outptr = c.channel(q); | |||
| for (int i = 0; i < size; i++) | |||
| { | |||
| outptr[i] = op(ptr[i], b0); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| } | |||
| else if (a.dims == 3) | |||
| { | |||
| if (b.dims == 4) | |||
| { | |||
| // type 23 | |||
| c.create(w1, h1, d1, channels1, elemsize, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels1; q++) | |||
| { | |||
| const float* ptr = a.channel(q); | |||
| const float* ptr1 = b.channel(q); | |||
| float* outptr = c.channel(q); | |||
| for (int z = 0; z < d1; z++) | |||
| { | |||
| for (int y = 0; y < h1; y++) | |||
| { | |||
| const float a0 = ptr[y]; | |||
| for (int x = 0; x < w1; x++) | |||
| { | |||
| outptr[x] = op(a0, ptr1[x]); | |||
| } | |||
| ptr1 += w1; | |||
| outptr += w1; | |||
| } | |||
| ptr += h1; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| if (b.dims == 3) | |||
| { | |||
| if (w1 == 1 && h1 == 1 && channels1 == channels) | |||
| @@ -359,6 +523,39 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c, const Option& opt) | |||
| } | |||
| else if (a.dims == 2) | |||
| { | |||
| if (b.dims == 4) | |||
| { | |||
| // type 22 | |||
| c.create(w1, h1, d1, channels1, elemsize, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels1; q++) | |||
| { | |||
| const float* ptr = a.row(q); | |||
| const float* ptr1 = b.channel(q); | |||
| float* outptr = c.channel(q); | |||
| for (int z = 0; z < d1; z++) | |||
| { | |||
| const float a0 = ptr[z]; | |||
| for (int y = 0; y < h1; y++) | |||
| { | |||
| for (int x = 0; x < w1; x++) | |||
| { | |||
| outptr[x] = op(a0, ptr1[x]); | |||
| } | |||
| ptr1 += w1; | |||
| outptr += w1; | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| if (b.dims == 3) | |||
| { | |||
| // type 14 | |||
| @@ -445,6 +642,29 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c, const Option& opt) | |||
| { | |||
| if (a.w == 1) | |||
| { | |||
| if (b.dims == 4) | |||
| { | |||
| // type 20 | |||
| c.create(w1, h1, d1, channels1, elemsize, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| const float a0 = a[0]; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels1; q++) | |||
| { | |||
| const float* ptr1 = b.channel(q); | |||
| float* outptr = c.channel(q); | |||
| for (int i = 0; i < size1; i++) | |||
| { | |||
| outptr[i] = op(a0, ptr1[i]); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| if (b.dims == 3) | |||
| { | |||
| // type 4 | |||
| @@ -501,6 +721,29 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c, const Option& opt) | |||
| } | |||
| } | |||
| if (b.dims == 4) | |||
| { | |||
| // type 21 | |||
| c.create(w1, h1, d1, channels1, elemsize, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels1; q++) | |||
| { | |||
| const float a0 = a[q]; | |||
| const float* ptr1 = b.channel(q); | |||
| float* outptr = c.channel(q); | |||
| for (int i = 0; i < size1; i++) | |||
| { | |||
| outptr[i] = op(a0, ptr1[i]); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| if (b.dims == 3) | |||
| { | |||
| // type 9 | |||
| @@ -585,8 +828,9 @@ static int binary_op_scalar_inplace(Mat& a, float b, const Option& opt) | |||
| int w = a.w; | |||
| int h = a.h; | |||
| int d = a.d; | |||
| int channels = a.c; | |||
| int size = w * h; | |||
| int size = w * h * d; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| @@ -703,10 +947,10 @@ int BinaryOp::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to | |||
| return binary_op<binary_op_pow>(bottom_blob, bottom_blob1, top_blob, opt); | |||
| if (op_type == Operation_RSUB) | |||
| return binary_op<binary_op_rsub>(bottom_blob, bottom_blob1, top_blob, opt); | |||
| return binary_op<binary_op_sub>(bottom_blob1, bottom_blob, top_blob, opt); | |||
| if (op_type == Operation_RDIV) | |||
| return binary_op<binary_op_rdiv>(bottom_blob, bottom_blob1, top_blob, opt); | |||
| return binary_op<binary_op_div>(bottom_blob1, bottom_blob, top_blob, opt); | |||
| return 0; | |||
| } | |||
| @@ -41,20 +41,203 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt | |||
| int w = a.w; | |||
| int h = a.h; | |||
| int d = a.d; | |||
| int channels = a.c; | |||
| int size = w * h; | |||
| int size = w * h * d; | |||
| size_t elemsize = a.elemsize; | |||
| int elempack = a.elempack; | |||
| int w1 = b.w; | |||
| int h1 = b.h; | |||
| int d1 = b.d; | |||
| int channels1 = b.c; | |||
| int size1 = w1 * h1; | |||
| int size1 = w1 * h1 * d1; | |||
| size_t elemsize1 = b.elemsize; | |||
| int elempack1 = b.elempack; | |||
| if (a.dims == 3) | |||
| if (a.dims == 4) | |||
| { | |||
| if (b.dims == 4) | |||
| { | |||
| // type 29 | |||
| c.create(w, h, d, channels, elemsize, elempack, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const float* ptr = a.channel(q); | |||
| const float* ptr1 = b.channel(q); | |||
| float* outptr = c.channel(q); | |||
| for (int i = 0; i < size; i++) | |||
| { | |||
| v4f32 _p = (v4f32)__msa_ld_w(ptr, 0); | |||
| v4f32 _p1 = (v4f32)__msa_ld_w(ptr1, 0); | |||
| v4f32 _outp = op(_p, _p1); | |||
| __msa_st_w((v4i32)_outp, outptr, 0); | |||
| ptr += 4; | |||
| ptr1 += 4; | |||
| outptr += 4; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| c.create(w, h, d, channels, elemsize, elempack, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| if (b.dims == 3) | |||
| { | |||
| // type 28 | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const float* ptr = a.channel(q); | |||
| const float* ptr1 = b.channel(q); | |||
| float* outptr = c.channel(q); | |||
| for (int z = 0; z < d; z++) | |||
| { | |||
| for (int y = 0; y < h; y++) | |||
| { | |||
| v4f32 _b0 = (v4f32)__msa_ld_w(ptr1, 0); | |||
| for (int x = 0; x < w; x++) | |||
| { | |||
| v4f32 _p = (v4f32)__msa_ld_w(ptr, 0); | |||
| v4f32 _outp = op(_p, _b0); | |||
| __msa_st_w((v4i32)_outp, outptr, 0); | |||
| ptr += 4; | |||
| outptr += 4; | |||
| } | |||
| ptr1 += 4; | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| if (b.dims == 2) | |||
| { | |||
| // type 27 | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const float* ptr = a.channel(q); | |||
| const float* ptr1 = b.row(q); | |||
| float* outptr = c.channel(q); | |||
| for (int z = 0; z < d; z++) | |||
| { | |||
| v4f32 _b0 = (v4f32)__msa_ld_w(ptr1, 0); | |||
| for (int y = 0; y < h; y++) | |||
| { | |||
| for (int x = 0; x < w; x++) | |||
| { | |||
| v4f32 _p = (v4f32)__msa_ld_w(ptr, 0); | |||
| v4f32 _outp = op(_p, _b0); | |||
| __msa_st_w((v4i32)_outp, outptr, 0); | |||
| ptr += 4; | |||
| outptr += 4; | |||
| } | |||
| } | |||
| ptr1 += 4; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| if (b.dims == 1) | |||
| { | |||
| if (b.w == 1 && elempack1 == 1) | |||
| { | |||
| // type 25 | |||
| v4f32 _b0 = __msa_fill_w_f32(b[0]); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const float* ptr = a.channel(q); | |||
| float* outptr = c.channel(q); | |||
| for (int i = 0; i < size; i++) | |||
| { | |||
| v4f32 _p = (v4f32)__msa_ld_w(ptr, 0); | |||
| v4f32 _outp = op(_p, _b0); | |||
| __msa_st_w((v4i32)_outp, outptr, 0); | |||
| ptr += 4; | |||
| outptr += 4; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| // type 26 | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const float* ptr = a.channel(q); | |||
| v4f32 _b0 = (v4f32)__msa_ld_w((const float*)b + q * 4, 0); | |||
| float* outptr = c.channel(q); | |||
| for (int i = 0; i < size; i++) | |||
| { | |||
| v4f32 _p = (v4f32)__msa_ld_w(ptr, 0); | |||
| v4f32 _outp = op(_p, _b0); | |||
| __msa_st_w((v4i32)_outp, outptr, 0); | |||
| ptr += 4; | |||
| outptr += 4; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| } | |||
| else if (a.dims == 3) | |||
| { | |||
| if (b.dims == 4) | |||
| { | |||
| // type 23 | |||
| c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels1; q++) | |||
| { | |||
| const float* ptr = a.channel(q); | |||
| const float* ptr1 = b.channel(q); | |||
| float* outptr = c.channel(q); | |||
| for (int z = 0; z < d1; z++) | |||
| { | |||
| for (int y = 0; y < h1; y++) | |||
| { | |||
| v4f32 _a0 = (v4f32)__msa_ld_w(ptr, 0); | |||
| for (int x = 0; x < w1; x++) | |||
| { | |||
| v4f32 _p = (v4f32)__msa_ld_w(ptr1, 0); | |||
| v4f32 _outp = op(_a0, _p); | |||
| __msa_st_w((v4i32)_outp, outptr, 0); | |||
| ptr1 += 4; | |||
| outptr += 4; | |||
| } | |||
| ptr += 4; | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| if (b.dims == 3) | |||
| { | |||
| if (w1 == 1 && h1 == 1 && channels1 == channels) | |||
| @@ -417,6 +600,42 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt | |||
| } | |||
| else if (a.dims == 2) | |||
| { | |||
| if (b.dims == 4) | |||
| { | |||
| // type 22 | |||
| c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels1; q++) | |||
| { | |||
| const float* ptr = a.row(q); | |||
| const float* ptr1 = b.channel(q); | |||
| float* outptr = c.channel(q); | |||
| for (int z = 0; z < d1; z++) | |||
| { | |||
| v4f32 _a0 = (v4f32)__msa_ld_w(ptr, 0); | |||
| for (int y = 0; y < h1; y++) | |||
| { | |||
| for (int x = 0; x < w1; x++) | |||
| { | |||
| v4f32 _p = (v4f32)__msa_ld_w(ptr1, 0); | |||
| v4f32 _outp = op(_a0, _p); | |||
| __msa_st_w((v4i32)_outp, outptr, 0); | |||
| ptr1 += 4; | |||
| outptr += 4; | |||
| } | |||
| } | |||
| ptr += 4; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| if (b.dims == 3) | |||
| { | |||
| // type 14 | |||
| @@ -530,6 +749,33 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt | |||
| { | |||
| if (a.w == 1 && elempack == 1) | |||
| { | |||
| if (b.dims == 4) | |||
| { | |||
| // type 20 | |||
| c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| v4f32 _a0 = __msa_fill_w_f32(a[0]); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels1; q++) | |||
| { | |||
| const float* ptr1 = b.channel(q); | |||
| float* outptr = c.channel(q); | |||
| for (int i = 0; i < size1; i++) | |||
| { | |||
| v4f32 _p1 = (v4f32)__msa_ld_w(ptr1, 0); | |||
| v4f32 _outp = op(_a0, _p1); | |||
| __msa_st_w((v4i32)_outp, outptr, 0); | |||
| ptr1 += 4; | |||
| outptr += 4; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| if (b.dims == 3) | |||
| { | |||
| // type 4 | |||
| @@ -605,6 +851,33 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt | |||
| } | |||
| } | |||
| if (b.dims == 4) | |||
| { | |||
| // type 21 | |||
| c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels1; q++) | |||
| { | |||
| v4f32 _a0 = (v4f32)__msa_ld_w((const float*)a + q * 4, 0); | |||
| const float* ptr1 = b.channel(q); | |||
| float* outptr = c.channel(q); | |||
| for (int i = 0; i < size1; i++) | |||
| { | |||
| v4f32 _p1 = (v4f32)__msa_ld_w(ptr1, 0); | |||
| v4f32 _outp = op(_a0, _p1); | |||
| __msa_st_w((v4i32)_outp, outptr, 0); | |||
| ptr1 += 4; | |||
| outptr += 4; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| if (b.dims == 3) | |||
| { | |||
| // type 9 | |||
| @@ -717,8 +990,9 @@ static int binary_op_scalar_inplace_pack4(Mat& a, float b, const Option& opt) | |||
| int w = a.w; | |||
| int h = a.h; | |||
| int d = a.d; | |||
| int channels = a.c; | |||
| int size = w * h; | |||
| int size = w * h * d; | |||
| v4f32 _b = __msa_fill_w_f32(b); | |||
| @@ -847,10 +1121,10 @@ int BinaryOp_mips::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat | |||
| return binary_op_pack4<binary_op_pow_pack4>(bottom_blob, bottom_blob1, top_blob, opt); | |||
| if (op_type == Operation_RSUB) | |||
| return binary_op_pack4<binary_op_rsub_pack4>(bottom_blob, bottom_blob1, top_blob, opt); | |||
| return binary_op_pack4<binary_op_sub_pack4>(bottom_blob1, bottom_blob, top_blob, opt); | |||
| if (op_type == Operation_RDIV) | |||
| return binary_op_pack4<binary_op_rdiv_pack4>(bottom_blob, bottom_blob1, top_blob, opt); | |||
| return binary_op_pack4<binary_op_div_pack4>(bottom_blob1, bottom_blob, top_blob, opt); | |||
| } | |||
| #endif // __mips_msa | |||
| @@ -38,8 +38,9 @@ static int unary_op_inplace_pack4(Mat& a, const Option& opt) | |||
| int w = a.w; | |||
| int h = a.h; | |||
| int d = a.d; | |||
| int channels = a.c; | |||
| int size = w * h; | |||
| int size = w * h * d; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| @@ -1,8 +1,6 @@ | |||
| // Xavier Hsinyuan is pleased to support the open source community by making | |||
| // ncnn available. | |||
| // Xavier Hsinyuan is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2021 Xavier Hsinyuan <thelastlinex@hotmail.com>. All rights | |||
| // reserved. | |||
| // Copyright (C) 2021 Xavier Hsinyuan <thelastlinex@hotmail.com>. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this | |||
| // file except in compliance with the License. You may obtain a copy of the | |||
| @@ -15,10 +13,12 @@ | |||
| // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | |||
| // License for the specific language governing permissions and limitations under | |||
| // the License. | |||
| #ifndef LAYER_BINARYOP_RISCV_H | |||
| #define LAYER_BINARYOP_RISCV_H | |||
| #include "binaryop.h" | |||
| namespace ncnn { | |||
| class BinaryOp_riscv : virtual public BinaryOp | |||
| @@ -26,19 +26,17 @@ class BinaryOp_riscv : virtual public BinaryOp | |||
| public: | |||
| BinaryOp_riscv(); | |||
| virtual int forward(const std::vector<Mat>& bottom_blobs, | |||
| std::vector<Mat>& top_blobs, const Option& opt) const; | |||
| virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; | |||
| virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; | |||
| protected: | |||
| #if __riscv_vector && __riscv_zfh | |||
| int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; | |||
| int forward_fp16sa(const std::vector<Mat>& bottom_blobs, | |||
| std::vector<Mat>& top_blobs, const Option& opt) const; | |||
| int forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; | |||
| int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; | |||
| #endif | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_BINARYOP_RISCV_H | |||
| #endif // LAYER_BINARYOP_RISCV_H | |||
| @@ -324,7 +324,7 @@ static void conv3x3s1_winograd64_packn_rvv(const Mat& bottom_blob, Mat& top_blob | |||
| for (int q = 0; q < inch; q++) | |||
| { | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| for (int l = 0; l < packn; l++) | |||
| { | |||
| tmpptr[0] = r0[l]; | |||
| @@ -365,7 +365,7 @@ static void conv3x3s1_winograd64_packn_rvv(const Mat& bottom_blob, Mat& top_blob | |||
| for (int q = 0; q < inch; q++) | |||
| { | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| for (int l = 0; l < packn; l++) | |||
| { | |||
| tmpptr[0] = r0[l]; | |||
| @@ -398,7 +398,7 @@ static void conv3x3s1_winograd64_packn_rvv(const Mat& bottom_blob, Mat& top_blob | |||
| for (int q = 0; q < inch; q++) | |||
| { | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| for (int l = 0; l < packn; l++) | |||
| { | |||
| tmpptr[0] = r0[l]; | |||
| @@ -999,7 +999,7 @@ static void conv3x3s1_winograd42_packn_rvv(const Mat& bottom_blob, Mat& top_blob | |||
| for (int q = 0; q < inch; q++) | |||
| { | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| for (int l = 0; l < packn; l++) | |||
| { | |||
| tmpptr[0] = r0[l]; | |||
| @@ -1040,7 +1040,7 @@ static void conv3x3s1_winograd42_packn_rvv(const Mat& bottom_blob, Mat& top_blob | |||
| for (int q = 0; q < inch; q++) | |||
| { | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| for (int l = 0; l < packn; l++) | |||
| { | |||
| tmpptr[0] = r0[l]; | |||
| @@ -1073,7 +1073,7 @@ static void conv3x3s1_winograd42_packn_rvv(const Mat& bottom_blob, Mat& top_blob | |||
| for (int q = 0; q < inch; q++) | |||
| { | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| for (int l = 0; l < packn; l++) | |||
| { | |||
| tmpptr[0] = r0[l]; | |||
| @@ -324,7 +324,7 @@ static void conv3x3s1_winograd64_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& t | |||
| for (int q = 0; q < inch; q++) | |||
| { | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| for (int l = 0; l < packn; l++) | |||
| { | |||
| tmpptr[0] = r0[l]; | |||
| @@ -365,7 +365,7 @@ static void conv3x3s1_winograd64_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& t | |||
| for (int q = 0; q < inch; q++) | |||
| { | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| for (int l = 0; l < packn; l++) | |||
| { | |||
| tmpptr[0] = r0[l]; | |||
| @@ -398,7 +398,7 @@ static void conv3x3s1_winograd64_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& t | |||
| for (int q = 0; q < inch; q++) | |||
| { | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| for (int l = 0; l < packn; l++) | |||
| { | |||
| tmpptr[0] = r0[l]; | |||
| @@ -999,7 +999,7 @@ static void conv3x3s1_winograd42_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& t | |||
| for (int q = 0; q < inch; q++) | |||
| { | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| for (int l = 0; l < packn; l++) | |||
| { | |||
| tmpptr[0] = r0[l]; | |||
| @@ -1040,7 +1040,7 @@ static void conv3x3s1_winograd42_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& t | |||
| for (int q = 0; q < inch; q++) | |||
| { | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| for (int l = 0; l < packn; l++) | |||
| { | |||
| tmpptr[0] = r0[l]; | |||
| @@ -1073,7 +1073,7 @@ static void conv3x3s1_winograd42_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& t | |||
| for (int q = 0; q < inch; q++) | |||
| { | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| for (int l = 0; l < packn; l++) | |||
| { | |||
| tmpptr[0] = r0[l]; | |||
| @@ -84,7 +84,7 @@ static void convolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob | |||
| } | |||
| } | |||
| #ifdef RVV_SPEC_0_7 | |||
| #if C906 | |||
| // TODO | |||
| std::vector<float> ss(packn); | |||
| vse32_v_f32m2((float*)ss.data(), _sum, vl); | |||
| @@ -54,7 +54,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| for (int l = 0; l < packn; l++) | |||
| { | |||
| tmpptr[0] = img0[l]; | |||
| @@ -103,7 +103,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| for (int l = 0; l < packn; l++) | |||
| { | |||
| tmpptr[0] = img0[l]; | |||
| @@ -144,7 +144,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| for (int l = 0; l < packn; l++) | |||
| { | |||
| tmpptr[0] = img0[l]; | |||
| @@ -54,7 +54,8 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| #ifdef RVV_SPEC_0_7 | |||
| asm volatile( | |||
| "mv t3, %[LEN] \n\t" | |||
| "mv t1, %[SRC] \n\t" | |||
| @@ -83,7 +84,22 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo | |||
| img0 += size * packn; | |||
| tmpptr += packn * 8; | |||
| #else | |||
| for (int l = 0; l < packn; l++) | |||
| { | |||
| tmpptr[0] = img0[l]; | |||
| tmpptr[1] = img0[l + packn]; | |||
| tmpptr[2] = img0[l + packn * 2]; | |||
| tmpptr[3] = img0[l + packn * 3]; | |||
| tmpptr[4] = img0[l + packn * 4]; | |||
| tmpptr[5] = img0[l + packn * 5]; | |||
| tmpptr[6] = img0[l + packn * 6]; | |||
| tmpptr[7] = img0[l + packn * 7]; | |||
| tmpptr += 8; | |||
| } | |||
| img0 += size * packn; | |||
| #endif | |||
| #else | |||
| vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl); | |||
| vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl); | |||
| @@ -118,7 +134,8 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| #ifdef RVV_SPEC_0_7 | |||
| asm volatile( | |||
| "mv t3, %[LEN] \n\t" | |||
| "mv t1, %[SRC] \n\t" | |||
| @@ -138,6 +155,18 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo | |||
| img0 += size * packn; | |||
| tmpptr += packn * 4; | |||
| #else | |||
| for (int l = 0; l < packn; l++) | |||
| { | |||
| tmpptr[0] = img0[l]; | |||
| tmpptr[1] = img0[l + packn]; | |||
| tmpptr[2] = img0[l + packn * 2]; | |||
| tmpptr[3] = img0[l + packn * 3]; | |||
| tmpptr += 4; | |||
| } | |||
| img0 += size * packn; | |||
| #endif | |||
| #else | |||
| vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl); | |||
| vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl); | |||
| @@ -169,7 +198,8 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| #ifdef RVV_SPEC_0_7 | |||
| asm volatile( | |||
| "mv t3, %[LEN] \n\t" | |||
| "mv t1, %[SRC] \n\t" | |||
| @@ -185,6 +215,16 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo | |||
| img0 += size * packn; | |||
| tmpptr += packn * 2; | |||
| #else | |||
| for (int l = 0; l < packn; l++) | |||
| { | |||
| tmpptr[0] = img0[l]; | |||
| tmpptr[1] = img0[l + packn]; | |||
| tmpptr += 2; | |||
| } | |||
| img0 += size * packn; | |||
| #endif | |||
| #else | |||
| vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl); | |||
| vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl); | |||
| @@ -53,7 +53,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| for (int l = 0; l < packn; l++) | |||
| { | |||
| tmpptr[0] = img0[l]; | |||
| @@ -102,7 +102,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| for (int l = 0; l < packn; l++) | |||
| { | |||
| tmpptr[0] = img0[l]; | |||
| @@ -143,7 +143,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| for (int l = 0; l < packn; l++) | |||
| { | |||
| tmpptr[0] = img0[l]; | |||
| @@ -240,7 +240,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c | |||
| kptr0 += packn; | |||
| } | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| vsse32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, vl); | |||
| vsse32_v_f32m1(outptr0 + 1, top_blob.cstep * sizeof(float), _sum1, vl); | |||
| vsse32_v_f32m1(outptr0 + 2, top_blob.cstep * sizeof(float), _sum2, vl); | |||
| @@ -281,7 +281,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c | |||
| kptr0 += packn; | |||
| } | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| vsse32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, vl); | |||
| vsse32_v_f32m1(outptr0 + 1, top_blob.cstep * sizeof(float), _sum1, vl); | |||
| vsse32_v_f32m1(outptr0 + 2, top_blob.cstep * sizeof(float), _sum2, vl); | |||
| @@ -312,7 +312,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c | |||
| kptr0 += packn; | |||
| } | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| vsse32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, vl); | |||
| vsse32_v_f32m1(outptr0 + 1, top_blob.cstep * sizeof(float), _sum1, vl); | |||
| #else | |||
| @@ -393,7 +393,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c | |||
| kptr0 += packn; | |||
| } | |||
| #ifdef RVV_SPEC_0_7 | |||
| #if C906 | |||
| // TODO | |||
| std::vector<float> ss0(packn); | |||
| std::vector<float> ss1(packn); | |||
| @@ -473,7 +473,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c | |||
| kptr0 += packn; | |||
| } | |||
| #ifdef RVV_SPEC_0_7 | |||
| #if C906 | |||
| // TODO | |||
| std::vector<float> ss0(packn); | |||
| std::vector<float> ss1(packn); | |||
| @@ -527,7 +527,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c | |||
| kptr0 += packn; | |||
| } | |||
| #ifdef RVV_SPEC_0_7 | |||
| #if C906 | |||
| // TODO | |||
| std::vector<float> ss0(packn); | |||
| std::vector<float> ss1(packn); | |||
| @@ -568,7 +568,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c | |||
| kptr0 += packn; | |||
| } | |||
| #ifdef RVV_SPEC_0_7 | |||
| #if C906 | |||
| // TODO | |||
| std::vector<float> ss0(packn); | |||
| vse32_v_f32m1((float*)ss0.data(), _sum0, vl); | |||
| @@ -53,7 +53,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| for (int l = 0; l < packn; l++) | |||
| { | |||
| tmpptr[0] = img0[l]; | |||
| @@ -102,7 +102,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| for (int l = 0; l < packn; l++) | |||
| { | |||
| tmpptr[0] = img0[l]; | |||
| @@ -143,7 +143,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| for (int l = 0; l < packn; l++) | |||
| { | |||
| tmpptr[0] = img0[l]; | |||
| @@ -240,7 +240,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ | |||
| kptr0 += packn; | |||
| } | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| vsse16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, vl); | |||
| vsse16_v_f16m1(outptr0 + 1, top_blob.cstep * sizeof(__fp16), _sum1, vl); | |||
| vsse16_v_f16m1(outptr0 + 2, top_blob.cstep * sizeof(__fp16), _sum2, vl); | |||
| @@ -281,7 +281,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ | |||
| kptr0 += packn; | |||
| } | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| vsse16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, vl); | |||
| vsse16_v_f16m1(outptr0 + 1, top_blob.cstep * sizeof(__fp16), _sum1, vl); | |||
| vsse16_v_f16m1(outptr0 + 2, top_blob.cstep * sizeof(__fp16), _sum2, vl); | |||
| @@ -312,7 +312,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ | |||
| kptr0 += packn; | |||
| } | |||
| #if RVV_SPEC_0_7 | |||
| #if C906 | |||
| vsse16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, vl); | |||
| vsse16_v_f16m1(outptr0 + 1, top_blob.cstep * sizeof(__fp16), _sum1, vl); | |||
| #else | |||
| @@ -393,7 +393,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ | |||
| kptr0 += packn; | |||
| } | |||
| #ifdef RVV_SPEC_0_7 | |||
| #if C906 | |||
| // TODO | |||
| std::vector<__fp16> ss0(packn); | |||
| std::vector<__fp16> ss1(packn); | |||
| @@ -473,7 +473,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ | |||
| kptr0 += packn; | |||
| } | |||
| #ifdef RVV_SPEC_0_7 | |||
| #if C906 | |||
| // TODO | |||
| std::vector<__fp16> ss0(packn); | |||
| std::vector<__fp16> ss1(packn); | |||
| @@ -527,7 +527,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ | |||
| kptr0 += packn; | |||
| } | |||
| #ifdef RVV_SPEC_0_7 | |||
| #if C906 | |||
| // TODO | |||
| std::vector<__fp16> ss0(packn); | |||
| std::vector<__fp16> ss1(packn); | |||
| @@ -568,7 +568,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ | |||
| kptr0 += packn; | |||
| } | |||
| #ifdef RVV_SPEC_0_7 | |||
| #if C906 | |||
| // TODO | |||
| std::vector<__fp16> ss0(packn); | |||
| vse16_v_f16m1((__fp16*)ss0.data(), _sum0, vl); | |||
| @@ -91,7 +91,7 @@ static void deconvolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_bl | |||
| kptr += maxk * packn; | |||
| } | |||
| #ifdef RVV_SPEC_0_7 | |||
| #if C906 | |||
| // TODO | |||
| std::vector<float> ss(packn); | |||
| vse32_v_f32m2((float*)ss.data(), _sum, vl); | |||
| @@ -15,6 +15,14 @@ | |||
| #ifndef RISCV_USABILITY_H | |||
| #define RISCV_USABILITY_H | |||
| #if __riscv_vector | |||
| #ifdef RVV_SPEC_0_7 | |||
| #include "riscv_v_071_fix.h" | |||
| #else | |||
| #include <riscv_vector.h> | |||
| #endif | |||
| #endif // __riscv_vector | |||
| #if __riscv_vector | |||
| static inline int csrr_vl() | |||
| { | |||
| @@ -45,6 +53,80 @@ static inline int csrr_vlenb() | |||
| : "memory"); | |||
| return a; | |||
| } | |||
| static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr) | |||
| { | |||
| const int packn = csrr_vlenb() / 4; | |||
| const word_type vl = vsetvl_e32m8(packn * 8); | |||
| // NOTE vloxei8_v_f32m8 gets illegal instruction on d1 --- nihui | |||
| // 128bit | |||
| static const uint32_t index_128bit[32] = { | |||
| 0, 4, 8, 12, | |||
| 0, 4, 8, 12, | |||
| 0, 4, 8, 12, | |||
| 0, 4, 8, 12, | |||
| 0, 4, 8, 12, | |||
| 0, 4, 8, 12, | |||
| 0, 4, 8, 12, | |||
| 0, 4, 8, 12 | |||
| }; | |||
| // 256bit | |||
| static const uint32_t index_256bit[64] = { | |||
| 0, 4, 8, 12, 16, 20, 24, 28, | |||
| 0, 4, 8, 12, 16, 20, 24, 28, | |||
| 0, 4, 8, 12, 16, 20, 24, 28, | |||
| 0, 4, 8, 12, 16, 20, 24, 28, | |||
| 0, 4, 8, 12, 16, 20, 24, 28, | |||
| 0, 4, 8, 12, 16, 20, 24, 28, | |||
| 0, 4, 8, 12, 16, 20, 24, 28, | |||
| 0, 4, 8, 12, 16, 20, 24, 28 | |||
| }; | |||
| const uint32_t* index = packn == 4 ? index_128bit : index_256bit; | |||
| vuint32m8_t bindex = vle32_v_u32m8(index, vl); | |||
| return vloxei32_v_f32m8(ptr, bindex, vl); | |||
| } | |||
| #if __riscv_zfh | |||
| static inline vfloat16m8_t vle16_v_f16m8_f16m1(const __fp16* ptr) | |||
| { | |||
| const int packn = csrr_vlenb() / 2; | |||
| const word_type vl = vsetvl_e16m8(packn * 8); | |||
| // NOTE vloxei8_v_f16m8 gets illegal instruction on d1 --- nihui | |||
| // 128bit | |||
| static const uint16_t index_128bit[64] = { | |||
| 0, 2, 4, 6, 8, 10, 12, 14, | |||
| 0, 2, 4, 6, 8, 10, 12, 14, | |||
| 0, 2, 4, 6, 8, 10, 12, 14, | |||
| 0, 2, 4, 6, 8, 10, 12, 14, | |||
| 0, 2, 4, 6, 8, 10, 12, 14, | |||
| 0, 2, 4, 6, 8, 10, 12, 14, | |||
| 0, 2, 4, 6, 8, 10, 12, 14, | |||
| 0, 2, 4, 6, 8, 10, 12, 14 | |||
| }; | |||
| // 256bit | |||
| static const uint16_t index_256bit[128] = { | |||
| 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, | |||
| 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, | |||
| 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, | |||
| 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, | |||
| 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, | |||
| 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, | |||
| 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, | |||
| 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 | |||
| }; | |||
| const uint16_t* index = packn == 8 ? index_128bit : index_256bit; | |||
| vuint16m8_t bindex = vle16_v_u16m8(index, vl); | |||
| return vloxei16_v_f16m8(ptr, bindex, vl); | |||
| } | |||
| #endif // __riscv_zfh | |||
| #endif // __riscv_vector | |||
| #endif // RISCV_USABILITY_H | |||
| @@ -96,6 +96,15 @@ typedef uint16x4xm1_t vuint16m1x4_t; | |||
| typedef uint16x4xm2_t vuint16m2x4_t; | |||
| typedef uint16x8xm1_t vuint16m1x8_t; | |||
| typedef uint8xm1_t vuint8m1_t; | |||
| typedef uint8xm2_t vuint8m2_t; | |||
| typedef uint8xm4_t vuint8m4_t; | |||
| typedef uint8xm8_t vuint8m8_t; | |||
| typedef uint8x4xm1_t vuint8m1x4_t; | |||
| typedef uint8x4xm2_t vuint8m2x4_t; | |||
| typedef uint8x8xm1_t vuint8m1x8_t; | |||
| #define vsetvl_e32m1(n) vsetvli(n, RVV_E32, RVV_M1) | |||
| #define vsetvl_e32m2(n) vsetvli(n, RVV_E32, RVV_M2) | |||
| #define vsetvl_e32m4(n) vsetvli(n, RVV_E32, RVV_M4) | |||
| @@ -132,6 +141,8 @@ typedef uint16x8xm1_t vuint16m1x8_t; | |||
| #define vsse32_v_f32m4 vssev_float32xm4 | |||
| #define vsse32_v_f32m8 vssev_float32xm8 | |||
| #define vloxei32_v_f32m8(a, i, vl) vlxev_float32xm8(a, reinterpret_cast<int32xm8_t>(i), vl) | |||
| #define vlseg2e32_v_f32m1x2 vlseg2ev_float32x2xm1 | |||
| #define vsseg2e32_v_f32m1x2 vsseg2ev_float32x2xm1 | |||
| @@ -617,6 +628,8 @@ static inline vfloat32m1_t vfredmax_vs_f32m8_f32m1(vfloat32m1_t dst, vfloat32m8_ | |||
| #define vsse16_v_f16m4 vssev_float16xm4 | |||
| #define vsse16_v_f16m8 vssev_float16xm8 | |||
| #define vloxei16_v_f16m8(a, i, vl) vlxev_float16xm8(a, reinterpret_cast<int16xm8_t>(i), vl) | |||
| #define vlseg2e16_v_f16m1x2 vlseg2ev_float16x2xm1 | |||
| #define vsseg2e16_v_f16m1x2 vsseg2ev_float16x2xm1 | |||
| @@ -1690,6 +1703,32 @@ static inline vuint16m1x8_t vcreate_u16m1x8(vuint16m1_t v0, vuint16m1_t v1, vuin | |||
| #define vreinterpret_v_f16m4_u16m4(x) reinterpret_cast<vuint16m4_t>(x) | |||
| #define vreinterpret_v_f16m8_u16m8(x) reinterpret_cast<vuint16m8_t>(x) | |||
| /******************************** uint8 ********************************/ | |||
| #define vle8_v_u8m1 vlev_uint8xm1 | |||
| #define vle8_v_u8m2 vlev_uint8xm2 | |||
| #define vle8_v_u8m4 vlev_uint8xm4 | |||
| #define vle8_v_u8m8 vlev_uint8xm8 | |||
| #define vse8_v_u8m1 vsev_uint8xm1 | |||
| #define vse8_v_u8m2 vsev_uint8xm2 | |||
| #define vse8_v_u8m4 vsev_uint8xm4 | |||
| #define vse8_v_u8m8 vsev_uint8xm8 | |||
| #define vlse8_v_u8m1 vlsev_uint8xm1 | |||
| #define vlse8_v_u8m2 vlsev_uint8xm2 | |||
| #define vlse8_v_u8m4 vlsev_uint8xm4 | |||
| #define vlse8_v_u8m8 vlsev_uint8xm8 | |||
| #define vsse8_v_u8m1 vssev_uint8xm1 | |||
| #define vsse8_v_u8m2 vssev_uint8xm2 | |||
| #define vsse8_v_u8m4 vssev_uint8xm4 | |||
| #define vsse8_v_u8m8 vssev_uint8xm8 | |||
| #define vmv_v_x_u8m1 vmvvx_unt8xm1 | |||
| #define vmv_v_x_u8m2 vmvvx_unt8xm2 | |||
| #define vmv_v_x_u8m4 vmvvx_unt8xm4 | |||
| #define vmv_v_x_u8m8 vmvvx_unt8xm8 | |||
| /******************************** mask ********************************/ | |||
| #define vmxor_mm_b32 vmxormm_e32xm1 | |||
| #define vmxor_mm_b16 vmxormm_e32xm2 | |||
| @@ -46,8 +46,9 @@ static int unary_op_inplace(Mat& a, const Option& opt) | |||
| int w = a.w; | |||
| int h = a.h; | |||
| int d = a.d; | |||
| int channels = a.c; | |||
| int size = w * h; | |||
| int size = w * h * d; | |||
| int elempack = a.elempack; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| @@ -322,8 +323,9 @@ static int unary_op_inplace_fp16s(Mat& a, const Option& opt) | |||
| int w = a.w; | |||
| int h = a.h; | |||
| int d = a.d; | |||
| int channels = a.c; | |||
| int size = w * h; | |||
| int size = w * h * d; | |||
| int elempack = a.elempack; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| @@ -47,17 +47,17 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt) | |||
| int elempack = 1; | |||
| if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; | |||
| if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; | |||
| if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; | |||
| if (shape.dims == 3 || shape.dims == 4) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; | |||
| int elempack1 = 1; | |||
| if (shape1.dims == 1) elempack1 = opt.use_shader_pack8 && shape1.w % 8 == 0 ? 8 : shape1.w % 4 == 0 ? 4 : 1; | |||
| if (shape1.dims == 2) elempack1 = opt.use_shader_pack8 && shape1.h % 8 == 0 ? 8 : shape1.h % 4 == 0 ? 4 : 1; | |||
| if (shape1.dims == 3) elempack1 = opt.use_shader_pack8 && shape1.c % 8 == 0 ? 8 : shape1.c % 4 == 0 ? 4 : 1; | |||
| if (shape1.dims == 3 || shape1.dims == 4) elempack1 = opt.use_shader_pack8 && shape1.c % 8 == 0 ? 8 : shape1.c % 4 == 0 ? 4 : 1; | |||
| int out_elempack = 1; | |||
| if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 : 1; | |||
| if (out_shape.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4 : 1; | |||
| if (out_shape.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4 : 1; | |||
| if (out_shape.dims == 3 || out_shape.dims == 4) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4 : 1; | |||
| size_t elemsize; | |||
| size_t elemsize1; | |||
| @@ -85,19 +85,22 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt) | |||
| if (shape.dims == 1) shape_packed = Mat(shape.w / elempack, (void*)0, elemsize, elempack); | |||
| if (shape.dims == 2) shape_packed = Mat(shape.w, shape.h / elempack, (void*)0, elemsize, elempack); | |||
| if (shape.dims == 3) shape_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); | |||
| if (shape.dims == 4) shape_packed = Mat(shape.w, shape.h, shape.d, shape.c / elempack, (void*)0, elemsize, elempack); | |||
| Mat shape1_packed; | |||
| if (shape1.dims == 1) shape1_packed = Mat(shape1.w / elempack1, (void*)0, elemsize1, elempack1); | |||
| if (shape1.dims == 2) shape1_packed = Mat(shape1.w, shape1.h / elempack1, (void*)0, elemsize1, elempack1); | |||
| if (shape1.dims == 3) shape1_packed = Mat(shape1.w, shape1.h, shape1.c / elempack1, (void*)0, elemsize1, elempack1); | |||
| if (shape1.dims == 4) shape1_packed = Mat(shape1.w, shape1.h, shape1.d, shape1.c / elempack1, (void*)0, elemsize1, elempack1); | |||
| Mat out_shape_packed; | |||
| if (out_shape.dims == 1) out_shape_packed = Mat(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack); | |||
| if (out_shape.dims == 2) out_shape_packed = Mat(out_shape.w, out_shape.h / out_elempack, (void*)0, out_elemsize, out_elempack); | |||
| if (out_shape.dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack); | |||
| if (out_shape.dims == 4) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.d, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack); | |||
| bool broadcast = true; | |||
| if (shape.dims == shape1.dims && shape.w == shape1.w && shape.h == shape1.h && shape.c == shape1.c) | |||
| if (shape.dims == shape1.dims && shape.w == shape1.w && shape.h == shape1.h && shape.d == shape1.d && shape.c == shape1.c) | |||
| { | |||
| broadcast = false; | |||
| } | |||
| @@ -111,17 +114,17 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt) | |||
| specializations[2].f = b; | |||
| specializations[3 + 0].i = shape_packed.dims; | |||
| specializations[3 + 1].i = shape_packed.w; | |||
| specializations[3 + 2].i = shape_packed.h; | |||
| specializations[3 + 2].i = shape_packed.h * shape_packed.d; | |||
| specializations[3 + 3].i = shape_packed.c; | |||
| specializations[3 + 4].i = shape_packed.cstep; | |||
| specializations[3 + 5].i = shape1_packed.dims; | |||
| specializations[3 + 6].i = shape1_packed.w; | |||
| specializations[3 + 7].i = shape1_packed.h; | |||
| specializations[3 + 7].i = shape1_packed.h * shape1_packed.d; | |||
| specializations[3 + 8].i = shape1_packed.c; | |||
| specializations[3 + 9].i = shape1_packed.cstep; | |||
| specializations[3 + 10].i = out_shape_packed.dims; | |||
| specializations[3 + 11].i = out_shape_packed.w; | |||
| specializations[3 + 12].i = out_shape_packed.h; | |||
| specializations[3 + 12].i = out_shape_packed.h * out_shape_packed.d; | |||
| specializations[3 + 13].i = out_shape_packed.c; | |||
| specializations[3 + 14].i = out_shape_packed.cstep; | |||
| @@ -144,6 +147,12 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt) | |||
| local_size_xyz.h = std::min(4, out_shape_packed.h); | |||
| local_size_xyz.c = std::min(4, out_shape_packed.c); | |||
| } | |||
| if (out_shape_packed.dims == 4) | |||
| { | |||
| local_size_xyz.w = std::min(4, out_shape_packed.w); | |||
| local_size_xyz.h = std::min(4, out_shape_packed.h * out_shape_packed.d); | |||
| local_size_xyz.c = std::min(4, out_shape_packed.c); | |||
| } | |||
| // pack1 | |||
| if (shape.dims == 0 || elempack == 1) | |||
| @@ -173,23 +182,44 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt) | |||
| // broadcast | |||
| if (shape.dims == 0 || broadcast) | |||
| { | |||
| std::vector<vk_specialization_type> specializations(1 + 15); | |||
| std::vector<vk_specialization_type> specializations(1 + 18); | |||
| specializations[0].i = op_type; | |||
| specializations[1 + 0].i = shape_packed.dims; | |||
| specializations[1 + 1].i = shape_packed.w; | |||
| specializations[1 + 2].i = shape_packed.h; | |||
| specializations[1 + 3].i = shape_packed.c; | |||
| specializations[1 + 4].i = shape_packed.cstep; | |||
| specializations[1 + 5].i = shape1_packed.dims; | |||
| specializations[1 + 6].i = shape1_packed.w; | |||
| specializations[1 + 7].i = shape1_packed.h; | |||
| specializations[1 + 8].i = shape1_packed.c; | |||
| specializations[1 + 9].i = shape1_packed.cstep; | |||
| specializations[1 + 10].i = out_shape_packed.dims; | |||
| specializations[1 + 11].i = out_shape_packed.w; | |||
| specializations[1 + 12].i = out_shape_packed.h; | |||
| specializations[1 + 13].i = out_shape_packed.c; | |||
| specializations[1 + 14].i = out_shape_packed.cstep; | |||
| specializations[1 + 3].i = shape_packed.d; | |||
| specializations[1 + 4].i = shape_packed.c; | |||
| specializations[1 + 5].i = shape_packed.cstep; | |||
| specializations[1 + 6].i = shape1_packed.dims; | |||
| specializations[1 + 7].i = shape1_packed.w; | |||
| specializations[1 + 8].i = shape1_packed.h; | |||
| specializations[1 + 9].i = shape1_packed.d; | |||
| specializations[1 + 10].i = shape1_packed.c; | |||
| specializations[1 + 11].i = shape1_packed.cstep; | |||
| specializations[1 + 12].i = out_shape_packed.dims; | |||
| specializations[1 + 13].i = out_shape_packed.w; | |||
| specializations[1 + 14].i = out_shape_packed.h; | |||
| specializations[1 + 15].i = out_shape_packed.d; | |||
| specializations[1 + 16].i = out_shape_packed.c; | |||
| specializations[1 + 17].i = out_shape_packed.cstep; | |||
| std::vector<vk_specialization_type> specializations_broadcast_a1_b1(1 + 15); | |||
| specializations_broadcast_a1_b1[0].i = op_type; | |||
| specializations_broadcast_a1_b1[1 + 0].i = shape_packed.dims; | |||
| specializations_broadcast_a1_b1[1 + 1].i = shape_packed.w; | |||
| specializations_broadcast_a1_b1[1 + 2].i = shape_packed.h * shape_packed.d; | |||
| specializations_broadcast_a1_b1[1 + 3].i = shape_packed.c; | |||
| specializations_broadcast_a1_b1[1 + 4].i = shape_packed.cstep; | |||
| specializations_broadcast_a1_b1[1 + 5].i = shape1_packed.dims; | |||
| specializations_broadcast_a1_b1[1 + 6].i = shape1_packed.w; | |||
| specializations_broadcast_a1_b1[1 + 7].i = shape1_packed.h * shape1_packed.d; | |||
| specializations_broadcast_a1_b1[1 + 8].i = shape1_packed.c; | |||
| specializations_broadcast_a1_b1[1 + 9].i = shape1_packed.cstep; | |||
| specializations_broadcast_a1_b1[1 + 10].i = out_shape_packed.dims; | |||
| specializations_broadcast_a1_b1[1 + 11].i = out_shape_packed.w; | |||
| specializations_broadcast_a1_b1[1 + 12].i = out_shape_packed.h * out_shape_packed.d; | |||
| specializations_broadcast_a1_b1[1 + 13].i = out_shape_packed.c; | |||
| specializations_broadcast_a1_b1[1 + 14].i = out_shape_packed.cstep; | |||
| Mat local_size_xyz; | |||
| if (out_shape_packed.dims == 1) | |||
| @@ -210,6 +240,12 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt) | |||
| local_size_xyz.h = std::min(4, out_shape_packed.h); | |||
| local_size_xyz.c = std::min(4, out_shape_packed.c); | |||
| } | |||
| if (out_shape_packed.dims == 4) | |||
| { | |||
| local_size_xyz.w = std::min(4, out_shape_packed.w); | |||
| local_size_xyz.h = std::min(4, out_shape_packed.h * out_shape_packed.d); | |||
| local_size_xyz.c = std::min(4, out_shape_packed.c); | |||
| } | |||
| // pack1 | |||
| if (shape.dims == 0 || (elempack == 1 && elempack1 == 1)) | |||
| @@ -232,7 +268,7 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt) | |||
| { | |||
| pipeline_binaryop_broadcast_a1_pack4 = new Pipeline(vkdev); | |||
| pipeline_binaryop_broadcast_a1_pack4->set_optimal_local_size_xyz(local_size_xyz); | |||
| pipeline_binaryop_broadcast_a1_pack4->create(LayerShaderType::binaryop_broadcast_a1_pack4, opt, specializations); | |||
| pipeline_binaryop_broadcast_a1_pack4->create(LayerShaderType::binaryop_broadcast_a1_pack4, opt, specializations_broadcast_a1_b1); | |||
| } | |||
| if (shape.dims == 0 || (shape1.dims == 1 && shape1.w == 1 && elempack1 == 1 && elempack == 4) | |||
| @@ -240,7 +276,7 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt) | |||
| { | |||
| pipeline_binaryop_broadcast_b1_pack4 = new Pipeline(vkdev); | |||
| pipeline_binaryop_broadcast_b1_pack4->set_optimal_local_size_xyz(local_size_xyz); | |||
| pipeline_binaryop_broadcast_b1_pack4->create(LayerShaderType::binaryop_broadcast_b1_pack4, opt, specializations); | |||
| pipeline_binaryop_broadcast_b1_pack4->create(LayerShaderType::binaryop_broadcast_b1_pack4, opt, specializations_broadcast_a1_b1); | |||
| } | |||
| // pack8 | |||
| @@ -256,7 +292,7 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt) | |||
| { | |||
| pipeline_binaryop_broadcast_a1_pack8 = new Pipeline(vkdev); | |||
| pipeline_binaryop_broadcast_a1_pack8->set_optimal_local_size_xyz(local_size_xyz); | |||
| pipeline_binaryop_broadcast_a1_pack8->create(LayerShaderType::binaryop_broadcast_a1_pack8, opt, specializations); | |||
| pipeline_binaryop_broadcast_a1_pack8->create(LayerShaderType::binaryop_broadcast_a1_pack8, opt, specializations_broadcast_a1_b1); | |||
| } | |||
| if ((opt.use_shader_pack8 && shape.dims == 0) || (shape1.dims == 1 && shape1.w == 1 && elempack1 == 1 && elempack == 8) | |||
| @@ -264,7 +300,7 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt) | |||
| { | |||
| pipeline_binaryop_broadcast_b1_pack8 = new Pipeline(vkdev); | |||
| pipeline_binaryop_broadcast_b1_pack8->set_optimal_local_size_xyz(local_size_xyz); | |||
| pipeline_binaryop_broadcast_b1_pack8->create(LayerShaderType::binaryop_broadcast_b1_pack8, opt, specializations); | |||
| pipeline_binaryop_broadcast_b1_pack8->create(LayerShaderType::binaryop_broadcast_b1_pack8, opt, specializations_broadcast_a1_b1); | |||
| } | |||
| } | |||
| @@ -324,7 +360,7 @@ int BinaryOp_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector | |||
| } | |||
| else // if (bottom_blob.dims == bottom_blob1.dims) | |||
| { | |||
| if (bottom_blob.w * bottom_blob.h * bottom_blob.c * bottom_blob.elempack >= bottom_blob1.w * bottom_blob1.h * bottom_blob1.c * bottom_blob1.elempack) | |||
| if (bottom_blob.w * bottom_blob.h * bottom_blob.d * bottom_blob.c * bottom_blob.elempack >= bottom_blob1.w * bottom_blob1.h * bottom_blob1.d * bottom_blob1.c * bottom_blob1.elempack) | |||
| { | |||
| top_blob.create_like(bottom_blob, opt.blob_vkallocator); | |||
| } | |||
| @@ -343,39 +379,63 @@ int BinaryOp_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector | |||
| bindings[1] = bottom_blob1; | |||
| bindings[2] = top_blob; | |||
| std::vector<vk_constant_type> constants(15); | |||
| constants[0].i = bottom_blob.dims; | |||
| constants[1].i = bottom_blob.w; | |||
| constants[2].i = bottom_blob.h; | |||
| constants[3].i = bottom_blob.c; | |||
| constants[4].i = bottom_blob.cstep; | |||
| constants[5].i = bottom_blob1.dims; | |||
| constants[6].i = bottom_blob1.w; | |||
| constants[7].i = bottom_blob1.h; | |||
| constants[8].i = bottom_blob1.c; | |||
| constants[9].i = bottom_blob1.cstep; | |||
| constants[10].i = top_blob.dims; | |||
| constants[11].i = top_blob.w; | |||
| constants[12].i = top_blob.h; | |||
| constants[13].i = top_blob.c; | |||
| constants[14].i = top_blob.cstep; | |||
| bool broadcast = true; | |||
| if (bottom_blob.dims == bottom_blob1.dims | |||
| && bottom_blob.w == bottom_blob1.w | |||
| && bottom_blob.h == bottom_blob1.h | |||
| && bottom_blob.d == bottom_blob1.d | |||
| && bottom_blob.c == bottom_blob1.c | |||
| && bottom_blob.elempack == bottom_blob1.elempack) | |||
| { | |||
| broadcast = false; | |||
| } | |||
| const Pipeline* pipeline = 0; | |||
| if (broadcast) | |||
| { | |||
| std::vector<vk_constant_type> constants(18); | |||
| constants[0].i = bottom_blob.dims; | |||
| constants[1].i = bottom_blob.w; | |||
| constants[2].i = bottom_blob.h; | |||
| constants[3].i = bottom_blob.d; | |||
| constants[4].i = bottom_blob.c; | |||
| constants[5].i = bottom_blob.cstep; | |||
| constants[6].i = bottom_blob1.dims; | |||
| constants[7].i = bottom_blob1.w; | |||
| constants[8].i = bottom_blob1.h; | |||
| constants[9].i = bottom_blob1.d; | |||
| constants[10].i = bottom_blob1.c; | |||
| constants[11].i = bottom_blob1.cstep; | |||
| constants[12].i = top_blob.dims; | |||
| constants[13].i = top_blob.w; | |||
| constants[14].i = top_blob.h; | |||
| constants[15].i = top_blob.d; | |||
| constants[16].i = top_blob.c; | |||
| constants[17].i = top_blob.cstep; | |||
| std::vector<vk_constant_type> constants_broadcast_a1b1(15); | |||
| constants_broadcast_a1b1[0].i = bottom_blob.dims; | |||
| constants_broadcast_a1b1[1].i = bottom_blob.w; | |||
| constants_broadcast_a1b1[2].i = bottom_blob.h * bottom_blob.d; | |||
| constants_broadcast_a1b1[3].i = bottom_blob.c; | |||
| constants_broadcast_a1b1[4].i = bottom_blob.cstep; | |||
| constants_broadcast_a1b1[5].i = bottom_blob1.dims; | |||
| constants_broadcast_a1b1[6].i = bottom_blob1.w; | |||
| constants_broadcast_a1b1[7].i = bottom_blob1.h * bottom_blob1.d; | |||
| constants_broadcast_a1b1[8].i = bottom_blob1.c; | |||
| constants_broadcast_a1b1[9].i = bottom_blob1.cstep; | |||
| constants_broadcast_a1b1[10].i = top_blob.dims; | |||
| constants_broadcast_a1b1[11].i = top_blob.w; | |||
| constants_broadcast_a1b1[12].i = top_blob.h * top_blob.d; | |||
| constants_broadcast_a1b1[13].i = top_blob.c; | |||
| constants_broadcast_a1b1[14].i = top_blob.cstep; | |||
| bool broadcast_a1b1 = true; | |||
| const Pipeline* pipeline = 0; | |||
| if (bottom_blob.elempack == 1 && bottom_blob1.elempack == 1) | |||
| { | |||
| pipeline = pipeline_binaryop_broadcast; | |||
| broadcast_a1b1 = false; | |||
| } | |||
| else | |||
| { | |||
| @@ -400,18 +460,38 @@ int BinaryOp_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector | |||
| else | |||
| { | |||
| pipeline = out_elempack == 8 ? pipeline_binaryop_broadcast_pack8 : pipeline_binaryop_broadcast_pack4; | |||
| broadcast_a1b1 = false; | |||
| } | |||
| } | |||
| cmd.record_pipeline(pipeline, bindings, broadcast_a1b1 ? constants_broadcast_a1b1 : constants, top_blob); | |||
| } | |||
| else | |||
| { | |||
| pipeline = out_elempack == 8 ? pipeline_binaryop_pack8 | |||
| : out_elempack == 4 ? pipeline_binaryop_pack4 | |||
| : pipeline_binaryop; | |||
| std::vector<vk_constant_type> constants(15); | |||
| constants[0].i = bottom_blob.dims; | |||
| constants[1].i = bottom_blob.w; | |||
| constants[2].i = bottom_blob.h * bottom_blob.d; | |||
| constants[3].i = bottom_blob.c; | |||
| constants[4].i = bottom_blob.cstep; | |||
| constants[5].i = bottom_blob1.dims; | |||
| constants[6].i = bottom_blob1.w; | |||
| constants[7].i = bottom_blob1.h * bottom_blob1.d; | |||
| constants[8].i = bottom_blob1.c; | |||
| constants[9].i = bottom_blob1.cstep; | |||
| constants[10].i = top_blob.dims; | |||
| constants[11].i = top_blob.w; | |||
| constants[12].i = top_blob.h * top_blob.d; | |||
| constants[13].i = top_blob.c; | |||
| constants[14].i = top_blob.cstep; | |||
| const Pipeline* pipeline = out_elempack == 8 ? pipeline_binaryop_pack8 | |||
| : out_elempack == 4 ? pipeline_binaryop_pack4 | |||
| : pipeline_binaryop; | |||
| cmd.record_pipeline(pipeline, bindings, constants, top_blob); | |||
| } | |||
| cmd.record_pipeline(pipeline, bindings, constants, top_blob); | |||
| return 0; | |||
| } | |||
| @@ -427,7 +507,7 @@ int BinaryOp_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, con | |||
| std::vector<vk_constant_type> constants(15); | |||
| constants[10].i = bottom_top_blob.dims; | |||
| constants[11].i = bottom_top_blob.w; | |||
| constants[12].i = bottom_top_blob.h; | |||
| constants[12].i = bottom_top_blob.h * bottom_top_blob.d; | |||
| constants[13].i = bottom_top_blob.c; | |||
| constants[14].i = bottom_top_blob.cstep; | |||
| @@ -458,7 +538,7 @@ int BinaryOp_vulkan::forward(const std::vector<VkImageMat>& bottom_blobs, std::v | |||
| } | |||
| else // if (bottom_blob.dims == bottom_blob1.dims) | |||
| { | |||
| if (bottom_blob.w * bottom_blob.h * bottom_blob.c * bottom_blob.elempack >= bottom_blob1.w * bottom_blob1.h * bottom_blob1.c * bottom_blob1.elempack) | |||
| if (bottom_blob.w * bottom_blob.h * bottom_blob.d * bottom_blob.c * bottom_blob.elempack >= bottom_blob1.w * bottom_blob1.h * bottom_blob1.d * bottom_blob1.c * bottom_blob1.elempack) | |||
| { | |||
| top_blob.create_like(bottom_blob, opt.blob_vkallocator); | |||
| } | |||
| @@ -477,39 +557,63 @@ int BinaryOp_vulkan::forward(const std::vector<VkImageMat>& bottom_blobs, std::v | |||
| bindings[1] = bottom_blob1; | |||
| bindings[2] = top_blob; | |||
| std::vector<vk_constant_type> constants(15); | |||
| constants[0].i = bottom_blob.dims; | |||
| constants[1].i = bottom_blob.w; | |||
| constants[2].i = bottom_blob.h; | |||
| constants[3].i = bottom_blob.c; | |||
| constants[4].i = 0; //bottom_blob.cstep; | |||
| constants[5].i = bottom_blob1.dims; | |||
| constants[6].i = bottom_blob1.w; | |||
| constants[7].i = bottom_blob1.h; | |||
| constants[8].i = bottom_blob1.c; | |||
| constants[9].i = 0; //bottom_blob1.cstep; | |||
| constants[10].i = top_blob.dims; | |||
| constants[11].i = top_blob.w; | |||
| constants[12].i = top_blob.h; | |||
| constants[13].i = top_blob.c; | |||
| constants[14].i = 0; //top_blob.cstep; | |||
| bool broadcast = true; | |||
| if (bottom_blob.dims == bottom_blob1.dims | |||
| && bottom_blob.w == bottom_blob1.w | |||
| && bottom_blob.h == bottom_blob1.h | |||
| && bottom_blob.d == bottom_blob1.d | |||
| && bottom_blob.c == bottom_blob1.c | |||
| && bottom_blob.elempack == bottom_blob1.elempack) | |||
| { | |||
| broadcast = false; | |||
| } | |||
| const Pipeline* pipeline = 0; | |||
| if (broadcast) | |||
| { | |||
| std::vector<vk_constant_type> constants(18); | |||
| constants[0].i = bottom_blob.dims; | |||
| constants[1].i = bottom_blob.w; | |||
| constants[2].i = bottom_blob.h; | |||
| constants[3].i = bottom_blob.d; | |||
| constants[4].i = bottom_blob.c; | |||
| constants[5].i = 0; //bottom_blob.cstep; | |||
| constants[6].i = bottom_blob1.dims; | |||
| constants[7].i = bottom_blob1.w; | |||
| constants[8].i = bottom_blob1.h; | |||
| constants[9].i = bottom_blob1.d; | |||
| constants[10].i = bottom_blob1.c; | |||
| constants[11].i = 0; //bottom_blob1.cstep; | |||
| constants[12].i = top_blob.dims; | |||
| constants[13].i = top_blob.w; | |||
| constants[14].i = top_blob.h; | |||
| constants[15].i = top_blob.d; | |||
| constants[16].i = top_blob.c; | |||
| constants[17].i = 0; //top_blob.cstep; | |||
| std::vector<vk_constant_type> constants_broadcast_a1b1(15); | |||
| constants_broadcast_a1b1[0].i = bottom_blob.dims; | |||
| constants_broadcast_a1b1[1].i = bottom_blob.w; | |||
| constants_broadcast_a1b1[2].i = bottom_blob.h * bottom_blob.d; | |||
| constants_broadcast_a1b1[3].i = bottom_blob.c; | |||
| constants_broadcast_a1b1[4].i = 0; //bottom_blob.cstep; | |||
| constants_broadcast_a1b1[5].i = bottom_blob1.dims; | |||
| constants_broadcast_a1b1[6].i = bottom_blob1.w; | |||
| constants_broadcast_a1b1[7].i = bottom_blob1.h * bottom_blob1.d; | |||
| constants_broadcast_a1b1[8].i = bottom_blob1.c; | |||
| constants_broadcast_a1b1[9].i = 0; //bottom_blob1.cstep; | |||
| constants_broadcast_a1b1[10].i = top_blob.dims; | |||
| constants_broadcast_a1b1[11].i = top_blob.w; | |||
| constants_broadcast_a1b1[12].i = top_blob.h * top_blob.d; | |||
| constants_broadcast_a1b1[13].i = top_blob.c; | |||
| constants_broadcast_a1b1[14].i = 0; //top_blob.cstep; | |||
| bool broadcast_a1b1 = true; | |||
| const Pipeline* pipeline = 0; | |||
| if (bottom_blob.elempack == 1 && bottom_blob1.elempack == 1) | |||
| { | |||
| pipeline = pipeline_binaryop_broadcast; | |||
| broadcast_a1b1 = false; | |||
| } | |||
| else | |||
| { | |||
| @@ -534,18 +638,38 @@ int BinaryOp_vulkan::forward(const std::vector<VkImageMat>& bottom_blobs, std::v | |||
| else | |||
| { | |||
| pipeline = out_elempack == 8 ? pipeline_binaryop_broadcast_pack8 : pipeline_binaryop_broadcast_pack4; | |||
| broadcast_a1b1 = false; | |||
| } | |||
| } | |||
| cmd.record_pipeline(pipeline, bindings, broadcast_a1b1 ? constants_broadcast_a1b1 : constants, top_blob); | |||
| } | |||
| else | |||
| { | |||
| pipeline = out_elempack == 8 ? pipeline_binaryop_pack8 | |||
| : out_elempack == 4 ? pipeline_binaryop_pack4 | |||
| : pipeline_binaryop; | |||
| std::vector<vk_constant_type> constants(15); | |||
| constants[0].i = bottom_blob.dims; | |||
| constants[1].i = bottom_blob.w; | |||
| constants[2].i = bottom_blob.h * bottom_blob.d; | |||
| constants[3].i = bottom_blob.c; | |||
| constants[4].i = 0; //bottom_blob.cstep; | |||
| constants[5].i = bottom_blob1.dims; | |||
| constants[6].i = bottom_blob1.w; | |||
| constants[7].i = bottom_blob1.h * bottom_blob1.d; | |||
| constants[8].i = bottom_blob1.c; | |||
| constants[9].i = 0; //bottom_blob1.cstep; | |||
| constants[10].i = top_blob.dims; | |||
| constants[11].i = top_blob.w; | |||
| constants[12].i = top_blob.h * top_blob.d; | |||
| constants[13].i = top_blob.c; | |||
| constants[14].i = 0; //top_blob.cstep; | |||
| const Pipeline* pipeline = out_elempack == 8 ? pipeline_binaryop_pack8 | |||
| : out_elempack == 4 ? pipeline_binaryop_pack4 | |||
| : pipeline_binaryop; | |||
| cmd.record_pipeline(pipeline, bindings, constants, top_blob); | |||
| } | |||
| cmd.record_pipeline(pipeline, bindings, constants, top_blob); | |||
| return 0; | |||
| } | |||
| @@ -561,7 +685,7 @@ int BinaryOp_vulkan::forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd | |||
| std::vector<vk_constant_type> constants(15); | |||
| constants[10].i = bottom_top_blob.dims; | |||
| constants[11].i = bottom_top_blob.w; | |||
| constants[12].i = bottom_top_blob.h; | |||
| constants[12].i = bottom_top_blob.h * bottom_top_blob.d; | |||
| constants[13].i = bottom_top_blob.c; | |||
| constants[14].i = 0; //bottom_top_blob.cstep; | |||
| @@ -84,7 +84,7 @@ int ReLU_vulkan::create_pipeline(const Option& opt) | |||
| local_size_xyz.h = std::min(4, shape_packed.h); | |||
| local_size_xyz.c = std::min(4, shape_packed.c); | |||
| } | |||
| if (shape_packed.dims == 3) | |||
| if (shape_packed.dims == 4) | |||
| { | |||
| local_size_xyz.w = std::min(4, shape_packed.w); | |||
| local_size_xyz.h = std::min(4, shape_packed.h * shape_packed.d); | |||
| @@ -27,20 +27,23 @@ layout (constant_id = 0) const int op_type = 0; | |||
| layout (constant_id = shape_constant_id_offset + 0) const int adims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const int aw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const int ah = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int ac = 0; | |||
| layout (constant_id = shape_constant_id_offset + 4) const int acstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 5) const int bdims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int bw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int bh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 8) const int bc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 9) const int bcstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 10) const int outdims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 11) const int outw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 12) const int outh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 13) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 14) const int outcstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int ad = 0; | |||
| layout (constant_id = shape_constant_id_offset + 4) const int ac = 0; | |||
| layout (constant_id = shape_constant_id_offset + 5) const int acstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int bdims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int bw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 8) const int bh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 9) const int bd = 0; | |||
| layout (constant_id = shape_constant_id_offset + 10) const int bc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 11) const int bcstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 12) const int outdims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 13) const int outw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 14) const int outh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 15) const int outd = 0; | |||
| layout (constant_id = shape_constant_id_offset + 16) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 17) const int outcstep = 0; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D a_blob_3d; | |||
| @@ -57,18 +60,21 @@ layout (push_constant) uniform parameter | |||
| int adims; | |||
| int aw; | |||
| int ah; | |||
| int ad; | |||
| int ac; | |||
| int acstep; | |||
| int bdims; | |||
| int bw; | |||
| int bh; | |||
| int bd; | |||
| int bc; | |||
| int bcstep; | |||
| int outdims; | |||
| int outw; | |||
| int outh; | |||
| int outd; | |||
| int outc; | |||
| int outcstep; | |||
| } p; | |||
| @@ -79,7 +85,7 @@ void main() | |||
| int gy = int(gl_GlobalInvocationID.y); | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) | |||
| if (gx >= psc(outw) || gy >= psc(outh) * psc(outd) || gz >= psc(outc)) | |||
| return; | |||
| #if NCNN_image_shader | |||
| @@ -90,8 +96,58 @@ void main() | |||
| int by = gy; | |||
| int bz = gz; | |||
| if (psc(adims) == 3) | |||
| if (psc(adims) == 4) | |||
| { | |||
| int yd = gy / psc(outh); | |||
| int yh = gy % psc(outh); | |||
| if (psc(bdims) == 3) | |||
| { | |||
| // type 28 | |||
| bx = yh; | |||
| by = yd; | |||
| bz = gz; | |||
| } | |||
| if (psc(bdims) == 2) | |||
| { | |||
| // type 27 | |||
| bx = yd; | |||
| by = gz; | |||
| bz = 0; | |||
| } | |||
| if (psc(bdims) == 1) | |||
| { | |||
| if (psc(bw) == 1) | |||
| { | |||
| // type 25 | |||
| bx = 0; | |||
| by = 0; | |||
| bz = 0; | |||
| } | |||
| else | |||
| { | |||
| // type 26 | |||
| bx = gz; | |||
| by = 0; | |||
| bz = 0; | |||
| } | |||
| } | |||
| } | |||
| else if (psc(adims) == 3) | |||
| { | |||
| if (psc(bdims) == 4) | |||
| { | |||
| int yd = gy / psc(outh); | |||
| int yh = gy % psc(outh); | |||
| // type 23 | |||
| ax = yh; | |||
| ay = yd; | |||
| az = gz; | |||
| } | |||
| if (psc(bdims) == 3) | |||
| { | |||
| if (psc(bw) == 1 && psc(bh) == 1) | |||
| @@ -173,6 +229,17 @@ void main() | |||
| } | |||
| else if (psc(adims) == 2) | |||
| { | |||
| if (psc(bdims) == 4) | |||
| { | |||
| int yd = gy / psc(outh); | |||
| int yh = gy % psc(outh); | |||
| // type 22 | |||
| ax = yd; | |||
| ay = gz; | |||
| az = 0; | |||
| } | |||
| if (psc(bdims) == 3) | |||
| { | |||
| // type 14 | |||
| @@ -203,13 +270,21 @@ void main() | |||
| { | |||
| if (psc(aw) == 1) | |||
| { | |||
| // type 2 3 4 | |||
| // type 2 3 4 20 | |||
| ax = 0; | |||
| ay = 0; | |||
| az = 0; | |||
| } | |||
| else | |||
| { | |||
| if (psc(bdims) == 4) | |||
| { | |||
| // type 21 | |||
| ax = gz; | |||
| ay = 0; | |||
| az = 0; | |||
| } | |||
| if (psc(bdims) == 3) | |||
| { | |||
| // type 9 | |||
| @@ -247,8 +322,53 @@ void main() | |||
| int ai; | |||
| int bi; | |||
| if (psc(adims) == 3) | |||
| if (psc(adims) == 4) | |||
| { | |||
| int yd = gy / psc(outh); | |||
| int yh = gy % psc(outh); | |||
| if (psc(bdims) == 3) | |||
| { | |||
| // type 28 | |||
| ai = gi; | |||
| bi = gz * psc(bcstep) + yd * psc(bw) + yh; | |||
| } | |||
| if (psc(bdims) == 2) | |||
| { | |||
| // type 27 | |||
| ai = gi; | |||
| bi = gz * psc(bw) + yd; | |||
| } | |||
| if (psc(bdims) == 1) | |||
| { | |||
| if (psc(bw) == 1) | |||
| { | |||
| // type 25 | |||
| ai = gi; | |||
| bi = 0; | |||
| } | |||
| else | |||
| { | |||
| // type 26 | |||
| ai = gi; | |||
| bi = gz; | |||
| } | |||
| } | |||
| } | |||
| else if (psc(adims) == 3) | |||
| { | |||
| if (psc(bdims) == 4) | |||
| { | |||
| int yd = gy / psc(outh); | |||
| int yh = gy % psc(outh); | |||
| // type 23 | |||
| ai = gz * psc(acstep) + yd * psc(aw) + yh; | |||
| bi = gi; | |||
| } | |||
| if (psc(bdims) == 3) | |||
| { | |||
| if (psc(bw) == 1 && psc(bh) == 1) | |||
| @@ -333,6 +453,16 @@ void main() | |||
| } | |||
| else if (psc(adims) == 2) | |||
| { | |||
| if (psc(bdims) == 4) | |||
| { | |||
| int yd = gy / psc(outh); | |||
| int yh = gy % psc(outh); | |||
| // type 22 | |||
| ai = gz * psc(aw) + yd; | |||
| bi = gi; | |||
| } | |||
| if (psc(bdims) == 3) | |||
| { | |||
| // type 14 | |||
| @@ -360,12 +490,19 @@ void main() | |||
| { | |||
| if (psc(aw) == 1) | |||
| { | |||
| // type 2 3 4 | |||
| // type 2 3 4 20 | |||
| ai = 0; | |||
| bi = gi; | |||
| } | |||
| else | |||
| { | |||
| if (psc(bdims) == 4) | |||
| { | |||
| // type 21 | |||
| ai = gz; | |||
| bi = gi; | |||
| } | |||
| if (psc(bdims) == 3) | |||
| { | |||
| // type 9 | |||
| @@ -27,20 +27,23 @@ layout (constant_id = 0) const int op_type = 0; | |||
| layout (constant_id = shape_constant_id_offset + 0) const int adims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const int aw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const int ah = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int ac = 0; | |||
| layout (constant_id = shape_constant_id_offset + 4) const int acstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 5) const int bdims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int bw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int bh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 8) const int bc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 9) const int bcstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 10) const int outdims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 11) const int outw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 12) const int outh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 13) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 14) const int outcstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int ad = 0; | |||
| layout (constant_id = shape_constant_id_offset + 4) const int ac = 0; | |||
| layout (constant_id = shape_constant_id_offset + 5) const int acstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int bdims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int bw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 8) const int bh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 9) const int bd = 0; | |||
| layout (constant_id = shape_constant_id_offset + 10) const int bc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 11) const int bcstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 12) const int outdims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 13) const int outw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 14) const int outh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 15) const int outd = 0; | |||
| layout (constant_id = shape_constant_id_offset + 16) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 17) const int outcstep = 0; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D a_blob_3d; | |||
| @@ -57,18 +60,21 @@ layout (push_constant) uniform parameter | |||
| int adims; | |||
| int aw; | |||
| int ah; | |||
| int ad; | |||
| int ac; | |||
| int acstep; | |||
| int bdims; | |||
| int bw; | |||
| int bh; | |||
| int bd; | |||
| int bc; | |||
| int bcstep; | |||
| int outdims; | |||
| int outw; | |||
| int outh; | |||
| int outd; | |||
| int outc; | |||
| int outcstep; | |||
| } p; | |||
| @@ -79,7 +85,7 @@ void main() | |||
| int gy = int(gl_GlobalInvocationID.y); | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) | |||
| if (gx >= psc(outw) || gy >= psc(outh) * psc(outd) || gz >= psc(outc)) | |||
| return; | |||
| #if NCNN_image_shader | |||
| @@ -90,8 +96,58 @@ void main() | |||
| int by = gy; | |||
| int bz = gz; | |||
| if (psc(adims) == 3) | |||
| if (psc(adims) == 4) | |||
| { | |||
| int yd = gy / psc(outh); | |||
| int yh = gy % psc(outh); | |||
| if (psc(bdims) == 3) | |||
| { | |||
| // type 28 | |||
| bx = yh; | |||
| by = yd; | |||
| bz = gz; | |||
| } | |||
| if (psc(bdims) == 2) | |||
| { | |||
| // type 27 | |||
| bx = yd; | |||
| by = gz; | |||
| bz = 0; | |||
| } | |||
| if (psc(bdims) == 1) | |||
| { | |||
| if (psc(bw) == 1) | |||
| { | |||
| // type 25 | |||
| bx = 0; | |||
| by = 0; | |||
| bz = 0; | |||
| } | |||
| else | |||
| { | |||
| // type 26 | |||
| bx = gz; | |||
| by = 0; | |||
| bz = 0; | |||
| } | |||
| } | |||
| } | |||
| else if (psc(adims) == 3) | |||
| { | |||
| if (psc(bdims) == 4) | |||
| { | |||
| int yd = gy / psc(outh); | |||
| int yh = gy % psc(outh); | |||
| // type 23 | |||
| ax = yh; | |||
| ay = yd; | |||
| az = gz; | |||
| } | |||
| if (psc(bdims) == 3) | |||
| { | |||
| if (psc(bw) == 1 && psc(bh) == 1) | |||
| @@ -173,6 +229,17 @@ void main() | |||
| } | |||
| else if (psc(adims) == 2) | |||
| { | |||
| if (psc(bdims) == 4) | |||
| { | |||
| int yd = gy / psc(outh); | |||
| int yh = gy % psc(outh); | |||
| // type 22 | |||
| ax = yd; | |||
| ay = gz; | |||
| az = 0; | |||
| } | |||
| if (psc(bdims) == 3) | |||
| { | |||
| // type 14 | |||
| @@ -203,13 +270,21 @@ void main() | |||
| { | |||
| if (psc(aw) == 1) | |||
| { | |||
| // type 2 3 4 | |||
| // type 2 3 4 20 | |||
| ax = 0; | |||
| ay = 0; | |||
| az = 0; | |||
| } | |||
| else | |||
| { | |||
| if (psc(bdims) == 4) | |||
| { | |||
| // type 21 | |||
| ax = gz; | |||
| ay = 0; | |||
| az = 0; | |||
| } | |||
| if (psc(bdims) == 3) | |||
| { | |||
| // type 9 | |||
| @@ -247,8 +322,53 @@ void main() | |||
| int ai; | |||
| int bi; | |||
| if (psc(adims) == 3) | |||
| if (psc(adims) == 4) | |||
| { | |||
| int yd = gy / psc(outh); | |||
| int yh = gy % psc(outh); | |||
| if (psc(bdims) == 3) | |||
| { | |||
| // type 28 | |||
| ai = gi; | |||
| bi = gz * psc(bcstep) + yd * psc(bw) + yh; | |||
| } | |||
| if (psc(bdims) == 2) | |||
| { | |||
| // type 27 | |||
| ai = gi; | |||
| bi = gz * psc(bw) + yd; | |||
| } | |||
| if (psc(bdims) == 1) | |||
| { | |||
| if (psc(bw) == 1) | |||
| { | |||
| // type 25 | |||
| ai = gi; | |||
| bi = 0; | |||
| } | |||
| else | |||
| { | |||
| // type 26 | |||
| ai = gi; | |||
| bi = gz; | |||
| } | |||
| } | |||
| } | |||
| else if (psc(adims) == 3) | |||
| { | |||
| if (psc(bdims) == 4) | |||
| { | |||
| int yd = gy / psc(outh); | |||
| int yh = gy % psc(outh); | |||
| // type 23 | |||
| ai = gz * psc(acstep) + yd * psc(aw) + yh; | |||
| bi = gi; | |||
| } | |||
| if (psc(bdims) == 3) | |||
| { | |||
| if (psc(bw) == 1 && psc(bh) == 1) | |||
| @@ -310,6 +430,16 @@ void main() | |||
| } | |||
| else if (psc(adims) == 2) | |||
| { | |||
| if (psc(bdims) == 4) | |||
| { | |||
| int yd = gy / psc(outh); | |||
| int yh = gy % psc(outh); | |||
| // type 22 | |||
| ai = gz * psc(aw) + yd; | |||
| bi = gi; | |||
| } | |||
| if (psc(bdims) == 3) | |||
| { | |||
| // type 14 | |||
| @@ -326,6 +456,13 @@ void main() | |||
| } | |||
| else if (psc(adims) == 1) | |||
| { | |||
| if (psc(bdims) == 4) | |||
| { | |||
| // type 21 | |||
| ai = gz; | |||
| bi = gi; | |||
| } | |||
| if (psc(bdims) == 3) | |||
| { | |||
| // type 9 | |||
| @@ -28,20 +28,23 @@ layout (constant_id = 0) const int op_type = 0; | |||
| layout (constant_id = shape_constant_id_offset + 0) const int adims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const int aw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const int ah = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int ac = 0; | |||
| layout (constant_id = shape_constant_id_offset + 4) const int acstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 5) const int bdims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int bw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int bh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 8) const int bc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 9) const int bcstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 10) const int outdims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 11) const int outw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 12) const int outh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 13) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 14) const int outcstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int ad = 0; | |||
| layout (constant_id = shape_constant_id_offset + 4) const int ac = 0; | |||
| layout (constant_id = shape_constant_id_offset + 5) const int acstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int bdims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int bw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 8) const int bh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 9) const int bd = 0; | |||
| layout (constant_id = shape_constant_id_offset + 10) const int bc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 11) const int bcstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 12) const int outdims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 13) const int outw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 14) const int outh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 15) const int outd = 0; | |||
| layout (constant_id = shape_constant_id_offset + 16) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 17) const int outcstep = 0; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D a_blob_3d; | |||
| @@ -58,18 +61,21 @@ layout (push_constant) uniform parameter | |||
| int adims; | |||
| int aw; | |||
| int ah; | |||
| int ad; | |||
| int ac; | |||
| int acstep; | |||
| int bdims; | |||
| int bw; | |||
| int bh; | |||
| int bd; | |||
| int bc; | |||
| int bcstep; | |||
| int outdims; | |||
| int outw; | |||
| int outh; | |||
| int outd; | |||
| int outc; | |||
| int outcstep; | |||
| } p; | |||
| @@ -80,7 +86,7 @@ void main() | |||
| int gy = int(gl_GlobalInvocationID.y); | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) | |||
| if (gx >= psc(outw) || gy >= psc(outh) * psc(outd) || gz >= psc(outc)) | |||
| return; | |||
| #if NCNN_image_shader | |||
| @@ -91,8 +97,58 @@ void main() | |||
| int by = gy; | |||
| int bz = gz; | |||
| if (psc(adims) == 3) | |||
| if (psc(adims) == 4) | |||
| { | |||
| int yd = gy / psc(outh); | |||
| int yh = gy % psc(outh); | |||
| if (psc(bdims) == 3) | |||
| { | |||
| // type 28 | |||
| bx = yh; | |||
| by = yd; | |||
| bz = gz; | |||
| } | |||
| if (psc(bdims) == 2) | |||
| { | |||
| // type 27 | |||
| bx = yd; | |||
| by = gz; | |||
| bz = 0; | |||
| } | |||
| if (psc(bdims) == 1) | |||
| { | |||
| if (psc(bw) == 1) | |||
| { | |||
| // type 25 | |||
| bx = 0; | |||
| by = 0; | |||
| bz = 0; | |||
| } | |||
| else | |||
| { | |||
| // type 26 | |||
| bx = gz; | |||
| by = 0; | |||
| bz = 0; | |||
| } | |||
| } | |||
| } | |||
| else if (psc(adims) == 3) | |||
| { | |||
| if (psc(bdims) == 4) | |||
| { | |||
| int yd = gy / psc(outh); | |||
| int yh = gy % psc(outh); | |||
| // type 23 | |||
| ax = yh; | |||
| ay = yd; | |||
| az = gz; | |||
| } | |||
| if (psc(bdims) == 3) | |||
| { | |||
| if (psc(bw) == 1 && psc(bh) == 1) | |||
| @@ -174,6 +230,17 @@ void main() | |||
| } | |||
| else if (psc(adims) == 2) | |||
| { | |||
| if (psc(bdims) == 4) | |||
| { | |||
| int yd = gy / psc(outh); | |||
| int yh = gy % psc(outh); | |||
| // type 22 | |||
| ax = yd; | |||
| ay = gz; | |||
| az = 0; | |||
| } | |||
| if (psc(bdims) == 3) | |||
| { | |||
| // type 14 | |||
| @@ -204,13 +271,21 @@ void main() | |||
| { | |||
| if (psc(aw) == 1) | |||
| { | |||
| // type 2 3 4 | |||
| // type 2 3 4 20 | |||
| ax = 0; | |||
| ay = 0; | |||
| az = 0; | |||
| } | |||
| else | |||
| { | |||
| if (psc(bdims) == 4) | |||
| { | |||
| // type 21 | |||
| ax = gz; | |||
| ay = 0; | |||
| az = 0; | |||
| } | |||
| if (psc(bdims) == 3) | |||
| { | |||
| // type 9 | |||
| @@ -248,8 +323,53 @@ void main() | |||
| int ai; | |||
| int bi; | |||
| if (psc(adims) == 3) | |||
| if (psc(adims) == 4) | |||
| { | |||
| int yd = gy / psc(outh); | |||
| int yh = gy % psc(outh); | |||
| if (psc(bdims) == 3) | |||
| { | |||
| // type 28 | |||
| ai = gi; | |||
| bi = gz * psc(bcstep) + yd * psc(bw) + yh; | |||
| } | |||
| if (psc(bdims) == 2) | |||
| { | |||
| // type 27 | |||
| ai = gi; | |||
| bi = gz * psc(bw) + yd; | |||
| } | |||
| if (psc(bdims) == 1) | |||
| { | |||
| if (psc(bw) == 1) | |||
| { | |||
| // type 25 | |||
| ai = gi; | |||
| bi = 0; | |||
| } | |||
| else | |||
| { | |||
| // type 26 | |||
| ai = gi; | |||
| bi = gz; | |||
| } | |||
| } | |||
| } | |||
| else if (psc(adims) == 3) | |||
| { | |||
| if (psc(bdims) == 4) | |||
| { | |||
| int yd = gy / psc(outh); | |||
| int yh = gy % psc(outh); | |||
| // type 23 | |||
| ai = gz * psc(acstep) + yd * psc(aw) + yh; | |||
| bi = gi; | |||
| } | |||
| if (psc(bdims) == 3) | |||
| { | |||
| if (psc(bw) == 1 && psc(bh) == 1) | |||
| @@ -311,6 +431,16 @@ void main() | |||
| } | |||
| else if (psc(adims) == 2) | |||
| { | |||
| if (psc(bdims) == 4) | |||
| { | |||
| int yd = gy / psc(outh); | |||
| int yh = gy % psc(outh); | |||
| // type 22 | |||
| ai = gz * psc(aw) + yd; | |||
| bi = gi; | |||
| } | |||
| if (psc(bdims) == 3) | |||
| { | |||
| // type 14 | |||
| @@ -327,6 +457,13 @@ void main() | |||
| } | |||
| else if (psc(adims) == 1) | |||
| { | |||
| if (psc(bdims) == 4) | |||
| { | |||
| // type 21 | |||
| ai = gz; | |||
| bi = gi; | |||
| } | |||
| if (psc(bdims) == 3) | |||
| { | |||
| // type 9 | |||
| @@ -35,7 +35,7 @@ int UnaryOp_vulkan::create_pipeline(const Option& opt) | |||
| int elempack = 1; | |||
| if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; | |||
| if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; | |||
| if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; | |||
| if (shape.dims == 3 || shape.dims == 4) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; | |||
| size_t elemsize; | |||
| if (opt.use_fp16_storage) | |||
| @@ -55,12 +55,13 @@ int UnaryOp_vulkan::create_pipeline(const Option& opt) | |||
| if (shape.dims == 1) shape_packed = Mat(shape.w / elempack, (void*)0, elemsize, elempack); | |||
| if (shape.dims == 2) shape_packed = Mat(shape.w, shape.h / elempack, (void*)0, elemsize, elempack); | |||
| if (shape.dims == 3) shape_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); | |||
| if (shape.dims == 4) shape_packed = Mat(shape.w, shape.h, shape.d, shape.c / elempack, (void*)0, elemsize, elempack); | |||
| std::vector<vk_specialization_type> specializations(1 + 5); | |||
| specializations[0].i = op_type; | |||
| specializations[1 + 0].i = shape_packed.dims; | |||
| specializations[1 + 1].i = shape_packed.w; | |||
| specializations[1 + 2].i = shape_packed.h; | |||
| specializations[1 + 2].i = shape_packed.h * shape_packed.d; | |||
| specializations[1 + 3].i = shape_packed.c; | |||
| specializations[1 + 4].i = shape_packed.cstep; | |||
| @@ -83,6 +84,12 @@ int UnaryOp_vulkan::create_pipeline(const Option& opt) | |||
| local_size_xyz.h = std::min(4, shape_packed.h); | |||
| local_size_xyz.c = std::min(4, shape_packed.c); | |||
| } | |||
| if (shape_packed.dims == 4) | |||
| { | |||
| local_size_xyz.w = std::min(4, shape_packed.w); | |||
| local_size_xyz.h = std::min(4, shape_packed.h * shape_packed.d); | |||
| local_size_xyz.c = std::min(4, shape_packed.c); | |||
| } | |||
| // pack1 | |||
| if (shape.dims == 0 || elempack == 1) | |||
| @@ -135,7 +142,7 @@ int UnaryOp_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, cons | |||
| std::vector<vk_constant_type> constants(5); | |||
| constants[0].i = bottom_top_blob.dims; | |||
| constants[1].i = bottom_top_blob.w; | |||
| constants[2].i = bottom_top_blob.h; | |||
| constants[2].i = bottom_top_blob.h * bottom_top_blob.d; | |||
| constants[3].i = bottom_top_blob.c; | |||
| constants[4].i = bottom_top_blob.cstep; | |||
| @@ -159,7 +166,7 @@ int UnaryOp_vulkan::forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, | |||
| std::vector<vk_constant_type> constants(5); | |||
| constants[0].i = bottom_top_blob.dims; | |||
| constants[1].i = bottom_top_blob.w; | |||
| constants[2].i = bottom_top_blob.h; | |||
| constants[2].i = bottom_top_blob.h * bottom_top_blob.d; | |||
| constants[3].i = bottom_top_blob.c; | |||
| constants[4].i = 0; //bottom_top_blob.cstep; | |||
| @@ -44,20 +44,203 @@ static int binary_op_pack8(const Mat& a, const Mat& b, Mat& c, const Option& opt | |||
| int w = a.w; | |||
| int h = a.h; | |||
| int d = a.d; | |||
| int channels = a.c; | |||
| int size = w * h; | |||
| int size = w * h * d; | |||
| size_t elemsize = a.elemsize; | |||
| int elempack = a.elempack; | |||
| int w1 = b.w; | |||
| int h1 = b.h; | |||
| int d1 = b.d; | |||
| int channels1 = b.c; | |||
| int size1 = w1 * h1; | |||
| int size1 = w1 * h1 * d1; | |||
| size_t elemsize1 = b.elemsize; | |||
| int elempack1 = b.elempack; | |||
| if (a.dims == 3) | |||
| if (a.dims == 4) | |||
| { | |||
| if (b.dims == 4) | |||
| { | |||
| // type 29 | |||
| c.create(w, h, d, channels, elemsize, elempack, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const float* ptr = a.channel(q); | |||
| const float* ptr1 = b.channel(q); | |||
| float* outptr = c.channel(q); | |||
| for (int i = 0; i < size; i++) | |||
| { | |||
| __m256 _p = _mm256_loadu_ps(ptr); | |||
| __m256 _p1 = _mm256_loadu_ps(ptr1); | |||
| __m256 _outp = op(_p, _p1); | |||
| _mm256_storeu_ps(outptr, _outp); | |||
| ptr += 8; | |||
| ptr1 += 8; | |||
| outptr += 8; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| c.create(w, h, d, channels, elemsize, elempack, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| if (b.dims == 3) | |||
| { | |||
| // type 28 | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const float* ptr = a.channel(q); | |||
| const float* ptr1 = b.channel(q); | |||
| float* outptr = c.channel(q); | |||
| for (int z = 0; z < d; z++) | |||
| { | |||
| for (int y = 0; y < h; y++) | |||
| { | |||
| __m256 _b0 = _mm256_loadu_ps(ptr1); | |||
| for (int x = 0; x < w; x++) | |||
| { | |||
| __m256 _p = _mm256_loadu_ps(ptr); | |||
| __m256 _outp = op(_p, _b0); | |||
| _mm256_storeu_ps(outptr, _outp); | |||
| ptr += 8; | |||
| outptr += 8; | |||
| } | |||
| ptr1 += 8; | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| if (b.dims == 2) | |||
| { | |||
| // type 27 | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const float* ptr = a.channel(q); | |||
| const float* ptr1 = b.row(q); | |||
| float* outptr = c.channel(q); | |||
| for (int z = 0; z < d; z++) | |||
| { | |||
| __m256 _b0 = _mm256_loadu_ps(ptr1); | |||
| for (int y = 0; y < h; y++) | |||
| { | |||
| for (int x = 0; x < w; x++) | |||
| { | |||
| __m256 _p = _mm256_loadu_ps(ptr); | |||
| __m256 _outp = op(_p, _b0); | |||
| _mm256_storeu_ps(outptr, _outp); | |||
| ptr += 8; | |||
| outptr += 8; | |||
| } | |||
| } | |||
| ptr1 += 8; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| if (b.dims == 1) | |||
| { | |||
| if (b.w == 1 && elempack1 == 1) | |||
| { | |||
| // type 25 | |||
| __m256 _b0 = _mm256_set1_ps(b[0]); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const float* ptr = a.channel(q); | |||
| float* outptr = c.channel(q); | |||
| for (int i = 0; i < size; i++) | |||
| { | |||
| __m256 _p = _mm256_loadu_ps(ptr); | |||
| __m256 _outp = op(_p, _b0); | |||
| _mm256_storeu_ps(outptr, _outp); | |||
| ptr += 8; | |||
| outptr += 8; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| // type 26 | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const float* ptr = a.channel(q); | |||
| __m256 _b0 = _mm256_loadu_ps((const float*)b + q * 8); | |||
| float* outptr = c.channel(q); | |||
| for (int i = 0; i < size; i++) | |||
| { | |||
| __m256 _p = _mm256_loadu_ps(ptr); | |||
| __m256 _outp = op(_p, _b0); | |||
| _mm256_storeu_ps(outptr, _outp); | |||
| ptr += 8; | |||
| outptr += 8; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| } | |||
| else if (a.dims == 3) | |||
| { | |||
| if (b.dims == 4) | |||
| { | |||
| // type 23 | |||
| c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels1; q++) | |||
| { | |||
| const float* ptr = a.channel(q); | |||
| const float* ptr1 = b.channel(q); | |||
| float* outptr = c.channel(q); | |||
| for (int z = 0; z < d1; z++) | |||
| { | |||
| for (int y = 0; y < h1; y++) | |||
| { | |||
| __m256 _a0 = _mm256_loadu_ps(ptr); | |||
| for (int x = 0; x < w1; x++) | |||
| { | |||
| __m256 _p = _mm256_loadu_ps(ptr1); | |||
| __m256 _outp = op(_a0, _p); | |||
| _mm256_storeu_ps(outptr, _outp); | |||
| ptr1 += 8; | |||
| outptr += 8; | |||
| } | |||
| ptr += 8; | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| if (b.dims == 3) | |||
| { | |||
| if (w1 == 1 && h1 == 1 && channels1 == channels) | |||
| @@ -406,6 +589,42 @@ static int binary_op_pack8(const Mat& a, const Mat& b, Mat& c, const Option& opt | |||
| } | |||
| else if (a.dims == 2) | |||
| { | |||
| if (b.dims == 4) | |||
| { | |||
| // type 22 | |||
| c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels1; q++) | |||
| { | |||
| const float* ptr = a.row(q); | |||
| const float* ptr1 = b.channel(q); | |||
| float* outptr = c.channel(q); | |||
| for (int z = 0; z < d1; z++) | |||
| { | |||
| __m256 _a0 = _mm256_loadu_ps(ptr); | |||
| for (int y = 0; y < h1; y++) | |||
| { | |||
| for (int x = 0; x < w1; x++) | |||
| { | |||
| __m256 _p = _mm256_loadu_ps(ptr1); | |||
| __m256 _outp = op(_a0, _p); | |||
| _mm256_storeu_ps(outptr, _outp); | |||
| ptr1 += 8; | |||
| outptr += 8; | |||
| } | |||
| } | |||
| ptr += 8; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| if (b.dims == 3) | |||
| { | |||
| // type 14 | |||
| @@ -514,6 +733,33 @@ static int binary_op_pack8(const Mat& a, const Mat& b, Mat& c, const Option& opt | |||
| { | |||
| if (a.w == 1 && elempack == 1) | |||
| { | |||
| if (b.dims == 4) | |||
| { | |||
| // type 20 | |||
| c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| __m256 _a0 = _mm256_set1_ps(a[0]); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels1; q++) | |||
| { | |||
| const float* ptr1 = b.channel(q); | |||
| float* outptr = c.channel(q); | |||
| for (int i = 0; i < size1; i++) | |||
| { | |||
| __m256 _p1 = _mm256_loadu_ps(ptr1); | |||
| __m256 _outp = op(_a0, _p1); | |||
| _mm256_storeu_ps(outptr, _outp); | |||
| ptr1 += 8; | |||
| outptr += 8; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| if (b.dims == 3) | |||
| { | |||
| // type 4 | |||
| @@ -586,6 +832,33 @@ static int binary_op_pack8(const Mat& a, const Mat& b, Mat& c, const Option& opt | |||
| } | |||
| } | |||
| if (b.dims == 4) | |||
| { | |||
| // type 21 | |||
| c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels1; q++) | |||
| { | |||
| __m256 _a0 = _mm256_loadu_ps((const float*)a + q * 8); | |||
| const float* ptr1 = b.channel(q); | |||
| float* outptr = c.channel(q); | |||
| for (int i = 0; i < size1; i++) | |||
| { | |||
| __m256 _p1 = _mm256_loadu_ps(ptr1); | |||
| __m256 _outp = op(_a0, _p1); | |||
| _mm256_storeu_ps(outptr, _outp); | |||
| ptr1 += 8; | |||
| outptr += 8; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| if (b.dims == 3) | |||
| { | |||
| // type 9 | |||
| @@ -693,8 +966,9 @@ static int binary_op_scalar_inplace_pack8(Mat& a, float b, const Option& opt) | |||
| int w = a.w; | |||
| int h = a.h; | |||
| int d = a.d; | |||
| int channels = a.c; | |||
| int size = w * h; | |||
| int size = w * h * d; | |||
| __m256 _b = _mm256_set1_ps(b); | |||
| @@ -795,20 +1069,203 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt | |||
| int w = a.w; | |||
| int h = a.h; | |||
| int d = a.d; | |||
| int channels = a.c; | |||
| int size = w * h; | |||
| int size = w * h * d; | |||
| size_t elemsize = a.elemsize; | |||
| int elempack = a.elempack; | |||
| int w1 = b.w; | |||
| int h1 = b.h; | |||
| int d1 = b.d; | |||
| int channels1 = b.c; | |||
| int size1 = w1 * h1; | |||
| int size1 = w1 * h1 * d1; | |||
| size_t elemsize1 = b.elemsize; | |||
| int elempack1 = b.elempack; | |||
| if (a.dims == 3) | |||
| if (a.dims == 4) | |||
| { | |||
| if (b.dims == 4) | |||
| { | |||
| // type 29 | |||
| c.create(w, h, d, channels, elemsize, elempack, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const float* ptr = a.channel(q); | |||
| const float* ptr1 = b.channel(q); | |||
| float* outptr = c.channel(q); | |||
| for (int i = 0; i < size; i++) | |||
| { | |||
| __m128 _p = _mm_loadu_ps(ptr); | |||
| __m128 _p1 = _mm_loadu_ps(ptr1); | |||
| __m128 _outp = op(_p, _p1); | |||
| _mm_storeu_ps(outptr, _outp); | |||
| ptr += 4; | |||
| ptr1 += 4; | |||
| outptr += 4; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| c.create(w, h, d, channels, elemsize, elempack, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| if (b.dims == 3) | |||
| { | |||
| // type 28 | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const float* ptr = a.channel(q); | |||
| const float* ptr1 = b.channel(q); | |||
| float* outptr = c.channel(q); | |||
| for (int z = 0; z < d; z++) | |||
| { | |||
| for (int y = 0; y < h; y++) | |||
| { | |||
| __m128 _b0 = _mm_loadu_ps(ptr1); | |||
| for (int x = 0; x < w; x++) | |||
| { | |||
| __m128 _p = _mm_loadu_ps(ptr); | |||
| __m128 _outp = op(_p, _b0); | |||
| _mm_storeu_ps(outptr, _outp); | |||
| ptr += 4; | |||
| outptr += 4; | |||
| } | |||
| ptr1 += 4; | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| if (b.dims == 2) | |||
| { | |||
| // type 27 | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const float* ptr = a.channel(q); | |||
| const float* ptr1 = b.row(q); | |||
| float* outptr = c.channel(q); | |||
| for (int z = 0; z < d; z++) | |||
| { | |||
| __m128 _b0 = _mm_loadu_ps(ptr1); | |||
| for (int y = 0; y < h; y++) | |||
| { | |||
| for (int x = 0; x < w; x++) | |||
| { | |||
| __m128 _p = _mm_loadu_ps(ptr); | |||
| __m128 _outp = op(_p, _b0); | |||
| _mm_storeu_ps(outptr, _outp); | |||
| ptr += 4; | |||
| outptr += 4; | |||
| } | |||
| } | |||
| ptr1 += 4; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| if (b.dims == 1) | |||
| { | |||
| if (b.w == 1 && elempack1 == 1) | |||
| { | |||
| // type 25 | |||
| __m128 _b0 = _mm_set1_ps(b[0]); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const float* ptr = a.channel(q); | |||
| float* outptr = c.channel(q); | |||
| for (int i = 0; i < size; i++) | |||
| { | |||
| __m128 _p = _mm_loadu_ps(ptr); | |||
| __m128 _outp = op(_p, _b0); | |||
| _mm_storeu_ps(outptr, _outp); | |||
| ptr += 4; | |||
| outptr += 4; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| // type 26 | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const float* ptr = a.channel(q); | |||
| __m128 _b0 = _mm_loadu_ps((const float*)b + q * 4); | |||
| float* outptr = c.channel(q); | |||
| for (int i = 0; i < size; i++) | |||
| { | |||
| __m128 _p = _mm_loadu_ps(ptr); | |||
| __m128 _outp = op(_p, _b0); | |||
| _mm_storeu_ps(outptr, _outp); | |||
| ptr += 4; | |||
| outptr += 4; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| } | |||
| else if (a.dims == 3) | |||
| { | |||
| if (b.dims == 4) | |||
| { | |||
| // type 23 | |||
| c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels1; q++) | |||
| { | |||
| const float* ptr = a.channel(q); | |||
| const float* ptr1 = b.channel(q); | |||
| float* outptr = c.channel(q); | |||
| for (int z = 0; z < d1; z++) | |||
| { | |||
| for (int y = 0; y < h1; y++) | |||
| { | |||
| __m128 _a0 = _mm_loadu_ps(ptr); | |||
| for (int x = 0; x < w1; x++) | |||
| { | |||
| __m128 _p = _mm_loadu_ps(ptr1); | |||
| __m128 _outp = op(_a0, _p); | |||
| _mm_storeu_ps(outptr, _outp); | |||
| ptr1 += 4; | |||
| outptr += 4; | |||
| } | |||
| ptr += 4; | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| if (b.dims == 3) | |||
| { | |||
| if (w1 == 1 && h1 == 1 && channels1 == channels) | |||
| @@ -1114,7 +1571,7 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt | |||
| if (b.w == 1 && elempack1 == 1) | |||
| { | |||
| // type 16 | |||
| __m128 _b0 = _mm_set1_ps(((const float*)b)[0]); | |||
| __m128 _b0 = _mm_set1_ps(b[0]); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| @@ -1157,6 +1614,42 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt | |||
| } | |||
| else if (a.dims == 2) | |||
| { | |||
| if (b.dims == 4) | |||
| { | |||
| // type 22 | |||
| c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels1; q++) | |||
| { | |||
| const float* ptr = a.row(q); | |||
| const float* ptr1 = b.channel(q); | |||
| float* outptr = c.channel(q); | |||
| for (int z = 0; z < d1; z++) | |||
| { | |||
| __m128 _a0 = _mm_loadu_ps(ptr); | |||
| for (int y = 0; y < h1; y++) | |||
| { | |||
| for (int x = 0; x < w1; x++) | |||
| { | |||
| __m128 _p = _mm_loadu_ps(ptr1); | |||
| __m128 _outp = op(_a0, _p); | |||
| _mm_storeu_ps(outptr, _outp); | |||
| ptr1 += 4; | |||
| outptr += 4; | |||
| } | |||
| } | |||
| ptr += 4; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| if (b.dims == 3) | |||
| { | |||
| // type 14 | |||
| @@ -1223,7 +1716,7 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt | |||
| if (b.w == 1 && elempack1 == 1) | |||
| { | |||
| // type 11 | |||
| __m128 _b0 = _mm_set1_ps(((const float*)b)[0]); | |||
| __m128 _b0 = _mm_set1_ps(b[0]); | |||
| const float* ptr = a; | |||
| float* outptr = c; | |||
| for (int i = 0; i < size; i++) | |||
| @@ -1265,6 +1758,33 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt | |||
| { | |||
| if (a.w == 1 && elempack == 1) | |||
| { | |||
| if (b.dims == 4) | |||
| { | |||
| // type 20 | |||
| c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| __m128 _a0 = _mm_set1_ps(a[0]); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels1; q++) | |||
| { | |||
| const float* ptr1 = b.channel(q); | |||
| float* outptr = c.channel(q); | |||
| for (int i = 0; i < size1; i++) | |||
| { | |||
| __m128 _p1 = _mm_loadu_ps(ptr1); | |||
| __m128 _outp = op(_a0, _p1); | |||
| _mm_storeu_ps(outptr, _outp); | |||
| ptr1 += 4; | |||
| outptr += 4; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| if (b.dims == 3) | |||
| { | |||
| // type 4 | |||
| @@ -1272,7 +1792,7 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt | |||
| if (c.empty()) | |||
| return -100; | |||
| __m128 _a0 = _mm_set1_ps(((const float*)a)[0]); | |||
| __m128 _a0 = _mm_set1_ps(a[0]); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels1; q++) | |||
| { | |||
| @@ -1299,7 +1819,7 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt | |||
| if (c.empty()) | |||
| return -100; | |||
| __m128 _a0 = _mm_set1_ps(((const float*)a)[0]); | |||
| __m128 _a0 = _mm_set1_ps(a[0]); | |||
| const float* ptr1 = b; | |||
| float* outptr = c; | |||
| for (int i = 0; i < size1; i++) | |||
| @@ -1321,7 +1841,7 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt | |||
| if (c.empty()) | |||
| return -100; | |||
| __m128 _a0 = _mm_set1_ps(((const float*)a)[0]); | |||
| __m128 _a0 = _mm_set1_ps(a[0]); | |||
| const float* ptr1 = b; | |||
| float* outptr = c; | |||
| for (int i = 0; i < w1; i++) | |||
| @@ -1337,6 +1857,33 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt | |||
| } | |||
| } | |||
| if (b.dims == 4) | |||
| { | |||
| // type 21 | |||
| c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); | |||
| if (c.empty()) | |||
| return -100; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < channels1; q++) | |||
| { | |||
| __m128 _a0 = _mm_loadu_ps((const float*)a + q * 4); | |||
| const float* ptr1 = b.channel(q); | |||
| float* outptr = c.channel(q); | |||
| for (int i = 0; i < size1; i++) | |||
| { | |||
| __m128 _p1 = _mm_loadu_ps(ptr1); | |||
| __m128 _outp = op(_a0, _p1); | |||
| _mm_storeu_ps(outptr, _outp); | |||
| ptr1 += 4; | |||
| outptr += 4; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| if (b.dims == 3) | |||
| { | |||
| // type 9 | |||
| @@ -1402,7 +1949,7 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt | |||
| if (b.w == 1 && elempack1 == 1) | |||
| { | |||
| // type 6 | |||
| __m128 _b0 = _mm_set1_ps(((const float*)b)[0]); | |||
| __m128 _b0 = _mm_set1_ps(b[0]); | |||
| const float* ptr = a; | |||
| float* outptr = c; | |||
| for (int i = 0; i < w; i++) | |||
| @@ -1444,8 +1991,9 @@ static int binary_op_scalar_inplace_pack4(Mat& a, float b, const Option& opt) | |||
| int w = a.w; | |||
| int h = a.h; | |||
| int d = a.d; | |||
| int channels = a.c; | |||
| int size = w * h; | |||
| int size = w * h * d; | |||
| __m128 _b = _mm_set1_ps((float)b); | |||
| @@ -1574,10 +2122,10 @@ int BinaryOp_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat> | |||
| return binary_op_pack8<binary_op_pow_pack8>(bottom_blob, bottom_blob1, top_blob, opt); | |||
| if (op_type == Operation_RSUB) | |||
| return binary_op_pack8<binary_op_rsub_pack8>(bottom_blob, bottom_blob1, top_blob, opt); | |||
| return binary_op_pack8<binary_op_sub_pack8>(bottom_blob1, bottom_blob, top_blob, opt); | |||
| if (op_type == Operation_RDIV) | |||
| return binary_op_pack8<binary_op_rdiv_pack8>(bottom_blob, bottom_blob1, top_blob, opt); | |||
| return binary_op_pack8<binary_op_div_pack8>(bottom_blob1, bottom_blob, top_blob, opt); | |||
| } | |||
| #endif // __AVX__ | |||
| @@ -1605,10 +2153,10 @@ int BinaryOp_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat> | |||
| return binary_op_pack4<binary_op_pow_pack4>(bottom_blob, bottom_blob1, top_blob, opt); | |||
| if (op_type == Operation_RSUB) | |||
| return binary_op_pack4<binary_op_rsub_pack4>(bottom_blob, bottom_blob1, top_blob, opt); | |||
| return binary_op_pack4<binary_op_sub_pack4>(bottom_blob1, bottom_blob, top_blob, opt); | |||
| if (op_type == Operation_RDIV) | |||
| return binary_op_pack4<binary_op_rdiv_pack4>(bottom_blob, bottom_blob1, top_blob, opt); | |||
| return binary_op_pack4<binary_op_div_pack4>(bottom_blob1, bottom_blob, top_blob, opt); | |||
| } | |||
| #endif // __SSE2__ | |||
| @@ -797,7 +797,7 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio | |||
| int elemcount = 0; | |||
| if (dims == 1) elemcount = bottom_blob.elempack * bottom_blob.w; | |||
| if (dims == 2) elemcount = bottom_blob.elempack * bottom_blob.h; | |||
| if (dims == 3) elemcount = bottom_blob.elempack * bottom_blob.c; | |||
| if (dims == 3 || dims == 4) elemcount = bottom_blob.elempack * bottom_blob.c; | |||
| int elembits = bottom_blob.elembits(); | |||
| @@ -50,7 +50,7 @@ static int test_binaryop(const ncnn::Mat& _a, const ncnn::Mat& _b) | |||
| int ret = test_layer<ncnn::BinaryOp>("BinaryOp", pd, weights, ab); | |||
| if (ret != 0) | |||
| { | |||
| fprintf(stderr, "test_binaryop failed a.dims=%d a=(%d %d %d) b.dims=%d b=(%d %d %d) op_type=%d\n", a.dims, a.w, a.h, a.c, b.dims, b.w, b.h, b.c, op_type); | |||
| fprintf(stderr, "test_binaryop failed a.dims=%d a=(%d %d %d %d) b.dims=%d b=(%d %d %d %d) op_type=%d\n", a.dims, a.w, a.h, a.d, a.c, b.dims, b.w, b.h, b.d, b.c, op_type); | |||
| } | |||
| return ret; | |||
| @@ -76,7 +76,7 @@ static int test_binaryop(const ncnn::Mat& _a, float b) | |||
| int ret = test_layer<ncnn::BinaryOp>("BinaryOp", pd, weights, a); | |||
| if (ret != 0) | |||
| { | |||
| fprintf(stderr, "test_binaryop failed a.dims=%d a=(%d %d %d) b=%f op_type=%d\n", a.dims, a.w, a.h, a.c, b, op_type); | |||
| fprintf(stderr, "test_binaryop failed a.dims=%d a=(%d %d %d %d) b=%f op_type=%d\n", a.dims, a.w, a.h, a.d, a.c, b, op_type); | |||
| } | |||
| return ret; | |||
| @@ -234,6 +234,86 @@ static int test_binaryop_19() | |||
| || test_binaryop(RandomMat(11, 6, 16), RandomMat(11, 6, 16)); | |||
| } | |||
| static int test_binaryop_20() | |||
| { | |||
| return 0 | |||
| || test_binaryop(RandomMat(1), RandomMat(11, 3, 4, 2)) | |||
| || test_binaryop(RandomMat(1), RandomMat(11, 3, 4, 4)) | |||
| || test_binaryop(RandomMat(1), RandomMat(11, 3, 4, 16)); | |||
| } | |||
| static int test_binaryop_21() | |||
| { | |||
| return 0 | |||
| || test_binaryop(RandomMat(2), RandomMat(11, 3, 4, 2)) | |||
| || test_binaryop(RandomMat(4), RandomMat(11, 3, 4, 4)) | |||
| || test_binaryop(RandomMat(16), RandomMat(11, 3, 4, 16)); | |||
| } | |||
| static int test_binaryop_22() | |||
| { | |||
| return 0 | |||
| || test_binaryop(RandomMat(4, 2), RandomMat(11, 3, 4, 2)) | |||
| || test_binaryop(RandomMat(4, 4), RandomMat(11, 3, 4, 4)) | |||
| || test_binaryop(RandomMat(4, 16), RandomMat(11, 3, 4, 16)); | |||
| } | |||
| static int test_binaryop_23() | |||
| { | |||
| return 0 | |||
| || test_binaryop(RandomMat(3, 4, 2), RandomMat(11, 3, 4, 2)) | |||
| || test_binaryop(RandomMat(3, 4, 4), RandomMat(11, 3, 4, 4)) | |||
| || test_binaryop(RandomMat(3, 4, 16), RandomMat(11, 3, 4, 16)); | |||
| } | |||
| static int test_binaryop_24() | |||
| { | |||
| return 0 | |||
| || test_binaryop(RandomMat(11, 3, 4, 2), 1.f) | |||
| || test_binaryop(RandomMat(11, 3, 4, 4), 1.f) | |||
| || test_binaryop(RandomMat(11, 3, 4, 16), 1.f); | |||
| } | |||
| static int test_binaryop_25() | |||
| { | |||
| return 0 | |||
| || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(1)) | |||
| || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(1)) | |||
| || test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(1)); | |||
| } | |||
| static int test_binaryop_26() | |||
| { | |||
| return 0 | |||
| || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(2)) | |||
| || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(4)) | |||
| || test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(16)); | |||
| } | |||
| static int test_binaryop_27() | |||
| { | |||
| return 0 | |||
| || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(4, 2)) | |||
| || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(4, 4)) | |||
| || test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(4, 16)); | |||
| } | |||
| static int test_binaryop_28() | |||
| { | |||
| return 0 | |||
| || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(3, 4, 2)) | |||
| || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(3, 4, 4)) | |||
| || test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(3, 4, 16)); | |||
| } | |||
| static int test_binaryop_29() | |||
| { | |||
| return 0 | |||
| || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(11, 3, 4, 2)) | |||
| || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(11, 3, 4, 4)) | |||
| || test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(11, 3, 4, 16)); | |||
| } | |||
| static int test_binaryop_s1() | |||
| { | |||
| return 0 | |||
| @@ -324,6 +404,16 @@ int main() | |||
| || test_binaryop_17() | |||
| || test_binaryop_18() | |||
| || test_binaryop_19() | |||
| || test_binaryop_20() | |||
| || test_binaryop_21() | |||
| || test_binaryop_22() | |||
| || test_binaryop_23() | |||
| || test_binaryop_24() | |||
| || test_binaryop_25() | |||
| || test_binaryop_26() | |||
| || test_binaryop_27() | |||
| || test_binaryop_28() | |||
| || test_binaryop_29() | |||
| || test_binaryop_s1() | |||
| || test_binaryop_s2() | |||
| || test_binaryop_s3() | |||
| @@ -49,13 +49,21 @@ static int test_unaryop(const ncnn::Mat& _a) | |||
| int ret = test_layer<ncnn::UnaryOp>("UnaryOp", pd, weights, a); | |||
| if (ret != 0) | |||
| { | |||
| fprintf(stderr, "test_unaryop failed a.dims=%d a=(%d %d %d) op_type=%d\n", a.dims, a.w, a.h, a.c, op_type); | |||
| fprintf(stderr, "test_unaryop failed a.dims=%d a=(%d %d %d %d) op_type=%d\n", a.dims, a.w, a.h, a.d, a.c, op_type); | |||
| } | |||
| return ret; | |||
| } | |||
| static int test_unaryop_0() | |||
| { | |||
| return 0 | |||
| || test_unaryop(RandomMat(11, 3, 2, 16)) | |||
| || test_unaryop(RandomMat(10, 2, 2, 12)) | |||
| || test_unaryop(RandomMat(6, 1, 5, 13)); | |||
| } | |||
| static int test_unaryop_1() | |||
| { | |||
| return 0 | |||
| || test_unaryop(RandomMat(11, 7, 16)) | |||
| @@ -63,7 +71,7 @@ static int test_unaryop_0() | |||
| || test_unaryop(RandomMat(6, 5, 13)); | |||
| } | |||
| static int test_unaryop_1() | |||
| static int test_unaryop_2() | |||
| { | |||
| return 0 | |||
| || test_unaryop(RandomMat(12, 16)) | |||
| @@ -71,7 +79,7 @@ static int test_unaryop_1() | |||
| || test_unaryop(RandomMat(14, 15)); | |||
| } | |||
| static int test_unaryop_2() | |||
| static int test_unaryop_3() | |||
| { | |||
| return 0 | |||
| || test_unaryop(RandomMat(128)) | |||
| @@ -88,7 +96,8 @@ int main() | |||
| int ret = 0 | |||
| || test_unaryop_0() | |||
| || test_unaryop_1() | |||
| || test_unaryop_2(); | |||
| || test_unaryop_2() | |||
| || test_unaryop_3(); | |||
| if (ret != 0) | |||
| return ret; | |||
| @@ -457,7 +457,7 @@ int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<n | |||
| int elemcount = 0; | |||
| if (dims == 1) elemcount = a4[i].elempack * a4[i].w; | |||
| if (dims == 2) elemcount = a4[i].elempack * a4[i].h; | |||
| if (dims == 3) elemcount = a4[i].elempack * a4[i].c; | |||
| if (dims == 3 || dims == 4) elemcount = a4[i].elempack * a4[i].c; | |||
| int elembits = a4[i].elembits(); | |||
| @@ -882,7 +882,7 @@ int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<n | |||
| int elemcount = 0; | |||
| if (dims == 1) elemcount = a4.elempack * a4.w; | |||
| if (dims == 2) elemcount = a4.elempack * a4.h; | |||
| if (dims == 3) elemcount = a4.elempack * a4.c; | |||
| if (dims == 3 || dims == 4) elemcount = a4.elempack * a4.c; | |||
| int elembits = a4.elembits(); | |||
| @@ -22,8 +22,8 @@ set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) | |||
| set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) | |||
| set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) | |||
| set(CMAKE_C_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static") | |||
| set(CMAKE_CXX_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static") | |||
| set(CMAKE_C_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -static") | |||
| set(CMAKE_CXX_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -static") | |||
| # cache flags | |||
| set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags") | |||
| @@ -22,8 +22,8 @@ set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) | |||
| set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) | |||
| set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) | |||
| set(CMAKE_C_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static") | |||
| set(CMAKE_CXX_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static") | |||
| set(CMAKE_C_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -static") | |||
| set(CMAKE_CXX_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -static") | |||
| # replace vfredsum_vs* with vfredusum_vs* | |||
| add_definitions(-Dvfredsum_vs_f32m1_f32m1=vfredusum_vs_f32m1_f32m1) | |||
| @@ -22,8 +22,8 @@ set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) | |||
| set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) | |||
| set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) | |||
| set(CMAKE_C_FLAGS "-march=rv64gcvxtheadc -mabi=lp64d -mtune=c906 -DRVV_SPEC_0_7 -D__riscv_zfh=1 -static") | |||
| set(CMAKE_CXX_FLAGS "-march=rv64gcvxtheadc -mabi=lp64d -mtune=c906 -DRVV_SPEC_0_7 -D__riscv_zfh=1 -static") | |||
| set(CMAKE_C_FLAGS "-march=rv64gcvxtheadc -mabi=lp64d -mtune=c906 -DRVV_SPEC_0_7 -D__riscv_zfh=1 -DC906=1 -static") | |||
| set(CMAKE_CXX_FLAGS "-march=rv64gcvxtheadc -mabi=lp64d -mtune=c906 -DRVV_SPEC_0_7 -D__riscv_zfh=1 -DC906=1 -static") | |||
| # cache flags | |||
| set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags") | |||