binary4d, unary4d (#3443)

4 years ago · 3a83704c38
--- a/.github/workflows/test-coverage.yml
+++ b/.github/workflows/test-coverage.yml
@@ -395,6 +395,40 @@ jobs:
        token: ${{ secrets.CODECOV_TOKEN }}
        file: build-arm82/lcov.info

  linux-gcc-arm82-omp:
    runs-on: ubuntu-20.04
    steps:
    - uses: actions/checkout@v2

    - name: lcov
      run: sudo apt-get install lcov
    - name: cache-qemu
      id: cache-qemu
      uses: actions/cache@v2.1.7
      with:
        path: qemu-install
        key: qemu-aarch64-install-1
    - name: checkout-qemu
      if: steps.cache-qemu.outputs.cache-hit != 'true'
      uses: actions/checkout@v2
      with:
        repository: qemu/qemu
        path: qemu
        ref: 8746309137ba470d1b2e8f5ce86ac228625db940
    - name: qemu
      if: steps.cache-qemu.outputs.cache-hit != 'true'
      run: |
        cd qemu
        ./configure --prefix=install --target-list=aarch64-linux-user --disable-system
        make -j2
        make install
        cp -r aarch64-linux-user/install $GITHUB_WORKSPACE/qemu-install

    - name: aarch64-gnu-toolchain
      run: |
        sudo apt-get update
        sudo apt-get install g++-aarch64-linux-gnu

    - name: build-arm82-omp
      run: |
        mkdir build-arm82-omp && cd build-arm82-omp
@@ -418,6 +452,40 @@ jobs:
        token: ${{ secrets.CODECOV_TOKEN }}
        file: build-arm82-omp/lcov.info

  linux-gcc-arm82dot-omp:
    runs-on: ubuntu-20.04
    steps:
    - uses: actions/checkout@v2

    - name: lcov
      run: sudo apt-get install lcov
    - name: cache-qemu
      id: cache-qemu
      uses: actions/cache@v2.1.7
      with:
        path: qemu-install
        key: qemu-aarch64-install-1
    - name: checkout-qemu
      if: steps.cache-qemu.outputs.cache-hit != 'true'
      uses: actions/checkout@v2
      with:
        repository: qemu/qemu
        path: qemu
        ref: 8746309137ba470d1b2e8f5ce86ac228625db940
    - name: qemu
      if: steps.cache-qemu.outputs.cache-hit != 'true'
      run: |
        cd qemu
        ./configure --prefix=install --target-list=aarch64-linux-user --disable-system
        make -j2
        make install
        cp -r aarch64-linux-user/install $GITHUB_WORKSPACE/qemu-install

    - name: aarch64-gnu-toolchain
      run: |
        sudo apt-get update
        sudo apt-get install g++-aarch64-linux-gnu

    - name: build-arm82dot-omp
      run: |
        mkdir build-arm82dot-omp && cd build-arm82dot-omp
--- a/docs/developer-guide/binaryop-broadcasting.md
+++ b/docs/developer-guide/binaryop-broadcasting.md
@@ -4,7 +4,7 @@ ncnn BinaryOp accepts blobs with different shape

 C = BinaryOp(A, B)

 shape notation convention is [w], [w,h], [w,h,c]
 shape notation convention is [w], [w,h], [w,h,c], [w,h,d,c]

 |type|A|B|C|
 |---|---|---|---|
@@ -27,6 +27,16 @@ shape notation convention is [w], [w,h], [w,h,c]
 |17|[2,3,4]|[4]|[2,3,4]|
 |18|[2,3,4]|[3,4]|[2,3,4]|
 |19|[2,3,4]|[2,3,4]|[2,3,4]|
 |20|[1]|[2,3,4,5]|[2,3,4,5]|
 |21|[5]|[2,3,4,5]|[2,3,4,5]|
 |22|[4,5]|[2,3,4,5]|[2,3,4,5]|
 |23|[3,4,5]|[2,3,4,5]|[2,3,4,5]|
 |24|[2,3,4,5]|scalar|[2,3,4,5]|
 |25|[2,3,4,5]|[1]|[2,3,4,5]|
 |26|[2,3,4,5]|[5]|[2,3,4,5]|
 |27|[2,3,4,5]|[4,5]|[2,3,4,5]|
 |28|[2,3,4,5]|[3,4,5]|[2,3,4,5]|
 |29|[2,3,4,5]|[2,3,4,5]|[2,3,4,5]|

 some special broadcasting rule exists for model compatibility

--- a/src/layer/arm/binaryop_arm.cpp
+++ b/src/layer/arm/binaryop_arm.cpp
--- a/src/layer/arm/unaryop_arm.cpp
+++ b/src/layer/arm/unaryop_arm.cpp
@@ -48,8 +48,9 @@ static int unary_op_inplace_pack4(Mat& a, const Option& opt)

    int w = a.w;
    int h = a.h;
    int d = a.d;
    int channels = a.c;
    int size = w * h;
    int size = w * h * d;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
@@ -341,8 +342,9 @@ static int unary_op_inplace_pack8_fp16s(Mat& a, const Option& opt)

    int w = a.w;
    int h = a.h;
    int d = a.d;
    int channels = a.c;
    int size = w * h;
    int size = w * h * d;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
@@ -554,8 +556,9 @@ static int unary_op_inplace_pack4_fp16s(Mat& a, const Option& opt)

    int w = a.w;
    int h = a.h;
    int d = a.d;
    int channels = a.c;
    int size = w * h;
    int size = w * h * d;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
@@ -751,8 +754,9 @@ static int unary_op_inplace_fp16s(Mat& a, const Option& opt)

    int w = a.w;
    int h = a.h;
    int d = a.d;
    int channels = a.c;
    int size = w * h;
    int size = w * h * d;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
@@ -1082,8 +1086,9 @@ static int unary_op_inplace_pack4_bf16s(Mat& a, const Option& opt)

    int w = a.w;
    int h = a.h;
    int d = a.d;
    int channels = a.c;
    int size = w * h;
    int size = w * h * d;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
@@ -1111,8 +1116,9 @@ static int unary_op_inplace_bf16s(Mat& a, const Option& opt)

    int w = a.w;
    int h = a.h;
    int d = a.d;
    int channels = a.c;
    int size = w * h;
    int size = w * h * d;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
--- a/src/layer/binaryop.cpp
+++ b/src/layer/binaryop.cpp
@@ -49,17 +49,181 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c, const Option& opt)

    int w = a.w;
    int h = a.h;
    int d = a.d;
    int channels = a.c;
    int size = w * h;
    int size = w * h * d;
    size_t elemsize = a.elemsize;

    int w1 = b.w;
    int h1 = b.h;
    int d1 = b.d;
    int channels1 = b.c;
    int size1 = w1 * h1;
    int size1 = w1 * h1 * d1;

    if (a.dims == 3)
    if (a.dims == 4)
    {
        if (b.dims == 4)
        {
            // type 29
            c.create(w, h, d, channels, elemsize, opt.blob_allocator);
            if (c.empty())
                return -100;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = a.channel(q);
                const float* ptr1 = b.channel(q);
                float* outptr = c.channel(q);

                for (int i = 0; i < size; i++)
                {
                    outptr[i] = op(ptr[i], ptr1[i]);
                }
            }

            return 0;
        }

        c.create(w, h, d, channels, elemsize, opt.blob_allocator);
        if (c.empty())
            return -100;

        if (b.dims == 3)
        {
            // type 28
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = a.channel(q);
                const float* ptr1 = b.channel(q);
                float* outptr = c.channel(q);

                for (int z = 0; z < d; z++)
                {
                    for (int y = 0; y < h; y++)
                    {
                        const float b0 = ptr1[y];
                        for (int x = 0; x < w; x++)
                        {
                            outptr[x] = op(ptr[x], b0);
                        }

                        ptr += w;
                        outptr += w;
                    }

                    ptr1 += h;
                }
            }

            return 0;
        }

        if (b.dims == 2)
        {
            // type 27
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = a.channel(q);
                const float* ptr1 = b.row(q);
                float* outptr = c.channel(q);

                for (int z = 0; z < d; z++)
                {
                    const float b0 = ptr1[z];
                    for (int y = 0; y < h; y++)
                    {
                        for (int x = 0; x < w; x++)
                        {
                            outptr[x] = op(ptr[x], b0);
                        }

                        ptr += w;
                        outptr += w;
                    }
                }
            }

            return 0;
        }

        if (b.dims == 1)
        {
            if (b.w == 1)
            {
                // type 25
                const float b0 = b[0];
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const float* ptr = a.channel(q);
                    float* outptr = c.channel(q);

                    for (int i = 0; i < size; i++)
                    {
                        outptr[i] = op(ptr[i], b0);
                    }
                }

                return 0;
            }

            // type 26
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = a.channel(q);
                const float b0 = b[q];
                float* outptr = c.channel(q);

                for (int i = 0; i < size; i++)
                {
                    outptr[i] = op(ptr[i], b0);
                }
            }

            return 0;
        }
    }
    else if (a.dims == 3)
    {
        if (b.dims == 4)
        {
            // type 23
            c.create(w1, h1, d1, channels1, elemsize, opt.blob_allocator);
            if (c.empty())
                return -100;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels1; q++)
            {
                const float* ptr = a.channel(q);
                const float* ptr1 = b.channel(q);
                float* outptr = c.channel(q);

                for (int z = 0; z < d1; z++)
                {
                    for (int y = 0; y < h1; y++)
                    {
                        const float a0 = ptr[y];
                        for (int x = 0; x < w1; x++)
                        {
                            outptr[x] = op(a0, ptr1[x]);
                        }

                        ptr1 += w1;
                        outptr += w1;
                    }

                    ptr += h1;
                }
            }

            return 0;
        }

        if (b.dims == 3)
        {
            if (w1 == 1 && h1 == 1 && channels1 == channels)
@@ -359,6 +523,39 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c, const Option& opt)
    }
    else if (a.dims == 2)
    {
        if (b.dims == 4)
        {
            // type 22
            c.create(w1, h1, d1, channels1, elemsize, opt.blob_allocator);
            if (c.empty())
                return -100;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels1; q++)
            {
                const float* ptr = a.row(q);
                const float* ptr1 = b.channel(q);
                float* outptr = c.channel(q);

                for (int z = 0; z < d1; z++)
                {
                    const float a0 = ptr[z];
                    for (int y = 0; y < h1; y++)
                    {
                        for (int x = 0; x < w1; x++)
                        {
                            outptr[x] = op(a0, ptr1[x]);
                        }

                        ptr1 += w1;
                        outptr += w1;
                    }
                }
            }

            return 0;
        }

        if (b.dims == 3)
        {
            // type 14
@@ -445,6 +642,29 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c, const Option& opt)
    {
        if (a.w == 1)
        {
            if (b.dims == 4)
            {
                // type 20
                c.create(w1, h1, d1, channels1, elemsize, opt.blob_allocator);
                if (c.empty())
                    return -100;

                const float a0 = a[0];
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels1; q++)
                {
                    const float* ptr1 = b.channel(q);
                    float* outptr = c.channel(q);

                    for (int i = 0; i < size1; i++)
                    {
                        outptr[i] = op(a0, ptr1[i]);
                    }
                }

                return 0;
            }

            if (b.dims == 3)
            {
                // type 4
@@ -501,6 +721,29 @@ static int binary_op(const Mat& a, const Mat& b, Mat& c, const Option& opt)
            }
        }

        if (b.dims == 4)
        {
            // type 21
            c.create(w1, h1, d1, channels1, elemsize, opt.blob_allocator);
            if (c.empty())
                return -100;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels1; q++)
            {
                const float a0 = a[q];
                const float* ptr1 = b.channel(q);
                float* outptr = c.channel(q);

                for (int i = 0; i < size1; i++)
                {
                    outptr[i] = op(a0, ptr1[i]);
                }
            }

            return 0;
        }

        if (b.dims == 3)
        {
            // type 9
@@ -585,8 +828,9 @@ static int binary_op_scalar_inplace(Mat& a, float b, const Option& opt)

    int w = a.w;
    int h = a.h;
    int d = a.d;
    int channels = a.c;
    int size = w * h;
    int size = w * h * d;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
@@ -703,10 +947,10 @@ int BinaryOp::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to
        return binary_op<binary_op_pow>(bottom_blob, bottom_blob1, top_blob, opt);

    if (op_type == Operation_RSUB)
        return binary_op<binary_op_rsub>(bottom_blob, bottom_blob1, top_blob, opt);
        return binary_op<binary_op_sub>(bottom_blob1, bottom_blob, top_blob, opt);

    if (op_type == Operation_RDIV)
        return binary_op<binary_op_rdiv>(bottom_blob, bottom_blob1, top_blob, opt);
        return binary_op<binary_op_div>(bottom_blob1, bottom_blob, top_blob, opt);

    return 0;
 }
--- a/src/layer/mips/binaryop_mips.cpp
+++ b/src/layer/mips/binaryop_mips.cpp
@@ -41,20 +41,203 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt

    int w = a.w;
    int h = a.h;
    int d = a.d;
    int channels = a.c;
    int size = w * h;
    int size = w * h * d;
    size_t elemsize = a.elemsize;
    int elempack = a.elempack;

    int w1 = b.w;
    int h1 = b.h;
    int d1 = b.d;
    int channels1 = b.c;
    int size1 = w1 * h1;
    int size1 = w1 * h1 * d1;
    size_t elemsize1 = b.elemsize;
    int elempack1 = b.elempack;

    if (a.dims == 3)
    if (a.dims == 4)
    {
        if (b.dims == 4)
        {
            // type 29
            c.create(w, h, d, channels, elemsize, elempack, opt.blob_allocator);
            if (c.empty())
                return -100;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = a.channel(q);
                const float* ptr1 = b.channel(q);
                float* outptr = c.channel(q);

                for (int i = 0; i < size; i++)
                {
                    v4f32 _p = (v4f32)__msa_ld_w(ptr, 0);
                    v4f32 _p1 = (v4f32)__msa_ld_w(ptr1, 0);
                    v4f32 _outp = op(_p, _p1);
                    __msa_st_w((v4i32)_outp, outptr, 0);
                    ptr += 4;
                    ptr1 += 4;
                    outptr += 4;
                }
            }

            return 0;
        }

        c.create(w, h, d, channels, elemsize, elempack, opt.blob_allocator);
        if (c.empty())
            return -100;

        if (b.dims == 3)
        {
            // type 28
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = a.channel(q);
                const float* ptr1 = b.channel(q);
                float* outptr = c.channel(q);

                for (int z = 0; z < d; z++)
                {
                    for (int y = 0; y < h; y++)
                    {
                        v4f32 _b0 = (v4f32)__msa_ld_w(ptr1, 0);
                        for (int x = 0; x < w; x++)
                        {
                            v4f32 _p = (v4f32)__msa_ld_w(ptr, 0);
                            v4f32 _outp = op(_p, _b0);
                            __msa_st_w((v4i32)_outp, outptr, 0);
                            ptr += 4;
                            outptr += 4;
                        }

                        ptr1 += 4;
                    }
                }
            }

            return 0;
        }

        if (b.dims == 2)
        {
            // type 27
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = a.channel(q);
                const float* ptr1 = b.row(q);
                float* outptr = c.channel(q);

                for (int z = 0; z < d; z++)
                {
                    v4f32 _b0 = (v4f32)__msa_ld_w(ptr1, 0);
                    for (int y = 0; y < h; y++)
                    {
                        for (int x = 0; x < w; x++)
                        {
                            v4f32 _p = (v4f32)__msa_ld_w(ptr, 0);
                            v4f32 _outp = op(_p, _b0);
                            __msa_st_w((v4i32)_outp, outptr, 0);
                            ptr += 4;
                            outptr += 4;
                        }
                    }

                    ptr1 += 4;
                }
            }

            return 0;
        }

        if (b.dims == 1)
        {
            if (b.w == 1 && elempack1 == 1)
            {
                // type 25
                v4f32 _b0 = __msa_fill_w_f32(b[0]);
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const float* ptr = a.channel(q);
                    float* outptr = c.channel(q);

                    for (int i = 0; i < size; i++)
                    {
                        v4f32 _p = (v4f32)__msa_ld_w(ptr, 0);
                        v4f32 _outp = op(_p, _b0);
                        __msa_st_w((v4i32)_outp, outptr, 0);
                        ptr += 4;
                        outptr += 4;
                    }
                }

                return 0;
            }

            // type 26
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = a.channel(q);
                v4f32 _b0 = (v4f32)__msa_ld_w((const float*)b + q * 4, 0);
                float* outptr = c.channel(q);

                for (int i = 0; i < size; i++)
                {
                    v4f32 _p = (v4f32)__msa_ld_w(ptr, 0);
                    v4f32 _outp = op(_p, _b0);
                    __msa_st_w((v4i32)_outp, outptr, 0);
                    ptr += 4;
                    outptr += 4;
                }
            }

            return 0;
        }
    }
    else if (a.dims == 3)
    {
        if (b.dims == 4)
        {
            // type 23
            c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator);
            if (c.empty())
                return -100;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels1; q++)
            {
                const float* ptr = a.channel(q);
                const float* ptr1 = b.channel(q);
                float* outptr = c.channel(q);

                for (int z = 0; z < d1; z++)
                {
                    for (int y = 0; y < h1; y++)
                    {
                        v4f32 _a0 = (v4f32)__msa_ld_w(ptr, 0);
                        for (int x = 0; x < w1; x++)
                        {
                            v4f32 _p = (v4f32)__msa_ld_w(ptr1, 0);
                            v4f32 _outp = op(_a0, _p);
                            __msa_st_w((v4i32)_outp, outptr, 0);
                            ptr1 += 4;
                            outptr += 4;
                        }

                        ptr += 4;
                    }
                }
            }

            return 0;
        }

        if (b.dims == 3)
        {
            if (w1 == 1 && h1 == 1 && channels1 == channels)
@@ -417,6 +600,42 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
    }
    else if (a.dims == 2)
    {
        if (b.dims == 4)
        {
            // type 22
            c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator);
            if (c.empty())
                return -100;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels1; q++)
            {
                const float* ptr = a.row(q);
                const float* ptr1 = b.channel(q);
                float* outptr = c.channel(q);

                for (int z = 0; z < d1; z++)
                {
                    v4f32 _a0 = (v4f32)__msa_ld_w(ptr, 0);
                    for (int y = 0; y < h1; y++)
                    {
                        for (int x = 0; x < w1; x++)
                        {
                            v4f32 _p = (v4f32)__msa_ld_w(ptr1, 0);
                            v4f32 _outp = op(_a0, _p);
                            __msa_st_w((v4i32)_outp, outptr, 0);
                            ptr1 += 4;
                            outptr += 4;
                        }
                    }

                    ptr += 4;
                }
            }

            return 0;
        }

        if (b.dims == 3)
        {
            // type 14
@@ -530,6 +749,33 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
    {
        if (a.w == 1 && elempack == 1)
        {
            if (b.dims == 4)
            {
                // type 20
                c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator);
                if (c.empty())
                    return -100;

                v4f32 _a0 = __msa_fill_w_f32(a[0]);
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels1; q++)
                {
                    const float* ptr1 = b.channel(q);
                    float* outptr = c.channel(q);

                    for (int i = 0; i < size1; i++)
                    {
                        v4f32 _p1 = (v4f32)__msa_ld_w(ptr1, 0);
                        v4f32 _outp = op(_a0, _p1);
                        __msa_st_w((v4i32)_outp, outptr, 0);
                        ptr1 += 4;
                        outptr += 4;
                    }
                }

                return 0;
            }

            if (b.dims == 3)
            {
                // type 4
@@ -605,6 +851,33 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
            }
        }

        if (b.dims == 4)
        {
            // type 21
            c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator);
            if (c.empty())
                return -100;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels1; q++)
            {
                v4f32 _a0 = (v4f32)__msa_ld_w((const float*)a + q * 4, 0);
                const float* ptr1 = b.channel(q);
                float* outptr = c.channel(q);

                for (int i = 0; i < size1; i++)
                {
                    v4f32 _p1 = (v4f32)__msa_ld_w(ptr1, 0);
                    v4f32 _outp = op(_a0, _p1);
                    __msa_st_w((v4i32)_outp, outptr, 0);
                    ptr1 += 4;
                    outptr += 4;
                }
            }

            return 0;
        }

        if (b.dims == 3)
        {
            // type 9
@@ -717,8 +990,9 @@ static int binary_op_scalar_inplace_pack4(Mat& a, float b, const Option& opt)

    int w = a.w;
    int h = a.h;
    int d = a.d;
    int channels = a.c;
    int size = w * h;
    int size = w * h * d;

    v4f32 _b = __msa_fill_w_f32(b);

@@ -847,10 +1121,10 @@ int BinaryOp_mips::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat
            return binary_op_pack4<binary_op_pow_pack4>(bottom_blob, bottom_blob1, top_blob, opt);

        if (op_type == Operation_RSUB)
            return binary_op_pack4<binary_op_rsub_pack4>(bottom_blob, bottom_blob1, top_blob, opt);
            return binary_op_pack4<binary_op_sub_pack4>(bottom_blob1, bottom_blob, top_blob, opt);

        if (op_type == Operation_RDIV)
            return binary_op_pack4<binary_op_rdiv_pack4>(bottom_blob, bottom_blob1, top_blob, opt);
            return binary_op_pack4<binary_op_div_pack4>(bottom_blob1, bottom_blob, top_blob, opt);
    }
 #endif // __mips_msa

--- a/src/layer/mips/unaryop_mips.cpp
+++ b/src/layer/mips/unaryop_mips.cpp
@@ -38,8 +38,9 @@ static int unary_op_inplace_pack4(Mat& a, const Option& opt)

    int w = a.w;
    int h = a.h;
    int d = a.d;
    int channels = a.c;
    int size = w * h;
    int size = w * h * d;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
--- a/src/layer/riscv/binaryop_riscv.cpp
+++ b/src/layer/riscv/binaryop_riscv.cpp
--- a/src/layer/riscv/binaryop_riscv.h
+++ b/src/layer/riscv/binaryop_riscv.h
@@ -1,8 +1,6 @@
 // Xavier Hsinyuan is pleased to support the open source community by making
 // ncnn available.
 // Xavier Hsinyuan is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2021 Xavier Hsinyuan <thelastlinex@hotmail.com>. All rights
 // reserved.
 // Copyright (C) 2021 Xavier Hsinyuan <thelastlinex@hotmail.com>. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this
 // file except in compliance with the License. You may obtain a copy of the
@@ -15,10 +13,12 @@
 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 // License for the specific language governing permissions and limitations under
 // the License.

 #ifndef LAYER_BINARYOP_RISCV_H
 #define LAYER_BINARYOP_RISCV_H

 #include "binaryop.h"

 namespace ncnn {

 class BinaryOp_riscv : virtual public BinaryOp
@@ -26,19 +26,17 @@ class BinaryOp_riscv : virtual public BinaryOp
 public:
    BinaryOp_riscv();

    virtual int forward(const std::vector<Mat>& bottom_blobs,
                        std::vector<Mat>& top_blobs, const Option& opt) const;
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

 protected:
 #if __riscv_vector && __riscv_zfh
    int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;

    int forward_fp16sa(const std::vector<Mat>& bottom_blobs,
                       std::vector<Mat>& top_blobs, const Option& opt) const;
    int forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 };

 } // namespace ncnn

 #endif // LAYER_BINARYOP_RISCV_H
 #endif // LAYER_BINARYOP_RISCV_H
--- a/src/layer/riscv/convolution_3x3_packn.h
+++ b/src/layer/riscv/convolution_3x3_packn.h
@@ -324,7 +324,7 @@ static void conv3x3s1_winograd64_packn_rvv(const Mat& bottom_blob, Mat& top_blob

                for (int q = 0; q < inch; q++)
                {
 #if RVV_SPEC_0_7
 #if C906
                    for (int l = 0; l < packn; l++)
                    {
                        tmpptr[0] = r0[l];
@@ -365,7 +365,7 @@ static void conv3x3s1_winograd64_packn_rvv(const Mat& bottom_blob, Mat& top_blob

                for (int q = 0; q < inch; q++)
                {
 #if RVV_SPEC_0_7
 #if C906
                    for (int l = 0; l < packn; l++)
                    {
                        tmpptr[0] = r0[l];
@@ -398,7 +398,7 @@ static void conv3x3s1_winograd64_packn_rvv(const Mat& bottom_blob, Mat& top_blob

                for (int q = 0; q < inch; q++)
                {
 #if RVV_SPEC_0_7
 #if C906
                    for (int l = 0; l < packn; l++)
                    {
                        tmpptr[0] = r0[l];
@@ -999,7 +999,7 @@ static void conv3x3s1_winograd42_packn_rvv(const Mat& bottom_blob, Mat& top_blob

                for (int q = 0; q < inch; q++)
                {
 #if RVV_SPEC_0_7
 #if C906
                    for (int l = 0; l < packn; l++)
                    {
                        tmpptr[0] = r0[l];
@@ -1040,7 +1040,7 @@ static void conv3x3s1_winograd42_packn_rvv(const Mat& bottom_blob, Mat& top_blob

                for (int q = 0; q < inch; q++)
                {
 #if RVV_SPEC_0_7
 #if C906
                    for (int l = 0; l < packn; l++)
                    {
                        tmpptr[0] = r0[l];
@@ -1073,7 +1073,7 @@ static void conv3x3s1_winograd42_packn_rvv(const Mat& bottom_blob, Mat& top_blob

                for (int q = 0; q < inch; q++)
                {
 #if RVV_SPEC_0_7
 #if C906
                    for (int l = 0; l < packn; l++)
                    {
                        tmpptr[0] = r0[l];
--- a/src/layer/riscv/convolution_3x3_packn_fp16s.h
+++ b/src/layer/riscv/convolution_3x3_packn_fp16s.h
@@ -324,7 +324,7 @@ static void conv3x3s1_winograd64_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& t

                for (int q = 0; q < inch; q++)
                {
 #if RVV_SPEC_0_7
 #if C906
                    for (int l = 0; l < packn; l++)
                    {
                        tmpptr[0] = r0[l];
@@ -365,7 +365,7 @@ static void conv3x3s1_winograd64_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& t

                for (int q = 0; q < inch; q++)
                {
 #if RVV_SPEC_0_7
 #if C906
                    for (int l = 0; l < packn; l++)
                    {
                        tmpptr[0] = r0[l];
@@ -398,7 +398,7 @@ static void conv3x3s1_winograd64_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& t

                for (int q = 0; q < inch; q++)
                {
 #if RVV_SPEC_0_7
 #if C906
                    for (int l = 0; l < packn; l++)
                    {
                        tmpptr[0] = r0[l];
@@ -999,7 +999,7 @@ static void conv3x3s1_winograd42_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& t

                for (int q = 0; q < inch; q++)
                {
 #if RVV_SPEC_0_7
 #if C906
                    for (int l = 0; l < packn; l++)
                    {
                        tmpptr[0] = r0[l];
@@ -1040,7 +1040,7 @@ static void conv3x3s1_winograd42_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& t

                for (int q = 0; q < inch; q++)
                {
 #if RVV_SPEC_0_7
 #if C906
                    for (int l = 0; l < packn; l++)
                    {
                        tmpptr[0] = r0[l];
@@ -1073,7 +1073,7 @@ static void conv3x3s1_winograd42_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& t

                for (int q = 0; q < inch; q++)
                {
 #if RVV_SPEC_0_7
 #if C906
                    for (int l = 0; l < packn; l++)
                    {
                        tmpptr[0] = r0[l];
--- a/src/layer/riscv/convolution_packnto1_fp16s.h
+++ b/src/layer/riscv/convolution_packnto1_fp16s.h
@@ -84,7 +84,7 @@ static void convolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob
                    }
                }

 #ifdef RVV_SPEC_0_7
 #if C906
                // TODO
                std::vector<float> ss(packn);
                vse32_v_f32m2((float*)ss.data(), _sum, vl);
--- a/src/layer/riscv/convolution_sgemm_packn.h
+++ b/src/layer/riscv/convolution_sgemm_packn.h
@@ -54,7 +54,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons

                for (int k = 0; k < maxk; k++)
                {
 #if RVV_SPEC_0_7
 #if C906
                    for (int l = 0; l < packn; l++)
                    {
                        tmpptr[0] = img0[l];
@@ -103,7 +103,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons

                for (int k = 0; k < maxk; k++)
                {
 #if RVV_SPEC_0_7
 #if C906
                    for (int l = 0; l < packn; l++)
                    {
                        tmpptr[0] = img0[l];
@@ -144,7 +144,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons

                for (int k = 0; k < maxk; k++)
                {
 #if RVV_SPEC_0_7
 #if C906
                    for (int l = 0; l < packn; l++)
                    {
                        tmpptr[0] = img0[l];
--- a/src/layer/riscv/convolution_sgemm_packn_fp16s.h
+++ b/src/layer/riscv/convolution_sgemm_packn_fp16s.h
@@ -54,7 +54,8 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo

                for (int k = 0; k < maxk; k++)
                {
 #if RVV_SPEC_0_7
 #if C906
 #ifdef RVV_SPEC_0_7
                    asm volatile(
                        "mv        t3,     %[LEN]  \n\t"
                        "mv        t1,     %[SRC]  \n\t"
@@ -83,7 +84,22 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo

                    img0 += size * packn;
                    tmpptr += packn * 8;
 #else
                    for (int l = 0; l < packn; l++)
                    {
                        tmpptr[0] = img0[l];
                        tmpptr[1] = img0[l + packn];
                        tmpptr[2] = img0[l + packn * 2];
                        tmpptr[3] = img0[l + packn * 3];
                        tmpptr[4] = img0[l + packn * 4];
                        tmpptr[5] = img0[l + packn * 5];
                        tmpptr[6] = img0[l + packn * 6];
                        tmpptr[7] = img0[l + packn * 7];
                        tmpptr += 8;
                    }

                    img0 += size * packn;
 #endif
 #else
                    vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl);
                    vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl);
@@ -118,7 +134,8 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo

                for (int k = 0; k < maxk; k++)
                {
 #if RVV_SPEC_0_7
 #if C906
 #ifdef RVV_SPEC_0_7
                    asm volatile(
                        "mv        t3,     %[LEN]  \n\t"
                        "mv        t1,     %[SRC]  \n\t"
@@ -138,6 +155,18 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo

                    img0 += size * packn;
                    tmpptr += packn * 4;
 #else
                    for (int l = 0; l < packn; l++)
                    {
                        tmpptr[0] = img0[l];
                        tmpptr[1] = img0[l + packn];
                        tmpptr[2] = img0[l + packn * 2];
                        tmpptr[3] = img0[l + packn * 3];
                        tmpptr += 4;
                    }

                    img0 += size * packn;
 #endif
 #else
                    vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl);
                    vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl);
@@ -169,7 +198,8 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo

                for (int k = 0; k < maxk; k++)
                {
 #if RVV_SPEC_0_7
 #if C906
 #ifdef RVV_SPEC_0_7
                    asm volatile(
                        "mv        t3,     %[LEN]  \n\t"
                        "mv        t1,     %[SRC]  \n\t"
@@ -185,6 +215,16 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo

                    img0 += size * packn;
                    tmpptr += packn * 2;
 #else
                    for (int l = 0; l < packn; l++)
                    {
                        tmpptr[0] = img0[l];
                        tmpptr[1] = img0[l + packn];
                        tmpptr += 2;
                    }

                    img0 += size * packn;
 #endif
 #else
                    vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl);
                    vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl);
--- a/src/layer/riscv/convolution_sgemm_packnto1.h
+++ b/src/layer/riscv/convolution_sgemm_packnto1.h
@@ -53,7 +53,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c

                for (int k = 0; k < maxk; k++)
                {
 #if RVV_SPEC_0_7
 #if C906
                    for (int l = 0; l < packn; l++)
                    {
                        tmpptr[0] = img0[l];
@@ -102,7 +102,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c

                for (int k = 0; k < maxk; k++)
                {
 #if RVV_SPEC_0_7
 #if C906
                    for (int l = 0; l < packn; l++)
                    {
                        tmpptr[0] = img0[l];
@@ -143,7 +143,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c

                for (int k = 0; k < maxk; k++)
                {
 #if RVV_SPEC_0_7
 #if C906
                    for (int l = 0; l < packn; l++)
                    {
                        tmpptr[0] = img0[l];
@@ -240,7 +240,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c
                kptr0 += packn;
            }

 #if RVV_SPEC_0_7
 #if C906
            vsse32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, vl);
            vsse32_v_f32m1(outptr0 + 1, top_blob.cstep * sizeof(float), _sum1, vl);
            vsse32_v_f32m1(outptr0 + 2, top_blob.cstep * sizeof(float), _sum2, vl);
@@ -281,7 +281,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c
                kptr0 += packn;
            }

 #if RVV_SPEC_0_7
 #if C906
            vsse32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, vl);
            vsse32_v_f32m1(outptr0 + 1, top_blob.cstep * sizeof(float), _sum1, vl);
            vsse32_v_f32m1(outptr0 + 2, top_blob.cstep * sizeof(float), _sum2, vl);
@@ -312,7 +312,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c
                kptr0 += packn;
            }

 #if RVV_SPEC_0_7
 #if C906
            vsse32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, vl);
            vsse32_v_f32m1(outptr0 + 1, top_blob.cstep * sizeof(float), _sum1, vl);
 #else
@@ -393,7 +393,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c
                kptr0 += packn;
            }

 #ifdef RVV_SPEC_0_7
 #if C906
            // TODO
            std::vector<float> ss0(packn);
            std::vector<float> ss1(packn);
@@ -473,7 +473,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c
                kptr0 += packn;
            }

 #ifdef RVV_SPEC_0_7
 #if C906
            // TODO
            std::vector<float> ss0(packn);
            std::vector<float> ss1(packn);
@@ -527,7 +527,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c
                kptr0 += packn;
            }

 #ifdef RVV_SPEC_0_7
 #if C906
            // TODO
            std::vector<float> ss0(packn);
            std::vector<float> ss1(packn);
@@ -568,7 +568,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c
                kptr0 += packn;
            }

 #ifdef RVV_SPEC_0_7
 #if C906
            // TODO
            std::vector<float> ss0(packn);
            vse32_v_f32m1((float*)ss0.data(), _sum0, vl);
--- a/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h
+++ b/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h
@@ -53,7 +53,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_

                for (int k = 0; k < maxk; k++)
                {
 #if RVV_SPEC_0_7
 #if C906
                    for (int l = 0; l < packn; l++)
                    {
                        tmpptr[0] = img0[l];
@@ -102,7 +102,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_

                for (int k = 0; k < maxk; k++)
                {
 #if RVV_SPEC_0_7
 #if C906
                    for (int l = 0; l < packn; l++)
                    {
                        tmpptr[0] = img0[l];
@@ -143,7 +143,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_

                for (int k = 0; k < maxk; k++)
                {
 #if RVV_SPEC_0_7
 #if C906
                    for (int l = 0; l < packn; l++)
                    {
                        tmpptr[0] = img0[l];
@@ -240,7 +240,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_
                kptr0 += packn;
            }

 #if RVV_SPEC_0_7
 #if C906
            vsse16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, vl);
            vsse16_v_f16m1(outptr0 + 1, top_blob.cstep * sizeof(__fp16), _sum1, vl);
            vsse16_v_f16m1(outptr0 + 2, top_blob.cstep * sizeof(__fp16), _sum2, vl);
@@ -281,7 +281,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_
                kptr0 += packn;
            }

 #if RVV_SPEC_0_7
 #if C906
            vsse16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, vl);
            vsse16_v_f16m1(outptr0 + 1, top_blob.cstep * sizeof(__fp16), _sum1, vl);
            vsse16_v_f16m1(outptr0 + 2, top_blob.cstep * sizeof(__fp16), _sum2, vl);
@@ -312,7 +312,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_
                kptr0 += packn;
            }

 #if RVV_SPEC_0_7
 #if C906
            vsse16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, vl);
            vsse16_v_f16m1(outptr0 + 1, top_blob.cstep * sizeof(__fp16), _sum1, vl);
 #else
@@ -393,7 +393,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_
                kptr0 += packn;
            }

 #ifdef RVV_SPEC_0_7
 #if C906
            // TODO
            std::vector<__fp16> ss0(packn);
            std::vector<__fp16> ss1(packn);
@@ -473,7 +473,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_
                kptr0 += packn;
            }

 #ifdef RVV_SPEC_0_7
 #if C906
            // TODO
            std::vector<__fp16> ss0(packn);
            std::vector<__fp16> ss1(packn);
@@ -527,7 +527,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_
                kptr0 += packn;
            }

 #ifdef RVV_SPEC_0_7
 #if C906
            // TODO
            std::vector<__fp16> ss0(packn);
            std::vector<__fp16> ss1(packn);
@@ -568,7 +568,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_
                kptr0 += packn;
            }

 #ifdef RVV_SPEC_0_7
 #if C906
            // TODO
            std::vector<__fp16> ss0(packn);
            vse16_v_f16m1((__fp16*)ss0.data(), _sum0, vl);
--- a/src/layer/riscv/deconvolution_packnto1_fp16s.h
+++ b/src/layer/riscv/deconvolution_packnto1_fp16s.h
@@ -91,7 +91,7 @@ static void deconvolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_bl
                    kptr += maxk * packn;
                }

 #ifdef RVV_SPEC_0_7
 #if C906
                // TODO
                std::vector<float> ss(packn);
                vse32_v_f32m2((float*)ss.data(), _sum, vl);
--- a/src/layer/riscv/riscv_usability.h
+++ b/src/layer/riscv/riscv_usability.h
@@ -15,6 +15,14 @@
 #ifndef RISCV_USABILITY_H
 #define RISCV_USABILITY_H

 #if __riscv_vector
 #ifdef RVV_SPEC_0_7
 #include "riscv_v_071_fix.h"
 #else
 #include <riscv_vector.h>
 #endif
 #endif // __riscv_vector

 #if __riscv_vector
 static inline int csrr_vl()
 {
@@ -45,6 +53,80 @@ static inline int csrr_vlenb()
                 : "memory");
    return a;
 }

 static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr)
 {
    const int packn = csrr_vlenb() / 4;
    const word_type vl = vsetvl_e32m8(packn * 8);

    // NOTE vloxei8_v_f32m8 gets illegal instruction on d1  --- nihui

    // 128bit
    static const uint32_t index_128bit[32] = {
        0, 4, 8, 12,
        0, 4, 8, 12,
        0, 4, 8, 12,
        0, 4, 8, 12,
        0, 4, 8, 12,
        0, 4, 8, 12,
        0, 4, 8, 12,
        0, 4, 8, 12
    };

    // 256bit
    static const uint32_t index_256bit[64] = {
        0, 4, 8, 12, 16, 20, 24, 28,
        0, 4, 8, 12, 16, 20, 24, 28,
        0, 4, 8, 12, 16, 20, 24, 28,
        0, 4, 8, 12, 16, 20, 24, 28,
        0, 4, 8, 12, 16, 20, 24, 28,
        0, 4, 8, 12, 16, 20, 24, 28,
        0, 4, 8, 12, 16, 20, 24, 28,
        0, 4, 8, 12, 16, 20, 24, 28
    };

    const uint32_t* index = packn == 4 ? index_128bit : index_256bit;
    vuint32m8_t bindex = vle32_v_u32m8(index, vl);
    return vloxei32_v_f32m8(ptr, bindex, vl);
 }

 #if __riscv_zfh
 static inline vfloat16m8_t vle16_v_f16m8_f16m1(const __fp16* ptr)
 {
    const int packn = csrr_vlenb() / 2;
    const word_type vl = vsetvl_e16m8(packn * 8);

    // NOTE vloxei8_v_f16m8 gets illegal instruction on d1  --- nihui

    // 128bit
    static const uint16_t index_128bit[64] = {
        0, 2, 4, 6, 8, 10, 12, 14,
        0, 2, 4, 6, 8, 10, 12, 14,
        0, 2, 4, 6, 8, 10, 12, 14,
        0, 2, 4, 6, 8, 10, 12, 14,
        0, 2, 4, 6, 8, 10, 12, 14,
        0, 2, 4, 6, 8, 10, 12, 14,
        0, 2, 4, 6, 8, 10, 12, 14,
        0, 2, 4, 6, 8, 10, 12, 14
    };

    // 256bit
    static const uint16_t index_256bit[128] = {
        0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
        0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
        0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
        0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
        0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
        0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
        0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
        0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
    };

    const uint16_t* index = packn == 8 ? index_128bit : index_256bit;
    vuint16m8_t bindex = vle16_v_u16m8(index, vl);
    return vloxei16_v_f16m8(ptr, bindex, vl);
 }
 #endif // __riscv_zfh
 #endif // __riscv_vector

 #endif // RISCV_USABILITY_H
--- a/src/layer/riscv/riscv_v_071_fix.h
+++ b/src/layer/riscv/riscv_v_071_fix.h
@@ -96,6 +96,15 @@ typedef uint16x4xm1_t vuint16m1x4_t;
 typedef uint16x4xm2_t vuint16m2x4_t;
 typedef uint16x8xm1_t vuint16m1x8_t;

 typedef uint8xm1_t vuint8m1_t;
 typedef uint8xm2_t vuint8m2_t;
 typedef uint8xm4_t vuint8m4_t;
 typedef uint8xm8_t vuint8m8_t;

 typedef uint8x4xm1_t vuint8m1x4_t;
 typedef uint8x4xm2_t vuint8m2x4_t;
 typedef uint8x8xm1_t vuint8m1x8_t;

 #define vsetvl_e32m1(n) vsetvli(n, RVV_E32, RVV_M1)
 #define vsetvl_e32m2(n) vsetvli(n, RVV_E32, RVV_M2)
 #define vsetvl_e32m4(n) vsetvli(n, RVV_E32, RVV_M4)
@@ -132,6 +141,8 @@ typedef uint16x8xm1_t vuint16m1x8_t;
 #define vsse32_v_f32m4 vssev_float32xm4
 #define vsse32_v_f32m8 vssev_float32xm8

 #define vloxei32_v_f32m8(a, i, vl) vlxev_float32xm8(a, reinterpret_cast<int32xm8_t>(i), vl)

 #define vlseg2e32_v_f32m1x2 vlseg2ev_float32x2xm1
 #define vsseg2e32_v_f32m1x2 vsseg2ev_float32x2xm1

@@ -617,6 +628,8 @@ static inline vfloat32m1_t vfredmax_vs_f32m8_f32m1(vfloat32m1_t dst, vfloat32m8_
 #define vsse16_v_f16m4 vssev_float16xm4
 #define vsse16_v_f16m8 vssev_float16xm8

 #define vloxei16_v_f16m8(a, i, vl) vlxev_float16xm8(a, reinterpret_cast<int16xm8_t>(i), vl)

 #define vlseg2e16_v_f16m1x2 vlseg2ev_float16x2xm1
 #define vsseg2e16_v_f16m1x2 vsseg2ev_float16x2xm1

@@ -1690,6 +1703,32 @@ static inline vuint16m1x8_t vcreate_u16m1x8(vuint16m1_t v0, vuint16m1_t v1, vuin
 #define vreinterpret_v_f16m4_u16m4(x) reinterpret_cast<vuint16m4_t>(x)
 #define vreinterpret_v_f16m8_u16m8(x) reinterpret_cast<vuint16m8_t>(x)

 /******************************** uint8 ********************************/
 #define vle8_v_u8m1 vlev_uint8xm1
 #define vle8_v_u8m2 vlev_uint8xm2
 #define vle8_v_u8m4 vlev_uint8xm4
 #define vle8_v_u8m8 vlev_uint8xm8

 #define vse8_v_u8m1 vsev_uint8xm1
 #define vse8_v_u8m2 vsev_uint8xm2
 #define vse8_v_u8m4 vsev_uint8xm4
 #define vse8_v_u8m8 vsev_uint8xm8

 #define vlse8_v_u8m1 vlsev_uint8xm1
 #define vlse8_v_u8m2 vlsev_uint8xm2
 #define vlse8_v_u8m4 vlsev_uint8xm4
 #define vlse8_v_u8m8 vlsev_uint8xm8

 #define vsse8_v_u8m1 vssev_uint8xm1
 #define vsse8_v_u8m2 vssev_uint8xm2
 #define vsse8_v_u8m4 vssev_uint8xm4
 #define vsse8_v_u8m8 vssev_uint8xm8

 #define vmv_v_x_u8m1 vmvvx_unt8xm1
 #define vmv_v_x_u8m2 vmvvx_unt8xm2
 #define vmv_v_x_u8m4 vmvvx_unt8xm4
 #define vmv_v_x_u8m8 vmvvx_unt8xm8

 /******************************** mask ********************************/
 #define vmxor_mm_b32 vmxormm_e32xm1
 #define vmxor_mm_b16 vmxormm_e32xm2
--- a/src/layer/riscv/unaryop_riscv.cpp
+++ b/src/layer/riscv/unaryop_riscv.cpp
@@ -46,8 +46,9 @@ static int unary_op_inplace(Mat& a, const Option& opt)

    int w = a.w;
    int h = a.h;
    int d = a.d;
    int channels = a.c;
    int size = w * h;
    int size = w * h * d;
    int elempack = a.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
@@ -322,8 +323,9 @@ static int unary_op_inplace_fp16s(Mat& a, const Option& opt)

    int w = a.w;
    int h = a.h;
    int d = a.d;
    int channels = a.c;
    int size = w * h;
    int size = w * h * d;
    int elempack = a.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
--- a/src/layer/vulkan/binaryop_vulkan.cpp
+++ b/src/layer/vulkan/binaryop_vulkan.cpp
@@ -47,17 +47,17 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt)
    int elempack = 1;
    if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1;
    if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1;
    if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;
    if (shape.dims == 3 || shape.dims == 4) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;

    int elempack1 = 1;
    if (shape1.dims == 1) elempack1 = opt.use_shader_pack8 && shape1.w % 8 == 0 ? 8 : shape1.w % 4 == 0 ? 4 : 1;
    if (shape1.dims == 2) elempack1 = opt.use_shader_pack8 && shape1.h % 8 == 0 ? 8 : shape1.h % 4 == 0 ? 4 : 1;
    if (shape1.dims == 3) elempack1 = opt.use_shader_pack8 && shape1.c % 8 == 0 ? 8 : shape1.c % 4 == 0 ? 4 : 1;
    if (shape1.dims == 3 || shape1.dims == 4) elempack1 = opt.use_shader_pack8 && shape1.c % 8 == 0 ? 8 : shape1.c % 4 == 0 ? 4 : 1;

    int out_elempack = 1;
    if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 : 1;
    if (out_shape.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4 : 1;
    if (out_shape.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4 : 1;
    if (out_shape.dims == 3 || out_shape.dims == 4) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4 : 1;

    size_t elemsize;
    size_t elemsize1;
@@ -85,19 +85,22 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt)
    if (shape.dims == 1) shape_packed = Mat(shape.w / elempack, (void*)0, elemsize, elempack);
    if (shape.dims == 2) shape_packed = Mat(shape.w, shape.h / elempack, (void*)0, elemsize, elempack);
    if (shape.dims == 3) shape_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack);
    if (shape.dims == 4) shape_packed = Mat(shape.w, shape.h, shape.d, shape.c / elempack, (void*)0, elemsize, elempack);

    Mat shape1_packed;
    if (shape1.dims == 1) shape1_packed = Mat(shape1.w / elempack1, (void*)0, elemsize1, elempack1);
    if (shape1.dims == 2) shape1_packed = Mat(shape1.w, shape1.h / elempack1, (void*)0, elemsize1, elempack1);
    if (shape1.dims == 3) shape1_packed = Mat(shape1.w, shape1.h, shape1.c / elempack1, (void*)0, elemsize1, elempack1);
    if (shape1.dims == 4) shape1_packed = Mat(shape1.w, shape1.h, shape1.d, shape1.c / elempack1, (void*)0, elemsize1, elempack1);

    Mat out_shape_packed;
    if (out_shape.dims == 1) out_shape_packed = Mat(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack);
    if (out_shape.dims == 2) out_shape_packed = Mat(out_shape.w, out_shape.h / out_elempack, (void*)0, out_elemsize, out_elempack);
    if (out_shape.dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack);
    if (out_shape.dims == 4) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.d, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack);

    bool broadcast = true;
    if (shape.dims == shape1.dims && shape.w == shape1.w && shape.h == shape1.h && shape.c == shape1.c)
    if (shape.dims == shape1.dims && shape.w == shape1.w && shape.h == shape1.h && shape.d == shape1.d && shape.c == shape1.c)
    {
        broadcast = false;
    }
@@ -111,17 +114,17 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt)
        specializations[2].f = b;
        specializations[3 + 0].i = shape_packed.dims;
        specializations[3 + 1].i = shape_packed.w;
        specializations[3 + 2].i = shape_packed.h;
        specializations[3 + 2].i = shape_packed.h * shape_packed.d;
        specializations[3 + 3].i = shape_packed.c;
        specializations[3 + 4].i = shape_packed.cstep;
        specializations[3 + 5].i = shape1_packed.dims;
        specializations[3 + 6].i = shape1_packed.w;
        specializations[3 + 7].i = shape1_packed.h;
        specializations[3 + 7].i = shape1_packed.h * shape1_packed.d;
        specializations[3 + 8].i = shape1_packed.c;
        specializations[3 + 9].i = shape1_packed.cstep;
        specializations[3 + 10].i = out_shape_packed.dims;
        specializations[3 + 11].i = out_shape_packed.w;
        specializations[3 + 12].i = out_shape_packed.h;
        specializations[3 + 12].i = out_shape_packed.h * out_shape_packed.d;
        specializations[3 + 13].i = out_shape_packed.c;
        specializations[3 + 14].i = out_shape_packed.cstep;

@@ -144,6 +147,12 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt)
            local_size_xyz.h = std::min(4, out_shape_packed.h);
            local_size_xyz.c = std::min(4, out_shape_packed.c);
        }
        if (out_shape_packed.dims == 4)
        {
            local_size_xyz.w = std::min(4, out_shape_packed.w);
            local_size_xyz.h = std::min(4, out_shape_packed.h * out_shape_packed.d);
            local_size_xyz.c = std::min(4, out_shape_packed.c);
        }

        // pack1
        if (shape.dims == 0 || elempack == 1)
@@ -173,23 +182,44 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt)
    // broadcast
    if (shape.dims == 0 || broadcast)
    {
        std::vector<vk_specialization_type> specializations(1 + 15);
        std::vector<vk_specialization_type> specializations(1 + 18);
        specializations[0].i = op_type;
        specializations[1 + 0].i = shape_packed.dims;
        specializations[1 + 1].i = shape_packed.w;
        specializations[1 + 2].i = shape_packed.h;
        specializations[1 + 3].i = shape_packed.c;
        specializations[1 + 4].i = shape_packed.cstep;
        specializations[1 + 5].i = shape1_packed.dims;
        specializations[1 + 6].i = shape1_packed.w;
        specializations[1 + 7].i = shape1_packed.h;
        specializations[1 + 8].i = shape1_packed.c;
        specializations[1 + 9].i = shape1_packed.cstep;
        specializations[1 + 10].i = out_shape_packed.dims;
        specializations[1 + 11].i = out_shape_packed.w;
        specializations[1 + 12].i = out_shape_packed.h;
        specializations[1 + 13].i = out_shape_packed.c;
        specializations[1 + 14].i = out_shape_packed.cstep;
        specializations[1 + 3].i = shape_packed.d;
        specializations[1 + 4].i = shape_packed.c;
        specializations[1 + 5].i = shape_packed.cstep;
        specializations[1 + 6].i = shape1_packed.dims;
        specializations[1 + 7].i = shape1_packed.w;
        specializations[1 + 8].i = shape1_packed.h;
        specializations[1 + 9].i = shape1_packed.d;
        specializations[1 + 10].i = shape1_packed.c;
        specializations[1 + 11].i = shape1_packed.cstep;
        specializations[1 + 12].i = out_shape_packed.dims;
        specializations[1 + 13].i = out_shape_packed.w;
        specializations[1 + 14].i = out_shape_packed.h;
        specializations[1 + 15].i = out_shape_packed.d;
        specializations[1 + 16].i = out_shape_packed.c;
        specializations[1 + 17].i = out_shape_packed.cstep;

        std::vector<vk_specialization_type> specializations_broadcast_a1_b1(1 + 15);
        specializations_broadcast_a1_b1[0].i = op_type;
        specializations_broadcast_a1_b1[1 + 0].i = shape_packed.dims;
        specializations_broadcast_a1_b1[1 + 1].i = shape_packed.w;
        specializations_broadcast_a1_b1[1 + 2].i = shape_packed.h * shape_packed.d;
        specializations_broadcast_a1_b1[1 + 3].i = shape_packed.c;
        specializations_broadcast_a1_b1[1 + 4].i = shape_packed.cstep;
        specializations_broadcast_a1_b1[1 + 5].i = shape1_packed.dims;
        specializations_broadcast_a1_b1[1 + 6].i = shape1_packed.w;
        specializations_broadcast_a1_b1[1 + 7].i = shape1_packed.h * shape1_packed.d;
        specializations_broadcast_a1_b1[1 + 8].i = shape1_packed.c;
        specializations_broadcast_a1_b1[1 + 9].i = shape1_packed.cstep;
        specializations_broadcast_a1_b1[1 + 10].i = out_shape_packed.dims;
        specializations_broadcast_a1_b1[1 + 11].i = out_shape_packed.w;
        specializations_broadcast_a1_b1[1 + 12].i = out_shape_packed.h * out_shape_packed.d;
        specializations_broadcast_a1_b1[1 + 13].i = out_shape_packed.c;
        specializations_broadcast_a1_b1[1 + 14].i = out_shape_packed.cstep;

        Mat local_size_xyz;
        if (out_shape_packed.dims == 1)
@@ -210,6 +240,12 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt)
            local_size_xyz.h = std::min(4, out_shape_packed.h);
            local_size_xyz.c = std::min(4, out_shape_packed.c);
        }
        if (out_shape_packed.dims == 4)
        {
            local_size_xyz.w = std::min(4, out_shape_packed.w);
            local_size_xyz.h = std::min(4, out_shape_packed.h * out_shape_packed.d);
            local_size_xyz.c = std::min(4, out_shape_packed.c);
        }

        // pack1
        if (shape.dims == 0 || (elempack == 1 && elempack1 == 1))
@@ -232,7 +268,7 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt)
        {
            pipeline_binaryop_broadcast_a1_pack4 = new Pipeline(vkdev);
            pipeline_binaryop_broadcast_a1_pack4->set_optimal_local_size_xyz(local_size_xyz);
            pipeline_binaryop_broadcast_a1_pack4->create(LayerShaderType::binaryop_broadcast_a1_pack4, opt, specializations);
            pipeline_binaryop_broadcast_a1_pack4->create(LayerShaderType::binaryop_broadcast_a1_pack4, opt, specializations_broadcast_a1_b1);
        }

        if (shape.dims == 0 || (shape1.dims == 1 && shape1.w == 1 && elempack1 == 1 && elempack == 4)
@@ -240,7 +276,7 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt)
        {
            pipeline_binaryop_broadcast_b1_pack4 = new Pipeline(vkdev);
            pipeline_binaryop_broadcast_b1_pack4->set_optimal_local_size_xyz(local_size_xyz);
            pipeline_binaryop_broadcast_b1_pack4->create(LayerShaderType::binaryop_broadcast_b1_pack4, opt, specializations);
            pipeline_binaryop_broadcast_b1_pack4->create(LayerShaderType::binaryop_broadcast_b1_pack4, opt, specializations_broadcast_a1_b1);
        }

        // pack8
@@ -256,7 +292,7 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt)
        {
            pipeline_binaryop_broadcast_a1_pack8 = new Pipeline(vkdev);
            pipeline_binaryop_broadcast_a1_pack8->set_optimal_local_size_xyz(local_size_xyz);
            pipeline_binaryop_broadcast_a1_pack8->create(LayerShaderType::binaryop_broadcast_a1_pack8, opt, specializations);
            pipeline_binaryop_broadcast_a1_pack8->create(LayerShaderType::binaryop_broadcast_a1_pack8, opt, specializations_broadcast_a1_b1);
        }

        if ((opt.use_shader_pack8 && shape.dims == 0) || (shape1.dims == 1 && shape1.w == 1 && elempack1 == 1 && elempack == 8)
@@ -264,7 +300,7 @@ int BinaryOp_vulkan::create_pipeline(const Option& opt)
        {
            pipeline_binaryop_broadcast_b1_pack8 = new Pipeline(vkdev);
            pipeline_binaryop_broadcast_b1_pack8->set_optimal_local_size_xyz(local_size_xyz);
            pipeline_binaryop_broadcast_b1_pack8->create(LayerShaderType::binaryop_broadcast_b1_pack8, opt, specializations);
            pipeline_binaryop_broadcast_b1_pack8->create(LayerShaderType::binaryop_broadcast_b1_pack8, opt, specializations_broadcast_a1_b1);
        }
    }

@@ -324,7 +360,7 @@ int BinaryOp_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector
    }
    else // if (bottom_blob.dims == bottom_blob1.dims)
    {
        if (bottom_blob.w * bottom_blob.h * bottom_blob.c * bottom_blob.elempack >= bottom_blob1.w * bottom_blob1.h * bottom_blob1.c * bottom_blob1.elempack)
        if (bottom_blob.w * bottom_blob.h * bottom_blob.d * bottom_blob.c * bottom_blob.elempack >= bottom_blob1.w * bottom_blob1.h * bottom_blob1.d * bottom_blob1.c * bottom_blob1.elempack)
        {
            top_blob.create_like(bottom_blob, opt.blob_vkallocator);
        }
@@ -343,39 +379,63 @@ int BinaryOp_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector
    bindings[1] = bottom_blob1;
    bindings[2] = top_blob;

    std::vector<vk_constant_type> constants(15);
    constants[0].i = bottom_blob.dims;
    constants[1].i = bottom_blob.w;
    constants[2].i = bottom_blob.h;
    constants[3].i = bottom_blob.c;
    constants[4].i = bottom_blob.cstep;
    constants[5].i = bottom_blob1.dims;
    constants[6].i = bottom_blob1.w;
    constants[7].i = bottom_blob1.h;
    constants[8].i = bottom_blob1.c;
    constants[9].i = bottom_blob1.cstep;
    constants[10].i = top_blob.dims;
    constants[11].i = top_blob.w;
    constants[12].i = top_blob.h;
    constants[13].i = top_blob.c;
    constants[14].i = top_blob.cstep;

    bool broadcast = true;
    if (bottom_blob.dims == bottom_blob1.dims
            && bottom_blob.w == bottom_blob1.w
            && bottom_blob.h == bottom_blob1.h
            && bottom_blob.d == bottom_blob1.d
            && bottom_blob.c == bottom_blob1.c
            && bottom_blob.elempack == bottom_blob1.elempack)
    {
        broadcast = false;
    }

    const Pipeline* pipeline = 0;
    if (broadcast)
    {
        std::vector<vk_constant_type> constants(18);
        constants[0].i = bottom_blob.dims;
        constants[1].i = bottom_blob.w;
        constants[2].i = bottom_blob.h;
        constants[3].i = bottom_blob.d;
        constants[4].i = bottom_blob.c;
        constants[5].i = bottom_blob.cstep;
        constants[6].i = bottom_blob1.dims;
        constants[7].i = bottom_blob1.w;
        constants[8].i = bottom_blob1.h;
        constants[9].i = bottom_blob1.d;
        constants[10].i = bottom_blob1.c;
        constants[11].i = bottom_blob1.cstep;
        constants[12].i = top_blob.dims;
        constants[13].i = top_blob.w;
        constants[14].i = top_blob.h;
        constants[15].i = top_blob.d;
        constants[16].i = top_blob.c;
        constants[17].i = top_blob.cstep;

        std::vector<vk_constant_type> constants_broadcast_a1b1(15);
        constants_broadcast_a1b1[0].i = bottom_blob.dims;
        constants_broadcast_a1b1[1].i = bottom_blob.w;
        constants_broadcast_a1b1[2].i = bottom_blob.h * bottom_blob.d;
        constants_broadcast_a1b1[3].i = bottom_blob.c;
        constants_broadcast_a1b1[4].i = bottom_blob.cstep;
        constants_broadcast_a1b1[5].i = bottom_blob1.dims;
        constants_broadcast_a1b1[6].i = bottom_blob1.w;
        constants_broadcast_a1b1[7].i = bottom_blob1.h * bottom_blob1.d;
        constants_broadcast_a1b1[8].i = bottom_blob1.c;
        constants_broadcast_a1b1[9].i = bottom_blob1.cstep;
        constants_broadcast_a1b1[10].i = top_blob.dims;
        constants_broadcast_a1b1[11].i = top_blob.w;
        constants_broadcast_a1b1[12].i = top_blob.h * top_blob.d;
        constants_broadcast_a1b1[13].i = top_blob.c;
        constants_broadcast_a1b1[14].i = top_blob.cstep;

        bool broadcast_a1b1 = true;

        const Pipeline* pipeline = 0;
        if (bottom_blob.elempack == 1 && bottom_blob1.elempack == 1)
        {
            pipeline = pipeline_binaryop_broadcast;
            broadcast_a1b1 = false;
        }
        else
        {
@@ -400,18 +460,38 @@ int BinaryOp_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector
            else
            {
                pipeline = out_elempack == 8 ? pipeline_binaryop_broadcast_pack8 : pipeline_binaryop_broadcast_pack4;
                broadcast_a1b1 = false;
            }
        }

        cmd.record_pipeline(pipeline, bindings, broadcast_a1b1 ? constants_broadcast_a1b1 : constants, top_blob);
    }
    else
    {
        pipeline = out_elempack == 8 ? pipeline_binaryop_pack8
                   : out_elempack == 4 ? pipeline_binaryop_pack4
                   : pipeline_binaryop;
        std::vector<vk_constant_type> constants(15);
        constants[0].i = bottom_blob.dims;
        constants[1].i = bottom_blob.w;
        constants[2].i = bottom_blob.h * bottom_blob.d;
        constants[3].i = bottom_blob.c;
        constants[4].i = bottom_blob.cstep;
        constants[5].i = bottom_blob1.dims;
        constants[6].i = bottom_blob1.w;
        constants[7].i = bottom_blob1.h * bottom_blob1.d;
        constants[8].i = bottom_blob1.c;
        constants[9].i = bottom_blob1.cstep;
        constants[10].i = top_blob.dims;
        constants[11].i = top_blob.w;
        constants[12].i = top_blob.h * top_blob.d;
        constants[13].i = top_blob.c;
        constants[14].i = top_blob.cstep;

        const Pipeline* pipeline = out_elempack == 8 ? pipeline_binaryop_pack8
                                   : out_elempack == 4 ? pipeline_binaryop_pack4
                                   : pipeline_binaryop;

        cmd.record_pipeline(pipeline, bindings, constants, top_blob);
    }

    cmd.record_pipeline(pipeline, bindings, constants, top_blob);

    return 0;
 }

@@ -427,7 +507,7 @@ int BinaryOp_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, con
    std::vector<vk_constant_type> constants(15);
    constants[10].i = bottom_top_blob.dims;
    constants[11].i = bottom_top_blob.w;
    constants[12].i = bottom_top_blob.h;
    constants[12].i = bottom_top_blob.h * bottom_top_blob.d;
    constants[13].i = bottom_top_blob.c;
    constants[14].i = bottom_top_blob.cstep;

@@ -458,7 +538,7 @@ int BinaryOp_vulkan::forward(const std::vector<VkImageMat>& bottom_blobs, std::v
    }
    else // if (bottom_blob.dims == bottom_blob1.dims)
    {
        if (bottom_blob.w * bottom_blob.h * bottom_blob.c * bottom_blob.elempack >= bottom_blob1.w * bottom_blob1.h * bottom_blob1.c * bottom_blob1.elempack)
        if (bottom_blob.w * bottom_blob.h * bottom_blob.d * bottom_blob.c * bottom_blob.elempack >= bottom_blob1.w * bottom_blob1.h * bottom_blob1.d * bottom_blob1.c * bottom_blob1.elempack)
        {
            top_blob.create_like(bottom_blob, opt.blob_vkallocator);
        }
@@ -477,39 +557,63 @@ int BinaryOp_vulkan::forward(const std::vector<VkImageMat>& bottom_blobs, std::v
    bindings[1] = bottom_blob1;
    bindings[2] = top_blob;

    std::vector<vk_constant_type> constants(15);
    constants[0].i = bottom_blob.dims;
    constants[1].i = bottom_blob.w;
    constants[2].i = bottom_blob.h;
    constants[3].i = bottom_blob.c;
    constants[4].i = 0; //bottom_blob.cstep;
    constants[5].i = bottom_blob1.dims;
    constants[6].i = bottom_blob1.w;
    constants[7].i = bottom_blob1.h;
    constants[8].i = bottom_blob1.c;
    constants[9].i = 0; //bottom_blob1.cstep;
    constants[10].i = top_blob.dims;
    constants[11].i = top_blob.w;
    constants[12].i = top_blob.h;
    constants[13].i = top_blob.c;
    constants[14].i = 0; //top_blob.cstep;

    bool broadcast = true;
    if (bottom_blob.dims == bottom_blob1.dims
            && bottom_blob.w == bottom_blob1.w
            && bottom_blob.h == bottom_blob1.h
            && bottom_blob.d == bottom_blob1.d
            && bottom_blob.c == bottom_blob1.c
            && bottom_blob.elempack == bottom_blob1.elempack)
    {
        broadcast = false;
    }

    const Pipeline* pipeline = 0;
    if (broadcast)
    {
        std::vector<vk_constant_type> constants(18);
        constants[0].i = bottom_blob.dims;
        constants[1].i = bottom_blob.w;
        constants[2].i = bottom_blob.h;
        constants[3].i = bottom_blob.d;
        constants[4].i = bottom_blob.c;
        constants[5].i = 0; //bottom_blob.cstep;
        constants[6].i = bottom_blob1.dims;
        constants[7].i = bottom_blob1.w;
        constants[8].i = bottom_blob1.h;
        constants[9].i = bottom_blob1.d;
        constants[10].i = bottom_blob1.c;
        constants[11].i = 0; //bottom_blob1.cstep;
        constants[12].i = top_blob.dims;
        constants[13].i = top_blob.w;
        constants[14].i = top_blob.h;
        constants[15].i = top_blob.d;
        constants[16].i = top_blob.c;
        constants[17].i = 0; //top_blob.cstep;

        std::vector<vk_constant_type> constants_broadcast_a1b1(15);
        constants_broadcast_a1b1[0].i = bottom_blob.dims;
        constants_broadcast_a1b1[1].i = bottom_blob.w;
        constants_broadcast_a1b1[2].i = bottom_blob.h * bottom_blob.d;
        constants_broadcast_a1b1[3].i = bottom_blob.c;
        constants_broadcast_a1b1[4].i = 0; //bottom_blob.cstep;
        constants_broadcast_a1b1[5].i = bottom_blob1.dims;
        constants_broadcast_a1b1[6].i = bottom_blob1.w;
        constants_broadcast_a1b1[7].i = bottom_blob1.h * bottom_blob1.d;
        constants_broadcast_a1b1[8].i = bottom_blob1.c;
        constants_broadcast_a1b1[9].i = 0; //bottom_blob1.cstep;
        constants_broadcast_a1b1[10].i = top_blob.dims;
        constants_broadcast_a1b1[11].i = top_blob.w;
        constants_broadcast_a1b1[12].i = top_blob.h * top_blob.d;
        constants_broadcast_a1b1[13].i = top_blob.c;
        constants_broadcast_a1b1[14].i = 0; //top_blob.cstep;

        bool broadcast_a1b1 = true;

        const Pipeline* pipeline = 0;
        if (bottom_blob.elempack == 1 && bottom_blob1.elempack == 1)
        {
            pipeline = pipeline_binaryop_broadcast;
            broadcast_a1b1 = false;
        }
        else
        {
@@ -534,18 +638,38 @@ int BinaryOp_vulkan::forward(const std::vector<VkImageMat>& bottom_blobs, std::v
            else
            {
                pipeline = out_elempack == 8 ? pipeline_binaryop_broadcast_pack8 : pipeline_binaryop_broadcast_pack4;
                broadcast_a1b1 = false;
            }
        }

        cmd.record_pipeline(pipeline, bindings, broadcast_a1b1 ? constants_broadcast_a1b1 : constants, top_blob);
    }
    else
    {
        pipeline = out_elempack == 8 ? pipeline_binaryop_pack8
                   : out_elempack == 4 ? pipeline_binaryop_pack4
                   : pipeline_binaryop;
        std::vector<vk_constant_type> constants(15);
        constants[0].i = bottom_blob.dims;
        constants[1].i = bottom_blob.w;
        constants[2].i = bottom_blob.h * bottom_blob.d;
        constants[3].i = bottom_blob.c;
        constants[4].i = 0; //bottom_blob.cstep;
        constants[5].i = bottom_blob1.dims;
        constants[6].i = bottom_blob1.w;
        constants[7].i = bottom_blob1.h * bottom_blob1.d;
        constants[8].i = bottom_blob1.c;
        constants[9].i = 0; //bottom_blob1.cstep;
        constants[10].i = top_blob.dims;
        constants[11].i = top_blob.w;
        constants[12].i = top_blob.h * top_blob.d;
        constants[13].i = top_blob.c;
        constants[14].i = 0; //top_blob.cstep;

        const Pipeline* pipeline = out_elempack == 8 ? pipeline_binaryop_pack8
                                   : out_elempack == 4 ? pipeline_binaryop_pack4
                                   : pipeline_binaryop;

        cmd.record_pipeline(pipeline, bindings, constants, top_blob);
    }

    cmd.record_pipeline(pipeline, bindings, constants, top_blob);

    return 0;
 }

@@ -561,7 +685,7 @@ int BinaryOp_vulkan::forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd
    std::vector<vk_constant_type> constants(15);
    constants[10].i = bottom_top_blob.dims;
    constants[11].i = bottom_top_blob.w;
    constants[12].i = bottom_top_blob.h;
    constants[12].i = bottom_top_blob.h * bottom_top_blob.d;
    constants[13].i = bottom_top_blob.c;
    constants[14].i = 0; //bottom_top_blob.cstep;

--- a/src/layer/vulkan/relu_vulkan.cpp
+++ b/src/layer/vulkan/relu_vulkan.cpp
@@ -84,7 +84,7 @@ int ReLU_vulkan::create_pipeline(const Option& opt)
        local_size_xyz.h = std::min(4, shape_packed.h);
        local_size_xyz.c = std::min(4, shape_packed.c);
    }
    if (shape_packed.dims == 3)
    if (shape_packed.dims == 4)
    {
        local_size_xyz.w = std::min(4, shape_packed.w);
        local_size_xyz.h = std::min(4, shape_packed.h * shape_packed.d);
--- a/src/layer/vulkan/shader/binaryop_broadcast.comp
+++ b/src/layer/vulkan/shader/binaryop_broadcast.comp
@@ -27,20 +27,23 @@ layout (constant_id = 0) const int op_type = 0;
 layout (constant_id = shape_constant_id_offset + 0) const int adims = 0;
 layout (constant_id = shape_constant_id_offset + 1) const int aw = 0;
 layout (constant_id = shape_constant_id_offset + 2) const int ah = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int ac = 0;
 layout (constant_id = shape_constant_id_offset + 4) const int acstep = 0;

 layout (constant_id = shape_constant_id_offset + 5) const int bdims = 0;
 layout (constant_id = shape_constant_id_offset + 6) const int bw = 0;
 layout (constant_id = shape_constant_id_offset + 7) const int bh = 0;
 layout (constant_id = shape_constant_id_offset + 8) const int bc = 0;
 layout (constant_id = shape_constant_id_offset + 9) const int bcstep = 0;

 layout (constant_id = shape_constant_id_offset + 10) const int outdims = 0;
 layout (constant_id = shape_constant_id_offset + 11) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 12) const int outh = 0;
 layout (constant_id = shape_constant_id_offset + 13) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 14) const int outcstep = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int ad = 0;
 layout (constant_id = shape_constant_id_offset + 4) const int ac = 0;
 layout (constant_id = shape_constant_id_offset + 5) const int acstep = 0;

 layout (constant_id = shape_constant_id_offset + 6) const int bdims = 0;
 layout (constant_id = shape_constant_id_offset + 7) const int bw = 0;
 layout (constant_id = shape_constant_id_offset + 8) const int bh = 0;
 layout (constant_id = shape_constant_id_offset + 9) const int bd = 0;
 layout (constant_id = shape_constant_id_offset + 10) const int bc = 0;
 layout (constant_id = shape_constant_id_offset + 11) const int bcstep = 0;

 layout (constant_id = shape_constant_id_offset + 12) const int outdims = 0;
 layout (constant_id = shape_constant_id_offset + 13) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 14) const int outh = 0;
 layout (constant_id = shape_constant_id_offset + 15) const int outd = 0;
 layout (constant_id = shape_constant_id_offset + 16) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 17) const int outcstep = 0;

 #if NCNN_image_shader
 layout (binding = 0) uniform unfp sampler3D a_blob_3d;
@@ -57,18 +60,21 @@ layout (push_constant) uniform parameter
    int adims;
    int aw;
    int ah;
    int ad;
    int ac;
    int acstep;

    int bdims;
    int bw;
    int bh;
    int bd;
    int bc;
    int bcstep;

    int outdims;
    int outw;
    int outh;
    int outd;
    int outc;
    int outcstep;
 } p;
@@ -79,7 +85,7 @@ void main()
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
    if (gx >= psc(outw) || gy >= psc(outh) * psc(outd) || gz >= psc(outc))
        return;

 #if NCNN_image_shader
@@ -90,8 +96,58 @@ void main()
    int by = gy;
    int bz = gz;

    if (psc(adims) == 3)
    if (psc(adims) == 4)
    {
        int yd = gy / psc(outh);
        int yh = gy % psc(outh);

        if (psc(bdims) == 3)
        {
            // type 28
            bx = yh;
            by = yd;
            bz = gz;
        }

        if (psc(bdims) == 2)
        {
            // type 27
            bx = yd;
            by = gz;
            bz = 0;
        }

        if (psc(bdims) == 1)
        {
            if (psc(bw) == 1)
            {
                // type 25
                bx = 0;
                by = 0;
                bz = 0;
            }
            else
            {
                // type 26
                bx = gz;
                by = 0;
                bz = 0;
            }
        }
    }
    else if (psc(adims) == 3)
    {
        if (psc(bdims) == 4)
        {
            int yd = gy / psc(outh);
            int yh = gy % psc(outh);

            // type 23
            ax = yh;
            ay = yd;
            az = gz;
        }

        if (psc(bdims) == 3)
        {
            if (psc(bw) == 1 && psc(bh) == 1)
@@ -173,6 +229,17 @@ void main()
    }
    else if (psc(adims) == 2)
    {
        if (psc(bdims) == 4)
        {
            int yd = gy / psc(outh);
            int yh = gy % psc(outh);

            // type 22
            ax = yd;
            ay = gz;
            az = 0;
        }

        if (psc(bdims) == 3)
        {
            // type 14
@@ -203,13 +270,21 @@ void main()
    {
        if (psc(aw) == 1)
        {
            // type 2 3 4
            // type 2 3 4 20
            ax = 0;
            ay = 0;
            az = 0;
        }
        else
        {
            if (psc(bdims) == 4)
            {
                // type 21
                ax = gz;
                ay = 0;
                az = 0;
            }

            if (psc(bdims) == 3)
            {
                // type 9
@@ -247,8 +322,53 @@ void main()
    int ai;
    int bi;

    if (psc(adims) == 3)
    if (psc(adims) == 4)
    {
        int yd = gy / psc(outh);
        int yh = gy % psc(outh);

        if (psc(bdims) == 3)
        {
            // type 28
            ai = gi;
            bi = gz * psc(bcstep) + yd * psc(bw) + yh;
        }

        if (psc(bdims) == 2)
        {
            // type 27
            ai = gi;
            bi = gz * psc(bw) + yd;
        }

        if (psc(bdims) == 1)
        {
            if (psc(bw) == 1)
            {
                // type 25
                ai = gi;
                bi = 0;
            }
            else
            {
                // type 26
                ai = gi;
                bi = gz;
            }
        }
    }
    else if (psc(adims) == 3)
    {
        if (psc(bdims) == 4)
        {
            int yd = gy / psc(outh);
            int yh = gy % psc(outh);

            // type 23
            ai = gz * psc(acstep) + yd * psc(aw) + yh;
            bi = gi;
        }

        if (psc(bdims) == 3)
        {
            if (psc(bw) == 1 && psc(bh) == 1)
@@ -333,6 +453,16 @@ void main()
    }
    else if (psc(adims) == 2)
    {
        if (psc(bdims) == 4)
        {
            int yd = gy / psc(outh);
            int yh = gy % psc(outh);

            // type 22
            ai = gz * psc(aw) + yd;
            bi = gi;
        }

        if (psc(bdims) == 3)
        {
            // type 14
@@ -360,12 +490,19 @@ void main()
    {
        if (psc(aw) == 1)
        {
            // type 2 3 4
            // type 2 3 4 20
            ai = 0;
            bi = gi;
        }
        else
        {
            if (psc(bdims) == 4)
            {
                // type 21
                ai = gz;
                bi = gi;
            }

            if (psc(bdims) == 3)
            {
                // type 9
--- a/src/layer/vulkan/shader/binaryop_broadcast_pack4.comp
+++ b/src/layer/vulkan/shader/binaryop_broadcast_pack4.comp
@@ -27,20 +27,23 @@ layout (constant_id = 0) const int op_type = 0;
 layout (constant_id = shape_constant_id_offset + 0) const int adims = 0;
 layout (constant_id = shape_constant_id_offset + 1) const int aw = 0;
 layout (constant_id = shape_constant_id_offset + 2) const int ah = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int ac = 0;
 layout (constant_id = shape_constant_id_offset + 4) const int acstep = 0;

 layout (constant_id = shape_constant_id_offset + 5) const int bdims = 0;
 layout (constant_id = shape_constant_id_offset + 6) const int bw = 0;
 layout (constant_id = shape_constant_id_offset + 7) const int bh = 0;
 layout (constant_id = shape_constant_id_offset + 8) const int bc = 0;
 layout (constant_id = shape_constant_id_offset + 9) const int bcstep = 0;

 layout (constant_id = shape_constant_id_offset + 10) const int outdims = 0;
 layout (constant_id = shape_constant_id_offset + 11) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 12) const int outh = 0;
 layout (constant_id = shape_constant_id_offset + 13) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 14) const int outcstep = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int ad = 0;
 layout (constant_id = shape_constant_id_offset + 4) const int ac = 0;
 layout (constant_id = shape_constant_id_offset + 5) const int acstep = 0;

 layout (constant_id = shape_constant_id_offset + 6) const int bdims = 0;
 layout (constant_id = shape_constant_id_offset + 7) const int bw = 0;
 layout (constant_id = shape_constant_id_offset + 8) const int bh = 0;
 layout (constant_id = shape_constant_id_offset + 9) const int bd = 0;
 layout (constant_id = shape_constant_id_offset + 10) const int bc = 0;
 layout (constant_id = shape_constant_id_offset + 11) const int bcstep = 0;

 layout (constant_id = shape_constant_id_offset + 12) const int outdims = 0;
 layout (constant_id = shape_constant_id_offset + 13) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 14) const int outh = 0;
 layout (constant_id = shape_constant_id_offset + 15) const int outd = 0;
 layout (constant_id = shape_constant_id_offset + 16) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 17) const int outcstep = 0;

 #if NCNN_image_shader
 layout (binding = 0) uniform unfp sampler3D a_blob_3d;
@@ -57,18 +60,21 @@ layout (push_constant) uniform parameter
    int adims;
    int aw;
    int ah;
    int ad;
    int ac;
    int acstep;

    int bdims;
    int bw;
    int bh;
    int bd;
    int bc;
    int bcstep;

    int outdims;
    int outw;
    int outh;
    int outd;
    int outc;
    int outcstep;
 } p;
@@ -79,7 +85,7 @@ void main()
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
    if (gx >= psc(outw) || gy >= psc(outh) * psc(outd) || gz >= psc(outc))
        return;

 #if NCNN_image_shader
@@ -90,8 +96,58 @@ void main()
    int by = gy;
    int bz = gz;

    if (psc(adims) == 3)
    if (psc(adims) == 4)
    {
        int yd = gy / psc(outh);
        int yh = gy % psc(outh);

        if (psc(bdims) == 3)
        {
            // type 28
            bx = yh;
            by = yd;
            bz = gz;
        }

        if (psc(bdims) == 2)
        {
            // type 27
            bx = yd;
            by = gz;
            bz = 0;
        }

        if (psc(bdims) == 1)
        {
            if (psc(bw) == 1)
            {
                // type 25
                bx = 0;
                by = 0;
                bz = 0;
            }
            else
            {
                // type 26
                bx = gz;
                by = 0;
                bz = 0;
            }
        }
    }
    else if (psc(adims) == 3)
    {
        if (psc(bdims) == 4)
        {
            int yd = gy / psc(outh);
            int yh = gy % psc(outh);

            // type 23
            ax = yh;
            ay = yd;
            az = gz;
        }

        if (psc(bdims) == 3)
        {
            if (psc(bw) == 1 && psc(bh) == 1)
@@ -173,6 +229,17 @@ void main()
    }
    else if (psc(adims) == 2)
    {
        if (psc(bdims) == 4)
        {
            int yd = gy / psc(outh);
            int yh = gy % psc(outh);

            // type 22
            ax = yd;
            ay = gz;
            az = 0;
        }

        if (psc(bdims) == 3)
        {
            // type 14
@@ -203,13 +270,21 @@ void main()
    {
        if (psc(aw) == 1)
        {
            // type 2 3 4
            // type 2 3 4 20
            ax = 0;
            ay = 0;
            az = 0;
        }
        else
        {
            if (psc(bdims) == 4)
            {
                // type 21
                ax = gz;
                ay = 0;
                az = 0;
            }

            if (psc(bdims) == 3)
            {
                // type 9
@@ -247,8 +322,53 @@ void main()
    int ai;
    int bi;

    if (psc(adims) == 3)
    if (psc(adims) == 4)
    {
        int yd = gy / psc(outh);
        int yh = gy % psc(outh);

        if (psc(bdims) == 3)
        {
            // type 28
            ai = gi;
            bi = gz * psc(bcstep) + yd * psc(bw) + yh;
        }

        if (psc(bdims) == 2)
        {
            // type 27
            ai = gi;
            bi = gz * psc(bw) + yd;
        }

        if (psc(bdims) == 1)
        {
            if (psc(bw) == 1)
            {
                // type 25
                ai = gi;
                bi = 0;
            }
            else
            {
                // type 26
                ai = gi;
                bi = gz;
            }
        }
    }
    else if (psc(adims) == 3)
    {
        if (psc(bdims) == 4)
        {
            int yd = gy / psc(outh);
            int yh = gy % psc(outh);

            // type 23
            ai = gz * psc(acstep) + yd * psc(aw) + yh;
            bi = gi;
        }

        if (psc(bdims) == 3)
        {
            if (psc(bw) == 1 && psc(bh) == 1)
@@ -310,6 +430,16 @@ void main()
    }
    else if (psc(adims) == 2)
    {
        if (psc(bdims) == 4)
        {
            int yd = gy / psc(outh);
            int yh = gy % psc(outh);

            // type 22
            ai = gz * psc(aw) + yd;
            bi = gi;
        }

        if (psc(bdims) == 3)
        {
            // type 14
@@ -326,6 +456,13 @@ void main()
    }
    else if (psc(adims) == 1)
    {
        if (psc(bdims) == 4)
        {
            // type 21
            ai = gz;
            bi = gi;
        }

        if (psc(bdims) == 3)
        {
            // type 9
--- a/src/layer/vulkan/shader/binaryop_broadcast_pack8.comp
+++ b/src/layer/vulkan/shader/binaryop_broadcast_pack8.comp
@@ -28,20 +28,23 @@ layout (constant_id = 0) const int op_type = 0;
 layout (constant_id = shape_constant_id_offset + 0) const int adims = 0;
 layout (constant_id = shape_constant_id_offset + 1) const int aw = 0;
 layout (constant_id = shape_constant_id_offset + 2) const int ah = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int ac = 0;
 layout (constant_id = shape_constant_id_offset + 4) const int acstep = 0;

 layout (constant_id = shape_constant_id_offset + 5) const int bdims = 0;
 layout (constant_id = shape_constant_id_offset + 6) const int bw = 0;
 layout (constant_id = shape_constant_id_offset + 7) const int bh = 0;
 layout (constant_id = shape_constant_id_offset + 8) const int bc = 0;
 layout (constant_id = shape_constant_id_offset + 9) const int bcstep = 0;

 layout (constant_id = shape_constant_id_offset + 10) const int outdims = 0;
 layout (constant_id = shape_constant_id_offset + 11) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 12) const int outh = 0;
 layout (constant_id = shape_constant_id_offset + 13) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 14) const int outcstep = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int ad = 0;
 layout (constant_id = shape_constant_id_offset + 4) const int ac = 0;
 layout (constant_id = shape_constant_id_offset + 5) const int acstep = 0;

 layout (constant_id = shape_constant_id_offset + 6) const int bdims = 0;
 layout (constant_id = shape_constant_id_offset + 7) const int bw = 0;
 layout (constant_id = shape_constant_id_offset + 8) const int bh = 0;
 layout (constant_id = shape_constant_id_offset + 9) const int bd = 0;
 layout (constant_id = shape_constant_id_offset + 10) const int bc = 0;
 layout (constant_id = shape_constant_id_offset + 11) const int bcstep = 0;

 layout (constant_id = shape_constant_id_offset + 12) const int outdims = 0;
 layout (constant_id = shape_constant_id_offset + 13) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 14) const int outh = 0;
 layout (constant_id = shape_constant_id_offset + 15) const int outd = 0;
 layout (constant_id = shape_constant_id_offset + 16) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 17) const int outcstep = 0;

 #if NCNN_image_shader
 layout (binding = 0) uniform unfp sampler3D a_blob_3d;
@@ -58,18 +61,21 @@ layout (push_constant) uniform parameter
    int adims;
    int aw;
    int ah;
    int ad;
    int ac;
    int acstep;

    int bdims;
    int bw;
    int bh;
    int bd;
    int bc;
    int bcstep;

    int outdims;
    int outw;
    int outh;
    int outd;
    int outc;
    int outcstep;
 } p;
@@ -80,7 +86,7 @@ void main()
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
    if (gx >= psc(outw) || gy >= psc(outh) * psc(outd) || gz >= psc(outc))
        return;

 #if NCNN_image_shader
@@ -91,8 +97,58 @@ void main()
    int by = gy;
    int bz = gz;

    if (psc(adims) == 3)
    if (psc(adims) == 4)
    {
        int yd = gy / psc(outh);
        int yh = gy % psc(outh);

        if (psc(bdims) == 3)
        {
            // type 28
            bx = yh;
            by = yd;
            bz = gz;
        }

        if (psc(bdims) == 2)
        {
            // type 27
            bx = yd;
            by = gz;
            bz = 0;
        }

        if (psc(bdims) == 1)
        {
            if (psc(bw) == 1)
            {
                // type 25
                bx = 0;
                by = 0;
                bz = 0;
            }
            else
            {
                // type 26
                bx = gz;
                by = 0;
                bz = 0;
            }
        }
    }
    else if (psc(adims) == 3)
    {
        if (psc(bdims) == 4)
        {
            int yd = gy / psc(outh);
            int yh = gy % psc(outh);

            // type 23
            ax = yh;
            ay = yd;
            az = gz;
        }

        if (psc(bdims) == 3)
        {
            if (psc(bw) == 1 && psc(bh) == 1)
@@ -174,6 +230,17 @@ void main()
    }
    else if (psc(adims) == 2)
    {
        if (psc(bdims) == 4)
        {
            int yd = gy / psc(outh);
            int yh = gy % psc(outh);

            // type 22
            ax = yd;
            ay = gz;
            az = 0;
        }

        if (psc(bdims) == 3)
        {
            // type 14
@@ -204,13 +271,21 @@ void main()
    {
        if (psc(aw) == 1)
        {
            // type 2 3 4
            // type 2 3 4 20
            ax = 0;
            ay = 0;
            az = 0;
        }
        else
        {
            if (psc(bdims) == 4)
            {
                // type 21
                ax = gz;
                ay = 0;
                az = 0;
            }

            if (psc(bdims) == 3)
            {
                // type 9
@@ -248,8 +323,53 @@ void main()
    int ai;
    int bi;

    if (psc(adims) == 3)
    if (psc(adims) == 4)
    {
        int yd = gy / psc(outh);
        int yh = gy % psc(outh);

        if (psc(bdims) == 3)
        {
            // type 28
            ai = gi;
            bi = gz * psc(bcstep) + yd * psc(bw) + yh;
        }

        if (psc(bdims) == 2)
        {
            // type 27
            ai = gi;
            bi = gz * psc(bw) + yd;
        }

        if (psc(bdims) == 1)
        {
            if (psc(bw) == 1)
            {
                // type 25
                ai = gi;
                bi = 0;
            }
            else
            {
                // type 26
                ai = gi;
                bi = gz;
            }
        }
    }
    else if (psc(adims) == 3)
    {
        if (psc(bdims) == 4)
        {
            int yd = gy / psc(outh);
            int yh = gy % psc(outh);

            // type 23
            ai = gz * psc(acstep) + yd * psc(aw) + yh;
            bi = gi;
        }

        if (psc(bdims) == 3)
        {
            if (psc(bw) == 1 && psc(bh) == 1)
@@ -311,6 +431,16 @@ void main()
    }
    else if (psc(adims) == 2)
    {
        if (psc(bdims) == 4)
        {
            int yd = gy / psc(outh);
            int yh = gy % psc(outh);

            // type 22
            ai = gz * psc(aw) + yd;
            bi = gi;
        }

        if (psc(bdims) == 3)
        {
            // type 14
@@ -327,6 +457,13 @@ void main()
    }
    else if (psc(adims) == 1)
    {
        if (psc(bdims) == 4)
        {
            // type 21
            ai = gz;
            bi = gi;
        }

        if (psc(bdims) == 3)
        {
            // type 9
--- a/src/layer/vulkan/unaryop_vulkan.cpp
+++ b/src/layer/vulkan/unaryop_vulkan.cpp
@@ -35,7 +35,7 @@ int UnaryOp_vulkan::create_pipeline(const Option& opt)
    int elempack = 1;
    if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1;
    if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1;
    if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;
    if (shape.dims == 3 || shape.dims == 4) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;

    size_t elemsize;
    if (opt.use_fp16_storage)
@@ -55,12 +55,13 @@ int UnaryOp_vulkan::create_pipeline(const Option& opt)
    if (shape.dims == 1) shape_packed = Mat(shape.w / elempack, (void*)0, elemsize, elempack);
    if (shape.dims == 2) shape_packed = Mat(shape.w, shape.h / elempack, (void*)0, elemsize, elempack);
    if (shape.dims == 3) shape_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack);
    if (shape.dims == 4) shape_packed = Mat(shape.w, shape.h, shape.d, shape.c / elempack, (void*)0, elemsize, elempack);

    std::vector<vk_specialization_type> specializations(1 + 5);
    specializations[0].i = op_type;
    specializations[1 + 0].i = shape_packed.dims;
    specializations[1 + 1].i = shape_packed.w;
    specializations[1 + 2].i = shape_packed.h;
    specializations[1 + 2].i = shape_packed.h * shape_packed.d;
    specializations[1 + 3].i = shape_packed.c;
    specializations[1 + 4].i = shape_packed.cstep;

@@ -83,6 +84,12 @@ int UnaryOp_vulkan::create_pipeline(const Option& opt)
        local_size_xyz.h = std::min(4, shape_packed.h);
        local_size_xyz.c = std::min(4, shape_packed.c);
    }
    if (shape_packed.dims == 4)
    {
        local_size_xyz.w = std::min(4, shape_packed.w);
        local_size_xyz.h = std::min(4, shape_packed.h * shape_packed.d);
        local_size_xyz.c = std::min(4, shape_packed.c);
    }

    // pack1
    if (shape.dims == 0 || elempack == 1)
@@ -135,7 +142,7 @@ int UnaryOp_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, cons
    std::vector<vk_constant_type> constants(5);
    constants[0].i = bottom_top_blob.dims;
    constants[1].i = bottom_top_blob.w;
    constants[2].i = bottom_top_blob.h;
    constants[2].i = bottom_top_blob.h * bottom_top_blob.d;
    constants[3].i = bottom_top_blob.c;
    constants[4].i = bottom_top_blob.cstep;

@@ -159,7 +166,7 @@ int UnaryOp_vulkan::forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd,
    std::vector<vk_constant_type> constants(5);
    constants[0].i = bottom_top_blob.dims;
    constants[1].i = bottom_top_blob.w;
    constants[2].i = bottom_top_blob.h;
    constants[2].i = bottom_top_blob.h * bottom_top_blob.d;
    constants[3].i = bottom_top_blob.c;
    constants[4].i = 0; //bottom_top_blob.cstep;

--- a/src/layer/x86/binaryop_x86.cpp
+++ b/src/layer/x86/binaryop_x86.cpp
@@ -44,20 +44,203 @@ static int binary_op_pack8(const Mat& a, const Mat& b, Mat& c, const Option& opt

    int w = a.w;
    int h = a.h;
    int d = a.d;
    int channels = a.c;
    int size = w * h;
    int size = w * h * d;
    size_t elemsize = a.elemsize;
    int elempack = a.elempack;

    int w1 = b.w;
    int h1 = b.h;
    int d1 = b.d;
    int channels1 = b.c;
    int size1 = w1 * h1;
    int size1 = w1 * h1 * d1;
    size_t elemsize1 = b.elemsize;
    int elempack1 = b.elempack;

    if (a.dims == 3)
    if (a.dims == 4)
    {
        if (b.dims == 4)
        {
            // type 29
            c.create(w, h, d, channels, elemsize, elempack, opt.blob_allocator);
            if (c.empty())
                return -100;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = a.channel(q);
                const float* ptr1 = b.channel(q);
                float* outptr = c.channel(q);

                for (int i = 0; i < size; i++)
                {
                    __m256 _p = _mm256_loadu_ps(ptr);
                    __m256 _p1 = _mm256_loadu_ps(ptr1);
                    __m256 _outp = op(_p, _p1);
                    _mm256_storeu_ps(outptr, _outp);
                    ptr += 8;
                    ptr1 += 8;
                    outptr += 8;
                }
            }

            return 0;
        }

        c.create(w, h, d, channels, elemsize, elempack, opt.blob_allocator);
        if (c.empty())
            return -100;

        if (b.dims == 3)
        {
            // type 28
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = a.channel(q);
                const float* ptr1 = b.channel(q);
                float* outptr = c.channel(q);

                for (int z = 0; z < d; z++)
                {
                    for (int y = 0; y < h; y++)
                    {
                        __m256 _b0 = _mm256_loadu_ps(ptr1);
                        for (int x = 0; x < w; x++)
                        {
                            __m256 _p = _mm256_loadu_ps(ptr);
                            __m256 _outp = op(_p, _b0);
                            _mm256_storeu_ps(outptr, _outp);
                            ptr += 8;
                            outptr += 8;
                        }

                        ptr1 += 8;
                    }
                }
            }

            return 0;
        }

        if (b.dims == 2)
        {
            // type 27
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = a.channel(q);
                const float* ptr1 = b.row(q);
                float* outptr = c.channel(q);

                for (int z = 0; z < d; z++)
                {
                    __m256 _b0 = _mm256_loadu_ps(ptr1);
                    for (int y = 0; y < h; y++)
                    {
                        for (int x = 0; x < w; x++)
                        {
                            __m256 _p = _mm256_loadu_ps(ptr);
                            __m256 _outp = op(_p, _b0);
                            _mm256_storeu_ps(outptr, _outp);
                            ptr += 8;
                            outptr += 8;
                        }
                    }

                    ptr1 += 8;
                }
            }

            return 0;
        }

        if (b.dims == 1)
        {
            if (b.w == 1 && elempack1 == 1)
            {
                // type 25
                __m256 _b0 = _mm256_set1_ps(b[0]);
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const float* ptr = a.channel(q);
                    float* outptr = c.channel(q);

                    for (int i = 0; i < size; i++)
                    {
                        __m256 _p = _mm256_loadu_ps(ptr);
                        __m256 _outp = op(_p, _b0);
                        _mm256_storeu_ps(outptr, _outp);
                        ptr += 8;
                        outptr += 8;
                    }
                }

                return 0;
            }

            // type 26
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = a.channel(q);
                __m256 _b0 = _mm256_loadu_ps((const float*)b + q * 8);
                float* outptr = c.channel(q);

                for (int i = 0; i < size; i++)
                {
                    __m256 _p = _mm256_loadu_ps(ptr);
                    __m256 _outp = op(_p, _b0);
                    _mm256_storeu_ps(outptr, _outp);
                    ptr += 8;
                    outptr += 8;
                }
            }

            return 0;
        }
    }
    else if (a.dims == 3)
    {
        if (b.dims == 4)
        {
            // type 23
            c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator);
            if (c.empty())
                return -100;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels1; q++)
            {
                const float* ptr = a.channel(q);
                const float* ptr1 = b.channel(q);
                float* outptr = c.channel(q);

                for (int z = 0; z < d1; z++)
                {
                    for (int y = 0; y < h1; y++)
                    {
                        __m256 _a0 = _mm256_loadu_ps(ptr);
                        for (int x = 0; x < w1; x++)
                        {
                            __m256 _p = _mm256_loadu_ps(ptr1);
                            __m256 _outp = op(_a0, _p);
                            _mm256_storeu_ps(outptr, _outp);
                            ptr1 += 8;
                            outptr += 8;
                        }

                        ptr += 8;
                    }
                }
            }

            return 0;
        }

        if (b.dims == 3)
        {
            if (w1 == 1 && h1 == 1 && channels1 == channels)
@@ -406,6 +589,42 @@ static int binary_op_pack8(const Mat& a, const Mat& b, Mat& c, const Option& opt
    }
    else if (a.dims == 2)
    {
        if (b.dims == 4)
        {
            // type 22
            c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator);
            if (c.empty())
                return -100;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels1; q++)
            {
                const float* ptr = a.row(q);
                const float* ptr1 = b.channel(q);
                float* outptr = c.channel(q);

                for (int z = 0; z < d1; z++)
                {
                    __m256 _a0 = _mm256_loadu_ps(ptr);
                    for (int y = 0; y < h1; y++)
                    {
                        for (int x = 0; x < w1; x++)
                        {
                            __m256 _p = _mm256_loadu_ps(ptr1);
                            __m256 _outp = op(_a0, _p);
                            _mm256_storeu_ps(outptr, _outp);
                            ptr1 += 8;
                            outptr += 8;
                        }
                    }

                    ptr += 8;
                }
            }

            return 0;
        }

        if (b.dims == 3)
        {
            // type 14
@@ -514,6 +733,33 @@ static int binary_op_pack8(const Mat& a, const Mat& b, Mat& c, const Option& opt
    {
        if (a.w == 1 && elempack == 1)
        {
            if (b.dims == 4)
            {
                // type 20
                c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator);
                if (c.empty())
                    return -100;

                __m256 _a0 = _mm256_set1_ps(a[0]);
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels1; q++)
                {
                    const float* ptr1 = b.channel(q);
                    float* outptr = c.channel(q);

                    for (int i = 0; i < size1; i++)
                    {
                        __m256 _p1 = _mm256_loadu_ps(ptr1);
                        __m256 _outp = op(_a0, _p1);
                        _mm256_storeu_ps(outptr, _outp);
                        ptr1 += 8;
                        outptr += 8;
                    }
                }

                return 0;
            }

            if (b.dims == 3)
            {
                // type 4
@@ -586,6 +832,33 @@ static int binary_op_pack8(const Mat& a, const Mat& b, Mat& c, const Option& opt
            }
        }

        if (b.dims == 4)
        {
            // type 21
            c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator);
            if (c.empty())
                return -100;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels1; q++)
            {
                __m256 _a0 = _mm256_loadu_ps((const float*)a + q * 8);
                const float* ptr1 = b.channel(q);
                float* outptr = c.channel(q);

                for (int i = 0; i < size1; i++)
                {
                    __m256 _p1 = _mm256_loadu_ps(ptr1);
                    __m256 _outp = op(_a0, _p1);
                    _mm256_storeu_ps(outptr, _outp);
                    ptr1 += 8;
                    outptr += 8;
                }
            }

            return 0;
        }

        if (b.dims == 3)
        {
            // type 9
@@ -693,8 +966,9 @@ static int binary_op_scalar_inplace_pack8(Mat& a, float b, const Option& opt)

    int w = a.w;
    int h = a.h;
    int d = a.d;
    int channels = a.c;
    int size = w * h;
    int size = w * h * d;

    __m256 _b = _mm256_set1_ps(b);

@@ -795,20 +1069,203 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt

    int w = a.w;
    int h = a.h;
    int d = a.d;
    int channels = a.c;
    int size = w * h;
    int size = w * h * d;
    size_t elemsize = a.elemsize;
    int elempack = a.elempack;

    int w1 = b.w;
    int h1 = b.h;
    int d1 = b.d;
    int channels1 = b.c;
    int size1 = w1 * h1;
    int size1 = w1 * h1 * d1;
    size_t elemsize1 = b.elemsize;
    int elempack1 = b.elempack;

    if (a.dims == 3)
    if (a.dims == 4)
    {
        if (b.dims == 4)
        {
            // type 29
            c.create(w, h, d, channels, elemsize, elempack, opt.blob_allocator);
            if (c.empty())
                return -100;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = a.channel(q);
                const float* ptr1 = b.channel(q);
                float* outptr = c.channel(q);

                for (int i = 0; i < size; i++)
                {
                    __m128 _p = _mm_loadu_ps(ptr);
                    __m128 _p1 = _mm_loadu_ps(ptr1);
                    __m128 _outp = op(_p, _p1);
                    _mm_storeu_ps(outptr, _outp);
                    ptr += 4;
                    ptr1 += 4;
                    outptr += 4;
                }
            }

            return 0;
        }

        c.create(w, h, d, channels, elemsize, elempack, opt.blob_allocator);
        if (c.empty())
            return -100;

        if (b.dims == 3)
        {
            // type 28
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = a.channel(q);
                const float* ptr1 = b.channel(q);
                float* outptr = c.channel(q);

                for (int z = 0; z < d; z++)
                {
                    for (int y = 0; y < h; y++)
                    {
                        __m128 _b0 = _mm_loadu_ps(ptr1);
                        for (int x = 0; x < w; x++)
                        {
                            __m128 _p = _mm_loadu_ps(ptr);
                            __m128 _outp = op(_p, _b0);
                            _mm_storeu_ps(outptr, _outp);
                            ptr += 4;
                            outptr += 4;
                        }

                        ptr1 += 4;
                    }
                }
            }

            return 0;
        }

        if (b.dims == 2)
        {
            // type 27
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = a.channel(q);
                const float* ptr1 = b.row(q);
                float* outptr = c.channel(q);

                for (int z = 0; z < d; z++)
                {
                    __m128 _b0 = _mm_loadu_ps(ptr1);
                    for (int y = 0; y < h; y++)
                    {
                        for (int x = 0; x < w; x++)
                        {
                            __m128 _p = _mm_loadu_ps(ptr);
                            __m128 _outp = op(_p, _b0);
                            _mm_storeu_ps(outptr, _outp);
                            ptr += 4;
                            outptr += 4;
                        }
                    }

                    ptr1 += 4;
                }
            }

            return 0;
        }

        if (b.dims == 1)
        {
            if (b.w == 1 && elempack1 == 1)
            {
                // type 25
                __m128 _b0 = _mm_set1_ps(b[0]);
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const float* ptr = a.channel(q);
                    float* outptr = c.channel(q);

                    for (int i = 0; i < size; i++)
                    {
                        __m128 _p = _mm_loadu_ps(ptr);
                        __m128 _outp = op(_p, _b0);
                        _mm_storeu_ps(outptr, _outp);
                        ptr += 4;
                        outptr += 4;
                    }
                }

                return 0;
            }

            // type 26
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = a.channel(q);
                __m128 _b0 = _mm_loadu_ps((const float*)b + q * 4);
                float* outptr = c.channel(q);

                for (int i = 0; i < size; i++)
                {
                    __m128 _p = _mm_loadu_ps(ptr);
                    __m128 _outp = op(_p, _b0);
                    _mm_storeu_ps(outptr, _outp);
                    ptr += 4;
                    outptr += 4;
                }
            }

            return 0;
        }
    }
    else if (a.dims == 3)
    {
        if (b.dims == 4)
        {
            // type 23
            c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator);
            if (c.empty())
                return -100;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels1; q++)
            {
                const float* ptr = a.channel(q);
                const float* ptr1 = b.channel(q);
                float* outptr = c.channel(q);

                for (int z = 0; z < d1; z++)
                {
                    for (int y = 0; y < h1; y++)
                    {
                        __m128 _a0 = _mm_loadu_ps(ptr);
                        for (int x = 0; x < w1; x++)
                        {
                            __m128 _p = _mm_loadu_ps(ptr1);
                            __m128 _outp = op(_a0, _p);
                            _mm_storeu_ps(outptr, _outp);
                            ptr1 += 4;
                            outptr += 4;
                        }

                        ptr += 4;
                    }
                }
            }

            return 0;
        }

        if (b.dims == 3)
        {
            if (w1 == 1 && h1 == 1 && channels1 == channels)
@@ -1114,7 +1571,7 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
            if (b.w == 1 && elempack1 == 1)
            {
                // type 16
                __m128 _b0 = _mm_set1_ps(((const float*)b)[0]);
                __m128 _b0 = _mm_set1_ps(b[0]);
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
@@ -1157,6 +1614,42 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
    }
    else if (a.dims == 2)
    {
        if (b.dims == 4)
        {
            // type 22
            c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator);
            if (c.empty())
                return -100;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels1; q++)
            {
                const float* ptr = a.row(q);
                const float* ptr1 = b.channel(q);
                float* outptr = c.channel(q);

                for (int z = 0; z < d1; z++)
                {
                    __m128 _a0 = _mm_loadu_ps(ptr);
                    for (int y = 0; y < h1; y++)
                    {
                        for (int x = 0; x < w1; x++)
                        {
                            __m128 _p = _mm_loadu_ps(ptr1);
                            __m128 _outp = op(_a0, _p);
                            _mm_storeu_ps(outptr, _outp);
                            ptr1 += 4;
                            outptr += 4;
                        }
                    }

                    ptr += 4;
                }
            }

            return 0;
        }

        if (b.dims == 3)
        {
            // type 14
@@ -1223,7 +1716,7 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
            if (b.w == 1 && elempack1 == 1)
            {
                // type 11
                __m128 _b0 = _mm_set1_ps(((const float*)b)[0]);
                __m128 _b0 = _mm_set1_ps(b[0]);
                const float* ptr = a;
                float* outptr = c;
                for (int i = 0; i < size; i++)
@@ -1265,6 +1758,33 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
    {
        if (a.w == 1 && elempack == 1)
        {
            if (b.dims == 4)
            {
                // type 20
                c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator);
                if (c.empty())
                    return -100;

                __m128 _a0 = _mm_set1_ps(a[0]);
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels1; q++)
                {
                    const float* ptr1 = b.channel(q);
                    float* outptr = c.channel(q);

                    for (int i = 0; i < size1; i++)
                    {
                        __m128 _p1 = _mm_loadu_ps(ptr1);
                        __m128 _outp = op(_a0, _p1);
                        _mm_storeu_ps(outptr, _outp);
                        ptr1 += 4;
                        outptr += 4;
                    }
                }

                return 0;
            }

            if (b.dims == 3)
            {
                // type 4
@@ -1272,7 +1792,7 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
                if (c.empty())
                    return -100;

                __m128 _a0 = _mm_set1_ps(((const float*)a)[0]);
                __m128 _a0 = _mm_set1_ps(a[0]);
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels1; q++)
                {
@@ -1299,7 +1819,7 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
                if (c.empty())
                    return -100;

                __m128 _a0 = _mm_set1_ps(((const float*)a)[0]);
                __m128 _a0 = _mm_set1_ps(a[0]);
                const float* ptr1 = b;
                float* outptr = c;
                for (int i = 0; i < size1; i++)
@@ -1321,7 +1841,7 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
                if (c.empty())
                    return -100;

                __m128 _a0 = _mm_set1_ps(((const float*)a)[0]);
                __m128 _a0 = _mm_set1_ps(a[0]);
                const float* ptr1 = b;
                float* outptr = c;
                for (int i = 0; i < w1; i++)
@@ -1337,6 +1857,33 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
            }
        }

        if (b.dims == 4)
        {
            // type 21
            c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator);
            if (c.empty())
                return -100;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels1; q++)
            {
                __m128 _a0 = _mm_loadu_ps((const float*)a + q * 4);
                const float* ptr1 = b.channel(q);
                float* outptr = c.channel(q);

                for (int i = 0; i < size1; i++)
                {
                    __m128 _p1 = _mm_loadu_ps(ptr1);
                    __m128 _outp = op(_a0, _p1);
                    _mm_storeu_ps(outptr, _outp);
                    ptr1 += 4;
                    outptr += 4;
                }
            }

            return 0;
        }

        if (b.dims == 3)
        {
            // type 9
@@ -1402,7 +1949,7 @@ static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt
            if (b.w == 1 && elempack1 == 1)
            {
                // type 6
                __m128 _b0 = _mm_set1_ps(((const float*)b)[0]);
                __m128 _b0 = _mm_set1_ps(b[0]);
                const float* ptr = a;
                float* outptr = c;
                for (int i = 0; i < w; i++)
@@ -1444,8 +1991,9 @@ static int binary_op_scalar_inplace_pack4(Mat& a, float b, const Option& opt)

    int w = a.w;
    int h = a.h;
    int d = a.d;
    int channels = a.c;
    int size = w * h;
    int size = w * h * d;

    __m128 _b = _mm_set1_ps((float)b);

@@ -1574,10 +2122,10 @@ int BinaryOp_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>
            return binary_op_pack8<binary_op_pow_pack8>(bottom_blob, bottom_blob1, top_blob, opt);

        if (op_type == Operation_RSUB)
            return binary_op_pack8<binary_op_rsub_pack8>(bottom_blob, bottom_blob1, top_blob, opt);
            return binary_op_pack8<binary_op_sub_pack8>(bottom_blob1, bottom_blob, top_blob, opt);

        if (op_type == Operation_RDIV)
            return binary_op_pack8<binary_op_rdiv_pack8>(bottom_blob, bottom_blob1, top_blob, opt);
            return binary_op_pack8<binary_op_div_pack8>(bottom_blob1, bottom_blob, top_blob, opt);
    }
 #endif // __AVX__

@@ -1605,10 +2153,10 @@ int BinaryOp_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>
            return binary_op_pack4<binary_op_pow_pack4>(bottom_blob, bottom_blob1, top_blob, opt);

        if (op_type == Operation_RSUB)
            return binary_op_pack4<binary_op_rsub_pack4>(bottom_blob, bottom_blob1, top_blob, opt);
            return binary_op_pack4<binary_op_sub_pack4>(bottom_blob1, bottom_blob, top_blob, opt);

        if (op_type == Operation_RDIV)
            return binary_op_pack4<binary_op_rdiv_pack4>(bottom_blob, bottom_blob1, top_blob, opt);
            return binary_op_pack4<binary_op_div_pack4>(bottom_blob1, bottom_blob, top_blob, opt);
    }
 #endif // __SSE2__

--- a/src/net.cpp
+++ b/src/net.cpp
@@ -797,7 +797,7 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio
        int elemcount = 0;
        if (dims == 1) elemcount = bottom_blob.elempack * bottom_blob.w;
        if (dims == 2) elemcount = bottom_blob.elempack * bottom_blob.h;
        if (dims == 3) elemcount = bottom_blob.elempack * bottom_blob.c;
        if (dims == 3 || dims == 4) elemcount = bottom_blob.elempack * bottom_blob.c;

        int elembits = bottom_blob.elembits();

--- a/tests/test_binaryop.cpp
+++ b/tests/test_binaryop.cpp
@@ -50,7 +50,7 @@ static int test_binaryop(const ncnn::Mat& _a, const ncnn::Mat& _b)
    int ret = test_layer<ncnn::BinaryOp>("BinaryOp", pd, weights, ab);
    if (ret != 0)
    {
        fprintf(stderr, "test_binaryop failed a.dims=%d a=(%d %d %d) b.dims=%d b=(%d %d %d) op_type=%d\n", a.dims, a.w, a.h, a.c, b.dims, b.w, b.h, b.c, op_type);
        fprintf(stderr, "test_binaryop failed a.dims=%d a=(%d %d %d %d) b.dims=%d b=(%d %d %d %d) op_type=%d\n", a.dims, a.w, a.h, a.d, a.c, b.dims, b.w, b.h, b.d, b.c, op_type);
    }

    return ret;
@@ -76,7 +76,7 @@ static int test_binaryop(const ncnn::Mat& _a, float b)
    int ret = test_layer<ncnn::BinaryOp>("BinaryOp", pd, weights, a);
    if (ret != 0)
    {
        fprintf(stderr, "test_binaryop failed a.dims=%d a=(%d %d %d) b=%f op_type=%d\n", a.dims, a.w, a.h, a.c, b, op_type);
        fprintf(stderr, "test_binaryop failed a.dims=%d a=(%d %d %d %d) b=%f op_type=%d\n", a.dims, a.w, a.h, a.d, a.c, b, op_type);
    }

    return ret;
@@ -234,6 +234,86 @@ static int test_binaryop_19()
           || test_binaryop(RandomMat(11, 6, 16), RandomMat(11, 6, 16));
 }

 static int test_binaryop_20()
 {
    return 0
           || test_binaryop(RandomMat(1), RandomMat(11, 3, 4, 2))
           || test_binaryop(RandomMat(1), RandomMat(11, 3, 4, 4))
           || test_binaryop(RandomMat(1), RandomMat(11, 3, 4, 16));
 }

 static int test_binaryop_21()
 {
    return 0
           || test_binaryop(RandomMat(2), RandomMat(11, 3, 4, 2))
           || test_binaryop(RandomMat(4), RandomMat(11, 3, 4, 4))
           || test_binaryop(RandomMat(16), RandomMat(11, 3, 4, 16));
 }

 static int test_binaryop_22()
 {
    return 0
           || test_binaryop(RandomMat(4, 2), RandomMat(11, 3, 4, 2))
           || test_binaryop(RandomMat(4, 4), RandomMat(11, 3, 4, 4))
           || test_binaryop(RandomMat(4, 16), RandomMat(11, 3, 4, 16));
 }

 static int test_binaryop_23()
 {
    return 0
           || test_binaryop(RandomMat(3, 4, 2), RandomMat(11, 3, 4, 2))
           || test_binaryop(RandomMat(3, 4, 4), RandomMat(11, 3, 4, 4))
           || test_binaryop(RandomMat(3, 4, 16), RandomMat(11, 3, 4, 16));
 }

 static int test_binaryop_24()
 {
    return 0
           || test_binaryop(RandomMat(11, 3, 4, 2), 1.f)
           || test_binaryop(RandomMat(11, 3, 4, 4), 1.f)
           || test_binaryop(RandomMat(11, 3, 4, 16), 1.f);
 }

 static int test_binaryop_25()
 {
    return 0
           || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(1))
           || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(1))
           || test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(1));
 }

 static int test_binaryop_26()
 {
    return 0
           || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(2))
           || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(4))
           || test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(16));
 }

 static int test_binaryop_27()
 {
    return 0
           || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(4, 2))
           || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(4, 4))
           || test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(4, 16));
 }

 static int test_binaryop_28()
 {
    return 0
           || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(3, 4, 2))
           || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(3, 4, 4))
           || test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(3, 4, 16));
 }

 static int test_binaryop_29()
 {
    return 0
           || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(11, 3, 4, 2))
           || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(11, 3, 4, 4))
           || test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(11, 3, 4, 16));
 }

 static int test_binaryop_s1()
 {
    return 0
@@ -324,6 +404,16 @@ int main()
                  || test_binaryop_17()
                  || test_binaryop_18()
                  || test_binaryop_19()
                  || test_binaryop_20()
                  || test_binaryop_21()
                  || test_binaryop_22()
                  || test_binaryop_23()
                  || test_binaryop_24()
                  || test_binaryop_25()
                  || test_binaryop_26()
                  || test_binaryop_27()
                  || test_binaryop_28()
                  || test_binaryop_29()
                  || test_binaryop_s1()
                  || test_binaryop_s2()
                  || test_binaryop_s3()
--- a/tests/test_unaryop.cpp
+++ b/tests/test_unaryop.cpp
@@ -49,13 +49,21 @@ static int test_unaryop(const ncnn::Mat& _a)
    int ret = test_layer<ncnn::UnaryOp>("UnaryOp", pd, weights, a);
    if (ret != 0)
    {
        fprintf(stderr, "test_unaryop failed a.dims=%d a=(%d %d %d) op_type=%d\n", a.dims, a.w, a.h, a.c, op_type);
        fprintf(stderr, "test_unaryop failed a.dims=%d a=(%d %d %d %d) op_type=%d\n", a.dims, a.w, a.h, a.d, a.c, op_type);
    }

    return ret;
 }

 static int test_unaryop_0()
 {
    return 0
           || test_unaryop(RandomMat(11, 3, 2, 16))
           || test_unaryop(RandomMat(10, 2, 2, 12))
           || test_unaryop(RandomMat(6, 1, 5, 13));
 }

 static int test_unaryop_1()
 {
    return 0
           || test_unaryop(RandomMat(11, 7, 16))
@@ -63,7 +71,7 @@ static int test_unaryop_0()
           || test_unaryop(RandomMat(6, 5, 13));
 }

 static int test_unaryop_1()
 static int test_unaryop_2()
 {
    return 0
           || test_unaryop(RandomMat(12, 16))
@@ -71,7 +79,7 @@ static int test_unaryop_1()
           || test_unaryop(RandomMat(14, 15));
 }

 static int test_unaryop_2()
 static int test_unaryop_3()
 {
    return 0
           || test_unaryop(RandomMat(128))
@@ -88,7 +96,8 @@ int main()
        int ret = 0
                  || test_unaryop_0()
                  || test_unaryop_1()
                  || test_unaryop_2();
                  || test_unaryop_2()
                  || test_unaryop_3();

        if (ret != 0)
            return ret;
--- a/tests/testutil.h
+++ b/tests/testutil.h
@@ -457,7 +457,7 @@ int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<n
            int elemcount = 0;
            if (dims == 1) elemcount = a4[i].elempack * a4[i].w;
            if (dims == 2) elemcount = a4[i].elempack * a4[i].h;
            if (dims == 3) elemcount = a4[i].elempack * a4[i].c;
            if (dims == 3 || dims == 4) elemcount = a4[i].elempack * a4[i].c;

            int elembits = a4[i].elembits();

@@ -882,7 +882,7 @@ int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<n
        int elemcount = 0;
        if (dims == 1) elemcount = a4.elempack * a4.w;
        if (dims == 2) elemcount = a4.elempack * a4.h;
        if (dims == 3) elemcount = a4.elempack * a4.c;
        if (dims == 3 || dims == 4) elemcount = a4.elempack * a4.c;

        int elembits = a4.elembits();

--- a/toolchains/c906-v222.toolchain.cmake
+++ b/toolchains/c906-v222.toolchain.cmake
@@ -22,8 +22,8 @@ set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
 set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
 set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)

 set(CMAKE_C_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static")
 set(CMAKE_CXX_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static")
 set(CMAKE_C_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -static")
 set(CMAKE_CXX_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -static")

 # cache flags
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags")
--- a/toolchains/c906-v223.toolchain.cmake
+++ b/toolchains/c906-v223.toolchain.cmake
@@ -22,8 +22,8 @@ set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
 set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
 set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)

 set(CMAKE_C_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static")
 set(CMAKE_CXX_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static")
 set(CMAKE_C_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -static")
 set(CMAKE_CXX_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -static")

 # replace vfredsum_vs* with vfredusum_vs*
 add_definitions(-Dvfredsum_vs_f32m1_f32m1=vfredusum_vs_f32m1_f32m1)
--- a/toolchains/c906.toolchain.cmake
+++ b/toolchains/c906.toolchain.cmake
@@ -22,8 +22,8 @@ set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
 set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
 set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)

 set(CMAKE_C_FLAGS "-march=rv64gcvxtheadc -mabi=lp64d -mtune=c906 -DRVV_SPEC_0_7 -D__riscv_zfh=1 -static")
 set(CMAKE_CXX_FLAGS "-march=rv64gcvxtheadc -mabi=lp64d -mtune=c906 -DRVV_SPEC_0_7 -D__riscv_zfh=1 -static")
 set(CMAKE_C_FLAGS "-march=rv64gcvxtheadc -mabi=lp64d -mtune=c906 -DRVV_SPEC_0_7 -D__riscv_zfh=1 -DC906=1 -static")
 set(CMAKE_CXX_FLAGS "-march=rv64gcvxtheadc -mabi=lp64d -mtune=c906 -DRVV_SPEC_0_7 -D__riscv_zfh=1 -DC906=1 -static")

 # cache flags
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags")