From b1870434f3497608cb796d478d110f63e129fcda Mon Sep 17 00:00:00 2001
From: xfan1024 <33223182+xfan1024@users.noreply.github.com>
Date: Tue, 15 Apr 2025 10:42:57 +0800
Subject: [PATCH] eltwise riscv rvv optimization (#5992)

---
 src/layer/riscv/eltwise_riscv.cpp     | 342 ++++++++++++++++++++++++++
 src/layer/riscv/eltwise_riscv.h       |  37 +++
 src/layer/riscv/eltwise_riscv_zfh.cpp | 313 +++++++++++++++++++++++
 3 files changed, 692 insertions(+)
 create mode 100644 src/layer/riscv/eltwise_riscv.cpp
 create mode 100644 src/layer/riscv/eltwise_riscv.h
 create mode 100644 src/layer/riscv/eltwise_riscv_zfh.cpp
diff --git a/src/layer/riscv/eltwise_riscv.cpp b/src/layer/riscv/eltwise_riscv.cpp
new file mode 100644
index 000000000..aa04c935b
--- /dev/null
+++ b/src/layer/riscv/eltwise_riscv.cpp
@@ -0,0 +1,342 @@
+//
+// Copyright (C) 2025 xiaofan <xiaofan@iscas.ac.cn>. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this
+// file except in compliance with the License. You may obtain a copy of the
+// License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations under
+// the License.
+
+#include "eltwise_riscv.h"
+
+#if __riscv_vector
+#include <riscv_vector.h>
+#include "rvv_mathfun.h"
+#endif // __riscv_vector
+
+#include "cpu.h"
+
+namespace ncnn {
+
+Eltwise_riscv::Eltwise_riscv()
+{
+#if __riscv_vector
+    support_packing = true;
+#endif
+#if NCNN_ZFH
+#if __riscv_vector
+    support_fp16_storage = cpu_support_riscv_zvfh();
+#else
+    support_fp16_storage = cpu_support_riscv_zfh();
+#endif
+#endif
+}
+
+int Eltwise_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_top_blob = bottom_blobs[0];
+#if NCNN_ZFH
+    int elembits = bottom_top_blob.elembits();
+    if (opt.use_fp16_storage && elembits == 16)
+    {
+        return forward_fp16s(bottom_blobs, top_blobs, opt);
+    }
+#endif
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int d = bottom_top_blob.d;
+    int channels = bottom_top_blob.c;
+    int elempack = bottom_top_blob.elempack;
+    int size = w * h * d * elempack;
+
+    Mat& top_blob = top_blobs[0];
+    top_blob.create_like(bottom_top_blob, opt.blob_allocator);
+
+    if (op_type == Operation_PROD)
+    {
+        // top_blob = bottom_top_blob * bottom_blobs[1]
+        const Mat& bottom_blob1 = bottom_blobs[1];
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const float* ptr = bottom_top_blob.channel(q);
+            const float* ptr1 = bottom_blob1.channel(q);
+            float* outptr = top_blob.channel(q);
+#if __riscv_vector
+            int n = size;
+            while (n > 0)
+            {
+                size_t vl = __riscv_vsetvl_e32m8(n);
+
+                vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
+                vfloat32m8_t _p1 = __riscv_vle32_v_f32m8(ptr1, vl);
+                _p = __riscv_vfmul_vv_f32m8(_p, _p1, vl);
+                __riscv_vse32_v_f32m8(outptr, _p, vl);
+
+                ptr += vl;
+                ptr1 += vl;
+                outptr += vl;
+                n -= vl;
+            }
+#else
+            for (int i = 0; i < size; i++)
+            {
+                outptr[i] = ptr[i] * ptr1[i];
+            }
+#endif
+        }
+
+        // top_blob *= bottom_blobs[i] for i = 2, 3, ...
+        for (size_t b = 2; b < bottom_blobs.size(); b++)
+        {
+            const Mat& bottom_blob1 = bottom_blobs[b];
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const float* ptr = bottom_blob1.channel(q);
+                float* outptr = top_blob.channel(q);
+#if __riscv_vector
+                int n = size;
+                while (n > 0)
+                {
+                    size_t vl = __riscv_vsetvl_e32m8(n);
+
+                    vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
+                    vfloat32m8_t _p1 = __riscv_vle32_v_f32m8(outptr, vl);
+                    _p1 = __riscv_vfmul_vv_f32m8(_p1, _p, vl);
+                    __riscv_vse32_v_f32m8(outptr, _p1, vl);
+
+                    ptr += vl;
+                    outptr += vl;
+                    n -= vl;
+                }
+#else
+                for (int i = 0; i < size; i++)
+                {
+                    outptr[i] *= ptr[i];
+                }
+#endif
+            }
+        }
+    }
+    if (op_type == Operation_SUM)
+    {
+        if (coeffs.empty())
+        {
+            // top_blob = bottom_top_blob + bottom_blobs[1]
+            const Mat& bottom_blob1 = bottom_blobs[1];
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const float* ptr = bottom_top_blob.channel(q);
+                const float* ptr1 = bottom_blob1.channel(q);
+                float* outptr = top_blob.channel(q);
+#if __riscv_vector
+                int n = size;
+                while (n > 0)
+                {
+                    size_t vl = __riscv_vsetvl_e32m8(n);
+
+                    vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
+                    vfloat32m8_t _p1 = __riscv_vle32_v_f32m8(ptr1, vl);
+                    _p = __riscv_vfadd_vv_f32m8(_p, _p1, vl);
+                    __riscv_vse32_v_f32m8(outptr, _p, vl);
+
+                    ptr += vl;
+                    ptr1 += vl;
+                    outptr += vl;
+                    n -= vl;
+                }
+#else
+                for (int i = 0; i < size; i++)
+                {
+                    outptr[i] = ptr[i] + ptr1[i];
+                }
+#endif
+            }
+
+            // top_blob += bottom_blobs[i] for i = 2, 3, ...
+            for (size_t b = 2; b < bottom_blobs.size(); b++)
+            {
+                const Mat& bottom_blob1 = bottom_blobs[b];
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    const float* ptr = bottom_blob1.channel(q);
+                    float* outptr = top_blob.channel(q);
+#if __riscv_vector
+                    int n = size;
+                    while (n > 0)
+                    {
+                        size_t vl = __riscv_vsetvl_e32m8(n);
+
+                        vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
+                        vfloat32m8_t _p1 = __riscv_vle32_v_f32m8(outptr, vl);
+                        _p1 = __riscv_vfadd_vv_f32m8(_p1, _p, vl);
+                        __riscv_vse32_v_f32m8(outptr, _p1, vl);
+
+                        ptr += vl;
+                        outptr += vl;
+                        n -= vl;
+                    }
+#else
+                    for (int i = 0; i < size; i++)
+                    {
+                        outptr[i] += ptr[i];
+                    }
+#endif
+                }
+            }
+        }
+        else
+        {
+            // top_blob = bottom_top_blob * coeffs[0] + bottom_blobs[1] * coeffs[1]
+            const Mat& bottom_blob1 = bottom_blobs[1];
+            float coeff0 = coeffs[0];
+            float coeff1 = coeffs[1];
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const float* ptr = bottom_top_blob.channel(q);
+                const float* ptr1 = bottom_blob1.channel(q);
+                float* outptr = top_blob.channel(q);
+#if __riscv_vector
+                int n = size;
+                while (n > 0)
+                {
+                    size_t vl = __riscv_vsetvl_e32m8(n);
+
+                    vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
+                    vfloat32m8_t _p1 = __riscv_vle32_v_f32m8(ptr1, vl);
+                    _p = __riscv_vfmul_vf_f32m8(_p, coeff0, vl);
+                    _p1 = __riscv_vfmul_vf_f32m8(_p1, coeff1, vl);
+                    _p = __riscv_vfadd_vv_f32m8(_p, _p1, vl);
+                    __riscv_vse32_v_f32m8(outptr, _p, vl);
+
+                    ptr += vl;
+                    ptr1 += vl;
+                    outptr += vl;
+                    n -= vl;
+                }
+#else
+                for (int i = 0; i < size; i++)
+                {
+                    outptr[i] = ptr[i] * coeff0 + ptr1[i] * coeff1;
+                }
+#endif
+            }
+
+            // top_blob += bottom_blobs[i] * coeffs[i] for i = 2, 3, ...
+            for (size_t b = 2; b < bottom_blobs.size(); b++)
+            {
+                const Mat& bottom_blob1 = bottom_blobs[b];
+                float coeff = coeffs[b];
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    const float* ptr = bottom_blob1.channel(q);
+                    float* outptr = top_blob.channel(q);
+#if __riscv_vector
+                    int n = size;
+                    while (n > 0)
+                    {
+                        size_t vl = __riscv_vsetvl_e32m8(n);
+
+                        vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
+                        vfloat32m8_t _p1 = __riscv_vle32_v_f32m8(outptr, vl);
+                        _p = __riscv_vfmul_vf_f32m8(_p, coeff, vl);
+                        _p1 = __riscv_vfadd_vv_f32m8(_p1, _p, vl);
+                        __riscv_vse32_v_f32m8(outptr, _p1, vl);
+
+                        ptr += vl;
+                        outptr += vl;
+                        n -= vl;
+                    }
+#else
+                    for (int i = 0; i < size; i++)
+                    {
+                        outptr[i] += ptr[i] * coeff;
+                    }
+#endif
+                }
+            }
+        }
+    }
+    if (op_type == Operation_MAX)
+    {
+        // top_blob = max(bottom_top_blob, bottom_blobs[1])
+        const Mat& bottom_blob1 = bottom_blobs[1];
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const float* ptr = bottom_top_blob.channel(q);
+            const float* ptr1 = bottom_blob1.channel(q);
+            float* outptr = top_blob.channel(q);
+#if __riscv_vector
+            int n = size;
+            while (n > 0)
+            {
+                size_t vl = __riscv_vsetvl_e32m8(n);
+
+                vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
+                vfloat32m8_t _p1 = __riscv_vle32_v_f32m8(ptr1, vl);
+                _p = __riscv_vfmax_vv_f32m8(_p, _p1, vl);
+                __riscv_vse32_v_f32m8(outptr, _p, vl);
+
+                ptr += vl;
+                ptr1 += vl;
+                outptr += vl;
+                n -= vl;
+            }
+#else
+            for (int i = 0; i < size; i++)
+            {
+                outptr[i] = std::max(ptr[i], ptr1[i]);
+            }
+#endif
+        }
+
+        // top_blob = max(top_blob, bottom_blobs[i]) for i = 2, 3, ...
+        for (size_t b = 2; b < bottom_blobs.size(); b++)
+        {
+            const Mat& bottom_blob1 = bottom_blobs[b];
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const float* ptr = bottom_blob1.channel(q);
+                float* outptr = top_blob.channel(q);
+#if __riscv_vector
+                int n = size;
+                while (n > 0)
+                {
+                    size_t vl = __riscv_vsetvl_e32m8(n);
+
+                    vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
+                    vfloat32m8_t _p1 = __riscv_vle32_v_f32m8(outptr, vl);
+                    _p1 = __riscv_vfmax_vv_f32m8(_p1, _p, vl);
+                    __riscv_vse32_v_f32m8(outptr, _p1, vl);
+
+                    ptr += vl;
+                    outptr += vl;
+                    n -= vl;
+                }
+#else
+                for (int i = 0; i < size; i++)
+                {
+                    outptr[i] = std::max(outptr[i], ptr[i]);
+                }
+#endif
+            }
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/riscv/eltwise_riscv.h b/src/layer/riscv/eltwise_riscv.h
new file mode 100644
index 000000000..10c158a13
--- /dev/null
+++ b/src/layer/riscv/eltwise_riscv.h
@@ -0,0 +1,37 @@
+//
+// Copyright (C) 2025 xiaofan <xiaofan@iscas.ac.cn>. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this
+// file except in compliance with the License. You may obtain a copy of the
+// License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations under
+// the License.
+
+#ifndef LAYER_ELTWISE_RISCV_H
+#define LAYER_ELTWISE_RISCV_H
+
+#include "eltwise.h"
+
+namespace ncnn {
+
+class Eltwise_riscv : public Eltwise
+{
+public:
+    Eltwise_riscv();
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+
+protected:
+#if NCNN_ZFH
+    int forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+#endif
+};
+
+} // namespace ncnn
+
+#endif // LAYER_ELTWISE_RISCV_H
diff --git a/src/layer/riscv/eltwise_riscv_zfh.cpp b/src/layer/riscv/eltwise_riscv_zfh.cpp
new file mode 100644
index 000000000..8ff4c8570
--- /dev/null
+++ b/src/layer/riscv/eltwise_riscv_zfh.cpp
@@ -0,0 +1,313 @@
+//
+// Copyright (C) 2025 xiaofan <xiaofan@iscas.ac.cn>. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this
+// file except in compliance with the License. You may obtain a copy of the
+// License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations under
+// the License.
+
+#include "eltwise_riscv.h"
+
+#if __riscv_vector
+#include <riscv_vector.h>
+#endif // __riscv_vector
+
+namespace ncnn {
+
+#if NCNN_ZFH
+int Eltwise_riscv::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_top_blob = bottom_blobs[0];
+
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int d = bottom_top_blob.d;
+    int channels = bottom_top_blob.c;
+    int elempack = bottom_top_blob.elempack;
+    int size = w * h * d * elempack;
+
+    Mat& top_blob = top_blobs[0];
+    top_blob.create_like(bottom_top_blob, opt.blob_allocator);
+
+    if (op_type == Operation_PROD)
+    {
+        const Mat& bottom_blob1 = bottom_blobs[1];
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const __fp16* ptr = bottom_top_blob.channel(q);
+            const __fp16* ptr1 = bottom_blob1.channel(q);
+            __fp16* outptr = top_blob.channel(q);
+#if __riscv_zvfh
+            int n = size;
+            while (n > 0)
+            {
+                size_t vl = __riscv_vsetvl_e16m8(n);
+
+                vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl);
+                vfloat16m8_t _p1 = __riscv_vle16_v_f16m8(ptr1, vl);
+                _p = __riscv_vfmul_vv_f16m8(_p, _p1, vl);
+                __riscv_vse16_v_f16m8(outptr, _p, vl);
+
+                ptr += vl;
+                ptr1 += vl;
+                outptr += vl;
+                n -= vl;
+            }
+#else
+            for (int i = 0; i < size; i++)
+            {
+                outptr[i] = ptr[i] * ptr1[i];
+            }
+#endif
+        }
+
+        for (size_t b = 2; b < bottom_blobs.size(); b++)
+        {
+            const Mat& bottom_blob1 = bottom_blobs[b];
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const __fp16* ptr = bottom_blob1.channel(q);
+                __fp16* outptr = top_blob.channel(q);
+#if __riscv_zvfh
+                int n = size;
+                while (n > 0)
+                {
+                    size_t vl = __riscv_vsetvl_e16m8(n);
+
+                    vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl);
+                    vfloat16m8_t _p1 = __riscv_vle16_v_f16m8(outptr, vl);
+                    _p1 = __riscv_vfmul_vv_f16m8(_p1, _p, vl);
+                    __riscv_vse16_v_f16m8(outptr, _p1, vl);
+
+                    ptr += vl;
+                    outptr += vl;
+                    n -= vl;
+                }
+#else
+                for (int i = 0; i < size; i++)
+                {
+                    outptr[i] *= ptr[i];
+                }
+#endif
+            }
+        }
+    }
+    if (op_type == Operation_SUM)
+    {
+        if (coeffs.empty())
+        {
+            const Mat& bottom_blob1 = bottom_blobs[1];
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const __fp16* ptr = bottom_top_blob.channel(q);
+                const __fp16* ptr1 = bottom_blob1.channel(q);
+                __fp16* outptr = top_blob.channel(q);
+#if __riscv_zvfh
+                int n = size;
+                while (n > 0)
+                {
+                    size_t vl = __riscv_vsetvl_e16m8(n);
+
+                    vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl);
+                    vfloat16m8_t _p1 = __riscv_vle16_v_f16m8(ptr1, vl);
+                    _p = __riscv_vfadd_vv_f16m8(_p, _p1, vl);
+                    __riscv_vse16_v_f16m8(outptr, _p, vl);
+
+                    ptr += vl;
+                    ptr1 += vl;
+                    outptr += vl;
+                    n -= vl;
+                }
+#else
+                for (int i = 0; i < size; i++)
+                {
+                    outptr[i] = ptr[i] + ptr1[i];
+                }
+#endif
+            }
+
+            for (size_t b = 2; b < bottom_blobs.size(); b++)
+            {
+                const Mat& bottom_blob1 = bottom_blobs[b];
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    const __fp16* ptr = bottom_blob1.channel(q);
+                    __fp16* outptr = top_blob.channel(q);
+#if __riscv_zvfh
+                    int n = size;
+                    while (n > 0)
+                    {
+                        size_t vl = __riscv_vsetvl_e16m8(n);
+
+                        vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl);
+                        vfloat16m8_t _p1 = __riscv_vle16_v_f16m8(outptr, vl);
+                        _p1 = __riscv_vfadd_vv_f16m8(_p1, _p, vl);
+                        __riscv_vse16_v_f16m8(outptr, _p1, vl);
+
+                        ptr += vl;
+                        outptr += vl;
+                        n -= vl;
+                    }
+#else
+                    for (int i = 0; i < size; i++)
+                    {
+                        outptr[i] += ptr[i];
+                    }
+#endif
+                }
+            }
+        }
+        else
+        {
+            const Mat& bottom_blob1 = bottom_blobs[1];
+            __fp16 coeff0 = (__fp16)coeffs[0];
+            __fp16 coeff1 = (__fp16)coeffs[1];
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const __fp16* ptr = bottom_top_blob.channel(q);
+                const __fp16* ptr1 = bottom_blob1.channel(q);
+                __fp16* outptr = top_blob.channel(q);
+#if __riscv_zvfh
+                int n = size;
+                while (n > 0)
+                {
+                    size_t vl = __riscv_vsetvl_e16m8(n);
+
+                    vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl);
+                    vfloat16m8_t _p1 = __riscv_vle16_v_f16m8(ptr1, vl);
+                    _p = __riscv_vfmul_vf_f16m8(_p, coeff0, vl);
+                    _p1 = __riscv_vfmul_vf_f16m8(_p1, coeff1, vl);
+                    _p = __riscv_vfadd_vv_f16m8(_p, _p1, vl);
+                    __riscv_vse16_v_f16m8(outptr, _p, vl);
+
+                    ptr += vl;
+                    ptr1 += vl;
+                    outptr += vl;
+                    n -= vl;
+                }
+#else
+                for (int i = 0; i < size; i++)
+                {
+                    outptr[i] = ptr[i] * coeff0 + ptr1[i] * coeff1;
+                }
+#endif
+            }
+
+            for (size_t b = 2; b < bottom_blobs.size(); b++)
+            {
+                const Mat& bottom_blob1 = bottom_blobs[b];
+                __fp16 coeff = (__fp16)coeffs[b];
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    const __fp16* ptr = bottom_blob1.channel(q);
+                    __fp16* outptr = top_blob.channel(q);
+#if __riscv_zvfh
+                    int n = size;
+                    while (n > 0)
+                    {
+                        size_t vl = __riscv_vsetvl_e16m8(n);
+
+                        vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl);
+                        vfloat16m8_t _p1 = __riscv_vle16_v_f16m8(outptr, vl);
+                        _p = __riscv_vfmul_vf_f16m8(_p, coeff, vl);
+                        _p1 = __riscv_vfadd_vv_f16m8(_p1, _p, vl);
+                        __riscv_vse16_v_f16m8(outptr, _p1, vl);
+
+                        ptr += vl;
+                        outptr += vl;
+                        n -= vl;
+                    }
+#else
+                    for (int i = 0; i < size; i++)
+                    {
+                        outptr[i] += ptr[i] * coeff;
+                    }
+#endif
+                }
+            }
+        }
+    }
+    if (op_type == Operation_MAX)
+    {
+        const Mat& bottom_blob1 = bottom_blobs[1];
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const __fp16* ptr = bottom_top_blob.channel(q);
+            const __fp16* ptr1 = bottom_blob1.channel(q);
+            __fp16* outptr = top_blob.channel(q);
+#if __riscv_zvfh
+            int n = size;
+            while (n > 0)
+            {
+                size_t vl = __riscv_vsetvl_e16m8(n);
+
+                vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl);
+                vfloat16m8_t _p1 = __riscv_vle16_v_f16m8(ptr1, vl);
+                _p = __riscv_vfmax_vv_f16m8(_p, _p1, vl);
+                __riscv_vse16_v_f16m8(outptr, _p, vl);
+
+                ptr += vl;
+                ptr1 += vl;
+                outptr += vl;
+                n -= vl;
+            }
+#else
+            for (int i = 0; i < size; i++)
+            {
+                outptr[i] = std::max(ptr[i], ptr1[i]);
+            }
+#endif
+        }
+
+        for (size_t b = 2; b < bottom_blobs.size(); b++)
+        {
+            const Mat& bottom_blob1 = bottom_blobs[b];
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const __fp16* ptr = bottom_blob1.channel(q);
+                __fp16* outptr = top_blob.channel(q);
+#if __riscv_zvfh
+                int n = size;
+                while (n > 0)
+                {
+                    size_t vl = __riscv_vsetvl_e16m8(n);
+
+                    vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl);
+                    vfloat16m8_t _p1 = __riscv_vle16_v_f16m8(outptr, vl);
+                    _p1 = __riscv_vfmax_vv_f16m8(_p1, _p, vl);
+                    __riscv_vse16_v_f16m8(outptr, _p1, vl);
+
+                    ptr += vl;
+                    outptr += vl;
+                    n -= vl;
+                }
+#else
+                for (int i = 0; i < size; i++)
+                {
+                    outptr[i] = std::max(outptr[i], ptr[i]);
+                }
+#endif
+            }
+        }
+    }
+
+    return 0;
+}
+#endif // NCNN_ZFH
+
+} // namespace ncnn