From b1870434f3497608cb796d478d110f63e129fcda Mon Sep 17 00:00:00 2001 From: xfan1024 <33223182+xfan1024@users.noreply.github.com> Date: Tue, 15 Apr 2025 10:42:57 +0800 Subject: [PATCH] eltwise riscv rvv optimization (#5992) --- src/layer/riscv/eltwise_riscv.cpp | 342 ++++++++++++++++++++++++++ src/layer/riscv/eltwise_riscv.h | 37 +++ src/layer/riscv/eltwise_riscv_zfh.cpp | 313 +++++++++++++++++++++++ 3 files changed, 692 insertions(+) create mode 100644 src/layer/riscv/eltwise_riscv.cpp create mode 100644 src/layer/riscv/eltwise_riscv.h create mode 100644 src/layer/riscv/eltwise_riscv_zfh.cpp diff --git a/src/layer/riscv/eltwise_riscv.cpp b/src/layer/riscv/eltwise_riscv.cpp new file mode 100644 index 000000000..aa04c935b --- /dev/null +++ b/src/layer/riscv/eltwise_riscv.cpp @@ -0,0 +1,342 @@ +// +// Copyright (C) 2025 xiaofan . All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of the +// License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations under +// the License. + +#include "eltwise_riscv.h" + +#if __riscv_vector +#include +#include "rvv_mathfun.h" +#endif // __riscv_vector + +#include "cpu.h" + +namespace ncnn { + +Eltwise_riscv::Eltwise_riscv() +{ +#if __riscv_vector + support_packing = true; +#endif +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif +#endif +} + +int Eltwise_riscv::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_top_blob = bottom_blobs[0]; +#if NCNN_ZFH + int elembits = bottom_top_blob.elembits(); + if (opt.use_fp16_storage && elembits == 16) + { + return forward_fp16s(bottom_blobs, top_blobs, opt); + } +#endif + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + Mat& top_blob = top_blobs[0]; + top_blob.create_like(bottom_top_blob, opt.blob_allocator); + + if (op_type == Operation_PROD) + { + // top_blob = bottom_top_blob * bottom_blobs[1] + const Mat& bottom_blob1 = bottom_blobs[1]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_top_blob.channel(q); + const float* ptr1 = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); +#if __riscv_vector + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e32m8(n); + + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + vfloat32m8_t _p1 = __riscv_vle32_v_f32m8(ptr1, vl); + _p = __riscv_vfmul_vv_f32m8(_p, _p1, vl); + __riscv_vse32_v_f32m8(outptr, _p, vl); + + ptr += vl; + ptr1 += vl; + outptr += vl; + n -= vl; + } +#else + for (int i = 0; i < size; i++) + { + outptr[i] = ptr[i] * ptr1[i]; + } +#endif + } + + // top_blob *= bottom_blobs[i] for i = 2, 3, ... + for (size_t b = 2; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob1 = bottom_blobs[b]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); +#if __riscv_vector + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e32m8(n); + + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + vfloat32m8_t _p1 = __riscv_vle32_v_f32m8(outptr, vl); + _p1 = __riscv_vfmul_vv_f32m8(_p1, _p, vl); + __riscv_vse32_v_f32m8(outptr, _p1, vl); + + ptr += vl; + outptr += vl; + n -= vl; + } +#else + for (int i = 0; i < size; i++) + { + outptr[i] *= ptr[i]; + } +#endif + } + } + } + if (op_type == Operation_SUM) + { + if (coeffs.empty()) + { + // top_blob = bottom_top_blob + bottom_blobs[1] + const Mat& bottom_blob1 = bottom_blobs[1]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_top_blob.channel(q); + const float* ptr1 = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); +#if __riscv_vector + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e32m8(n); + + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + vfloat32m8_t _p1 = __riscv_vle32_v_f32m8(ptr1, vl); + _p = __riscv_vfadd_vv_f32m8(_p, _p1, vl); + __riscv_vse32_v_f32m8(outptr, _p, vl); + + ptr += vl; + ptr1 += vl; + outptr += vl; + n -= vl; + } +#else + for (int i = 0; i < size; i++) + { + outptr[i] = ptr[i] + ptr1[i]; + } +#endif + } + + // top_blob += bottom_blobs[i] for i = 2, 3, ... + for (size_t b = 2; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob1 = bottom_blobs[b]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); +#if __riscv_vector + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e32m8(n); + + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + vfloat32m8_t _p1 = __riscv_vle32_v_f32m8(outptr, vl); + _p1 = __riscv_vfadd_vv_f32m8(_p1, _p, vl); + __riscv_vse32_v_f32m8(outptr, _p1, vl); + + ptr += vl; + outptr += vl; + n -= vl; + } +#else + for (int i = 0; i < size; i++) + { + outptr[i] += ptr[i]; + } +#endif + } + } + } + else + { + // top_blob = bottom_top_blob * coeffs[0] + bottom_blobs[1] * coeffs[1] + const Mat& bottom_blob1 = bottom_blobs[1]; + float coeff0 = coeffs[0]; + float coeff1 = coeffs[1]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_top_blob.channel(q); + const float* ptr1 = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); +#if __riscv_vector + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e32m8(n); + + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + vfloat32m8_t _p1 = __riscv_vle32_v_f32m8(ptr1, vl); + _p = __riscv_vfmul_vf_f32m8(_p, coeff0, vl); + _p1 = __riscv_vfmul_vf_f32m8(_p1, coeff1, vl); + _p = __riscv_vfadd_vv_f32m8(_p, _p1, vl); + __riscv_vse32_v_f32m8(outptr, _p, vl); + + ptr += vl; + ptr1 += vl; + outptr += vl; + n -= vl; + } +#else + for (int i = 0; i < size; i++) + { + outptr[i] = ptr[i] * coeff0 + ptr1[i] * coeff1; + } +#endif + } + + // top_blob += bottom_blobs[i] * coeffs[i] for i = 2, 3, ... + for (size_t b = 2; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob1 = bottom_blobs[b]; + float coeff = coeffs[b]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); +#if __riscv_vector + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e32m8(n); + + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + vfloat32m8_t _p1 = __riscv_vle32_v_f32m8(outptr, vl); + _p = __riscv_vfmul_vf_f32m8(_p, coeff, vl); + _p1 = __riscv_vfadd_vv_f32m8(_p1, _p, vl); + __riscv_vse32_v_f32m8(outptr, _p1, vl); + + ptr += vl; + outptr += vl; + n -= vl; + } +#else + for (int i = 0; i < size; i++) + { + outptr[i] += ptr[i] * coeff; + } +#endif + } + } + } + } + if (op_type == Operation_MAX) + { + // top_blob = max(bottom_top_blob, bottom_blobs[1]) + const Mat& bottom_blob1 = bottom_blobs[1]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_top_blob.channel(q); + const float* ptr1 = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); +#if __riscv_vector + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e32m8(n); + + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + vfloat32m8_t _p1 = __riscv_vle32_v_f32m8(ptr1, vl); + _p = __riscv_vfmax_vv_f32m8(_p, _p1, vl); + __riscv_vse32_v_f32m8(outptr, _p, vl); + + ptr += vl; + ptr1 += vl; + outptr += vl; + n -= vl; + } +#else + for (int i = 0; i < size; i++) + { + outptr[i] = std::max(ptr[i], ptr1[i]); + } +#endif + } + + // top_blob = max(top_blob, bottom_blobs[i]) for i = 2, 3, ... + for (size_t b = 2; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob1 = bottom_blobs[b]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); +#if __riscv_vector + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e32m8(n); + + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + vfloat32m8_t _p1 = __riscv_vle32_v_f32m8(outptr, vl); + _p1 = __riscv_vfmax_vv_f32m8(_p1, _p, vl); + __riscv_vse32_v_f32m8(outptr, _p1, vl); + + ptr += vl; + outptr += vl; + n -= vl; + } +#else + for (int i = 0; i < size; i++) + { + outptr[i] = std::max(outptr[i], ptr[i]); + } +#endif + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/riscv/eltwise_riscv.h b/src/layer/riscv/eltwise_riscv.h new file mode 100644 index 000000000..10c158a13 --- /dev/null +++ b/src/layer/riscv/eltwise_riscv.h @@ -0,0 +1,37 @@ +// +// Copyright (C) 2025 xiaofan . All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of the +// License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations under +// the License. + +#ifndef LAYER_ELTWISE_RISCV_H +#define LAYER_ELTWISE_RISCV_H + +#include "eltwise.h" + +namespace ncnn { + +class Eltwise_riscv : public Eltwise +{ +public: + Eltwise_riscv(); + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + +protected: +#if NCNN_ZFH + int forward_fp16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +#endif +}; + +} // namespace ncnn + +#endif // LAYER_ELTWISE_RISCV_H diff --git a/src/layer/riscv/eltwise_riscv_zfh.cpp b/src/layer/riscv/eltwise_riscv_zfh.cpp new file mode 100644 index 000000000..8ff4c8570 --- /dev/null +++ b/src/layer/riscv/eltwise_riscv_zfh.cpp @@ -0,0 +1,313 @@ +// +// Copyright (C) 2025 xiaofan . All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of the +// License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations under +// the License. + +#include "eltwise_riscv.h" + +#if __riscv_vector +#include +#endif // __riscv_vector + +namespace ncnn { + +#if NCNN_ZFH +int Eltwise_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_top_blob = bottom_blobs[0]; + + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + Mat& top_blob = top_blobs[0]; + top_blob.create_like(bottom_top_blob, opt.blob_allocator); + + if (op_type == Operation_PROD) + { + const Mat& bottom_blob1 = bottom_blobs[1]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr = bottom_top_blob.channel(q); + const __fp16* ptr1 = bottom_blob1.channel(q); + __fp16* outptr = top_blob.channel(q); +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); + vfloat16m8_t _p1 = __riscv_vle16_v_f16m8(ptr1, vl); + _p = __riscv_vfmul_vv_f16m8(_p, _p1, vl); + __riscv_vse16_v_f16m8(outptr, _p, vl); + + ptr += vl; + ptr1 += vl; + outptr += vl; + n -= vl; + } +#else + for (int i = 0; i < size; i++) + { + outptr[i] = ptr[i] * ptr1[i]; + } +#endif + } + + for (size_t b = 2; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob1 = bottom_blobs[b]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr = bottom_blob1.channel(q); + __fp16* outptr = top_blob.channel(q); +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); + vfloat16m8_t _p1 = __riscv_vle16_v_f16m8(outptr, vl); + _p1 = __riscv_vfmul_vv_f16m8(_p1, _p, vl); + __riscv_vse16_v_f16m8(outptr, _p1, vl); + + ptr += vl; + outptr += vl; + n -= vl; + } +#else + for (int i = 0; i < size; i++) + { + outptr[i] *= ptr[i]; + } +#endif + } + } + } + if (op_type == Operation_SUM) + { + if (coeffs.empty()) + { + const Mat& bottom_blob1 = bottom_blobs[1]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr = bottom_top_blob.channel(q); + const __fp16* ptr1 = bottom_blob1.channel(q); + __fp16* outptr = top_blob.channel(q); +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); + vfloat16m8_t _p1 = __riscv_vle16_v_f16m8(ptr1, vl); + _p = __riscv_vfadd_vv_f16m8(_p, _p1, vl); + __riscv_vse16_v_f16m8(outptr, _p, vl); + + ptr += vl; + ptr1 += vl; + outptr += vl; + n -= vl; + } +#else + for (int i = 0; i < size; i++) + { + outptr[i] = ptr[i] + ptr1[i]; + } +#endif + } + + for (size_t b = 2; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob1 = bottom_blobs[b]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr = bottom_blob1.channel(q); + __fp16* outptr = top_blob.channel(q); +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); + vfloat16m8_t _p1 = __riscv_vle16_v_f16m8(outptr, vl); + _p1 = __riscv_vfadd_vv_f16m8(_p1, _p, vl); + __riscv_vse16_v_f16m8(outptr, _p1, vl); + + ptr += vl; + outptr += vl; + n -= vl; + } +#else + for (int i = 0; i < size; i++) + { + outptr[i] += ptr[i]; + } +#endif + } + } + } + else + { + const Mat& bottom_blob1 = bottom_blobs[1]; + __fp16 coeff0 = (__fp16)coeffs[0]; + __fp16 coeff1 = (__fp16)coeffs[1]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr = bottom_top_blob.channel(q); + const __fp16* ptr1 = bottom_blob1.channel(q); + __fp16* outptr = top_blob.channel(q); +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); + vfloat16m8_t _p1 = __riscv_vle16_v_f16m8(ptr1, vl); + _p = __riscv_vfmul_vf_f16m8(_p, coeff0, vl); + _p1 = __riscv_vfmul_vf_f16m8(_p1, coeff1, vl); + _p = __riscv_vfadd_vv_f16m8(_p, _p1, vl); + __riscv_vse16_v_f16m8(outptr, _p, vl); + + ptr += vl; + ptr1 += vl; + outptr += vl; + n -= vl; + } +#else + for (int i = 0; i < size; i++) + { + outptr[i] = ptr[i] * coeff0 + ptr1[i] * coeff1; + } +#endif + } + + for (size_t b = 2; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob1 = bottom_blobs[b]; + __fp16 coeff = (__fp16)coeffs[b]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr = bottom_blob1.channel(q); + __fp16* outptr = top_blob.channel(q); +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); + vfloat16m8_t _p1 = __riscv_vle16_v_f16m8(outptr, vl); + _p = __riscv_vfmul_vf_f16m8(_p, coeff, vl); + _p1 = __riscv_vfadd_vv_f16m8(_p1, _p, vl); + __riscv_vse16_v_f16m8(outptr, _p1, vl); + + ptr += vl; + outptr += vl; + n -= vl; + } +#else + for (int i = 0; i < size; i++) + { + outptr[i] += ptr[i] * coeff; + } +#endif + } + } + } + } + if (op_type == Operation_MAX) + { + const Mat& bottom_blob1 = bottom_blobs[1]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr = bottom_top_blob.channel(q); + const __fp16* ptr1 = bottom_blob1.channel(q); + __fp16* outptr = top_blob.channel(q); +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); + vfloat16m8_t _p1 = __riscv_vle16_v_f16m8(ptr1, vl); + _p = __riscv_vfmax_vv_f16m8(_p, _p1, vl); + __riscv_vse16_v_f16m8(outptr, _p, vl); + + ptr += vl; + ptr1 += vl; + outptr += vl; + n -= vl; + } +#else + for (int i = 0; i < size; i++) + { + outptr[i] = std::max(ptr[i], ptr1[i]); + } +#endif + } + + for (size_t b = 2; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob1 = bottom_blobs[b]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr = bottom_blob1.channel(q); + __fp16* outptr = top_blob.channel(q); +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); + vfloat16m8_t _p1 = __riscv_vle16_v_f16m8(outptr, vl); + _p1 = __riscv_vfmax_vv_f16m8(_p1, _p, vl); + __riscv_vse16_v_f16m8(outptr, _p1, vl); + + ptr += vl; + outptr += vl; + n -= vl; + } +#else + for (int i = 0; i < size; i++) + { + outptr[i] = std::max(outptr[i], ptr[i]); + } +#endif + } + } + } + + return 0; +} +#endif // NCNN_ZFH + +} // namespace ncnn