Browse Source

eltwise riscv rvv optimization (#5992)

tags/20250428
xfan1024 GitHub 1 year ago
parent
commit
b1870434f3
No known key found for this signature in database GPG Key ID: B5690EEEBB952194
3 changed files with 692 additions and 0 deletions
  1. +342
    -0
      src/layer/riscv/eltwise_riscv.cpp
  2. +37
    -0
      src/layer/riscv/eltwise_riscv.h
  3. +313
    -0
      src/layer/riscv/eltwise_riscv_zfh.cpp

+ 342
- 0
src/layer/riscv/eltwise_riscv.cpp View File

@@ -0,0 +1,342 @@
//
// Copyright (C) 2025 xiaofan <xiaofan@iscas.ac.cn>. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this
// file except in compliance with the License. You may obtain a copy of the
// License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.

#include "eltwise_riscv.h"

#if __riscv_vector
#include <riscv_vector.h>
#include "rvv_mathfun.h"
#endif // __riscv_vector

#include "cpu.h"

namespace ncnn {

Eltwise_riscv::Eltwise_riscv()
{
#if __riscv_vector
support_packing = true;
#endif
#if NCNN_ZFH
#if __riscv_vector
support_fp16_storage = cpu_support_riscv_zvfh();
#else
support_fp16_storage = cpu_support_riscv_zfh();
#endif
#endif
}

int Eltwise_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
const Mat& bottom_top_blob = bottom_blobs[0];
#if NCNN_ZFH
int elembits = bottom_top_blob.elembits();
if (opt.use_fp16_storage && elembits == 16)
{
return forward_fp16s(bottom_blobs, top_blobs, opt);
}
#endif
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int d = bottom_top_blob.d;
int channels = bottom_top_blob.c;
int elempack = bottom_top_blob.elempack;
int size = w * h * d * elempack;

Mat& top_blob = top_blobs[0];
top_blob.create_like(bottom_top_blob, opt.blob_allocator);

if (op_type == Operation_PROD)
{
// top_blob = bottom_top_blob * bottom_blobs[1]
const Mat& bottom_blob1 = bottom_blobs[1];
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = bottom_top_blob.channel(q);
const float* ptr1 = bottom_blob1.channel(q);
float* outptr = top_blob.channel(q);
#if __riscv_vector
int n = size;
while (n > 0)
{
size_t vl = __riscv_vsetvl_e32m8(n);

vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
vfloat32m8_t _p1 = __riscv_vle32_v_f32m8(ptr1, vl);
_p = __riscv_vfmul_vv_f32m8(_p, _p1, vl);
__riscv_vse32_v_f32m8(outptr, _p, vl);

ptr += vl;
ptr1 += vl;
outptr += vl;
n -= vl;
}
#else
for (int i = 0; i < size; i++)
{
outptr[i] = ptr[i] * ptr1[i];
}
#endif
}

// top_blob *= bottom_blobs[i] for i = 2, 3, ...
for (size_t b = 2; b < bottom_blobs.size(); b++)
{
const Mat& bottom_blob1 = bottom_blobs[b];
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = bottom_blob1.channel(q);
float* outptr = top_blob.channel(q);
#if __riscv_vector
int n = size;
while (n > 0)
{
size_t vl = __riscv_vsetvl_e32m8(n);

vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
vfloat32m8_t _p1 = __riscv_vle32_v_f32m8(outptr, vl);
_p1 = __riscv_vfmul_vv_f32m8(_p1, _p, vl);
__riscv_vse32_v_f32m8(outptr, _p1, vl);

ptr += vl;
outptr += vl;
n -= vl;
}
#else
for (int i = 0; i < size; i++)
{
outptr[i] *= ptr[i];
}
#endif
}
}
}
if (op_type == Operation_SUM)
{
if (coeffs.empty())
{
// top_blob = bottom_top_blob + bottom_blobs[1]
const Mat& bottom_blob1 = bottom_blobs[1];
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = bottom_top_blob.channel(q);
const float* ptr1 = bottom_blob1.channel(q);
float* outptr = top_blob.channel(q);
#if __riscv_vector
int n = size;
while (n > 0)
{
size_t vl = __riscv_vsetvl_e32m8(n);

vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
vfloat32m8_t _p1 = __riscv_vle32_v_f32m8(ptr1, vl);
_p = __riscv_vfadd_vv_f32m8(_p, _p1, vl);
__riscv_vse32_v_f32m8(outptr, _p, vl);

ptr += vl;
ptr1 += vl;
outptr += vl;
n -= vl;
}
#else
for (int i = 0; i < size; i++)
{
outptr[i] = ptr[i] + ptr1[i];
}
#endif
}

// top_blob += bottom_blobs[i] for i = 2, 3, ...
for (size_t b = 2; b < bottom_blobs.size(); b++)
{
const Mat& bottom_blob1 = bottom_blobs[b];
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = bottom_blob1.channel(q);
float* outptr = top_blob.channel(q);
#if __riscv_vector
int n = size;
while (n > 0)
{
size_t vl = __riscv_vsetvl_e32m8(n);

vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
vfloat32m8_t _p1 = __riscv_vle32_v_f32m8(outptr, vl);
_p1 = __riscv_vfadd_vv_f32m8(_p1, _p, vl);
__riscv_vse32_v_f32m8(outptr, _p1, vl);

ptr += vl;
outptr += vl;
n -= vl;
}
#else
for (int i = 0; i < size; i++)
{
outptr[i] += ptr[i];
}
#endif
}
}
}
else
{
// top_blob = bottom_top_blob * coeffs[0] + bottom_blobs[1] * coeffs[1]
const Mat& bottom_blob1 = bottom_blobs[1];
float coeff0 = coeffs[0];
float coeff1 = coeffs[1];
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = bottom_top_blob.channel(q);
const float* ptr1 = bottom_blob1.channel(q);
float* outptr = top_blob.channel(q);
#if __riscv_vector
int n = size;
while (n > 0)
{
size_t vl = __riscv_vsetvl_e32m8(n);

vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
vfloat32m8_t _p1 = __riscv_vle32_v_f32m8(ptr1, vl);
_p = __riscv_vfmul_vf_f32m8(_p, coeff0, vl);
_p1 = __riscv_vfmul_vf_f32m8(_p1, coeff1, vl);
_p = __riscv_vfadd_vv_f32m8(_p, _p1, vl);
__riscv_vse32_v_f32m8(outptr, _p, vl);

ptr += vl;
ptr1 += vl;
outptr += vl;
n -= vl;
}
#else
for (int i = 0; i < size; i++)
{
outptr[i] = ptr[i] * coeff0 + ptr1[i] * coeff1;
}
#endif
}

// top_blob += bottom_blobs[i] * coeffs[i] for i = 2, 3, ...
for (size_t b = 2; b < bottom_blobs.size(); b++)
{
const Mat& bottom_blob1 = bottom_blobs[b];
float coeff = coeffs[b];
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = bottom_blob1.channel(q);
float* outptr = top_blob.channel(q);
#if __riscv_vector
int n = size;
while (n > 0)
{
size_t vl = __riscv_vsetvl_e32m8(n);

vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
vfloat32m8_t _p1 = __riscv_vle32_v_f32m8(outptr, vl);
_p = __riscv_vfmul_vf_f32m8(_p, coeff, vl);
_p1 = __riscv_vfadd_vv_f32m8(_p1, _p, vl);
__riscv_vse32_v_f32m8(outptr, _p1, vl);

ptr += vl;
outptr += vl;
n -= vl;
}
#else
for (int i = 0; i < size; i++)
{
outptr[i] += ptr[i] * coeff;
}
#endif
}
}
}
}
if (op_type == Operation_MAX)
{
// top_blob = max(bottom_top_blob, bottom_blobs[1])
const Mat& bottom_blob1 = bottom_blobs[1];
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = bottom_top_blob.channel(q);
const float* ptr1 = bottom_blob1.channel(q);
float* outptr = top_blob.channel(q);
#if __riscv_vector
int n = size;
while (n > 0)
{
size_t vl = __riscv_vsetvl_e32m8(n);

vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
vfloat32m8_t _p1 = __riscv_vle32_v_f32m8(ptr1, vl);
_p = __riscv_vfmax_vv_f32m8(_p, _p1, vl);
__riscv_vse32_v_f32m8(outptr, _p, vl);

ptr += vl;
ptr1 += vl;
outptr += vl;
n -= vl;
}
#else
for (int i = 0; i < size; i++)
{
outptr[i] = std::max(ptr[i], ptr1[i]);
}
#endif
}

// top_blob = max(top_blob, bottom_blobs[i]) for i = 2, 3, ...
for (size_t b = 2; b < bottom_blobs.size(); b++)
{
const Mat& bottom_blob1 = bottom_blobs[b];
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = bottom_blob1.channel(q);
float* outptr = top_blob.channel(q);
#if __riscv_vector
int n = size;
while (n > 0)
{
size_t vl = __riscv_vsetvl_e32m8(n);

vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
vfloat32m8_t _p1 = __riscv_vle32_v_f32m8(outptr, vl);
_p1 = __riscv_vfmax_vv_f32m8(_p1, _p, vl);
__riscv_vse32_v_f32m8(outptr, _p1, vl);

ptr += vl;
outptr += vl;
n -= vl;
}
#else
for (int i = 0; i < size; i++)
{
outptr[i] = std::max(outptr[i], ptr[i]);
}
#endif
}
}
}

return 0;
}

} // namespace ncnn

+ 37
- 0
src/layer/riscv/eltwise_riscv.h View File

@@ -0,0 +1,37 @@
//
// Copyright (C) 2025 xiaofan <xiaofan@iscas.ac.cn>. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this
// file except in compliance with the License. You may obtain a copy of the
// License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.

#ifndef LAYER_ELTWISE_RISCV_H
#define LAYER_ELTWISE_RISCV_H

#include "eltwise.h"

namespace ncnn {

class Eltwise_riscv : public Eltwise
{
public:
Eltwise_riscv();
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
#if NCNN_ZFH
int forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
#endif
};

} // namespace ncnn

#endif // LAYER_ELTWISE_RISCV_H

+ 313
- 0
src/layer/riscv/eltwise_riscv_zfh.cpp View File

@@ -0,0 +1,313 @@
//
// Copyright (C) 2025 xiaofan <xiaofan@iscas.ac.cn>. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this
// file except in compliance with the License. You may obtain a copy of the
// License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.

#include "eltwise_riscv.h"

#if __riscv_vector
#include <riscv_vector.h>
#endif // __riscv_vector

namespace ncnn {

#if NCNN_ZFH
int Eltwise_riscv::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
const Mat& bottom_top_blob = bottom_blobs[0];

int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int d = bottom_top_blob.d;
int channels = bottom_top_blob.c;
int elempack = bottom_top_blob.elempack;
int size = w * h * d * elempack;

Mat& top_blob = top_blobs[0];
top_blob.create_like(bottom_top_blob, opt.blob_allocator);

if (op_type == Operation_PROD)
{
const Mat& bottom_blob1 = bottom_blobs[1];
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const __fp16* ptr = bottom_top_blob.channel(q);
const __fp16* ptr1 = bottom_blob1.channel(q);
__fp16* outptr = top_blob.channel(q);
#if __riscv_zvfh
int n = size;
while (n > 0)
{
size_t vl = __riscv_vsetvl_e16m8(n);

vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl);
vfloat16m8_t _p1 = __riscv_vle16_v_f16m8(ptr1, vl);
_p = __riscv_vfmul_vv_f16m8(_p, _p1, vl);
__riscv_vse16_v_f16m8(outptr, _p, vl);

ptr += vl;
ptr1 += vl;
outptr += vl;
n -= vl;
}
#else
for (int i = 0; i < size; i++)
{
outptr[i] = ptr[i] * ptr1[i];
}
#endif
}

for (size_t b = 2; b < bottom_blobs.size(); b++)
{
const Mat& bottom_blob1 = bottom_blobs[b];
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const __fp16* ptr = bottom_blob1.channel(q);
__fp16* outptr = top_blob.channel(q);
#if __riscv_zvfh
int n = size;
while (n > 0)
{
size_t vl = __riscv_vsetvl_e16m8(n);

vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl);
vfloat16m8_t _p1 = __riscv_vle16_v_f16m8(outptr, vl);
_p1 = __riscv_vfmul_vv_f16m8(_p1, _p, vl);
__riscv_vse16_v_f16m8(outptr, _p1, vl);

ptr += vl;
outptr += vl;
n -= vl;
}
#else
for (int i = 0; i < size; i++)
{
outptr[i] *= ptr[i];
}
#endif
}
}
}
if (op_type == Operation_SUM)
{
if (coeffs.empty())
{
const Mat& bottom_blob1 = bottom_blobs[1];
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const __fp16* ptr = bottom_top_blob.channel(q);
const __fp16* ptr1 = bottom_blob1.channel(q);
__fp16* outptr = top_blob.channel(q);
#if __riscv_zvfh
int n = size;
while (n > 0)
{
size_t vl = __riscv_vsetvl_e16m8(n);

vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl);
vfloat16m8_t _p1 = __riscv_vle16_v_f16m8(ptr1, vl);
_p = __riscv_vfadd_vv_f16m8(_p, _p1, vl);
__riscv_vse16_v_f16m8(outptr, _p, vl);

ptr += vl;
ptr1 += vl;
outptr += vl;
n -= vl;
}
#else
for (int i = 0; i < size; i++)
{
outptr[i] = ptr[i] + ptr1[i];
}
#endif
}

for (size_t b = 2; b < bottom_blobs.size(); b++)
{
const Mat& bottom_blob1 = bottom_blobs[b];
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const __fp16* ptr = bottom_blob1.channel(q);
__fp16* outptr = top_blob.channel(q);
#if __riscv_zvfh
int n = size;
while (n > 0)
{
size_t vl = __riscv_vsetvl_e16m8(n);

vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl);
vfloat16m8_t _p1 = __riscv_vle16_v_f16m8(outptr, vl);
_p1 = __riscv_vfadd_vv_f16m8(_p1, _p, vl);
__riscv_vse16_v_f16m8(outptr, _p1, vl);

ptr += vl;
outptr += vl;
n -= vl;
}
#else
for (int i = 0; i < size; i++)
{
outptr[i] += ptr[i];
}
#endif
}
}
}
else
{
const Mat& bottom_blob1 = bottom_blobs[1];
__fp16 coeff0 = (__fp16)coeffs[0];
__fp16 coeff1 = (__fp16)coeffs[1];
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const __fp16* ptr = bottom_top_blob.channel(q);
const __fp16* ptr1 = bottom_blob1.channel(q);
__fp16* outptr = top_blob.channel(q);
#if __riscv_zvfh
int n = size;
while (n > 0)
{
size_t vl = __riscv_vsetvl_e16m8(n);

vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl);
vfloat16m8_t _p1 = __riscv_vle16_v_f16m8(ptr1, vl);
_p = __riscv_vfmul_vf_f16m8(_p, coeff0, vl);
_p1 = __riscv_vfmul_vf_f16m8(_p1, coeff1, vl);
_p = __riscv_vfadd_vv_f16m8(_p, _p1, vl);
__riscv_vse16_v_f16m8(outptr, _p, vl);

ptr += vl;
ptr1 += vl;
outptr += vl;
n -= vl;
}
#else
for (int i = 0; i < size; i++)
{
outptr[i] = ptr[i] * coeff0 + ptr1[i] * coeff1;
}
#endif
}

for (size_t b = 2; b < bottom_blobs.size(); b++)
{
const Mat& bottom_blob1 = bottom_blobs[b];
__fp16 coeff = (__fp16)coeffs[b];
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const __fp16* ptr = bottom_blob1.channel(q);
__fp16* outptr = top_blob.channel(q);
#if __riscv_zvfh
int n = size;
while (n > 0)
{
size_t vl = __riscv_vsetvl_e16m8(n);

vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl);
vfloat16m8_t _p1 = __riscv_vle16_v_f16m8(outptr, vl);
_p = __riscv_vfmul_vf_f16m8(_p, coeff, vl);
_p1 = __riscv_vfadd_vv_f16m8(_p1, _p, vl);
__riscv_vse16_v_f16m8(outptr, _p1, vl);

ptr += vl;
outptr += vl;
n -= vl;
}
#else
for (int i = 0; i < size; i++)
{
outptr[i] += ptr[i] * coeff;
}
#endif
}
}
}
}
if (op_type == Operation_MAX)
{
const Mat& bottom_blob1 = bottom_blobs[1];
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const __fp16* ptr = bottom_top_blob.channel(q);
const __fp16* ptr1 = bottom_blob1.channel(q);
__fp16* outptr = top_blob.channel(q);
#if __riscv_zvfh
int n = size;
while (n > 0)
{
size_t vl = __riscv_vsetvl_e16m8(n);

vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl);
vfloat16m8_t _p1 = __riscv_vle16_v_f16m8(ptr1, vl);
_p = __riscv_vfmax_vv_f16m8(_p, _p1, vl);
__riscv_vse16_v_f16m8(outptr, _p, vl);

ptr += vl;
ptr1 += vl;
outptr += vl;
n -= vl;
}
#else
for (int i = 0; i < size; i++)
{
outptr[i] = std::max(ptr[i], ptr1[i]);
}
#endif
}

for (size_t b = 2; b < bottom_blobs.size(); b++)
{
const Mat& bottom_blob1 = bottom_blobs[b];
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const __fp16* ptr = bottom_blob1.channel(q);
__fp16* outptr = top_blob.channel(q);
#if __riscv_zvfh
int n = size;
while (n > 0)
{
size_t vl = __riscv_vsetvl_e16m8(n);

vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl);
vfloat16m8_t _p1 = __riscv_vle16_v_f16m8(outptr, vl);
_p1 = __riscv_vfmax_vv_f16m8(_p1, _p, vl);
__riscv_vse16_v_f16m8(outptr, _p1, vl);

ptr += vl;
outptr += vl;
n -= vl;
}
#else
for (int i = 0; i < size; i++)
{
outptr[i] = std::max(outptr[i], ptr[i]);
}
#endif
}
}
}

return 0;
}
#endif // NCNN_ZFH

} // namespace ncnn

Loading…
Cancel
Save